def test_rusboost(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        stratify=y,
                                                        random_state=1)
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(n_estimators=n_estimators,
                                  algorithm=algorithm,
                                  random_state=0)
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have a different random state
    assert (len({sampler.random_state
                 for sampler in rusboost.samplers_
                 }) == len(rusboost.samplers_))
    # each estimator in the ensemble should have a different random state
    assert (len({est.random_state
                 for est in rusboost.estimators_
                 }) == len(rusboost.estimators_))

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
    assert rusboost.decision_function(X_test).shape[1] == len(classes)

    score = rusboost.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    y_pred = rusboost.predict(X_test)
    assert y_pred.shape == y_test.shape
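This test depends on two pytest fixtures that are not shown here: `imbalanced_dataset` and the parametrized `algorithm`. A minimal sketch of what they could look like (the exact dataset parameters are an assumption; a multiclass problem is implied by the decision_function shape check above):

import pytest
from sklearn.datasets import make_classification


@pytest.fixture
def imbalanced_dataset():
    # three imbalanced classes, so decision_function returns one column per class
    return make_classification(n_samples=10000, n_features=2, n_informative=2,
                               n_redundant=0, n_repeated=0, n_classes=3,
                               n_clusters_per_class=1,
                               weights=[0.01, 0.05, 0.94],
                               class_sep=0.8, random_state=0)


@pytest.fixture(params=['SAMME', 'SAMME.R'])
def algorithm(request):
    return request.param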
Example #3
# Alternative classifiers tried in this experiment, kept for reference:
# classifier = RusBoost(depth=depth, n_estimators=estimators)
# classifier = AdaboostNC_Classifier(**a)
# classifier = CUSBoostNC_Classifier(**a)
# classifier = RusBoost(**a)
classifier = RUSBoostClassifier(DecisionTreeClassifier(max_depth=8), n_estimators=64)

# Matching fit calls for the alternatives above:
# classifier.fit(X_train, y_train, number_of_clusters, 0.5)  # CUSBoost
# classifier.fit(X_train, y_train)  # Adaboost
# classifier.fit(X_train, y_train, 0.5)  # AdaboostNC
# classifier.fit(X_train, y_train, 6, 0.5)
# classifier.fit(X_train, y_train, 6, fraction / 100, 8)
classifier.fit(X_train, y_train)

predictions = classifier.predict_proba(X_test)
prediction_ = classifier.predict(X_test)

auc = roc_auc_score(y_test, predictions[:, 1])
f1 = f1_score(y_test, prediction_)
accuracy = accuracy_score(y_test, prediction_)

# aupr = average_precision_score(y_test, predictions[:, 1])

current_param_auc.append(auc)
current_param_f1.append(f1)
current_param_accuracy.append(accuracy)

# current_param_aupr.append(aupr)

# fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
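This fragment runs inside a cross-validation loop that is not shown. A minimal sketch of the scaffolding it assumes (the fold count and every name not used in the fragment are hypothetical):

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from imblearn.ensemble import RUSBoostClassifier

# X, y: full feature matrix and binary labels, assumed loaded elsewhere
current_param_auc, current_param_f1, current_param_accuracy = [], [], []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # ... the fragment above runs here, appending one score per fold ...

print('mean AUC over folds: %.4f' % np.mean(current_param_auc))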
Example #4
import time

import joblib
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import RUSBoostClassifier

import reader  # project-local module providing ordinary_data_reader


def learning_model(year, class_weight):
    iters = 300
    gap = 2
    year_test = year

    data_test = reader.ordinary_data_reader('uscecchini28.csv', year_test, year_test)
    x_test = data_test.features
    y_test = data_test.labels
    test = np.c_[data_test.years, data_test.firms]

    '''
    An if-else could be used to check whether class_weight is None, to avoid
    an exception from concatenating None into the model file name.

    A try-except wraps RUSBoost with a DecisionTreeClassifier using a custom
    class_weight: if the model trained last time can be found on disk, we use
    it directly to predict the result without training twice; otherwise we
    train the model and save it to disk.
    '''
    # if class_weight is not None:
    #     # use current_model_name to find/save the trained model with custom class_weight
    #     current_model_name = class_weight + "_" + str(year_test) + ".m"
    # else:
    #     current_model_name = str(year_test) + ".m"
    current_model_name = class_weight + "_" + str(year_test) + ".m"
    try:
        rusboost_model = joblib.load(current_model_name)
    except Exception:
        print('Running RUSBoost (training period: 1991-' + str(year_test - gap) + ', testing period: ' + str(
            year_test) + ', with ' + str(gap) + '-year gap)...')

        data_train = reader.ordinary_data_reader('uscecchini28.csv', 1991, year_test - gap)

        x_train = data_train.features
        y_train = data_train.labels
        newpaaer_train = data_train.newpaaers

        # format labels and newpaaers for the MATLAB step:
        # data_test.newpaaers(data_test.labels ~= 0)
        data_test.newpaaers = np.array(data_test.newpaaers)
        data_test.labels = np.array(data_test.labels)
        # temporarily mark with 0 the NaNs that should remain in the array
        for i in range(len(data_test.newpaaers)):
            if np.isnan(data_test.newpaaers[i]):
                if data_test.labels[i] != 0:
                    data_test.newpaaers[i] = 0
        # drop all the NaNs remaining in the array
        data_test.newpaaers = np.array([x for x in data_test.newpaaers if str(x) != 'nan'])
        # turn the 0 markers back into NaN
        for i in range(len(data_test.newpaaers)):
            if data_test.newpaaers[i] == 0:
                data_test.newpaaers[i] = np.nan

        # deduplicate to get the final newpaaer_test
        newpaaer_test = np.unique(data_test.newpaaers)
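        # NOTE: the round-trip above keeps every non-NaN newpaaer plus the NaN
        # entries whose label is nonzero, but it also turns any genuine zero
        # id into NaN. Assuming 0 never occurs as a real newpaaer, an
        # equivalent vectorized form would be:
        #   keep = ~np.isnan(data_test.newpaaers) | (data_test.labels != 0)
        #   newpaaer_test = np.unique(data_test.newpaaers[keep])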

        '''
        Caution:
            y_train is converted to an ndarray here so that its indices line
            up with the formatted newpaaer_train array in the loop below.
        '''
        y_train = np.array(y_train)
        num_frauds = sum(y_train == 1)

        print(num_frauds)
        '''
        np.in1d stands in for MATLAB's ismember, and a temporary array is used
        to handle serial frauds, implementing the step:
            y_train(ismember(newpaaer_train, newpaaer_test)) = 0
        '''
        temp_array = np.array(np.in1d(newpaaer_train, newpaaer_test)).astype(int)
        for i in range(len(temp_array)):
            if temp_array[i] == 1:
                y_train[i] = 0

        # delete the temp array
        del temp_array
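        # NOTE: an equivalent vectorized form of the loop above, matching the
        # MATLAB one-liner in the comment (np.isin is the modern spelling of
        # np.in1d):
        #   y_train[np.isin(newpaaer_train, newpaaer_test)] = 0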

        num_frauds = num_frauds - sum(y_train == 1)
        print('Recode', num_frauds, 'overlapped frauds (i.e., change fraud label from 1 to 0).')

        start_time = time.perf_counter()
        rusboost_model = RUSBoostClassifier(DecisionTreeClassifier(min_samples_leaf=5, class_weight=class_weight),
                                            learning_rate=0.1, n_estimators=iters)
        rusboost_model.fit(x_train, y_train)
        end_time = time.perf_counter()
        t_train = end_time - start_time
        joblib.dump(rusboost_model, current_model_name)
        print('Training time: %.3f seconds' % t_train)

    start_time = time.perf_counter()
    y_pred = rusboost_model.predict(x_test)
    prob = rusboost_model.predict_proba(x_test)
    end_time = time.perf_counter()
    t_test = end_time - start_time

    print('Testing time %.3f seconds' % t_test)

    # test metrics
    # AUC is computed from the fraud probability rather than the hard labels
    print("AUC: %.4f" % metrics.roc_auc_score(y_test, prob[:, 1]))
    # np.set_printoptions(precision=4, threshold=8, edgeitems=4, linewidth=75, suppress=True, nanstr='nan', infstr='inf')
    print("precision: %.2f%%" % (metrics.precision_score(y_test, y_pred, zero_division=0) * 100))
    print("recall: %.2f%%" % (metrics.recall_score(y_test, y_pred) * 100))

    # dump part of the results (fraud probability)
    prob = np.around(np.delete(prob, 0, axis=1) * 100, decimals=5)
    data = np.c_[y_pred, prob]
    data = np.c_[test, data]
    file_data = pd.DataFrame(data)
    csv_file_name = 'data.csv'
    file_data.to_csv(csv_file_name, header=False, index=False)
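A hypothetical driver for this function; the year range and the class_weight value below are assumptions, not from the original:

if __name__ == '__main__':
    # 'balanced' is one of the class_weight strings DecisionTreeClassifier accepts
    for test_year in range(2003, 2009):
        learning_model(test_year, 'balanced')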
Example #5
import sys

import numpy as np
from imblearn.ensemble import RUSBoostClassifier

##### Everything is ready for cell type prediction #####
# exMtrain, cellTypesTrain, exMpred and cellID are built earlier in the
# script (that part is not shown here)

rusboost = RUSBoostClassifier(random_state=0)
rusboost.fit(exMtrain, cellTypesTrain)

##### Cell types prediction #####
cellTypesPred = rusboost.predict(exMpred)

# accuracy_score = balanced_accuracy_score(cellTypesTrue, cellTypesPred)
# print(accuracy_score)
# classification_report(cellTypesTrue, cellTypesPred)

##### Checking performance #####
# confusionMatrix = confusion_matrix(cellTypesTrue, cellTypesPred)
cellTypesProbs = rusboost.predict_proba(exMpred)
# print(confusionMatrix)
##### Merging the cell types and probability score #####

cellID_Probs = np.concatenate((cellID[:, None], cellTypesProbs), axis=1)
combine = np.concatenate((cellID_Probs, cellTypesPred[:, None]), axis=1)

###################################

##### Output the results from array #####
# file format:
# Cell_ID Cell_types_prediction Cell_types_prediction_probability_score
#
##### Prediction complete, and generate the output file #####

outFile = open(sys.argv[3], 'w')  # the name of the output file
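# The script is truncated here. A sketch of the write loop implied by the
# format comment above (an assumption, not the original code):
#   for row in combine:
#       outFile.write('\t'.join(str(field) for field in row) + '\n')
#   outFile.close()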