def UncertaintyEstimatesFromClassifiers():
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.datasets import make_circles
    import numpy as np
    from sklearn.model_selection import train_test_split

    X, y = make_circles(noise=0.25, factor=0.5, random_state=1)

    #We rename the classes "blue" and "red" for illustration purposes
    y_named = np.array(['blue', 'red'])[y]

    #We can call train_test_split with arbitrarily many arrays;
    #all will be split in a consistent manner
    X_train, X_test, y_train_named, y_test_named, y_train, y_test = train_test_split(
        X, y_named, y, random_state=0)

    #Build the gradient boosting model
    gbrt = GradientBoostingClassifier(random_state=0)
    gbrt.fit(X_train, y_train_named)

    print('X_test.shape: {}'.format(X_test.shape))
    print('Decision function shape: {}'.format(
        gbrt.decision_function(X_test).shape))

    #Show the first few entries of decision_function
    print('Decision function: \n{}'.format(gbrt.decision_function(X_test[:6])))

    print('Thresholded decision function:\n{}'.format(
        gbrt.decision_function(X_test) > 0))
    print('Predictions:\n{}'.format(gbrt.predict(X_test)))
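
The boolean mask above maps back to class names through the classifier's classes_ attribute (classes_[0] for scores below zero, classes_[1] above). A minimal sketch, reusing gbrt and X_test from the function above:

    # turn True/False into 1/0, then index classes_ to recover label names
    greater_zero = (gbrt.decision_function(X_test) > 0).astype(int)
    pred = gbrt.classes_[greater_zero]
    print('pred is equal to predictions: {}'.format(
        np.all(pred == gbrt.predict(X_test))))  # True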
Example #2
def uncertainty_multiclass_clf():
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=42)
    gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
    gbrt.fit(X_train, y_train)
    print("Decision function shape: {}".format(
        gbrt.decision_function(X_test).shape))  # (38, 3)
    print("Decision function:\n{}".format(
        gbrt.decision_function(X_test)[:6, :]))

    # recover predictions from these scores by finding max entry for each data point:
    print('Argmax of decision function:\n{0}'.format(
        np.argmax(gbrt.decision_function(X_test), axis=1)))
    print('Predictions:\n{0}'.format(gbrt.predict(X_test)))

    # Probabilities:
    print("Predicted probabilities:\n{}".format(
        gbrt.predict_proba(X_test)[:6]))
    print("Sums: {}".format(gbrt.predict_proba(X_test)[:6].sum(
        axis=1)))  # Sums: [ 1. 1. 1. ... 1.]

    print("Argmax of predicted probabilities:\n{}".format(
        np.argmax(gbrt.predict_proba(X_test), axis=1)))
    print("Predictions:\n{}".format(gbrt.predict(X_test)))
Example #3
def salary_predictions():

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, recall_score, auc, roc_curve, precision_score
    from sklearn.ensemble import GradientBoostingClassifier

    # index by node so the centrality dicts align with the rows
    df = pd.DataFrame(index=G.nodes())
    df["degree_cent"] = pd.Series(nx.degree_centrality(G))
    df["clustering"] = pd.Series(nx.clustering(G))
    df["closeness"] = pd.Series(nx.closeness_centrality(G))
    df["betweenness"] = pd.Series(nx.betweenness_centrality(G,
                                                            normalized=True,
                                                            endpoints=False,
                                                            k=200))

    dep = [x[1]["Department"] for x in G.nodes(data=True)]
    man_salary = [x[1]["ManagementSalary"] for x in G.nodes(data=True)]
    df["department"] = dep
    df["management salary"] = man_salary

    #Separate the data with management salary reported from the rows where no salary is reported
    salary_reported = df.dropna()
    salary_not_reported = df[df["management salary"].isnull()]

    x = salary_reported.drop("management salary", axis=1)
    y = salary_reported["management salary"]
    test_df = salary_not_reported.drop("management salary", axis=1)

    #Training the gradient boosting model
    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        train_size=0.9,
                                                        random_state=0)
    gbm = GradientBoostingClassifier(random_state=0,
                                     learning_rate=0.1,
                                     n_estimators=45,
                                     max_depth=5).fit(X_train, y_train)
    y_score_eval = gbm.decision_function(X_test)
    y_proba_eval = gbm.predict_proba(X_test)
    y_score = gbm.decision_function(test_df)
    y_proba = gbm.predict_proba(test_df)

    fpr, tpr, _ = roc_curve(y_test, y_score_eval)
    roc_auc = auc(fpr, tpr)

    prob_management_salary = pd.Series(y_proba[:, 1])
    prob_management_salary.index = test_df.index

    return prob_management_salary
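
The roc_curve/auc pair above can be collapsed into one call; roc_auc_score computes the same area directly from labels and scores:

    from sklearn.metrics import roc_auc_score
    roc_auc = roc_auc_score(y_test, y_score_eval)  # equals auc(fpr, tpr) above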
def classConfidence():
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=42)
    gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
    gbrt.fit(X_train, y_train)

    print("Decision Func shape:{}".format(
        gbrt.decision_function(X_test).shape))
    print("Decision Func:{}".format(gbrt.decision_function(X_test)[:6, :]))

    print("Argmax of decision func:\n{}".format(
        np.argmax(gbrt.decision_function(X_test), axis=1)))
    print("Predictions:\n{}".format(gbrt.predict(X_test)))
Example #5
def test_sum_match_gradient_boosting_classifier():
    import shap
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import GradientBoostingClassifier
    import sklearn

    X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.adult(),
                                                        test_size=0.2,
                                                        random_state=0)
    clf = GradientBoostingClassifier(random_state=202,
                                     n_estimators=10,
                                     max_depth=10)
    clf.fit(X_train, Y_train)

    # Use decision function to get prediction before it is mapped to a probability
    predicted = clf.decision_function(X_test)

    # check SHAP values
    ex = shap.TreeExplainer(clf)
    initial_ex_value = ex.expected_value
    shap_values = ex.shap_values(X_test)
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-6, \
        "SHAP values don't sum to model output!"

    # check initial expected value
    assert np.abs(initial_ex_value -
                  ex.expected_value) < 1e-6, "Inital expected value is wrong!"

    # check SHAP interaction values
    shap_interaction_values = ex.shap_interaction_values(X_test.iloc[:10, :])
    assert np.abs(shap_interaction_values.sum(1).sum(1) + ex.expected_value - predicted[:10]).max() < 1e-6, \
        "SHAP interaction values don't sum to model output!"
Example #6
def blight_model():
    from sklearn.model_selection import train_test_split
    train = pd.read_csv('train.csv', encoding='ISO-8859-1',
                        usecols=['ticket_id', 'compliance', 'fine_amount',
                                 'judgment_amount', 'hearing_date', 'ticket_issued_date'])
    train_blight = train[(train['compliance'] == 1) | (train['compliance'] == 0)]
    # keep datetime64 dtype (normalize() drops the time of day) so the
    # subtraction below supports the .dt.days accessor
    train_blight['hearing_date'] = pd.to_datetime(
        train_blight['hearing_date'].fillna('1900-01-01 00:00:00')).dt.normalize()
    train_blight['ticket_issued_date'] = pd.to_datetime(
        train_blight['ticket_issued_date'].fillna('1900-01-01 00:00:00')).dt.normalize()
    train_blight['gap'] = (train_blight['hearing_date'] - train_blight['ticket_issued_date']).dt.days
    test_blight = pd.read_csv('test.csv', encoding='ISO-8859-1',
                              usecols=['ticket_id', 'fine_amount', 'judgment_amount',
                                       'hearing_date', 'ticket_issued_date'])
    test_blight['hearing_date'] = pd.to_datetime(
        test_blight['hearing_date'].fillna('1900-01-01 00:00:00')).dt.normalize()
    test_blight['ticket_issued_date'] = pd.to_datetime(
        test_blight['ticket_issued_date'].fillna('1900-01-01 00:00:00')).dt.normalize()
    test_blight['gap'] = (test_blight['hearing_date'] - test_blight['ticket_issued_date']).dt.days
    feature_names2=['fine_amount','judgment_amount','gap']
    X=train_blight[feature_names2]
    y=train_blight['compliance']

    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

    from sklearn.ensemble import GradientBoostingClassifier
    clGrad=GradientBoostingClassifier().fit(X_train,y_train)
    clGrad.score(X_train,y_train)
    clGrad.score(X_test,y_test)
    
    # Your code here
    
    return pd.Series(clGrad.decision_function(test_blight[feature_names2]),index=test_blight['ticket_id'])
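
Note that decision_function returns unbounded log-odds; if the caller expects probabilities of the positive class instead, predict_proba is the drop-in alternative, e.g.:

    probs = clGrad.predict_proba(test_blight[feature_names2])[:, 1]
    result = pd.Series(probs, index=test_blight['ticket_id'])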
def test_gbm_classifier_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Run the h2o4gpu version of GradientBoostingClassifier
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run the scikit-learn version of GradientBoostingClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert (gbm.predict(X) == gbm_sk.predict(X)).all()
        assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)).all()
        assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all()
        assert gbm.score(X, y) == gbm_sk.score(X, y)
        assert (gbm.decision_function(X)[1] == gbm_sk.decision_function(X)[1]).all()
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)),
                           list(gbm_sk.staged_predict_proba(X)))
        assert (gbm.apply(X) == gbm_sk.apply(X)).all()

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_).all()

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all()
Example #8
def d():
    X, y = make_circles(noise=0.25, factor=0.5, random_state=1)
    # we rename the classes "blue" and "red" for illustration purposes
    y_named = np.array(["blue", "red"])[y]

    # we can call train_test_split with arbitrarily many arrays;
    # all will be split in a consistent manner
    X_train, X_test, y_train_named, y_test_named, y_train, y_test = \
        train_test_split(X, y_named, y, random_state=0)
    # build the gradient boosting model
    gbrt = GradientBoostingClassifier(random_state=0)
    gbrt.fit(X_train, y_train_named)

    # print("X_test.shape: {}".format(X_test.shape))
    # print("Decision function shape: {}".format(
    #     gbrt.decision_function(X_test).shape))

    # show the first few entries of decision_function
    # print("Thresholded decision function:\n{}".format(gbrt.decision_function(X_test) > 0))
    greater_zero = (gbrt.decision_function(X_test) > 0).astype(int)
    print("分类类型 :\n{}".format(gbrt.classes_))
    print("Shape of probabilities: \n{}".format(gbrt.predict_proba(X_test)))
    print("decision function: \n{}".format(gbrt.decision_function(X_test)))
    print("Predictions:\n{}".format(gbrt.predict(X_test)))
    # print("pred is equal to predictions:{}".format(np.all(gbrt.classes_[greater_zero] == gbrt.predict(X_test))))
    """
    决策边界
    decision_function
        it returns one floating-point number for each sample
    预测可能性
    predict_proba
        a probability for each class, and 
        is often more easily understood than the output of decision_function
        列举说有分类的可能性 sum=1
        大于50%作为predict结果
    
    overfit and 复杂 预测的准确性更高
    
    """
    # plot_dicision(X, X_test, X_train, gbrt, y_test, y_train)

    plot_probla(X, X_test, X_train, gbrt, y_test, y_train)
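
To check how trustworthy those predict_proba values are (the docstring above notes they are easier to interpret than decision_function), scikit-learn's calibration_curve compares predicted probabilities against observed frequencies. A minimal sketch, reusing gbrt, X_test and the integer y_test from d():

    from sklearn.calibration import calibration_curve
    proba_red = gbrt.predict_proba(X_test)[:, 1]  # column order follows gbrt.classes_
    frac_pos, mean_pred = calibration_curve(y_test, proba_red, n_bins=5)
    print(frac_pos, mean_pred)  # close together => well calibrated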
Example #9
def learn(trainX1, cvX1, trainy1, cvy1):
    # use Gradient Boosting to learn
    trainX1 = normalize(trainX1)
    cvX1 = normalize(cvX1)
    gradboot = GradientBoostingClassifier().fit(trainX1, trainy1)

    # calculate AUC-ROC score
    y_score = gradboot.decision_function(cvX1)
    fpr, tpr, _ = roc_curve(cvy1, y_score)
    roc_auc = auc(fpr, tpr)
    return roc_auc
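
Because ROC-AUC depends only on the ranking of the scores and the sigmoid link is monotonic, predict_proba would give the same AUC here; a one-line equivalent:

    from sklearn.metrics import roc_auc_score
    roc_auc = roc_auc_score(cvy1, gradboot.predict_proba(cvX1)[:, 1])  # same value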
def gradient_booster():
    gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
    gbrt.fit(X_train, y_train)
    print("Decision function shape: {}".format(
        gbrt.decision_function(X_test).shape))
    # show the first few entries of the decision function
    print("Decision function:\n{}".format(
        gbrt.decision_function(X_test)[:6, :]))
    print("Argmax of decision functions:\n{}".format(
        np.argmax(gbrt.decision_function(X_test), axis=1)))
    print("Predictions:\n{}".format(gbrt.predict(X_test)))

    # show first few entries of predict_proba
    print("Predicted probabilities:\n{}".format(
        gbrt.predict_proba(X_test)[:6]))
    # show that sums across rows are one
    print("Sums: {}".format(gbrt.predict_proba(X_test)[:6].sum(axis=1)))

    print("Argmax of predicted probabilities:\n{}".format(
        np.argmax(gbrt.predict_proba(X_test), axis=1)))
    print("Predictions:\n{}".format(gbrt.predict(X_test)))
Example #11
def test_max_feature_regression():
    # Test to make sure random state is set properly.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=5,
                                      max_depth=2, learning_rate=.1,
                                      max_features=2, random_state=1)
    gbrt.fit(X_train, y_train)
    deviance = gbrt.loss_(y_test, gbrt.decision_function(X_test))
    assert_true(deviance < 0.5, "GB failed with deviance %.4f" % deviance)
def test_max_feature_regression():
    # Test to make sure random state is set properly.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=5,
                                      max_depth=2, learning_rate=.1,
                                      max_features=2, random_state=1)
    gbrt.fit(X_train, y_train)
    deviance = gbrt.loss_(y_test, gbrt.decision_function(X_test))
    assert deviance < 0.5, "GB failed with deviance %.4f" % deviance
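
The loss_ attribute used above was deprecated in scikit-learn 1.1 and later removed. On recent versions the same check can be written with log_loss, since the binomial deviance equals twice the log loss of the predicted probabilities:

    from sklearn.metrics import log_loss
    deviance = 2 * log_loss(y_test, gbrt.predict_proba(X_test))
    assert deviance < 0.5, "GB failed with deviance %.4f" % deviance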
def test_gbm_classifier_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Run the h2o4gpu version of GradientBoostingClassifier
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run the scikit-learn version of GradientBoostingClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert (gbm.predict(X) == gbm_sk.predict(X)).all()
        assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)).all()
        assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all()
        assert gbm.score(X, y) == gbm_sk.score(X, y)
        assert (gbm.decision_function(X)[1] == gbm_sk.decision_function(X)[1]).all()
        assert np.allclose(list(gbm.staged_predict(X)), list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)), list(gbm_sk.staged_predict_proba(X)))
        assert (gbm.apply(X) == gbm_sk.apply(X)).all()

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_).all()

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all()
Example #14
def test_single_row_gradient_boosting_classifier():
    import shap
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import GradientBoostingClassifier
    import sklearn

    X_train,X_test,Y_train,_ = train_test_split(*shap.datasets.adult(), test_size=0.2, random_state=0)
    clf = GradientBoostingClassifier(random_state=202, n_estimators=10, max_depth=10)
    clf.fit(X_train, Y_train)
    predicted = clf.decision_function(X_test)
    ex = shap.TreeExplainer(clf)
    shap_values = ex.shap_values(X_test.iloc[0,:])
    assert np.abs(shap_values.sum() + ex.expected_value - predicted[0]) < 1e-4, \
        "SHAP values don't sum to model output!"
Example #15
def in_101():
    from sklearn.datasets import load_iris
    from sklearn.ensemble import GradientBoostingClassifier
    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=42)
    gbrt = GradientBoostingClassifier(learning_rate=0.1, random_state=0)
    gbrt.fit(x_train, y_train)

    print(gbrt.decision_function(x_test[0:5]))
    print(gbrt.predict_proba(x_test[0:5]))
    print(y_train[0])
    for i in y_train[0:5]:
        print(iris.target_names[i])
Example #16
def boosting(X_train, y_train, X_test, y_test):
    seed = 7
    num_trees = 100
    # newer sklearn raises if random_state is set without shuffle; kfold is unused below
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    model = GradientBoostingClassifier(
        n_estimators=num_trees, random_state=seed
    ).fit(
        X_train, y_train
    )  # or: model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
    results = model.score(X_test, y_test)
    y_df = model.decision_function(X_test)
    y_pred = model.predict(X_test)
    precisions, recall, t = precision_recall_curve(y_test, y_df, pos_label=1)
    print(precisions[:10], recall[:10], t[:10])
    precision = precisions[0]
    confmat = confusion_matrix(y_test, y_pred)
    return results, precision, confmat
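
precision_recall_curve returns one value per threshold; average_precision_score summarizes the whole curve in a single number, which is often a better return value than the precision at an arbitrary threshold. A sketch reusing y_test and y_df:

    from sklearn.metrics import average_precision_score
    ap = average_precision_score(y_test, y_df)  # area under the PR curve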
Example #17
def gdb_machine(df_list, label_list, penalty=1, scale=False):
    random_state = np.random.RandomState(20180213)
    gdb_results = {
        'prediction': [],
        'probability': [],
        'y_test': [],
        'y_score': []
    }
    try:
        if scale:
            df_list = [scale_df(df) for df in df_list]
            print('DF Scaling successful.')
    except Exception:
        raise ValueError('Failed to execute DF Scaling.')

    for x, y in zip(df_list, label_list):
        try:
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=.2, random_state=random_state)
        except Exception:
            raise ValueError('Train/Test split failed.')

        #df = pd.DataFrame(x_train).assign(outcome=y_train)
        #df = pd.concat([df[df.outcome==1].sample(n=5 * sum(np.array(y_train)==0)), df[df.outcome==0]])
        #x_train = df.drop(['outcome'])
        #y_train = df.outcome
        #del df
        gdb = GradientBoostingClassifier(n_estimators=7,
                                         max_depth=6,
                                         min_samples_split=1780,
                                         min_samples_leaf=1,
                                         random_state=20180320,
                                         max_features=310,
                                         subsample=0.8,
                                         learning_rate=0.11)

        weighting = lambda x: 1 if x else penalty
        gdb.fit(x_train,
                y_train,
                sample_weight=[weighting(i) for i in y_train])

        gdb_results['y_test'].append(y_test)
        gdb_results['prediction'].append(gdb.predict(x_test))
        gdb_results['probability'].append(gdb.predict_proba(x_test)[:, 1])
        gdb_results['y_score'].append(gdb.decision_function(x_test))
    return gdb_results
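
The weighting lambda above can also be written with scikit-learn's helper; a sketch assuming y_train holds 0/1 labels, so the falsy class 0 gets the penalty weight exactly as in `weighting`:

    from sklearn.utils.class_weight import compute_sample_weight
    weights = compute_sample_weight(class_weight={0: penalty, 1: 1}, y=y_train)
    gdb.fit(x_train, y_train, sample_weight=weights)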
class GradientBoosting(Processor):
    def __init__(self,
                 name='rfe',
                 c=1.0,
                 n_estimators=100,
                 keys_correspondences=DEFAULT_KEYS_CORRESPONDENCES):
        super(GradientBoosting, self).__init__(name)
        # GradientBoostingClassifier takes no base estimator: its first
        # positional parameter is `loss`, so the original LinearSVC(C=c)
        # argument (apparently left over from an AdaBoost-style wrapper)
        # would be rejected at fit time and is dropped here; `c` is unused.
        self._model = GradientBoostingClassifier(n_estimators=n_estimators,
                                                 max_depth=1000)
        self.keys_correspondences = keys_correspondences

    def to_dict(self):
        output_dict = {
            'data': np.array(pickle.dumps(self._model)),
        }
        return output_dict

    def from_dict(self, dict):
        self._model = pickle.loads(dict['data'])

    def fit(self, x):
        labels_key = self.keys_correspondences["labels_key"]
        features_key = self.keys_correspondences["features_key"]

        labels = copy.deepcopy(x[labels_key])
        labels[labels > 0] = 1
        self._model.fit(x[features_key], labels)

    def run(self, x):
        features_key = self.keys_correspondences["features_key"]
        scores_key = self.keys_correspondences["scores_key"]
        output_type_key = self.keys_correspondences["output_type_key"]

        x[scores_key] = self._model.decision_function(x[features_key])
        x[output_type_key] = ProcessorOutputType.LIKELIHOOD
        return x

    def __str__(self):
        description = {
            'type': 'Gradient boosting processor',
            'name': self.name
        }
        return str(description)
def test_probability_exponential():
    """Predict probabilities."""
    clf = GradientBoostingClassifier(loss="exponential", n_estimators=100, random_state=1)

    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert np.all(y_proba >= 0.0)
    assert np.all(y_proba <= 1.0)
    score = clf.decision_function(T).ravel()
    assert_array_almost_equal(y_proba[:, 1], 1.0 / (1.0 + np.exp(-2 * score)))

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)
def Uncertainty_eval():
    '''test'''
    X, y = make_circles(noise=0.25, factor=0.5, random_state=1)

    y = np.array(["blue", "red"])[y]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    gb = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
    gb.fit(X_train, y_train)
    print("Decusuib functions: \n{}".format(gb.decision_function(X_test)[:6]))
    print(gb.classes_)
    print("Decusuib functions: \n{}".format(gb.predict_proba(X_test)[:6]))

    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=42)
    gb.fit(X_train, y_train)
    print(gb.score(X_test, y_test), '\n',
          np.argmax(gb.predict_proba(X_test), axis=1))
Example #21
def gradboost(n_trees):
    iris_dataset = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'],
                                                        iris_dataset['target'],
                                                        random_state=0)
    x = []
    y = []
    for i in range(n_trees):
        gbrt = GradientBoostingClassifier(random_state=0,
                                          n_estimators=i + 1,
                                          max_depth=1)
        gbrt.fit(X_train, y_train)
        x.append(gbrt.score(X_train, y_train))
        y.append(gbrt.score(X_test, y_test))
    I = [i + 1 for i in range(n_trees)]
    plt.figure()
    plt.plot(I, x, 'r', I, y, 'k')
    plt.show()
    print(gbrt.decision_function(X_test)[0][0])
    print(gbrt.predict_proba(X_test)[0][0])
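
Refitting a fresh model for every tree count is quadratic in n_trees; staged_predict traces the same accuracy-versus-size curve from a single fit. A sketch under the same X_train/X_test split:

    gbrt = GradientBoostingClassifier(random_state=0, n_estimators=n_trees, max_depth=1)
    gbrt.fit(X_train, y_train)
    test_acc = [np.mean(y_pred == y_test) for y_pred in gbrt.staged_predict(X_test)]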
class GradientBoostingClassifierImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
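
Op is bound elsewhere in the module this wrapper comes from (it presumably aliases sklearn's GradientBoostingClassifier); assuming that, usage is a thin pass-through:

    impl = GradientBoostingClassifierImpl(n_estimators=100, random_state=0)
    impl.fit(X_train, y_train)
    scores = impl.decision_function(X_test)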
Example #23
def test_probability_exponential():
    # Predict probabilities.
    clf = GradientBoostingClassifier(loss='exponential',
                                     n_estimators=100,
                                     random_state=1)

    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert np.all(y_proba >= 0.0)
    assert np.all(y_proba <= 1.0)
    score = clf.decision_function(T).ravel()
    assert_array_almost_equal(y_proba[:, 1], 1.0 / (1.0 + np.exp(-2 * score)))

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)
Example #24
def blight_model():

    # Your code here

    # Read dataset
    df = pd.read_csv("train.csv", encoding='ISO-8859-1', low_memory=False)
    df = df[np.isfinite(df['compliance'])]
    df2 = pd.read_csv("test.csv")
    add_df = pd.read_csv("addresses.csv")
    lat_df = pd.read_csv("latlons.csv")

    # assign y,X for training and X2 for testing

    y = df['compliance'].values
    X = df[['judgment_amount', 'late_fee']].values
    X2 = df2[['judgment_amount', 'late_fee']].values

    # Split dataset into train and test/dev dataset using an inbuilt function of sklearn

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # training the data using Gradient Booster classifier

    clf = GradientBoostingClassifier().fit(X_train, y_train)

    # Predicting various score on the trained model

    y_score = clf.decision_function(X_test)
    print('Score training Set: ' + str(clf.score(X_train, y_train)))
    print('Score test set: ' + str(clf.score(X_test, y_test)))
    print('ROC_AUC Score:' + str(roc_auc_score(y_test, y_score)) + "\n")

    # Testing is important, but measuring it is even more so; we therefore plot
    # the precision-recall curve for the model.
    # The curve shows that our model is more precision-oriented than recall-oriented.

    precision, recall, thresholds = precision_recall_curve(y_test, y_score)
    closest_zero = np.argmin(np.abs(thresholds))
    closest_zero_p = precision[closest_zero]
    closest_zero_r = recall[closest_zero]

    plt.figure()
    plt.xlim([0.0, 1.01])
    plt.ylim([0.0, 1.01])
    plt.plot(precision, recall, label='Precision-Recall Curve')
    plt.plot(closest_zero_p,
             closest_zero_r,
             'o',
             markersize=12,
             fillstyle='none',
             c='r',
             mew=3)
    plt.xlabel('Precision', fontsize=16)
    plt.ylabel('Recall', fontsize=16)
    plt.gca().set_aspect('equal')
    plt.show()

    # plotting roc_auc graph

    fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score)
    roc_auc_lr = auc(fpr_lr, tpr_lr)
    plt.figure()
    plt.xlim([-0.01, 1.00])
    plt.ylim([-0.01, 1.01])
    plt.plot(fpr_lr,
             tpr_lr,
             lw=3,
             label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr))
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.title('ROC curve ', fontsize=16)
    plt.legend(loc='lower right', fontsize=13)
    plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
    plt.gca().set_aspect('equal')
    plt.show()

    ind = df2['ticket_id'].values
    #print(ind)
    pred2 = clf.predict_proba(X2)[:, 1]

    #print(pred)
    #print(np.shape(pred))
    #print(np.shape(ind))

    ans = pd.Series(pred2, ind, dtype='float64')

    return ans
Example #25
def decision_function():
    # The circles dataset consists of one large and one small circle of points.
    # Prepare a noisy circles dataset.
    from sklearn.datasets import make_circles
    from sklearn.model_selection import train_test_split
    X, y = make_circles(noise=0.25, factor=0.5, random_state=1)
    y_named = np.array(['blue', 'red'])[y]
    X_train, X_test, y_train_named, y_test_named, y_train, y_test = train_test_split(
        X, y_named, y, random_state=seed)

    # Build the gradient boosting model
    from sklearn.ensemble import GradientBoostingClassifier
    gbdt = GradientBoostingClassifier(random_state=seed)
    gbdt.fit(X_train, y_train_named)

    # 2.4.1. The decision function
    print('Shape of the test set: {}'.format(X_test.shape))
    decision_function_values = gbdt.decision_function(X_test)
    # For binary classification the decision function is one-dimensional:
    # only the score of the "positive" class (classes_[1]) is stored.
    print('Shape of the decision function: {}'.format(decision_function_values.shape))
    print('Decision function values on the test set:\n{}'.format(decision_function_values))
    print('Thresholded decision function values:\n{}'.format(decision_function_values > 0))
    print('Model predictions on the test set:\n{}'.format(gbdt.predict(X_test)))

    # Convert the booleans to 0 and 1
    greater_zero = (decision_function_values > 0).astype(int)
    # Convert 0 and 1 to class names
    pred = gbdt.classes_[greater_zero]

    print('Thresholded decision function equals the model predictions: {}'.format(
        np.all(pred == gbdt.predict(X_test))))

    print('-' * 20)
    print("The raw output of the decision function is hard to interpret.")
    print('Decision function minimum: {:.2f} and maximum: {:.2f}'.format(
        np.min(decision_function_values), np.max(decision_function_values)))

    fig, axes = plt.subplots(1, 2, figsize=(13, 5))

    mglearn.tools.plot_2d_separator(gbdt,
                                    X,
                                    ax=axes[0],
                                    alpha=.4,
                                    fill=True,
                                    cm=mglearn.cm2)

    scores_image = mglearn.tools.plot_2d_scores(gbdt,
                                                X,
                                                ax=axes[1],
                                                alpha=.4,
                                                cm=mglearn.ReBl)

    from mglearn import discrete_scatter
    for ax in axes:
        discrete_scatter(X_test[:, 0],
                         X_test[:, 1],
                         y_test,
                         markers=['^'],
                         ax=ax)
        discrete_scatter(X_train[:, 0],
                         X_train[:, 1],
                         y_train,
                         markers=['o'],
                         ax=ax)
        ax.set_xlabel('Feature 0')
        ax.set_ylabel('Feature 1')

    plt.colorbar(scores_image, ax=axes.tolist())
    axes[0].legend(
        ['Test class 0', 'Test class 1', 'Train Class 0', 'Train Class 1'],
        ncol=4,
        loc=(.1, 1.1))
    plt.title("图2-55 梯度提升模型在一个二维圆数据集上的决策边界(左)和决策函数(右)")
Example #26
def multi_classes():
    # The iris dataset is a three-class dataset.
    from sklearn.datasets import load_iris
    iris = load_iris()

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=seed)

    from sklearn.ensemble import GradientBoostingClassifier
    gbdt = GradientBoostingClassifier(learning_rate=0.01, random_state=seed)
    gbdt.fit(X_train, y_train)
    print('=' * 20)
    print("Fitting a GBDT on the iris dataset")

    # Each column holds the "certainty score" for one class:
    #   - the higher the score, the more likely the class
    decision_func_values = gbdt.decision_function(X_test)
    print('-' * 20)
    print('Shape of the decision function values: {}'.format(decision_func_values.shape))
    print('First six decision function values:\n{}'.format(decision_func_values[:6]))

    # argmax_decision_func = np.argmax(decision_func_values, axis = 1)
    argmax_decision_func = decision_func_values.argmax(axis=1)
    print('-' * 20)
    print('Argmax of the decision function values:\n{}'.format(argmax_decision_func))

    predict_prob = gbdt.predict_proba(X_test)
    print('-' * 20)
    print('Shape of the predicted probabilities: {}'.format(predict_prob.shape))
    print('First six predicted probabilities:\n{}'.format(predict_prob[:6]))

    # argmax_predict_prob = np.argmax(predict_prob, axis = 1)
    argmax_predict_prob = predict_prob.argmax(axis=1)
    print('-' * 20)
    print('Argmax of the predicted probabilities:\n{}'.format(argmax_predict_prob))

    predict_result = gbdt.predict(X_test)
    print('-' * 20)
    print('Predictions on the test set:\n{}'.format(predict_result))
    print('-' * 20)
    print("Does the argmax of the decision function match the predictions?",
          np.all(argmax_decision_func == predict_result))
    print("Does the argmax of the predicted probabilities match the predictions?",
          np.all(argmax_predict_prob == predict_result))

    from sklearn.linear_model import LogisticRegression
    log_reg = LogisticRegression(solver='lbfgs',
                                 multi_class='auto',
                                 max_iter=10000)
    named_target = iris.target_names[y_train]
    log_reg.fit(X_train, named_target)
    print('=' * 20)
    print("Fitting a LogisticRegression on the iris dataset")
    print('-' * 20)
    print('Classes in the training set: {}'.format(log_reg.classes_))

    decision_func_values = log_reg.decision_function(X_test)
    print('-' * 20)
    print('Shape of the decision function values: {}'.format(decision_func_values.shape))
    print('First six decision function values:')
    print(decision_func_values[:6])

    # argmax_dec_func = np.argmax(decision_func_values, axis = 1)
    argmax_dec_func = decision_func_values.argmax(axis=1)
    print('-' * 20)
    print('First ten argmax entries of the decision function values:')
    print(argmax_dec_func[:10])
    print("First ten argmax entries mapped through the classifier's classes_ attribute:")
    print(log_reg.classes_[argmax_dec_func][:10])

    predict_prob = log_reg.predict_proba(X_test)
    print('-' * 20)
    print('Shape of the predicted probabilities: {}'.format(predict_prob.shape))
    # argmax_predict_prob = np.argmax(predict_prob, axis = 1)
    argmax_predict_prob = predict_prob.argmax(axis=1)
    print('-' * 20)
    print('First ten argmax entries of the predicted probabilities:')
    print(argmax_predict_prob[:10])
    print("First ten argmax entries mapped through the classifier's classes_ attribute:")
    print(log_reg.classes_[argmax_predict_prob][:10])

    predict_result = log_reg.predict(X_test)
    print('-' * 20)
    print('First ten predictions on the test set:')
    print(predict_result[:10])

    print('-' * 20)
    # Note: this must use argmax_dec_func (the LogisticRegression argmax),
    # not argmax_decision_func left over from the GBDT section above.
    print("Does the argmax of the decision function match the predictions?",
          np.all(log_reg.classes_[argmax_dec_func] == predict_result))
    print("Does the argmax of the predicted probabilities match the predictions?",
          np.all(log_reg.classes_[argmax_predict_prob] == predict_result))
Example #27
    print("Accuracy score (training): {0:.3f}".format(
        gb.score(X_train_sub, y_train_sub)))
    print("Accuracy score (validation): {0:.3f}".format(
        gb.score(X_validation_sub, y_validation_sub)))
    print()

# Output confusion matrix and classification report of Gradient Boosting algorithm on validation set
gb = GradientBoostingClassifier(n_estimators=20,
                                learning_rate=0.5,
                                max_features=2,
                                max_depth=2,
                                random_state=0)
gb.fit(X_train_sub, y_train_sub)
predictions = gb.predict(X_validation_sub)

# ROC curve and Area-Under-Curve (AUC)
y_scores_gb = gb.decision_function(X_validation_sub)
fpr_gb, tpr_gb, _ = roc_curve(y_validation_sub, y_scores_gb)
roc_auc_gb = auc(fpr_gb, tpr_gb)

print("Area under ROC curve = {:0.2f}".format(roc_auc_gb))

print("Confusion Matrix:")
print(confusion_matrix(y_validation_sub, predictions))
print()
print("Classification Report")
print(classification_report(y_validation_sub, predictions))

if __name__ == '__main__':
    pass
def gbdt_plus_liner_classifier_grid_search(stack_setting_,
                                           upper_param_keys=None, upper_param_vals=None,
                                           lower_param_keys=None, lower_param_vals=None,
                                           num_proc=None):

    """
     upper model is GBDT or Random Forest
     lower model is Linear Classifier
    """
    if stack_setting_ is None:
        sys.stderr.write('You have no setting Json file\n')
        sys.exit()

    if num_proc is None:
        num_proc = 6
    upper_best_params = None
    lower_best_param = None


    # 1. upper model
    if upper_param_keys is None:
        upper_param_keys = ['model_type', 'n_estimators', 'loss', 'random_state', 'subsample', 'max_features', 'max_leaf_nodes', 'learning_rate', 'max_depth', 'min_samples_leaf']

    if upper_param_vals is None:
        upper_param_vals = [[GradientBoostingClassifier], [100], ['deviance'], [0], [0.1], [5], [20], [0.1], [2], [8]]


    # grid search for upper model : GBDT or Random Forest
    # ExperimentL1 is model-agnostic; the data, on the other hand, is fixed
    exp = ExperimentL1(data_folder = stack_setting_['0-Level']['folder'],
                       train_fname = stack_setting_['0-Level']['train'], 
                       test_fname = stack_setting_['0-Level']['test'])


    model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_train_fname = os.path.join(Config.get_string('data.path'), 
                                     model_folder, 
                                     model_train_fname)
    model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    model_test_fname = os.path.join(Config.get_string('data.path'), 
                                    model_folder, 
                                    model_test_fname)
    upper_param_dict = dict(zip(upper_param_keys, upper_param_vals))
    if os.path.isfile(model_train_fname) is False and \
            os.path.isfile(model_test_fname) is False:
        #upper_param_dict['model_type'] == [GradientBoostingClassifier]
        del upper_param_dict['model_type']        
        clf = GradientBoostingClassifier()
        clf_cv = GridSearchCV(clf, upper_param_dict, 
                              verbose = 10, 
                              scoring = stack_setting_['1-Level']['gbdt_linear']['upper']['metrics'],#scoring = "precision" or "recall" or "f1"
                              n_jobs = num_proc, cv = 5)
        
        X_train, y_train = exp.get_train_data()
        clf_cv.fit(X_train, y_train)
        upper_best_params = clf_cv.best_params_
        print(upper_best_params)
        del clf_cv
        clf.set_params(**upper_best_params)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        X_test, y_test = exp.get_test_data()
        for i, pred in enumerate(clf.staged_predict(X_test)):
            test_loss[i] = clf.loss_(y_test, pred)

        graph_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['folder']
        graph_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['name']
        graph_fname = os.path.join(Config.get_string('data.path'), 
                                   graph_folder, 
                                   graph_fname)
        gs = GridSpec(2,2)
        ax1 = plt.subplot(gs[0,1])
        ax2 = plt.subplot(gs[1:,1])
        ax3 = plt.subplot(gs[:,0])

        #ax1.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
        #ax1.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
        #ax1.set_xlabel('the number of weak learner')
        #ax1.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        #ax1.legend(loc="best") 
        confidence_score = clf.decision_function(X_test)
        #sns.distplot(confidence_score, kde=False, rug=False, ax=ax1)
        num_bins = 100
        # NumPy removed the `normed` argument; `density` is its replacement
        try:
            counts, bin_edges = np.histogram(confidence_score, bins=num_bins, density=True)
        except Exception:
            counts, bin_edges = np.histogram(confidence_score, density=True)
        cdf = np.cumsum(counts)
        ax1.plot(bin_edges[1:], cdf / cdf.max())
        ax1.set_ylabel('CDF')
        ax1.set_xlabel('Decision_Function:Confidence_Score', fontsize=10)

        # dump for the transformated feature
        clf = TreeTransform(GradientBoostingClassifier(),
                            best_params_ = upper_best_params)
        if type(X_train) == pd.core.frame.DataFrame:
            clf.fit(X_train.as_matrix().astype(np.float32), y_train)
        elif type(X_train) == np.ndarray:
            clf.fit(X_train.astype(np.float32), y_train)

        # train result
        train_loss = clf.estimator_.train_score_
        test_loss = np.zeros((len(clf.estimator_.train_score_),), dtype=np.float32)

        if type(X_train) == pd.core.frame.DataFrame:
            for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.as_matrix().astype(np.float32))):
                test_loss[iter] = clf.estimator_.loss_(y_test, y_pred)
        elif type(X_train) == np.ndarray:
            for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.astype(np.float32))):
                test_loss[iter] = clf.estimator_.loss_(y_test, y_pred)
        ax2.plot(train_loss, label="train_loss")
        ax2.plot(test_loss, label="test_loss")
        ax2.set_xlabel('Boosting Iterations')
        ax2.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax2.legend(loc="best")

        # tree ensambles
        score_threshold=0.8
        index2feature = dict(zip(np.arange(len(X_train.columns.values)), X_train.columns.values))
        feature_importances_index = [str(j) for j in clf.estimator_.feature_importances_.argsort()[::-1]]
        feature_importances_score = [clf.estimator_.feature_importances_[int(j)] for j in feature_importances_index]
        fis = pd.DataFrame(
            {'name':[index2feature.get(int(key),'Null') for key in feature_importances_index],
             'score':feature_importances_score}
            )
        if len(fis.index) > 20:
            score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
            # where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
            where_str = 'score >= %f' % (score_threshold)
            fis = fis.query(where_str)

        sns.barplot(x = 'score', y = 'name',
                    data = fis,
                    ax=ax3,
                    color="blue")
        ax3.set_xlabel("Feature_Importance", fontsize=10)
        plt.tight_layout()
        plt.savefig(graph_fname)
        plt.close()

        #print clf.toarray().shape
        # >(26049, 100)
        #input_features = 26049, weak_learners = 100
        #print len(one_hot.toarray()[:,0]), one_hot.toarray()[:,0]
        #print len(one_hot.toarray()[0,:]), one_hot.toarray()[0,:]

        ## feature transformation : get test data from train trees
        #print transformated_train_features.shape, X_train.shape
        #print transformated_test_features.shape, X_test.shape

        transformated_train_features = clf.one_hot_encoding
        if type(X_test) == pd.core.frame.DataFrame:
            transformated_test_features = clf.transform(X_test.as_matrix().astype(np.float32), 
                                                        y_test)
        elif type(X_train) == np.ndarray:
            transformated_test_features = clf.transform(X_test, y_test)

        #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
        #model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
        #model_train_fname = os.path.join(Config.get_string('data.path'), 
        #                                 model_folder, 
        #                                 model_train_fname)
        with gzip.open(model_train_fname, "wb") as gf:
            cPickle.dump([transformated_train_features, y_train], 
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

        #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
        #model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
        #model_test_fname = os.path.join(Config.get_string('data.path'), 
        #                                model_folder, 
        #                                model_test_fname)
        with gzip.open(model_test_fname, "wb") as gf:
            cPickle.dump([transformated_test_features, y_test],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)


    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']

    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64], ['uniform', 'distance'],
                            ['ball_tree'], [30], ['minkowski'], [2], [4]]

    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))
    clf_lower_model = None
    clf_lower_mname = None

    # grid search for lower model : Linear Classifier
    # ExperimentL1_1 is model-agnostic; the data, on the other hand, is fixed
    if lower_param_dict['model_type'] == [LogisticRegression] and lower_param_dict['penalty'] == ['l1']:
        # Logistic Regression
        clf_lower_model = LogisticRegression()
        clf_lower_mname = 'LR-L1'

    elif lower_param_dict['model_type'] == [LogisticRegression] and lower_param_dict['penalty'] == ['l2']:
        # Logistic Regression
        clf_lower_model = LogisticRegression()
        clf_lower_mname = 'LR-L2'

    elif lower_param_dict['model_type'] == [LinearSVC] and lower_param_dict['penalty'] == ['l1']:
        # SVM L1
        clf_lower_model = LinearSVC()
        clf_lower_mname = 'SVM-L1'

    elif lower_param_dict['model_type'] == [LinearSVC] and lower_param_dict['penalty'] == ['l2']:
        # SVM L1
        clf_lower_model = LinearSVC()
        clf_lower_mname = 'SVM-L2'

    else:
        sys.stderr.write("You should input lower liner model\n")
        sys.exit()

    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'],
                         train_fname = model_train_fname, 
                         test_fname = model_test_fname)
    # GridSearch has a single model; the model is determined by the params
    gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                    cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'],
                    cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'], 
                    cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'], 
                    refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out'])
    lower_best_param, lower_best_score = gs.search_by_cv()
    print(lower_best_param)

    # get meta_feature
    meta_train_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[-1]
        )
    meta_test_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[-1]
        )
    meta_header_ = "%s_%s,%s" % (
        ",".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'].split(",")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'].split(",")[-1]
        )
    exp.write2csv_meta_feature(
        model = clf_lower_model,
        meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'],
        meta_train_fname = meta_train_fname_,
        meta_test_fname = meta_test_fname_,
        meta_header = meta_header_,
        best_param_ = lower_best_param
        )

    ## best parameters for GBDT and another sklearn classifier
    #return best_param, best_score
    
    if upper_best_params is None:
        upper_best_params = stack_setting_['1-Level']['gbdt_linear']['upper']['best_parameter']

    return upper_best_params, lower_best_param
Example #29

X_train, X_test, y_train, y_test, ind_train, ind_test = load_data(full=False)

clf = GradientBoostingClassifier(n_estimators=500, max_depth=6,
                                 learning_rate=0.1, max_features=256,
                                 min_samples_split=15, verbose=3,
                                 random_state=13)
print('_' * 80)
print('training')
print()
print(clf)
clf.fit(X_train, y_train)

if y_test is not None:
    # sklearn's old auc_score was removed; roc_auc_score is its replacement
    from sklearn.metrics import roc_auc_score
    print(clf)

    y_scores = clf.decision_function(X_test).ravel()
    print("AUC: %.6f" % roc_auc_score(y_test, y_scores))

    if generate_report:
        from error_analysis import error_report

        data = np.load("data/train.npz")
        X = data['X_train']
        X_test_raw = X[ind_test]
        error_report(clf, X_test_raw, y_test, y_scores=y_scores, ind=ind_test)

np.savetxt("gbrt3.txt", clf.decision_function(X_test))
class ClassifierModeling:
    def __init__(self,
                 model_name,
                 X_train=None,
                 y_train=None,
                 X_test=None,
                 y_test=None,
                 kfold=None):

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.kfold = kfold
        self.y_pred = None
        self.model_name = model_name
        if self.model_name == "RandomForestClassifier":
            self.model = RandomForestClassifier()
        elif self.model_name == "LogisticRegression":
            self.model = LogisticRegression(solver='saga', random_state=0)
        elif self.model_name == "DecisionTreeClassifier":
            self.model = DecisionTreeClassifier()
        elif self.model_name == "XG_Boost":
            data_dmatrix = xgb.DMatrix(data=self.X_train, label=self.y_train)
            self.model = xgb.XGBClassifier()
            print()
        elif self.model_name == "Multilayer Perceptron":
            print("not implemented yet")
        elif self.model_name == "svm":
            self.model = svm.SVC(kernel='linear', C=0.01)
        elif self.model_name == "adboost":
            self.model = AdaBoostClassifier()
        elif self.model_name == "gradienBoost":
            self.model = GradientBoostingClassifier()

    def fit(self):
        print("fitting the ", self.model_name)
        self.model.fit(self.X_train, self.y_train)

    def get_predicate(self):
        print("predicting by ", self.model_name)
        self.y_pred = pd.Series(self.model.predict(self.X_test),
                                name="predict")
        return self.y_pred

    def get_MSE(self):
        return mean_squared_error(self.y_test, self.y_pred)

    def get_score(self):
        return -(r2_score(self.y_test, self.y_pred))

    def get_loss(self):
        return np.sqrt(mean_squared_error(self.y_test, self.y_pred))

    def validate_model(self):
        print("validate the model")

        model_fit = pd.DataFrame()
        model_fit = pd.concat([self.y_pred, self.y_test], axis=1)
        matrix = confusion_matrix(self.y_test, self.y_pred)

        fig, axs = plt.subplots(1, 3, squeeze=False, figsize=(15, 3))
        plt.rcParams.update({'font.size': 10})
        d = plot_confusion_matrix(self.model,
                                  self.X_test,
                                  self.y_test,
                                  display_labels=["yes", "no"],
                                  cmap=plt.cm.Blues,
                                  ax=axs[0, 2])
        d.ax_.set_title("{} confusion matrix".format(self.model_name))
        total = float(len(model_fit))

        for ax in axs.flatten():
            plt.rcParams.update({'font.size': 16})

            for i, var in enumerate(model_fit.columns):
                ax = sns.countplot(x=var, data=model_fit, ax=axs[0][i])
                ax.set_title(self.model_name)
                for p in ax.patches:
                    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
                    x = p.get_x() + p.get_width()
                    y = p.get_height()
                    ax.annotate(percentage, (x, y), ha='center')
        #fig.savefig("https://github.com/muluwork-shegaw/10Academy-week6/blob/master/data/{}.png".format(self.model_name))

        return matrix, model_fit

    def get_eff_model(self):
        if self.model_name != "svm":
            print("calculate  model performance ")
            metrics = pd.DataFrame()
            metrics["model"] = [self.model_name]
            metrics["MSE"] = mean_squared_error(self.y_test, self.y_pred)
            metrics["Loss"] = np.sqrt(
                mean_squared_error(self.y_test, self.y_pred))
            metrics["Score"] = -(r2_score(self.y_test, self.y_pred))
            metrics["Kappa"] = cohen_kappa_score(self.y_test, self.y_pred)
            metrics["ROC_Auc"] = roc_auc_score(self.y_test, self.y_pred)
            metrics["precision"] = precision_score(self.y_test, self.y_pred)
            metrics["recall"] = recall_score(self.y_test, self.y_pred)
            metrics["f1_score"] = f1_score(self.y_test, self.y_pred)
            metrics["accuracy"] = accuracy_score(self.y_test, self.y_pred)

            return metrics

    def get_accuracy_with_kfold(self):

        return cross_val_score(self.model,
                               self.X_test,
                               self.y_test,
                               cv=self.kfold,
                               scoring='accuracy').mean()

    def get_loss_with_kfold(self, valid_data, valid_targ, k_fold):
        return -(cross_val_score(self.model,
                                 self.X_test,
                                 self.y_test,
                                 cv=self.kfold,
                                 scoring='neg_log_loss').mean())

    def eff_model_with_kfold(self):

        if self.model_name != "svm":
            print("calculate  model performance with stratified k_fold")

            scoring = [
                "accuracy", "roc_auc", "neg_log_loss", "r2",
                "neg_mean_squared_error", "neg_mean_absolute_error"
            ]

            metrics = pd.DataFrame()
            metrics["model"] = [self.model_name]
            for scor in scoring:
                score = []
                result = cross_val_score(estimator=self.model,
                                         X=self.X_test,
                                         y=self.y_test,
                                         cv=self.kfold,
                                         scoring=scor)
                score.append(result.mean())

                metrics[scor] = pd.Series(score)

            return metrics

    def get_feature_impo(self):
        if self.model_name != "LogisticRegression":
            feat_importance = pd.Series(self.model.feature_importances_,
                                        index=self.X_train.columns)
            feat_importance.plot(kind='bar')
            plt.show()
            return feat_importance
        return None  # LogisticRegression exposes no feature_importances_; see get_summary

    def get_summary(self):  # feature importance (Wald z-test) for logistic regression
        if self.model_name == "LogisticRegression":
            X = self.X_train
            # Fisher information matrix of the fitted logistic model
            denom = 2.0 * (1.0 + np.cosh(self.model.decision_function(X)))
            denom = np.tile(denom, (X.shape[1], 1)).T
            F_ij = np.dot((X / denom).T, X)
            Cramer_Rao = np.linalg.inv(F_ij)  # inverse information matrix
            sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
            # z-score for each model coefficient
            z_scores = self.model.coef_[0] / sigma_estimates
            # two-tailed test for p-values
            p_values = [stat.norm.sf(abs(x)) * 2 for x in z_scores]

            summary = pd.DataFrame()
            summary["features"] = self.X_train.columns
            summary["z_score"] = z_scores
            summary["p_value"] = p_values
            sns.barplot(x="features", y="p_value", data=summary)
            return summary
        return None

    def save_model(self):

        now = datetime.datetime.now().strftime('%Y-%m-%d')
        # Saving model to disk
        filename = now + '.pkl'
        pickle.dump(self.model, open(filename, 'wb'))
        return filename

    def make_it_stratified(self,
                           data,
                           target,
                           reduction_model='pca',
                           dim=7,
                           show=False):
        '''
        Use stratified k-fold cross-validation with imbalanced datasets to
        preserve the class distribution in the train and test sets for each
        evaluation of a given model (see the usage sketch after this example).
        '''
        X = data.drop(target, axis=1)
        y = data[target]

        eff = []
        model_pred = []
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

        # enumerate the splits and evaluate the model on each fold
        i = 0
        for train_ix, test_ix in kfold.split(X, y):
            i += 1
            print("k_fold {} with {} model".format(i, self.model_name))

            if reduction_model == 'pca':  # using PCA
                pca = PCA(n_components=dim)
                # reduce the dimension; rows stay aligned with y for the fold selection below
                reduced_df = pca.fit_transform(X)
            elif reduction_model == 'tsne':  # using TSNE
                # barnes_hut only supports n_components < 4, so use the exact method
                tsne = TSNE(n_components=dim, n_iter=300, method='exact')
                reduced_df = tsne.fit_transform(X)
            # select rows for this fold
            self.X_train, self.X_test = reduced_df[train_ix], reduced_df[test_ix]
            self.y_train, self.y_test = y.iloc[train_ix], y.iloc[test_ix]

            self.fit()
            self.get_predicate()
            if show:
                matrix, model_fit = self.validate_model()
                model_pred.append(model_fit)
            eff.append(self.get_eff_model())

        df_eff = pd.concat(eff)

        df_eff = pd.DataFrame(df_eff.mean()).transpose()
        df_eff.index = [self.model_name]

        return df_eff, model_pred
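# A minimal usage sketch of the stratification idea behind make_it_stratified
# (hypothetical data, not part of the original class): every StratifiedKFold
# test fold keeps the class ratio of the full dataset.
import numpy as np
from sklearn.model_selection import StratifiedKFold

y_demo = np.array([0] * 90 + [1] * 10)          # imbalanced labels, 9:1
X_demo = np.random.RandomState(0).rand(100, 3)  # hypothetical features

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for train_ix, test_ix in skf.split(X_demo, y_demo):
    # each test fold holds 18 of class 0 and 2 of class 1 (the same 9:1 ratio)
    print(np.bincount(y_demo[test_ix]))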
Example #31
0
histogram_base.SetTitle("")
histogram_base.SetStats(False)
histogram_base.SetMinimum(0.001)
histogram_base.SetMaximum(10.)
histogram_base.GetXaxis().SetTitle("Signal Eff.")
histogram_base.GetYaxis().SetTitle("Background Eff.")
histogram_base.Draw("hist")

x_train = array.array("f", [0])
y_train = array.array("f", [0])
x_test = array.array("f", [0])
y_test = array.array("f", [0])

effs = np.linspace(0, 1, 50)

train_scores = cls.decision_function(d_train)
fpr_tr, tpr_tr, thresholds_tr = sklearn.metrics.roc_curve(t_train,
                                                          train_scores,
                                                          pos_label=None)
for eff in effs:
    #print('Fake rate at signal eff', eff, fpr_tr[np.argmax(tpr_tr > eff)])
    x_train.append(eff)
    y_train.append(fpr_tr[np.argmax(tpr_tr > eff)])

print('from test sample')
test_scores = cls.decision_function(d_test)
fpr_te, tpr_te, thresholds_te = sklearn.metrics.roc_curve(t_test,
                                                          test_scores,
                                                          pos_label=None)
for eff in effs:
    #print('Fake rate at signal eff', eff, fpr_te[np.argmax(tpr_te > eff)])
    x_test.append(eff)
    y_test.append(fpr_te[np.argmax(tpr_te > eff)])
Example #32
0
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import mglearn

from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, \
            iris.target, random_state=42)
gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
gbrt.fit(X_train, y_train)

print("Decision function shape: {}".format(
    gbrt.decision_function(X_test).shape))
print("Decision function: {}".format(gbrt.decision_function(X_test[:6, :])))

print("Argmax of decision function:\n{}".format(\
      np.argmax(gbrt.decision_function(X_test), axis=1)))
print("Predictions:\n{}".format(gbrt.predict(X_test)))

print("Predicted probabilities:\n{}".format(gbrt.predict_proba(X_test[:6])))
print("Sums: {}".format(gbrt.predict_proba(X_test[:6]).sum(axis=1)))

print("Argmax of predicted probabilities:\n{}".format(\
      np.argmax(gbrt.predict_proba(X_test), axis=1)))
print("Predictions:\n{}".format(gbrt.predict(X_test)))

logreg = LogisticRegression()
named_target = iris.target_names[y_train]
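
# A minimal continuation sketch (assuming the intent, as in the earlier binary
# example, is to show that classes_ stores the string labels):
logreg.fit(X_train, named_target)
print("unique classes in training data: {}".format(logreg.classes_))
# predictions come back as the string names rather than integers
print("predictions: {}".format(logreg.predict(X_test)[:10]))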
Example #33
0
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_blobs, make_circles
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import mglearn

X, y = make_circles(noise=0.25, factor=0.5)

named_y = np.array(['blue', 'red'])[y]
X_train, X_test, y_train, y_test, named_y_train, named_y_test = \
    train_test_split(X, y, named_y)
gbc = GradientBoostingClassifier().fit(X_train, named_y_train)
print(gbc.predict(X_test))
print('X_test shape:', X_test.shape)
print('Decision function shape:', gbc.decision_function(X_test).shape)
print('Decision function:', gbc.decision_function(X_test))
print('Probabilities:', gbc.predict_proba(X_test))

fig, axes = plt.subplots(1, 3, figsize=(20, 5))
mglearn.tools.plot_2d_separator(gbc,
                                X,
                                ax=axes[0],
                                alpha=0.4,
                                fill=True,
                                cm=mglearn.cm2)
scores_images_df = mglearn.tools.plot_2d_scores(gbc,
                                                X,
                                                ax=axes[1],
                                                alpha=.4,
                                                cm=mglearn.ReBl)
Example #34
0
    return ceo

ceo_lst = []
for sent in ceo_sentences:
    ceo = return_ceos(sent)
    ceo_lst.extend(ceo)

ceo_lst = list(set(ceo_lst))
ceo_df = pd.DataFrame({'ceo': ceo_lst})
ceo_df.to_csv('results/ceo_matches.csv')

## CEO HIGH-CONFIDENCE FALSE POSITIVES
# include high-confidence false positives, as these may be
# CEOs not present in the training set

ceo_conf_train = gbc.decision_function(X_per_train)
ceosFP_train = y_per_train[(y_per_train != y_train_gbm) & (y_train_gbm==1) & (ceo_conf_train >= 2.0)]

ceo_conf_test = gbc.decision_function(X_per_test)
ceosFP_test = y_per_test[(y_per_test != y_test_gbm) & (y_test_gbm==1) & (ceo_conf_test >= 2.0)]

ceosFP_idx = ceosFP_train.index.append(ceosFP_test.index).to_list()
ceoFP_df = person_df.iloc[ceosFP_idx]
ceoFP_sent = [nlp(sent) for sent in ceoFP_df.sentences.to_list()]

ceoFP_lst = []
for sent in ceoFP_sent:
    ceoFP = return_ceos(sent)
    ceoFP_lst.extend(ceoFP)

ceoFP_lst = list(set(ceoFP_lst))
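
# For context on the 2.0 cutoff above (a sketch, not part of the original
# snippet): a binary GradientBoostingClassifier's decision function is a
# log-odds score, so the threshold can be read as a class probability.
import numpy as np
from scipy.special import expit

print(expit(np.array([0.0, 1.0, 2.0, 3.0])))  # [0.5  0.731 0.881 0.953]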
Example #35
0
import numpy as np
from sklearn.datasets import make_blobs, make_circles
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

X, y = make_circles(noise=0.25, factor=0.5, random_state=1)
X
y

# rename the y label using blue and red
y_named = np.array(['blue', 'red'])[y]

X_train, X_test, y_train_named, y_test_named, y_train, y_test = train_test_split(X, y_named, y, random_state=0)


gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train_named)

# for binary classification, decision_function returns an array of shape (n_samples,)
gbrt.decision_function(X_test)
X_test.shape
gbrt.decision_function(X_test).shape

# for binary classification, the negative class is the first entry of the classes_ attribute
gbrt.decision_function(X_test) > 0
gbrt.predict(X_test)


# make true/false into 0 and 1
greater_zero = (gbrt.decision_function(X_test) > 0).astype(int)
pred = gbrt.classes_[greater_zero]

pred
gbrt.predict(X_test)
# these two are the same
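
# The same recovery works from predict_proba: argmax over the probability
# columns indexes into classes_ exactly like the thresholded decision
# function (a short check using the gbrt fitted above):
proba = gbrt.predict_proba(X_test)
pred_from_proba = gbrt.classes_[np.argmax(proba, axis=1)]
print(np.all(pred_from_proba == gbrt.predict(X_test)))  # True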
Example #36
0
    def run(self):

        if not self.verify_data():
            print ("\x1b[31mERROR: training input data array shapes are incompatible!\x1b[0m")
            raise Exception("BadTrainingInputData")

        applyClassWeights = False
        if self.parameters['classifier'] == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'RandomForestClassifier':
            clf = RandomForestClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'FT_GradientBoostingClassifier':
            rt = RandomTreesEmbedding(max_depth=3, n_estimators=20, random_state=0)
            clf0 = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
            clf = make_pipeline(rt, clf0)
        elif self.parameters['classifier'] == 'XGBClassifier':
            clf = XGBClassifier(
                    learning_rate=self.parameters['learning_rate'],
                    max_depth=self.parameters['max_depth'],
                    n_estimators=self.parameters['n_estimators'],
                    objective='binary:logitraw',
                    colsample_bytree=self.parameters['colsample_bytree'],
                    subsample=self.parameters['subsample'],
                    min_child_weight=self.parameters['min_child_weight'],
                    gamma=self.parameters['gamma'] if 'gamma' in self.parameters else 0.0,
                    #reg_alpha=8,
                    reg_lambda=self.parameters['reg_lambda'] if 'reg_lambda' in self.parameters else 1.0,
                    reg_alpha=self.parameters['reg_alpha'] if 'reg_alpha' in self.parameters else 0.0,
                    ) 
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'MLPClassifier':
            classifierParams = {k: v for k, v in self.parameters.items() if k in ['solver', 'alpha', 'hidden_layer_sizes', 'max_iter', 'warm_start', 'learning_rate_init', 'learning_rate', 'momentum', 'epsilon', 'beta_1', 'beta_2', 'validation_fraction', 'early_stopping']}
            clf = MLPClassifier(**classifierParams) 
        elif self.parameters['classifier'] in ['SVC', 'LinearSVC']:
            '''
            clf = SVC(
                        C=1.0,
                        cache_size=4000,
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=3,
                        gamma='auto',
                        kernel='rbf',
                        max_iter=100000,
                        probability=False,
                        random_state=None,
                        shrinking=True,
                        tol=0.001,
                        verbose=True
                    )
            '''
            bagged = int(self.parameters['bagged']) if 'bagged' in self.parameters else False
            if self.parameters['classifier'] == 'LinearSVC':
                clf = LinearSVC(
                            class_weight='balanced',
                            dual=self.parameters['dual'],
                            max_iter=self.parameters['max_iter'],
                            C=self.parameters['C'],
                            penalty=self.parameters['penalty'],
                            loss=self.parameters['loss'],
                            tol=self.parameters['tol'],
                            verbose=True,
                        )
            else:
                # classifier='SVC':C=random.choice([1.0, 10.0, 100.0, 500.0, 1000.0]):kernel=random.choice(['rbf','poly','linear']):degree=random.choice([2,3,4]):gamma=random.choice(['auto', 0.1, 0.3, 0.6]):shrinking=random.choice([True, False]):max_iter=10000:penalty=random.choice(['l1','l2']):tol=random.choice([0.005, 0.001, 0.0005, 0.0001]):cache_size=1000
                clf =  SVC(
                        C=self.parameters['C'],
                        cache_size=self.parameters['cache_size'],
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=self.parameters['degree'],
                        gamma=self.parameters['gamma'],
                        kernel=self.parameters['kernel'],
                        max_iter=self.parameters['max_iter'],
                        probability=False,
                        random_state=None,
                        shrinking=self.parameters['shrinking'],
                        tol=self.parameters['tol'],
                        verbose=True
                    )

            if bagged:
                n_estimators = bagged
                if 'bag_oversampling' in self.parameters:
                    n_estimators = int(n_estimators * self.parameters['bag_oversampling'])

                clf0 = clf
                clf = BaggingClassifier(
                        clf0,
                        max_samples=1.0 / bagged,
                        max_features=self.parameters['baggedfeatures'] if 'baggedfeatures' in self.parameters else 1.0,
                        bootstrap_features=self.parameters['bootstrapfeatures'] if 'bootstrapfeatures' in self.parameters else False,
                        n_estimators=n_estimators,
                    )

        else:
            clf = AdaBoostClassifier(
                    DecisionTreeClassifier(
                        min_samples_leaf=self.parameters['min_samples_leaf'], 
                        max_depth=self.parameters['max_depth'], 
                        class_weight=self.parameters['class_weight'], 
                        criterion=self.parameters['criterion'],
                        splitter=self.parameters['splitter'],
                        max_features=self.parameters['max_features'],
                        ), 
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    algorithm=self.parameters['algorithm'],
                )

        #with open("/mnt/t3nfs01/data01/shome/berger_p2/VHbb/CMSSW_9_4_0_pre3/src/Xbb/python/logs_v25//test-scikit-svm/Logs//../cache/b7d92f50a52f8474e66cf4e2c3ad3fa4725aa489e7a6b288e4ed3855//clf2018-01-31_18-22-38_be9479a2.pkl","rb") as inputFile:
        #    clf = pickle.load(inputFile)

        # preprocessing
        print("transformation...")

        if 'scaler' in self.parameters:
            if self.parameters['scaler'] == 'standard':
                self.scaler = preprocessing.StandardScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'minmax':
                self.scaler = preprocessing.MinMaxScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'robust':
                self.scaler = preprocessing.RobustScaler().fit(self.data['train']['X'])
            else:
                self.scaler = None
        else:
            self.scaler = None

        if self.scaler:
            self.data['train']['X'] = self.scaler.transform(self.data['train']['X'])
            self.data['test']['X'] = self.scaler.transform(self.data['test']['X'])

        # SHUFFLE all samples before
        self.shuffle = False
        if self.shuffle:
            print("shuffle input data...")
            for dataset in self.datasets:
                nSamples = self.data[dataset][self.varsets[0]].shape[0]
                randomPermutation = np.random.permutation(nSamples)
                for var in self.varsets:
                    self.data[dataset][var] = np.take(self.data[dataset][var], randomPermutation, axis=0)

        # LIMIT number of training samples
        # recommended to also shuffle samples before, because they are ordered by signal/background
        limitNumTrainingSamples = self.parameters['limit']
        if (limitNumTrainingSamples > 0):
            print("limit training samples to:", limitNumTrainingSamples)
            #for dataset in self.datasets:
            #    for var in self.varsets:
            #        self.data[dataset][var] = self.data[dataset][var][0:limitNumTrainingSamples]
            for dataset in self.datasets:
                self.data[dataset] = resample(self.data[dataset], n_samples=limitNumTrainingSamples, replace=False)
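                # note: sklearn.utils.resample with replace=False draws a
                # random subset, so unlike the commented-out head-slicing
                # above it does not rely on a prior shuffle of the
                # signal/background-ordered samples.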

        # oversample
        upscale = self.parameters['upscalefactor'] if 'upscalefactor' in self.parameters else None
        if upscale:
            upscalemax =  self.parameters['upscalemax'] if 'upscalemax' in self.parameters else 10 
            upscalesignal = self.parameters['upscalefactorsignal'] if 'upscalefactorsignal' in self.parameters else 1.0 #upscalefactorsignal
            indices = []
            for i in range(len(self.data['train']['sample_weight'])):
                x = self.data['train']['sample_weight'][i]
                if self.data['train']['y'][i] > 0.5:
                    x *= upscalesignal
                n = x * upscale
                # limit the oversampling factor
                if n > upscalemax:
                    n = upscalemax
                if n < 1:
                    n = 1
                intN = int(n)
                indices += [i] * intN
                #floatN = n-intN
                #if floatN > 0:
                #    if random.uniform(0.0,1.0) < floatN:
                #        indices += [i]

            self.data['train']['X'] = self.data['train']['X'][indices]
            self.data['train']['y'] = self.data['train']['y'][indices]
            self.data['train']['sample_weight'] = self.data['train']['sample_weight'][indices]
            self.verify_data()

        # BALANCE weights
        # calculate total weights and class_weights
        nSig = len([x for x in self.data['train']['y'] if x >= 0.5])
        nBkg = len([x for x in self.data['train']['y'] if x < 0.5])
        print("#SIG:", nSig)
        print("#BKG:", nBkg)
        weightsSignal = []
        weightsBackground = []
        for i in range(len(self.data['train']['sample_weight'])):
            if self.data['train']['y'][i] < 0.5:
                weightsBackground.append(self.data['train']['sample_weight'][i])
            else:
                weightsSignal.append(self.data['train']['sample_weight'][i])
        weightsSignal.sort()
        weightsBackground.sort()
        totalWeightSignal = sum(weightsSignal)
        totalWeightBackground = sum(weightsBackground)
        signalReweight = (totalWeightSignal+totalWeightBackground)/totalWeightSignal * self.parameters['additional_signal_weight']
        backgroundReweight = (totalWeightSignal+totalWeightBackground)/totalWeightBackground
        print("SUM of weights for signal:", totalWeightSignal)
        print("SUM of weights for background:", totalWeightBackground)
        
        if applyClassWeights:
            print("re-weight signals by:", signalReweight)
            print("re-weight background by:", backgroundReweight)
            for i in range(len(self.data['train']['sample_weight'])):
                if self.data['train']['y'][i] < 0.5:
                    self.data['train']['sample_weight'][i] *= backgroundReweight
                else:
                    self.data['train']['sample_weight'][i] *= signalReweight
        else:
            print("DO NOT re-weight signals by:", signalReweight)
        print("...")
        # TRAINING

        learningCurve = []
        if self.parameters['classifier'] == 'XGBClassifier':
            clf = clf.fit(self.data['train']['X'], self.data['train']['y'], self.data['train']['sample_weight'], verbose=True)
        else:
            try:
                clf = clf.fit(**self.data['train'])
            except Exception:
                clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])
                
                if 'rounds' in self.parameters and self.parameters['rounds'] > 1:
                    for rNumber in range(self.parameters['rounds']):
                        results = clf.predict_proba(self.data['test']['X']) 
                        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
                        print(" round ", rNumber, " AUC=", auc1)
                        learningCurve.append(auc1)
                        clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])

        print("***** FIT done")

        # TEST
        try:
            results = clf.decision_function(self.data['test']['X'])
            print("***** EVALUATION on test sample done")
            results_train = clf.decision_function(self.data['train']['X'])
            print("***** EVALUATION on training sample done")

            print("R:", results.shape, results)

            # pad a dummy first column so results[:, 1] matches the
            # predict_proba column layout used below
            results = np.c_[np.ones(results.shape[0]), results]
            results_train = np.c_[np.ones(results_train.shape[0]), results_train]
        except Exception:
            results = clf.predict_proba(self.data['test']['X'])
            results_train = clf.predict_proba(self.data['train']['X'])

        # ROC curve
        print("calculating auc...")
        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
        auc_training = roc_auc_score(self.data['train']['y'], results_train[:,1], sample_weight=self.data['train']['sample_weight'])
        print("AUC:", auc1, " (training:", auc_training, ")")

        print("**** compute quantiles")
        qx = np.array([0.01, 0.99])
        qy = np.array([0.0, 0.0])
        thq = ROOT.TH1D("quant","quant",500000,-5.0,5.0)
        nS = len(results)
        for i in range(nS):
            thq.Fill(results[i][1])
        thq.GetQuantiles(2, qy, qx)

        # rescaling of SCORE to [0, 1]
        minProb = 2.0
        maxProb = -1.0
        #for i in range(len(self.data['train']['X'])):
        #    if results_train[i][1] > maxProb:
        #        maxProb = results_train[i][1]
        #    if results_train[i][1] < minProb:
        #        minProb = results_train[i][1]
        #for i in range(len(self.data['test']['X'])):
        #    if results[i][1] > maxProb:
        #        maxProb = results[i][1]
        #    if results[i][1] < minProb:
        #        minProb = results[i][1]

        minProb = qy[0]
        maxProb = qy[1]
        delta = maxProb-minProb
        minProb -= delta * 0.01
        maxProb += delta * 0.10
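
        # e.g. (hypothetical numbers): if the 1%/99% score quantiles are -2.0
        # and 3.0, then delta = 5.0, minProb = -2.05 and maxProb = 3.5; scores
        # are mapped below via (score - minProb) / (maxProb - minProb) into
        # roughly [0, 1], leaving extra headroom above the 99% quantile.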

        useSqrt = False

        # fill TRAINING SCORE histogram (class probability)
        h1t = ROOT.TH1D("h1t","h1t",50,0.0,1.0)
        h2t = ROOT.TH1D("h2t","h2t",50,0.0,1.0)
        for i in range(len(self.data['train']['X'])):
            result = (results_train[i][1]-minProb)/(maxProb-minProb)