def ADA_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    print("***************Starting  AdaBoost Classifier***************")
    t0 = time()
    clf = AdaBoostClassifier(n_estimators=300)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv,Y_cv)

    print("AdaBoost Classifier - {0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                      rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1)*100
    print(Summary)

    #Check with log loss function
    epsilon = 1e-15
    #ll_output = log_loss_func(Y_cv, preds, epsilon)
    preds2 = clf.predict_proba(X_cv)
    ll_output2= log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    #preds4 = clf.predict_proba((Actual_DS.ix[:,'feat_1':]))
    preds4 = clf.predict_proba(Actual_DS)

    print("***************Ending AdaBoost Classifier***************")
    return pd.DataFrame(preds2) , pd.DataFrame(preds3),pd.DataFrame(preds4)
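# A minimal sketch of the clipped, hand-rolled log loss that the commented-out
# log_loss_func call above presumably computed (hypothetical helper; it should
# roughly agree with sklearn.metrics.log_loss(Y_cv, preds2) used in the function,
# assuming y_true holds label-encoded integer classes and y_prob holds
# predict_proba output):
import numpy as np

def log_loss_func(y_true, y_prob, epsilon=1e-15):
    y_true = np.asarray(y_true, dtype=int)
    y_prob = np.clip(y_prob, epsilon, 1 - epsilon)       # avoid log(0)
    y_prob = y_prob / y_prob.sum(axis=1, keepdims=True)  # re-normalise rows
    return -np.mean(np.log(y_prob[np.arange(len(y_true)), y_true]))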
Example #2
def ab(train_data,train_label,val_data,val_label,test_data,name="adaboost_submission.csv"):
	print "Start training AdaBoost..."
	abClf = AdaBoostClassifier()
	abClf.fit(train_data,train_label)
	#evaluate on validation set
	val_pred_label = abClf.predict_proba(val_data)
	logloss = preprocess.evaluation(val_label,val_pred_label)
	print "logloss of validation set:",logloss

	print "Start classify test set..."
	test_label = abClf.predict_proba(test_data)
	preprocess.saveResult(test_label,filename = name)
Example #3
def ab_predictedValue():
    print '----------AdaBoost----------'
    ab_clf = AdaBoostClassifier(n_estimators = NoOfEstimators)
    ab_clf.fit(train_df[features], train_df['SeriousDlqin2yrs'])
    ab_predictedValue = ab_clf.predict_proba(test_df[features])
    print 'Feature Importance = %s' % ab_clf.feature_importances_
    return ab_predictedValue[:,1]
def do_all_study(X,y):
    # split once so the plotting helpers and final fits below have the
    # X_train/X_test/y_train/y_test names they expect
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    names = [ "Decision Tree","Gradient Boosting",
             "Random Forest", "AdaBoost", "Naive Bayes"]

    classifiers = [
        #SVC(),
        DecisionTreeClassifier(max_depth=10),
        GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1),
        RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1),
        AdaBoostClassifier()]
    for name, clf in zip(names, classifiers):
        estimator,score = plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')


    clf_GBC = GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1)
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20,40]

    plot_validation_curve(clf_GBC, X_train, y_train,
                          param_name, param_range, scoring='roc_auc')
    clf_GBC.fit(X_train,y_train)
    y_pred_GBC = clf_GBC.predict_proba(X_test)[:,1]
    print("ROC AUC GradientBoostingClassifier: %0.4f" % roc_auc_score(y_test, y_pred_GBC))

    clf_AB = AdaBoostClassifier()
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20,40]

    plot_validation_curve(clf_AB, X_train, y_train,
                          param_name, param_range, scoring='roc_auc')
    clf_AB.fit(X_train,y_train)
    y_pred_AB = clf_AB.predict_proba(X_test)[:,1]
    print("ROC AUC AdaBoost: %0.4f" % roc_auc_score(y_test, y_pred_AB))
def training(baseclassparameters, adaparameters, queue):
    treeclassifier = DecisionTreeClassifier(**baseclassparameters)
    adaclassifier = AdaBoostClassifier(treeclassifier, **adaparameters)

    print "\nBegin calculation with {0} and {1}".format(str(baseclassparameters), str(adaparameters))
    adaclassifier.fit(Xtrain, ytrain)

    #Predict with the model
    prob_predict_test = adaclassifier.predict_proba(Xtest)[:,1]

    #Calculate maximal significance
    True_Signal_test = prob_predict_test[ytest==1]
    True_Bkg_test = prob_predict_test[ytest==0]
    best_significance = 0
    for x in np.linspace(0, 1, 1000):
        S = float(len(True_Signal_test[True_Signal_test>x]))
        B = float(len(True_Bkg_test[True_Bkg_test>x]))

        significance = S/np.sqrt(S+B)
        if significance > best_significance:
            best_significance = significance
            best_x = x
            best_S = S
            best_B = B

    print "\nCalculation with {} and {} done ".format(str(baseclassparameters), str(adaparameters))
    print "Best significance of {0:.2f} archived when cutting at {1:.3f}".format(best_significance, best_x)
    print "Signal efficiency: {0:.2f}%".format(100.*best_S/len(True_Signal_test))
    print "Background efficiency: {0:.2f}%".format(100.*best_B/len(True_Bkg_test))
    print "Purity: {0:.2f}%".format(100.*best_S/(best_S+best_B))

    queue.put( (best_significance, baseclassparameters, adaparameters) )
def test_oneclass_adaboost_proba():
    # Test predict_proba robustness for one class label input.
    # In response to issue #7501
    # https://github.com/scikit-learn/scikit-learn/issues/7501
    y_t = np.ones(len(X))
    clf = AdaBoostClassifier().fit(X, y_t)
    assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))
def ada_prediction(features_train, labels_train, features_test, ids):

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(features_train, labels_train, random_state=1301, stratify=labels_train, test_size=0.3)

    clf = AdaBoostClassifier(RandomForestClassifier(bootstrap=True,
                                                    criterion='entropy', max_depth=None, max_features=2,
                                                    max_leaf_nodes=16, min_samples_split=10, n_estimators=1000,
                                                    n_jobs=-1, oob_score=False),
                              algorithm="SAMME",
                              n_estimators=200)


    #clf_acc = clf.fit(X_train, y_train)
    # print(clf.best_estimator_)
    #feature_importance = clf.feature_importances_
    #print (feature_importance)

    #pred = clf_acc.predict_proba(X_test)[:,1]
    #print (y_test, pred)
    # acc = accuracy_score(y_test, pred)
    # print ("Acc {}".format(acc))

    clf = clf.fit(features_train, labels_train)

    pred = clf.predict_proba(features_test)[:,1]

    predictions_file = open("data/canivel_ada_forest.csv", "wb")
    predictions_file_object = csv.writer(predictions_file)
    predictions_file_object.writerow(["ID", "TARGET"])
    predictions_file_object.writerows(zip(ids, pred))
    predictions_file.close()
def ada_boost_cv(x_train,
                 y_train,
                 cv,
                 max_tree_depth,
                 n_estimators,
                 learning_rate):

    tree_classifier = DecisionTreeClassifier(max_depth=max_tree_depth,
                                             class_weight="balanced")


    ada_boost_classifier = AdaBoostClassifier(base_estimator=tree_classifier,
                                              n_estimators=n_estimators,
                                              learning_rate=learning_rate)

    y_bar = cross_val_predict(estimator=ada_boost_classifier,
                              X=x_train,
                              y=y_train,
                              cv=cv,
                              n_jobs=cv)

    # cross_val_predict does not fit the estimator passed to it, so fit it
    # before asking for in-sample probabilities
    ada_boost_classifier.fit(x_train, y_train)
    y_bar_proba = ada_boost_classifier.predict_proba(x_train)
    print(list(zip(y_bar,y_bar_proba)))

    cm = confusion_matrix(y_train,y_bar)

    accuracy_negative = cm[0,0] / np.sum(cm[0,:])
    accuracy_positive = cm[1,1] / np.sum(cm[1,:])

    precision = cm[1,1] / (cm[1,1] + cm[0,1])
    recall = cm[1,1] / (cm[1,1] + cm[1,0])

    f1_score = 2 * precision * recall / (precision + recall)

    return accuracy_positive, accuracy_negative, precision, recall, f1_score
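# Out-of-fold class probabilities can also be obtained directly from
# cross_val_predict with method='predict_proba'; a small self-contained sketch
# (illustrative data, not part of the original ada_boost_cv code):
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_predict

X_demo, y_demo = make_classification(n_samples=300, random_state=0)
oof_proba = cross_val_predict(AdaBoostClassifier(), X_demo, y_demo,
                              cv=5, method='predict_proba')
print(oof_proba[:3])  # one row of class probabilities per training sample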
Example #9
def Adaboost(TrainData,TestData):
    features=['Time','Season','Hour','Minute','District']

    clf = AdaBoostClassifier(tree.DecisionTreeClassifier(),n_estimators=30)

    size=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
    for i in range(0,len(size)):
        train,validation= train_test_split(TrainData, train_size=size[i])

        while len(set(train['Category'])) != len(set(validation['Category'])):
            train,validation= train_test_split(TrainData, train_size=size[i])
        clf = clf.fit(train[features], train['Category'])
        """stop = timeit.default_timer()
        print "Runnin  time adaboost is ", stop-start"""
        predicted=np.array(clf.predict_proba(validation[features]))
        model=clf.predict(train[features])
        model1=clf.predict(validation[features])

        #scores = cross_val_score(clf, validation[features], validation['Category'])
        #print "Scores mean is",scores.mean()
        #accuracy
        print "Training accuracy is", accuracy_score(train['Category'].values.tolist(),model)
        print "Validation accuracy is",accuracy_score(validation['Category'].values.tolist(),model1)
        print "Precision is ",precision_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Recall is ",recall_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Log loss is", log_loss(validation['Category'].values.tolist(),predicted,eps=1e-15, normalize=True, sample_weight=None)


        #writing to file
        """Category_new=[]
def test_staged_predict():
    """Check staged predictions."""
    # AdaBoost classification
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        clf.fit(iris.data, iris.target)

        predictions = clf.predict(iris.data)
        staged_predictions = [p for p in clf.staged_predict(iris.data)]
        proba = clf.predict_proba(iris.data)
        staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
        score = clf.score(iris.data, iris.target)
        staged_scores = [s for s in clf.staged_score(iris.data, iris.target)]

        assert_equal(len(staged_predictions), 10)
        assert_array_almost_equal(predictions, staged_predictions[-1])
        assert_equal(len(staged_probas), 10)
        assert_array_almost_equal(proba, staged_probas[-1])
        assert_equal(len(staged_scores), 10)
        assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10)
    clf.fit(boston.data, boston.target)

    predictions = clf.predict(boston.data)
    staged_predictions = [p for p in clf.staged_predict(boston.data)]
    score = clf.score(boston.data, boston.target)
    staged_scores = [s for s in clf.staged_score(boston.data, boston.target)]

    assert_equal(len(staged_predictions), 10)
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert_equal(len(staged_scores), 10)
    assert_array_almost_equal(score, staged_scores[-1])
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert_equal(proba.shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
def adaboost(X,training_target,Y,est):

    from sklearn.ensemble import AdaBoostClassifier


    clf = AdaBoostClassifier(n_estimators=est)
    clf.fit(X,training_target)
    proba = clf.predict_proba(Y)
Example #13
def train(xTrain, yTrain, metric):
    print 'adaboost'
    global boost
    boost = AdaBoostClassifier()
    boost.fit(xTrain,yTrain)
    global trainResults
    trainResults = boost.predict_proba(xTrain)[:,1]
    i.setSuccess(trainResults, metric)
def test_classification_toy():
    # Check classification on a toy dataset.
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, random_state=0)
        clf.fit(X, y_class)
        assert_array_equal(clf.predict(T), y_t_class)
        assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
        assert_equal(clf.predict_proba(T).shape, (len(T), 2))
        assert_equal(clf.decision_function(T).shape, (len(T),))
Example #15
def main():
    X, Y = load('train.csv')
    adaboost = AdaBoostClassifier(n_estimators=150, learning_rate=0.1)
    adaboost.fit(X, Y)
    X_test, ID = loadTest('test.csv')
    target = adaboost.predict_proba(X_test)
    df = pandas.DataFrame()
    df['TARGET'] = target[:,1]
    df.index = pandas.Series(ID, name='ID')
    df.to_csv('sumbit.csv')
Example #16
def abClassifier(X_train,y_train,X_test,y_test,to_plot=False):
    params = {
          'random_state': [None,0,1,2,3,4,5]}
    for param in ParameterGrid(params):
        print param
        clf = AdaBoostClassifier(algorithm='SAMME.R',n_estimators=50,learning_rate=0.7,**param)
        clf.fit(X_train,y_train)
#        auc_compute(y_test,clf.predict_proba(X_test)[:,1])
        predictions=clf.predict(X_test)
        scores(y_test,predictions,clf.predict_proba(X_test)[:,1],'ab',to_plot=to_plot)   
Example #17
def classify_AdaBoost(train, test):
	from sklearn.ensemble import AdaBoostClassifier as ABC

	x, y = train
	clf = ABC()
	clf.fit(x, y)
	
	x, y = test
	proba = clf.predict_proba(x)
	return proba
Example #18
class ABClassifier(Model):
    '''
    Adaptive Boosting Classifier
    Boosts depth-1 decision tree (stump) base classifiers
    '''
    def __init__(self):
        Model.__init__(self)
        self.model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm="SAMME",n_estimators=200)

    def predict(self, test):
        return self.model.predict_proba(test)
def adaboost(X, y, train, valid):
	from sklearn.ensemble import AdaBoostClassifier
	clf2 = AdaBoostClassifier(n_estimators=100).fit(X[train], y[train])
	yhat = clf2.predict(X[valid])
	print(classification_report(y[valid], yhat))
	accuracy_score(y[valid], yhat)
	print("adaboost" + str(accuracy_score(y[valid], yhat)))
	yhat_prob = clf2.predict_proba(X[valid])[:,1]
	print("extra tree randomForest roc_accuracy" + str(roc_auc_score(y[valid], yhat_prob)))
	np.savetxt("y_ada.csv", yhat_prob)
	return yhat_prob
def test_multidimensional_X():
    """
    Check that the AdaBoost estimators can work with n-dimensional
    data matrix
    """

    from sklearn.dummy import DummyClassifier, DummyRegressor

    rng = np.random.RandomState(0)

    X = rng.randn(50, 3, 3)
    yc = rng.choice([0, 1], 50)
    yr = rng.randn(50)

    boost = AdaBoostClassifier(DummyClassifier(strategy='most_frequent'))
    boost.fit(X, yc)
    boost.predict(X)
    boost.predict_proba(X)

    boost = AdaBoostRegressor(DummyRegressor())
    boost.fit(X, yr)
    boost.predict(X)
def run_adaboost(estimators_and_learn_rt):
    print estimators_and_learn_rt[0]
    print estimators_and_learn_rt[1]
    clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1, 
                                                                   max_features=features-1,
                                                                   splitter='best', min_samples_leaf=10),
                                                                   n_estimators = int(estimators_and_learn_rt[0]), 
                                                                   learning_rate=estimators_and_learn_rt[1])
    clf.fit(train_features, train_outcome)
    validation['predictions_clf']=clf.predict_proba(validation_for_p)[:,1]
    fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions_clf)
    auc_score = auc(fpr,tpr)
    return auc_score
def AdaBoost(X, Y, XTest, YTest):
    print '-----------------------------------------------------'

    # param_grid = {'learning_rate': [0.1, 0.3, 0.6, 1, 3, 6, 10]}

    # tree_grid = GridSearchCV(AdaBoostClassifier(), param_grid)
    tree_grid = AdaBoostClassifier(n_estimators=100, learning_rate=2)
    tree_grid.fit(X, Y)

    # print("The best parameters are %s with a score of %0.2f"
    #       % (tree_grid.best_params_, tree_grid.best_score_))

    print "Computing training statistics"
    dtree_predict_time_training = time.time()
    Ypred_dtree_training = tree_grid.predict(X)
    dtree_predict_time_training = time.time() - dtree_predict_time_training

    dtree_accuracy_training = metrics.accuracy_score(Y, Ypred_dtree_training)
    dt_precision_training = metrics.precision_score(Y, Ypred_dtree_training,
                                                    average='binary')
    dtree_recall_training = metrics.recall_score(Y, Ypred_dtree_training,
                                                 average='binary')

    print "DT training prediction time: " + str(dtree_predict_time_training)
    print "DT training accuracy Score: " + str(dtree_accuracy_training)
    print "DT training precision Score: " + str(dt_precision_training)
    print "DT training recall Score: " + str(dtree_recall_training)

    print "Computing testing statistics"
    dtree_predict_time_test = time.time()
    Ypred_dtree_test = tree_grid.predict(XTest)
    dtree_predict_time_test = time.time() - dtree_predict_time_test

    dtree_accuracy_test = metrics.accuracy_score(YTest, Ypred_dtree_test)
    dt_precision_test = metrics.precision_score(YTest, Ypred_dtree_test,
                                                average='binary')
    dtree_recall_test = metrics.recall_score(YTest, Ypred_dtree_test,
                                             average='binary')

    print "DT test prediction time: " + str(dtree_predict_time_test)
    print "DT test accuracy Score: " + str(dtree_accuracy_test)
    print "DT test precision Score: " + str(dt_precision_test)
    print "DT test recall Score: " + str(dtree_recall_test)

    print "Creating ROC curve"
    y_true = YTest
    y_score = tree_grid.predict_proba(XTest)
    fprSVM, trpSVM, _ = metrics.roc_curve(y_true=y_true,
                                          y_score=y_score[:, 0],
                                          pos_label=0)
    plt.plot(fprSVM, trpSVM, 'c-', label='ADA')
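# The commented-out grid search above can be re-enabled roughly as follows
# (a sketch with the same param_grid; fit it on the same X, Y used in the
# function, not an authoritative configuration):
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'learning_rate': [0.1, 0.3, 0.6, 1, 3, 6, 10]}
tree_grid = GridSearchCV(AdaBoostClassifier(n_estimators=100), param_grid, cv=3)
# tree_grid.fit(X, Y)
# print("The best parameters are %s with a score of %0.2f"
#       % (tree_grid.best_params_, tree_grid.best_score_))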
Example #23
	def iterate(self, n_estimators_conf=[10], learning_rate_conf=[0.25]):
		print '-'*80
		print 'Running AdaBoost Iterations...'
		# performance by number of estimators and max depth
		results = []
		for ne in n_estimators_conf:
			for lr in learning_rate_conf:
				print 'Iteration: n_estimators=%s, learning_rate=%s' % (str(ne), str(lr))
				m = AB(n_estimators=ne, learning_rate=lr)
				m.fit(self.xtrain, self.ytrain)
				predtrain = m.predict(self.xtrain)
				predtest = m.predict(self.xtest)
				predprobatrain = m.predict_proba(self.xtrain)
				predprobatest = m.predict_proba(self.xtest)
				accuracytrain = metrics.accuracy_score(predtrain, self.ytrain)
				accuracytest = metrics.accuracy_score(predtest, self.ytest)
				kstrain = multiclass_log_loss(self.ytrain, predprobatrain)
				kstest = multiclass_log_loss(self.ytest, predprobatest)
				cr = self.convert_cr(metrics.classification_report(self.ytest, predtest))
				results.append([ne, lr, accuracytrain, accuracytest, kstrain, kstest, cr])
		self.results = pd.DataFrame(results)
		self.results.columns = ['ne', 'lr', 'accuracy_train', 'accuracy_test',
						   'ks_train', 'ks_test', 'cr']
Example #24
def ada_boost_predict(new_train_data, new_train_labels, test_data, test_labels, base_est = "tree", n = 50):

    # Create a classifier: AdaBoost classifier
    if base_est == "tree":
        base = DecisionTreeClassifier(max_depth=5)
    classifier = AdaBoostClassifier(base_estimator = base, n_estimators = n)

    # Fit the AdaBoost classifier on the training data
    classifier.fit(new_train_data, new_train_labels)

    # Predict class probabilities on the test data:
    expected = test_labels
    predicted = classifier.predict_proba(test_data)

    return predicted
def test_iris():
    """Check consistency on dataset iris."""
    classes = np.unique(iris.target)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        assert_equal(clf.predict_proba(iris.data).shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)
Example #26
def main():
    Algorithm = 'CamKt12LCTopoSplitFilteredMu67SmallR0YCut9'
    print 'Loading training data ...'

    data_train = pd.read_csv(Algorithm+'merged.csv')   
    r =np.random.rand(data_train.shape[0])
    
    #Set label and weight vectors - and drop any unwanted tranining one
    Y_train = data_train['label'].values[r<0.5]
    # W_train = data_train['weight'].values[r<0.9]
    Y_valid = data_train['label'].values[r>=0.5]
    # W_valid = data_train['weight'].values[r>=0.9]
    # data_train.drop('AKT10LCTRIM530_MassDropSplit', axis=1, inplace=True)

    varcombinations = itertools.combinations(data_train.columns.values[1:-1],2)
    fac = lambda n: 1 if n < 2 else n * fac(n - 1)
    combos = lambda n, k: fac(n) / fac(k) / fac(n - k)

    colors = plt.get_cmap('jet')(np.linspace(0, 1.0,combos(len(data_train.columns.values[1:-1]),2) ))

    for varset,color in zip(varcombinations, colors):
        print list(varset)
        X_train = data_train[list(varset)].values[r<0.5]
        X_valid = data_train[list(varset)].values[r>=0.5]


        dt = DC(max_depth=3,min_samples_leaf=0.05*len(X_train))
        abc = ABC(dt,algorithm='SAMME',
                 n_estimators=8,
                 learning_rate=0.5)
        print 'Training classifier with all the data..'
        abc.fit(X_train, Y_train)
        print 'Done.. Applying to validation sample and drawing ROC' 
        prob_predict_valid = abc.predict_proba(X_valid)[:,1]
        Y_score = abc.decision_function(X_valid)
        fpr, tpr, _ = roc_curve(Y_valid, prob_predict_valid)
        labelstring = ' And '.join(var.replace('_','') for var in varset)
        print labelstring
        plt.plot(tpr, (1-fpr), label=labelstring, color=color)

        
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('1- Background Efficiency')
    plt.xlabel('Signal Efficiency')
    plt.title(Algorithm+' ROC Curve')
    plt.legend(loc="lower left",prop={'size':6})
    plt.savefig(Algorithm+'rocmva.pdf')
def training(thistrainingfeatures, baseclassparameters, adaparameters):
    treeclassifier = DecisionTreeClassifier(**baseclassparameters)
    adaclassifier = AdaBoostClassifier(treeclassifier, **adaparameters)

    #Split training and testdata
    Xtrain, Xtest, ytrain, ytest = train_test_split(thistrainingfeatures, label)
    #Cast pd.Series to arrays to apply mask later
    ytrain = np.asarray(ytrain)
    ytest = np.asarray(ytest)

    #print "\nBegin calculation with {0} and {1}".format(str(baseclassparameters), str(adaparameters))
    adaclassifier.fit(Xtrain, ytrain)

    #Predict with the model
    prob_predict_test = adaclassifier.predict_proba(Xtest)[:,1]


    #Calculate maximal significance
    True_Signal_test = prob_predict_test[ytest==1]
    True_Bkg_test = prob_predict_test[ytest==0]
    best_significance = 0
    for x in np.linspace(0, 1, 1000):
        S = float(len(True_Signal_test[True_Signal_test>x]))
        B = float(len(True_Bkg_test[True_Bkg_test>x]))

        significance = S/np.sqrt(S+B)
        if significance > best_significance:
            best_significance = significance
            best_x = x
            best_S = S
            best_B = B

    if best_significance > best_overall_significance:
        print """\nCalculation with {0} and {1} done.
            Variables: {2}
            Best significance of {3:.2f} achieved when cutting at {4:.3f}
            Signal efficiency: {5:.2f}%
            Background efficiency: {6:.2f}%
            Purity: {7:.2f}%""".format( str(baseclassparameters), str(adaparameters),
                                    str(list(thistrainingfeatures.columns)),
                                    best_significance, best_x,
                                    100.*best_S/len(True_Signal_test),
                                    100.*best_B/len(True_Bkg_test),
                                    100.*best_S/(best_S+best_B) )

        #Print feature importances
        for (feature, importance) in izip(thistrainingfeatures.columns, adaclassifier.feature_importances_):
            print "{0:45s}: {1:>10.2f}%".format(feature, importance*100.)
class AdaBoostPredictor(PredictorBase):
    '''
    AdaBoost
    '''

    def __init__(self):
        self.clf = AdaBoostClassifier()

    def fit(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        predictions = self.clf.predict_proba(X_test)
        predictions_df = self.bundle_predictions(predictions)

        return predictions_df
Example #29
class ADAClassifier(Classifier, ProbClassifier):

    def __init__(self, maxTreeDepth=1, estimators=50, learningRate=1.):
        self.cl = AdaBoostClassifier(n_estimators=estimators, learning_rate=learningRate,
                                      base_estimator=DecisionTreeClassifier(max_depth=maxTreeDepth))

    def retrain(self, vectorFeature, vectorTarget):
        # self.cl.fit([v.toarray()[0] for v in vectorFeature], vectorTarget)
        self.cl.fit(vectorFeature, vectorTarget)

    def classify(self, vectorizedTest):
        # return self.cl.predict(vectorizedTest.toarray()[0])[0]
        return self.cl.predict(vectorizedTest)[0]

    def getProb(self, vectorizedTest):
        # return self.cl.predict_proba(vectorizedTest.toarray()[0])[0][1]
        return self.cl.predict_proba(vectorizedTest)[0][1]
Example #30
    def test(self):
        X, y = self.dataMat, self.labelMat
        X_test = self.testData
        params = {'n_estimators': 1200, 'max_depth': 4, 'subsample': 0.5, 'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}
        #clf = GradientBoostingClassifier(**params)
        clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=6), algorithm="SAMME.R", n_estimators=280)
        clf.fit(X, y)
        y_pred = clf.predict(X_test)
        y_predprob = clf.predict_proba(X_test)
        output.write('bidder_id' + ',' + 'prediction' + '\n')
        for i in range(0, len(self.totalid)):
            if self.totalid[i] in self.testid:
                idx = self.testid.index(self.totalid[i])
                output.write(str(self.testid[idx]) + ',' + str(y_predprob[idx][1]) + '\n')
                #print str(self.testid[idx])+','+str(y_predprob[idx][1])
            else:
                #print str(self.totalid[idx])+','+str(0.0)
                output.write(str(self.totalid[i]) + ',' + str(0.0) + '\n')
Example #31
def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sprase_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sprase_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse,
                                                        y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix)
                   for t in types])
Example #32
acc = cross_val_score(estimator = clf_ada, X = X_train, y = y_train, cv = cv, scoring='f1')
acc.mean(), acc.std()


# last step
clf_ada = AdaBoostClassifier(dt,
                             algorithm = 'SAMME',
                             n_estimators = 100, 
                             learning_rate = 0.1,
                             random_state= 1337 )
clf_ada.fit(X_train, y_train)
y_pred = clf_ada.predict(X_test)
print(classification_report(y_test, y_pred))


y_pred = clf_ada.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred)




#kf = KFold(n_splits = 5, random_state = 1337, shuffle = True)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1337)
# Create arrays and dataframes to store results
auc_preds = []
test_preds = np.zeros(df_test.shape[0])
n_fold = 0
for idx_train, idx_valid in rskf.split(X_train, y_train):
    train_x, train_y = X_train[idx_train], y_train[idx_train]
    valid_x, valid_y = X_train[idx_valid], y_train[idx_valid]
    
Example #33
df = pd.DataFrame(vals)

X = df.drop('Class', axis=1)
y = df.loc[:, 'Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=420)

#:# preprocessing

#:# model
model = AdaBoostClassifier(n_estimators=10, learning_rate=0.3)
model.fit(X_train, y_train)

#:# hash
#:# e37d46e1a5c0065376d1471f564f3ac7
md5 = hashlib.md5(str(model).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
print(f'Accuracy: {model.score(X_test, y_test)}')
print(f'Area under ROC: {roc_auc_score(y_test, y_pred_proba)}')

#:# session info
sessionInfo = {
    "python_version": python_version(),
    "library_versions":[str(d) for d in pkg_resources.working_set]
}
with open('sessionInfo.txt', 'w') as f:
    json.dump(sessionInfo, f, indent=4)
Example #34
                to_predict = [calendar[ii].lrank - calendar[ii].wrank, calendar[ii].welo-calendar[ii].lelo, calendar[ii].welosur-calendar[ii].lelosur, surf_into_num[calendar[ii].surface], \
                               round(wins_percent(*winner) - wins_percent(*loser), 3), round(wins_per_surface(*winner) - wins_per_surface(*loser), 3), \
                               round(av_first_serve(*winner) - av_first_serve(*loser), 3), round(av_first_serve_surface(*winner) - av_first_serve_surface(*loser), 3), \
                               round(av_second_serve(*winner) - av_second_serve(*loser), 3), round(av_second_serve_surface(*winner) - av_second_serve_surface(*loser), 3), \
                               round(av_first_return(*winner) - av_first_return(*loser), 3), round(av_first_return_surface(*winner) - av_first_return_surface(*loser), 3), \
                               round(av_second_return(*winner) - av_second_return(*loser), 3), round(av_second_return_surface(*winner) - av_second_return_surface(*loser), 3), \
                               round(av_aces(*winner) - av_aces(*loser), 3), round(av_aces_surface(*winner) - av_aces_surface(*loser), 3), \
                               round(av_dfs(*winner) - av_dfs(*loser), 5), round(av_dfs_surface(*winner) - av_dfs_surface(*loser), 5), \
                               round(av_bps(*winner) - av_bps(*loser), 3), round(av_bps_surface(*winner) - av_bps_surface(*loser), 3)]

                clf.fit(X, Y)

                X_test = to_predict
                y_test = [1]
                prediction = clf.predict([to_predict])
                proba = clf.predict_proba([to_predict])
                coeffs_test = [calendar[ii].cfw, calendar[ii].cfl]

                q1 = 0.1*bank1
                q2 = 0.1*bank2
                q3 = 0.1*bank3
                q4 = 0.1*bank4
                
                '''print("Best params:", model.best_params_)
                print("To predict:", to_predict)
                print("Prediction:", prediction)
                print("Probabilities:", proba)
                print("testing:", "("+str(roi_1(prediction, q1, coeffs_test)), str(roi_2(prediction, q2, coeffs_test)), str(roi_3(prediction, q3, coeffs_test)), \
                      str(roi_4(prediction, proba, q4, coeffs_test))+")")'''

                profit1 += roi_1(prediction, q1, coeffs_test)
Example #35
    data_test = pd.read_csv('adult.test',
                            header=None,
                            skiprows=1,
                            names=column_names)
    for name in data_test.columns:
        data_test[name] = pd.Categorical(data_test[name]).codes
    x_test = data_test[data_test.columns[:-1]]
    y_test = data_test[data_test.columns[-1]]
    y_test_pred = model.predict(x_test)
    print('Test set accuracy:', accuracy_score(y_test, y_test_pred))
    print('\tTest set precision:', precision_score(y_test, y_test_pred))
    print('\tTest set recall:', recall_score(y_test, y_test_pred))
    print('\tTest set F1:', f1_score(y_test, y_test_pred))

    y_test_proba = model.predict_proba(x_test)
    # print y_test_proba
    y_test_proba = y_test_proba[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_proba)
    auc = metrics.auc(fpr, tpr)
    print('AUC = ', auc)
    # or call roc_auc_score directly
    # print 'AUC = ', metrics.roc_auc_score(y_test, y_test_proba)

    mpl.rcParams['font.sans-serif'] = 'SimHei'
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(facecolor='w')
    plt.plot(fpr, tpr, 'r-', lw=2, alpha=0.8, label='AUC=%.3f' % auc)
    plt.plot((0, 1), (0, 1), c='b', lw=1.5, ls='--', alpha=0.7)
    plt.xlim((-0.01, 1.02))
    plt.ylim((-0.01, 1.02))
Example #36
# Draw a horizontal barplot of importances_sorted
vImportancesSorted.plot(kind='barh', color='lightgreen')
plt.title('Features Importances')
plt.show()

# METHOD V: Boosting
# Ada Boosting
vAdaBoostClassifier = AdaBoostClassifier(base_estimator=vDecisionTree,
                                         n_estimators=180,
                                         random_state=SEED)

# Fit ada to the training set
vAdaBoostClassifier.fit(vXTrain, vYTrain)

# Compute the probabilities of obtaining the positive class
vYPredProba = vAdaBoostClassifier.predict_proba(vXTest)[:, 1]
vAdaROCAUC = roc_auc_score(vYTest, vYPredProba)
print('ROC AUC score: {:.2f}'.format(vAdaROCAUC))

# Gradient Boosting
vGradientBoostingClassifier = GradientBoostingClassifier(max_depth=4,
                                                         n_estimators=180,
                                                         random_state=SEED)
vGradientBoostingClassifier.fit(vXTrain, vYTrain)
vYPred = vGradientBoostingClassifier.predict(vXTest)
vRMSE = MSE(vYTest, vYPred)**(1 / 2)
print('Test set RMSE of Gradient Boosting Classifier: {:.2f}'.format(vRMSE))

# Stochastic Gradient Boosting
vStochasticGradientBoostingClassifier = GradientBoostingClassifier(
    max_depth=4,
Example #37
                                random_state=None)
classifier.fit(train_data, train_label)
tra_label = classifier.predict(train_data)  # predicted labels for the training set
tes_label = classifier.predict(test_data)  # predicted labels for the test set
print("Training set accuracy:", accuracy_score(train_label, tra_label))
print("Test set accuracy:", accuracy_score(test_label, tes_label))

matrix = confusion_matrix(test_label, tes_label, labels=[0, 1])
TP = matrix[1, 1]
TN = matrix[0, 0]
FP = matrix[0, 1]
FN = matrix[1, 0]
sn = TP / (TP + FN)
sp = TN / (TN + FP)

decision_score = classifier.predict_proba(test_data)
fprs, tprs, thresholds = roc_curve(test_label, decision_score[:, 1])

# plt.plot(fprs, tprs)
# plt.show()
roc_auc = auc(fprs, tprs)
plt.figure()
lw = 2
plt.plot(fprs,
         tprs,
         color='darkorange',
         lw=lw,
         label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=SEED)

# Instantiate a classification-tree 'dt'
dt = DecisionTreeClassifier(max_depth=1, random_state=SEED)

# Instantiate an AdaBoostClassifier 'adb_clf'
adb_clf = AdaBoostClassifier(base_estimator=dt, n_estimators=100)

# Fit adb_clf to the training set
adb_clf.fit(X_train, y_train)

# Predict the test set probabilities of positive class
y_pred_proba = adb_clf.predict_proba(X_test)[:,1]

# Evaluate test roc_auc score
adb_clf_roc_auc_score = roc_auc_score(y_test, y_pred_proba)

# Print adb_clf_roc_auc score
print('ROC AUC score: {:.2f}'.format(adb_clf_roc_auc_score))

# Gradient Boosting (GB)
# Gradient Boosted Trees - sequential correction of predecessor's errors
# Does not tweak the weights of the training instances
# Each predictor is trained using its predecessor's residual errors as labels
# Gradient Boosted Trees - CART is used as a base learner
# Important parameter - shrinkage: the prediction of each tree is shrunk by multiplying it by a learning rate, eta (0 to 1)
# Similar to AdaBoost - trade-off between Eta and the number of estimators
# Decreasing learning rate, needs to be compensated by increasing the number of estimators
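# A minimal sketch of the shrinkage / n_estimators trade-off described above
# (illustrative values and data, not from the original notebook):
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

Xd, yd = load_breast_cancer(return_X_y=True)
Xtr, Xte, ytr, yte = train_test_split(Xd, yd, random_state=0)
for eta, n_est in [(0.5, 50), (0.05, 500)]:  # smaller eta compensated by more estimators
    gb = GradientBoostingClassifier(learning_rate=eta, n_estimators=n_est,
                                    random_state=0).fit(Xtr, ytr)
    print(eta, n_est, roc_auc_score(yte, gb.predict_proba(Xte)[:, 1]))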
print(classification_report(y_test, rf.predict(X_test)))
#print('Accuracy of Random Forest Classifier on test set: {:.2f}'.format(rf.score(X_test, y_test)*100))

# Classification report for the optimised RF Regression
rf.fit(X_train, y_train)
rfp = rf.predict(X_test)

#adaboost model
ada = AdaBoostClassifier(n_estimators=100, random_state=0)
ada.fit(X_train, y_train)
print("AdaBoost accuracy is %2.2f" %
      accuracy_score(y_test, ada.predict(X_test)))
ada_roc_auc = roc_auc_score(y_test, ada.predict(X_test))
print("AdaBoost AUC = %2.2f" % ada_roc_auc)
###### probability that the employee leaves
probsada = ada.predict_proba(
    X_test)[:, 1]  # predict probabilities associated with the employee leaving
adaProb_roc_auc = roc_auc_score(
    y_test, probsada)  # calculate AUC score using test dataset
print('AUC score: %.3f' % adaProb_roc_auc)

print(classification_report(y_test, ada.predict(X_test)))

#decision tree model
dtree = tree.DecisionTreeClassifier(max_depth=3,
                                    class_weight="balanced",
                                    min_weight_fraction_leaf=0.01)
dtree.fit(X_train, y_train)
print("Decision Tree accuracy is %2.2f" %
      accuracy_score(y_test, dtree.predict(X_test)))
dt_roc_auc = roc_auc_score(y_test, dtree.predict(X_test))
print("Decision Tree AUC = %2.2f" % dt_roc_auc)
Example #40
def train_bdt_multiclass():
    print("Loading data...")
    if SMALL_DATA:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data_small()
    else:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data()

    print("Creating arrays...")
    # X = Features (i.e. the data)
    X = np.concatenate((signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn))

    # y = Labels (i.e. what it is, signal / background)
    y = np.concatenate((np.ones(signal.shape[0]), np.full(bkg2nu.shape[0], 2),
                        np.full(bkg214Bi.shape[0],
                                3), np.full(bkg208Tl.shape[0],
                                            4), np.full(bkgRn.shape[0], 5)))

    print("Splitting Data...")
    # Split the data
    X_dev, X_eval, y_dev, y_eval = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=48)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    print("Creating classifier for DT")
    # Create classifiers
    dt = DecisionTreeClassifier(max_depth=12,
                                min_samples_split=0.5,
                                min_samples_leaf=400)

    print("Creating classifier for BDT")
    bdt = AdaBoostClassifier(dt,
                             algorithm='SAMME',
                             n_estimators=1200,
                             learning_rate=0.5)

    print("Fitting BDT...")
    # Train the classifier - not using weights here as it is a multiclassifier
    fitted_tree = bdt.fit(X_train, y_train)

    print("Predicting on training data...")
    # Use the fitted tree to predict on training data and new test data
    y_predicted_train = bdt.predict(X_train)

    print("Predicting on test data...")
    y_predicted_test = bdt.predict(X_test)

    print(
        classification_report(
            y_train,
            y_predicted_train,
            target_names=["signal", "2nu", "214Bi", "208Tl", "Radon"]))
    print("Area under ROC curve for training data: {0:.4f}".format(
        roc_auc_score(y_train,
                      bdt.predict_proba(X_train),
                      average="weighted",
                      multi_class="ovr")))

    print(
        classification_report(
            y_test,
            y_predicted_test,
            target_names=["signal", "2nu", "214Bi", "208Tl", "Radon"]))
    print("Area under ROC curve for test data: {0:.4f}".format(
        roc_auc_score(y_test,
                      bdt.predict_proba(X_test),
                      average="weighted",
                      multi_class="ovr")))

    plot_roc_curve(bdt, X_test, y_test)
    compare_train_test_multi(bdt, X_train, y_train, X_test, y_test)

    print("Saving classifier...")
    save_path = BASE_PATH + 'ml_calculated_data/multiClass/'
    dump(bdt, save_path + 'bdt_classifier.joblib')
    dump(fitted_tree, save_path + 'bdt_fitted_tree.joblib')
    dump(X_train, save_path + 'bdt_X_train.joblib')
    dump(X_test, save_path + 'bdt_X_test.joblib')
    dump(X_dev, save_path + 'bdt_X_dev.joblib')
    dump(X_eval, save_path + 'bdt_X_eval.joblib')
    dump(y_test, save_path + 'bdt_y_test.joblib')
    dump(y_train, save_path + 'bdt_y_train.joblib')
    dump(y_dev, save_path + 'bdt_y_dev.joblib')
    dump(y_eval, save_path + 'bdt_y_eval.joblib')

    print("Finished Training.")
Example #41
def AdaMECvsAdaBoost(dataset, C_FP, C_FN, base_estimator, algorithm,
                     n_estimators, calibration_method, test_set_prcnt,
                     cal_set_prcnt):
    ## Load data
    mat_contents = sio.loadmat(os.getcwd() + '\\Datasets\\' + dataset + '.mat')
    data = mat_contents['data']
    target = np.asarray([float(i) for i in mat_contents['labels'].ravel()])

    target[np.where(target != 1)] = 0  # One-vs-all if multiclass

    ## Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=test_set_prcnt)

    Pos = sum(
        y_train[np.where(y_train == 1)]
    )  #Number of positive training examples --estimate of prior of positive class
    Neg = len(
        y_train
    ) - Pos  #Number of negative training examples --estimate of prior of negative class

    C_FP_effective = C_FP * Neg / (
        C_FN * Pos + C_FP * Neg
    )  #Positive skew (overall importance of a single positive example)
    #C_FN_effective = 1 - C_FP_effective              #Negative skew (overall importance of a single negative example)
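    # Illustrative arithmetic (assumed numbers, not from the dataset): with
    # C_FP=1, C_FN=5, Pos=100 and Neg=900, the positive skew is
    # 1*900 / (5*100 + 1*900) = 900/1400 ≈ 0.64, and the negative skew is ≈ 0.36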

    #Define weak learner
    base_estimator = eval(base_estimator)

    ## Train ensembles
    #I.Train an AdaBoost ensemble (algorithm="SAMME" for discrete AdaBoost, algorithm="SAMME.R" for real AdaBoost)
    AdaBoost = AdaBoostClassifier(base_estimator,
                                  algorithm=algorithm,
                                  n_estimators=n_estimators)
    AdaBoost = AdaBoost.fit(X_train, y_train)

    #II.Train a Calibrated AdaBoost ensemble
    AdaBoostCal = CalibratedAdaMEC.trainCalibratedAdaMEC(
        base_estimator, algorithm, n_estimators, calibration_method,
        cal_set_prcnt, X_train, y_train)

    ## Generate predictions
    #I.AdaBoost predictions and scores
    scores_AdaBoost = AdaBoost.predict_proba(X_test)[:,
                                                     1]  #Positive Class scores
    y_pred_AdaBoost = np.zeros(X_test.shape[0])
    y_pred_AdaBoost[np.where(
        scores_AdaBoost > 0.5
    )] = 1  #Classifications, the standard AdaBoost decision rule corresponds to a threshold of 0.5 (skew-insensitive)

    #II.Calibrated AdaMEC predictions and scores
    y_pred_CalibratedAdaMEC, scores_CalibratedAdaMEC = CalibratedAdaMEC.predictCalibratedAdaMEC(
        AdaBoostCal, C_FP_effective, X_test)

    ##Print results: comment/uncomment to your liking!

    # #Confusion matrices
    #print('AdaBoost Confusion Matrix:')
    #conf_mat_AdaBoost = metrics.confusion_matrix(y_test, y_pred_AdaBoost)
    #print(conf_mat_AdaBoost)
    #print('Calibrated AdaMEC Confusion Matrix:')
    #conf_mat_CalibratedAdaMEC = metrics.confusion_matrix(y_test, y_pred_CalibratedAdaMEC)
    #print(conf_mat_CalibratedAdaMEC)

    # #Accuracy (higher means better *skew-insensitive* classification).
    #            Note: Not a good measure for *skew-sensitive* learning.
    #print('Accuracy:')
    #print('\t\t\tAdaBoost: {0}'.format(metrics.accuracy_score(y_test, y_pred_AdaBoost)))
    #print('\t\t\tCalibrated AdaMEC: {0}'.format(metrics.accuracy_score(y_test, y_pred_CalibratedAdaMEC)))

    #Brier Score (lower means better probability estimates)
    print('Brier Score:')
    print('\t\t\tAdaBoost: {0}'.format(
        metrics.brier_score_loss(y_test, scores_AdaBoost)))
    print('\t\t\tCalibrated AdaMEC: {0}'.format(
        metrics.brier_score_loss(y_test, scores_CalibratedAdaMEC)))

    #Negative Log-likelihood (lower means better probability estimates)
    print('Negative Log-likelihood:')
    print('\t\t\tAdaBoost: {0}'.format(
        metrics.log_loss(y_test, scores_AdaBoost)))
    print('\t\t\tCalibrated AdaMEC: {0}'.format(
        metrics.log_loss(y_test, scores_CalibratedAdaMEC)))

    #Misclassification Cost (lower means better skew-sensitive classification)
    print('Misclassification Cost:')
    conf_mat_AdaBoost = metrics.confusion_matrix(
        y_test, y_pred_AdaBoost)  #Confusion matrix
    cost_AdaBoost = conf_mat_AdaBoost[
        0, 1] * C_FP_effective + conf_mat_AdaBoost[1, 0] * (
            1 - C_FP_effective)  #Skew-Sensitive Cost
    print('\t\t\tAdaBoost: {0}'.format(cost_AdaBoost))
    conf_mat_CalibratedAdaMEC = metrics.confusion_matrix(
        y_test, y_pred_CalibratedAdaMEC)  #Confusion matrix
    cost_AdaMEC = conf_mat_CalibratedAdaMEC[
        0, 1] * C_FP_effective + conf_mat_CalibratedAdaMEC[1, 0] * (
            1 - C_FP_effective)  #Skew-Sensitive Cost
    print('\t\t\tCalibrated AdaMEC: {0}'.format(cost_AdaMEC))
    if cost_AdaBoost > cost_AdaMEC:
        print('Calibrated AdaMEC outperformed AdaBoost!')
    else:
        print('AdaBoost produced a lower cost solution this time. Try again.')
        print('Calibrated AdaMEC should lead to lower cost in expectation.')
Example #42
                         algorithm="SAMME",
                         n_estimators=number_of_estimators_all_attr,
                         learning_rate=rate_of_learning_all_attr)
print 'fitting bdt...'
ti = timer()
bdt.fit(X_train, y_train, sample_weight=w_train)
clf.fit(X_train, y_train, sample_weight=w_train)
tf = timer()
print 'bdt fit completed>>>>>>>>>>>>>>>>>>>>>>>'
print 'time taken for bdt fit1: ' + str(tf - ti) + 'sec'
#joblib.dump(bdt,'bdt.pkl')
#bdt = joblib.load('bdt.pkl')

#~~~~~~~~calculate the decision scores
#twoclass_output = bdt.decision_function(X_train)
all_probs = bdt.predict_proba(X_test)
dftt = df_test_orig.copy()
class_names = {0: "background", 1: "signal"}
classes = sorted(class_names.keys())
for cls in classes:
    dftt[class_names[cls]] = all_probs[:, cls]
sig = dftt[isSigL] == 1
bkg = dftt[isSigL] == 0

probs = dftt["signal"][sig].values
probb = dftt["signal"][bkg].values

es, eb = [], []
for c in np.arange(-1, 1, roc_resolution):
    es.append((float((probs > c).sum()) / probs.size))
    eb.append((float((probb > c).sum()) / probb.size))
def create_model(dataset):

    print("dataset : ", dataset)
    df = pd.read_csv('/home/farshid/Desktop/' + dataset, header=None)

    print('reading', dataset)
    df['label'] = df[df.shape[1] - 1]
    #
    df.drop([df.shape[1] - 2], axis=1, inplace=True)
    labelencoder = LabelEncoder()
    df['label'] = labelencoder.fit_transform(df['label'])
    #
    X = np.array(df.drop(['label'], axis=1))
    y = np.array(df['label'])

    number_of_clusters = 23
    sampler = RandomUnderSampler()
    normalization_object = Normalizer()
    X = normalization_object.fit_transform(X)
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    n_classes = 2

    for train_index, test_index in skf.split(X, y):
        X_train = X[train_index]
        X_test = X[test_index]

        y_train = y[train_index]
        y_test = y[test_index]

        break
    print('training', dataset)
    top_roc = 0

    depth_for_rus = 0
    split_for_rus = 0

    for depth in range(3, 20, 20):
        for split in range(3, 9, 20):

            classifier = AdaBoostClassifier(DecisionTreeClassifier(
                max_depth=depth, min_samples_split=split),
                                            n_estimators=100,
                                            learning_rate=1,
                                            algorithm='SAMME')

            X_train, y_train = sampler.fit_sample(X_train, y_train)

            classifier.fit(X_train, y_train)

            predictions = classifier.predict_proba(X_test)

            score = roc_auc_score(y_test, predictions[:, 1])

            if top_roc < score:
                top_roc = score

                tpr = dict()
                fpr = dict()
                roc = dict()
                for i in range(n_classes):
                    fpr[i], tpr[i], _ = roc_curve(y_test, predictions[:, i])
                    roc[i] = roc_auc_score(y_test, predictions[:, i])

    major_class = max(sampler.fit(X_train, y_train).stats_c_,
                      key=sampler.fit(X_train, y_train).stats_c_.get)

    major_class_X_train = []
    major_class_y_train = []
    minor_class_X_train = []
    minor_class_y_train = []

    for index in range(len(X_train)):
        if y_train[index] == major_class:
            major_class_X_train.append(X_train[index])
            major_class_y_train.append(y_train[index])
        else:
            minor_class_X_train.append(X_train[index])
            minor_class_y_train.append(y_train[index])

    # optimize for number of clusters here
    kmeans = KMeans(max_iter=200, n_jobs=4, n_clusters=number_of_clusters)
    kmeans.fit(major_class_X_train)

    # get the centroids of each of the clusters
    cluster_centroids = kmeans.cluster_centers_

    # get the points under each cluster
    points_under_each_cluster = {
        i: np.where(kmeans.labels_ == i)[0]
        for i in range(kmeans.n_clusters)
    }

    # accumulators for the under-sampled majority class (half of each cluster)
    X_train_major = np.empty((0, X_train.shape[1]))
    y_train_major = np.empty(0)
    for i in range(number_of_clusters):
        size = len(points_under_each_cluster[i])
        random_indexes = np.random.randint(low=0,
                                           high=size,
                                           size=int(size / 2))
        temp = points_under_each_cluster[i]
        feature_indexes = temp[random_indexes]
        X_train_major = np.concatenate(
            (X_train_major, X_train[feature_indexes]), axis=0)
        y_train_major = np.concatenate(
            (y_train_major, y_train[feature_indexes]), axis=0)

    final_train_x = np.concatenate((X_train_major, minor_class_X_train),
                                   axis=0)
    final_train_y = np.concatenate((y_train_major, minor_class_y_train),
                                   axis=0)

    classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=150))
    # classifier = sklearn.svm.SVC(C=50 , gamma= .0008 , kernel='rbf', probability=True)
    # classifier = sklearn.svm.SVC(C=100, gamma=.006, kernel='rbf', probability=True)

    classifier.fit(final_train_x, final_train_y)

    predicted = classifier.predict_proba(X_test)

    tpr_c = dict()
    fpr_c = dict()
    roc_c = dict()
    for i in range(n_classes):
        fpr_c[i], tpr_c[i], _ = roc_curve(y_test, predicted[:, i])
        roc_c[i] = roc_auc_score(y_test, predicted[:, i])

    print('ploting', dataset)
    #    plt.clf()
    plt.plot(fpr[1],
             tpr[1],
             lw=2,
             color='red',
             label='Roc curve: Clustered sampling')

    plt.plot(fpr_c[1],
             tpr_c[1],
             lw=2,
             color='navy',
             label='Roc curve: random under sampling')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Area under ROC curve')
    plt.legend(loc="lower right")
    plt.show()
Example #44
    for index in sortedIndicies.tolist():
        if classLabels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            ySum += cursor[1]
        # draw line from cursor to (cursor[0]-delX,cursor[1]-delY)
        ax.plot([cursor[0], cursor[0] - delX], [cursor[1], cursor[1] - delY],
                c='b')
        cursor = (cursor[0] - delX, cursor[1] - delY)
    ax.plot([0, 1], [0, 1], 'b--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    ax.axis([0, 1, 0, 1])
    plt.show()
    # Sum the small rectangles: each has width xStep, so summing their heights gives ySum
    print("the Area Under the Curve is: ", ySum * xStep)


if __name__ == "__main__":
    X, y = make_hastie_10_2(n_samples=4000, random_state=1)
    X_test, y_test = X[2000:], y[2000:]
    X_train, y_train = X[:2000], y[:2000]
    clf = AdaBoostClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    preds = clf.predict_proba(X_test)
    plotROC(preds[:, 1], y_test)
Example #45
0
#NOTE: change classifier here
clf = AdaBoostClassifier(n_estimators=500, algorithm='SAMME')

#training
st = time.time()
print "training started"
clf.fit(x_train, y_train)
print "training ended"
et = time.time()
tt = et - st
print "Training Time = " + str(tt) + "\n"

#predictions
pred = clf.predict(x_test)
#NOTE: change to decision_function or predict_proba depending on the classifier
y_score = clf.predict_proba(x_test)
#y_score = clf.decision_function(x_test)

#################################################################################
pp = PdfPages('results/EXP_Result.pdf')
#PrecisionRecall-plot
precision = dict()
recall = dict()
PR_area = dict()
PR_thresholds = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], PR_thresholds[i] = precision_recall_curve(
        y_test[:, i], y_score[:, i])
    PR_area[i] = auc(recall[i], precision[i])
    average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])
Example #46
0
plt.ylabel('Test Accuracy%')
plt.xlabel('n_estimators')
plt.show()

# ROC curve for baseline classification tree
clf_probs=clf.predict_proba(wine_test.loc[:,['price','regn_enc','var_enc', \
    'wnry_enc']])
fpr1,tpr1,thr1=roc_curve(np.where(wine_test['point_bins']=='90+',1.,0.), \
    clf_probs[:,0])
# ROC curve for bagging ensemble using full classification trees
bag_probs=baglfy.predict_proba(wine_test.loc[:,['price','regn_enc', \
    'var_enc','wnry_enc']])
fpr2,tpr2,thr2=roc_curve(np.where(wine_test['point_bins']=='90+',1.,0.), \
    bag_probs[:,0])
# ROC curve for boosting ensemble using full classification trees
bst_probs=bstlfy.predict_proba(wine_test.loc[:,['price','regn_enc', \
    'var_enc','wnry_enc']])
fpr3,tpr3,thr3=roc_curve(np.where(wine_test['point_bins']=='90+',1.,0.), \
    bst_probs[:,0])

# Plot ROC Curves
plt.plot(fpr1,tpr1,color='#4d4d33',label='Baseline CART')
plt.plot(fpr2,tpr2,color='#0080ff',label='Bagging Ensemble')
plt.plot(fpr3,tpr3,color='#ff3300',label='Boosting Ensemble')
plt.plot([0.,1.],[0.,1.],color='k',linestyle='--')
plt.title('ROC Curves for 90+ Point Wine Classification')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(fontsize=8)
plt.show()
Example #47
0
y_pred_dtc = dtc.predict(X_test_prepared0)
print("dtc percentage: ", 100 * np.sum(y_pred_dtc == Val_y) / len(Val_y))

y_score_sgd = f1_score(Val_y, y_pred_sgd)
print("sgd f1 score: ", y_score_sgd)
y_score_gbc = f1_score(Val_y, y_pred_gbc)
print("gbc f1 score: ", y_score_gbc)
y_score_adb = f1_score(Val_y, y_pred_adb)
print("adb f1 score: ", y_score_adb)
y_score_dtc = f1_score(Val_y, y_pred_dtc)
print("dtc f1 score: ", y_score_dtc)

auc_sgd = roc_auc_score(Val_y, y_pred_sgd)
auc_gbc = roc_auc_score(Val_y, y_pred_gbc)
auc_adb = roc_auc_score(Val_y, y_pred_adb)
auc_dtc = roc_auc_score(Val_y, y_pred_dtc)

print("sgd auc: ", auc_sgd)
print("gbc auc: ", auc_gbc)
print("adb auc: ", auc_adb)
print("dtc auc: ", auc_dtc)

# y_sgd_predict = sgd.predict_proba(X_test_prepared)
y_gbc_predict = gbc.predict_proba(X_test_prepared)
y_adb_predict = adb.predict_proba(X_test_prepared)
y_dtc_predict = dtc.predict_proba(X_test_prepared)

np.save("y_gbc_predict", y_gbc_predict)
np.save("y_adb_predict", y_adb_predict)
np.save("y_dtc_predict", y_dtc_predict)
Example #48
0
#KNN
from sklearn.neighbors import KNeighborsClassifier

rf6 = KNeighborsClassifier()
rf6.fit(X_train, y_train)
y_val_pred6 = rf6.predict_proba(X_val)
y_val_pred_acc6 = rf6.predict(X_val)
print(log_loss(y_val, y_val_pred6))
print(accuracy_score(y_val, y_val_pred_acc6))

#AdaBoost
from sklearn.ensemble import AdaBoostClassifier

rf7 = AdaBoostClassifier(n_estimators=250)
rf7.fit(X_train, y_train)
y_val_pred7 = rf7.predict_proba(X_val)
y_val_pred_acc7 = rf7.predict(X_val)
print(log_loss(y_val, y_val_pred7))
print(accuracy_score(y_val, y_val_pred_acc7))

#Compare ROC of each Algorithm
import matplotlib.pyplot as plt
from sklearn import metrics
#RandomForest
fpr1, tpr1, threshold1 = metrics.roc_curve(y_val, y_val_pred1[:, 1])  # true labels vs. positive-class probability
roc_auc1 = metrics.auc(fpr1, tpr1)
plt.title('ROC of RandomForest')
plt.plot(fpr1, tpr1, 'b', label='AUC = %0.2f' % roc_auc1)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
Example #49
0
def main(Testbatch, DocumentName):
    start_time = time.time()
    Indexlist = pd.read_csv('Index.csv', sep=";", header=None)
    Indexlist.columns = ['Index', 'document']
    
    Dataset = 'ENRON' #'TREC'
    catR = 'Fraud' #'spam'
    catNR = 'Legit' #'ham'     
    
    #%%
    NrMails = 1640 #75000/5
    SavePer = 40 #100
    
    Traininglist = pd.read_csv('Traininglist'+str(Dataset),sep = '\t', index_col = 0)
    wordselection =pd.read_csv('wordselection-'+str(Dataset)+str(DocumentName), sep = '\t', index_col = 0, names=['Words'])
    
    #%%
    # =============================================================================
    # start of training ADABOOST
    # =============================================================================
    Training = pd.DataFrame(0, columns = wordselection, index = [], dtype = 'uint32')
    y = pd.DataFrame(0, columns = [], index = [], dtype = 'uint32')
    Batches = list(Traininglist.columns.values)
    Batches.remove(Testbatch)
    
    for batch in Batches:
        print(batch)
        for Files in range(int(SavePer),int(NrMails+SavePer),int(SavePer)):
            TrainingFile = pd.DataFrame() 
            TrainingFile = pd.read_csv('Frequencies'+str(Dataset)+batch+'-'+str(Files),sep = '\t', index_col = 0)
            y = pd.concat([y, TrainingFile['Index_given']], sort=False, ignore_index=True)
            Training = pd.concat([Training, TrainingFile], sort=False, ignore_index=True)
            Training = Training[wordselection]
            Training = Training.fillna(0).to_sparse(fill_value=0)
            print(round(Files/float(NrMails)*100,4), '%')
    
    del TrainingFile
    
    Training.to_csv('Training'+str(Dataset)+str(DocumentName),sep='\t')
    
    train = AdaBoostClassifier(n_estimators = 100, random_state=0).fit(Training, y)
    
   
    #%%
    # =============================================================================
    # Applying ADABOOST to test data
    # =============================================================================
    start_time2 = time.time()
    TrainingWords = list(Training.columns.values)
    Test = pd.DataFrame(0, columns = TrainingWords, index = [], dtype = 'uint32')
    
    Traininglist = pd.read_csv('Traininglist'+str(Dataset),sep = '\t', index_col = 0)
    Batches = list(Traininglist.columns.values)
    Batches.remove(Testbatch)
    
    ProbSpam = list()
    ProbHam = list()
    Given_y = list()
    Predicted_y = list()
    for Files in range(int(SavePer),int(NrMails+SavePer),int(SavePer)):
        Test = pd.DataFrame(0, columns = TrainingWords, index = [], dtype = 'uint32')
        TestFile = pd.read_csv('Frequencies'+str(Dataset)+Testbatch+'-'+str(Files),sep = '\t', index_col=0)  # read the held-out Testbatch files
        y = TestFile['Index_given']
        del TestFile['Index_given']
        
        Test = Test.merge(TestFile, how='outer')
        for word in list(set(Test.columns.values)-set(TrainingWords)):
            del Test[word]
                    
        TestNew = Test.fillna(0).to_sparse(fill_value=0)
          
        pred = train.predict(TestNew)
        proba = train.predict_proba(TestNew)
    
        for i in range(0,len(Test)):
            ProbSpam.append(proba[i][0])
            ProbHam.append(proba[i][1])
            Given_y.append(y[i])
            Predicted_y.append(pred[i])
        
        print(round(Files/float(NrMails)*100,4), '%')
        
    Result = pd.DataFrame(0, columns = ["Given_Label", "Predicted_Label", "ProbSpam", "ProbHam", "expSpam", "expHam"], index = [], dtype = 'uint32')
    Result["Given_Label"] =  Given_y
    Result["Predicted_Label"] = Predicted_y
    Result["ProbSpam"] = ProbSpam
    Result["ProbHam"] = ProbHam   
    Result["expSpam"] = 0
    Result["expHam"] = 0  
        
    Result.to_csv('Result'+str(Dataset)+'-'+str(DocumentName),sep='\t')
    
    Timings = pd.DataFrame(columns = ['Description','time'])
    Timings = Timings.append(pd.Series({'Description':'Script', 'time': time.time()-start_time}), ignore_index=True)
    Timings = Timings.append(pd.Series({'Description':'Training', 'time': start_time2-start_time}), ignore_index=True)
    Timings = Timings.append(pd.Series({'Description':'Classification', 'time': time.time()- start_time2}), ignore_index=True)
    Timings.to_csv('Timings'+str(Dataset)+'-'+str(DocumentName),sep='\t')
Example #50
0
pred = rf_model.predict_proba(X_train[training_vars])
print('RF train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred = rf_model.predict_proba(X_test[training_vars])
print('RF test roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))


# #### Adaboost

# In[283]:


ada_model = AdaBoostClassifier()
ada_model.fit(X_train[training_vars], y_train)

pred = ada_model.predict_proba(X_train[training_vars])
print('Adaboost train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred = ada_model.predict_proba(X_test[training_vars])
print('Adaboost test roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))


# #### Logistic Regression

# In[4]:


logit_model = LogisticRegression()
logit_model.fit(scaler.transform(X_train[training_vars]), y_train)

pred = logit_model.predict_proba(scaler.transform(X_train[training_vars]))
print('Logit train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
Example #51
0
def adaBoostClassifier(xTrain, yTrain, xTest):
    adaClassifier = AdaBoostClassifier()
    adaClassifier.fit(xTrain, yTrain)
    yPredict = adaClassifier.predict(xTest)
    probability = adaClassifier.predict_proba(xTest)
    return yPredict, probability
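# (Hedged usage sketch, not part of the original snippet: exercising the helper above
#  on synthetic data; make_classification / train_test_split and the import below are
#  assumptions, since the snippet's own imports are not shown.)
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=500, random_state=0)
xTr, xTe, yTr, yTe = train_test_split(X_demo, y_demo, random_state=0)
yPred_demo, prob_demo = adaBoostClassifier(xTr, yTr, xTe)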
Example #52
0
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_moons, make_circles, make_classification
# load the training data
#X, y = make_circles(noise=0.2, factor=0.5, random_state=1)
X, y = make_moons(noise=0.1, random_state=1)
# define the AdaBoost classifier
adb = AdaBoostClassifier()
# train the model
adb.fit(X, y)
# import plotting libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
# set the plot style
mpl.style.use('fivethirtyeight')
# build an x-y mesh grid for the contour plot
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
# predicted probabilities
Z = adb.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=.8)
# scatter plot of the samples
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
plt.title("AdaBoost")
plt.axis("equal")
plt.show()
##tfidf = feature_extraction.text.TfidfTransformer()
##train_data = tfidf.fit_transform(train_data).toarray()
##test_data = tfidf.transform(test_data).toarray()

print 'Training...'
forest = GradientBoostingClassifier(n_estimators=200, verbose=1,
                                    learning_rate = 0.2, max_depth=3)
forest2 = RandomForestClassifier(n_estimators = 400, verbose = 1,
                                 max_features = 13)
learner = AdaBoostClassifier(base_estimator = forest2, n_estimators = 50)
forest = forest.fit(train_data, y)
learner = learner.fit(train_data, y)

print 'Predicting...'
output1 = forest.predict_proba(test_data)
output2 = learner.predict_proba(test_data)

output = []
for t, row in enumerate(output1):
    tmp = np.vstack([output1[t], output2[t]])
    tmp = np.average(tmp, axis = 0)
    output.append(tmp)
output = np.array(output)

predictions_file = open("submission.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(['Id', 'Class_1', 'Class_2', 'Class_3',
                           'Class_4', 'Class_5', 'Class_6',
                           'Class_7', 'Class_8', 'Class_9'])

for t, row in enumerate(Ids):
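    # (Hedged guess at the truncated continuation, not in the original source:
    #  each row would write the Id followed by the nine averaged class probabilities.)
    # open_file_object.writerow([row] + list(output[t]))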
Example #54
0
# Gradient Boosting classifier
gbc = GradientBoostingClassifier(n_estimators = 200, learning_rate = 0.05, random_state=12)
gbc.fit(x_train,y_train)


# **Best solo performer from above is AdaBoost with AUROC of ~0.8612. A close second was the MLP, with AUROC of ~0.8574
# on the validation set.


predictions_LR_train = logreg.predict_proba(x_2)[:,1]
predictions_DT_train = dt.predict_proba(x_2)[:,1]
predictions_NN_train = nn.predict_proba(x_2)[:,1]
predictions_GBC_train = gbc.predict_proba(x_2)[:,1]
predictions_KNN_train = knn.predict_proba(x_2)[:,1]
predictions_RF_train = rf.predict_proba(x_2)[:,1]
predictions_AB_train = ab.predict_proba(x_2)[:,1]
predictions_GNB_train = gnb.predict_proba(x_2)[:,1]

# Reshape to get the arrays to work
predictions_LR_train = predictions_LR_train.reshape(-1, 1)
predictions_DT_train = predictions_DT_train.reshape(-1, 1)
predictions_NN_train = predictions_NN_train.reshape(-1, 1)
predictions_GBC_train = predictions_GBC_train.reshape(-1, 1)
predictions_KNN_train = predictions_KNN_train.reshape(-1, 1)
predictions_RF_train = predictions_RF_train.reshape(-1, 1)
predictions_AB_train = predictions_AB_train.reshape(-1, 1)
predictions_GNB_train = predictions_GNB_train.reshape(-1, 1)

# What to train the meta model on
next_x_train = np.concatenate((predictions_LR_train,predictions_DT_train, predictions_NN_train, predictions_KNN_train,
                               predictions_RF_train, predictions_AB_train, predictions_GNB_train,
Example #55
0
with open('resultAda.csv', 'w') as csvFile:
    writer = csv.writer(csvFile, delimiter=' ')
    writer.writerows(predict)
    dfAda = predict
csvFile.close()

uniciAda, counteggioAda = np.unique(dfAda, return_counts=True)

print(uniciAda, counteggioAda)
print("\nRilevanza attributi Ada")
for nameAda, scoreAda in zip(COLUMNS, classificatore.feature_importances_):
    print(nameAda, scoreAda)

# test the classifier on classes it already knows (with known feature arrays) to check its accuracy
predict_proba = classificatore.predict_proba(dataframe_training)
predict = np.array(predict)
classi_target = list(np.array(classi_target))

cnf_matrix = confusion_matrix(classi_target, predict)
print(' - Confusion Matrix -')
print(cnf_matrix)
print(' - Accuracy Score -', accuracy_score(classi_target, predict))
print(' - Report  -'), print(classification_report(classi_target, predict))

FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)

FP = FP.astype(float)
Example #56
0
print ("\nAdaBoost for Ensemble  - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf4_adabst_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nAdaBoost for Ensemble   - Train accuracy",round(accuracy_score(y_train,clf4_adabst_fit.predict(x_train)),3))
print ("\nAdaBoost for Ensemble   - Train Classification Report\n",classification_report(y_train,clf4_adabst_fit.predict(x_train)))

print ("\n\nAdaBoost for Ensemble   - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf4_adabst_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nAdaBoost for Ensemble   - Test accuracy",round(accuracy_score(y_test,clf4_adabst_fit.predict(x_test)),3))
print ("\nAdaBoost for Ensemble  - Test Classification Report\n",classification_report(y_test,clf4_adabst_fit.predict(x_test)))


ensemble = pd.DataFrame()

ensemble["log_output_one"] = pd.DataFrame(clf1_logreg_fit.predict_proba(x_train))[1]
ensemble["dtr_output_one"] = pd.DataFrame(clf2_dt_fit.predict_proba(x_train))[1]
ensemble["rf_output_one"] = pd.DataFrame(clf3_rf_fit.predict_proba(x_train))[1]
ensemble["adb_output_one"] = pd.DataFrame(clf4_adabst_fit.predict_proba(x_train))[1]

ensemble = pd.concat([ensemble,pd.DataFrame(y_train).reset_index(drop = True )],axis=1)

# Fitting meta-classifier
meta_logit_fit =  LogisticRegression(fit_intercept=False)
meta_logit_fit.fit(ensemble[['log_output_one','dtr_output_one','rf_output_one','adb_output_one']],ensemble['Attrition_ind'])

coefs =  meta_logit_fit.coef_
print ("Co-efficients for LR, DT, RF & AB are:",coefs)

ensemble_test = pd.DataFrame()
ensemble_test["log_output_one"] = pd.DataFrame(clf1_logreg_fit.predict_proba(x_test))[1]
ensemble_test["dtr_output_one"] = pd.DataFrame(clf2_dt_fit.predict_proba(x_test))[1]
ensemble_test["rf_output_one"] = pd.DataFrame(clf3_rf_fit.predict_proba(x_test))[1]
ensemble_test["adb_output_one"] = pd.DataFrame(clf4_adabst_fit.predict_proba(x_test))[1]
test_y_knn = knn_opt.predict_proba(test_x)[:, 0]  # class-0 probability; flipped to the positive class below

knn_out = submission
knn_out['target'] = test_y_knn

knn_out['target'] = 1 - knn_out['target']
knn_out.to_csv('knn_predictions1.csv', index=False, float_format='%.4f')

ada_opt = AdaBoostClassifier(algorithm='SAMME.R',
                             base_estimator=None,
                             learning_rate=1.0,
                             n_estimators=200,
                             random_state=None)

ada_opt.fit(train_x, train_y)
test_y_ada = ada_opt.predict_proba(test_x)[:, 0]  # class-0 probability; flipped to the positive class below

ada_out = submission
ada_out['target'] = test_y_ada
ada_out['target'] = 1 - ada_out['target']

ada_out.to_csv('ada_predictions1.csv', index=False, float_format='%.4f')

gb_opt = GradientBoostingClassifier(criterion='friedman_mse',
                                    init=None,
                                    learning_rate=0.1,
                                    loss='deviance',
                                    max_depth=3,
                                    max_features=None,
                                    max_leaf_nodes=None,
                                    min_impurity_split=None,
Example #58
0
def buildModel(X, y):
    # X = np.reshape(X,(X.shape[0],X.shape[1] * X.shape[2]))
    print X.shape, y.shape
    scaler = StandardScaler()
    print(scaler.fit(X))
    scaled_train_x = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(scaled_train_x,
                                                        y,
                                                        random_state=19,
                                                        test_size=0.3)

    bag = BalancedBaggingClassifier(n_estimators=200, random_state=19)
    svm = SVC(class_weight='balanced',
              random_state=19,
              decision_function_shape='ovr')
    neural = MLPClassifier(max_iter=500,
                           random_state=19,
                           solver='lbfgs',
                           alpha=1e-5,
                           hidden_layer_sizes=(49, 8, 4))
    ada = AdaBoostClassifier(n_estimators=100, random_state=19)
    logistic = LogisticRegression(solver='lbfgs', max_iter=500)

    bag.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    neural.fit(X_train, y_train)
    ada.fit(X_train, y_train)
    logistic.fit(X_train, y_train)

    # joblib.dump(bag,'bag.pkl')
    # joblib.dump(scaler,'scaler.pkl')

    y_pred = bag.predict(X_test)
    y_pred2 = svm.predict(X_test)
    y_pred3 = neural.predict(X_test)
    y_pred4 = ada.predict(X_test)
    y_pred5 = logistic.predict(X_test)

    print matthews_corrcoef(y_test, y_pred)
    print matthews_corrcoef(y_test, y_pred2)
    print matthews_corrcoef(y_test, y_pred3)
    print matthews_corrcoef(y_test, y_pred4)
    print matthews_corrcoef(y_test, y_pred5)

    print confusion_matrix(y_test, y_pred)
    print confusion_matrix(y_test, y_pred2)
    print confusion_matrix(y_test, y_pred3)
    print confusion_matrix(y_test, y_pred4)
    print confusion_matrix(y_test, y_pred5)

    print(classification_report_imbalanced(y_test, y_pred))
    print(classification_report_imbalanced(y_test, y_pred2))
    print(classification_report_imbalanced(y_test, y_pred3))
    print(classification_report_imbalanced(y_test, y_pred4))
    print(classification_report_imbalanced(y_test, y_pred5))

    probs_ada = ada.predict_proba(X_test)
    probs_bag = bag.predict_proba(X_test)
    probs_neural = neural.predict_proba(X_test)
    probs_logistic = logistic.predict_proba(X_test)
    probs_svm = svm.decision_function(X_test)

    ROCplot(probs_ada, y_test, "Plots/ROCplotADA-organelle.png")
    ROCplot(probs_logistic, y_test, "Plots/ROCplotLogistic-organelle.png")
    ROCplot(probs_bag, y_test, "Plots/ROCplotBAG-organelle.png")
    ROCplot(probs_neural, y_test, "Plots/ROCplotNeural-organelle.png")
    ROCplot(probs_svm, y_test, "Plots/ROCplotSVM-organelle.png")

    multiROCplot(
        [probs_ada, probs_logistic, probs_bag, probs_neural, probs_svm],
        y_test, "Plots/multiROCplot.png",
        ['AdaBoost', 'Logistic', 'Bagging Classifier', 'MLP', 'SVM'])
# The dataset is imbalanced, so we'll use the ROC AUC score as a metric instead of accuracy.

# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

# Instantiate dt
dt = DecisionTreeClassifier(max_depth=2, random_state=1)

# Instantiate ada
ada = AdaBoostClassifier(base_estimator=dt, n_estimators=180, random_state=1)

# Fit ada to the training set
ada.fit(X_train, y_train)

# Compute the probabilities of obtaining the positive class
y_pred_proba = ada.predict_proba(X_test)[:,1]

# Import roc_auc_score
from sklearn.metrics import roc_auc_score

# Evaluate test-set roc_auc_score
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print roc_auc_score
print('ROC AUC score: {:.2f}'.format(ada_roc_auc))
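# (Hedged aside, not in the original snippet: for comparison, the depth-2 base tree
#  can be scored the same way, which usually shows the lift gained from boosting.)
dt.fit(X_train, y_train)
dt_roc_auc = roc_auc_score(y_test, dt.predict_proba(X_test)[:, 1])
print('Baseline tree ROC AUC score: {:.2f}'.format(dt_roc_auc))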
Clf.fit(X_train, y_train)

pred = Clf.predict_proba(X_test)[:,1]

pd.DataFrame({"id": original_test["id"], "target": pred}).to_csv("RandomForest_submission.csv", index=False)

score=cross_validate(Clf, X_train, y_train, cv=3, scoring="roc_auc")["test_score"].mean()
print(f"{score:.6f}")

"""## AdaBoost Classifier"""

adaClf = AdaBoostClassifier()

adaClf.fit(X_train, y_train)

pred = adaClf.predict_proba(X_test)[:,1]

pd.DataFrame({"id": original_test["id"], "target": pred}).to_csv("adaboost_submission.csv", index=False)

score=cross_validate(adaClf, X_train, y_train, cv=3, scoring="roc_auc")["test_score"].mean()
print(f"{score:.6f}")

"""## GaussianProcessClassifier"""

clf = GaussianProcessClassifier(1**2 * RBF(length_scale=0.8))

clf.fit(X_train, y_train)

pred=clf.predict_proba(X_test)[:,1]

pd.DataFrame({"id": original_test["id"], "target": pred}).to_csv("GaussianProcess_submission.csv", index=False)