def ADA_Classifier(X_train, X_cv, X_test, Y_train, Y_cv, Y_test, Actual_DS):
    """Train an AdaBoost classifier and report CV accuracy and log-loss.

    Returns three DataFrames of class probabilities: for the CV split,
    the test split, and the actual (submission) dataset.

    NOTE(review): relies on module-level `label_enc`, `pd`, `time`,
    `AdaBoostClassifier` and `log_loss` being in scope.
    """
    print("***************Starting AdaBoost Classifier***************")
    t0 = time()
    clf = AdaBoostClassifier(n_estimators=300)
    clf.fit(X_train, Y_train)

    preds = clf.predict(X_cv)
    score = clf.score(X_cv, Y_cv)
    print("AdaBoost Classifier - {0:.2f}%".format(100 * score))

    # Confusion matrix plus the dominant-class percentage per actual label.
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv),
                          label_enc.inverse_transform(preds),
                          rownames=['actual'], colnames=['preds'])
    # BUGFIX: divide each row by its own row total. axis=0 aligns the
    # row-sum Series with the index; the original axis=1 aligned row labels
    # against column labels and produced NaN/garbage percentages.
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=0)).max(axis=1) * 100
    print(Summary)

    # Check with log loss function
    preds2 = clf.predict_proba(X_cv)
    ll_output2 = log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    preds4 = clf.predict_proba(Actual_DS)

    print("***************Ending AdaBoost Classifier***************")
    return pd.DataFrame(preds2), pd.DataFrame(preds3), pd.DataFrame(preds4)
def ab(train_data, train_label, val_data, val_label, test_data, name="adaboost_submission.csv"):
    # Train a default AdaBoostClassifier, report validation log-loss and
    # write test-set class probabilities to a submission file (Python 2).
    print "Start training AdaBoost..."
    abClf = AdaBoostClassifier()
    abClf.fit(train_data, train_label)
    #evaluate on validation set
    val_pred_label = abClf.predict_proba(val_data)
    logloss = preprocess.evaluation(val_label, val_pred_label)
    print "logloss of validation set:", logloss
    print "Start classify test set..."
    test_label = abClf.predict_proba(test_data)
    # `preprocess.saveResult` is project-local — presumably writes the CSV
    # named by `name`; verify against the preprocess module.
    preprocess.saveResult(test_label, filename=name)
def ab_predictedValue():
    # Fit AdaBoost on the module-level train_df/features and return the
    # positive-class probabilities for test_df (Python 2 syntax).
    # NOTE(review): the local result name shadows the function name.
    print '----------AdaBoost----------'
    ab_clf = AdaBoostClassifier(n_estimators = NoOfEstimators)
    ab_clf.fit(train_df[features], train_df['SeriousDlqin2yrs'])
    ab_predictedValue = ab_clf.predict_proba(test_df[features])
    print 'Feature Importance = %s' % ab_clf.feature_importances_
    # Column 1 = probability of the positive class.
    return ab_predictedValue[:, 1]
def do_all_study(X, y):
    """Plot learning/validation curves for several classifiers and print
    test-set ROC AUC for gradient boosting and AdaBoost.

    NOTE(review): parameters X and y are unused — the body reads the
    module-level X_train/X_test/y_train/y_test instead; confirm intent.
    """
    # NOTE(review): five names but four classifiers — zip stops early, so
    # "Naive Bayes" is never evaluated.
    names = [ "Decision Tree","Gradient Boosting", "Random Forest", "AdaBoost", "Naive Bayes"]
    classifiers = [
        #SVC(),
        DecisionTreeClassifier(max_depth=10),
        GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1),
        RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1),
        AdaBoostClassifier()]
    for name, clf in zip(names, classifiers):
        estimator, score = plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

    # Validation curve + held-out ROC AUC for gradient boosting.
    clf_GBC = GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1)
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20,40]
    plot_validation_curve(clf_GBC, X_train, y_train, param_name, param_range, scoring='roc_auc')
    clf_GBC.fit(X_train,y_train)
    y_pred_GBC = clf_GBC.predict_proba(X_test)[:,1]
    print("ROC AUC GradientBoostingClassifier: %0.4f" % roc_auc_score(y_test, y_pred_GBC))

    # Same for AdaBoost.
    clf_AB = AdaBoostClassifier()
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20,40]
    plot_validation_curve(clf_AB, X_train, y_train, param_name, param_range, scoring='roc_auc')
    clf_AB.fit(X_train,y_train)
    y_pred_AB = clf_AB.predict_proba(X_test)[:,1]
    print("ROC AUC AdaBoost: %0.4f" % roc_auc_score(y_test, y_pred_AB))
def training(baseclassparameters, adaparameters, queue):
    # Fit AdaBoost(decision tree) on the module-level Xtrain/ytrain, scan
    # 1000 probability cuts for the best S/sqrt(S+B) significance and put
    # (significance, params) on the queue (Python 2 syntax).
    treeclassifier = DecisionTreeClassifier(**baseclassparameters)
    adaclassifier = AdaBoostClassifier(treeclassifier, **adaparameters)
    print "\nBegin calculation with {0} and {1}".format(str(baseclassparameters), str(adaparameters))
    adaclassifier.fit(Xtrain, ytrain)
    #Predict with the model
    prob_predict_test = adaclassifier.predict_proba(Xtest)[:,1]
    #Calculate maximal significance
    True_Signal_test = prob_predict_test[ytest==1]
    True_Bkg_test = prob_predict_test[ytest==0]
    best_significance = 0
    for x in np.linspace(0, 1, 1000):
        S = float(len(True_Signal_test[True_Signal_test>x]))
        B = float(len(True_Bkg_test[True_Bkg_test>x]))
        # NOTE(review): 0/0 at very high cuts yields nan (numpy scalar
        # division), which simply never beats best_significance.
        significance = S/np.sqrt(S+B)
        if significance > best_significance:
            best_significance = significance
            best_x = x
            best_S = S
            best_B = B
    # NOTE(review): best_x/best_S/best_B are unbound if no cut improves 0.
    print "\nCalculation with {} and {} done ".format(str(baseclassparameters), str(adaparameters))
    print "Best significance of {0:.2f} archived when cutting at {1:.3f}".format(best_significance, best_x)
    print "Signal efficiency: {0:.2f}%".format(100.*best_S/len(True_Signal_test))
    print "Background efficiency: {0:.2f}%".format(100.*best_B/len(True_Bkg_test))
    print "Purity: {0:.2f}%".format(100.*best_S/(best_S+best_B))
    queue.put( (best_significance, baseclassparameters, adaparameters) )
def test_oneclass_adaboost_proba():
    # predict_proba must not break when the training labels contain only a
    # single class; every probability then collapses to 1 for that class.
    # Regression test for scikit-learn issue #7501:
    # https://github.com/scikit-learn/scikit-learn/issues/7501
    single_class_labels = np.ones(len(X))
    model = AdaBoostClassifier().fit(X, single_class_labels)
    expected = np.ones((len(X), 1))
    assert_array_almost_equal(model.predict_proba(X), expected)
def ada_prediction(features_train, labels_train, features_test, ids):
    # Fit AdaBoost over a RandomForest base estimator on the full training
    # data and write positive-class probabilities to a Kaggle-style CSV.
    # NOTE(review): the stratified split below is computed but never used.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(features_train, labels_train, random_state=1301, stratify=labels_train, test_size=0.3)
    clf = AdaBoostClassifier(RandomForestClassifier(bootstrap=True, criterion='entropy', max_depth=None, max_features=2, max_leaf_nodes=16, min_samples_split=10, n_estimators=1000, n_jobs=-1, oob_score=False),
                             algorithm="SAMME",
                             n_estimators=200)
    #clf_acc = clf.fit(X_train, y_train)
    # print(clf.best_estimator_)
    #feature_importance = clf.feature_importances_
    #print (feature_importance)
    #pred = clf_acc.predict_proba(X_test)[:,1]
    #print (y_test, pred)
    # acc = accuracy_score(y_test, pred)
    # print ("Acc {}".format(acc))
    clf = clf.fit(features_train, labels_train)
    pred = clf.predict_proba(features_test)[:,1]
    # NOTE(review): "wb" suits Python 2 csv; Python 3 needs "w", newline=''.
    predictions_file = open("data/canivel_ada_forest.csv", "wb")
    predictions_file_object = csv.writer(predictions_file)
    predictions_file_object.writerow(["ID", "TARGET"])
    predictions_file_object.writerows(zip(ids, pred))
    predictions_file.close()
def ada_boost_cv(x_train, y_train, cv, max_tree_depth, n_estimators, learning_rate):
    """Cross-validate a class-balanced tree AdaBoost and return its metrics.

    Returns (accuracy_positive, accuracy_negative, precision, recall,
    f1_score) computed from out-of-fold predictions on x_train/y_train.
    Assumes binary labels {0, 1}.
    """
    tree_classifier = DecisionTreeClassifier(max_depth=max_tree_depth,
                                             class_weight="balanced")
    ada_boost_classifier = AdaBoostClassifier(base_estimator=tree_classifier,
                                              n_estimators=n_estimators,
                                              learning_rate=learning_rate)
    # Out-of-fold hard predictions; cross_val_predict fits clones per fold.
    y_bar = cross_val_predict(estimator=ada_boost_classifier, X=x_train,
                              y=y_train, cv=cv, n_jobs=cv)
    # BUGFIX: cross_val_predict never fits `ada_boost_classifier` itself, so
    # calling predict_proba on it raised NotFittedError. Fit explicitly
    # before asking for training-set probabilities.
    ada_boost_classifier.fit(x_train, y_train)
    y_bar_proba = ada_boost_classifier.predict_proba(x_train)
    print(list(zip(y_bar, y_bar_proba)))

    # Per-class recall plus precision/recall/F1 for the positive class.
    cm = confusion_matrix(y_train, y_bar)
    accuracy_negative = cm[0, 0] / np.sum(cm[0, :])
    accuracy_positive = cm[1, 1] / np.sum(cm[1, :])
    precision = cm[1, 1] / (cm[1, 1] + cm[0, 1])
    recall = cm[1, 1] / (cm[1, 1] + cm[1, 0])
    f1_score = 2 * precision * recall / (precision + recall)
    return accuracy_positive, accuracy_negative, precision, recall, f1_score
def Adaboost(TrainData, TestData):
    # For increasing train-split fractions, fit AdaBoost over a decision
    # tree and print train/validation accuracy, precision, recall and
    # log-loss (Python 2 syntax; chunk is truncated mid-string at the end).
    features = ['Time','Season','Hour','Minute','District']
    clf = AdaBoostClassifier(tree.DecisionTreeClassifier(), n_estimators=30)
    size = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
    for i in range(0, len(size)):
        train, validation = train_test_split(TrainData, train_size=size[i])
        # Resample until both splits contain the same set of categories
        # (needed so the metric calls below see matching label sets).
        while len(set(train['Category'])) != len(set(validation['Category'])):
            train, validation = train_test_split(TrainData, train_size=size[i])
        clf = clf.fit(train[features], train['Category'])
        """stop = timeit.default_timer() print "Runnin time adaboost is ", stop-start"""
        predicted = np.array(clf.predict_proba(validation[features]))
        model = clf.predict(train[features])
        model1 = clf.predict(validation[features])
        #scores = cross_val_score(clf, validation[features], validation['Category'])
        #print "Scores mean is",scores.mean()
        #accuracy
        print "Training accuracy is", accuracy_score(train['Category'].values.tolist(), model)
        print "Validation accuracy is", accuracy_score(validation['Category'].values.tolist(), model1)
        print "Precision is ", precision_score(validation['Category'].values.tolist(), model1, average='macro')
        print "Recall is ", recall_score(validation['Category'].values.tolist(), model1, average='macro')
        print "Log loss is", log_loss(validation['Category'].values.tolist(), predicted, eps=1e-15, normalize=True, sample_weight=None)
        #writing to file
        """Category_new=[]
def test_staged_predict():
    """Check staged predictions."""
    # Classification: every staged output sequence must have one entry per
    # boosting round and converge to the final estimator's output.
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        clf.fit(iris.data, iris.target)

        predictions = clf.predict(iris.data)
        staged_predictions = list(clf.staged_predict(iris.data))
        proba = clf.predict_proba(iris.data)
        staged_probas = list(clf.staged_predict_proba(iris.data))
        score = clf.score(iris.data, iris.target)
        staged_scores = list(clf.staged_score(iris.data, iris.target))

        assert_equal(len(staged_predictions), 10)
        assert_array_almost_equal(predictions, staged_predictions[-1])
        assert_equal(len(staged_probas), 10)
        assert_array_almost_equal(proba, staged_probas[-1])
        assert_equal(len(staged_scores), 10)
        assert_array_almost_equal(score, staged_scores[-1])

    # Regression: same contract on the boston housing data.
    clf = AdaBoostRegressor(n_estimators=10)
    clf.fit(boston.data, boston.target)

    predictions = clf.predict(boston.data)
    staged_predictions = list(clf.staged_predict(boston.data))
    score = clf.score(boston.data, boston.target)
    staged_scores = list(clf.staged_score(boston.data, boston.target))

    assert_equal(len(staged_predictions), 10)
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert_equal(len(staged_scores), 10)
    assert_array_almost_equal(score, staged_scores[-1])
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)
        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            # Keep the SAMME model and its probabilities for the
            # regression check after the loop.
            clf_samme = clf
            prob_samme = proba
        # One probability / decision column per class.
        assert_equal(proba.shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))
        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
def adaboost(X, training_target, Y, est):
    """Fit AdaBoost with `est` estimators on (X, training_target) and
    return the class-probability matrix predicted for Y.

    BUGFIX: the original computed the probabilities and silently discarded
    them (implicit None return); they are now returned. Callers that
    ignored the previous return value are unaffected.
    """
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators=est)
    clf.fit(X, training_target)
    proba = clf.predict_proba(Y)
    return proba
def train(xTrain, yTrain, metric):
    # Fit a default AdaBoost model into the module-level `boost` and publish
    # the training-set positive-class probabilities via i.setSuccess
    # (Python 2 syntax).
    print 'adaboost'
    global boost
    boost = AdaBoostClassifier()
    boost.fit(xTrain, yTrain)
    global trainResults
    # Column 1 = probability of the positive class.
    trainResults = boost.predict_proba(xTrain)[:, 1]
    # `i` is a module-level object — presumably a results collector; verify.
    i.setSuccess(trainResults, metric)
def test_classification_toy():
    """Both boosting variants must fit and predict the toy problem exactly."""
    for algorithm in ('SAMME', 'SAMME.R'):
        model = AdaBoostClassifier(algorithm=algorithm, random_state=0)
        model.fit(X, y_class)
        # Exact label recovery on the toy test points.
        assert_array_equal(model.predict(T), y_t_class)
        # Learned classes match the distinct labels of the data.
        assert_array_equal(np.unique(np.asarray(y_t_class)), model.classes_)
        # Probability matrix is (n_samples, 2); decision values are 1-D.
        assert_equal(model.predict_proba(T).shape, (len(T), 2))
        assert_equal(model.decision_function(T).shape, (len(T),))
def main():
    # Train AdaBoost on train.csv and write positive-class probabilities
    # for test.csv to a submission file indexed by ID.
    X, Y = load('train.csv')
    adaboost = AdaBoostClassifier(n_estimators=150, learning_rate=0.1)
    adaboost.fit(X, Y)
    X_test, ID = loadTest('test.csv')
    target = adaboost.predict_proba(X_test)
    df = pandas.DataFrame()
    # Column 1 = probability of the positive class.
    df['TARGET'] = target[:,1]
    df.index = pandas.Series(ID, name='ID')
    # NOTE(review): filename looks like a typo for 'submit.csv'; left
    # unchanged since downstream tooling may expect this exact name.
    df.to_csv('sumbit.csv')
def abClassifier(X_train, y_train, X_test, y_test, to_plot=False):
    # Grid over random_state only; fit AdaBoost (SAMME.R, lr=0.7, 50 rounds)
    # per seed and report via the project-level scores() helper (Python 2).
    params = { 'random_state': [None,0,1,2,3,4,5]}
    for param in ParameterGrid(params):
        print param
        clf = AdaBoostClassifier(algorithm='SAMME.R', n_estimators=50, learning_rate=0.7, **param)
        clf.fit(X_train, y_train)
        # auc_compute(y_test,clf.predict_proba(X_test)[:,1])
        predictions = clf.predict(X_test)
        scores(y_test, predictions, clf.predict_proba(X_test)[:,1], 'ab', to_plot=to_plot)
def classify_AdaBoost(train, test):
    """Fit a default AdaBoost classifier on the (features, labels) pair
    `train` and return the class-probability matrix for the test features."""
    from sklearn.ensemble import AdaBoostClassifier as ABC
    train_x, train_y = train
    model = ABC()
    model.fit(train_x, train_y)
    test_x, _test_y = test
    return model.predict_proba(test_x)
class ABClassifier(Model):
    """Adaptive Boosting classifier: an AdaBoost ensemble of depth-1
    decision stumps (discrete SAMME, 200 boosting rounds)."""

    def __init__(self):
        Model.__init__(self)
        # Stumps keep each weak learner deliberately shallow.
        self.model = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=1),
            algorithm="SAMME",
            n_estimators=200,
        )

    def predict(self, test):
        # Expose per-class probabilities rather than hard labels.
        return self.model.predict_proba(test)
def adaboost(X, y, train, valid):
    """Fit AdaBoost(100) on the `train` index set, report validation
    metrics, save the positive-class probabilities and return them."""
    from sklearn.ensemble import AdaBoostClassifier
    model = AdaBoostClassifier(n_estimators=100)
    model.fit(X[train], y[train])

    y_valid = y[valid]
    hard_pred = model.predict(X[valid])
    print(classification_report(y_valid, hard_pred))
    accuracy_score(y_valid, hard_pred)
    print("adaboost" + str(accuracy_score(y_valid, hard_pred)))

    # Probability of the positive class on the validation rows.
    prob_pred = model.predict_proba(X[valid])[:, 1]
    # NOTE: label text kept verbatim even though this is the AdaBoost score.
    print("extra tree randomForest roc_accuracy" + str(roc_auc_score(y_valid, prob_pred)))
    np.savetxt("y_ada.csv", prob_pred)
    return prob_pred
def test_multidimensional_X():
    """AdaBoost estimators must accept feature arrays with more than two
    dimensions (here 50 samples of 3x3 data)."""
    from sklearn.dummy import DummyClassifier, DummyRegressor

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3, 3)
    yc = rng.choice([0, 1], 50)
    yr = rng.randn(50)

    # Classification: fit, hard predictions and probabilities must all run.
    clf = AdaBoostClassifier(DummyClassifier(strategy='most_frequent'))
    clf.fit(X, yc)
    clf.predict(X)
    clf.predict_proba(X)

    # Regression: fit and predict must run as well.
    reg = AdaBoostRegressor(DummyRegressor())
    reg.fit(X, yr)
    reg.predict(X)
def run_adaboost(estimators_and_learn_rt):
    # Fit AdaBoost over decision stumps with (n_estimators, learning_rate)
    # taken from the 2-element argument, score the module-level validation
    # frame and return the ROC AUC (Python 2 syntax).
    print estimators_and_learn_rt[0]
    print estimators_and_learn_rt[1]
    clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1, max_features=features-1, splitter='best', min_samples_leaf=10),
                             n_estimators = int(estimators_and_learn_rt[0]),
                             learning_rate=estimators_and_learn_rt[1])
    clf.fit(train_features, train_outcome)
    # Positive-class probabilities stored on the module-level `validation`.
    validation['predictions_clf'] = clf.predict_proba(validation_for_p)[:,1]
    fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions_clf)
    auc_score = auc(fpr, tpr)
    return auc_score
def AdaBoost(X, Y, XTest, YTest):
    # Train AdaBoost (100 estimators, learning_rate=2), print train/test
    # timing, accuracy, precision and recall, then plot a ROC curve
    # (Python 2 syntax).
    print '-----------------------------------------------------'
    # param_grid = {'learning_rate': [0.1, 0.3, 0.6, 1, 3, 6, 10]}
    # tree_grid = GridSearchCV(AdaBoostClassifier(), param_grid)
    tree_grid = AdaBoostClassifier(n_estimators=100, learning_rate=2)
    tree_grid.fit(X, Y)
    # print("The best parameters are %s with a score of %0.2f"
    #       % (tree_grid.best_params_, tree_grid.best_score_))
    print "Computing training statistics"
    # Wall-clock time for training-set prediction.
    dtree_predict_time_training = time.time()
    Ypred_dtree_training = tree_grid.predict(X)
    dtree_predict_time_training = time.time() - dtree_predict_time_training
    dtree_accuracy_training = metrics.accuracy_score(Y, Ypred_dtree_training)
    dt_precision_training = metrics.precision_score(Y, Ypred_dtree_training, average='binary')
    dtree_recall_training = metrics.recall_score(Y, Ypred_dtree_training, average='binary')
    print "DT training prediction time: " + str(dtree_predict_time_training)
    print "DT training accuracy Score: " + str(dtree_accuracy_training)
    print "DT training precision Score: " + str(dt_precision_training)
    print "DT training recall Score: " + str(dtree_recall_training)
    print "Computing testing statistics"
    # Same statistics on the held-out test set.
    dtree_predict_time_test = time.time()
    Ypred_dtree_test = tree_grid.predict(XTest)
    dtree_predict_time_test = time.time() - dtree_predict_time_test
    dtree_accuracy_test = metrics.accuracy_score(YTest, Ypred_dtree_test)
    dt_precision_test = metrics.precision_score(YTest, Ypred_dtree_test, average='binary')
    dtree_recall_test = metrics.recall_score(YTest, Ypred_dtree_test, average='binary')
    print "DT test prediction time: " + str(dtree_predict_time_test)
    print "DT test accuracy Score: " + str(dtree_accuracy_test)
    print "DT test precision Score: " + str(dt_precision_test)
    print "DT test recall Score: " + str(dtree_recall_test)
    print "Creating ROC curve"
    y_true = YTest
    y_score = tree_grid.predict_proba(XTest)
    # ROC built from the probability of class 0 (pos_label=0).
    fprSVM, trpSVM, _ = metrics.roc_curve(y_true=y_true, y_score=y_score[:, 0], pos_label=0)
    plt.plot(fprSVM, trpSVM, 'c-', label='ADA')
def iterate(self, n_estimators_conf=[10], learning_rate_conf=[0.25]):
    # Grid over n_estimators x learning_rate: fit AB (AdaBoost alias) on
    # self.xtrain/ytrain and collect accuracies, multiclass log-loss and
    # classification reports into self.results (Python 2 syntax).
    # NOTE(review): mutable default args — harmless here (never mutated).
    print '-'*80
    print 'Running AdaBoost Iterations...'
    # performance by number of estimators and max depth
    results = []
    for ne in n_estimators_conf:
        for lr in learning_rate_conf:
            print 'Iteration: n_estimators=%s, learning_rate=%s' % (str(ne), str(lr))
            m = AB(n_estimators=ne, learning_rate=lr)
            m.fit(self.xtrain, self.ytrain)
            predtrain = m.predict(self.xtrain)
            predtest = m.predict(self.xtest)
            predprobatrain = m.predict_proba(self.xtrain)
            predprobatest = m.predict_proba(self.xtest)
            accuracytrain = metrics.accuracy_score(predtrain, self.ytrain)
            accuracytest = metrics.accuracy_score(predtest, self.ytest)
            # NOTE(review): named ks* but computed as multiclass log-loss.
            kstrain = multiclass_log_loss(self.ytrain, predprobatrain)
            kstest = multiclass_log_loss(self.ytest, predprobatest)
            cr = self.convert_cr(metrics.classification_report(self.ytest, predtest))
            results.append([ne, lr, accuracytrain, accuracytest, kstrain, kstest, cr])
    self.results = pd.DataFrame(results)
    self.results.columns = ['ne', 'lr', 'accuracy_train', 'accuracy_test', 'ks_train', 'ks_test', 'cr']
def ada_boost_predict(new_train_data, new_train_labels, test_data, test_labels, base_est = "tree", n = 50):
    """Fit an AdaBoost classifier and return class probabilities for test_data.

    base_est: "tree" boosts a depth-5 decision tree; any other value falls
        back to AdaBoost's default base estimator.
    n: number of boosting rounds.

    BUGFIX: previously any base_est other than "tree" left `base` undefined
    and raised NameError; it now cleanly falls back to the default.
    """
    # Create a classifier: AdaBoost classifier
    base = DecisionTreeClassifier(max_depth=5) if base_est == "tree" else None
    classifier = AdaBoostClassifier(base_estimator = base, n_estimators = n)

    # We learn the digits on the first half of the digits
    classifier.fit(new_train_data, new_train_labels)

    # Now predict the value of the digit on the second half:
    # (test_labels is unused by prediction; kept for interface compatibility)
    predicted = classifier.predict_proba(test_data)
    return predicted
def test_iris():
    """Check consistency on dataset iris."""
    expected_classes = np.unique(iris.target)
    for algorithm in ('SAMME', 'SAMME.R'):
        model = AdaBoostClassifier(algorithm=algorithm)
        model.fit(iris.data, iris.target)
        assert_array_equal(expected_classes, model.classes_)
        # One probability / decision column per class.
        assert_equal(model.predict_proba(iris.data).shape[1],
                     len(expected_classes))
        assert_equal(model.decision_function(iris.data).shape[1],
                     len(expected_classes))
        score = model.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (algorithm, score)
def main():
    # For every pair of training variables, train a small boosted-tree
    # classifier and overlay its ROC curve on a single plot saved as a PDF
    # (Python 2 syntax).
    Algorithm = 'CamKt12LCTopoSplitFilteredMu67SmallR0YCut9'
    print 'Loading training data ...'
    data_train = pd.read_csv(Algorithm+'merged.csv')
    # Random mask for a 50/50 train/validation split.
    r = np.random.rand(data_train.shape[0])
    #Set label and weight vectors - and drop any unwanted tranining one
    Y_train = data_train['label'].values[r<0.5]
    # W_train = data_train['weight'].values[r<0.9]
    Y_valid = data_train['label'].values[r>=0.5]
    # W_valid = data_train['weight'].values[r>=0.9]
    # data_train.drop('AKT10LCTRIM530_MassDropSplit', axis=1, inplace=True)
    varcombinations = itertools.combinations(data_train.columns.values[1:-1], 2)
    # Factorial / n-choose-k helpers used only to size the colour map.
    fac = lambda n: 1 if n < 2 else n * fac(n - 1)
    combos = lambda n, k: fac(n) / fac(k) / fac(n - k)
    colors = plt.get_cmap('jet')(np.linspace(0, 1.0, combos(len(data_train.columns.values[1:-1]), 2)))
    for varset, color in zip(varcombinations, colors):
        print list(varset)
        X_train = data_train[list(varset)].values[r<0.5]
        X_valid = data_train[list(varset)].values[r>=0.5]
        # Shallow tree boosted 8 times (DC/ABC are project-local aliases,
        # presumably DecisionTreeClassifier/AdaBoostClassifier — verify).
        dt = DC(max_depth=3, min_samples_leaf=0.05*len(X_train))
        abc = ABC(dt, algorithm='SAMME', n_estimators=8, learning_rate=0.5)
        print 'Training classifier with all the data..'
        abc.fit(X_train, Y_train)
        print 'Done.. Applying to validation sample and drawing ROC'
        prob_predict_valid = abc.predict_proba(X_valid)[:,1]
        Y_score = abc.decision_function(X_valid)
        fpr, tpr, _ = roc_curve(Y_valid, prob_predict_valid)
        labelstring = ' And '.join(var.replace('_','') for var in varset)
        print labelstring
        plt.plot(tpr, (1-fpr), label=labelstring, color=color)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('1- Background Efficiency')
    plt.xlabel('Signal Efficiency')
    plt.title(Algorithm+' ROC Curve')
    plt.legend(loc="lower left", prop={'size':6})
    plt.savefig(Algorithm+'rocmva.pdf')
def training(thistrainingfeatures, baseclassparameters, adaparameters):
    # Variant of training() that makes its own train/test split, scans for
    # the best-significance probability cut and prints details only when it
    # beats the module-level best_overall_significance (Python 2 syntax).
    treeclassifier = DecisionTreeClassifier(**baseclassparameters)
    adaclassifier = AdaBoostClassifier(treeclassifier, **adaparameters)
    #Split training and testdata
    Xtrain, Xtest, ytrain, ytest = train_test_split(thistrainingfeatures, label)
    #Cast pd.Series to arrays to apply mask later
    ytrain = np.asarray(ytrain)
    ytest = np.asarray(ytest)
    #print "\nBegin calculation with {0} and {1}".format(str(baseclassparameters), str(adaparameters))
    adaclassifier.fit(Xtrain, ytrain)
    #Predict with the model
    prob_predict_test = adaclassifier.predict_proba(Xtest)[:,1]
    #Calculate maximal significance
    True_Signal_test = prob_predict_test[ytest==1]
    True_Bkg_test = prob_predict_test[ytest==0]
    best_significance = 0
    for x in np.linspace(0, 1, 1000):
        S = float(len(True_Signal_test[True_Signal_test>x]))
        B = float(len(True_Bkg_test[True_Bkg_test>x]))
        significance = S/np.sqrt(S+B)
        if significance > best_significance:
            best_significance = significance
            best_x = x
            best_S = S
            best_B = B
    # NOTE(review): best_x/best_S/best_B may be unbound if nothing beats 0.
    if best_significance > best_overall_significance:
        print """\nCalculation with {0} and {1} done. Variables: {2} Best significance of {3:.2f} archived when cutting at {4:.3f} Signal efficiency: {5:.2f}% Background efficiency: {6:.2f}% Purity: {7:.2f}%""".format(
            str(baseclassparameters), str(adaparameters),
            str(list(thistrainingfeatures.columns)),
            best_significance, best_x,
            100.*best_S/len(True_Signal_test),
            100.*best_B/len(True_Bkg_test),
            100.*best_S/(best_S+best_B)
        )
        #Print feature importances
        for (feature, importance) in izip(thistrainingfeatures.columns, adaclassifier.feature_importances_):
            print "{0:45s}: {1:>10.2f}%".format(feature, importance*100.)
class AdaBoostPredictor(PredictorBase):
    """Thin wrapper exposing AdaBoost through the PredictorBase interface."""

    def __init__(self):
        # Default scikit-learn AdaBoost configuration.
        self.clf = AdaBoostClassifier()

    def fit(self, X_train, y_train):
        """Train the underlying classifier."""
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        """Return bundled class-probability predictions for X_test."""
        raw_probabilities = self.clf.predict_proba(X_test)
        return self.bundle_predictions(raw_probabilities)
class ADAClassifier(Classifier, ProbClassifier):
    """AdaBoost over depth-limited decision trees, exposing both hard
    classification and positive-class probability for single samples."""

    def __init__(self, maxTreeDepth=1, estimators=50, learningRate=1.):
        self.cl = AdaBoostClassifier(
            n_estimators=estimators,
            learning_rate=learningRate,
            base_estimator=DecisionTreeClassifier(max_depth=maxTreeDepth))

    def retrain(self, vectorFeature, vectorTarget):
        """Fit (or refit) the boosted ensemble from scratch."""
        self.cl.fit(vectorFeature, vectorTarget)

    def classify(self, vectorizedTest):
        """Return the predicted label of the first (only) test vector."""
        return self.cl.predict(vectorizedTest)[0]

    def getProb(self, vectorizedTest):
        """Return the positive-class probability of the first test vector."""
        return self.cl.predict_proba(vectorizedTest)[0][1]
def test(self):
    # Train AdaBoost over depth-6 trees on self.dataMat/labelMat and write
    # per-bidder positive-class probabilities (0.0 for ids absent from the
    # test set) to the module-level `output` file handle.
    X, y = self.dataMat, self.labelMat
    X_test = self.testData
    # NOTE(review): params targets the commented-out GBC and is unused.
    params = {'n_estimators': 1200, 'max_depth': 4, 'subsample': 0.5, 'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}
    #clf = GradientBoostingClassifier(**params)
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=6), algorithm="SAMME.R", n_estimators=280)
    clf.fit(X, y);
    y_pred = clf.predict(X_test);
    y_predprob = clf.predict_proba(X_test);
    output.write('bidder_id'+','+'prediction'+'\n')
    for i in range(0, len(self.totalid)):
        if self.totalid[i] in self.testid:
            # Known test bidder: emit its predicted probability.
            idx = self.testid.index(self.totalid[i])
            output.write(str(self.testid[idx])+','+str(y_predprob[idx][1])+'\n')
            #print str(self.testid[idx])+','+str(y_predprob[idx][1])
        else:
            #print str(self.totalid[idx])+','+str(0.0)
            output.write(str(self.totalid[i])+','+str(0.0)+'\n')
def test_sparse_classification():
    # Check classification with sparse input: every AdaBoost API call must
    # give (almost) identical results whether trained/evaluated on dense
    # arrays or on any scipy sparse matrix format.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            # Remember the container type seen at fit time so the test can
            # later assert sparsity was preserved through AdaBoost.
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sprase_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sprase_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse, y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix) for t in types])
# --- script fragment: AdaBoost F1 cross-validation, final fit, and setup
# for a repeated stratified K-fold loop. Relies on clf_ada/dt/X_train/
# y_train/cv/df_test defined earlier in the (unseen) script.
acc = cross_val_score(estimator = clf_ada, X = X_train, y = y_train, cv = cv, scoring='f1')
acc.mean(), acc.std()

# last step
clf_ada = AdaBoostClassifier(dt,
                             algorithm = 'SAMME',
                             n_estimators = 100,
                             learning_rate = 0.1,
                             random_state= 1337
                             )
clf_ada.fit(X_train, y_train)
y_pred = clf_ada.predict(X_test)
print(classification_report(y_test, y_pred))
# Re-use y_pred for the probability-based ROC AUC.
y_pred = clf_ada.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred)

#kf = KFold(n_splits = 5, random_state = 1337, shuffle = True)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1337)
# Create arrays and dataframes to store results
auc_preds = []
test_preds = np.zeros(df_test.shape[0])
n_fold = 0
for idx_train, idx_valid in rskf.split(X_train, y_train):
    train_x, train_y = X_train[idx_train], y_train[idx_train]
    valid_x, valid_y = X_train[idx_valid], y_train[idx_valid]
    # NOTE(review): the loop body continues beyond this chunk.
# --- script fragment: train a small AdaBoost model on a DataFrame, hash
# the model repr, audit accuracy/ROC AUC, and record session info.
df = pd.DataFrame(vals)
X = df.drop('Class', axis=1)
y = df.loc[:, 'Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=420)

#:# preprocessing

#:# model
model = AdaBoostClassifier(n_estimators=10, learning_rate=0.3)
model.fit(X_train, y_train)

#:# hash
#:# e37d46e1a5c0065376d1471f564f3ac7
# Fingerprint of the model's repr (hyperparameters), not of its weights.
md5 = hashlib.md5(str(model).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print(f'Accuracy: {model.score(X_test, y_test)}')
print(f'Area under ROC: {roc_auc_score(y_test, y_pred_proba)}')

#:# session info
# Persist interpreter + package versions alongside the audit output.
sessionInfo = {
    "python_version": python_version(),
    "library_versions": [str(d) for d in pkg_resources.working_set]
}
with open('sessionInfo.txt', 'w') as f:
    json.dump(sessionInfo, f, indent=4)
# --- script fragment: build one match's feature vector (rank/Elo and
# rounded per-stat differences between winner and loser, incl. surface-
# specific variants), fit `clf`, predict, and accumulate betting returns.
to_predict = [calendar[ii].lrank - calendar[ii].wrank, calendar[ii].welo-calendar[ii].lelo, calendar[ii].welosur-calendar[ii].lelosur, surf_into_num[calendar[ii].surface], \
    round(wins_percent(*winner) - wins_percent(*loser), 3), round(wins_per_surface(*winner) - wins_per_surface(*loser), 3), \
    round(av_first_serve(*winner) - av_first_serve(*loser), 3), round(av_first_serve_surface(*winner) - av_first_serve_surface(*loser), 3), \
    round(av_second_serve(*winner) - av_second_serve(*loser), 3), round(av_second_serve_surface(*winner) - av_second_serve_surface(*loser), 3), \
    round(av_first_return(*winner) - av_first_return(*loser), 3), round(av_first_return_surface(*winner) - av_first_return_surface(*loser), 3), \
    round(av_second_return(*winner) - av_second_return(*loser), 3), round(av_second_return_surface(*winner) - av_second_return_surface(*loser), 3), \
    round(av_aces(*winner) - av_aces(*loser), 3), round(av_aces_surface(*winner) - av_aces_surface(*loser), 3), \
    round(av_dfs(*winner) - av_dfs(*loser), 5), round(av_dfs_surface(*winner) - av_dfs_surface(*loser), 5), \
    round(av_bps(*winner) - av_bps(*loser), 3), round(av_bps_surface(*winner) - av_bps_surface(*loser), 3)]
# Refit on the accumulated X/Y, then predict this single match.
clf.fit(X, Y)
X_test = to_predict
y_test = [1]
prediction = clf.predict([to_predict])
proba = clf.predict_proba([to_predict])
# Bookmaker odds for winner/loser; stakes are 10% of each strategy's bank.
coeffs_test = [calendar[ii].cfw, calendar[ii].cfl]
q1 = 0.1*bank1
q2 = 0.1*bank2
q3 = 0.1*bank3
q4 = 0.1*bank4
'''print("Best params:", model.best_params_)
print("To predict:", to_predict)
print("Prediction:", prediction)
print("Probabilities:", proba)
print("testing:", "("+str(roi_1(prediction, q1, coeffs_test)), str(roi_2(prediction, q2, coeffs_test)), str(roi_3(prediction, q3, coeffs_test)), \
    str(roi_4(prediction, proba, q4, coeffs_test))+")")'''
profit1 += roi_1(prediction, q1, coeffs_test)
# --- script fragment: evaluate the previously trained `model` on the UCI
# Adult test split and plot its ROC curve. Printed labels are user-facing
# Chinese strings (accuracy / precision / recall / F1) and are untouched.
data_test = pd.read_csv('adult.test', header=None, skiprows=1, names=column_names)
# Integer-encode every column via pandas categorical codes.
for name in data_test.columns:
    data_test[name] = pd.Categorical(data_test[name]).codes
x_test = data_test[data_test.columns[:-1]]
y_test = data_test[data_test.columns[-1]]
y_test_pred = model.predict(x_test)
print('测试集准确率:', accuracy_score(y_test, y_test_pred))
print('\t测试集查准率:', precision_score(y_test, y_test_pred))
print('\t测试集召回率:', recall_score(y_test, y_test_pred))
print('\t测试集F1:', f1_score(y_test, y_test_pred))

y_test_proba = model.predict_proba(x_test)
# print y_test_proba
y_test_proba = y_test_proba[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_proba)
auc = metrics.auc(fpr, tpr)
print('AUC = ', auc)
# Or call roc_auc_score directly:
# print 'AUC = ', metrics.roc_auc_score(y_test, y_test_proba)

# Matplotlib setup for CJK glyphs, then the ROC plot.
mpl.rcParams['font.sans-serif'] = 'SimHei'
mpl.rcParams['axes.unicode_minus'] = False
plt.figure(facecolor='w')
plt.plot(fpr, tpr, 'r-', lw=2, alpha=0.8, label='AUC=%.3f' % auc)
plt.plot((0, 1), (0, 1), c='b', lw=1.5, ls='--', alpha=0.7)
plt.xlim((-0.01, 1.02))
plt.ylim((-0.01, 1.02))
# Draw a horizontal barplot of importances_sorted vImportancesSorted.plot(kind='barh', color='lightgreen') plt.title('Features Importances') plt.show() # METHOD V: Boosting # Ada Boosting vAdaBoostClassifier = AdaBoostClassifier(base_estimator=vDecisionTree, n_estimators=180, random_state=SEED) # Fit ada to the training set vAdaBoostClassifier.fit(vXTrain, vYTrain) # Compute the probabilities of obtaining the positive class vYPredProba = vAdaBoostClassifier.predict_proba(vXTest)[:, 1] vAdaROCAUC = roc_auc_score(vYTest, vYPredProba) print('ROC AUC score: {:.2f}'.format(vAdaROCAUC)) # Gradient Boosting vGradientBoostingClassifier = GradientBoostingClassifier(max_depth=4, n_estimators=180, random_state=SEED) vGradientBoostingClassifier.fit(vXTrain, vYTrain) vYPred = vGradientBoostingClassifier.predict(vXTest) vRMSE = MSE(vYTest, vYPred)**(1 / 2) print('Test set RMSE of Gradient Boosting Classifier: {:.2f}'.format(vRMSE)) # Stochastic Gradient Boosting vStochasticGradientBoostingClassifier = GradientBoostingClassifier( max_depth=4,
# (continuation of a classifier constructor call begun in an earlier chunk)
random_state=None)
# --- script fragment: fit, report train/test accuracy (Chinese runtime
# labels preserved), derive sensitivity/specificity, and plot the ROC.
classifier.fit(train_data, train_label)
tra_label = classifier.predict(train_data)  # predicted labels on the training set
tes_label = classifier.predict(test_data)  # predicted labels on the test set
print("训练集:", accuracy_score(train_label, tra_label))
print("测试集:", accuracy_score(test_label, tes_label))

# Confusion-matrix cells for the binary problem.
matrix = confusion_matrix(test_label, tes_label, labels=[0, 1])
TP = matrix[1, 1]
TN = matrix[0, 0]
FP = matrix[0, 1]
FN = matrix[1, 0]
sn = TP / (TP + FN)  # sensitivity (recall of class 1)
sp = TN / (TN + FP)  # specificity (recall of class 0)

# ROC from positive-class probabilities.
decision_score = classifier.predict_proba(test_data)
fprs, tprs, thresholds = roc_curve(test_label, decision_score[:, 1])
# plt.plot(fprs, tprs)
# plt.show()
roc_auc = auc(fprs, tprs)

plt.figure()
lw = 2
plt.plot(fprs, tprs, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED) # Instantiate a classification-tree 'dt' dt = DecisionTreeClassifier(max_depth=1, random_state=SEED) # Instantiate an AdaBoostClassifier 'adb_clf' adb_clf = AdaBoostClassifier(base_estimator=dt, n_estimators=100) # Fit adb_clf to the training set adb_clf.fit(X_train, y_train) # Predict the test set probabilities of positive class y_pred_proba = adb_clf.predict_proba(X_test)[:,1] # Evaluate test roc_auc score adb_clf_roc_auc_score = roc_auc_score(y_test, y_pred_proba) # Print adb_clf_roc_auc score print('ROC AUC score: {:.2f}'.format(adb_clf_roc_auc_score)) # Gradient Boosting (GB) # Gradient Boosted Trees - sequential correction of predecessor's errors # Does not tweak the weights of the training instances # Fit each predictor is trained using its predecessor's residual errors as labels # Gradient Boosted Trees - CART is used as a base learner # Important parameter - shrinkage - prediction of each tree is shrinked after multiplication by a learning rate, eta (0 to 1) # Similar to AdaBoost - trade-off between Eta and the number of estimators # Decreasing learning rate, needs to be compensated by increasing the number of estimators
print(classification_report(y_test, rf.predict(X_test))) #print('Accuracy of Random Forest Classifier on test set: {:.2f}'.format(rf.score(X_test, y_test)*100)) # Classification report for the optimised RF Regression rf.fit(X_train, y_train) rfp = rf.predict(X_test) #adaboost model ada = AdaBoostClassifier(n_estimators=100, random_state=0) ada.fit(X_train, y_train) print("AdaBoost accuracy is %2.2f" % accuracy_score(y_test, ada.predict(X_test))) ada_roc_auc = roc_auc_score(y_test, ada.predict(X_test)) print("AdaBoost AUC = %2.2f" % ada_roc_auc) ######probability leaves employee probsada = ada.predict_proba( X_test)[:, 1] # predict probabilities associated with the employee leaving adaProb_roc_auc = roc_auc_score( y_test, probsada) # calculate AUC score using test dataset print('AUC score: %.3f' % adaProb_roc_auc) print(classification_report(y_test, ada.predict(X_test))) #decision tree model dtree = tree.DecisionTreeClassifier(max_depth=3, class_weight="balanced", min_weight_fraction_leaf=0.01) dtree.fit(X_train, y_train) print("Decision Tree accuracy is %2.2f" % accuracy_score(y_test, dtree.predict(X_test))) dt_roc_auc = roc_auc_score(y_test, dtree.predict(X_test)) print("Decision Tree AUC = %2.2f" % dt_roc_auc)
def train_bdt_multiclass():
    """Train a 5-class boosted decision tree (signal vs. four backgrounds),
    print classification reports and weighted one-vs-rest ROC AUCs, and
    persist the model plus every data split with joblib.
    """
    print("Loading data...")
    # SMALL_DATA toggles a reduced dataset for quick iteration.
    if SMALL_DATA:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data_small()
    else:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data()

    print("Creating arrays...")
    # X = Features (i.e. the data)
    X = np.concatenate((signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn))
    # y = Labels 1..5: signal=1, 2nu=2, 214Bi=3, 208Tl=4, Radon=5
    y = np.concatenate((np.ones(signal.shape[0]),
                        np.full(bkg2nu.shape[0], 2),
                        np.full(bkg214Bi.shape[0], 3),
                        np.full(bkg208Tl.shape[0], 4),
                        np.full(bkgRn.shape[0], 5)))

    print("Splitting Data...")
    # NOTE(review): both splits are drawn independently from the full X/y
    # with different seeds, so (X_dev, X_eval) overlap (X_train, X_test) —
    # confirm this is intended.
    X_dev, X_eval, y_dev, y_eval = train_test_split(X, y, test_size=0.33, random_state=48)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    print("Creating classifier for DT")
    # Create classifiers: a constrained tree as the weak learner...
    dt = DecisionTreeClassifier(max_depth=12, min_samples_split=0.5, min_samples_leaf=400)
    print("Creating classifier for BDT")
    # ...boosted with discrete AdaBoost (SAMME supports multiclass).
    bdt = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=1200, learning_rate=0.5)

    print("Fitting BDT...")
    # Train the classifier - not using weights here as it is a multiclassifier
    fitted_tree = bdt.fit(X_train, y_train)

    print("Predicting on training data...")
    # Use the fitted tree to predict on training data and new test data
    y_predicted_train = bdt.predict(X_train)
    print("Predicting on test data...")
    y_predicted_test = bdt.predict(X_test)

    print(
        classification_report(
            y_train,
            y_predicted_train,
            target_names=["signal", "2nu", "214Bi", "208Tl", "Radon"]))
    # One-vs-rest, prevalence-weighted multiclass AUC from class probabilities.
    print("Area under ROC curve for training data: {0:.4f}".format(
        roc_auc_score(y_train, bdt.predict_proba(X_train), average="weighted", multi_class="ovr")))
    print(
        classification_report(
            y_test,
            y_predicted_test,
            target_names=["signal", "2nu", "214Bi", "208Tl", "Radon"]))
    print("Area under ROC curve for test data: {0:.4f}".format(
        roc_auc_score(y_test, bdt.predict_proba(X_test), average="weighted", multi_class="ovr")))

    plot_roc_curve(bdt, X_test, y_test)
    compare_train_test_multi(bdt, X_train, y_train, X_test, y_test)

    print("Saving classifier...")
    # Persist the model and all splits for later reuse / plotting.
    save_path = BASE_PATH + 'ml_calculated_data/multiClass/'
    dump(bdt, save_path + 'bdt_classifier.joblib')
    dump(fitted_tree, save_path + 'bdt_fitted_tree.joblib')
    dump(X_train, save_path + 'bdt_X_train.joblib')
    dump(X_test, save_path + 'bdt_X_test.joblib')
    dump(X_dev, save_path + 'bdt_X_dev.joblib')
    dump(X_eval, save_path + 'bdt_X_eval.joblib')
    dump(y_test, save_path + 'bdt_y_test.joblib')
    dump(y_train, save_path + 'bdt_y_train.joblib')
    dump(y_dev, save_path + 'bdt_y_dev.joblib')
    dump(y_eval, save_path + 'bdt_y_eval.joblib')
    print("Finished Training.")
def AdaMECvsAdaBoost(dataset, C_FP, C_FN, base_estimator, algorithm,
                     n_estimators, calibration_method, test_set_prcnt,
                     cal_set_prcnt):
    """Compare plain AdaBoost against Calibrated AdaMEC on a .mat dataset
    under asymmetric misclassification costs C_FP (false positive) and
    C_FN (false negative), printing Brier score, log-loss and
    skew-sensitive misclassification cost for both models.
    """
    ## Load data (note: Windows-style path separators)
    mat_contents = sio.loadmat(os.getcwd() + '\\Datasets\\' + dataset + '.mat')
    data = mat_contents['data']
    target = np.asarray([float(i) for i in mat_contents['labels'].ravel()])
    target[np.where(target != 1)] = 0  # One-vs-all if multiclass

    ## Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=test_set_prcnt)

    Pos = sum(
        y_train[np.where(y_train == 1)]
    )  #Number of positive training examples --estimate of prior of positive class
    Neg = len(
        y_train
    ) - Pos  #Number of negative training examples --estimate of prior of negative class
    # Positive skew: overall importance of a single positive example.
    C_FP_effective = C_FP * Neg / (C_FN * Pos + C_FP * Neg)
    #C_FN_effective = 1 - C_FP_effective   #Negative skew (overall importance of a single negative example)

    # Define weak learner.
    # SECURITY NOTE(review): eval() on the base_estimator string executes
    # arbitrary code — only call this function with trusted input.
    base_estimator = eval(base_estimator)

    ## Train ensembles
    #I.Train an AdaBoost ensemble (algorithm="SAMME" for discrete AdaBoost, algorithm="SAMME.R" for real AdaBoost)
    AdaBoost = AdaBoostClassifier(base_estimator, algorithm=algorithm, n_estimators=n_estimators)
    AdaBoost = AdaBoost.fit(X_train, y_train)
    #II.Train a Calibrated AdaBoost ensemble
    AdaBoostCal = CalibratedAdaMEC.trainCalibratedAdaMEC(
        base_estimator, algorithm, n_estimators, calibration_method,
        cal_set_prcnt, X_train, y_train)

    ## Generate predictions
    #I.AdaBoost predictions and scores
    scores_AdaBoost = AdaBoost.predict_proba(X_test)[:, 1]  #Positive Class scores
    y_pred_AdaBoost = np.zeros(X_test.shape[0])
    # The standard AdaBoost decision rule corresponds to a threshold of 0.5
    # (skew-insensitive).
    y_pred_AdaBoost[np.where(scores_AdaBoost > 0.5)] = 1
    #II.Calibrated AdaMEC predictions and scores
    y_pred_CalibratedAdaMEC, scores_CalibratedAdaMEC = CalibratedAdaMEC.predictCalibratedAdaMEC(
        AdaBoostCal, C_FP_effective, X_test)

    ##Print results: comment/uncomment to your liking!
    #
    #Confusion matrices
    #print('AdaBoost Confusion Matrix:')
    #conf_mat_AdaBoost = metrics.confusion_matrix(y_test, y_pred_AdaBoost)
    #print(conf_mat_AdaBoost)
    #print('Calibrated AdaMEC Confusion Matrix:')
    #conf_mat_CalibratedAdaMEC = metrics.confusion_matrix(y_test, y_pred_CalibratedAdaMEC)
    #print(conf_mat_CalibratedAdaMEC)
    #
    #Accuracy (higher means better *skew-insensitive* classification).
    # Note: Not a good measure for *skew-sensitive* learning.
    #print('Accuracy:')
    #print('\t\t\tAdaBoost: {0}'.format(metrics.accuracy_score(y_test, y_pred_AdaBoost)))
    #print('\t\t\tCalibrated AdaMEC: {0}'.format(metrics.accuracy_score(y_test, y_pred_CalibratedAdaMEC)))

    #Brier Score (lower means better probability estimates)
    print('Brier Score:')
    print('\t\t\tAdaBoost: {0}'.format(
        metrics.brier_score_loss(y_test, scores_AdaBoost)))
    print('\t\t\tCalibrated AdaMEC: {0}'.format(
        metrics.brier_score_loss(y_test, scores_CalibratedAdaMEC)))

    #Negative Log-likelihood (lower means better probability estimates)
    print('Negative Log-likelihood:')
    print('\t\t\tAdaBoost: {0}'.format(
        metrics.log_loss(y_test, scores_AdaBoost)))
    print('\t\t\tCalibrated AdaMEC: {0}'.format(
        metrics.log_loss(y_test, scores_CalibratedAdaMEC)))

    #Misclassification Cost (lower means better skew-sensitive classification)
    print('Misclassification Cost:')
    conf_mat_AdaBoost = metrics.confusion_matrix(
        y_test, y_pred_AdaBoost)  #Confusion matrix
    # Skew-sensitive cost: FP count weighted by the positive skew plus FN
    # count weighted by its complement.
    cost_AdaBoost = conf_mat_AdaBoost[
        0, 1] * C_FP_effective + conf_mat_AdaBoost[1, 0] * (
            1 - C_FP_effective)  #Skew-Sensitive Cost
    print('\t\t\tAdaBoost: {0}'.format(cost_AdaBoost))
    conf_mat_CalibratedAdaMEC = metrics.confusion_matrix(
        y_test, y_pred_CalibratedAdaMEC)  #Confusion matrix
    cost_AdaMEC = conf_mat_CalibratedAdaMEC[
        0, 1] * C_FP_effective + conf_mat_CalibratedAdaMEC[1, 0] * (
            1 - C_FP_effective)  #Skew-Sensitive Cost
    print('\t\t\tCalibrated AdaMEC: {0}'.format(cost_AdaMEC))

    if cost_AdaBoost > cost_AdaMEC:
        print('Calibrated AdaMEC outperformed AdaBoost!')
    else:
        print('AdaBoost produced a lower cost solution this time. Try again.')
        print('Calibrated AdaMEC should lead to lower cost in expectation.')
        # (fragment) keyword arguments closing an AdaBoostClassifier(...) call
        # whose opening line lies outside this excerpt.
        algorithm="SAMME",
        n_estimators=number_of_estimators_all_attr,
        learning_rate=rate_of_learning_all_attr)
print 'fitting bdt...'
ti = timer()
# Fit both ensembles with per-event sample weights.
bdt.fit(X_train, y_train, sample_weight=w_train)
clf.fit(X_train, y_train, sample_weight=w_train)
tf = timer()
print 'bdt fit completed>>>>>>>>>>>>>>>>>>>>>>>'
print 'time taken for bdt fit1: ' + str(tf - ti) + 'sec'
#joblib.dump(bdt,'bdt.pkl')
#bdt = joblib.load('bdt.pkl')
#~~~~~~~~calculate the decision scores
#twoclass_output = bdt.decision_function(X_train)
all_probs = bdt.predict_proba(X_test)
dftt = df_test_orig.copy()
class_names = {0: "background", 1: "signal"}
classes = sorted(class_names.keys())
# Attach one probability column per class to the test dataframe.
for cls in classes:
    dftt[class_names[cls]] = all_probs[:, cls]
# Split the signal-probability column by the true label.
sig = dftt[isSigL] == 1
bkg = dftt[isSigL] == 0
probs = dftt["signal"][sig].values
probb = dftt["signal"][bkg].values
# Scan thresholds to build signal efficiency (es) vs background efficiency
# (eb) for a ROC-style curve.
es, eb = [], []
for c in np.arange(-1, 1, roc_resolution):
    es.append((float((probs > c).sum()) / probs.size))
    eb.append((float((probb > c).sum()) / probb.size))
def create_model(dataset):
    """Train AdaBoost classifiers on *dataset* with two under-sampling
    strategies — random under-sampling vs. cluster-based under-sampling of
    the majority class — and plot both ROC curves.

    dataset: file name of a headerless CSV under /home/farshid/Desktop/
             whose last column is the class label.
    """
    print("dataset : ", dataset)
    df = pd.read_csv('/home/farshid/Desktop/' + dataset, header=None)
    print('reading', dataset)
    # Last column holds the label; encode it to integers.
    df['label'] = df[df.shape[1] - 1]
    # df.drop([df.shape[1] - 2], axis=1, inplace=True)
    labelencoder = LabelEncoder()
    df['label'] = labelencoder.fit_transform(df['label'])
    # NOTE(review): the original (un-encoded) last column is still among the
    # features — only the 'label' copy is dropped. Confirm this is intended.
    X = np.array(df.drop(['label'], axis=1))
    y = np.array(df['label'])

    number_of_clusters = 23
    sampler = RandomUnderSampler()
    normalization_object = Normalizer()
    X = normalization_object.fit_transform(X)

    # Take the first stratified fold as the train/test split.
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    n_classes = 2
    for train_index, test_index in skf.split(X, y):
        X_train = X[train_index]
        X_test = X[test_index]
        y_train = y[train_index]
        y_test = y[test_index]
        break

    print('training', dataset)

    # ---- Strategy 1: random under-sampling + boosted trees ----------------
    top_roc = 0
    depth_for_rus = 0
    split_for_rus = 0
    # NOTE(review): with step 20 each range yields exactly one value (3), so
    # only a single (depth, split) pair is searched.
    for depth in range(3, 20, 20):
        for split in range(3, 9, 20):
            classifier = AdaBoostClassifier(DecisionTreeClassifier(
                max_depth=depth, min_samples_split=split),
                                            n_estimators=100,
                                            learning_rate=1,
                                            algorithm='SAMME')
            X_train, y_train = sampler.fit_sample(X_train, y_train)
            classifier.fit(X_train, y_train)
            predictions = classifier.predict_proba(X_test)
            score = roc_auc_score(y_test, predictions[:, 1])
            if top_roc < score:
                top_roc = score
                tpr = dict()
                fpr = dict()
                roc = dict()
                for i in range(n_classes):
                    fpr[i], tpr[i], _ = roc_curve(y_test, predictions[:, i])
                    roc[i] = roc_auc_score(y_test, predictions[:, i])

    # ---- Strategy 2: cluster-based under-sampling of the majority class ---
    # BUGFIX: fit the sampler once instead of twice for the same statistics.
    stats = sampler.fit(X_train, y_train).stats_c_  # class -> sample count
    major_class = max(stats, key=stats.get)
    major_class_X_train = []
    major_class_y_train = []
    minor_class_X_train = []
    minor_class_y_train = []
    for index in range(len(X_train)):
        if y_train[index] == major_class:
            major_class_X_train.append(X_train[index])
            major_class_y_train.append(y_train[index])
        else:
            minor_class_X_train.append(X_train[index])
            minor_class_y_train.append(y_train[index])

    # optimize for number of clusters here
    kmeans = KMeans(max_iter=200, n_jobs=4, n_clusters=number_of_clusters)
    kmeans.fit(major_class_X_train)
    # get the centroids of each of the clusters
    cluster_centroids = kmeans.cluster_centers_
    # get the points under each cluster
    points_under_each_cluster = {
        i: np.where(kmeans.labels_ == i)[0]
        for i in range(kmeans.n_clusters)
    }

    # BUGFIX: start the accumulators from empty arrays — the originals were
    # used before being defined, raising NameError on the first concatenate.
    major_X = np.asarray(major_class_X_train)
    major_y = np.asarray(major_class_y_train)
    X_train_major = np.empty((0, X_train.shape[1]))
    y_train_major = np.empty((0,))
    for i in range(number_of_clusters):
        # Keep a random half of each majority-class cluster.
        size = len(points_under_each_cluster[i])
        random_indexes = np.random.randint(low=0, high=size, size=int(size / 2))
        temp = points_under_each_cluster[i]
        feature_indexes = temp[random_indexes]
        # BUGFIX: kmeans labels index the majority-class subset, not the full
        # training set, so sample from major_X/major_y.
        X_train_major = np.concatenate((X_train_major, major_X[feature_indexes]), axis=0)
        y_train_major = np.concatenate((y_train_major, major_y[feature_indexes]), axis=0)

    final_train_x = np.concatenate((X_train_major, minor_class_X_train), axis=0)
    final_train_y = np.concatenate((y_train_major, minor_class_y_train), axis=0)

    classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=150))
    # classifier = sklearn.svm.SVC(C=50 , gamma= .0008 , kernel='rbf', probability=True)
    # classifier = sklearn.svm.SVC(C=100, gamma=.006, kernel='rbf', probability=True)
    classifier.fit(final_train_x, final_train_y)
    predicted = classifier.predict_proba(X_test)

    tpr_c = dict()
    fpr_c = dict()
    roc_c = dict()
    for i in range(n_classes):
        # BUGFIX: score this model's own predictions ('predicted', not the
        # first model's 'predictions'), and use roc_auc_score — auc() expects
        # an (fpr, tpr) pair, not labels and scores.
        fpr_c[i], tpr_c[i], _ = roc_curve(y_test, predicted[:, i])
        roc_c[i] = roc_auc_score(y_test, predicted[:, i])

    print('ploting', dataset)
    # plt.clf()
    # BUGFIX: legend labels were swapped — fpr/tpr came from the random
    # under-sampling run and fpr_c/tpr_c from the clustered run.
    plt.plot(fpr[1], tpr[1], lw=2, color='red',
             label='Roc curve: random under sampling')
    plt.plot(fpr_c[1], tpr_c[1], lw=2, color='navy',
             label='Roc curve: Clustered sampling')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    # NOTE(review): axis labels say Recall/Precision but the plotted data is
    # an ROC (FPR vs TPR) — confirm the intended labels.
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Area under ROC curve')
    plt.legend(loc="lower right")
    plt.show()
# (fragment) tail of a plotROC(...) implementation — the function header,
# cursor initialisation, and step sizes are outside this excerpt.
for index in sortedIndicies.tolist():
    # A true label moves the cursor down the TP axis, a false one moves it
    # left along the FP axis.
    if classLabels[index] == 1.0:
        delX = 0
        delY = yStep
    else:
        delX = xStep
        delY = 0
        # Accumulate the rectangle heights for the AUC computation.
        ySum += cursor[1]
    # draw line from cursor to (cursor[0]-delX,cursor[1]-delY)
    ax.plot([cursor[0], cursor[0] - delX], [cursor[1], cursor[1] - delY], c='b')
    cursor = (cursor[0] - delX, cursor[1] - delY)
ax.plot([0, 1], [0, 1], 'b--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC cursorve for AdaBoost horse colic detection system')
ax.axis([0, 1, 0, 1])
plt.show()
# Sum of the small rectangles: each has width xStep, so summing the heights
# and multiplying by xStep gives the area under the curve.
print("the Area Under the cursorve is: ", ySum * xStep)

if __name__ == "__main__":
    # Demo on the Hastie synthetic dataset: first 2000 train, last 2000 test.
    X, y = make_hastie_10_2(n_samples=4000, random_state=1)
    X_test, y_test = X[2000:], y[2000:]
    X_train, y_train = X[:2000], y[:2000]
    clf = AdaBoostClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    preds = clf.predict_proba(X_test)
    plotROC(preds[:, 1], y_test)
#NOTE: change classifier here clf = AdaBoostClassifier(n_estimators=500, algorithm='SAMME') #training st = time.time() print "training started" clf.fit(x_train, y_train) print "training ended" et = time.time() tt = et - st print "Training Time = " + str(tt) + "\n" #predictions pred = clf.predict(x_test) #NOTE: change to decision_function or predict_proba depending on the classifier y_score = clf.predict_proba(x_test) #y_score = clf.decision_function(x_test) ################################################################################# pp = PdfPages('results/EXP_Result.pdf') #PrecisionRecall-plot precision = dict() recall = dict() PR_area = dict() PR_thresholds = dict() average_precision = dict() for i in range(n_classes): precision[i], recall[i], PR_thresholds[i] = precision_recall_curve( y_test[:, i], y_score[:, i]) PR_area[i] = auc(recall[i], precision[i]) average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])
# (fragment) tail of an accuracy-vs-n_estimators plot, then ROC comparison of
# baseline / bagging / boosting wine classifiers.
plt.ylabel('Test Accuracy%')
plt.xlabel('n_estimators')
plt.show()

# ROC curve for baseline classification tree
# NOTE(review): column 0 of predict_proba is used as the score for the
# positive ('90+') class — confirm the classifiers' class ordering.
clf_probs=clf.predict_proba(wine_test.loc[:,['price','regn_enc','var_enc', \
    'wnry_enc']])
fpr1,tpr1,thr1=roc_curve(np.where(wine_test['point_bins']=='90+',1.,0.), \
    clf_probs[:,0])
# ROC curve for bagging ensemble using full classification trees
bag_probs=baglfy.predict_proba(wine_test.loc[:,['price','regn_enc', \
    'var_enc','wnry_enc']])
fpr2,tpr2,thr2=roc_curve(np.where(wine_test['point_bins']=='90+',1.,0.), \
    bag_probs[:,0])
# ROC curve for boosting ensemble using full classification trees
bst_probs=bstlfy.predict_proba(wine_test.loc[:,['price','regn_enc', \
    'var_enc','wnry_enc']])
fpr3,tpr3,thr3=roc_curve(np.where(wine_test['point_bins']=='90+',1.,0.), \
    bst_probs[:,0])

# Plot ROC Curves (diagonal = random-guess reference)
plt.plot(fpr1,tpr1,color='#4d4d33',label='Baseline CART')
plt.plot(fpr2,tpr2,color='#0080ff',label='Bagging Ensemble')
plt.plot(fpr3,tpr3,color='#ff3300',label='Boosting Ensemble')
plt.plot([0.,1.],[0.,1.],color='k',linestyle='--')
plt.title('ROC Curves for 90+ Point Wine Classification')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(fontsize=8)
plt.show()
# (fragment) validation metrics for four fitted models (sgd/gbc/adb/dtc),
# then test-set probability exports saved as .npy files.
y_pred_dtc = dtc.predict(X_test_prepared0)
print("dtc percentage: ", 100 * np.sum(y_pred_dtc == Val_y) / len(Val_y))

# F1 scores against the validation labels.
y_score_sgd = f1_score(Val_y, y_pred_sgd)
print("sgd f1 score: ", y_score_sgd)
y_score_gbc = f1_score(Val_y, y_pred_gbc)
print("gbc f1 score: ", y_score_gbc)
y_score_adb = f1_score(Val_y, y_pred_adb)
print("adb f1 score: ", y_score_adb)
y_score_dtc = f1_score(Val_y, y_pred_dtc)
print("dtc f1 score: ", y_score_dtc)

# ROC AUC computed from hard predictions (not probabilities) — coarser than
# using predict_proba scores; presumably intentional here.
auc_sgd = roc_auc_score(Val_y, y_pred_sgd)
auc_gbc = roc_auc_score(Val_y, y_pred_gbc)
auc_adb = roc_auc_score(Val_y, y_pred_adb)
auc_dtc = roc_auc_score(Val_y, y_pred_dtc)
print("sgd auc: ", auc_sgd)
print("gbc auc: ", auc_gbc)
print("adb auc: ", auc_adb)
print("dtc auc: ", auc_dtc)

# Probability predictions on X_test_prepared (NOTE(review): a different
# matrix than X_test_prepared0 used above — confirm which is intended).
# y_sgd_predict = sgd.predict_proba(X_test_prepared)
y_gbc_predict = gbc.predict_proba(X_test_prepared)
y_adb_predict = adb.predict_proba(X_test_prepared)
y_dtc_predict = dtc.predict_proba(X_test_prepared)
np.save("y_gbc_predict", y_gbc_predict)
np.save("y_adb_predict", y_adb_predict)
np.save("y_dtc_predict", y_dtc_predict)
# --- (fragment) validation log-loss / accuracy for KNN and AdaBoost, then
# the start of a per-model ROC comparison. X_train/y_train/X_val/y_val and
# the model-1 variables are defined elsewhere in the file. ---
#KNN
from sklearn.neighbors import KNeighborsClassifier
rf6 = KNeighborsClassifier()
rf6.fit(X_train, y_train)
y_val_pred6 = rf6.predict_proba(X_val)   # class probabilities for log-loss
y_val_pred_acc6 = rf6.predict(X_val)     # hard labels for accuracy
print(log_loss(y_val, y_val_pred6))
print(accuracy_score(y_val, y_val_pred_acc6))

#AdaBoost
from sklearn.ensemble import AdaBoostClassifier
rf7 = AdaBoostClassifier(n_estimators=250)
rf7.fit(X_train, y_train)
# BUGFIX: store AdaBoost's probabilities in y_val_pred7 — the original wrote
# them into y_val_pred6 (clobbering KNN's) and then referenced the undefined
# name y_val_pred7 in log_loss, raising NameError.
y_val_pred7 = rf7.predict_proba(X_val)
y_val_pred_acc7 = rf7.predict(X_val)
print(log_loss(y_val, y_val_pred7))
print(accuracy_score(y_val, y_val_pred_acc7))

#Compare ROC of each Algorithm
import matplotlib.pyplot as plt
from sklearn import metrics

#RandomForest
# NOTE(review): roc_curve(y_true, y_score) — here predicted labels are passed
# as y_true and probabilities as the score; confirm this argument order is
# what is intended (usually y_val would be the first argument).
fpr1, tpr1, threshold1 = metrics.roc_curve(y_val_pred_acc1, y_val_pred1)
roc_auc1 = metrics.auc(fpr1, tpr1)
plt.title('ROC of RandomForest')
plt.plot(fpr1, tpr1, 'b', label='AUC = %0.2f' % roc_auc1)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
def main(Testbatch, DocumentName):
    """Train an AdaBoost text classifier on all batches except *Testbatch*
    and write per-mail predictions plus timing information to CSV files.

    Heavy file I/O: expects Index.csv, Traininglist<Dataset>,
    wordselection-<Dataset><DocumentName> and the per-batch
    Frequencies<Dataset><batch>-<N> files on disk.
    """
    start_time = time.time()
    Indexlist = pd.read_csv('Index.csv', sep=";", header=None)
    Indexlist.columns = ['Index', 'document']
    Dataset = 'ENRON'  #'TREC'
    catR = 'Fraud'     #'spam'
    catNR = 'Legit'    #'ham'
    #%%
    NrMails = 1640  #75000/5
    SavePer = 40    #100
    Traininglist = pd.read_csv('Traininglist'+str(Dataset), sep='\t', index_col=0)
    # Vocabulary used as the feature columns.
    wordselection = pd.read_csv('wordselection-'+str(Dataset)+str(DocumentName),
                                sep='\t', index_col=0, names=['Words'])
    #%%
    # =========================================================================
    # start of training (AdaBoost, despite the LOGISTIC REGRESSION banner in
    # the original notes)
    # =========================================================================
    Training = pd.DataFrame(0, columns=wordselection, index=[], dtype='uint32')
    y = pd.DataFrame(0, columns=[], index=[], dtype='uint32')
    Batches = list(Traininglist.columns.values)
    Batches.remove(Testbatch)  # hold out the test batch
    for batch in Batches:
        print(batch)
        # Frequency files are saved in chunks of SavePer mails.
        for Files in range(int(SavePer), int(NrMails + SavePer), int(SavePer)):
            TrainingFile = pd.DataFrame()
            TrainingFile = pd.read_csv('Frequencies'+str(Dataset)+batch+'-'+str(Files),
                                       sep='\t', index_col=0)
            y = pd.concat([y, TrainingFile['Index_given']], sort=False, ignore_index=True)
            Training = pd.concat([Training, TrainingFile], sort=False, ignore_index=True)
            Training = Training[wordselection]
            Training = Training.fillna(0).to_sparse(fill_value=0)
            print(round(Files / float(NrMails) * 100, 4), '%')
    del TrainingFile
    Training.to_csv('Training'+str(Dataset)+str(DocumentName), sep='\t')
    train = AdaBoostClassifier(n_estimators=100, random_state=0).fit(Training, y)
    #%%
    # =========================================================================
    # Applying the trained classifier to test data
    # =========================================================================
    start_time2 = time.time()
    TrainingWords = list(Training.columns.values)
    Test = pd.DataFrame(0, columns=TrainingWords, index=[], dtype='uint32')
    Traininglist = pd.read_csv('Traininglist'+str(Dataset), sep='\t', index_col=0)
    Batches = list(Traininglist.columns.values)
    Batches.remove(Testbatch)
    ProbSpam = list()
    ProbHam = list()
    Given_y = list()
    Predicted_y = list()
    # NOTE(review): this loop reads files for `batch` — the last TRAINING
    # batch left over from the loop above — not for Testbatch. That looks
    # unintended for a held-out evaluation; confirm before relying on the
    # reported results.
    for Files in range(int(SavePer), int(NrMails + SavePer), int(SavePer)):
        Test = pd.DataFrame(0, columns=TrainingWords, index=[], dtype='uint32')
        TestFile = pd.read_csv('Frequencies'+str(Dataset)+batch+'-'+str(Files),
                               sep='\t', index_col=0)
        y = TestFile['Index_given']
        del TestFile['Index_given']
        # Align the test features with the training vocabulary.
        Test = Test.merge(TestFile, how='outer')
        for word in list(set(Test.columns.values) - set(TrainingWords)):
            del Test[word]
        TestNew = Test.fillna(0).to_sparse(fill_value=0)
        pred = train.predict(TestNew)
        proba = train.predict_proba(TestNew)
        for i in range(0, len(Test)):
            ProbSpam.append(proba[i][0])
            ProbHam.append(proba[i][1])
            Given_y.append(y[i])
            Predicted_y.append(pred[i])
        print(round(Files / float(NrMails) * 100, 4), '%')
    Result = pd.DataFrame(0,
                          columns=["Given_Label", "Predicted_Label", "ProbSpam",
                                   "ProbHam", "expSpam", "expHam"],
                          index=[], dtype='uint32')
    Result["Given_Label"] = Given_y
    Result["Predicted_Label"] = Predicted_y
    Result["ProbSpam"] = ProbSpam
    Result["ProbHam"] = ProbHam
    Result["expSpam"] = 0
    Result["expHam"] = 0
    Result.to_csv('Result'+str(Dataset)+'-'+str(DocumentName), sep='\t')
    Timings = pd.DataFrame(columns=['Description', 'time'])
    Timings = Timings.append(pd.Series({'Description': 'Script',
                                        'time': time.time() - start_time}),
                             ignore_index=True)
    Timings = Timings.append(pd.Series({'Description': 'Training',
                                        'time': start_time2 - start_time}),
                             ignore_index=True)
    Timings = Timings.append(pd.Series({'Description': 'Classification',
                                        'time': time.time() - start_time2}),
                             ignore_index=True)
    # BUGFIX: the original read 'Timings'+str(Dataset)'-'+str(DocumentName),
    # which is a SyntaxError (missing '+' before '-').
    Timings.to_csv('Timings'+str(Dataset)+'-'+str(DocumentName), sep='\t')
# (fragment, notebook export) train/test ROC-AUC comparison for several
# models on the same feature subset `training_vars`.
pred = rf_model.predict_proba(X_train[training_vars])
print('RF train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred = rf_model.predict_proba(X_test[training_vars])
print('RF test roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

# #### Adaboost
# In[283]:
ada_model = AdaBoostClassifier()
ada_model.fit(X_train[training_vars], y_train)
pred = ada_model.predict_proba(X_train[training_vars])
print('Adaboost train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred = ada_model.predict_proba(X_test[training_vars])
print('Adaboost test roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

# #### Logistic Regression
# In[4]:
# Logistic regression is fed the scaled features (scaler fitted elsewhere).
logit_model = LogisticRegression()
logit_model.fit(scaler.transform(X_train[training_vars]), y_train)
pred = logit_model.predict_proba(scaler.transform(X_train[training_vars]))
print('Logit train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
def adaBoostClassifier(xTrain, yTrain, xTest):
    """Fit a default AdaBoost ensemble on (xTrain, yTrain) and score xTest.

    Returns a tuple of (predicted class labels, class-probability matrix)
    for the test samples.
    """
    model = AdaBoostClassifier()
    model.fit(xTrain, yTrain)
    return model.predict(xTest), model.predict_proba(xTest)
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_moons, make_circles, make_classification
# Load the training data
#X, y = make_circles(noise=0.2, factor=0.5, random_state=1)
X, y = make_moons(noise=0.1, random_state=1)
# Define the AdaBoost classifier
adb = AdaBoostClassifier()
# Training
adb.fit(X, y)
# Plotting libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
# Set the plot style
mpl.style.use('fivethirtyeight')
# Build an x/y grid (padded 0.5 beyond the data range) for the contour plot
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
# Predicted probability of the positive class at every grid point
Z = adb.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=.8)
# Scatter the training points on top, colored by label
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
plt.title("AdaBoost")
plt.axis("equal")
plt.show()
##tfidf = feature_extraction.text.TfidfTransformer() ##train_data = tfidf.fit_transform(train_data).toarray() ##test_data = tfidf.transform(test_data).toarray() print 'Training...' forest = GradientBoostingClassifier(n_estimators=200, verbose=1, learning_rate = 0.2, max_depth=3) forest2 = RandomForestClassifier(n_estimators = 400, verbose = 1, max_features = 13) learner = AdaBoostClassifier(base_estimator = forest2, n_estimators = 50) forest = forest.fit(train_data, y) learner = learner.fit(train_data, y) print 'Predicting...' output1 = forest.predict_proba(test_data) output2 = learner.predict_proba(test_data) output = [] for t, row in enumerate(output1): tmp = np.vstack([output1[t], output2[t]]) tmp = np.average(tmp, axis = 0) output.append(tmp) output = np.array(output) predictions_file = open("submission.csv", "wb") open_file_object = csv.writer(predictions_file) open_file_object.writerow(['Id', 'Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9']) for t, row in enumerate(Ids):
# Gradient Boosting classifier gbc = GradientBoostingClassifier(n_estimators = 200, learning_rate = 0.05, random_state=12) gbc.fit(x_train,y_train) # **Best solo performer from above is AdaBoost with AUROC of ~0.8612. A close second was the MLP, with AUROC of ~0.8574 # on the validation set. predictions_LR_train = logreg.predict_proba(x_2)[:,1] predictions_DT_train = dt.predict_proba(x_2)[:,1] predictions_NN_train = nn.predict_proba(x_2)[:,1] predictions_GBC_train = gbc.predict_proba(x_2)[:,1] predictions_KNN_train = knn.predict_proba(x_2)[:,1] predictions_RF_train = rf.predict_proba(x_2)[:,1] predictions_AB_train = ab.predict_proba(x_2)[:,1] predictions_GNB_train = gnb.predict_proba(x_2)[:,1] # Reshape to get the arrays to work predictions_LR_train = predictions_LR_train.reshape(-1, 1) predictions_DT_train = predictions_DT_train.reshape(-1, 1) predictions_NN_train = predictions_NN_train.reshape(-1, 1) predictions_GBC_train = predictions_GBC_train.reshape(-1, 1) predictions_KNN_train = predictions_KNN_train.reshape(-1, 1) predictions_RF_train = predictions_RF_train.reshape(-1, 1) predictions_AB_train = predictions_AB_train.reshape(-1, 1) predictions_GNB_train = predictions_GNB_train.reshape(-1, 1) # What to train the meta model on next_x_train = np.concatenate((predictions_LR_train,predictions_DT_train, predictions_NN_train, predictions_KNN_train, predictions_RF_train, predictions_AB_train, predictions_GNB_train,
# (fragment) write AdaBoost predictions to CSV, print feature importances,
# then compute accuracy metrics of the classifier on known labels.
with open('resultAda.csv', 'w') as csvFile:
    writer = csv.writer(csvFile, delimiter=' ')
    writer.writerows(predict)
    dfAda = predict
csvFile.close()  # redundant: the with-block already closed the file

# Distribution of the predicted classes.
uniciAda, counteggioAda = np.unique(dfAda, return_counts=True)
print(uniciAda, counteggioAda)
print("\nRilevanza attributi Ada")
for nameAda, scoreAda in zip(COLUMNS, classificatore.feature_importances_):
    print(nameAda, scoreAda)

# Test the classifier on examples whose classes (and feature arrays) are
# already known, to gauge its precision.
# NOTE(review): predict_proba is computed here but never used below.
predict_proba = classificatore.predict_proba(dataframe_training)
predict = np.array(predict)
classi_target = list(np.array(classi_target))
cnf_matrix = confusion_matrix(classi_target, predict)
print(' - Confusion Matrix -')
print(cnf_matrix)
print(' - Accuracy Score -', accuracy_score(classi_target, predict))
print(' - Report -'),
print(classification_report(classi_target, predict))

# Per-class FP / FN / TP / TN derived from the confusion matrix.
FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)
FP = FP.astype(float)
# (fragment) evaluate the AdaBoost member of a stacked ensemble, then build
# level-one probability features and fit a logistic-regression meta-model.
print ("\nAdaBoost for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf4_adabst_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]))
print ("\nAdaBoost for Ensemble - Train accuracy",round(accuracy_score(y_train,clf4_adabst_fit.predict(x_train)),3))
print ("\nAdaBoost for Ensemble - Train Classification Report\n",classification_report(y_train,clf4_adabst_fit.predict(x_train)))
print ("\n\nAdaBoost for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf4_adabst_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"]))
print ("\nAdaBoost for Ensemble - Test accuracy",round(accuracy_score(y_test,clf4_adabst_fit.predict(x_test)),3))
print ("\nAdaBoost for Ensemble - Test Classification Report\n",classification_report(y_test,clf4_adabst_fit.predict(x_test)))

# Level-one training features: column 1 of predict_proba is the
# positive-class probability from each base model.
ensemble = pd.DataFrame()
ensemble["log_output_one"] = pd.DataFrame(clf1_logreg_fit.predict_proba(x_train))[1]
ensemble["dtr_output_one"] = pd.DataFrame(clf2_dt_fit.predict_proba(x_train))[1]
ensemble["rf_output_one"] = pd.DataFrame(clf3_rf_fit.predict_proba(x_train))[1]
ensemble["adb_output_one"] = pd.DataFrame(clf4_adabst_fit.predict_proba(x_train))[1]
ensemble = pd.concat([ensemble,pd.DataFrame(y_train).reset_index(drop = True )],axis=1)

# Fitting meta-classifier
meta_logit_fit = LogisticRegression(fit_intercept=False)
meta_logit_fit.fit(ensemble[['log_output_one','dtr_output_one','rf_output_one','adb_output_one']],ensemble['Attrition_ind'])
coefs = meta_logit_fit.coef_
print ("Co-efficients for LR, DT, RF & AB are:",coefs)

# Same level-one features computed for the test split.
ensemble_test = pd.DataFrame()
ensemble_test["log_output_one"] = pd.DataFrame(clf1_logreg_fit.predict_proba(x_test))[1]
ensemble_test["dtr_output_one"] = pd.DataFrame(clf2_dt_fit.predict_proba(x_test))[1]
ensemble_test["rf_output_one"] = pd.DataFrame(clf3_rf_fit.predict_proba(x_test))[1]
ensemble_test["adb_output_one"] = pd.DataFrame(clf4_adabst_fit.predict_proba(x_test))[1]
# (fragment) generate submission files for tuned KNN, AdaBoost, and (below)
# GradientBoosting models.
test_y_knn = knn_opt.predict_proba(test_x)
# NOTE(review): predict_proba returns an (n_samples, 2) matrix; assigning it
# to a single 'target' column looks wrong — probably [:, 1] was intended for
# each model here. Confirm against the expected submission format.
# NOTE(review): knn_out and ada_out both alias the same `submission`
# DataFrame, so each assignment mutates the shared object.
knn_out = submission
knn_out['target'] = test_y_knn
knn_out['target'] = 1 - knn_out['target']
knn_out.to_csv('knn_predictions1.csv', index=False, float_format='%.4f')

ada_opt = AdaBoostClassifier(algorithm='SAMME.R',
                             base_estimator=None,
                             learning_rate=1.0,
                             n_estimators=200,
                             random_state=None)
ada_opt.fit(train_x, train_y)
test_y_ada = ada_opt.predict_proba(test_x)
ada_out = submission
ada_out['target'] = test_y_ada
ada_out['target'] = 1 - ada_out['target']
ada_out.to_csv('ada_predictions1.csv', index=False, float_format='%.4f')

# (fragment) this GradientBoosting construction is truncated and continues
# past this excerpt.
gb_opt = GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                    learning_rate=0.1, loss='deviance',
                                    max_depth=3, max_features=None,
                                    max_leaf_nodes=None, min_impurity_split=None,
def buildModel(X, y):
    """(Python 2) Fit five classifiers on a scaled 70/30 split of (X, y),
    print MCC, confusion matrices and imbalanced classification reports,
    then draw individual and combined ROC plots to Plots/*.png.
    """
    # X = np.reshape(X,(X.shape[0],X.shape[1] * X.shape[2]))
    print X.shape, y.shape
    # Standardize features before splitting.
    # NOTE(review): the scaler is fitted on ALL of X, which leaks test-set
    # statistics into training — confirm this is acceptable here.
    scaler = StandardScaler()
    print(scaler.fit(X))
    scaled_train_x = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(scaled_train_x, y, random_state=19, test_size=0.3)

    # Five models sharing the same split; several use class balancing.
    bag = BalancedBaggingClassifier(n_estimators=200, random_state=19)
    svm = SVC(class_weight='balanced', random_state=19, decision_function_shape='ovr')
    neural = MLPClassifier(max_iter=500, random_state=19, solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(49, 8, 4))
    ada = AdaBoostClassifier(n_estimators=100, random_state=19)
    logistic = LogisticRegression(solver='lbfgs', max_iter=500)

    bag.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    neural.fit(X_train, y_train)
    ada.fit(X_train, y_train)
    logistic.fit(X_train, y_train)
    # joblib.dump(bag,'bag.pkl')
    # joblib.dump(scaler,'scaler.pkl')

    y_pred = bag.predict(X_test)
    y_pred2 = svm.predict(X_test)
    y_pred3 = neural.predict(X_test)
    y_pred4 = ada.predict(X_test)
    y_pred5 = logistic.predict(X_test)
    # Matthews correlation coefficient per model (robust to class imbalance).
    print matthews_corrcoef(y_test, y_pred)
    print matthews_corrcoef(y_test, y_pred2)
    print matthews_corrcoef(y_test, y_pred3)
    print matthews_corrcoef(y_test, y_pred4)
    print matthews_corrcoef(y_test, y_pred5)
    print confusion_matrix(y_test, y_pred)
    print confusion_matrix(y_test, y_pred2)
    print confusion_matrix(y_test, y_pred3)
    print confusion_matrix(y_test, y_pred4)
    print confusion_matrix(y_test, y_pred5)
    print(classification_report_imbalanced(y_test, y_pred))
    print(classification_report_imbalanced(y_test, y_pred2))
    print(classification_report_imbalanced(y_test, y_pred3))
    print(classification_report_imbalanced(y_test, y_pred4))
    print(classification_report_imbalanced(y_test, y_pred5))

    # Scores for the ROC plots: probabilities where available, decision
    # margin for the SVC (constructed without probability=True).
    probs_ada = ada.predict_proba(X_test)
    probs_bag = bag.predict_proba(X_test)
    probs_neural = neural.predict_proba(X_test)
    probs_logistic = logistic.predict_proba(X_test)
    probs_svm = svm.decision_function(X_test)
    ROCplot(probs_ada, y_test, "Plots/ROCplotADA-organelle.png")
    ROCplot(probs_logistic, y_test, "Plots/ROCplotLogistic-organelle.png")
    ROCplot(probs_bag, y_test, "Plots/ROCplotBAG-organelle.png")
    ROCplot(probs_neural, y_test, "Plots/ROCplotNeural-organelle.png")
    ROCplot(probs_svm, y_test, "Plots/ROCplotSVM-organelle.png")
    multiROCplot(
        [probs_ada, probs_logistic, probs_bag, probs_neural, probs_svm],
        y_test, "Plots/multiROCplot.png",
        ['AdaBoost', 'Logistic', 'Bagging Classifier', 'MLP', 'SVM'])
#dataset is imbalanced, we'll be using the ROC AUC score as a metric instead of accuracy. # Import DecisionTreeClassifier from sklearn.tree import DecisionTreeClassifier # Import AdaBoostClassifier from sklearn.ensemble import AdaBoostClassifier # Instantiate dt dt = DecisionTreeClassifier(max_depth=2, random_state=1) # Instantiate ada ada = AdaBoostClassifier(base_estimator=dt, n_estimators=180, random_state=1) # Fit ada to the training set ada.fit(X_train, y_train) # Compute the probabilities of obtaining the positive class y_pred_proba = ada.predict_proba(X_test)[:,1] # Import roc_auc_score from sklearn.metrics import roc_auc_score # Evaluate test-set roc_auc_score ada_roc_auc = roc_auc_score(y_test, y_pred_proba) # Print roc_auc_score print('ROC AUC score: {:.2f}'.format(ada_roc_auc))
# --- (fragment, notebook export) fit three classifiers, write a Kaggle-style
# submission for each, and print the 3-fold cross-validated ROC AUC. ---
Clf.fit(X_train, y_train)
pred = Clf.predict_proba(X_test)[:,1]
pd.DataFrame({"id": original_test["id"], "target": pred}).to_csv("RandomForest_submission.csv", index=False)
# BUGFIX: score the random-forest model (Clf) here — the original referenced
# adaClf, which is only defined further below (NameError on first run) and
# belongs to the AdaBoost section anyway.
score = cross_validate(Clf, X_train, y_train, cv=3, scoring="roc_auc")["test_score"].mean()
print(f"{score:.6f}")

"""## AdaBoost Classifier"""

adaClf = AdaBoostClassifier()
adaClf.fit(X_train, y_train)
pred = adaClf.predict_proba(X_test)[:,1]
pd.DataFrame({"id": original_test["id"], "target": pred}).to_csv("adaboost_submission.csv", index=False)
score = cross_validate(adaClf, X_train, y_train, cv=3, scoring="roc_auc")["test_score"].mean()
print(f"{score:.6f}")

"""## GaussianProcessClassifier"""

clf = GaussianProcessClassifier(1**2 * RBF(length_scale=0.8))
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)[:,1]
pd.DataFrame({"id": original_test["id"], "target": pred}).to_csv("GaussianProcess_submission.csv", index=False)