Example #1
0
def abclassifier(training_samples, eval_samples, do_grid_search=False):
    """Fit an AdaBoost ensemble of random forests and predict on eval data.

    Args:
        training_samples: ``(X_train, Y_train)`` pair used to fit the model.
        eval_samples: ``(X_eval, Y_eval)`` pair. Only ``X_eval`` is used; the
            second element is ignored.
        do_grid_search: When true, tune the random forest with
            ``GridSearchCV`` first, print the search report, and boost the
            best estimator found. When false (the default), boost the
            hand-configured forest and print 5-fold cross-validation scores.

    Returns:
        A ``(predictions, probabilities, feature_importances)`` tuple obtained
        from the fitted AdaBoost classifier for ``X_eval``.
    """
    X_train, Y_train = training_samples
    # The evaluation labels are never consulted; only the features are needed.
    X_eval, _ = eval_samples

    forest = RandomForestClassifier(
        n_estimators=2000, criterion='gini', max_depth=None,
        min_samples_split=8, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
        max_features=40, max_leaf_nodes=None, bootstrap=True, oob_score=False,
        n_jobs=10, random_state=None, verbose=0, warm_start=False,
        class_weight=None)

    if do_grid_search:
        to_be_tuned_parameters = {
            'n_estimators': [500, 1000, 2000],
            'max_features': ['log2', 'auto', None],
            'min_samples_split': [2, 4, 8],
            'min_samples_leaf': [1, 2],
        }
        # The search must be fitted and queried directly: once wrapped in
        # AdaBoost, best_params_ / grid_scores_ are no longer reachable.
        search = GridSearchCV(forest, to_be_tuned_parameters, cv=5, n_jobs=5,
                              scoring='log_loss')
        search.fit(X_train, Y_train)

        print("Best parameters set found on development set:")
        print("")
        print(search.best_params_)
        print("")
        print("Grid scores on development set:")
        print("")
        # NOTE(review): grid_scores_ / cross_validation / 'log_loss' are the
        # names this file already relies on; confirm against the installed
        # scikit-learn version before upgrading.
        for params, mean_score, scores in search.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean_score, scores.std() * 2, params))
        forest = search.best_estimator_

    # Best parameters set found on development set (earlier run):
    # {'max_features': 'log2', 'min_samples_split': 8, 'criterion': 'gini',
    #  'min_samples_leaf': 1}

    clf = AdaBoostClassifier(base_estimator=forest, n_estimators=200,
                             learning_rate=0.2, algorithm='SAMME.R',
                             random_state=None)
    print(clf)
    clf.fit(X_train, Y_train)

    if not do_grid_search:
        scores = cross_validation.cross_val_score(
            clf, X_train, Y_train, cv=5, n_jobs=5, scoring='log_loss')
        # Single pre-formatted argument so the output is the same whether
        # print is a statement (Python 2) or a function (Python 3).
        print("%s %s %s" % (scores, np.mean(scores), np.median(scores)))

    Y_pred = clf.predict(X_eval)
    Y_prob = clf.predict_proba(X_eval)
    # feature_importances_ is an array attribute, not a method.
    return Y_pred, Y_prob, clf.feature_importances_
Example #2
0
correlation_matrix(df)

# Split the data: the first nVar-1 columns are the features, the columns from
# index nVar-1 onward are the target; then hold out 20% of rows for testing.
X = df.iloc[:, :(nVar - 1)]
print(X)
Y = df.iloc[:, (nVar - 1):]
print(Y)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=24)
print(Y_train)


## Model 1 - RandomForest
clf1 = RandomForestClassifier()
clf1.fit(X_train, Y_train)

# Feature importance: feature_importances_ is an array attribute, not a
# method, so it is read (and shown) rather than called.
print(clf1.feature_importances_)

predict = clf1.predict(X_test)


# Cross-validation score of the same model over the full data set
# (10 folds, mean accuracy).
score1 = np.mean(cross_val_score(clf1, X, Y, scoring='accuracy', cv=10))
print(score1)

## Metrics - accuracy (arguments in the documented y_true, y_pred order)
print(accuracy_score(Y_test, predict))

# Kappa score
score3 = cohen_kappa_score(Y_test, predict)
print(score3)

# Recall score (macro-averaged over classes)
score2 = recall_score(Y_test, predict, average='macro')