예제 #1
0
def _tune_and_refit(estimator_cls, param_grid, x_train, y_train):
    """Grid-search ``estimator_cls`` and return a model refit with the best params.

    The estimator is wrapped in a one-step pipeline named ``classifier`` and
    searched with a 5-fold stratified, shuffled CV (``random_state=42``)
    scored by F1. Every ``classifier__*`` entry of the winning parameter set
    is then passed to a fresh ``estimator_cls`` instance, which is fitted on
    the full training data and returned.
    """
    pipeline = Pipeline([('classifier', estimator_cls())])
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    search = GridSearchCV(pipeline,
                          param_grid=param_grid,
                          cv=folds,
                          verbose=True,
                          scoring='f1',
                          n_jobs=-1)
    search.fit(x_train, y_train)

    # Forward *all* tuned parameters (not a hand-picked subset) so the final
    # model is configured exactly like the winning candidate.
    prefix = 'classifier__'
    best_kwargs = {
        name[len(prefix):]: value
        for name, value in search.best_params_.items()
        if name.startswith(prefix)
    }
    model = estimator_cls(**best_kwargs)
    model.fit(x_train, y_train)
    return model


def _report_model(tag, y_test, y_pred, start_time, report_header):
    """Print the evaluation report for one model and return its scores.

    Each metric is computed once; the returned dict is keyed by the column
    names of the final report card and includes ``Time_Model``, the minutes
    elapsed since ``start_time``.
    """
    # NOTE(review): average_precision_score receives hard class predictions
    # here rather than probabilities/decision scores - confirm this is intended.
    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'Avg Precision Score': average_precision_score(y_test, y_pred),
        'F0.5-Measure': fbeta_score(y_test, y_pred, beta=0.5),
    }
    print('%s Accuracy Score: ' % tag, scores['Accuracy'])
    print('%s Precision Score: ' % tag, scores['Precision'])
    print('%s Recall Score: ' % tag, scores['Recall'])
    print('%s F1 Score: ' % tag, scores['F1'])
    print('%s AP Score: ' % tag, scores['Avg Precision Score'])
    print('%s F0.5-Measure: ' % tag, scores['F0.5-Measure'])
    print('%s Confusion Matrix: \n' % tag, confusion_matrix(y_test, y_pred))
    print(report_header, classification_report(y_test, y_pred))

    elapsed_minutes = (time.time() - start_time) / 60
    print("------Time taken for %s: %s minutes" % (tag, elapsed_minutes))
    print(elapsed_minutes)
    scores['Time_Model'] = elapsed_minutes
    return scores


def classify_hdd_failure(x_train, x_test, y_train, y_test):
    """Tune, train and compare four classifiers, returning a score table.

    Logistic Regression, Random Forest, Gradient Boosting and XGBoost are each
    grid-searched on the training data, refit with their best parameters,
    evaluated on the test data and reported to stdout.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels. The metrics use scikit-learn's default
            averaging, so binary labels are presumably expected.
        y_test: Test labels.

    Returns:
        pandas.DataFrame indexed by ``LR``, ``RF``, ``GBC``, ``XGB`` with the
        columns ``Accuracy``, ``Precision``, ``Recall``, ``F1``,
        ``Avg Precision Score``, ``F0.5-Measure`` and ``Time_Model`` (minutes
        spent searching, fitting and scoring that model).
    """
    # Imported up front so a missing xgboost install fails immediately rather
    # than after the three preceding grid searches have already run.
    from xgboost import XGBClassifier

    # 'lbfgs' and 'sag' only support the l2 penalty; pairing them with l1
    # makes those candidate fits fail, so the grid is split into valid
    # penalty/solver combinations.
    param_grid_lr = [
        {
            'classifier__penalty': ['l2'],
            'classifier__solver': ['lbfgs', 'liblinear', 'sag', 'saga'],
            'classifier__C': [0.001, 0.01, 0.1, 10],
            'classifier__max_iter': [1000],
        },
        {
            'classifier__penalty': ['l1'],
            'classifier__solver': ['liblinear', 'saga'],
            'classifier__C': [0.001, 0.01, 0.1, 10],
            'classifier__max_iter': [1000],
        },
    ]
    param_grid_rf = [{
        'classifier__n_estimators': list(range(5, 20, 2)),
        'classifier__max_depth': list(range(8, 22, 2)),
    }]
    param_grid_gbc = [{
        'classifier__learning_rate': [0.1, 1.0, 2.0],
        'classifier__n_estimators': [75, 100, 125],
        'classifier__max_depth': list(range(3, 5, 1)),
    }]
    # XGB [eXtreme Gradient Boosting]
    # sources: https://cran.r-project.org/web/packages/xgboost/vignettes/xgboost.pdf
    # https://machinelearningmastery.com/gentle-introduction-xgboost-applied-machine-learning/
    # NOTE(review): range(0, 1, 1) evaluates to [0], i.e. the only depth
    # searched is 0 - confirm this is the intended max_depth grid.
    param_grid_xgb = [{
        'classifier__booster': ['gbtree', 'dart'],
        'classifier__n_estimators': [20, 40, 50],
        'classifier__max_depth': list(range(0, 1, 1)),
        'classifier__learning_rate': [0.001, 0.01, 0.05],
    }]

    # (row label, banner title, estimator class, grid, classification-report
    # header). The headers are spelled out because the RF one historically
    # carries a trailing space and the printed output is kept unchanged.
    model_specs = [
        ('LR', 'Logistic Regression', LogisticRegression, param_grid_lr,
         'LR Classification Report: \n'),
        ('RF', 'Random Forest', RandomForestClassifier, param_grid_rf,
         'RF Classification Report: \n '),
        ('GBC', 'Gradient Boosting', GradientBoostingClassifier,
         param_grid_gbc, 'GBC Classification Report: \n'),
        ('XGB', 'eXtreme Gradient Boosting', XGBClassifier, param_grid_xgb,
         'XGB Classification Report: \n'),
    ]

    labels = []
    rows = []
    for tag, title, estimator_cls, param_grid, report_header in model_specs:
        start_time = time.time()
        print("-------------Running %s________________" % title)
        best_model = _tune_and_refit(estimator_cls, param_grid,
                                     x_train, y_train)
        # Always score the explicitly refit best model so that all four
        # classifiers are evaluated the same way.
        predictions = best_model.predict(x_test)
        rows.append(_report_model(tag, y_test, predictions, start_time,
                                  report_header))
        labels.append(tag)

    # Create a DataFrame with the results, one row per model.
    report_card_df = pd.DataFrame(rows,
                                  index=labels,
                                  columns=[
                                      'Accuracy', 'Precision', 'Recall', 'F1',
                                      'Avg Precision Score', 'F0.5-Measure',
                                      'Time_Model'
                                  ])
    print(report_card_df)
    return report_card_df
예제 #2
0
# Encoding categorical data
# Columns 1 and 2 of X are replaced in place by integer codes; column 1 is then
# one-hot encoded and the first column of the result is dropped (presumably to
# avoid the dummy-variable trap - confirm).
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
# NOTE(review): the `categorical_features` argument is not accepted by recent
# scikit-learn releases (ColumnTransformer is the replacement) - confirm the
# scikit-learn version this script is expected to run against.
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set (80% / 20%)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation (10 folds on the training set)
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
# Bare expressions: these only display a value in an interactive session.
accuracies.mean()
accuracies.std()
예제 #3
0
# Split the target off the feature frames; pop() also removes 'wage_class'
# from final_train / final_test.
y_train = final_train.pop('wage_class')
y_test = final_test.pop('wage_class')
import xgboost as xgb
# GridSearchCV is imported from sklearn.model_selection; the old
# sklearn.grid_search module no longer exists in current scikit-learn.
from sklearn.model_selection import GridSearchCV

# Stage 1: search tree depth / child weight with the other settings fixed.
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 5, n_jobs = -1) 
# Optimize for accuracy since that is the metric used in the Adult Data Set notation
optimized_GBM.fit(final_train, y_train)
# Notebook echo of the fitted search object, kept for reference only (it is
# output, not code to execute):
# GridSearchCV(cv=5, error_score='raise',
#        estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
#        gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
#        min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
#        objective='binary:logistic', reg_alpha=0, reg_lambda=1,
#        scale_pos_weight=1, seed=0, silent=True, subsample=0.8),
#        fit_params={}, iid=True, n_jobs=-1,
#        param_grid={'min_child_weight': [1, 3, 5], 'max_depth': [3, 5, 7]},
#        pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)
# Per-candidate cross-validation results (replaces the removed grid_scores_).
optimized_GBM.cv_results_

# Stage 2: with max_depth=3 and min_child_weight=1 fixed, search the learning
# rate and row subsampling ratio.
cv_params = {'learning_rate': [0.1, 0.01], 'subsample': [0.7,0.8,0.9]}
ind_params = {'n_estimators': 1000, 'seed':0, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic', 'max_depth': 3, 'min_child_weight': 1}


optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 5, n_jobs = -1)
optimized_GBM.fit(final_train, y_train)
예제 #4
0
# 10-fold cross-validation of the existing `classifier` on the training set.
acc = cross_val_score(classifier,x_train,y_train,cv=10)
# Bare expressions: these only display a value in an interactive session.
acc.mean()
acc.std()
# -----------------------------GRID SEARCH---------------------------------------
from sklearn.model_selection import GridSearchCV

# Search the number of trees for a bootstrap-enabled random forest (10-fold CV).
param_grid = {'bootstrap':[True],'n_estimators':[10,20,50,100]}
classifier_grid = RandomForestClassifier()
gr = GridSearchCV(classifier_grid,param_grid,cv=10,n_jobs=-1)
gr.fit(x_train,y_train)
gr.best_params_
gr.best_estimator_
# -----------------------------XGBOOST-------------------------------------------
from xgboost.sklearn import XGBClassifier

classifier1 = XGBClassifier()
classifier1.fit(x_train,y_train)

y_pred = classifier1.predict(x_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)

# Accuracy from the 2x2 confusion matrix: correct predictions / all predictions.
accuracy_xgb = (cm[0,0]+cm[1,1])/(cm[0,0]+cm[1,1]+cm[0,1]+cm[1,0])
# NOTE(review): `accuracy` is not defined in this snippet; presumably it is
# computed earlier in the full script - confirm.
print(accuracy)
print(accuracy_xgb)
# -----------------------------------Hierarchical Clustering--------------------------------------------
import pandas as pd
data = pd.read_csv('Mall_Customers.csv')
# Keep only columns 3 and 4 as the clustering features.
x = data.iloc[:,[3,4]].values
예제 #5
0
    selector = SelectPercentile(f_classif, percentile=1)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(
        features_train_transformed).toarray()
    features_test_transformed = selector.transform(
        features_test_transformed).toarray()
    #print "no. of Sara training emails:", len(labels_train)-sum(labels_train)

    return features_train_transformed, features_test_transformed, labels_train, labels_test


# =============================================================================
#

# Load the already vectorised and feature-selected train/test split.
(features_train, features_test,
 labels_train, labels_test) = preprocess()

# Earlier GaussianNB baseline, disabled and kept for reference:
# clf = GaussianNB()
# clf.fit(features_train, labels_train)
# pred = clf.predict(features_test)
# accuracy = accuracy_score(labels_test, pred)
# print accuracy

# Train an XGBoost classifier with default settings.
model = XGBClassifier()
model.fit(features_train, labels_train)

# Predict on the held-out features and round each prediction to an integer.
y_pred = model.predict(features_test)
predictions = list(map(round, y_pred))

# Report test accuracy as a percentage.
accuracy = accuracy_score(labels_test, predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))