# Report-card column name paired with the label used when printing that metric.
_REPORT_METRICS = (
    ('Accuracy', 'Accuracy Score'),
    ('Precision', 'Precision Score'),
    ('Recall', 'Recall Score'),
    ('F1', 'F1 Score'),
    ('Avg Precision Score', 'AP Score'),
    ('F0.5-Measure', 'F0.5-Measure'),
)


def _score_predictions(y_true, y_pred):
    """Return the report-card metrics for one set of predicted labels.

    The keys are the column names of the DataFrame returned by
    ``classify_hdd_failure`` (without ``Time_Model``).
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
        # NOTE(review): average precision is fed hard class predictions here
        # (unchanged from the original code), not probabilities/decision
        # scores -- confirm that this is the intended figure.
        'Avg Precision Score': average_precision_score(y_true, y_pred),
        'F0.5-Measure': fbeta_score(y_true, y_pred, beta=0.5),
    }


def _tune_and_evaluate(tag, title, estimator, param_grid,
                       x_train, x_test, y_train, y_test):
    """Grid-search one classifier, print its test-set reports, return its scores.

    Parameters
    ----------
    tag : str
        Short model name used as prefix of every printed line (e.g. ``'LR'``).
    title : str
        Long model name used in the "Running ..." banner.
    estimator
        Unfitted classifier placed in a one-step pipeline named ``classifier``.
    param_grid : list of dict
        Grid passed to ``GridSearchCV``; keys use the ``classifier__`` prefix.
    x_train, x_test, y_train, y_test
        Train/test features and labels.

    Returns
    -------
    dict
        The metrics of ``_score_predictions`` plus ``'Time_Model'``, the
        elapsed wall-clock time in minutes.
    """
    started = time.time()
    print("-------------Running %s________________" % title)

    pipeline = Pipeline([('classifier', estimator)])
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    search = GridSearchCV(pipeline, param_grid=param_grid, cv=folds,
                          verbose=True, scoring='f1', n_jobs=-1)
    search.fit(x_train, y_train)

    # GridSearchCV refits the best parameter combination on the whole training
    # set by default, so predicting through the search object uses exactly the
    # tuned model; no manual re-fit from best_params_ is needed.
    predictions = search.predict(x_test)

    scores = _score_predictions(y_test, predictions)
    for column, label in _REPORT_METRICS:
        print('%s %s: ' % (tag, label), scores[column])
    print('%s Confusion Matrix: \n' % tag,
          confusion_matrix(y_test, predictions))
    print('%s Classification Report: \n' % tag,
          classification_report(y_test, predictions))

    elapsed_minutes = (time.time() - started) / 60
    print("------Time taken for %s: %s minutes" % (tag, elapsed_minutes))
    scores['Time_Model'] = elapsed_minutes
    return scores


def classify_hdd_failure(x_train, x_test, y_train, y_test):
    """Tune, evaluate and compare four classifiers on the given split.

    Logistic Regression, Random Forest, Gradient Boosting and XGBoost are each
    tuned with a 5-fold stratified grid search optimising F1, then scored on
    the test set. Per-model reports are printed along the way.

    The grid search is scored with ``'f1'`` and the precision/recall/F-beta
    calls use their defaults, so this presumably expects binary labels --
    TODO confirm against the caller.

    Parameters
    ----------
    x_train, x_test
        Training and test feature matrices.
    y_train, y_test
        Training and test labels.

    Returns
    -------
    pandas.DataFrame
        One row per model (index ``['LR', 'RF', 'GBC', 'XGB']``) with columns
        ``Accuracy``, ``Precision``, ``Recall``, ``F1``,
        ``Avg Precision Score``, ``F0.5-Measure`` and ``Time_Model``
        (minutes spent on that model).
    """
    # Imported locally, as in the original code; done up front so a missing
    # xgboost installation fails before any expensive search has run.
    from xgboost import XGBClassifier

    # The LR grid is split by penalty: 'lbfgs' and 'sag' do not support the
    # 'l1' penalty, so only valid penalty/solver pairs are searched.
    lr_grid = [
        {
            'classifier__penalty': ['l2'],
            'classifier__solver': ['lbfgs', 'liblinear', 'sag', 'saga'],
            'classifier__C': [0.001, 0.01, 0.1, 10],
            'classifier__max_iter': [1000],
        },
        {
            'classifier__penalty': ['l1'],
            'classifier__solver': ['liblinear', 'saga'],
            'classifier__C': [0.001, 0.01, 0.1, 10],
            'classifier__max_iter': [1000],
        },
    ]
    rf_grid = [{
        'classifier__n_estimators': list(range(5, 20, 2)),
        'classifier__max_depth': list(range(8, 22, 2)),
    }]
    gbc_grid = [{
        'classifier__learning_rate': [0.1, 1.0, 2.0],
        'classifier__n_estimators': [75, 100, 125],
        'classifier__max_depth': list(range(3, 5, 1)),
    }]
    # XGB [eXtreme Gradient Boosting]
    # sources: https://cran.r-project.org/web/packages/xgboost/vignettes/xgboost.pdf
    # https://machinelearningmastery.com/gentle-introduction-xgboost-applied-machine-learning/
    xgb_grid = [{
        'classifier__booster': ['gbtree', 'dart'],
        'classifier__n_estimators': [20, 40, 50],
        # NOTE(review): range(0, 1, 1) yields only [0]; kept as in the
        # original, but confirm a real depth range was not intended.
        'classifier__max_depth': list(range(0, 1, 1)),
        'classifier__learning_rate': [0.001, 0.01, 0.05],
    }]

    models = [
        ('LR', 'Logistic Regression', LogisticRegression(), lr_grid),
        ('RF', 'Random Forest', RandomForestClassifier(), rf_grid),
        ('GBC', 'Gradient Boosting', GradientBoostingClassifier(), gbc_grid),
        ('XGB', 'eXtreme Gradient Boosting', XGBClassifier(), xgb_grid),
    ]

    rows = [
        _tune_and_evaluate(tag, title, estimator, grid,
                           x_train, x_test, y_train, y_test)
        for tag, title, estimator, grid in models
    ]

    columns = [column for column, _ in _REPORT_METRICS] + ['Time_Model']
    report_card_df = pd.DataFrame(
        rows, index=[tag for tag, _, _, _ in models], columns=columns)
    print(report_card_df)
    return report_card_df
# Encoding categorical data
# X and y are expected to be bound earlier in the script (not visible here);
# the column indexing below treats X as a 2-D array.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])

# OneHotEncoder(categorical_features=[1]) no longer exists in scikit-learn.
# ColumnTransformer one-hot encodes column 1, puts the dummy columns first and
# passes the remaining columns through to their right; sparse_threshold=0
# forces a dense result (the old code called .toarray()).
onehotencoder = ColumnTransformer(
    [('onehot', OneHotEncoder(), [1])],
    remainder='passthrough',
    sparse_threshold=0,
)
# NOTE(review): astype(float) assumes every remaining column is numeric at
# this point, as the legacy encoder also required -- confirm.
X = onehotencoder.fit_transform(X).astype(float)
# Drop the first dummy column.
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
# Bare expressions: only displayed when run interactively.
accuracies.mean()
accuracies.std()
# final_train / final_test are expected to be bound earlier in the script
# (not visible here); pop() removes the label column from the features.
y_train = final_train.pop('wage_class')
y_test = final_test.pop('wage_class')

import xgboost as xgb
# sklearn.grid_search was removed from scikit-learn; GridSearchCV now lives in
# sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

# First search: tree shape, with the other parameters held fixed.
cv_params = {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed': 0,
              'subsample': 0.8, 'colsample_bytree': 0.8,
              'objective': 'binary:logistic'}
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params),
                             cv_params,
                             scoring='accuracy', cv=5, n_jobs=-1)
# Optimize for accuracy since that is the metric used in the Adult Data Set notation
optimized_GBM.fit(final_train, y_train)

# The original text carried the pasted repr of the fitted search object at
# this point. It is output, not code, so it is preserved as a comment:
# GridSearchCV(cv=5, error_score='raise',
#        estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
#        gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
#        min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
#        objective='binary:logistic', reg_alpha=0, reg_lambda=1,
#        scale_pos_weight=1, seed=0, silent=True, subsample=0.8),
#        fit_params={}, iid=True, n_jobs=-1,
#        param_grid={'min_child_weight': [1, 3, 5], 'max_depth': [3, 5, 7]},
#        pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

# grid_scores_ was removed together with sklearn.grid_search; cv_results_ is
# its replacement. Bare expression: only displayed when run interactively.
optimized_GBM.cv_results_

# Second search: learning rate and subsample, with max_depth=3 and
# min_child_weight=1 now held fixed.
cv_params = {'learning_rate': [0.1, 0.01], 'subsample': [0.7, 0.8, 0.9]}
ind_params = {'n_estimators': 1000, 'seed': 0, 'colsample_bytree': 0.8,
              'objective': 'binary:logistic', 'max_depth': 3,
              'min_child_weight': 1}
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params),
                             cv_params,
                             scoring='accuracy', cv=5, n_jobs=-1)
optimized_GBM.fit(final_train, y_train)
# classifier, x_train, x_test, y_train, y_test, cross_val_score and
# RandomForestClassifier are expected to be bound earlier in the script
# (not visible here).
acc = cross_val_score(classifier, x_train, y_train, cv=10)
# Bare expressions: only displayed when run interactively.
acc.mean()
acc.std()

# -----------------------------GRID SEARCH---------------------------------------
from sklearn.model_selection import GridSearchCV
param_grid = {'bootstrap': [True], 'n_estimators': [10, 20, 50, 100]}
classifier_grid = RandomForestClassifier()
gr = GridSearchCV(classifier_grid, param_grid, cv=10, n_jobs=-1)
gr.fit(x_train, y_train)
gr.best_params_
gr.best_estimator_

# -----------------------------XGBOOST-------------------------------------------
from xgboost.sklearn import XGBClassifier
classifier1 = XGBClassifier()
classifier1.fit(x_train, y_train)
y_pred = classifier1.predict(x_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Accuracy = correctly classified (diagonal) / all samples. Same value as the
# explicit 2x2 formula, but also valid for more than two classes.
accuracy_xgb = cm.trace() / cm.sum()
# NOTE(review): `accuracy` is not assigned anywhere in the visible code;
# presumably it comes from an earlier model in the script -- confirm.
print(accuracy)
print(accuracy_xgb)

# -----------------------------------Hierarchical Clustering--------------------------------------------
import pandas as pd
data = pd.read_csv('Mall_Customers.csv')
# Keep only the columns at positions 3 and 4 as the feature matrix.
x = data.iloc[:, [3, 4]].values
selector = SelectPercentile(f_classif, percentile=1) selector.fit(features_train_transformed, labels_train) features_train_transformed = selector.transform( features_train_transformed).toarray() features_test_transformed = selector.transform( features_test_transformed).toarray() #print "no. of Sara training emails:", len(labels_train)-sum(labels_train) return features_train_transformed, features_test_transformed, labels_train, labels_test # ============================================================================= # features_train, features_test, labels_train, labels_test = preprocess() #clf = GaussianNB() #clf.fit(features_train, labels_train) #pred=clf.predict(features_test) #accuracy= accuracy_score(labels_test, pred) #print accuracy # # ============================================================================= model = XGBClassifier() model.fit(features_train, labels_train) # make predictions for test data y_pred = model.predict(features_test) predictions = [round(value) for value in y_pred] # evaluate predictions accuracy = accuracy_score(labels_test, predictions) print("Accuracy: %.2f%%" % (accuracy * 100.0))