def DLmodel_regressor(Xtrain_in, ytrain_in, Xtest_in, ytest_in, lime_flag=False, df_row=None):
    '''
    Parameters:
    Xtrain_in, ytrain_in, Xtest_in, ytest_in- Learning set
    lime_flag- enable or disable lime
    df_row- row used for the lime explanation
    '''
    start_time = time.time()
    # create instance
    estimator = KerasRegressor(build_fn=DLmodel_baseline, epochs=20, batch_size=5, verbose=10)
    seed = 23
    np.random.seed(seed)
    estimator.fit(Xtrain_in, ytrain_in)
    # Predict on train and test sets
    y_test_pred = estimator.predict(Xtest_in)
    y_train_pred = estimator.predict(Xtrain_in)
    # R2 and adjusted R2 scores
    score_test = r2_score(ytest_in, y_test_pred)
    score_train = r2_score(ytrain_in, y_train_pred)
    adj_Rscore_train = adjusted_R2score_calc(Xtrain_in, score_train)
    adj_Rscore_test = adjusted_R2score_calc(Xtest_in, score_test)
    # Mean squared errors
    mrs_train = mean_squared_error(ytrain_in, y_train_pred)
    mrs_test = mean_squared_error(ytest_in, y_test_pred)
    # understand the model through lime
    if lime_flag:
        lime_explainer(Xtrain_in, df_row, estimator, "Keras_base")
    time_end = time.time() - start_time
    # Scores
    log_record_result("Keras base model", time_end, score_train, score_test,
                      adj_Rscore_train, adj_Rscore_test, mrs_train, mrs_test)
    plot_residuals(Xtest_in, ytest_in, estimator, "Keras_base")  # plots residuals
    return ("Keras base model", str(time_end), str(score_train), str(score_test),
            str(adj_Rscore_train), str(adj_Rscore_test))
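
# Example usage (a minimal sketch, not part of the original module): assumes `df` is a
# pandas DataFrame with a numeric target column named 'target'; the column name and
# split settings are illustrative only.
#
#   from sklearn.model_selection import train_test_split
#   X_tr, X_te, y_tr, y_te = train_test_split(df.drop(columns=['target']), df['target'],
#                                              test_size=0.2, random_state=23)
#   DLmodel_regressor(X_tr, y_tr, X_te, y_te, lime_flag=False)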
def adaboost(X_train, y_train, X_test, y_test, lime_flag=False, base_estimator=None,
             n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):
    '''
    Parameters:
    X_train, y_train, X_test, y_test- Learning set
    lime_flag- enable or disable lime
    '''
    start_time = time.time()
    # create instance
    adaboost = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators,
                                  learning_rate=learning_rate, algorithm=algorithm,
                                  random_state=random_state)
    adaboost.fit(X_train, y_train)
    # Predict on test set
    y_pred = adaboost.predict(X_test)
    # understand the model through lime
    if lime_flag:
        lime_explainer(X_train, y_train, X_test, y_test, df_row=2,
                       model_predictor=adaboost, alogorithm_name="adaboost")
    time_end = time.time() - start_time
    # Scores
    model_evaluation(X_train, y_train, X_test, y_test, y_pred, adaboost, time_end,
                     alg_name='adaboost')
    # return model object
    return adaboost
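
# Example usage (a minimal sketch, not part of the original module): every sklearn wrapper
# in this module (adaboost, gb, rf, dt, gpc, sgd, logistic_regression, mlpc) follows the
# same call signature; `X_train`, `y_train`, `X_test`, `y_test` are assumed to come from a
# prior train/test split of a labelled dataset.
#
#   model = adaboost(X_train, y_train, X_test, y_test, lime_flag=False, n_estimators=100)
#   probs = model.predict_proba(X_test)   # the fitted AdaBoostClassifier is returned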
def gb(X_train, y_train, X_test, y_test, lime_flag=False, loss='deviance', learning_rate=0.1,
       n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2,
       min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
       min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0,
       max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1,
       n_iter_no_change=None, tol=0.0001):
    '''
    Parameters:
    X_train, y_train, X_test, y_test- Learning set
    lime_flag- enable or disable lime
    '''
    start_time = time.time()
    # create instance
    gb = GradientBoostingClassifier(loss=loss, learning_rate=learning_rate,
                                    n_estimators=n_estimators, subsample=subsample,
                                    criterion=criterion, min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    min_weight_fraction_leaf=min_weight_fraction_leaf,
                                    max_depth=max_depth,
                                    min_impurity_decrease=min_impurity_decrease,
                                    min_impurity_split=min_impurity_split, init=init,
                                    random_state=random_state, max_features=max_features,
                                    verbose=verbose, max_leaf_nodes=max_leaf_nodes,
                                    warm_start=warm_start, presort=presort,
                                    validation_fraction=validation_fraction,
                                    n_iter_no_change=n_iter_no_change, tol=tol)
    gb.fit(X_train, y_train)
    # Predict on test set
    y_pred = gb.predict(X_test)
    # understand the model through lime
    if lime_flag:
        lime_explainer(X_train, y_train, X_test, y_test, df_row=2,
                       model_predictor=gb, alogorithm_name="gb")
    time_end = time.time() - start_time
    # Scores
    model_evaluation(X_train, y_train, X_test, y_test, y_pred, gb, time_end, alg_name='gb')
    # return model object
    return gb
def rf(X_train, y_train, X_test, y_test, lime_flag=False, n_estimators=50, criterion='gini',
       max_depth=None, min_samples_split=40, min_samples_leaf=20, min_weight_fraction_leaf=0.0,
       max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
       min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=-1, random_state=42,
       verbose=0, warm_start=False, class_weight=None):
    '''
    Parameters:
    X_train, y_train, X_test, y_test- Learning set
    lime_flag- enable or disable lime
    '''
    start_time = time.time()
    # create instance
    rf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                max_depth=max_depth, min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,
                                max_features=max_features, max_leaf_nodes=max_leaf_nodes,
                                min_impurity_decrease=min_impurity_decrease,
                                min_impurity_split=min_impurity_split, bootstrap=bootstrap,
                                oob_score=oob_score, n_jobs=n_jobs, random_state=random_state,
                                verbose=verbose, warm_start=warm_start,
                                class_weight=class_weight)
    rf.fit(X_train, y_train)
    # Predict on test set
    y_pred = rf.predict(X_test)
    # understand the model through lime
    if lime_flag:
        lime_explainer(X_train, y_train, X_test, y_test, df_row=2,
                       model_predictor=rf, alogorithm_name="rf")
    time_end = time.time() - start_time
    # Scores
    model_evaluation(X_train, y_train, X_test, y_test, y_pred, rf, time_end, alg_name='rf')
    # return model object
    return rf
def dt(X_train, y_train, X_test, y_test, lime_flag=False, criterion='gini', splitter='best',
       max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
       max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
       min_impurity_split=None, class_weight=None, presort=False):
    '''
    Parameters:
    X_train, y_train, X_test, y_test- Learning set
    lime_flag- enable or disable lime
    '''
    start_time = time.time()
    # create instance
    dt = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth,
                                min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,
                                max_features=max_features, random_state=random_state,
                                max_leaf_nodes=max_leaf_nodes,
                                min_impurity_decrease=min_impurity_decrease,
                                min_impurity_split=min_impurity_split,
                                class_weight=class_weight, presort=presort)
    dt.fit(X_train, y_train)
    # Predict on test set
    y_pred = dt.predict(X_test)
    # Feature importances learned by the tree
    feat_imp = pd.DataFrame(dt.feature_importances_, index=X_train.columns,
                            columns=['Importance']).sort_values('Importance', ascending=False)
    print(feat_imp.loc[feat_imp['Importance'] > 0].shape[0],
          "features have more than zero importance")
    print(feat_imp.loc[feat_imp['Importance'] > 0])
    # understand the model through lime
    if lime_flag:
        lime_explainer(X_train, y_train, X_test, y_test, df_row=2,
                       model_predictor=dt, alogorithm_name="Decision Tree")
    time_end = time.time() - start_time
    # Scores
    model_evaluation(X_train, y_train, X_test, y_test, y_pred, dt, time_end,
                     alg_name='Decision Tree')
    # return model object
    return dt
def gpc(X_train, y_train, X_test, y_test, lime_flag=False, kernel=1.0 * RBF(1.0),
        optimizer='fmin_l_bfgs_b', n_restarts_optimizer=0, warm_start=False, random_state=42,
        n_jobs=-1, max_iter_predict=1000, copy_X_train=True):
    '''
    Parameters:
    X_train, y_train, X_test, y_test- Learning set
    lime_flag- enable or disable lime
    '''
    start_time = time.time()
    # create instance (use the kernel passed in rather than a hard-coded one)
    gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer,
                                    n_restarts_optimizer=n_restarts_optimizer,
                                    max_iter_predict=max_iter_predict, warm_start=warm_start,
                                    copy_X_train=copy_X_train, random_state=random_state,
                                    n_jobs=n_jobs)
    gpc.fit(X_train, y_train)
    # Predict on test set
    y_pred = gpc.predict(X_test)
    # understand the model through lime
    if lime_flag:
        lime_explainer(X_train, y_train, X_test, y_test, df_row=2,
                       model_predictor=gpc, alogorithm_name="gpc")
    time_end = time.time() - start_time
    # Scores
    model_evaluation(X_train, y_train, X_test, y_test, y_pred, gpc, time_end, alg_name='gpc')
    # return model object
    return gpc
def sgd(X_train, y_train, X_test, y_test, lime_flag=False, loss='log', penalty='l2',
        alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=None, shuffle=True,
        verbose=0, epsilon=0.1, n_jobs=-1, random_state=42, learning_rate='optimal', eta0=0.0,
        power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5,
        class_weight=None, warm_start=False, average=False, n_iter=None):
    '''
    Parameters:
    X_train, y_train, X_test, y_test- Learning set
    lime_flag- enable or disable lime
    '''
    start_time = time.time()
    # create instance
    sgd = SGDClassifier(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio,
                        fit_intercept=fit_intercept, max_iter=max_iter, tol=tol,
                        shuffle=shuffle, verbose=verbose, epsilon=epsilon, n_jobs=n_jobs,
                        random_state=random_state, learning_rate=learning_rate, eta0=eta0,
                        power_t=power_t, early_stopping=early_stopping,
                        validation_fraction=validation_fraction,
                        n_iter_no_change=n_iter_no_change, class_weight=class_weight,
                        warm_start=warm_start, average=average, n_iter=n_iter)
    sgd.fit(X_train, y_train)
    # Predict on test set
    y_pred = sgd.predict(X_test)
    # understand the model through lime
    if lime_flag:
        lime_explainer(X_train, y_train, X_test, y_test, df_row=2,
                       model_predictor=sgd, alogorithm_name="SGD")
    time_end = time.time() - start_time
    # Scores
    model_evaluation(X_train, y_train, X_test, y_test, y_pred, sgd, time_end, alg_name='SGD')
    # return model object
    return sgd
def DLmodel_regressor_gridsearch(Xtrain_in, ytrain_in, Xtest_in, ytest_in, lime_flag=False, df_row=None):
    '''
    Parameters:
    Xtrain_in, ytrain_in, Xtest_in, ytest_in- Learning set
    lime_flag- enable or disable lime
    df_row- row used for the lime explanation
    '''
    start_time = time.time()
    seed = 23
    np.random.seed(seed)
    # create instance
    estimator = KerasRegressor(build_fn=DLmodel_model, verbose=10)
    # hyper-parameter grid
    batch_size = [20, 50, 100]
    epochs = [50, 100, 200]
    param_grid = dict(batch_size=batch_size, epochs=epochs)  # , neurons=neurons)
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, n_jobs=-1)
    grid_result = grid.fit(Xtrain_in, ytrain_in)
    # Predict on train and test sets with the best estimator found by the grid search
    y_test_pred = grid_result.predict(Xtest_in)
    y_train_pred = grid_result.predict(Xtrain_in)
    # R2 and adjusted R2 scores
    score_test = r2_score(ytest_in, y_test_pred)
    score_train = r2_score(ytrain_in, y_train_pred)
    adj_Rscore_train = adjusted_R2score_calc(Xtrain_in, score_train)
    adj_Rscore_test = adjusted_R2score_calc(Xtest_in, score_test)
    best_parameters = grid.best_params_
    # Mean squared errors
    mrs_train = mean_squared_error(ytrain_in, y_train_pred)
    mrs_test = mean_squared_error(ytest_in, y_test_pred)
    # understand the model through lime
    if lime_flag:
        lime_explainer(Xtrain_in, df_row, grid, "Keras_grid")
    time_end = time.time() - start_time
    # Scores
    log_record_result("Keras base model gridsearch", time_end, score_train, score_test,
                      adj_Rscore_train, adj_Rscore_test, mrs_train=mrs_train,
                      mrs_test=mrs_test, best_param=best_parameters)
    plot_residuals(Xtest_in, ytest_in, grid, "Keras_grid")  # plots residuals
    return ("Keras base model gridsearch", str(time_end), str(score_train), str(score_test),
            str(adj_Rscore_train), str(adj_Rscore_test), str(best_parameters))
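
# Extending the grid (a hedged sketch, not part of the original module): the commented-out
# `neurons=neurons` above suggests the build function can also expose a layer-width
# hyper-parameter. Assuming DLmodel_model accepts a `neurons` keyword argument (an
# assumption; its definition lives elsewhere in this repo), the grid could be widened as:
#
#   neurons = [8, 16, 32]
#   param_grid = dict(batch_size=batch_size, epochs=epochs, neurons=neurons)
#   grid = GridSearchCV(estimator=estimator, param_grid=param_grid, n_jobs=-1)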
def logistic_regression(X_train, y_train, X_test, y_test, lime_flag=False, penalty='l2',
                        dual=True, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1,
                        class_weight=None, random_state=None, solver='liblinear', max_iter=100,
                        verbose=0, warm_start=False, n_jobs=-1):
    '''
    Parameters:
    X_train, y_train, X_test, y_test- Learning set
    lime_flag- enable or disable lime
    penalty, dual, tol, C, fit_intercept, intercept_scaling, class_weight, random_state,
    solver, max_iter, verbose, warm_start, n_jobs: parameters passed to sklearn's
    LogisticRegression
    '''
    start_time = time.time()
    # create instance
    log = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                             fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
                             class_weight=class_weight, random_state=random_state,
                             solver=solver, max_iter=max_iter, verbose=verbose,
                             warm_start=warm_start, n_jobs=n_jobs)
    # fit on train set
    log.fit(X_train, y_train)
    # Predict on test set
    y_pred = log.predict(X_test)
    # understand the model through lime
    if lime_flag:
        lime_explainer(X_train, y_train, X_test, y_test, df_row=2,
                       model_predictor=log, alogorithm_name="Logistic_Regression")
    time_end = time.time() - start_time
    # Scores
    model_evaluation(X_train, y_train, X_test, y_test, y_pred, log, time_end,
                     alg_name='Logistic_Regression')
    # return model object
    return log
def mlpc(X_train, y_train, X_test, y_test, lime_flag=False, hidden_layer_sizes=(100, ),
         activation='relu', solver='adam', alpha=0.0001, batch_size='auto',
         learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200,
         shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False,
         momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1,
         beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10):
    '''
    Parameters:
    X_train, y_train, X_test, y_test- Learning set
    lime_flag- enable or disable lime
    '''
    start_time = time.time()
    # create instance (pass through the learning-rate arguments instead of hard-coding them)
    mlpc = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation,
                         solver=solver, alpha=alpha, batch_size=batch_size,
                         learning_rate=learning_rate, learning_rate_init=learning_rate_init,
                         power_t=power_t, max_iter=max_iter, shuffle=shuffle,
                         random_state=random_state, tol=tol, verbose=verbose,
                         warm_start=warm_start, momentum=momentum,
                         nesterovs_momentum=nesterovs_momentum, early_stopping=early_stopping,
                         validation_fraction=validation_fraction, beta_1=beta_1, beta_2=beta_2,
                         epsilon=epsilon, n_iter_no_change=n_iter_no_change)
    mlpc.fit(X_train, y_train)
    # Predict on test set
    y_pred = mlpc.predict(X_test)
    # understand the model through lime
    if lime_flag:
        lime_explainer(X_train, y_train, X_test, y_test, df_row=2,
                       model_predictor=mlpc, alogorithm_name="mlpc")
    time_end = time.time() - start_time
    # Scores
    model_evaluation(X_train, y_train, X_test, y_test, y_pred, mlpc, time_end, alg_name='mlpc')
    # return model object
    return mlpc
def grid_search(X_train, y_train, X_test, y_test, lime_flag=True,
                tuned_parameters=[{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                                   'C': [1, 10, 100, 1000]},
                                  {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
                model=SVC(probability=True)):
    '''
    X_train, y_train, X_test, y_test: train and test set
    tuned_parameters: parameters to be tuned
    model: classifier for grid search (probability estimates must be available for the ROC curve)
    '''
    start_time = time.time()
    scores = ['precision', 'recall', 'accuracy', 'f1', 'roc_auc']
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        clf = GridSearchCV(model, tuned_parameters, cv=5, scoring=score)
        clf.fit(X_train, y_train)
        # Best parameters
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print((mean, std * 2, params))
        print()
        # classification report
        print("Detailed classification report:")
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        # Accuracy score
        acc = accuracy_score(y_pred=y_pred, y_true=y_test) * 100
        print('Accuracy ' + str(acc))
        print('Balanced accuracy score ' + str(balanced_accuracy_score(y_pred=y_pred, y_true=y_test) * 100))
        # f1_score
        f1 = f1_score(y_pred=y_pred, y_true=y_test, average='macro') * 100
        print('F1 score ' + str(f1))
        # precision_score
        prec = precision_score(y_pred=y_pred, y_true=y_test, average='weighted') * 100
        print('Precision score ' + str(prec))
        # log_loss
        print('Log loss ' + str(log_loss(y_pred=y_pred, y_true=y_test)))
        # recall_score
        recall = recall_score(y_pred=y_pred, y_true=y_test) * 100
        print('Recall score ' + str(recall))
        # roc_curve
        y_pred_proba = clf.predict_proba(X_test)[:, 1]
        false_positive_rate, true_positive_rate, thresholds = roc_curve(y_true=y_test, y_score=y_pred_proba)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(false_positive_rate, true_positive_rate, label='Grid Search')
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('ROC curve of ' + 'Grid Search')
        plt.show()
        # roc_auc_score
        roc = roc_auc_score(y_test, y_pred_proba) * 100
        print('ROC AUC score ' + str(roc))
        # confusion_matrix
        print('Confusion matrix ' + str(confusion_matrix(y_test, y_pred)))
        print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
    # understand the model through lime
    if lime_flag:
        lime_explainer(X_train, y_train, X_test, y_test, df_row=2,
                       model_predictor=clf, alogorithm_name="Grid Search")
    time_end = time.time() - start_time
    # Scores
    model_evaluation(X_train, y_train, X_test, y_test, y_pred, clf, time_end, alg_name='Grid Search')
    # return model object
    return clf
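
# Example usage (a minimal sketch, not part of the original module): the defaults run an SVC
# grid, but any classifier/grid pair can be passed in; the RandomForest grid below is
# illustrative only.
#
#   rf_grid = [{'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]}]
#   best_clf = grid_search(X_train, y_train, X_test, y_test, lime_flag=False,
#                          tuned_parameters=rf_grid,
#                          model=RandomForestClassifier(random_state=42))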