def register_model_with_scores(name, acc, precision, recall, f1, fn):
    mod_name.append(name)
    accuracy.append(acc)
    precision_score.append(precision)
    recall_score.append(recall)
    f1_score.append(f1)
    false_negative.append(fn)
def plot_metrics(report, acc):
    accuracy = []
    precision = []
    recall = []
    f1_score = []
    for mod in range(len(modulations)):
        plt.grid(True)  # the `b=` keyword was deprecated and later removed in matplotlib
        plt.title('{} Performance'.format(modulations[mod]))
        plt.ylabel('Performance')
        plt.xlabel('SNR [dB]')
        plt.xticks(np.arange(len(snr_list)),
                   [str(snr_values[i]) for i in range(len(snr_list))])
        for item in report:
            precision.append(item[str(mod)]['precision'])
            recall.append(item[str(mod)]['recall'])
            f1_score.append(item[str(mod)]['f1-score'])
        for item in acc[mod]:
            accuracy.append(item)
        plt.plot(accuracy, label='accuracy', linewidth=4)
        plt.plot(precision, label='precision', linewidth=2)
        plt.plot(recall, label='recall', linewidth=2, linestyle='--', color='r')
        plt.plot(f1_score, label='f1-score', linewidth=2)
        plt.legend(loc='best')
        #plt.show()
        plt.savefig('{}_software_performance.png'.format(modulations[mod]),
                    bbox_inches='tight', dpi=300)
        plt.clf()
        accuracy.clear()
        precision.clear()
        recall.clear()
        f1_score.clear()
def participant_analysis(big_table, tmp_list):
    means = []
    f1_score = []
    for i in tmp_list:
        tmp_value = big_table['actual_use_' + i] - big_table['Predictions_' + i]
        tmp_val_add = big_table['actual_use_' + i] + big_table['Predictions_' + i]
        tmp_abs = tmp_value.abs()
        means.append(tmp_abs.mean())
        # For 0/1 columns: sum > 1 -> true positive, sum < 1 -> true negative,
        # diff > 0 -> false negative, diff < 0 -> false positive.
        tp = len(tmp_val_add[tmp_val_add > 1].tolist())
        tn = len(tmp_val_add[tmp_val_add < 1].tolist())
        fn = len(tmp_value[tmp_value > 0].tolist())
        fp = len(tmp_value[tmp_value < 0].abs().tolist())
        precision = tp / (tp + fp)  # originally divided by fn; precision uses false positives
        recall = tp / (tp + fn)     # originally divided by fp; recall uses false negatives
        f1 = 2.0 * precision * recall / (precision + recall)
        f1_score.append(f1)
    final_value = 1 - np.mean(means)
    final_f1 = np.mean(f1_score)
    print(final_f1)
    return final_value
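# Hedged sanity check of the arithmetic above (an illustrative sketch, not part
# of the original pipeline): for binary 0/1 series, actual + predicted > 1 marks
# true positives, actual - predicted > 0 false negatives, and < 0 false positives.
import pandas as pd

actual = pd.Series([1, 1, 0, 0, 1])
predicted = pd.Series([1, 0, 1, 0, 1])
diff = actual - predicted
tp = int((actual + predicted > 1).sum())  # both 1 -> sum is 2
fn = int((diff > 0).sum())                # actual 1, predicted 0
fp = int((diff < 0).sum())                # actual 0, predicted 1
assert (tp, fn, fp) == (2, 1, 1)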
def run_all_models(dataset, data_file, embedding, is_oversample):
    x_text, labels = load_data(data_file)
    precision = []
    recall = []
    f1_score = []
    for i in range(4):
        results = train(x_text, labels, MODEL_TYPES[i], embedding, is_oversample)
        precision.append(results[2][0])
        recall.append(results[2][1])
        f1_score.append(results[2][2])
    plot_graph(precision, recall, f1_score, dataset, embedding)
def evaluate_model(model, X_test, Y_test, category_names):
    """
    Test the model with test data and print the average precision,
    recall and f1-score across all categories.

    Parameters
    ----------
    model : CLASS
        full NLP pipeline.
    X_test : array
        messages column.
    Y_test : array
        36 category labels.
    category_names : list
        names of the categories.

    Returns
    -------
    None.
    """
    # predict test values
    Y_pred = model.predict(X_test)
    f1_score = []
    precision = []
    classif = []
    recall_score = []
    # check precision, recall and f1-score for each classification column
    for n in range(Y_pred.shape[1]):
        s = classification_report(Y_test[:, n], Y_pred[:, n], output_dict=True)
        classif.append(category_names[n])
        precision.append(s['1.0']['precision'])
        f1_score.append(s['1.0']['f1-score'])
        recall_score.append(s['1.0']['recall'])
    # print result
    print('avg_precision:' + str(sum(precision) / len(precision))
          + ', avg_f1-score:' + str(sum(f1_score) / len(f1_score))
          + ', avg_recall-score:' + str(sum(recall_score) / len(recall_score)))
    # form a dataframe with the results and export it as an Excel file
    results = pd.DataFrame(classif, columns=['classifier'])
    results['precision'] = precision
    results['recall'] = recall_score
    results['f1-score'] = f1_score
    results['model'] = "DecisionTreeClassifier"
    results.to_excel("results.xlsx", index=False)
def pca_svm_time_score_compare():
    ac_score = []
    p_score = []
    r_score = []
    f1_score = []
    tt = []
    stand = MinMaxScaler((20, 30))  # rescale scores into 20-30 for marker sizes
    steps = numpy.arange(10, 410, 10)
    for n in steps:
        ac, p, r, f1, t = pca_svm(pca_n=n)
        p_score.append(p)
        f1_score.append(f1)
        r_score.append(r)
        ac_score.append(ac)
        tt.append(t)
    p_score_stand = stand.fit_transform(numpy.array(p_score).reshape((-1, 1)))
    r_score_stand = stand.fit_transform(numpy.array(r_score).reshape((-1, 1)))
    f1_score_stand = stand.fit_transform(numpy.array(f1_score).reshape((-1, 1)))
    ac_score_stand = stand.fit_transform(numpy.array(ac_score).reshape((-1, 1)))
    figure = pyplot.figure()
    pyplot.subplot(2, 1, 1)
    # each series is sized by its own standardized score (the f1 and precision
    # size arrays were swapped in the original)
    pyplot.scatter(steps, f1_score, label='f1-score', color='red', s=f1_score_stand, alpha=0.7)
    pyplot.scatter(steps, r_score, label='recall-score', color='blue', s=r_score_stand, alpha=0.7)
    pyplot.scatter(steps, p_score, label='precision-score', color='yellow', s=p_score_stand, alpha=0.7)
    pyplot.scatter(steps, ac_score, label='accuracy-score', color='purple', s=ac_score_stand, alpha=0.7)
    pyplot.xlabel('n-components')
    pyplot.ylabel('score')
    pyplot.legend()
    pyplot.title('The Score Of SVM After PCA To N_components')
    pyplot.subplot(2, 1, 2)
    pyplot.plot(steps, tt, label='cost-time', color='black', marker='o')
    # for i in range(len(tt)):
    #     pyplot.text(steps[i], ac_score[i], str(round(tt[i], 1)) + 's', fontdict=dict(size=10, weight='normal'))
    #     pyplot.plot([steps[i], steps[i]], [0, ac_score[i]], '--b')
    pyplot.legend()
    pyplot.xlabel('n-components')
    pyplot.ylabel('time')
    pyplot.show()
def train(self, algorithm):
    """Main training function. This is the entry point for a training
    process in this class.

    Arguments:
        algorithm {object} -- the instantiated algorithm object whose
            .fit() is called in order to get the model
    """
    # shuffle/random_state are keyword-only in current scikit-learn
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    f1_score = []
    precision_score = []
    recall_score = []
    for train, test in kfold.split(self.data_training, self.data_target):
        model = algorithm.fit(self.data_training.iloc[train],
                              self.data_target.iloc[train])
        scores = self.score_model(model,
                                  self.data_training.iloc[test],
                                  self.data_target.iloc[test])
        f1_score.append(scores[0])
        precision_score.append(scores[1])
        recall_score.append(scores[2])
    self.print_results(f1_score, precision_score, recall_score)
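# A hedged aside (not from the original class): the manual StratifiedKFold loop
# above can be reproduced with sklearn's cross_validate, which stratifies by
# default for classifiers when an integer cv is given.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X, y = make_classification(n_samples=200, random_state=1)
cv_results = cross_validate(LogisticRegression(max_iter=1000), X, y, cv=10,
                            scoring=['f1', 'precision', 'recall'])
print(cv_results['test_f1'].mean())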
# (fragment: inside a loop over labels j, tallying a 2x2 confusion matrix;
# Python 2 print statements converted to Python 3)
        else:
            confusion_matrix[1][1] += 1
    if confusion_matrix[0][0] + confusion_matrix[1][0] == 0:
        precision_for_label_j = 0
    else:
        precision_for_label_j = confusion_matrix[0][0] / float(
            confusion_matrix[0][0] + confusion_matrix[1][0])
    if confusion_matrix[0][0] + confusion_matrix[0][1] == 0:
        recall_for_label_j = 0
    else:
        recall_for_label_j = confusion_matrix[0][0] / float(
            confusion_matrix[0][0] + confusion_matrix[0][1])
    precision.append(precision_for_label_j)
    recall.append(recall_for_label_j)
    if precision_for_label_j + recall_for_label_j == 0:
        f1_score.append(0)
    else:
        f1_score.append(2 * precision_for_label_j * recall_for_label_j /
                        float(precision_for_label_j + recall_for_label_j))
    confusion_matrix = [[0, 0], [0, 0]]

for i in range(0, len(precision)):
    print(Top_tags[i], precision[i], recall[i], f1_score[i])
print("\n")
print(mean(precision))
print(mean(recall))
print(mean(f1_score))
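# A hedged cross-check (not from the original script): sklearn's
# precision_recall_fscore_support reproduces the per-label precision/recall/F1
# that the confusion-matrix bookkeeping above computes by hand.
from sklearn.metrics import precision_recall_fscore_support

y_true = [1, 0, 1, 1, 0]
y_pred = [1, 0, 0, 1, 1]
p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, labels=[1],
                                              zero_division=0)
print(p[0], r[0], f1[0])  # TP=2, FP=1, FN=1 -> all three equal 2/3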
classifiers = []
accuracy = []
f1_score = []  # reformat
max_accuracy = -1
max_accuracy_classifier = ""
max_f1_score = -1
max_f1_score_classifier = ""
for classifier in results.keys():
    classifiers.append(classifier)
    accuracy.append(results[classifier]["accuracy"])
    if results[classifier]["accuracy"] > max_accuracy:
        max_accuracy_classifier = classifier
        max_accuracy = results[classifier]["accuracy"]
    f1_score.append(results[classifier]["f1-score"])
    if results[classifier]["f1-score"] > max_f1_score:
        max_f1_score_classifier = classifier
        max_f1_score = results[classifier]["f1-score"]

#
# Sort
#
sorted_indices = np.argsort(accuracy)
classifiers = np.array(classifiers)[sorted_indices]
accuracy = np.array(accuracy)[sorted_indices]
f1_score = np.array(f1_score)[sorted_indices]

#
# Print
              'max_depth': [2, 4, 6, 8],                   #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7)  #uniform on [0.3, 1.0]
              }
rand_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                 scoring='f1_micro', n_iter=3, n_jobs=-1,
                                 verbose=10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_

#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_RICHMOND.pickle.dat", 'wb'))  #change pickle

# In[6]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')

#CSV append best score after test set
f1_score = []  # note: rebinding shadows sklearn.metrics.f1_score from here on
f1_score.append(('Richmond', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Richmond_results.dat", index=False, header=False)
    input_x, input_y, cv=5,
    scoring=['accuracy', 'precision_macro', 'recall_macro',
             'f1_macro', 'roc_auc', 'average_precision'],
    return_train_score=True,
    return_estimator=True)

# append performance indices
auroc.append(outputs['test_roc_auc'])
auprc.append(outputs['test_average_precision'])
acc.append(outputs['test_accuracy'])
precision.append(outputs['test_precision_macro'])
recall.append(outputs['test_recall_macro'])
f1_score.append(outputs['test_f1_macro'])

# build a CSV of how often each feature was selected
test = dict()
for f in vital:
    test[f] = 0
for i in list(test.keys()):
    for k in s:
        if k == i:
            test[i] += 1
for i in list(test.keys()):
    test[i] = [test[i]]

# compute the mean odds ratio
odds_dict = dict()
model = XGBClassifier(nthread=n_threads)  #or -1
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_seed)
param_grid = {'n_estimators': [120, 240, 360, 480],        #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08),  #uniform on [0.01, 0.09]
              'max_depth': [2, 4, 6, 8],                   #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7)  #uniform on [0.3, 1.0]
              }
rand_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                 scoring='f1_micro', n_iter=3, n_jobs=-1,
                                 verbose=10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_

#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_CENTRAL.pickle.dat", 'wb'))  #change pickle

# In[6]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')

#CSV append best score after test set
f1_score = []  # note: rebinding shadows sklearn.metrics.f1_score from here on
f1_score.append(('Central', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Central_results.dat", index=False, header=False)
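# Quick sanity check (illustrative, not from the notebook): scipy's
# stats.uniform(loc, scale) is uniform on [loc, loc + scale], so
# uniform(0.01, 0.08) spans [0.01, 0.09] and uniform(0.3, 0.7) spans [0.3, 1.0].
from scipy import stats

dist = stats.uniform(0.3, 0.7)
print(dist.ppf(0.0), dist.ppf(1.0))  # 0.3 1.0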
def crossValidationStrategy(self, reviews_df_preprocessed, do_pickle):
    print('\nK-Fold Cross Validation Strategy...\n')
    train_test_indices, X, y = self.kFoldSplit(reviews_df_preprocessed)

    accuracy = []
    precision = []
    recall = []
    f1 = []
    roc_auc = []
    cm = []
    for i in range(0, len(self.clf)):
        accuracy.append([])
        precision.append([])
        recall.append([])
        f1.append([])
        roc_auc.append([])
        cm.append(np.zeros((2, 2), dtype='int32'))

    for train_idx, test_idx in train_test_indices:
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        _, model = self.trainData(X_train, y_train, self.clf)
        prediction = self.predictData(X_test, model)
        clf_accuracy, clf_precision, clf_recall, clf_f1, clf_roc_auc, clf_cm, _ = \
            self.evaluate(y_test, prediction)
        for j in range(0, len(self.clf)):
            accuracy[j].append(clf_accuracy[j])
            precision[j].append(clf_precision[j])
            recall[j].append(clf_recall[j])
            f1[j].append(clf_f1[j])
            roc_auc[j].append(clf_roc_auc[j])
            cm[j] += clf_cm[j]

    acc = []
    prec = []
    rec = []
    f1_score = []
    auc = []
    for i in range(0, len(self.clf)):
        if i == 0:
            print('======================================================\n')
        print('Evaluation metrics of Classifier ' + self.clf_names[i] + ':')
        print('Accuracy: {}'.format(np.mean(accuracy[i])))
        print('Precision: {}'.format(np.mean(precision[i])))
        print('Recall: {}'.format(np.mean(recall[i])))
        print('F1-score: {}'.format(np.mean(f1[i])))
        print('ROC AUC: {}'.format(np.mean(roc_auc[i])))
        print('Confusion Matrix: \n{}\n'.format(cm[i]))
        print('======================================================\n')
        acc.append(np.mean(accuracy[i]))
        prec.append(np.mean(precision[i]))
        rec.append(np.mean(recall[i]))
        f1_score.append(np.mean(f1[i]))
        auc.append(np.mean(roc_auc[i]))

    # use the cross-fold means (the original used the last fold's clf_* values)
    metrics_list = {
        'Classifier': self.clf_names,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-score': f1_score,
        'ROC AUC': auc
    }
    metrics_df = pd.DataFrame.from_dict(metrics_list)
    print('Comparison of different metrics for the various Classifiers used:\n')
    print(metrics_df)

    if do_pickle:
        with open('pickled/metrics_dataframe_kfold.pickle', 'wb') as df_kfold:
            pickle.dump(metrics_df, df_kfold)
              'max_depth': [2, 4, 6, 8],                   #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7)  #uniform on [0.3, 1.0]
              }
rand_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                 scoring='f1_micro', n_iter=3, n_jobs=-1,
                                 verbose=10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_

#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_MISSION.pickle.dat", 'wb'))  #change pickle

# In[6]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')

#CSV append best score after test set
f1_score = []  # note: rebinding shadows sklearn.metrics.f1_score from here on
f1_score.append(('Mission', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Mission_results.dat", index=False, header=False)
y_pred = model.predict(X)
cm_train = confusion_matrix(y, y_pred)
#print(names)
report = classification_report(y, y_pred, output_dict=True)
#accurate = accuracy_score(y_train, y_pred)
#print(accurate)
macro_precision = report['macro avg']['precision']
macro_recall = report['macro avg']['recall']
macro_f1 = report['macro avg']['f1-score']
accuracy = report['accuracy']
print('macro precision:', macro_precision)  # was a 'jjjjj' debug print
#f1_score1 = (f1_score(y, y_pred))
f1_score.append(macro_f1)
#precision_score1 = (precision_score(y, y_pred))
precision_score.append(macro_precision)
#recall_score1 = (recall_score(y, y_pred))
recall_score.append(macro_recall)
#acc = accuracy_score(y, y_pred)
accuracy_score.append(accuracy)
# sensitivity = TP / (TP + FN) from the confusion matrix
sensitivity1 = cm_train[1, 1] / (cm_train[1, 0] + cm_train[1, 1])
#print(sensitivity)  #0.6666666666666666
sensitivity.append(sensitivity1)
#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_CITY.pickle.dat", 'wb'))  #change pickle

# In[109]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)

# In[111]:

preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')

#CSV append best score after test set
f1_score = []  # note: rebinding shadows sklearn.metrics.f1_score from here on
f1_score.append(('City', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("cityresults.dat", index=False, header=False)

# In[156]:

#add code for graphing each parameter from cv search...
#https://stackoverflow.com/questions/42793254/what-replaces-gridsearchcv-grid-scores-in-scikit
param_scores = rand_result.cv_results_['mean_test_score']

# In[165]:

param_list = rand_result.cv_results_['params']
              'max_depth': [2, 4, 6, 8],                   #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7)  #uniform on [0.3, 1.0]
              }
rand_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                 scoring='f1_micro', n_iter=3, n_jobs=-1,
                                 verbose=10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_

#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_INGLESIDE.pickle.dat", 'wb'))  #change pickle

# In[16]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')

#CSV append best score after test set
f1_score = []  # note: rebinding shadows sklearn.metrics.f1_score from here on
f1_score.append(('Ingleside', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Ingleside_results.dat", index=False, header=False)
sns.set_style("whitegrid") msplt = sns.barplot(x = "model", y = "f1_score", data=mscore_df) _ = plt.xlabel('Model') _ = plt.ylabel('f1 score') _ = plt.savefig('model_selection_bar') _ = plt.show() # In[71]: #test on test set here. #XGBoost non one-hot-encoded was the best model but only very slightly, XGB one-hot-encoded alone without #feature reduction performed nearly as well and was much more efficient in time computations, thus I will use #XGB w/o feature selection and with one-hot-encoding for my final modelling on test set as well as in other #districts and whole city #train the best estimator on the train set again then test on test set for results best_XGB_only_estimator.fit(X_train, y_train) preds = best_XGB_only_estimator.predict(X_test) f1score = f1_score(y_test, preds, average = 'micro') # In[72]: f1_score = [] f1_score.append(('Bayview', f1score)) export_df = pd.DataFrame(f1_score) export_df.to_csv("bayviewresults.dat", index = False, header = False)
              'max_depth': [2, 4, 6, 8],                   #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7)  #uniform on [0.3, 1.0]
              }
rand_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                 scoring='f1_micro', n_iter=3, n_jobs=-1,
                                 verbose=10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_

#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_NORTHERN.pickle.dat", 'wb'))  #change pickle

# In[6]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')

#CSV append best score after test set
f1_score = []  # note: rebinding shadows sklearn.metrics.f1_score from here on
f1_score.append(('Northern', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Northern_results.dat", index=False, header=False)
loss_test = []
f1_score_test = []
neural = []
for i in range(20):
    mlp = sklearn.neural_network.MLPClassifier(activation='logistic',
                                               hidden_layer_sizes=(i + 1,),
                                               max_iter=2000)
    mlp.fit(x_train, y_train)
    loss.append(mlp.loss_)
    f1_score.append(
        sklearn.metrics.f1_score(y_train, mlp.predict(x_train), average='macro'))
    loss_test.append(mlp.loss_)  # note: mlp.loss_ is the training loss
    f1_score_test.append(
        sklearn.metrics.f1_score(y_test, mlp.predict(x_test), average='macro'))
    neural.append(i + 1)
    print('Loss', mlp.loss_)
    print('F1', sklearn.metrics.f1_score(y_test, mlp.predict(x_test), average='macro'))

# In[7]:

loss = np.array(loss)
def predict_result(list_of_random_items):
    def xgboost_hot_cold(date, Data):
        pd.options.mode.chained_assignment = None  # default='warn'
        contents = pd.DataFrame(Data, columns=[
            'PAYMENT', 'PROGRAM_TYPE', 'New_Contents', 'genre_Label',
            'target_age', 'playtime', 'channel_Label', 'contentnumber',
            'episode_count', 'past_view'
        ])
        # training features: existing (non-new) contents of the first program type
        x_train = contents[contents['New_Contents'] == 0]
        x_train.loc[:, 'PROGRAM_TYPE'] = round(x_train.loc[:, 'PROGRAM_TYPE'])
        x_train = x_train[x_train['PROGRAM_TYPE'] == x_train.PROGRAM_TYPE.unique()[0]]
        x_train.contentnumber = x_train.contentnumber.fillna(0)
        x_train.episode_count = x_train.episode_count.fillna(1)
        x_train = x_train.drop('New_Contents', axis=1)
        x_train = x_train.drop('PROGRAM_TYPE', axis=1)
        x_train = x_train.values.astype('float32')
        scaler = MinMaxScaler(feature_range=(0, 1))
        x_train = scaler.fit_transform(x_train)

        # test features: new contents, scaled with the training scaler
        x_test = contents[contents['New_Contents'] == 1]
        x_test = x_test[x_test['PROGRAM_TYPE'] == x_test.PROGRAM_TYPE.unique()[0]]
        x_test.contentnumber = x_test.contentnumber.fillna(0)
        x_test.episode_count = x_test.episode_count.fillna(1)
        x_test = x_test.drop('New_Contents', axis=1)
        x_test = x_test.drop('PROGRAM_TYPE', axis=1)
        x_test = x_test.values.astype('float32')
        x_test = scaler.transform(x_test)

        contents = pd.DataFrame(Data, columns=['PROGRAM_TYPE', 'New_Contents', 'ViewCount'])
        y_train = contents[contents['New_Contents'] == 0]
        y_train = y_train.drop('New_Contents', axis=1)
        y_train.loc[:, 'PROGRAM_TYPE'] = round(y_train.loc[:, 'PROGRAM_TYPE'])
        y_train = y_train[y_train['PROGRAM_TYPE'] == y_train.PROGRAM_TYPE.unique()[0]]
        y_train = y_train.drop('PROGRAM_TYPE', axis=1)
        y_train = y_train.values.astype('float32')

        y_test = contents[contents['New_Contents'] == 1]
        y_test = y_test[y_test['PROGRAM_TYPE'] == y_test.PROGRAM_TYPE.unique()[0]]
        y_test = y_test.drop('PROGRAM_TYPE', axis=1)
        y_test = y_test.drop('New_Contents', axis=1)
        y_test = y_test.values

        import xgboost as xgb
        xgb = xgb.XGBRegressor(colsample_bytree=1, learning_rate=0.4,
                               n_estimators=1000, max_depth=8, min_child_weight=1,
                               max_delta_step=2.5, gamma=1.0, subsample=0.8,
                               objective='reg:squarederror',  # 'reg:linear' is a deprecated alias
                               n_jobs=8, scale_pos_weight=1.8, random_state=27,
                               base_score=0.5)
        xgb.fit(x_train, y_train)
        xgb_preds = xgb.predict(x_test)
        # clip predictions to at least 1 view and round to whole counts
        for i in range(xgb_preds.shape[0]):
            if xgb_preds[i] < 1:
                xgb_preds[i] = 1
            else:
                xgb_preds[i] = int(xgb_preds[i])
        for i in range(xgb_preds.shape[0]):
            xgb_preds[i] = round(xgb_preds[i])

        contents = pd.DataFrame(Data, columns=[
            'EPISODE', 'PAYMENT', 'PROGRAM_TYPE', 'ViewCount', 'New_Contents',
            'genre_Label', 'target_age', 'playtime', 'channel_Label',
            'contentnumber', 'episode_count', 'past_view'
        ])
        x_test = contents[contents['New_Contents'] == 1]
        x_test = x_test.drop('ViewCount', axis=1)
        x_test = x_test[x_test['PROGRAM_TYPE'] == x_test.PROGRAM_TYPE.unique()[0]]
        xgb_preds = pd.DataFrame(xgb_preds, columns=['ViewCount'])
        xgb_preds.index = x_test.index
        H_C = pd.concat([x_test, xgb_preds], axis=1)
        old = contents[contents['New_Contents'] == 0]
        old = old[old['PROGRAM_TYPE'] == old.PROGRAM_TYPE.unique()[0]]
        H_C = H_C[[
            'EPISODE', 'PAYMENT', 'PROGRAM_TYPE', 'ViewCount', 'New_Contents',
            'genre_Label', 'target_age', 'playtime', 'channel_Label',
            'contentnumber', 'episode_count', 'past_view'
        ]]
        # rank all contents by view count; the top fifth is labelled HOT
        yhat = pd.concat([old, H_C], axis=0)
        yhat = yhat.sort_values(by=['ViewCount'], ascending=False)
        values = yhat.values
        new_index = yhat[yhat['New_Contents'] == 1].index
        Hot_index = round(yhat.shape[0] / 5)
        yhat.index = range(0, len(yhat))
        HC = []
        for i in range(yhat.shape[0]):
            if yhat.index[i] < (Hot_index + 1):
                HC.append('HOT')
            else:
                HC.append('COLD')
        HC = pd.DataFrame(HC, columns=['H&C'])
        yhat = pd.concat([yhat, HC], axis=1)
        yhat = yhat[yhat['New_Contents'] == 1]
        yhat.index = new_index
        yhat = yhat.sort_index(ascending=True)
        yhat = yhat['H&C'].values

        actual = pd.DataFrame(Data, columns=[
            'PAYMENT', 'PROGRAM_TYPE', 'New_Contents', 'H&C', 'genre_Label',
            'target_age', 'playtime', 'channel_Label', 'contentnumber', 'past_view'
        ])
        actual = actual[actual['New_Contents'] == 1]
        actual = actual[actual['PROGRAM_TYPE'] == actual.PROGRAM_TYPE.unique()[0]]
        actual = actual['H&C'].values

        from sklearn.metrics import recall_score, precision_score, f1_score
        recall_scores = recall_score(actual, yhat, average='macro', labels=['HOT'])
        recall_score_cold = recall_score(actual, yhat, average='macro', labels=['COLD'])
        precision_scores = precision_score(actual, yhat, average='macro', labels=['HOT'])
        precision_score_cold = precision_score(actual, yhat, average='macro', labels=['COLD'])
        f_score = f1_score(actual, yhat, average='macro', labels=['HOT'])
        f_score_cold = f1_score(actual, yhat, average='macro', labels=['COLD'])
        return (f_score, f_score_cold, recall_scores, precision_scores,
                recall_score_cold, precision_score_cold)

    precision_cold = []
    precision = []
    recall_cold = []
    recall = []
    f1_score = []
    f1_score_colds = []
    for i in tqdm(list_of_random_items):
        db = pymysql.connect(
            host='gcsdbinstance.cu0nuaw6yxna.us-east-1.rds.amazonaws.com',
            port=3306, user='******', passwd='******',  # credentials redacted
            db='gcs_database', charset='utf8')
        cursor = db.cursor()
        sql = "SELECT * FROM gcs_database." + str(i)
        cursor.execute(sql)
        Data = pd.read_sql(sql, db)
        print('Date: ', i)
        (f_score, f_score_cold, recall_scores, precision_scores,
         recall_score_cold, precision_score_cold) = xgboost_hot_cold(i, Data)
        print('Precision-Score: ', precision_scores)
        print('Recall-Score: ', recall_scores)
        print('F1-Score: ', f_score)
        precision_cold.append(precision_score_cold)
        precision.append(precision_scores)
        recall_cold.append(recall_score_cold)
        recall.append(recall_scores)
        f1_score.append(f_score)
        f1_score_colds.append(f_score_cold)
    print('View pattern classification Precision - New_HOT : ' + str(np.mean(precision)))
    print('View pattern classification Recall - New_HOT : ' + str(np.mean(recall)))
    print('View pattern classification F1-Score - New_HOT : ' + str(np.mean(f1_score)))
    print('View pattern classification Precision - New_COLD : ' + str(np.mean(precision_cold)))
    print('View pattern classification Recall - New_COLD : ' + str(np.mean(recall_cold)))
    print('View pattern classification F1-Score - New_COLD : ' + str(np.mean(f1_score_colds)))
    return 'F1-score : ' + str(np.mean(f1_score)), np.mean(f1_score)
         XGBClassifier_result, GaussianNB_result, KNN_result, Category_boost, voting]

# +
roc_score = []
f1_score = []
for i in model:
    roc_mean = pd.Series(i['test_ROC-AUC']).mean()
    f1_mean = pd.Series(i['test_f1']).mean()
    roc_score.append(roc_mean)
    f1_score.append(f1_mean)

print('roc_score:', roc_score)
print('f1_score:', f1_score)
# -

model_name = ['LGBMClassifier_result', 'DecisionTreeClassifier_result',
              'LogisticRegression_result', 'RandomForestClassifier_result',
              'GradientBoostingClassifier_result', 'XGBClassifier_result',
              'GaussianNB_result', 'KNN_result',
model = XGBClassifier(nthread=n_threads)  #or -1
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_seed)
param_grid = {'n_estimators': [120, 240, 360, 480],        #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08),  #uniform on [0.01, 0.09]
              'max_depth': [2, 4, 6, 8],                   #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7)  #uniform on [0.3, 1.0]
              }
rand_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                 scoring='f1_micro', n_iter=3, n_jobs=-1,
                                 verbose=10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_

#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_SOUTHERN.pickle.dat", 'wb'))  #change pickle

# In[6]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')

#CSV append best score after test set
f1_score = []  # note: rebinding shadows sklearn.metrics.f1_score from here on
f1_score.append(('Southern', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Southern_results.dat", index=False, header=False)
              'max_depth': [2, 4, 6, 8],                   #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7)  #uniform on [0.3, 1.0]
              }
rand_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                 scoring='f1_micro', n_iter=3, n_jobs=-1,
                                 verbose=10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_

#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_PARK.pickle.dat", 'wb'))  #change pickle

# In[6]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')

#CSV append best score after test set
f1_score = []  # note: rebinding shadows sklearn.metrics.f1_score from here on
f1_score.append(('Park', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Park_results.dat", index=False, header=False)
              'max_depth': [2, 4, 6, 8],                   #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7)  #uniform on [0.3, 1.0]
              }
rand_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                 scoring='f1_micro', n_iter=3, n_jobs=-1,
                                 verbose=10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_

#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_TARAVAL.pickle.dat", 'wb'))  #change pickle

# In[12]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')

#CSV append best score after test set
f1_score = []  # note: rebinding shadows sklearn.metrics.f1_score from here on
f1_score.append(('Taraval', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Taraval_results.dat", index=False, header=False)
model = XGBClassifier(nthread=n_threads)  #or -1
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_seed)
param_grid = {'n_estimators': [120, 240, 360, 480],        #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08),  #uniform on [0.01, 0.09]
              'max_depth': [2, 4, 6, 8],                   #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7)  #uniform on [0.3, 1.0]
              }
rand_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                 scoring='f1_micro', n_iter=3, n_jobs=-1,
                                 verbose=10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_

#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_TENDERLOIN.pickle.dat", 'wb'))  #change pickle

# In[6]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')

#CSV append best score after test set
f1_score = []  # note: rebinding shadows sklearn.metrics.f1_score from here on
f1_score.append(('Tenderloin', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Tenderloin_results.dat", index=False, header=False)
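# A hedged refactor sketch (hypothetical names, not from the notebooks above):
# rebinding `f1_score` to a list, as the district cells do, shadows
# sklearn.metrics.f1_score and would break any later cell that calls it.
# Keeping the metric function and the results container under different names
# avoids the pitfall.
from sklearn.metrics import f1_score

district_scores = []  # results container, deliberately not named f1_score
score = f1_score(y_test, preds, average='micro')  # assumes y_test / preds exist
district_scores.append(('District', score))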