def perform_cross_validation(model, kfold_train_test, features):
    '''
    Evaluate `model` across the supplied K-fold train/test splits.

    For each (X_train, y_train, X_test, y_test) tuple, the model is fitted
    on the selected feature columns and scored with quadratic weighted
    kappa.  Each fold's score and the overall average are printed.

    Returns a list with one DataFrame per fold, holding columns 'y'
    (true labels) and 'y_pred' (predictions), for later use in ensembling.
    '''
    fold_frames = []
    kappa_scores = []
    for fold_num, (X_train, y_train, X_test, y_test) in enumerate(kfold_train_test, 1):
        # Restrict both splits to the chosen feature columns before fitting.
        model.fit(X_train[features], y_train)
        y_hat = model.predict(X_test[features])
        kappa = evaluation.quadratic_weighted_kappa(y=y_test, y_pred=y_hat)
        kappa_scores.append(kappa)
        print(("Score " + str(fold_num) + ": " + str(kappa)))
        # Keep y/y_pred pairs so the caller can ensemble models later.
        fold_frames.append(pd.DataFrame({'y': y_test, 'y_pred': y_hat}))
    average = sum(kappa_scores) / float(len(kappa_scores))
    print(("Average score: " + str(average)))
    return fold_frames
def perform_cross_validation(model, kfold_train_test, features):
    '''
    Run K-fold cross-validation for `model`.

    `kfold_train_test` is a list of (X_train, y_train, X_test, y_test)
    tuples, one per fold; `features` selects the columns used for fitting.
    Prints the quadratic weighted kappa of every fold plus the average.

    Returns a list of per-fold DataFrames with columns 'y' and 'y_pred'
    so the predictions can be reused for ensembling.
    '''
    per_fold_frames = []
    scores = []
    for split in kfold_train_test:
        X_tr, y_tr, X_te, y_te = split
        # Fit/predict on the feature subset only.
        model.fit(X_tr[features], y_tr)
        y_hat = model.predict(X_te[features])
        kappa = evaluation.quadratic_weighted_kappa(y=y_te, y_pred=y_hat)
        scores.append(kappa)
        print("Score " + str(len(scores)) + ": " + str(kappa))
        per_fold_frames.append(pd.DataFrame({'y': y_te, 'y_pred': y_hat}))
    print("Average score: " + str(sum(scores) / float(len(scores))))
    return per_fold_frames
def perform_tfidf_cross_validation(tfv, pipeline, kfold_train_test):
    '''
    Cross-validate a TF-IDF vectorizer + estimator pipeline.

    For each fold the vectorizer `tfv` is fitted on the training text only
    (then applied to both splits), the pipeline is fitted on the resulting
    matrices, and the fold is scored with quadratic weighted kappa.
    Per-fold scores and the overall average are printed.

    Returns a list of per-fold DataFrames with columns 'y' and 'y_pred'.
    '''
    results = []
    scores = []
    fold_no = 0
    for X_train, y_train, X_test, y_test in kfold_train_test:
        fold_no += 1
        # Fit TF-IDF on the training split only, then vectorize both splits.
        tfv.fit(X_train)
        train_matrix = tfv.transform(X_train)
        test_matrix = tfv.transform(X_test)
        pipeline.fit(train_matrix, y_train)
        y_hat = pipeline.predict(test_matrix)
        kappa = evaluation.quadratic_weighted_kappa(y=y_test, y_pred=y_hat)
        scores.append(kappa)
        print("Score " + str(fold_no) + ": " + str(kappa))
        results.append(pd.DataFrame({'y': y_test, 'y_pred': y_hat}))
    print("Average score: " + str(sum(scores) / float(len(scores))))
    return results
print "preds",preds[0] wt_final = [] for w in wt_list: if sum(w) == 1.0: wt_final.append(w) #Find the optimal weights. max_average_score = 0 max_weights = None for wt in wt_final: total_score = 0 for i in range(5): y_true = preds[0][i]['y'] weighted_prediction = sum([wt[x] * preds[x][i]['y_pred'].astype(int).reset_index() for x in range(6)]) weighted_prediction = [round(p) for p in weighted_prediction['y_pred']] total_score += evaluation.quadratic_weighted_kappa(y = y_true, y_pred = weighted_prediction) average_score = total_score/5.0 if average_score > max_average_score: max_average_score = average_score max_weights = wt print "Best set of weights: " + str(max_weights) print "Corresponding score: " + str(max_average_score) rf_final_predictions = pickle.load(open('rf_final_predictions.pkl', 'r')) svc_final_predictions = pickle.load(open('svc_final_predictions.pkl', 'r')) adaboost_final_predictions = pickle.load(open('adaboost_final_predictions.pkl', 'r')) tfidf_v1_final_predictions = pickle.load(open('tfidf_v1_final_predictions.pkl', 'r')) tfidf_v2_final_predictions = pickle.load(open('tfidf_v2_final_predictions.pkl', 'r')) knn_final_predictions = pickle.load(open('knn_final_predictions.pkl', 'r')) preds = [rf_final_predictions, svc_final_predictions, adaboost_final_predictions, tfidf_v1_final_predictions, tfidf_v2_final_predictions,knn_final_predictions]
#test to see which one is best). wt_final = [] for w in wt_list: if sum(w) == 1.0: wt_final.append(w) #Find the optimal weights. max_average_score = 0 max_weights = None for wt in wt_final: total_score = 0 for i in range(5): y_true = preds[0][i]['y'] weighted_prediction = sum([wt[x] * preds[x][i]['y_pred'].astype(int).reset_index() for x in range(5)]) weighted_prediction = [round(p) for p in weighted_prediction['y_pred']] total_score += evaluation.quadratic_weighted_kappa(y = y_true, y_pred = weighted_prediction) average_score = total_score/5.0 if average_score > max_average_score: max_average_score = average_score max_weights = wt print "Best set of weights: " + str(max_weights) print "Corresponding score: " + str(max_average_score) #Now perform the best ensembling on the full dataset #using the optimanl weights determined above rf_final_predictions = pickle.load(open('rf_final_predictions.pkl', 'r')) svc_final_predictions = pickle.load(open('svc_final_predictions.pkl', 'r')) adaboost_final_predictions = pickle.load(open('adaboost_final_predictions.pkl', 'r')) tfidf_v1_final_predictions = pickle.load(open('tfidf_v1_final_predictions.pkl', 'r')) tfidf_v2_final_predictions = pickle.load(open('tfidf_v2_final_predictions.pkl', 'r'))