Example #1
import pandas as pd
import evaluation  # project module providing quadratic_weighted_kappa


def perform_cross_validation(model, kfold_train_test, features):
    '''
    Takes in the model to evaluate and the list of extracted train/test data
    from each of the K folds. Prints the kappa score for each fold and the
    final kappa score averaged over all folds. Returns a list with one
    DataFrame of (y_test, y_pred) per fold.
    '''
    score_count = 0
    score_total = 0.0
    test_data = []
    for X_train, y_train, X_test, y_test in kfold_train_test:
        # Restrict both splits to the selected feature columns.
        X_train = X_train[features]
        X_test = X_test[features]
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        score_count += 1
        score = evaluation.quadratic_weighted_kappa(y=y_test,
                                                    y_pred=predictions)
        score_total += score
        print("Score " + str(score_count) + ": " + str(score))

        y_and_y_pred = pd.DataFrame({'y': y_test, 'y_pred': predictions})

        # Add y_and_y_pred to test_data to return for future use in ensembling.
        test_data.append(y_and_y_pred)

    average_score = score_total / float(score_count)
    print("Average score: " + str(average_score))
    return test_data
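The kfold_train_test argument is expected to be a list of (X_train, y_train, X_test, y_test) tuples. A minimal sketch of how it could be built, assuming scikit-learn's KFold and a pandas DataFrame with one target column (the helper name build_kfold_train_test, the target_col argument, and n_splits=5 are illustrative, not from the original):

from sklearn.model_selection import KFold

def build_kfold_train_test(df, target_col, n_splits=5):
    # Split the DataFrame into K folds and collect (X_train, y_train, X_test, y_test) tuples.
    folds = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    for train_idx, test_idx in kf.split(df):
        train, test = df.iloc[train_idx], df.iloc[test_idx]
        folds.append((train.drop(columns=[target_col]), train[target_col],
                      test.drop(columns=[target_col]), test[target_col]))
    return folds

# e.g. perform_cross_validation(model, build_kfold_train_test(df, 'target'), features)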
Example #3
def perform_tfidf_cross_validation(tfv, pipeline, kfold_train_test):
    '''
    Same cross-validation loop as above, but fits the TF-IDF vectorizer tfv
    on each fold's training text and transforms both splits before fitting
    the pipeline.
    '''
    score_count = 0
    score_total = 0.0
    test_data = []
    for X_train, y_train, X_test, y_test in kfold_train_test:
        tfv.fit(X_train)
        X_train = tfv.transform(X_train)
        X_test = tfv.transform(X_test)
        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict(X_test)
        score_count += 1
        score = evaluation.quadratic_weighted_kappa(y=y_test, y_pred=predictions)
        score_total += score
        print("Score " + str(score_count) + ": " + str(score))
        y_and_y_pred = pd.DataFrame({'y': y_test, 'y_pred': predictions})
        test_data.append(y_and_y_pred)

    average_score = score_total / float(score_count)
    print("Average score: " + str(average_score))
    return test_data
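A hedged usage sketch for the TF-IDF variant, assuming tfv is scikit-learn's TfidfVectorizer and pipeline is any estimator exposing fit/predict; the vectorizer settings and the LogisticRegression choice are illustrative, not taken from the original:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

tfv = TfidfVectorizer(min_df=3, ngram_range=(1, 2), stop_words='english')
pipeline = Pipeline([('clf', LogisticRegression(max_iter=1000))])

# Here each X_train/X_test in kfold_train_test should hold raw text, since the
# vectorizer is fit and applied inside perform_tfidf_cross_validation.
tfidf_fold_results = perform_tfidf_cross_validation(tfv, pipeline, kfold_train_test)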
print "preds",preds[0]
wt_final = []
for w in wt_list:
    # Keep only weight vectors that sum to 1 (tolerance avoids float-equality issues).
    if abs(sum(w) - 1.0) < 1e-9:
        wt_final.append(w)

# Find the optimal weights.
max_average_score = 0
max_weights = None
for wt in wt_final:
    total_score = 0
    for i in range(5):
        y_true = preds[0][i]['y']
        weighted_prediction = sum([wt[x] * preds[x][i]['y_pred'].astype(int).reset_index() for x in range(6)])
        weighted_prediction = [round(p) for p in weighted_prediction['y_pred']]
        total_score += evaluation.quadratic_weighted_kappa(y=y_true, y_pred=weighted_prediction)
    average_score = total_score / 5.0
    if average_score > max_average_score:
        max_average_score = average_score
        max_weights = wt
print("Best set of weights: " + str(max_weights))
print("Corresponding score: " + str(max_average_score))


rf_final_predictions = pickle.load(open('rf_final_predictions.pkl', 'rb'))
svc_final_predictions = pickle.load(open('svc_final_predictions.pkl', 'rb'))
adaboost_final_predictions = pickle.load(open('adaboost_final_predictions.pkl', 'rb'))
tfidf_v1_final_predictions = pickle.load(open('tfidf_v1_final_predictions.pkl', 'rb'))
tfidf_v2_final_predictions = pickle.load(open('tfidf_v2_final_predictions.pkl', 'rb'))
knn_final_predictions = pickle.load(open('knn_final_predictions.pkl', 'rb'))
preds = [rf_final_predictions, svc_final_predictions, adaboost_final_predictions,
         tfidf_v1_final_predictions, tfidf_v2_final_predictions, knn_final_predictions]
# Test each candidate set of weights to see which one is best.
wt_final = []
for w in wt_list:
    if abs(sum(w) - 1.0) < 1e-9:
        wt_final.append(w)

# Find the optimal weights.
max_average_score = 0
max_weights = None
for wt in wt_final:
    total_score = 0
    for i in range(5):
        y_true = preds[0][i]['y']
        weighted_prediction = sum([wt[x] * preds[x][i]['y_pred'].astype(int).reset_index() for x in range(len(preds))])
        weighted_prediction = [round(p) for p in weighted_prediction['y_pred']]
        total_score += evaluation.quadratic_weighted_kappa(y=y_true, y_pred=weighted_prediction)
    average_score = total_score / 5.0
    if average_score > max_average_score:
        max_average_score = average_score
        max_weights = wt
print("Best set of weights: " + str(max_weights))
print("Corresponding score: " + str(max_average_score))


# Now perform the best ensembling on the full dataset
# using the optimal weights determined above.
rf_final_predictions = pickle.load(open('rf_final_predictions.pkl', 'rb'))
svc_final_predictions = pickle.load(open('svc_final_predictions.pkl', 'rb'))
adaboost_final_predictions = pickle.load(open('adaboost_final_predictions.pkl', 'rb'))
tfidf_v1_final_predictions = pickle.load(open('tfidf_v1_final_predictions.pkl', 'rb'))
tfidf_v2_final_predictions = pickle.load(open('tfidf_v2_final_predictions.pkl', 'rb'))
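The snippet cuts off here. A minimal sketch of how the final weighted ensemble might be formed, assuming each *_final_predictions object is a list of per-fold DataFrames with a 'y_pred' column (the same structure used in the weight search above), that the remaining knn predictions are loaded the same way, and that max_weights is the best weight tuple found earlier; this is an illustration, not the original code:

final_preds = [rf_final_predictions, svc_final_predictions, adaboost_final_predictions,
               tfidf_v1_final_predictions, tfidf_v2_final_predictions, knn_final_predictions]
ensembled_folds = []
for i in range(len(final_preds[0])):
    # Weighted sum of each model's fold-i predictions, rounded back to integer labels.
    weighted = sum(max_weights[x] * final_preds[x][i]['y_pred'].astype(float).reset_index(drop=True)
                   for x in range(len(final_preds)))
    ensembled_folds.append(weighted.round().astype(int))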