def run_model(model_A, model_B, data_set_name, mode, cutoff_filename=""):
    """
    Run model_A (and optionally model_B) on every day of a data set.

    Parameters:
        model_A         -- identifier of the primary model, passed through to
                           return_predictions.
        model_B         -- identifier of an optional second model to compare
                           against, or None to run model_A alone.
        data_set_name   -- "InitialTrainingSet_rev1" or "PublicLeaderboardSet";
                           any other value prints a warning and processes no
                           days.
        mode            -- value tested with `in`: one containing "leaderboard"
                           writes model_A's predictions to 'test.csv', one
                           containing "training" scores the predictions.
        cutoff_filename -- forwarded unchanged to fd.FlightDay.

    Returns:
        In training mode, a dict mapping str(model_A) and str(model_B) to
        their RMSE scores (the model_B entry is None when model_B is None).
        In every other case the function returns None.
    """
    # Load list of folder names which each contain a day
    if data_set_name == "InitialTrainingSet_rev1":
        days_list = fn.folder_names_init_set()
    elif data_set_name == "PublicLeaderboardSet":
        days_list = fn.folder_names_test_set()
    else:
        days_list = []
        print("Problem with data set name!")

    # If we have a second model to compare against, to check for
    # improvements or anything like that, it is tracked alongside model_A.
    compare = model_B is not None

    # fin_X contains the final predictions for model X
    fin_A = fd.FlightPredictions()
    if compare:
        fin_B = fd.FlightPredictions()

    print("Using mode: {}".format(mode))
    print("Using data from {}".format(data_set_name))

    # `day` is read again after the loop (for the score log), so bind it up
    # front: with an empty day list it would otherwise be undefined.
    day = None
    n_days = len(days_list)

    # Loop through all of the days in the data set
    for i, d in enumerate(days_list):
        if compare:
            print("Running models '{}', '{}' on day {} (day {} of {}):".format(model_A, model_B, d, i + 1, n_days))
        else:
            print("Running model '{}' on day {} (day {} of {}):".format(model_A, d, i + 1, n_days))

        # Initialize all the information about each day.
        # Extended means we include the flight history events file
        # day = efd.ExtendedFlightDay(d, data_set_name, mode, cutoff_filename)
        day = fd.FlightDay(d, data_set_name, mode, cutoff_filename)
        # day = ad.ASDIDay(d, data_set_name, mode, cutoff_filename)
        # day = ead.ExtendedASDIDay(d, data_set_name, mode, cutoff_filename)

        # Compute the predictions for the day
        fin_A = return_predictions(model_A, day, fin_A)
        if compare:
            fin_B = return_predictions(model_B, day, fin_B)

        print("\tDay {} has finished".format(d))
        print("")

    print("All days in {} are done!".format(data_set_name))

    if "leaderboard" in mode:
        # In leaderboard mode we just write the predictions to a csv file
        # for submission to kaggle
        fin_A.flight_predictions = fin_A.flight_predictions.sort(columns='flight_history_id')
        fin_A.flight_predictions.to_csv('test.csv', index=False)
        print("Predictions written to csv file in Python folder.")
        if compare:
            print("Warning: we have disregarded the output of '{}'!".format(model_B))
    elif "training" in mode:
        # In training mode we can calculate the root mean squared error as we
        # know the true values
        score_A = rmse.calculate_rmse_score(fin_A.flight_predictions, fin_A.test_data)
        if compare:
            score_B = rmse.calculate_rmse_score(fin_B.flight_predictions, fin_B.test_data)
        else:
            score_B = None

        scores = {str(model_A): score_A, str(model_B): score_B}

        # Write the scores to the score log for record keeping
        # See if we are making improvements. The log entry is built from the
        # last processed day, so it is skipped when no day was processed.
        if day is not None:
            log_predictions(day, model_A, model_B, scores, "scores.log")

        return scores
    else:
        print("Not an option!")
def r_forest():
    """
    Cross-validate a random forest on the 'all_combined_no_dates' data.

    Loads the data through load_and_format_data, runs a shuffled 2-fold
    split, fits the forest on each training fold and predicts the held-out
    fold. Column 0 of the prediction is stored as the runway arrival and
    column 1 as the gate arrival. Each fold is scored with
    rmse.calculate_rmse_score; the list of scores, their mean and their
    standard deviation are printed. Returns None.
    """
    # NOTE(review): an identical r_forest is defined again further down this
    # file; that later definition is the one bound to the name after import.
    import sys

    # The data set is loaded once; the original loaded it a second time into
    # X_test / y_test / ind_test, which were never used.
    X_train, y_train, ind_train = load_and_format_data('all_combined_no_dates')

    print(len(X_train))

    forest = RandomForestRegressor(n_estimators=2, random_state=None, n_jobs=-1)
    # Alternative regressors that were tried in place of the one above:
    # forest = ExtraTreesRegressor(n_estimators=200, random_state=None, n_jobs=-1)
    # forest = GradientBoostingRegressor(n_estimators=200,
    #     learn_rate=0.1, max_depth=5, random_state=None, loss='ls')

    score = []

    kfold = cross_validation.KFold(n=len(X_train), k=2, indices=False, shuffle=True)

    for i, (traincv, testcv) in enumerate(kfold):
        print(i)
        pred = fd.FlightPredictions()

        # Write without a newline and flush so the progress message is
        # visible while the (potentially slow) fit is still running.
        sys.stdout.write("Starting training... ")
        sys.stdout.flush()
        forest.fit(X_train[traincv], y_train[traincv])
        print("done")

        sys.stdout.write("Starting prediction... ")
        sys.stdout.flush()
        y_pred = forest.predict(X_train[testcv])
        print("done")

        ind_pred = ind_train[testcv].values
        y_pred_runway = y_pred[:, 0]
        y_pred_gate = y_pred[:, 1]

        # Give both frames one row per held-out flight before filling them.
        row_index = range(len(ind_pred))
        pred.flight_predictions = pred.flight_predictions.reindex(row_index)
        pred.test_data = pred.test_data.reindex(row_index)

        pred.flight_predictions['flight_history_id'] = ind_pred
        pred.flight_predictions['actual_runway_arrival'] = y_pred_runway
        pred.flight_predictions['actual_gate_arrival'] = y_pred_gate

        pred.test_data['flight_history_id'] = ind_pred
        pred.test_data['actual_runway_arrival'] = \
            y_train['actual_runway_arrival_minutes_after_midnight'][testcv].values
        pred.test_data['actual_gate_arrival'] = \
            y_train['actual_gate_arrival_minutes_after_midnight'][testcv].values

        score.append(rmse.calculate_rmse_score(pred.flight_predictions, pred.test_data))

    # NOTE(review): the original indentation was lost; the summary is assumed
    # to be printed once after all folds rather than per fold -- confirm.
    print(score)
    print(np.mean(score))
    print(np.std(score))
def r_forest():
    """
    Cross-validate a random forest on the 'all_combined_no_dates' data.

    Loads the data through load_and_format_data, runs a shuffled 2-fold
    split, fits the forest on each training fold and predicts the held-out
    fold. Column 0 of the prediction is stored as the runway arrival and
    column 1 as the gate arrival. Each fold is scored with
    rmse.calculate_rmse_score; the list of scores, their mean and their
    standard deviation are printed. Returns None.
    """
    # NOTE(review): an identical r_forest is also defined earlier in this
    # file; this later definition replaces it, so one of them can be removed.
    import sys

    # The data set is loaded once; the original loaded it a second time into
    # X_test / y_test / ind_test, which were never used.
    X_train, y_train, ind_train = load_and_format_data('all_combined_no_dates')

    print(len(X_train))

    forest = RandomForestRegressor(n_estimators=2, random_state=None, n_jobs=-1)
    # Alternative regressors that were tried in place of the one above:
    # forest = ExtraTreesRegressor(n_estimators=200, random_state=None, n_jobs=-1)
    # forest = GradientBoostingRegressor(n_estimators=200,
    #     learn_rate=0.1, max_depth=5, random_state=None, loss='ls')

    score = []

    kfold = cross_validation.KFold(n=len(X_train), k=2, indices=False, shuffle=True)

    for i, (traincv, testcv) in enumerate(kfold):
        print(i)
        pred = fd.FlightPredictions()

        # Write without a newline and flush so the progress message is
        # visible while the (potentially slow) fit is still running.
        sys.stdout.write("Starting training... ")
        sys.stdout.flush()
        forest.fit(X_train[traincv], y_train[traincv])
        print("done")

        sys.stdout.write("Starting prediction... ")
        sys.stdout.flush()
        y_pred = forest.predict(X_train[testcv])
        print("done")

        ind_pred = ind_train[testcv].values
        y_pred_runway = y_pred[:, 0]
        y_pred_gate = y_pred[:, 1]

        # Give both frames one row per held-out flight before filling them.
        row_index = range(len(ind_pred))
        pred.flight_predictions = pred.flight_predictions.reindex(row_index)
        pred.test_data = pred.test_data.reindex(row_index)

        pred.flight_predictions['flight_history_id'] = ind_pred
        pred.flight_predictions['actual_runway_arrival'] = y_pred_runway
        pred.flight_predictions['actual_gate_arrival'] = y_pred_gate

        pred.test_data['flight_history_id'] = ind_pred
        pred.test_data['actual_runway_arrival'] = \
            y_train['actual_runway_arrival_minutes_after_midnight'][testcv].values
        pred.test_data['actual_gate_arrival'] = \
            y_train['actual_gate_arrival_minutes_after_midnight'][testcv].values

        score.append(rmse.calculate_rmse_score(pred.flight_predictions, pred.test_data))

    # NOTE(review): the original indentation was lost; the summary is assumed
    # to be printed once after all folds rather than per fold -- confirm.
    print(score)
    print(np.mean(score))
    print(np.std(score))