def raw_data_to_training_days(raw_data_path, training_days_path, cutoff_times):
    """
    Split the raw data dump into one folder per training day.

    Flight history is processed first because it determines which
    flight_history_ids belong to each day; those ids (and the flight-plan ids
    derived from them) then filter every other table.

    Args:
        raw_data_path: root of the raw data (contains FlightHistory/, ASDI/).
        training_days_path: root under which per-day folders are written.
        cutoff_times: list of tz-aware datetimes, one per training day.
    """
    print(os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"))
    days_flight_ids = flighthistory.process_flight_history_to_train_day_files(
        input_path=os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"),
        output_path=training_days_path,
        output_folder_name="FlightHistory",
        output_file_name="flighthistory.csv",
        cutoff_times=cutoff_times,
        start_hours_offset=-9)

    print("Flight History Events")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "FlightHistory", "flighthistoryevents.csv"),
        training_days_path, "FlightHistory", "flighthistoryevents.csv",
        "flight_history_id", days_flight_ids)

    print("ASDI Flight Plan")
    # Also collect flight-plan ids so the per-flight-plan ASDI tables below
    # can be filtered against them.
    days_flight_plan_ids = utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiflightplan.csv"),
        training_days_path, "ASDI", "asdiflightplan.csv",
        "flighthistoryid", days_flight_ids,
        ids_to_track_column_name="asdiflightplanid")

    print("ASDI Position")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiposition.csv"),
        training_days_path, "ASDI", "asdiposition.csv",
        "flighthistoryid", days_flight_ids)

    print("ASDI Airway")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiairway.csv"),
        training_days_path, "ASDI", "asdiairway.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPFix")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpfix.csv"),
        training_days_path, "ASDI", "asdifpfix.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPCenter")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpcenter.csv"),
        training_days_path, "ASDI", "asdifpcenter.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPSector")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpsector.csv"),
        training_days_path, "ASDI", "asdifpsector.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPWaypoint")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpwaypoint.csv"),
        training_days_path, "ASDI", "asdifpwaypoint.csv",
        "asdiflightplanid", days_flight_plan_ids)

    # Weather is day-bounded rather than id-filtered, so handle it per day.
    for ct in cutoff_times:
        print(ct)  # was a Python-2 print statement; call form works on 2 and 3
        day_output_path = os.path.join(training_days_path,
                                       utilities.get_day_str(ct, -9))
        day_beginning, day_end = utilities.get_day_boundaries(ct, -9)
        if not os.path.exists(day_output_path):
            os.makedirs(day_output_path)
        weather.process_one_day(raw_data_path, day_output_path,
                                day_beginning, day_end, "train")
def raw_data_to_training_days(raw_data_path, training_days_path, cutoff_times):
    """
    Split the raw data dump into one folder per training day.

    Flight history is processed first because it determines which
    flight_history_ids belong to each day; those ids (and the flight-plan ids
    derived from them) then filter every other table.

    Args:
        raw_data_path: root of the raw data (contains FlightHistory/, ASDI/).
        training_days_path: root under which per-day folders are written.
        cutoff_times: list of tz-aware datetimes, one per training day.
    """
    print(os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"))
    days_flight_ids = flighthistory.process_flight_history_to_train_day_files(
        input_path=os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"),
        output_path=training_days_path,
        output_folder_name="FlightHistory",
        output_file_name="flighthistory.csv",
        cutoff_times=cutoff_times,
        start_hours_offset=-9)

    print("Flight History Events")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "FlightHistory", "flighthistoryevents.csv"),
        training_days_path, "FlightHistory", "flighthistoryevents.csv",
        "flight_history_id", days_flight_ids)

    print("ASDI Flight Plan")
    # Also collect flight-plan ids so the per-flight-plan ASDI tables below
    # can be filtered against them.
    days_flight_plan_ids = utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiflightplan.csv"),
        training_days_path, "ASDI", "asdiflightplan.csv",
        "flighthistoryid", days_flight_ids,
        ids_to_track_column_name="asdiflightplanid")

    print("ASDI Position")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiposition.csv"),
        training_days_path, "ASDI", "asdiposition.csv",
        "flighthistoryid", days_flight_ids)

    print("ASDI Airway")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiairway.csv"),
        training_days_path, "ASDI", "asdiairway.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPFix")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpfix.csv"),
        training_days_path, "ASDI", "asdifpfix.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPCenter")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpcenter.csv"),
        training_days_path, "ASDI", "asdifpcenter.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPSector")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpsector.csv"),
        training_days_path, "ASDI", "asdifpsector.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPWaypoint")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpwaypoint.csv"),
        training_days_path, "ASDI", "asdifpwaypoint.csv",
        "asdiflightplanid", days_flight_plan_ids)

    # Weather is day-bounded rather than id-filtered, so handle it per day.
    for ct in cutoff_times:
        print(ct)  # was a Python-2 print statement; call form works on 2 and 3
        # BUG FIX: this copy referenced an undefined name `output_path`;
        # the parameter holding the destination root is `training_days_path`.
        day_output_path = os.path.join(training_days_path,
                                       utilities.get_day_str(ct, -9))
        day_beginning, day_end = utilities.get_day_boundaries(ct, -9)
        if not os.path.exists(day_output_path):
            os.makedirs(day_output_path)
        weather.process_one_day(raw_data_path, day_output_path,
                                day_beginning, day_end, "train")
def training_day_to_test_day(training_day_path, test_day_path, solution_path, cutoff_time):
    """
    Derive a test day from a training day: mask/cut every table at
    cutoff_time, emit the public test_flights.csv and the solution file.
    """
    # Path helpers; each *_out call still invokes get_output_subdirectory
    # once per use, exactly as the original sequence of calls did.
    def fh_in(name):
        return os.path.join(training_day_path, "FlightHistory", name)

    def asdi_in(name):
        return os.path.join(training_day_path, "ASDI", name)

    def fh_out(name):
        return os.path.join(
            utilities.get_output_subdirectory(test_day_path, "FlightHistory"), name)

    def asdi_out(name):
        return os.path.join(
            utilities.get_output_subdirectory(test_day_path, "ASDI"), name)

    flighthistory.write_flight_history_test_day_file(
        fh_in("flighthistory.csv"), fh_out("flighthistory.csv"), cutoff_time)

    flighthistory.write_flight_history_test_day_and_solution_test_flights_only(
        fh_in("flighthistory.csv"),
        os.path.join(test_day_path, "test_flights.csv"),
        os.path.join(solution_path,
                     utilities.get_day_str(cutoff_time) + "_solution.csv"),
        cutoff_time)

    utilities.filter_file_based_on_cutoff_time_streaming(
        fh_in("flighthistoryevents.csv"), fh_out("flighthistoryevents.csv"),
        "date_time_recorded", utilities.parse_datetime_format3, cutoff_time)

    utilities.filter_file_based_on_cutoff_time_streaming(
        asdi_in("asdiposition.csv"), asdi_out("asdiposition.csv"),
        "received", utilities.parse_datetime_format1, cutoff_time)

    # The surviving flight-plan ids gate all remaining ASDI tables.
    flight_plan_ids = utilities.filter_file_based_on_cutoff_time_streaming(
        asdi_in("asdiflightplan.csv"), asdi_out("asdiflightplan.csv"),
        "updatetimeutc", utilities.parse_datetime_format2, cutoff_time,
        ids_to_track_column_name="asdiflightplanid")

    for table in ("asdiairway", "asdifpfix", "asdifpcenter",
                  "asdifpsector", "asdifpwaypoint"):
        filename = table + ".csv"
        utilities.filter_file_based_on_ids_streaming(
            asdi_in(filename), asdi_out(filename),
            "asdiflightplanid", flight_plan_ids)
def get_departure_day_str(row, start_hours_offset):
    """
    Returns the date_str for the specific day that a flighthistory row
    belongs to, based on the departure date for the flight.

    Sample return value: "2012_11_15"; empty string when the departure
    time is missing.
    """
    dep = get_departure_time(row)
    return "" if dep == "MISSING" else utilities.get_day_str(dep, start_hours_offset)
def get_departure_day_str(row, start_hours_offset):
    """
    Returns the date_str for the specific day that a flighthistory row
    belongs to, based on the departure date for the flight.

    Sample return value: "2012_11_15"; empty string when the departure
    time is missing.
    """
    when = get_departure_time(row)
    if when != "MISSING":
        return utilities.get_day_str(when, start_hours_offset)
    return ""
def write_flight_history_test_day_and_solution_test_flights_only(
        input_path, test_output_path, solution_path, cutoff_time):
    """
    Write the public test-flights file and the matching hidden solution file
    for one day.

    Rows are kept only when they qualify for the test set (US flights, not
    diverted/redirected, per flight_history_row_in_test_set).  Solution
    arrival times are converted to minutes after midnight UTC of the cutoff
    day.

    Returns:
        (df_test, df_solution) DataFrames that were written to disk.
    """
    diverted_or_redirected_flight_ids = get_diverted_or_redirected_flights(
        input_path)

    codes_file = os.path.join(os.environ["DataPath"], "GEFlight",
                              "Reference", "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)

    # Solution values are scored as minutes relative to this midnight.
    midnight_time = datetime.datetime(cutoff_time.year, cutoff_time.month,
                                      cutoff_time.day, tzinfo=tz.tzutc())

    df = get_df_flight_history_from_train_format(input_path)
    original_length = len(df)

    # Boolean-mask filtering replaces the deprecated/removed
    # DataFrame.select + DataFrame.irow combination (which assumed a default
    # 0..n-1 index — same assumption the mask makes; confirm upstream keeps
    # a RangeIndex).
    keep = [flight_history_row_in_test_set(
                df.iloc[i], cutoff_time, us_icao_codes,
                diverted_or_redirected_flight_ids)
            for i in range(len(df))]
    df = df[keep]

    df_test = df[[
        "flight_history_id",
        "departure_airport_code",
        "arrival_airport_code",
        "published_departure",
        "published_arrival",
        "scheduled_gate_departure",
        "scheduled_gate_arrival",
        "scheduled_runway_departure",
        "scheduled_runway_arrival"
    ]]
    df_test.to_csv(test_output_path, index=False)

    # .copy() + .loc writes replace chained assignment
    # (df_solution["col"][i] = ...), which may silently modify a copy.
    df_solution = df[[
        "flight_history_id",
        "actual_runway_arrival",
        "actual_gate_arrival"
    ]].copy()
    for i in df_solution.index:
        df_solution.loc[i, "actual_runway_arrival"] = utilities.minutes_difference(
            df_solution.loc[i, "actual_runway_arrival"], midnight_time)
        df_solution.loc[i, "actual_gate_arrival"] = utilities.minutes_difference(
            df_solution.loc[i, "actual_gate_arrival"], midnight_time)
    df_solution.to_csv(solution_path, index=False)

    print("%s, %s: %d rows kept out of %d original lines"
          % (utilities.get_day_str(cutoff_time), "test_flights.csv",
             len(df_test), original_length))
    return df_test, df_solution
def write_flight_history_test_day_file(input_path, output_path, cutoff_time):
    """
    Copy a training day's flighthistory.csv to the test day, replacing the
    value of every maskable date column that falls after cutoff_time with
    the string "HIDDEN".

    "MISSING" sentinel values are left untouched.  Prints a summary of how
    many rows were modified.
    """
    df = get_df_flight_history_from_train_format(input_path)
    cols_to_mask = get_flight_history_date_columns_to_hide()

    rows_modified = 0
    for i in range(len(df)):
        row_modified = False
        for col in cols_to_mask:
            value = df.at[i, col]
            # Short-circuit keeps us from comparing the "MISSING" string
            # against a datetime.
            if value == "MISSING" or value <= cutoff_time:
                continue
            # .at replaces chained assignment (df[col][i] = ...), which may
            # silently write to a temporary copy in modern pandas.
            df.at[i, col] = "HIDDEN"
            row_modified = True
        if row_modified:
            rows_modified += 1

    df.to_csv(output_path, index=False)
    print("%s, %s: %d rows modified out of %d original lines"
          % (utilities.get_day_str(cutoff_time), "flighthistory.csv",
             rows_modified, len(df)))
def write_flight_history_test_day_and_solution_test_flights_only(input_path, test_output_path, solution_path, cutoff_time):
    """
    Write the public test-flights file and the matching hidden solution file
    for one day.

    Only rows qualifying for the test set are kept; solution arrival times
    become minutes after midnight UTC of the cutoff day.

    Returns:
        (df_test, df_solution) DataFrames that were written to disk.
    """
    diverted_or_redirected_flight_ids = get_diverted_or_redirected_flights(input_path)

    codes_file = os.path.join(os.environ["DataPath"], "GEFlight",
                              "Reference", "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)

    # Reference point for the minutes-after-midnight conversion below.
    midnight_time = datetime.datetime(cutoff_time.year, cutoff_time.month,
                                      cutoff_time.day, tzinfo=tz.tzutc())

    df = get_df_flight_history_from_train_format(input_path)
    original_length = len(df)

    # Positional boolean mask instead of the deprecated/removed
    # DataFrame.select + DataFrame.irow pair; equivalent when the frame has
    # a default 0..n-1 index, which the original code already assumed.
    keep = [flight_history_row_in_test_set(
                df.iloc[i], cutoff_time, us_icao_codes,
                diverted_or_redirected_flight_ids)
            for i in range(len(df))]
    df = df[keep]

    df_test = df[["flight_history_id",
                  "departure_airport_code",
                  "arrival_airport_code",
                  "published_departure",
                  "published_arrival",
                  "scheduled_gate_departure",
                  "scheduled_gate_arrival",
                  "scheduled_runway_departure",
                  "scheduled_runway_arrival"]]
    df_test.to_csv(test_output_path, index=False)

    # Explicit .copy() + .loc writes avoid pandas chained assignment, which
    # can silently update a temporary copy instead of df_solution.
    df_solution = df[["flight_history_id",
                      "actual_runway_arrival",
                      "actual_gate_arrival"]].copy()
    for i in df_solution.index:
        df_solution.loc[i, "actual_runway_arrival"] = utilities.minutes_difference(
            df_solution.loc[i, "actual_runway_arrival"], midnight_time)
        df_solution.loc[i, "actual_gate_arrival"] = utilities.minutes_difference(
            df_solution.loc[i, "actual_gate_arrival"], midnight_time)
    df_solution.to_csv(solution_path, index=False)

    print("%s, %s: %d rows kept out of %d original lines"
          % (utilities.get_day_str(cutoff_time), "test_flights.csv",
             len(df_test), original_length))
    return df_test, df_solution
def test_get_day_str(self):
    """get_day_str buckets a UTC cutoff into a YYYY_MM_DD day string."""
    eight_am_utc = datetime(2012, 10, 25, 8, 00, tzinfo=tzutc())
    # With the default start-hours offset, 08:00 falls into the previous day.
    self.assertEqual("2012_10_24", utilities.get_day_str(eight_am_utc))
    eight_am_utc = datetime(2012, 10, 25, 8, 00, tzinfo=tzutc())
    # A zero offset keeps the timestamp in its calendar day.
    self.assertEqual("2012_10_25", utilities.get_day_str(eight_am_utc, 0))
def process_flight_history_to_train_day_files(
        input_path, output_path, output_folder_name, output_file_name,
        cutoff_times, start_hours_offset=-9):
    """
    Stream the raw flighthistory.csv into one per-day CSV per cutoff time.

    Rows are bucketed by their departure day (shifted by start_hours_offset
    hours) and buffered in memory, flushing to the per-day writers every
    100k input rows.  Only flights into or out of the US are kept, and the
    columns listed by get_flight_history_columns_to_delete() are dropped.

    Returns:
        dict mapping each cutoff_time to the set of flight_history_ids
        written for that day.
    """
    chunk_no = 0        # number of 100k-row chunks processed (progress log)
    kept_in_chunk = 0   # relevant rows seen in the current chunk
    departure_date_columns = get_flight_history_departure_date_columns()
    arrival_date_columns = get_flight_history_arrival_date_columns()

    day_flight_history_ids = {cutoff_time: set() for cutoff_time in cutoff_times}
    day_str_to_cutoff_time = {}
    file_handles = {}
    writers = {}

    codes_file = os.path.join(os.environ["DataPath"], "GEFlight",
                              "Reference", "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)

    # `with` guarantees the input file is closed (the original leaked it).
    with open(input_path) as input_file:
        reader = utilities.HeaderCsvReader(input_file)
        header_out = reader.get_header()
        for col in get_flight_history_columns_to_delete():
            header_out.remove(col)

        # One open writer per day, header written up front.
        for cutoff_time in cutoff_times:
            day_output_path = utilities.get_full_output_path(
                output_path, output_folder_name, cutoff_time)
            file_output_path = os.path.join(day_output_path, output_file_name)
            file_handles[cutoff_time] = open(file_output_path, "w")
            writers[cutoff_time] = csv.writer(file_handles[cutoff_time],
                                              dialect=utilities.CsvDialect())
            writers[cutoff_time].writerow(header_out)
            day_str_to_cutoff_time[utilities.get_day_str(cutoff_time)] = cutoff_time

        i_row_mod = 0
        buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}
        # NOTE: the original also computed get_day_boundaries() here from a
        # stale loop variable and never used the result; dropped as dead code.

        for row in reader:
            i_row_mod += 1
            if not is_flight_in_or_out_of_us(row, us_icao_codes):
                continue
            parse_flight_history_dates(row, departure_date_columns,
                                       arrival_date_columns)
            row_day_str = get_departure_day_str(row, start_hours_offset)
            if row_day_str not in day_str_to_cutoff_time:
                continue
            cutoff_time = day_str_to_cutoff_time[row_day_str]
            kept_in_chunk += 1
            buffer_dict[cutoff_time].append([row[col] for col in header_out])
            day_flight_history_ids[cutoff_time].add(row["flight_history_id"])
            if i_row_mod < 100000:
                continue
            # Flush buffered rows to disk once per 100k input rows.
            chunk_no += 1
            print("%s: %d00k records processed, %d with relevant flights in this chunk"
                  % (output_file_name, chunk_no, kept_in_chunk))
            kept_in_chunk = 0
            for cutoff_time in cutoff_times:
                writers[cutoff_time].writerows(buffer_dict[cutoff_time])
                file_handles[cutoff_time].flush()
            i_row_mod = 0
            buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

        # Final partial chunk, then close every output file.
        for cutoff_time in cutoff_times:
            writers[cutoff_time].writerows(buffer_dict[cutoff_time])
            file_handles[cutoff_time].close()

    return day_flight_history_ids
def training_day_to_test_day(training_day_path, test_day_path, solution_path, cutoff_time):
    """
    Derive a test day from a training day: mask/cut each table at
    cutoff_time, emit test_flights.csv plus the solution file, and run the
    weather step for the truncated day.
    """
    # Path helpers; each *_out call still invokes get_output_subdirectory
    # once per use, exactly as the original sequence of calls did.
    def fh_in(name):
        return os.path.join(training_day_path, "FlightHistory", name)

    def asdi_in(name):
        return os.path.join(training_day_path, "ASDI", name)

    def fh_out(name):
        return os.path.join(
            utilities.get_output_subdirectory(test_day_path, "FlightHistory"), name)

    def asdi_out(name):
        return os.path.join(
            utilities.get_output_subdirectory(test_day_path, "ASDI"), name)

    flighthistory.write_flight_history_test_day_file(
        fh_in("flighthistory.csv"), fh_out("flighthistory.csv"), cutoff_time)

    flighthistory.write_flight_history_test_day_and_solution_test_flights_only(
        fh_in("flighthistory.csv"),
        os.path.join(test_day_path, "test_flights.csv"),
        os.path.join(solution_path,
                     utilities.get_day_str(cutoff_time) + "_solution.csv"),
        cutoff_time)

    utilities.filter_file_based_on_cutoff_time_streaming(
        fh_in("flighthistoryevents.csv"), fh_out("flighthistoryevents.csv"),
        "date_time_recorded", utilities.parse_datetime_format3, cutoff_time)

    utilities.filter_file_based_on_cutoff_time_streaming(
        asdi_in("asdiposition.csv"), asdi_out("asdiposition.csv"),
        "received", utilities.parse_datetime_format1, cutoff_time)

    # The surviving flight-plan ids gate all remaining ASDI tables.
    flight_plan_ids = utilities.filter_file_based_on_cutoff_time_streaming(
        asdi_in("asdiflightplan.csv"), asdi_out("asdiflightplan.csv"),
        "updatetimeutc", utilities.parse_datetime_format2, cutoff_time,
        ids_to_track_column_name="asdiflightplanid")

    for table in ("asdiairway", "asdifpfix", "asdifpcenter",
                  "asdifpsector", "asdifpwaypoint"):
        filename = table + ".csv"
        utilities.filter_file_based_on_ids_streaming(
            asdi_in(filename), asdi_out(filename),
            "asdiflightplanid", flight_plan_ids)

    # Weather only covers the start of the day up to the cutoff.
    day_beginning, _day_end = utilities.get_day_boundaries(cutoff_time)
    weather.process_one_day(training_day_path, test_day_path, day_beginning,
                            cutoff_time, "test", cutoff_time=cutoff_time)
def test_get_day_str(self):
    """get_day_str buckets a UTC cutoff into a YYYY_MM_DD day string."""
    # Default start-hours offset shifts 08:00 UTC back into the prior day.
    morning = datetime(2012, 10, 25, 8, 00, tzinfo=tzutc())
    self.assertEqual("2012_10_24", utilities.get_day_str(morning))
    # An explicit zero offset leaves the calendar day unchanged.
    morning = datetime(2012, 10, 25, 8, 00, tzinfo=tzutc())
    self.assertEqual("2012_10_25", utilities.get_day_str(morning, 0))
def process_flight_history_to_train_day_files(input_path, output_path, output_folder_name, output_file_name, cutoff_times, start_hours_offset=-9):
    """
    Stream the raw flighthistory.csv into one per-day CSV per cutoff time.

    Rows are bucketed by their (offset-shifted) departure day and buffered
    in memory, flushing to the per-day writers every 100k input rows.  Only
    US-touching flights survive, and the columns named by
    get_flight_history_columns_to_delete() are dropped from the output.

    Returns:
        dict mapping each cutoff_time to the set of flight_history_ids
        written for that day.
    """
    chunk_no = 0        # completed 100k-row chunks, for the progress log
    kept_in_chunk = 0   # relevant rows seen in the current chunk
    departure_date_columns = get_flight_history_departure_date_columns()
    arrival_date_columns = get_flight_history_arrival_date_columns()

    day_flight_history_ids = {cutoff_time: set() for cutoff_time in cutoff_times}
    day_str_to_cutoff_time = {}
    file_handles = {}
    writers = {}

    codes_file = os.path.join(os.environ["DataPath"], "GEFlight",
                              "Reference", "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)

    # `with` guarantees the input file is closed (the original leaked it).
    with open(input_path) as input_file:
        reader = utilities.HeaderCsvReader(input_file)
        header_out = reader.get_header()
        for col in get_flight_history_columns_to_delete():
            header_out.remove(col)

        # One open writer per day, header written up front.
        for cutoff_time in cutoff_times:
            day_output_path = utilities.get_full_output_path(
                output_path, output_folder_name, cutoff_time)
            file_output_path = os.path.join(day_output_path, output_file_name)
            file_handles[cutoff_time] = open(file_output_path, "w")
            writers[cutoff_time] = csv.writer(file_handles[cutoff_time],
                                              dialect=utilities.CsvDialect())
            writers[cutoff_time].writerow(header_out)
            day_str_to_cutoff_time[utilities.get_day_str(cutoff_time)] = cutoff_time

        i_row_mod = 0
        buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}
        # NOTE: removed dead code that called get_day_boundaries() on a
        # stale loop variable and discarded the result (it would also raise
        # NameError when cutoff_times was empty).

        for row in reader:
            i_row_mod += 1
            if not is_flight_in_or_out_of_us(row, us_icao_codes):
                continue
            parse_flight_history_dates(row, departure_date_columns,
                                       arrival_date_columns)
            row_day_str = get_departure_day_str(row, start_hours_offset)
            if row_day_str not in day_str_to_cutoff_time:
                continue
            cutoff_time = day_str_to_cutoff_time[row_day_str]
            kept_in_chunk += 1
            buffer_dict[cutoff_time].append([row[col] for col in header_out])
            day_flight_history_ids[cutoff_time].add(row["flight_history_id"])
            if i_row_mod < 100000:
                continue
            # Flush buffered rows to disk once per 100k input rows.
            chunk_no += 1
            print("%s: %d00k records processed, %d with relevant flights in this chunk"
                  % (output_file_name, chunk_no, kept_in_chunk))
            kept_in_chunk = 0
            for cutoff_time in cutoff_times:
                writers[cutoff_time].writerows(buffer_dict[cutoff_time])
                file_handles[cutoff_time].flush()
            i_row_mod = 0
            buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

        # Final partial chunk, then close every output file.
        for cutoff_time in cutoff_times:
            writers[cutoff_time].writerows(buffer_dict[cutoff_time])
            file_handles[cutoff_time].close()

    return day_flight_history_ids
import os
import pandas
from datetime import datetime, timedelta
from dateutil import parser, tz
from geflight.transform import utilities
import pytz
import random
import weather

# Source of the raw public-leaderboard dump and destination for the
# per-training-day output folders.
raw_data_path = os.path.join(os.environ["DataPath"], "GEFlight",
                             "RawPublicLeaderboard")
output_path = os.path.join(os.environ["DataPath"], "GEFlight", "Release 2",
                           "PublicLeaderboardTrainDays")

# One cutoff per day for 14 consecutive days starting 2012-11-26 20:00 UTC.
start_day = datetime(2012, 11, 26, 20, 00, tzinfo=tz.tzutc())
cutoff_times = [start_day + timedelta(i, 0) for i in range(14)]

for ct in cutoff_times:
    print(ct)  # was a Python-2 print statement; call form works on 2 and 3
    day_output_path = os.path.join(output_path, utilities.get_day_str(ct, -9))
    day_beginning, day_end = utilities.get_day_boundaries(ct, -9)
    if not os.path.exists(day_output_path):
        os.makedirs(day_output_path)
    weather.process_one_day(raw_data_path, day_output_path,
                            day_beginning, day_end, "train")