def test_get_full_output_path(self):
    """get_full_output_path builds <base>/<YYYY_MM_DD>/<folder> and creates it on disk.

    Verifies both the returned path string and the side effect (the directory
    chain exists), then removes everything it created.
    """
    # Local import: no top-of-file import block is visible in this chunk.
    import tempfile

    # os.tempnam() was removed in Python 3 (and was race-prone: the name could
    # be claimed between tempnam() and mkdir()).  tempfile.mkdtemp() creates
    # the directory atomically, replacing the tempnam()+mkdir() pair.
    original_path = tempfile.mkdtemp()
    cutoff_time = datetime(2012, 10, 25, 8, 0, tzinfo=tzutc())
    output_path = utilities.get_full_output_path(
        original_path, "FlightHistory", cutoff_time, 0)
    self.assertEqual(
        os.path.join(original_path, "2012_10_25", "FlightHistory"),
        output_path)
    self.assertTrue(os.path.exists(output_path))
    # Clean up leaf-first so each rmdir sees an empty directory.
    os.rmdir(output_path)
    os.rmdir(os.path.join(original_path, "2012_10_25"))
    os.rmdir(original_path)
def test_get_full_output_path(self):
    """Check get_full_output_path returns <base>/<YYYY_MM_DD>/<folder> and creates it.

    Asserts the computed path, asserts the directory was materialized, and
    tears down every directory this test created.
    """
    # Local import: the file's import block is outside this chunk.
    import tempfile

    # Replacement for the removed-in-Python-3 os.tempnam() + os.mkdir() pair;
    # mkdtemp() also closes the TOCTOU window tempnam() had.
    original_path = tempfile.mkdtemp()
    cutoff_time = datetime(2012, 10, 25, 8, 0, tzinfo=tzutc())
    output_path = utilities.get_full_output_path(
        original_path, "FlightHistory", cutoff_time, 0)
    expected = os.path.join(original_path, "2012_10_25", "FlightHistory")
    self.assertEqual(expected, output_path)
    self.assertTrue(os.path.exists(output_path))
    # Remove deepest directory first; rmdir requires each to be empty.
    os.rmdir(output_path)
    os.rmdir(os.path.join(original_path, "2012_10_25"))
    os.rmdir(original_path)
def process_flight_history_to_train_day_files(input_path,
                                              output_path,
                                              output_folder_name,
                                              output_file_name,
                                              cutoff_times,
                                              start_hours_offset=-9):
    """Split a flight-history CSV into one training file per cutoff day.

    Reads `input_path`, keeps only flights into/out of the US, assigns each
    row to a cutoff day via its (offset) departure day string, and appends
    the row to that day's output CSV under
    `<output_path>/<day>/<output_folder_name>/<output_file_name>`.
    Rows are buffered and written in chunks of 100k input rows.

    Args:
        input_path: path of the source flight_history CSV.
        output_path: base directory for per-day output folders.
        output_folder_name: folder name inside each day directory.
        output_file_name: file name for each day's output CSV.
        cutoff_times: iterable of datetimes, one per day to extract.
        start_hours_offset: hours added when mapping a departure to a day
            (default -9; presumably a timezone-style shift — confirm with
            get_departure_day_str).

    Returns:
        dict mapping each cutoff_time to the set of flight_history_id values
        written for that day.
    """
    # Fixes vs. the previous version:
    #  - removed `file_started_for_day`, which was built but never read;
    #  - removed the dead `start_time, end_time = get_day_boundaries(...)`
    #    call, which ran on the stale last loop `cutoff_time` and whose
    #    results were never used;
    #  - the input file is now closed via a `with` block.
    chunk_count = 0          # number of 100k-row chunks processed (for logging)
    relevant_count = 0       # matching rows seen in the current chunk

    departure_date_columns = get_flight_history_departure_date_columns()
    arrival_date_columns = get_flight_history_arrival_date_columns()

    # Reference data: ICAO codes of US airports, used to filter rows.
    codes_file = os.path.join(os.environ["DataPath"], "GEFlight",
                              "Reference", "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)

    day_flight_history_ids = {cutoff_time: set() for cutoff_time in cutoff_times}
    day_str_to_cutoff_time = {}
    file_handles = {}
    writers = {}

    with open(input_path) as input_file:
        reader = utilities.HeaderCsvReader(input_file)
        header_out = reader.get_header()
        for col in get_flight_history_columns_to_delete():
            header_out.remove(col)

        # One open writer per cutoff day; handles stay open across chunks and
        # are closed after the final flush below.
        for cutoff_time in cutoff_times:
            day_output_path = utilities.get_full_output_path(
                output_path, output_folder_name, cutoff_time)
            file_output_path = os.path.join(day_output_path, output_file_name)
            file_handles[cutoff_time] = open(file_output_path, "w")
            writers[cutoff_time] = csv.writer(file_handles[cutoff_time],
                                              dialect=utilities.CsvDialect())
            writers[cutoff_time].writerow(header_out)
            day_str_to_cutoff_time[utilities.get_day_str(cutoff_time)] = cutoff_time

        rows_in_chunk = 0
        buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

        for row in reader:
            rows_in_chunk += 1
            if not is_flight_in_or_out_of_us(row, us_icao_codes):
                continue
            parse_flight_history_dates(row, departure_date_columns,
                                       arrival_date_columns)
            row_day_str = get_departure_day_str(row, start_hours_offset)
            if row_day_str not in day_str_to_cutoff_time:
                continue  # departure day is not one of the requested cutoffs
            cutoff_time = day_str_to_cutoff_time[row_day_str]
            relevant_count += 1
            buffer_dict[cutoff_time].append([row[col] for col in header_out])
            day_flight_history_ids[cutoff_time].add(row["flight_history_id"])
            if rows_in_chunk < 100000:
                continue
            # Chunk boundary: report progress, flush buffers, reset counters.
            chunk_count += 1
            print("%s: %d00k records processed, %d with relevant flights in this chunk"
                  % (output_file_name, chunk_count, relevant_count))
            relevant_count = 0
            for cutoff_time in cutoff_times:
                writers[cutoff_time].writerows(buffer_dict[cutoff_time])
                file_handles[cutoff_time].flush()
            rows_in_chunk = 0
            buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

    # Final (partial) chunk: write remaining buffered rows and close outputs.
    for cutoff_time in cutoff_times:
        writers[cutoff_time].writerows(buffer_dict[cutoff_time])
        file_handles[cutoff_time].close()

    return day_flight_history_ids
def process_flight_history_to_train_day_files(input_path,
                                              output_path,
                                              output_folder_name,
                                              output_file_name,
                                              cutoff_times,
                                              start_hours_offset=-9):
    """Partition a flight-history CSV into per-cutoff-day training files.

    Streams `input_path`, filters to flights into/out of the US, maps each
    row to a cutoff day by its offset departure day, and writes it to
    `<output_path>/<day>/<output_folder_name>/<output_file_name>`.  Writes
    happen in buffered chunks of 100k input rows.

    Args:
        input_path: source flight_history CSV path.
        output_path: base directory for the per-day outputs.
        output_folder_name: subfolder name within each day directory.
        output_file_name: output CSV file name for every day.
        cutoff_times: iterable of datetimes, one per extracted day.
        start_hours_offset: hours applied when bucketing a departure into a
            day (default -9; presumably a timezone-style shift — confirm
            with get_departure_day_str).

    Returns:
        dict of cutoff_time -> set of flight_history_id values written.
    """
    # Cleanups vs. the previous version: dropped the never-read
    # `file_started_for_day` dict, dropped the dead get_day_boundaries()
    # call (it used the stale last loop `cutoff_time` and its results were
    # unused), and the input file is closed via `with`.
    progress_chunks = 0      # completed 100k-row chunks, for the log line
    matched_in_chunk = 0     # relevant rows in the current chunk

    departure_date_columns = get_flight_history_departure_date_columns()
    arrival_date_columns = get_flight_history_arrival_date_columns()

    # US-airport ICAO codes drive the in/out-of-US row filter.
    codes_file = os.path.join(os.environ["DataPath"], "GEFlight",
                              "Reference", "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)

    day_flight_history_ids = {cutoff_time: set() for cutoff_time in cutoff_times}
    day_str_to_cutoff_time = {}
    file_handles = {}
    writers = {}

    with open(input_path) as input_file:
        reader = utilities.HeaderCsvReader(input_file)
        header_out = reader.get_header()
        for col in get_flight_history_columns_to_delete():
            header_out.remove(col)

        # Open one writer per cutoff day; these must stay open across all
        # chunks and are closed after the final flush at the bottom.
        for cutoff_time in cutoff_times:
            day_output_path = utilities.get_full_output_path(
                output_path, output_folder_name, cutoff_time)
            file_output_path = os.path.join(day_output_path, output_file_name)
            file_handles[cutoff_time] = open(file_output_path, "w")
            writers[cutoff_time] = csv.writer(file_handles[cutoff_time],
                                              dialect=utilities.CsvDialect())
            writers[cutoff_time].writerow(header_out)
            day_str_to_cutoff_time[utilities.get_day_str(cutoff_time)] = cutoff_time

        rows_this_chunk = 0
        buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

        for row in reader:
            rows_this_chunk += 1
            if not is_flight_in_or_out_of_us(row, us_icao_codes):
                continue
            parse_flight_history_dates(row, departure_date_columns,
                                       arrival_date_columns)
            row_day_str = get_departure_day_str(row, start_hours_offset)
            if row_day_str not in day_str_to_cutoff_time:
                continue  # not one of the requested cutoff days
            cutoff_time = day_str_to_cutoff_time[row_day_str]
            matched_in_chunk += 1
            buffer_dict[cutoff_time].append([row[col] for col in header_out])
            day_flight_history_ids[cutoff_time].add(row["flight_history_id"])
            if rows_this_chunk < 100000:
                continue
            # End of a 100k-row chunk: log, flush every buffer, reset.
            progress_chunks += 1
            print("%s: %d00k records processed, %d with relevant flights in this chunk"
                  % (output_file_name, progress_chunks, matched_in_chunk))
            matched_in_chunk = 0
            for cutoff_time in cutoff_times:
                writers[cutoff_time].writerows(buffer_dict[cutoff_time])
                file_handles[cutoff_time].flush()
            rows_this_chunk = 0
            buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

    # Drain the final partial chunk and close all per-day outputs.
    for cutoff_time in cutoff_times:
        writers[cutoff_time].writerows(buffer_dict[cutoff_time])
        file_handles[cutoff_time].close()

    return day_flight_history_ids