def process(self, element: typing.Dict, *args, **kwargs) -> typing.Iterator[typing.Dict]:
    """
    Normalize the date and time fields of a flight record and add the day of the week.

    This is a generator: it yields a single reformatted record, or nothing at all when
    the record cannot be parsed (the problem is logged and the record is omitted).
    The input dict is left untouched; a new dict is yielded instead.

    :param element: flight record dict holding 'flight_date', 'departure_time' and
        'arrival_time' string values
    :param kwargs: optional 'date_format' (default '%Y-%m-%d') and 'time_format'
        (default '%H%M') strptime patterns describing the incoming values
    :return: yields the record with 'flight_date' as YYYY-MM-DD, both times as HH:MM:SS
        and 'day_of_week' as an int (Monday=1 ... Sunday=7)
    """
    # input formats default to YYYY-MM-DD / HHMM unless overridden by the caller
    date_fmt = kwargs.get('date_format', '%Y-%m-%d')
    time_fmt = kwargs.get('time_format', '%H%M')
    try:
        flight_date = datetime.strptime(element['flight_date'], date_fmt).date()
        departure = datetime.strptime(element['departure_time'], time_fmt)
        arrival = datetime.strptime(element['arrival_time'], time_fmt)
        # build a fresh dict rather than editing the input in place: a record that
        # fails half way through must not be left partially rewritten
        yield {
            **element,
            'flight_date': flight_date.strftime('%Y-%m-%d'),  # bigquery friendly formatted date
            'departure_time': departure.strftime('%H:%M:%S'),  # bigquery friendly formatted time
            'arrival_time': arrival.strftime('%H:%M:%S'),  # bigquery friendly formatted time
            # weekday() is 0-based from Monday; add 1 so Monday is 1 and Sunday is 7
            'day_of_week': flight_date.weekday() + 1,
        }
    except TypeError as err:
        logger.warning("input flight record is not a proper dict. omitting output")
        logger.debug(str(err))
    except (KeyError, ValueError) as err:
        logger.debug("input flight record is missing critical fields. omitting output")
        logger.debug(str(err))
def transform(self):
    """
    Placeholder for the master aircraft file transformation (not implemented yet).

    Steps still to be written:
      - rename columns using rename_columns()
      - set street2 data type to str
      - set n_number as dataframe index

    :raises NotImplementedError: always, right after the debug message is logged
    """
    message = "transforming master aircraft file"
    logger.debug(message)
    # TODO :: implement the steps listed in the docstring above
    raise NotImplementedError('YOU FORGOT TO WRITE THIS!')
def process(self, element, routes, *args, **kwargs):
    """
    Check the flight route (airline, src, dest) against the routes lookup table.

    When the route is unknown the record is emitted on the 'rejects' tagged output and
    the route itself on the 'missing_routes' tagged output as a (route, 1) pair.
    The record is yielded on the main output in every case, known route or not.
    Records that lack a route field or are of an invalid type are logged and dropped.

    :param element: input flight record as dict
    :param routes: routes lookup table with ('airline', 'src', 'dest') tuple as keys
    :return: yields the tagged outputs (if any) followed by the flight dict
    """
    try:
        airline, src, dest = element['airline'], element['src'], element['dest']
        route = (airline, src, dest)
        known_route = route in routes
        if not known_route:
            # unknown route: reject the record, then report the route as missing
            tagged_payloads = (('rejects', element), ('missing_routes', (route, 1)))
            for tag, payload in tagged_payloads:
                yield beam.pvalue.TaggedOutput(tag, payload)
        # the record is passed through regardless; move this under an else branch
        # if records with a missing route should NOT reach the main output
        yield element
    except (KeyError, ValueError, TypeError) as err:
        logger.debug("input flight record is missing critical fields or of invalid type.")
        logger.debug(str(err))
def api_get_flights(airline: str, flight_date: datetime, api_url=api_url, api_token=api_token, tries=0,
                    timeout=api_timeout) -> typing.List[typing.Dict]:
    """
    Call REST API to get flight records for a given date

    On a connection timeout the call is retried with a doubled timeout (sleeping for the
    new timeout first) until api_max_retries is reached, at which point the process exits.
    The process also exits when no connection can be established at all. Any other
    request failure is logged and an empty list is returned.

    :param airline: airline IATA code
    :param flight_date: flight date
    :param api_url: API base URL such as http://localhost:5000/
    :param api_token: OAuth Bearer token
    :param tries: number of attempts already made; callers normally leave this at 0
    :param timeout: request timeout passed to requests.get, doubled on every retry
    :return: list of flight records as a dict
    """
    headers = {'Authorization': f'Bearer {api_token}', 'Accept': 'application/json'}
    params = {'date': flight_date.strftime('%Y-%m-%d'), 'airline': airline}
    # join the URL with plain string handling: os.path.join is meant for filesystem paths
    # and would insert a backslash separator on Windows
    url = f"{api_url.rstrip('/')}/flights/flights"
    try:
        r = requests.get(url, params=params, headers=headers, timeout=timeout)
        # raise exception if response returned an unsuccessful status code
        r.raise_for_status()
        # return flight records
        data = r.json()
        logger.debug(f"Flight API returned {data['records']} flights in {r.elapsed.total_seconds():.3}s")
        return copy(data['flights'])
    except requests.exceptions.ConnectTimeout:
        # REST API connection timeout is reached. Most likely we're sending too many requests and need to ease off
        # let's exponentially increase the timeout and retry until the max num of retries is reached
        logger.warning(f"REST API call to get flights timed out. (timeout={timeout})")
        tries, timeout = tries + 1, timeout * 2
        if tries < api_max_retries:
            # sleep the current thread to back off from submitting too many requests
            logger.debug(f"thread sleep to back off from submitting too many REST API requests. sleep: {timeout}s")
            sleep(timeout)
            logger.warning(f"Increasing timeout to {timeout} and retrying. Retry number={tries}")
            return api_get_flights(airline, flight_date, api_url, api_token, tries, timeout)
        logger.fatal("Max number of API retries is reached. Quiting.")
        sys.exit(1)
    except requests.ConnectionError:
        # cannot establish connection with the REST API. Most likely the URL is incorrect or
        # the API Flask server is not running
        logger.fatal(f"Could not establish connection to the REST API to get flights (url={url})")
        sys.exit(1)
    except requests.exceptions.HTTPError as err:
        logger.error("Flights API has a critical error: {}".format(str(err)))
        logger.error("No records returned by flights api")
        return []
    except requests.exceptions.RequestException as err:
        logger.error("Unknown error while connecting to REST API to get flights: {}".format(str(err)))
        logger.error("No records returned by flights api")
        return []
def transform(self):
    """
    Prepare the master aircraft dataframe held in self.df.

    Renames the columns through rename_columns(), casts 'street2' to str and indexes
    the frame by 'n_number' while keeping 'n_number' as a regular column too.
    The frame is modified in place; nothing is returned.
    """
    logger.debug("transforming master aircraft file")
    # grab the frame before renaming, then work on that same object throughout
    frame = self.df
    self.rename_columns()
    # fix data types
    street2_as_text = frame['street2'].astype(str)
    frame['street2'] = street2_as_text
    # drop=False keeps n_number available as a column as well as the index
    frame.set_index(keys='n_number', drop=False, inplace=True)
def transform(self):
    """
    Prepare the master aircraft dataframe held in self.df.

    Renames the columns through rename_columns(), casts 'street2' to str, overwrites
    'registrant_name', 'street' and 'street2' with the values produced row by row by
    self.erase_personal_info, and indexes the frame by 'n_number' (kept as a column).
    The frame is modified in place; nothing is returned.
    """
    logger.debug("transforming master aircraft file")
    # grab the frame before renaming, then work on that same object throughout
    frame = self.df
    self.rename_columns()
    # fix data types
    frame['street2'] = frame['street2'].astype(str)
    # remove personal info: every row yields a 3-item result which is transposed
    # into three column-wise tuples
    scrubbed_rows = frame.apply(self.erase_personal_info, axis=1)
    names, streets, second_streets = zip(*scrubbed_rows)
    frame['registrant_name'] = names
    frame['street'] = streets
    frame['street2'] = second_streets
    # drop=False keeps n_number available as a column as well as the index
    frame.set_index(keys='n_number', drop=False, inplace=True)
def process(self, element, *args, **kwargs):
    """
    Parse a CSV line and map its values onto the column names in self.cols.

    Rows whose number of values matches len(self.cols) are yielded as dicts and counted
    in self.row_count; any other row is counted in self.bad_rows and logged.

    :param element: csv line
    :return: yields dict {column_name: csv_value}
    """
    # csv.reader parses quoted values correctly (a comma inside quotes is not a separator)
    reader = csv.reader(StringIO(element), delimiter=',')
    for row in reader:
        if len(row) == len(self.cols):
            d = dict(zip(self.cols, row))
            self.row_count += 1
            yield d
        else:
            # increment the running counter (a plain assignment of +1 would reset it to 1)
            self.bad_rows += 1
            logger.debug(f"bad row: {row}")
def api_get_airlines(api_url=api_url, api_token=api_token, tries=0, timeout=api_timeout) -> typing.List[str]:
    """
    Call the REST API to get the list of available airlines

    The result is cached in the module-level AIRLINES variable; once it holds a non-empty
    value the API is not called again. On a connection timeout the call is retried with a
    doubled timeout (sleeping for the new timeout first) until api_max_retries is reached,
    at which point the process exits. The process also exits when no connection can be
    established at all. Any other request failure is logged and the (still empty) AIRLINES
    value is returned.

    :param api_url: API base URL
    :param api_token: token sent as the Bearer value of the Authorization header
    :param tries: number of attempts already made; callers normally leave this at 0
    :param timeout: request timeout passed to requests.get, doubled on every retry
    :return: the airlines list returned by the API under the 'airlines' key
    """
    global AIRLINES
    if AIRLINES:
        # already fetched earlier
        return AIRLINES

    headers = {'Authorization': f'Bearer {api_token}', 'Accept': 'application/json'}
    # join the URL with plain string handling: os.path.join is meant for filesystem paths
    # and would insert a backslash separator on Windows
    url = f"{api_url.rstrip('/')}/flights/airlines"
    logger.debug(f"Calling airlines REST API (url={url})")
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        # raise exception if response returned an unsuccessful status code
        r.raise_for_status()
        AIRLINES = r.json()['airlines']
        logger.info(f"flight airlines api call success in {r.elapsed.total_seconds():.3}s. Airlines: {AIRLINES}")
        return AIRLINES
    except requests.exceptions.ConnectTimeout:
        # REST API connection timeout is reached. Most likely we're sending too many requests and need to ease off
        # let's exponentially increase the timeout and retry until the max num of retries is reached
        logger.warning(f"REST API call to get airlines timed out. (timeout={timeout})")
        tries, timeout = tries + 1, timeout * 2
        if tries < api_max_retries:
            # sleep the current thread to back off from submitting too many requests
            logger.debug(f"thread sleep to back off from submitting too many REST API requests. sleep: {timeout}s")
            sleep(timeout)
            logger.warning(f"Increasing timeout to {timeout} and retrying. Retry number={tries}")
            return api_get_airlines(api_url, api_token, tries, timeout)
        logger.fatal("Max number of API retries is reached. Quiting.")
        sys.exit(1)
    except requests.ConnectionError:
        # cannot establish connection with the REST API. Most likely the URL is incorrect or
        # the API Flask server is not running
        logger.fatal(f"Could not establish connection to the REST API to get airlines (url={url})")
        sys.exit(1)
    except requests.exceptions.RequestException as err:
        logger.error("Unknown error while connecting to REST API to get airlines: {}".format(str(err)))
        return AIRLINES
def rename_columns(self):
    """
    Placeholder for renaming the aircraft type file columns (not implemented yet).

    Steps still to be written:
      - rename columns based on the mapping defined below
      - lowercase all column names and replace '-' with '_' (making them bigquery friendly)
      - use EngineFileProcessor as example

    :raises NotImplementedError: always, right after the debug message is logged
    """
    # source file column name -> new column name (not used until the TODO is done)
    mapper = {
        'CODE': 'MFR CODE',
        'MFR': 'MFR NAME',
        'TYPE-ACFT': 'AIRCRAFT TYPE',
        'NO-ENG': 'NUM ENGINES',
        'NO-SEATS': 'NUM SEATS',
        'AC-WEIGHT': 'WEIGHT CLASS',
    }
    logger.debug("renaming aircraft type columns")
    # TODO :: implement the steps listed in the docstring above
    raise NotImplementedError('YOU FORGOT TO WRITE THIS!')
def gbq_create(self, table_name):
    """
    (Re)create the BigQuery table that receives the aircraft records.

    Any existing table with this name is deleted first, then a new empty table is
    created with the schema below. Every column is NULLABLE.

    :param table_name: name of the table to drop and create
    :return: None
    """
    # (column name, bigquery type) in table column order
    columns = (
        ('n_number', 'STRING'),
        ('serial_number', 'STRING'),
        ('mfr_mdl_code', 'STRING'),
        ('eng_mfr_mdl', 'STRING'),
        ('mfr_year', 'INTEGER'),
        ('registrant_type', 'STRING'),
        ('registrant_name', 'STRING'),
        ('street', 'STRING'),
        ('street2', 'STRING'),
        ('city', 'STRING'),
        ('state', 'STRING'),
        ('zip_code', 'STRING'),
        ('region', 'STRING'),
        ('country', 'STRING'),
        ('last_action_date', 'DATE'),
        ('issue_date', 'DATE'),
        ('status', 'STRING'),
        ('air_ready_date', 'DATE'),
        ('expiration_date', 'DATE'),
        ('mfr_name', 'STRING'),
        ('mfr_short_name', 'STRING'),
        ('model', 'STRING'),
        ('aircraft_type', 'STRING'),
        ('num_engines', 'INTEGER'),
        ('num_seats', 'INTEGER'),
        ('weight_class', 'STRING'),
        ('speed', 'INTEGER'),
        ('eng_mfr_name', 'STRING'),
        ('eng_model', 'STRING'),
        ('eng_type', 'STRING'),
        ('horsepower', 'FLOAT'),
        ('thrust', 'FLOAT'),
    )
    schema = [
        bigquery.SchemaField(column_name, column_type, mode='NULLABLE')
        for column_name, column_type in columns
    ]

    # create a bigquery client
    client = bigquery.Client()

    # delete table if it exists (not_found_ok avoids an error when it does not)
    logger.debug(f"dropping {table_name} table if it exists")
    client.delete_table(table_name, not_found_ok=True)

    # create a new table
    client.create_table(bigquery.Table(table_name, schema=schema))
    logger.info(f"bigquery table created: {table_name}")
def rename_columns(self):
    """
    Rename the aircraft engine file columns of self.df in place.

    First applies the explicit source-to-target mapping below (source columns that are
    absent are ignored), then normalizes every column name: surrounding whitespace is
    stripped, the name is lowercased and spaces and dashes become underscores.
    """
    # source file column name: new column name
    source_to_target = {
        'CODE': 'ENG CODE',
        'MFR': 'ENG MFR NAME',
        'MODEL': 'ENG MODEL',
        'TYPE': 'ENG TYPE',
    }
    logger.debug("renaming aircraft engine file columns")
    frame = self.df
    # rename columns based on mapping rules above
    frame.rename(columns=source_to_target, inplace=True, errors='ignore')
    # lowercase column names and replace special characters
    normalized = {}
    for name in frame.columns:
        cleaned = str(name).strip().lower()
        normalized[name] = cleaned.replace(' ', '_').replace('-', '_')
    frame.rename(columns=normalized, inplace=True)
def process(self, element, *args, **kwargs):
    """
    Turn one CSV line into a dict keyed by the column names in self.cols.

    Rows with the expected number of values are counted in self.row_count and yielded;
    rows with any other number of values are counted in self.bad_rows and logged.

    :param element: csv line
    :return: yields dict {column_name: csv_value}
    """
    # csv.reader handles quoted values, so a comma inside quotes does not split a field
    for fields in csv.reader(StringIO(element), delimiter=','):
        if len(fields) != len(self.cols):
            self.bad_rows += 1
            logger.debug(f"bad row: {fields}")
            continue
        # pair every column name with the value at the same position
        record = dict(zip(self.cols, fields))
        self.row_count += 1
        yield record
def delete_table(self, table_name):
    """
    Drop a table from BigQuery, tolerating a NotFound error.

    :param table_name: full table name to delete including dataset id (ie: my_dataset.my_table)
    :return: None
    """
    client = self.client
    try:
        # a NotFound raised by the lookup or the delete lands in the handler below
        existing: Table = client.get_table(table_name)
        qualified_name = f"{existing.project}:{existing.dataset_id}.{existing.table_id}"
        logger.info(f"deleting existing bigquery table: {qualified_name}")
        client.delete_table(existing)
    except google_exceptions.NotFound:
        # table doesn't exist, nothing to delete
        logger.debug(f"bigquery table to delete did not exist: {table_name}")
def rename_columns(self):
    """
    Rename the aircraft type file columns of self.df in place.

    First applies the explicit source-to-target mapping below (source columns that are
    absent are ignored), then normalizes every column name: surrounding whitespace is
    stripped, the name is lowercased and spaces and dashes become underscores.
    """
    # source file column name: new column name
    source_to_target = {
        'CODE': 'MFR CODE',
        'MFR': 'MFR NAME',
        'TYPE-ACFT': 'AIRCRAFT TYPE',
        'NO-ENG': 'NUM ENGINES',
        'NO-SEATS': 'NUM SEATS',
        'AC-WEIGHT': 'WEIGHT CLASS',
    }
    logger.debug("renaming aircraft type columns")
    frame = self.df
    # rename columns based on mapping rules above
    frame.rename(columns=source_to_target, inplace=True, errors='ignore')
    # lowercase column names and replace special characters
    current_names = list(frame.columns)
    friendly_names = [
        str(name).strip().lower().replace(' ', '_').replace('-', '_')
        for name in current_names
    ]
    frame.rename(columns=dict(zip(current_names, friendly_names)), inplace=True)
def process(self, element, airports, *args, **kwargs):
    """
    Lookup src and dest IATA airport codes and add src/dest city/state if found

    Every airport code that is not in the lookup table is emitted on the 'missing_airport'
    tagged output as a (code, 1) pair. If either the src or the dest airport is missing,
    the record goes to the 'rejects' tagged output; otherwise the enriched record goes to
    the main output. The input dict is left untouched; a copy is enriched and emitted.
    Records that lack 'src'/'dest' or are of an invalid type are logged and dropped.

    :param element: flight dict record
    :param airports: airports dict lookup with airport iata code as keys
    :return: dict, flight record with src and dest airport cities added
    """
    try:
        # work on a copy so the input element is never modified in place
        flight = {**element}
        reject = False
        for side in ('src', 'dest'):
            code = flight[side]
            if code in airports:
                # airport found: add <side>_city and <side>_state
                airport = airports[code]
                flight[f'{side}_city'] = airport['city']
                flight[f'{side}_state'] = airport['state']
            else:
                # a missing airport on EITHER side rejects the record
                reject = True
                yield beam.pvalue.TaggedOutput('missing_airport', (code, 1))
        if reject:
            # one of the lookup src or dest airports is missing, reject the record
            yield beam.pvalue.TaggedOutput('rejects', flight)
        else:
            # both lookups are good, main output
            yield flight
    except (KeyError, ValueError, TypeError) as err:
        logger.debug("input flight record is missing critical fields or of invalid type.")
        logger.debug(str(err))