Beispiel #1
0
 def process(self, element: typing.Dict, *args,
             **kwargs) -> typing.Iterator[typing.Dict]:
     """
     Reformat a flight record's date and times and add the day of the week.

     ``flight_date`` is parsed with ``date_format`` and rewritten as YYYY-MM-DD. ``departure_time`` and
     ``arrival_time`` are parsed with ``time_format`` and rewritten as HH:MM:SS. ``day_of_week`` is added as
     1 (Monday) through 7 (Sunday). The input record itself is never modified; a shallow copy is yielded.

     :param element: input flight record as dict
     :param kwargs: optional ``date_format`` (default '%Y-%m-%d') and ``time_format`` (default '%H%M')
         strptime patterns describing the input values
     :return: generator yielding the reformatted record, or nothing when the record cannot be parsed
     """
     # input formats: YYYY-MM-DD and HHMM unless overridden through the optional keyword arguments
     date_fmt = kwargs.get("date_format", '%Y-%m-%d')
     time_fmt = kwargs.get("time_format", '%H%M')
     try:
         # work on a shallow copy: Beam requires that elements handed to process() are not modified.
         # unpacking anything that is not a mapping raises TypeError, which is handled below
         record = {**element}
         flight_date = datetime.strptime(record['flight_date'],
                                         date_fmt).date()
         # bigquery friendly formatted date
         record['flight_date'] = flight_date.strftime('%Y-%m-%d')
         # bigquery friendly formatted times
         for field in ('departure_time', 'arrival_time'):
             record[field] = datetime.strptime(
                 record[field], time_fmt).strftime('%H:%M:%S')
         # add 1 to start Monday as 1, ending with Sunday as 7
         record['day_of_week'] = flight_date.weekday() + 1
     except TypeError as err:
         logger.warning(
             "input flight record is not a proper dict. omitting output")
         logger.debug(str(err))
     except (KeyError, ValueError) as err:
         logger.debug(
             "input flight record is missing critical fields. omitting output"
         )
         logger.debug(str(err))
     else:
         yield record
Beispiel #2
0
 def transform(self):
     """
     Placeholder for the master aircraft file transformation.

     Logs a debug message and then always raises, because the transformation has not been written yet.

     :raises NotImplementedError: always
     """
     logger.debug("transforming master aircraft file")
     # TODO ::
     #   rename columns using rename_columns()
     #   set street2 data type to str
     #   set n_number as dataframe index
     reminder = 'YOU FORGOT TO WRITE THIS!'
     raise NotImplementedError(reminder)
Beispiel #3
0
    def process(self, element, routes, *args, **kwargs):
        """
        Check the flight's (airline, src, dest) route against the routes lookup table.

        When the route is absent from the lookup, the flight record is emitted on the 'rejects' tagged output
        and a (route, 1) pair on the 'missing_routes' tagged output. The record is then emitted on the main
        output in every case, whether or not its route was found.

        :param element: input flight record as dict
        :param routes: routes lookup table with ('airline', 'src', 'dest') tuple as keys
        :return: generator of the tagged outputs (if any) followed by the flight dict
        """
        try:
            # lookup key is the (airline, src, dest) tuple
            lookup_key = tuple(
                element[field] for field in ('airline', 'src', 'dest'))
            route_is_known = lookup_key in routes
            if not route_is_known:
                # unknown route: report the record as a reject ...
                yield beam.pvalue.TaggedOutput('rejects', element)
                # ... and report the route itself as missing
                yield beam.pvalue.TaggedOutput('missing_routes',
                                               (lookup_key, 1))
            # the record always continues to the main output; put this under an else branch
            # if records with a missing route should NOT be passed through
            yield element
        except (KeyError, ValueError, TypeError) as err:
            logger.debug(
                "input flight record is missing critical fields or of invalid type."
            )
            logger.debug(str(err))
Beispiel #4
0
def api_get_flights(airline: str,
                    flight_date: datetime,
                    api_url=api_url,
                    api_token=api_token,
                    tries=0,
                    timeout=api_timeout) -> typing.List[typing.Dict]:
    """
    Call REST API to get flight records for a given airline and date

    A connection timeout is retried with a doubled timeout (sleeping for the new timeout first) until
    ``api_max_retries`` is reached, at which point the process exits. A connection failure exits the process
    immediately. Any other request error is logged and an empty list is returned.

    :param airline: airline IATA code
    :param flight_date: flight date
    :param api_url: API base URL such as http://localhost:5000/
    :param api_token: OAuth Bearer token
    :param tries: number of retries already made; incremented by the retry recursion, callers leave it at 0
    :param timeout: request timeout passed to ``requests.get``; doubled on every retry
    :return: list of flight records (each a dict); empty list if the API call failed
    """
    headers = {'Authorization': f'Bearer {api_token}',
               'Accept': 'application/json'}
    params = {'date': flight_date.strftime('%Y-%m-%d'),
              'airline': airline}
    # build the endpoint with plain string handling: os.path.join is for file system paths and would insert
    # the OS path separator (a backslash on Windows) into the URL
    url = f"{api_url.rstrip('/')}/flights/flights"
    try:
        r = requests.get(url, params=params, headers=headers, timeout=timeout)
        # raise exception if response returned an unsuccessful status code
        r.raise_for_status()
        # return flight records
        # NOTE(review): assumes the payload carries a 'records' entry and a 'flights' list - confirm against the API
        data = r.json()
        logger.debug(f"Flight API returned {data['records']} flights in {r.elapsed.total_seconds():.3}s")
        return copy(data['flights'])
    except requests.exceptions.ConnectTimeout:
        # REST API connection timeout is reached. Most likely we're sending too many requests and need to ease off
        # let's exponentially increase the timeout and retry until the max num of retries is reached
        logger.warning(f"REST API call to get flights timed out. (timeout={timeout})")
        tries, timeout = tries + 1, timeout * 2
        if tries < api_max_retries:
            # sleep the current thread to back off from submitting too many requests
            logger.debug(f"thread sleep to back off from submitting too many REST API requests. sleep: {timeout}s")
            sleep(timeout)
            logger.warning(f"Increasing timeout to {timeout} and retrying. Retry number={tries}")
            return api_get_flights(airline, flight_date, api_url, api_token, tries, timeout)
        else:
            logger.fatal("Max number of API retries is reached. Quiting.")
            sys.exit(1)
    except requests.ConnectionError:
        # if cannot establish connection with the REST API. Most likely the URL is incorrect or API Flask server
        # is not running
        logger.fatal(f"Could not establish connection to the REST API to get flights (url={url})")
        sys.exit(1)
    except requests.exceptions.HTTPError as err:
        # unsuccessful status code raised by raise_for_status()
        logger.error("Flights API has a critical error: {}".format(str(err)))
        logger.error("No records returned by flights api")
        return []
    except requests.exceptions.RequestException as err:
        # any other requests failure: best effort, report and return no records
        logger.error("Unknown error while connecting to REST API to get flights: {}".format(str(err)))
        logger.error("No records returned by flights api")
        return []
Beispiel #5
0
 def transform(self):
     """
     Transform the master aircraft dataframe held in ``self.df``.

     Calls ``rename_columns()``, converts the ``street2`` column to str and indexes the frame by
     ``n_number`` while keeping ``n_number`` as a regular column as well.

     :return: None, the dataframe is modified in place
     """
     logger.debug("transforming master aircraft file")
     # the frame reference is taken before the rename call and reused afterwards
     # NOTE(review): this relies on rename_columns() renaming in place - confirm
     frame = self.df
     self.rename_columns()
     # fix data types
     street2_text = frame['street2'].astype(str)
     frame['street2'] = street2_text
     # index by n_number, keeping the column itself
     frame.set_index('n_number', drop=False, inplace=True)
Beispiel #6
0
 def transform(self):
     """
     Transform the master aircraft dataframe held in ``self.df``.

     Calls ``rename_columns()``, converts ``street2`` to str, overwrites ``registrant_name``, ``street`` and
     ``street2`` with the values produced row by row by ``erase_personal_info`` and indexes the frame by
     ``n_number`` while keeping ``n_number`` as a regular column as well.

     :return: None, the dataframe is modified in place
     """
     logger.debug("transforming master aircraft file")
     # the frame reference is taken before the rename call and reused afterwards
     # NOTE(review): this relies on rename_columns() renaming in place - confirm
     frame = self.df
     self.rename_columns()
     # fix data types
     frame['street2'] = frame['street2'].astype(str)
     # remove personal info: each row yields three replacement values, which are
     # transposed into one sequence per column
     scrubbed_rows = frame.apply(self.erase_personal_info, axis=1)
     names, streets, streets2 = zip(*scrubbed_rows)
     frame['registrant_name'] = names
     frame['street'] = streets
     frame['street2'] = streets2
     # index by n_number, keeping the column itself
     frame.set_index('n_number', drop=False, inplace=True)
Beispiel #7
0
    def process(self, element, *args, **kwargs):
        """
        Parse a CSV line and map its values onto the column names in ``self.cols``

        Rows whose number of values matches ``self.cols`` are yielded and counted in ``self.row_count``;
        any other row is counted in ``self.bad_rows``, logged and dropped.

        :param element: csv line
        :return: generator of dict {column_name: csv_value}
        """
        # use csv.reader to correctly parse csv, accounting for quoted values
        reader = csv.reader(StringIO(element), delimiter=',')

        for row in reader:
            if len(row) == len(self.cols):
                # pair every column name with the value at the same position
                d = dict(zip(self.cols, row))
                self.row_count += 1
                yield d
            else:
                # increment (not reset) the running count of malformed rows
                self.bad_rows += 1
                logger.debug(f"bad row: {row}")
Beispiel #8
0
def api_get_airlines(api_url=api_url,
                     api_token=api_token,
                     tries=0,
                     timeout=api_timeout) -> typing.List[str]:
    """
    Call the REST API to get the list of available airlines

    The result is cached in the module-level ``AIRLINES`` variable; once it holds a non-empty value the API
    is not called again. A connection timeout is retried with a doubled timeout (sleeping for the new timeout
    first) until ``api_max_retries`` is reached, at which point the process exits. A connection failure exits
    the process immediately. Any other request error is logged and the (still empty) cache is returned.

    :param api_url: API base URL
    :param api_token: OAuth Bearer token
    :param tries: number of retries already made; incremented by the retry recursion, callers leave it at 0
    :param timeout: request timeout passed to ``requests.get``; doubled on every retry
    :return: the cached airlines value taken from the 'airlines' entry of the API response
    """
    global AIRLINES
    if AIRLINES:
        # already fetched earlier, serve from the module-level cache
        return AIRLINES
    headers = {'Authorization': f'Bearer {api_token}',
               'Accept': 'application/json'}
    # build the endpoint with plain string handling: os.path.join is for file system paths and would insert
    # the OS path separator (a backslash on Windows) into the URL
    url = f"{api_url.rstrip('/')}/flights/airlines"
    logger.debug(f"Calling airlines REST API (url={url})")
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        # raise exception if response returned an unsuccessful status code
        r.raise_for_status()
        AIRLINES = r.json()['airlines']
        logger.info(f"flight airlines api call success in {r.elapsed.total_seconds():.3}s. Airlines: {AIRLINES}")
        return AIRLINES
    except requests.exceptions.ConnectTimeout:
        # REST API connection timeout is reached. Most likely we're sending too many requests and need to ease off
        # let's exponentially increase the timeout and retry until the max num of retries is reached
        logger.warning(f"REST API call to get airlines timed out. (timeout={timeout})")
        tries, timeout = tries + 1, timeout * 2
        if tries < api_max_retries:
            # sleep the current thread to back off from submitting too many requests
            logger.debug(f"thread sleep to back off from submitting too many REST API requests. sleep: {timeout}s")
            sleep(timeout)
            logger.warning(f"Increasing timeout to {timeout} and retrying. Retry number={tries}")
            return api_get_airlines(api_url, api_token, tries, timeout)
        else:
            logger.fatal("Max number of API retries is reached. Quiting.")
            sys.exit(1)
    except requests.ConnectionError:
        # if cannot establish connection with the REST API. Most likely the URL is incorrect or API Flask server
        # is not running
        logger.fatal(f"Could not establish connection to the REST API to get airlines (url={url})")
        sys.exit(1)
    except requests.exceptions.RequestException as err:
        # any other requests failure (including HTTP error statuses): best effort, report and return the cache
        logger.error("Unknown error while connecting to REST API to get airlines: {}".format(str(err)))
        return AIRLINES
Beispiel #9
0
 def rename_columns(self):
     """
     Placeholder for renaming the aircraft type file columns.

     Defines the intended column mapping, logs a debug message and then always raises, because the renaming
     itself has not been written yet.

     :raises NotImplementedError: always
     """
     logger.debug("renaming aircraft type columns")
     # source file column name -> new column name (not applied yet, see TODO below)
     columns = {
         'CODE': 'MFR CODE',
         'MFR': 'MFR NAME',
         'TYPE-ACFT': 'AIRCRAFT TYPE',
         'NO-ENG': 'NUM ENGINES',
         'NO-SEATS': 'NUM SEATS',
         'AC-WEIGHT': 'WEIGHT CLASS',
     }
     # TODO ::
     #   rename columns based on the mapping above
     #   lowercase all column names and replace '-' with '_' (making them bigquery friendly)
     #   use EngineFileProcessor as example
     reminder = 'YOU FORGOT TO WRITE THIS!'
     raise NotImplementedError(reminder)
Beispiel #10
0
 def gbq_create(self, table_name):
     """
     (Re)create the BigQuery table ``table_name`` with the aircraft schema.

     An existing table with that name is deleted first; a missing table is not an error. All columns are
     created as NULLABLE.

     :param table_name: name of the table to drop and create, passed as-is to the BigQuery client
     :return: None
     """
     # (column name, column type) pairs in table order
     columns = (
         ('n_number', 'STRING'),
         ('serial_number', 'STRING'),
         ('mfr_mdl_code', 'STRING'),
         ('eng_mfr_mdl', 'STRING'),
         ('mfr_year', 'INTEGER'),
         ('registrant_type', 'STRING'),
         ('registrant_name', 'STRING'),
         ('street', 'STRING'),
         ('street2', 'STRING'),
         ('city', 'STRING'),
         ('state', 'STRING'),
         ('zip_code', 'STRING'),
         ('region', 'STRING'),
         ('country', 'STRING'),
         ('last_action_date', 'DATE'),
         ('issue_date', 'DATE'),
         ('status', 'STRING'),
         ('air_ready_date', 'DATE'),
         ('expiration_date', 'DATE'),
         ('mfr_name', 'STRING'),
         ('mfr_short_name', 'STRING'),
         ('model', 'STRING'),
         ('aircraft_type', 'STRING'),
         ('num_engines', 'INTEGER'),
         ('num_seats', 'INTEGER'),
         ('weight_class', 'STRING'),
         ('speed', 'INTEGER'),
         ('eng_mfr_name', 'STRING'),
         ('eng_model', 'STRING'),
         ('eng_type', 'STRING'),
         ('horsepower', 'FLOAT'),
         ('thrust', 'FLOAT'),
     )
     schema = [
         bigquery.SchemaField(column_name, column_type, mode='NULLABLE')
         for column_name, column_type in columns
     ]
     # create a bigquery client
     client = bigquery.Client()
     # drop the table first so it is always created from scratch
     logger.debug(f"dropping {table_name} table if it exists")
     client.delete_table(table_name, not_found_ok=True)
     # create the new table with the schema above
     client.create_table(bigquery.Table(table_name, schema=schema))
     logger.info(f"bigquery table created: {table_name}")
0
 def rename_columns(self):
     """
     Rename the aircraft engine file columns of ``self.df`` in place.

     First the known source column names are replaced by their new names (source columns that are not
     present are ignored). Then every column name is stripped, lowercased and has spaces and dashes replaced
     by underscores.

     :return: None, the dataframe is modified in place
     """
     logger.debug("renaming aircraft engine file columns")
     frame = self.df
     # step 1: source file column name -> new column name
     frame.rename(
         columns={
             'CODE': 'ENG CODE',
             'MFR': 'ENG MFR NAME',
             'MODEL': 'ENG MODEL',
             'TYPE': 'ENG TYPE',
         },
         errors='ignore',
         inplace=True,
     )
     # step 2: lowercase column names and replace special characters
     normalised = {}
     for label in frame.columns:
         text = str(label).strip().lower()
         normalised[label] = text.replace(' ', '_').replace('-', '_')
     frame.rename(columns=normalised, inplace=True)
Beispiel #12
0
    def process(self, element, *args, **kwargs):
        """
        Turn one CSV line into a dict keyed by the column names in ``self.cols``.

        A row with as many values as there are columns is yielded and counted in ``self.row_count``; any
        other row is counted in ``self.bad_rows``, logged and skipped.

        :param element: csv line
        :return: generator of dict {column_name: csv_value}
        """
        # csv.reader takes care of quoted values that contain the delimiter
        for values in csv.reader(StringIO(element), delimiter=','):
            if len(values) != len(self.cols):
                # wrong number of values: count it, log it, move on
                self.bad_rows += 1
                logger.debug(f"bad row: {values}")
                continue
            # pair every column name with the value at the same position
            record = dict(zip(self.cols, values))
            self.row_count += 1
            yield record
Beispiel #13
0
    def delete_table(self, table_name):
        """
        Remove a table from BigQuery if it exists.

        The table is looked up first; when the client reports it as not found, that is only logged and
        nothing else happens.

        :param table_name: full table name to delete including dataset id (ie: my_dataset.my_table)
        :return: None
        """
        client = self.client
        try:
            existing: Table = client.get_table(table_name)
            qualified_name = f"{existing.project}:{existing.dataset_id}.{existing.table_id}"
            logger.info(
                f"deleting existing bigquery table: {qualified_name}")
            client.delete_table(existing)
        except google_exceptions.NotFound:
            # nothing to delete
            logger.debug(
                f"bigquery table to delete did not exist: {table_name}")
Beispiel #14
0
 def rename_columns(self):
     """
     Rename the aircraft type file columns of ``self.df`` in place.

     First the known source column names are replaced by their new names (source columns that are not
     present are ignored). Then every column name is stripped, lowercased and has spaces and dashes replaced
     by underscores.

     :return: None, the dataframe is modified in place
     """

     def clean(label):
         # strip, lowercase, then turn spaces and dashes into underscores
         return str(label).strip().lower().replace(' ', '_').replace('-', '_')

     # source file column name -> new column name
     renames = {
         'CODE': 'MFR CODE',
         'MFR': 'MFR NAME',
         'TYPE-ACFT': 'AIRCRAFT TYPE',
         'NO-ENG': 'NUM ENGINES',
         'NO-SEATS': 'NUM SEATS',
         'AC-WEIGHT': 'WEIGHT CLASS',
     }
     logger.debug("renaming aircraft type columns")
     frame = self.df
     # step 1: apply the explicit renames above
     frame.rename(columns=renames, errors='ignore', inplace=True)
     # step 2: clean up whatever column names the frame has now
     cleaned = {label: clean(label) for label in frame.columns}
     frame.rename(columns=cleaned, inplace=True)
Beispiel #15
0
    def process(self, element, airports, *args, **kwargs):
        """
        Lookup src and dest IATA airport codes and add src/dest city/state if found

        Each airport code that is not found in the lookup is emitted as a (code, 1) pair on the
        'missing_airport' tagged output. If either the src or the dest airport is missing, the flight record
        is emitted on the 'rejects' tagged output; otherwise it is emitted on the main output. The input
        record itself is never modified; the enriched record is a shallow copy.

        :param element: flight dict record
        :param airports: airports dict lookup with airport iata code as keys
        :return: generator of tagged outputs and/or the flight record with src and dest airport cities added
        """
        try:
            # enrich a shallow copy: Beam requires that elements handed to process() are not modified.
            # unpacking anything that is not a mapping raises TypeError, which is handled below
            flight = {**element}
            # read both codes up front so a record without src/dest fails before anything is emitted
            endpoints = [(name, flight[name]) for name in ('src', 'dest')]
            reject = False
            for name, code in endpoints:
                if code in airports:
                    # airport found: add <src|dest>_city and <src|dest>_state
                    airport = airports[code]
                    flight[f'{name}_city'] = airport['city']
                    flight[f'{name}_state'] = airport['state']
                else:
                    # a missing airport on either side rejects the record
                    reject = True
                    yield beam.pvalue.TaggedOutput('missing_airport',
                                                   (code, 1))
            if reject:
                # one of the lookup src or dest airports is missing, reject the record
                yield beam.pvalue.TaggedOutput('rejects', flight)
            else:
                # both lookups are good, main output
                yield flight
        except (KeyError, ValueError, TypeError) as err:
            logger.debug(
                "input flight record is missing critical fields or of invalid type."
            )
            logger.debug(str(err))