Example #1
 def extract(self):
     logger.info(f"loading aircraft type file: {self.source_file}")
     # column names to keep from the source file (other columns are not parsed)
     keep_columns = [
         'CODE',
         'MFR',
         'MODEL',
         'TYPE-ACFT',
         'NO-ENG',
         'NO-SEATS',
         'AC-WEIGHT',
         'SPEED',
     ]
     # specific field parsers
     # TODO ::
     #   complete the code for converters below.
     #   lambda function to parse CODE and AC-WEIGHT as str
     #   lambda function to parse NO-ENG, NO-SEATS, and SPEED as int
     #   use parse_aircraft_type for TYPE-ACFT
     #   use EngineFileProcessor as example
     converters = {
         'CODE': NotImplementedError,
         'TYPE-ACFT': NotImplementedError,
         'NO-ENG': NotImplementedError,
         'NO-SEATS': NotImplementedError,
         'AC-WEIGHT': NotImplementedError,
         'SPEED': NotImplementedError,
     }
     # read csv, get column names from header row. parse only needed columns using converters
     # TODO ::
     #   read the aircraft csv file
     #   set header row as column names
     #   use keep_columns and converters from above
     raise NotImplementedError('YOU FORGOT TO WRITE THIS!')
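One way to complete the converters and the CSV read, mirroring the finished version shown in Example #3 below: CODE and AC-WEIGHT parsed as str, the numeric columns as int with -1 for non-numeric values, and TYPE-ACFT handled by parse_aircraft_type.

     converters = {
         'CODE': (lambda v: str(v).strip()),
         'MFR': (lambda v: str(v).strip()),
         'MODEL': (lambda v: str(v).strip()),
         'TYPE-ACFT': self.parse_aircraft_type,
         'NO-ENG': (lambda v: int(v) if str(v).strip().isdigit() else -1),
         'NO-SEATS': (lambda v: int(v) if str(v).strip().isdigit() else -1),
         'AC-WEIGHT': (lambda v: str(v).strip()),
         'SPEED': (lambda v: int(v) if str(v).strip().isdigit() else -1),
     }
     # read the aircraft csv: header row gives column names, parse only keep_columns with converters
     df = pd.read_csv(self.source_file,
                      header=0,
                      usecols=keep_columns,
                      converters=converters,
                      low_memory=False)
     self.df = df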
Example #2
def main():
    """
    Load parameters from our config file and run PassengerUtils with
    these parameters
    """
    t0 = now()

    logger.info("Loading configuration")
    bucket = config['defaults']['ch3']['ep4']['input_bucket'].get(str)
    passenger_filename = config['defaults']['ch3']['ep4']['input_passengers'].get(str)
    passenger_output = config['defaults']['ch3']['ep4']['bq_passengers'].get(str)
    cards_filepath = config['defaults']['ch3']['ep4']['input_addrs'].get(str)
    cards_bq = config['defaults']['ch3']['ep4']['bq_cards'].get(str)
    addrs_filepath = config['defaults']['ch3']['ep4']['input_addrs'].get(str)
    addrs_bq = config['defaults']['ch3']['ep4']['bq_addrs'].get(str)

    loader = PassengerUtils(bucket)
    loader.load_passengers(passenger_filename, passenger_output)
    loader.archive_csv(passenger_filename)
    loader.load_subtable(cards_filepath, 'card_uid', ["street_address",
                                                      "city",
                                                      "state_code",
                                                      "from_date",
                                                      "to_date"], cards_bq)
    loader.archive_csv(cards_filepath)
    loader.load_subtable(addrs_filepath, 'addr_uid', ["street_address",
                                                      "city",
                                                      "state_code",
                                                      "from_date",
                                                      "to_date"], addrs_bq) 
    loader.archive_csv(addrs_filepath)
    logger.info(f"total time: {(now() - t0):,.6f} secs")
Example #3
 def extract(self):
     logger.info(f"loading aircraft type file: {self.source_file}")
     # column names to keep from the source file (other columns are not parsed)
     keep_columns = [
         'CODE',
         'MFR',
         'MODEL',
         'TYPE-ACFT',
         'NO-ENG',
         'NO-SEATS',
         'AC-WEIGHT',
         'SPEED',
     ]
     # specific field parsers
     converters = {
         'CODE': (lambda v: str(v).strip()),
         'MFR': (lambda v: str(v).strip()),
         'MODEL': (lambda v: str(v).strip()),
         'TYPE-ACFT': self.parse_aircraft_type,
         'NO-ENG': (lambda v: int(v) if str(v).strip().isdigit() else -1),
         'NO-SEATS': (lambda v: int(v) if str(v).strip().isdigit() else -1),
         'AC-WEIGHT': (lambda v: str(v).strip()),
         'SPEED': (lambda v: int(v) if str(v).strip().isdigit() else -1),
     }
     # read csv, get column names from header row. parse only needed columns using converters
     df = pd.read_csv(self.source_file,
                      header=0,
                      usecols=keep_columns,
                      converters=converters,
                      low_memory=False)
     self.df = df
Example #4
 def transform(self):
     logger.info(f"applying aircraft type transforms")
     # TODO ::
     #   rename columns using rename_columns()
     #   set mfr_code as dataframe index column
     #   add a column named mfr_short_name to be the first word from mfr_name
     raise NotImplementedError('YOU FORGOT TO WRITE THIS!')
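A sketch of the finished transform, matching the completed version in Example #12:

     df = self.df
     # rename columns
     self.rename_columns()
     # set mfr_code as the dataframe index
     df.set_index(keys='mfr_code', inplace=True)
     # add mfr_short_name: the first word of mfr_name
     df['mfr_short_name'] = df['mfr_name'].map(lambda v: str(v).split()[0])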
Example #5
 def extract(self):
     # column names to keep from the source file (other columns are not parsed)
     keep_columns = [
         'CODE',
         'MFR',
         'MODEL',
         'TYPE',
         'HORSEPOWER',
         'THRUST',
     ]
     # specific field parsers to apply data types and transformation rules
     converters = {
         'CODE': (lambda v: str(v).strip()),
         'MFR': (lambda v: str(v).strip()),
         'MODEL': (lambda v: str(v).strip()),
         'TYPE': self.parse_engine_type,
         'HORSEPOWER': (lambda v: int(v)
                        if str(v).strip().isdigit() else -1),
         'THRUST': (lambda v: int(v) if str(v).strip().isdigit() else -1),
     }
     logger.info(f"loading aircraft engine file: {self.source_file}")
     # a) set the first row as header column names.
     # b) only parse needed columns.
     # c) use specific field parser (converters)
     df = pd.read_csv(self.source_file,
                      header=0,
                      usecols=keep_columns,
                      converters=converters,
                      low_memory=False)
     self.df = df
Example #6
    def execute_as_dict(self,
                        sql: str,
                        keycols=None,
                        query_params=None,
                        **kwargs):
        """
        Execute a query and return the results as a dict keyed by the values of keycols. If keycols lists
        multiple columns, the returned dict is keyed by a tuple of their values. If keycols is None, a list is returned instead of a dict.

        Example: single keycol
            keycols = 'iata'  (airport IATA code)
            returns = {'PDX': {'city': 'Portland', 'iata': 'PDX'}, ...}

        Example: multiple keycols
            keycols = ['iata', 'city']
            returns = {('PDX', 'Portland'): {'city': 'Portland', 'iata': 'PDX'}, ...}

        :param query_params:
        :param sql: sql command to execute
        :param keycols: columns to use as key values
        :return: dict
        """
        r = list(self.execute(sql, query_params, **kwargs))
        logger.info(f"converting {len(r)} rows as dict, keys: {keycols}")
        if keycols is None:
            return [dict(row) for row in r]
        elif isinstance(keycols, str):
            return {row[keycols]: dict(row) for row in r}
        elif isinstance(keycols, (list, set, tuple)):
            return {tuple(row[kk] for kk in keycols): dict(row) for row in r}
        else:
            raise ValueError("keycols must be None, str, list, set, or tuple.")
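A hypothetical usage sketch; the dataset and table names are illustrative only, and bq_utils stands for an instance of this utility class:

# single key column -> dict keyed by airport IATA code
airports = bq_utils.execute_as_dict(
    "SELECT iata, city FROM my_dataset.airports",  # hypothetical table
    keycols='iata')
print(airports['PDX']['city'])  # e.g. 'Portland'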
Example #7
 def transform(self):
     logger.info(f"applying aircraft engine transforms")
     df = self.df
     # rename columns
     self.rename_columns()
     # set index
     df.set_index(keys='eng_code', inplace=True)
Example #8
    def load_subtable(self, csv_filepath, uid_name, uid_col_list, csv_bq, passenger_bq=None):
        """
        Function to load a supporting table to passengers from GCS and save in BigQuery.
        :param csv_filepath: str input filename
        :param uid_name: str name to give the UID column
        :param uid_col_list: list of str column names to combine into UID
        :param csv_bq: str output project.dataset.table where the data will be saved
        :param passenger_bq: str, optional BigQuery table to load passengers from; if None, the already loaded passengers_df is used
        """
        csv_path = 'gs://{}/{}'.format(self.bucket, csv_filepath)
        logger.info(f"Loading address info from {csv_path}")
        csv_df = self.sparkql.read.csv(csv_path, header=True)

        csv_df = csv_df.withColumn(
            uid_name, sha2(concat_ws("", *uid_col_list), 256))
        if passenger_bq:
            passengers_df = self.sparkql.read.format('bigquery') \
                                 .option('table', passenger_bq) \
                                 .load() \
                                 .withColumnRenamed('uid', 'passenger_uid')
        else:
            passengers_df = self.passengers_df.withColumnRenamed('uid', 'passenger_uid')

        csv_df = csv_df.join(passengers_df.select('email', 'passenger_uid'),
                                 on='email',
                                 how='left')
        logger.info(f"writing card data to {csv_bq}")
        csv_df.write.format('bigquery') \
          .option('table', csv_bq) \
          .save()
Example #9
    def execute(self, sql, query_params=None, **kwargs) -> RowIterator:
        """
        Execute a BigQuery query and return the results

        :param query_params: parameterized query params
        :param sql: sql command to execute
        :return: RowIterator
        """
        bq = self.client
        if query_params:
            config = bigquery.QueryJobConfig(allow_large_results=True,
                                             query_parameters=query_params,
                                             **kwargs)
        else:
            config = bigquery.QueryJobConfig(allow_large_results=True,
                                             **kwargs)
        t0 = now()
        logger.info(f"executing bigquery query: \"{sql}\"")
        # execute and get the results
        query_job = bq.query(sql, job_config=config)
        r = query_job.result()
        # log stats
        logger.info(
            f"executed bigquery query [rows: {r.total_rows}][sec: {(now() - t0):5.3f}]"
        )
        return r
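A hedged usage sketch with a parameterized query; the table and parameter names are illustrative, and bq_utils stands for an instance of this utility class (ScalarQueryParameter is the standard google-cloud-bigquery parameter type):

# assumes bq_utils is an instance of the BigQuery utility class above
params = [bigquery.ScalarQueryParameter('airline', 'STRING', 'AS')]
rows = bq_utils.execute(
    "SELECT * FROM my_dataset.flights WHERE airline = @airline",  # hypothetical table
    query_params=params)
for row in rows:
    print(dict(row))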
Example #10
 def to_parquet(self, output_file):
     logger.info(f"writing to parquet: {output_file}")
     df: pd.DataFrame = self.df
     # write parquet file
     df.to_parquet(output_file,
                   engine='pyarrow',
                   compression='gzip',
                   index=False)
Example #11
 def lookup_n_number(self, n_numbers):
     if isinstance(n_numbers, str):
         n_numbers = [n_numbers]
     elif isinstance(n_numbers, (tuple, set)):
         n_numbers = list(n_numbers)
     df = self.df
     for n in n_numbers:
         logger.info(f'looking up N Number: {n}')
         print(df.loc[n])
Example #12
 def transform(self):
     logger.info(f"applying aircraft type transforms")
     df = self.df
     # rename columns
     self.rename_columns()
     # set index
     df.set_index(keys='mfr_code', inplace=True)
     # add a short name column
     df['mfr_short_name'] = df['mfr_name'].map(lambda v: str(v).split()[0])
     logger.info(f"transform done")
Example #13
 def process(self, element: typing.Tuple, *args,
             **kwargs) -> typing.List[typing.Dict]:
     flight_date, airline = element
     flights = api_get_flights(airline,
                               flight_date,
                               api_url=kwargs['api_url'],
                               api_token=kwargs['api_token'])
     logger.info(
         f"flights api call airline={airline}, flight_date={flight_date} returned {len(flights)} flights"
     )
     return flights
Example #14
 def archive_csv(self, input_file):
     """
     Archive a csv in GCS under a date-based prefix
     :param input_file: str path to file to be archived
     """
     source_bucket = self.storage_client.bucket(self.bucket)
     source_blob = source_bucket.blob(input_file)
     destination_blob_name = f"{self.datetime}/{input_file}"
     logger.info(f"Moving to {destination_blob_name}")
     blob_move = source_bucket.rename_blob(
         source_blob, destination_blob_name
     )
Example #15
    def load_passengers(self, passenger_filename, passenger_output):
        """
        Function to load the passenger data from csv in GCS, clean, add UID,
        and upload to BigQuery
        :param passenger_filename: str input file name
        :param passenger_output: str of project.dataset.table to save passenger data
        """
        self.passenger_filename = passenger_filename
        self.passenger_output = passenger_output
        people_path = 'gs://{}/{}'.format(self.bucket, passenger_filename)

        logger.info(f"Loading passenger info from {self.bucket}.{passenger_filename}")
Example #16
 def __init__(self, bucket):
     """
     Initialize our util with default parameters
     :param bucket: str name of GCS bucket which will be used by this util
     to load data from and as temporary storage for Dataproc
     """
     logger.info(f"Starting SparkSession and using {bucket} as our bucket")
     self.sparkql = SparkSession.builder.master('yarn').getOrCreate()
     self.bucket = bucket
     self.sparkql.conf.set('temporaryGcsBucket', bucket)
     self.storage_client = storage.Client()
     self.datetime = f"{datetime.now():%Y%m%d%H%M%S}"
Example #17
 def extract(self):
     logger.info(f"loading master aircraft file: {self.source_file}")
     # column names to keep from the source file (other columns are not parsed)
     keep_columns = [
         'N-NUMBER',
         'SERIAL NUMBER',
         'MFR MDL CODE',
         'ENG MFR MDL',
         'YEAR MFR',
         'TYPE REGISTRANT',
         'NAME',
         'STREET',
         'STREET2',
         'CITY',
         'STATE',
         'ZIP CODE',
         'REGION',
         'COUNTRY',
         'LAST ACTION DATE',
         'CERT ISSUE DATE',
         'STATUS CODE',
         'AIR WORTH DATE',
         'EXPIRATION DATE',
     ]
     # specific field parsers (converters)
     converters = {
         'N-NUMBER': self.parse_n_number,
         'SERIAL NUMBER': (lambda v: str(v).strip()),
         'MFR MDL CODE': (lambda v: str(v).strip()),
         'ENG MFR MDL': (lambda v: str(v).strip()),
         'YEAR MFR': (lambda v: int(v) if str(v).strip().isdigit() else -1),
         'TYPE REGISTRANT': self.parse_registrant_type,
         'NAME': (lambda v: str(v).strip()),
         'STREET': (lambda v: str(v).strip()),
         'STREET2': (lambda v: str(v).strip()),
         'CITY': (lambda v: str(v).strip()),
         'STATE': (lambda v: str(v).strip()),
         'ZIP CODE': self.parse_zipcode,
         'REGION': self.parse_region,
         'COUNTRY': (lambda v: str(v).strip()),
         'LAST ACTION DATE': self.parse_date,
         'CERT ISSUE DATE': self.parse_date,
         'STATUS CODE': self.parse_status,
         'AIR WORTH DATE': self.parse_date,
         'EXPIRATION DATE': self.parse_date,
     }
     # read csv
     df = pd.read_csv(self.source_file,
                      header=0,
                      usecols=keep_columns,
                      converters=converters,
                      low_memory=False)
     self.df = df
Example #18
 def gbq_load(self, table_name, data_file):
     logger.info(f"loading bigquery table: `{table_name}` from {data_file}")
     # Construct a BigQuery client object.
     client = bigquery.Client()
     # configure the load job to read parquet input
     job_config = bigquery.LoadJobConfig(
         source_format=bigquery.SourceFormat.PARQUET)
     with open(data_file, "rb") as source_file:
         job = client.load_table_from_file(source_file,
                                           table_name,
                                           job_config=job_config)
     job.result()  # Waits for the job to complete.
     table = client.get_table(table_name)  # get loaded table info
     logger.info(f"loaded {table.num_rows} rows to {table_name}")
Example #19
def run():
    t0 = now()

    # parse command line options
    known_args, beam_args = runtime_args()

    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        rows = p | beam.io.ReadFromText(
            known_args.input, skip_header_lines=1) | beam.ParDo(
                BeamReadCSV(header_cols=FLIGHTS_CSV_COLUMNS))

    logger.info(f"total time: {(now() - t0):,.6f} secs")
Example #20
def api_get_airlines(api_url=api_url,
                     api_token=api_token,
                     tries=0,
                     timeout=api_timeout) -> typing.List[str]:
    """
    Call the REST API to get the list of available airlines

    :param api_url:
    :param api_token:
    :param tries:
    :param timeout:
    :return:
    """
    global AIRLINES
    if not AIRLINES:
        headers = {'Authorization': f'Bearer {api_token}',
                   'Accept': 'application/json'}
        url = os.path.join(api_url, 'flights/airlines')
        logger.debug(f"Calling airlines REST API (url={url})")
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
            AIRLINES = r.json()['airlines']
            logger.info(f"flight airlines api call success in {r.elapsed.total_seconds():.3}s. Airlines: {AIRLINES}")
            return AIRLINES
        except requests.exceptions.ConnectTimeout:
            # REST API connection timeout is reached. Most likely we're sending too many requests and need to ease off
            # let's exponentially increase the timeout and retry until the max number of retries is reached
            logger.warning(f"REST API call to get airlines timed out. (timeout={timeout})")
            tries, timeout = tries + 1, timeout * 2
            if tries < api_max_retries:
                # sleep the current thread to back off from submitting too many requests
                logger.debug(f"thread sleep to back off from submitting too many REST API requests. sleep: {timeout}s")
                sleep(timeout)
                logger.warning(f"Increasing timeout to {timeout} and retrying. Retry number={tries}")
                return api_get_airlines(api_url, api_token, tries, timeout)
            else:
                logger.fatal("Max number of API retries is reached. Quiting.")
                sys.exit(1)
        except requests.ConnectionError:
            # we could not establish a connection to the REST API. Most likely the URL is incorrect or the API Flask
            # server is not running
            logger.fatal(f"Could not establish connection to the REST API to get airlines (url={url})")
            sys.exit(1)
        except requests.exceptions.RequestException as err:
            logger.error("Unknown error while connecting to REST API to get airlines: {}".format(str(err)))
            return AIRLINES
    else:
        return AIRLINES
Example #21
 def extract(self):
     logger.info(f"loading master aircraft file: {self.source_file}")
     # column names to keep from the source file (other columns are not parsed)
     keep_columns = [
         'N-NUMBER',
         'SERIAL NUMBER',
         'MFR MDL CODE',
         'ENG MFR MDL',
         'YEAR MFR',
         'TYPE REGISTRANT',
         'NAME',
         'STREET',
         'STREET2',
         'CITY',
         'STATE',
         'ZIP CODE',
         'REGION',
         'COUNTRY',
         'LAST ACTION DATE',
         'CERT ISSUE DATE',
         'STATUS CODE',
         'AIR WORTH DATE',
         'EXPIRATION DATE',
     ]
     # specific field parsers (converters)
     # TODO ::
     #   finish the converters below
     #   make sure to parse 'MFR MDL CODE' and 'ENG MFR MDL' as str
     #   parse 'YEAR MFR' as int; everything else should use its own parser from above
     converters = {
         'MFR MDL CODE': NotImplementedError,
         'ENG MFR MDL': NotImplementedError,
         'YEAR MFR': NotImplementedError,
         'TYPE REGISTRANT': NotImplementedError,
         'ZIP CODE': NotImplementedError,
         'REGION': NotImplementedError,
         'LAST ACTION DATE': NotImplementedError,
         'CERT ISSUE DATE': NotImplementedError,
         'STATUS CODE': NotImplementedError,
         'AIR WORTH DATE': NotImplementedError,
         'EXPIRATION DATE': NotImplementedError,
     }
     # TODO ::
     #   read csv
     raise NotImplementedError('YOU FORGOT TO WRITE THIS!')
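One way to fill in the converters and the CSV read, mirroring the completed version in Example #17: 'MFR MDL CODE' and 'ENG MFR MDL' as str, 'YEAR MFR' as int, and the remaining fields handled by their dedicated parser methods.

     converters = {
         'N-NUMBER': self.parse_n_number,
         'SERIAL NUMBER': (lambda v: str(v).strip()),
         'MFR MDL CODE': (lambda v: str(v).strip()),
         'ENG MFR MDL': (lambda v: str(v).strip()),
         'YEAR MFR': (lambda v: int(v) if str(v).strip().isdigit() else -1),
         'TYPE REGISTRANT': self.parse_registrant_type,
         'NAME': (lambda v: str(v).strip()),
         'STREET': (lambda v: str(v).strip()),
         'STREET2': (lambda v: str(v).strip()),
         'CITY': (lambda v: str(v).strip()),
         'STATE': (lambda v: str(v).strip()),
         'ZIP CODE': self.parse_zipcode,
         'REGION': self.parse_region,
         'COUNTRY': (lambda v: str(v).strip()),
         'LAST ACTION DATE': self.parse_date,
         'CERT ISSUE DATE': self.parse_date,
         'STATUS CODE': self.parse_status,
         'AIR WORTH DATE': self.parse_date,
         'EXPIRATION DATE': self.parse_date,
     }
     # read csv: header row as column names, keep_columns only, apply converters
     df = pd.read_csv(self.source_file,
                      header=0,
                      usecols=keep_columns,
                      converters=converters,
                      low_memory=False)
     self.df = df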
Example #22
 def gbq_create(self, table_name):
     schema = [
         bigquery.SchemaField('n_number', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('serial_number', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('mfr_mdl_code', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('eng_mfr_mdl', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('mfr_year', 'INTEGER', mode='NULLABLE'),
         bigquery.SchemaField('registrant_type', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('registrant_name', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('street', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('street2', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('city', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('state', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('zip_code', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('region', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('country', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('last_action_date', 'DATE', mode='NULLABLE'),
         bigquery.SchemaField('issue_date', 'DATE', mode='NULLABLE'),
         bigquery.SchemaField('status', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('air_ready_date', 'DATE', mode='NULLABLE'),
         bigquery.SchemaField('expiration_date', 'DATE', mode='NULLABLE'),
         bigquery.SchemaField('mfr_name', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('mfr_short_name', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('model', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('aircraft_type', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('num_engines', 'INTEGER', mode='NULLABLE'),
         bigquery.SchemaField('num_seats', 'INTEGER', mode='NULLABLE'),
         bigquery.SchemaField('weight_class', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('speed', 'INTEGER', mode='NULLABLE'),
         bigquery.SchemaField('eng_mfr_name', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('eng_model', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('eng_type', 'STRING', mode='NULLABLE'),
         bigquery.SchemaField('horsepower', 'FLOAT', mode='NULLABLE'),
         bigquery.SchemaField('thrust', 'FLOAT', mode='NULLABLE'),
     ]
     # create a bigquery client
     client = bigquery.Client()
     # delete table if it exists
     logger.debug(f"dropping {table_name} table if it exists")
     client.delete_table(table_name, not_found_ok=True)
     # create a new table
     table = bigquery.Table(table_name, schema=schema)
     table = client.create_table(table)
     # table created
     logger.info(f"bigquery table created: {table_name}")
Example #23
    def delete_table(self, table_name):
        """
        Delete a table from BigQuery.

        :param table_name: full table name to delete including dataset id (ie: my_dataset.my_table)
        :return: None
        """
        bq = self.client
        try:
            table: Table = bq.get_table(table_name)
            logger.info(
                f"deleting existing bigquery table: {table.project}:{table.dataset_id}.{table.table_id}"
            )
            bq.delete_table(table)
        except google_exceptions.NotFound:
            # table doesn't exist
            logger.debug(
                f"bigquery table to delete did not exist: {table_name}")
Example #24
def run():
    logger.info("DATA ENGINEERING BOOTCAMP - CHAPTER 1 EPISODE 5")
    logger.info("FAA Aircraft Dataset ETL Process")
    # set command line args
    parser = argparse.ArgumentParser(
        description='FAA Aircraft Database ETL Process')
    register_cmdline_args(parser)
    # process command line input
    args = parser.parse_args()
    # execute command
    target = None
    if args.command == 'test-engine':
        # test processing engine file
        target = EngineTypeFileProcessor(source_file=args.engine_file)
    elif args.command == 'test-aircraft':
        # test processing aircraft file
        target = AircraftTypeFileProcessor(source_file=args.aircraft_file)
    elif args.command == 'test-master':
        # test processing master file
        engine = EngineTypeFileProcessor(source_file=args.engine_file)
        aircraft = AircraftTypeFileProcessor(source_file=args.aircraft_file)
        master = AircraftMasterFileProcessor(source_file=args.master_file)
        master.lookup_aircraft_type(aircraft)
        master.lookup_engine_type(engine)
        master.lookup_n_number(['N794JB', 'N518AS', 'N292JB'])
        target = master
    elif args.command == 'etl':
        # extract, transform, and load (etl) all 3 files
        engine = EngineTypeFileProcessor(source_file=args.engine_file)
        aircraft = AircraftTypeFileProcessor(source_file=args.aircraft_file)
        master = AircraftMasterFileProcessor(source_file=args.master_file)
        master.lookup_aircraft_type(aircraft)
        master.lookup_engine_type(engine)
        master.load(output_table=args.output_table,
                    output_file=args.output_file)
        target = master
    elif args.command == 'help':
        parser.print_help()
    # print df
    if args.print and target is not None:
        target.print(sample_size=args.row_count)
    if args.write_csv and target is not None:
        target.to_csv(args.output_file)
Example #25
def run():
    logger.info("DATA ENGINEERING BOOTCAMP - CHAPTER 1 EPISODE 5")
    logger.info("FAA Aircraft Dataset ETL Process")
    # set command line args
    parser = argparse.ArgumentParser(
        description='FAA Aircraft Database ETL Process')
    register_cmdline_args(parser)
    # process command line input
    args = parser.parse_args()
    # execute command
    target = None
    if args.command == 'test-engine':
        # test processing engine file
        target = EngineTypeFileProcessor(source_file=args.engine_file)
    elif args.command == 'help':
        parser.print_help()
    # print df
    if args.print and target is not None:
        target.print(sample_size=args.row_count)
Example #26
def run():
    t0 = now()

    # parse command line options
    known_args, beam_args = runtime_args()

    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        rows = (p
                | beam.io.ReadFromText(known_args.input, skip_header_lines=1)
                | beam.ParDo(BeamReadCSV(header_cols=FLIGHTS_CSV_COLUMNS))
                | beam.ParDo(BeamTransformRecords(),
                             date_fmt='%Y-%m-%d',
                             time_fmt='%H%M'))

        # write parquet output files
        output = (rows
                  | beam.io.WriteToParquet(
                      os.path.join(known_args.output, 'flights'),
                      schema=datamodel_flights_parquet_schema(),
                      file_name_suffix='.parquet'))

        # alternative: write (simple) newline delimited json output files
        #              a very flexible output file format for bigquery and other big data tools
        # much slower to write and larger in size than binary formats such as Parquet, ORC, or Avro
        # but provides flexibility over schema for smaller data files
        # larger files should use Avro, Parquet, or ORC. Avro provides the fastest write speeds, while
        # Parquet and ORC provide faster read performance for analytical queries
        output = (
            rows
            | beam.Map(
                lambda e: {
                    k: v if k != 'flight_date' else v.strftime('%Y-%m-%d')
                    for k, v in e.items()
                }
            )  # convert flight_date back to string type for json conversion
            | beam.Map(lambda e: json.dumps(e))  # json dump row
            | beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
                                  file_name_suffix='.json'))

    logger.info(f"total time: {(now() - t0):,.6f} secs")
Example #27
def run_simple():
    t0 = time()

    # parse command line arguments
    known_args, beam_args = runtime_args()

    # pass in the pipeline options
    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        # pre-process: create a list of date to process and get other side-inputs
        # create a list of flights date to retrieve from api
        days = list_dates(start_date=known_args.start_date,
                          end_date=known_args.end_date)

        # get airline iata codes from the api
        airlines = api_get_airlines(api_url=known_args.api_url,
                                    api_token=known_args.api_token)

        # create a beam collection with all days and airlines to get flights for
        input_rows = (p
                      | beam.Create(days)
                      | beam.ParDo(BeamExpandDaysByAirlines(),
                                   airlines=airlines))

        # call flights api to get flights for each record above and
        # call the beam transforms to process the input flights
        flights = (input_rows
                   | beam.ParDo(BeamGetFlights(),
                                api_url=known_args.api_url,
                                api_token=known_args.api_token)
                   | beam.ParDo(BeamTransformFlights()))

        # prepare & write output files
        json_output = (flights
                       | beam.Map(lambda e: json.dumps(e))
                       | beam.io.WriteToText(os.path.join(
                           known_args.output, 'flights'),
                                             file_name_suffix='.json'))

    logger.info("apache beam pipeline done")
    logger.info(f"process completed in {(time() - t0):,.3f} seconds")
Example #28
    def load_subtable(self, csv_filepath, uid_name, uid_col_list, csv_bq, passenger_bq=None):
        """
        Function to load a supporting table to passengers from GCS and save in BigQuery.
        :param csv_filepath: str input filename
        :param uid_name: str name to give the UID column
        :param uid_col_list: list of str column names to combine into UID
        :param csv_bq: str output project.dataset.table where the data will be saved
        :param passenger_bq: str, optional BigQuery table to load passengers from; if None, the already loaded passengers_df is used
        """
        # Create generic function that will work for both cards and addresses
        csv_path = 'gs://{}/{}'.format(self.bucket, csv_filepath)
        logger.info(f"Loading address info from {csv_path}")
        csv_df = self.sparkql.read.csv(csv_path, header=True)

        # Create uid from columns listed in uid_col_list

        if passenger_bq:
            # If passenger_bq is not None, load passengers_df from BigQuery as in Episode 3

        else:
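A sketch of how this skeleton could be completed, following the finished version shown in Example #8:

        # create the uid by hashing the concatenation of the columns in uid_col_list
        csv_df = csv_df.withColumn(
            uid_name, sha2(concat_ws("", *uid_col_list), 256))

        if passenger_bq:
            # passenger_bq given: load passengers_df from BigQuery as in Episode 3
            passengers_df = self.sparkql.read.format('bigquery') \
                                .option('table', passenger_bq) \
                                .load() \
                                .withColumnRenamed('uid', 'passenger_uid')
        else:
            # otherwise reuse the already loaded passengers dataframe
            passengers_df = self.passengers_df.withColumnRenamed('uid', 'passenger_uid')

        # join the passenger uid onto the subtable and write it to BigQuery
        csv_df = csv_df.join(passengers_df.select('email', 'passenger_uid'),
                             on='email',
                             how='left')
        logger.info(f"writing subtable data to {csv_bq}")
        csv_df.write.format('bigquery') \
            .option('table', csv_bq) \
            .save()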
Example #29
 def rename_columns(self):
     # rename columns based on the list below and convert all column names to lower case
     columns = {
         'TYPE REGISTRANT': 'REGISTRANT TYPE',
         'NAME': 'REGISTRANT NAME',
         'YEAR MFR': 'MFR YEAR',
         'CERT ISSUE DATE': 'ISSUE DATE',
         'STATUS CODE': 'STATUS',
         'AIR WORTH DATE': 'AIR READY DATE',
     }
     df = self.df
     logger.info(f"renaming master file columns")
     # rename columns based on mapping rules above
     df.rename(columns=columns, inplace=True, errors='ignore')
     # lowercase columns names and replace special characters
     mapper = {
         col: str(col).strip().lower().replace(' ', '_').replace('-', '_')
         for col in list(df.columns)
     }
     df.rename(columns=mapper, inplace=True)
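For illustration, after the explicit renames and the lower-casing mapper the master-file columns line up with the BigQuery schema in Example #22, for example:

     # 'TYPE REGISTRANT' -> 'REGISTRANT TYPE' -> 'registrant_type'
     # 'AIR WORTH DATE'  -> 'AIR READY DATE'  -> 'air_ready_date'
     # 'N-NUMBER'                             -> 'n_number'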
Example #30
def run_simple():
    t0 = time()

    # parse command line arguments
    known_args, beam_args = runtime_args()

    # BigQuery Utility
    bq_utils = BigQueryUtils()

    # pass in the pipeline options
    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        # todo: <<ADD YOUR CODE HERE>>
        # todo: calling beam transforms to call rest API, transform records, and output them into files
        pass

    # todo: create an external table using the output files and insert records into BigQuery

    logger.info(f"process completed in {(time() - t0):,.3f} seconds")