def extract(self): logger.info(f"loading aircraft type file: {self.source_file}") # column names to keep from the source file (other columns are not parsed) keep_columns = [ 'CODE', 'MFR', 'MODEL', 'TYPE-ACFT', 'NO-ENG', 'NO-SEATS', 'AC-WEIGHT', 'SPEED', ] # specific field parsers # TODO :: # complete the code for converters below. # lambda function to parse CODE and AC-WEIGHT as str # lambda function to parse NO-ENG, NO-SEATS, and SPEED as int # use parse_aircraft_type for TYPE-ACFT # use EngineFileProcessor as example converters = { 'CODE': NotImplementedError, 'TYPE-ACFT': NotImplementedError, 'NO-ENG': NotImplementedError, 'NO-SEATS': NotImplementedError, 'AC-WEIGHT': NotImplementedError, 'SPEED': NotImplementedError, } # read csv, get column names from header row. parse only needed columns using converters # TODO :: # read aircrat csv file # set header row as column names # use keep_columns and converters from above raise NotImplementedError('YOU FORGOT TO WRITE THIS!')
def main():
    """
    Load parameters from our config file and run the PassengerUtils with these parameters
    """
    t0 = now()

    logger.info("Loading configuration")
    bucket = config['defaults']['ch3']['ep4']['input_bucket'].get(str)
    passenger_filename = config['defaults']['ch3']['ep4']['input_passengers'].get(str)
    passenger_output = config['defaults']['ch3']['ep4']['bq_passengers'].get(str)
    cards_filepath = config['defaults']['ch3']['ep4']['input_cards'].get(str)
    cards_bq = config['defaults']['ch3']['ep4']['bq_cards'].get(str)
    addrs_filepath = config['defaults']['ch3']['ep4']['input_addrs'].get(str)
    addrs_bq = config['defaults']['ch3']['ep4']['bq_addrs'].get(str)

    loader = PassengerUtils(bucket)

    loader.load_passengers(passenger_filename, passenger_output)
    loader.archive_csv(passenger_filename)

    loader.load_subtable(cards_filepath, 'card_uid',
                         ["street_address", "city", "state_code", "from_date", "to_date"],
                         cards_bq)
    loader.archive_csv(cards_filepath)

    loader.load_subtable(addrs_filepath, 'addr_uid',
                         ["street_address", "city", "state_code", "from_date", "to_date"],
                         addrs_bq)
    loader.archive_csv(addrs_filepath)

    logger.info(f"total time: {(now() - t0):,.6f} secs")
def extract(self): logger.info(f"loading aircraft type file: {self.source_file}") # column names to keep from the source file (other columns are not parsed) keep_columns = [ 'CODE', 'MFR', 'MODEL', 'TYPE-ACFT', 'NO-ENG', 'NO-SEATS', 'AC-WEIGHT', 'SPEED', ] # specific field parsers converters = { 'CODE': (lambda v: str(v).strip()), 'MFR': (lambda v: str(v).strip()), 'MODEL': (lambda v: str(v).strip()), 'TYPE-ACFT': self.parse_aircraft_type, 'NO-ENG': (lambda v: int(v) if str(v).strip().isdigit() else -1), 'NO-SEATS': (lambda v: int(v) if str(v).strip().isdigit() else -1), 'AC-WEIGHT': (lambda v: str(v).strip()), 'SPEED': (lambda v: int(v) if str(v).strip().isdigit() else -1), } # read csv, get column names from header row. parse only needed columns using converters df = pd.read_csv(self.source_file, header=0, usecols=keep_columns, converters=converters, low_memory=False) self.df = df
def transform(self):
    logger.info("applying aircraft type transforms")

    # TODO ::
    #   rename columns using rename_columns()
    #   set mfr_code as the dataframe index column
    #   add a column named mfr_short_name to be the first word from mfr_name
    raise NotImplementedError('YOU FORGOT TO WRITE THIS!')
def extract(self):
    # column names to keep from the source file (other columns are not parsed)
    keep_columns = [
        'CODE',
        'MFR',
        'MODEL',
        'TYPE',
        'HORSEPOWER',
        'THRUST',
    ]

    # specific field parsers to apply data types and transformation rules
    converters = {
        'CODE': (lambda v: str(v).strip()),
        'MFR': (lambda v: str(v).strip()),
        'MODEL': (lambda v: str(v).strip()),
        'TYPE': self.parse_engine_type,
        'HORSEPOWER': (lambda v: int(v) if str(v).strip().isdigit() else -1),
        'THRUST': (lambda v: int(v) if str(v).strip().isdigit() else -1),
    }

    logger.info(f"loading aircraft engine file: {self.source_file}")
    # a) set the first row as header column names
    # b) only parse needed columns
    # c) use specific field parsers (converters)
    df = pd.read_csv(self.source_file,
                     header=0,
                     usecols=keep_columns,
                     converters=converters,
                     low_memory=False)
    self.df = df
def execute_as_dict(self, sql: str, keycols=None, query_params=None, **kwargs):
    """
    Execute a query and return the results as a dict keyed by the keycols values.
    If keycols is a list of multiple columns, then the returned dict is keyed by a
    tuple of their values. If keycols is None, a list is returned instead of a dict.

    Example: single keycol
        keycols = 'iata' (airport IATA code)
        returns = {'PDX': {'city': 'Portland', 'iata': 'PDX'}, ...}

    Example: multiple keycols
        keycols = ['iata', 'city']
        returns = {('PDX', 'Portland'): {'city': 'Portland', 'iata': 'PDX'}, ...}

    :param sql: sql command to execute
    :param keycols: columns to use as key values
    :param query_params: parameterized query params
    :return: dict
    """
    r = list(self.execute(sql, query_params, **kwargs))
    logger.info(f"converting {len(r)} rows as dict, keys: {keycols}")
    if keycols is None:
        return [dict(row) for row in r]
    elif isinstance(keycols, str):
        return {row[keycols]: dict(row) for row in r}
    elif isinstance(keycols, (list, set, tuple)):
        return {tuple(row[kk] for kk in keycols): dict(row) for row in r}
    else:
        raise ValueError("keycols must be None, str, list, set, or tuple.")
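A minimal usage sketch (not from the source; the dataset, table, and data values are assumptions) showing how keycols shapes the return value of execute_as_dict():

    # hypothetical airports table with 'iata' and 'city' columns
    bq = BigQueryUtils()
    by_iata = bq.execute_as_dict(
        "SELECT iata, city FROM my_dataset.airports", keycols='iata')
    print(by_iata['PDX']['city'])          # -> 'Portland'

    by_pair = bq.execute_as_dict(
        "SELECT iata, city FROM my_dataset.airports", keycols=['iata', 'city'])
    print(by_pair[('PDX', 'Portland')])    # -> {'city': 'Portland', 'iata': 'PDX'}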
def transform(self): logger.info(f"applying aircraft engine transforms") df = self.df # rename columns self.rename_columns() # set index df.set_index(keys='eng_code', inplace=True)
def load_subtable(self, csv_filepath, uid_name, uid_col_list, csv_bq, passenger_bq=None):
    """
    Load a supporting table to passengers from GCS and save it in BigQuery.

    :param csv_filepath: str input filename
    :param uid_name: str name to give the UID column
    :param uid_col_list: list of str column names to combine into the UID
    :param csv_bq: str output project.dataset.table where the data will be saved
    :param passenger_bq: str, optional BigQuery table to load passengers from;
                         if None, the previously loaded passengers_df is used
    """
    csv_path = 'gs://{}/{}'.format(self.bucket, csv_filepath)
    logger.info(f"Loading csv data from {csv_path}")
    csv_df = self.sparkql.read.csv(csv_path, header=True)

    # create the uid by hashing the combined uid_col_list columns
    csv_df = csv_df.withColumn(uid_name, sha2(concat_ws("", *uid_col_list), 256))

    if passenger_bq:
        passengers_df = self.sparkql.read.format('bigquery') \
            .option('table', passenger_bq) \
            .load() \
            .withColumnRenamed('uid', 'passenger_uid')
    else:
        passengers_df = self.passengers_df.withColumnRenamed('uid', 'passenger_uid')

    # attach the passenger uid to each row by joining on email
    csv_df = csv_df.join(passengers_df.select('email', 'passenger_uid'),
                         on='email', how='left')

    logger.info(f"writing data to {csv_bq}")
    csv_df.write.format('bigquery') \
        .option('table', csv_bq) \
        .save()
def execute(self, sql, query_params=None, **kwargs) -> RowIterator:
    """
    Execute a BigQuery query and return the results

    :param sql: sql command to execute
    :param query_params: parameterized query params
    :return: RowIterator
    """
    bq = self.client

    if query_params:
        config = bigquery.QueryJobConfig(allow_large_results=True,
                                         query_parameters=query_params,
                                         **kwargs)
    else:
        config = bigquery.QueryJobConfig(allow_large_results=True, **kwargs)

    t0 = now()
    logger.info(f"executing bigquery query: \"{sql}\"")

    # execute and get the results
    query_job = bq.query(sql, job_config=config)
    r = query_job.result()

    # log stats
    logger.info(
        f"executed bigquery query [rows: {r.total_rows}][sec: {(now() - t0):5.3f}]"
    )
    return r
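The query_params argument maps straight onto the standard google-cloud-bigquery parameter objects. A hedged sketch, assuming bq is an instance of the BigQueryUtils wrapper shown here; the dataset and table names are placeholders, and the columns are borrowed from the schema defined further below:

    params = [bigquery.ScalarQueryParameter('state', 'STRING', 'OR')]
    rows = bq.execute(
        "SELECT n_number, registrant_name "
        "FROM my_dataset.aircraft_master WHERE state = @state",
        query_params=params)
    for row in rows:
        print(dict(row))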
def to_parquet(self, output_file):
    logger.info(f"writing to parquet: {output_file}")
    df: pd.DataFrame = self.df

    # write parquet file
    df.to_parquet(output_file,
                  engine='pyarrow',
                  compression='gzip',
                  index=False)
def lookup_n_number(self, n_numbers):
    if isinstance(n_numbers, str):
        n_numbers = [n_numbers]
    elif isinstance(n_numbers, (tuple, set)):
        n_numbers = list(n_numbers)

    df = self.df
    for n in n_numbers:
        logger.info(f'looking up N Number: {n}')
        print(df.loc[n])
def transform(self): logger.info(f"applying aircraft type transforms") df = self.df # rename columns self.rename_columns() # set index df.set_index(keys='mfr_code', inplace=True) # add a short name column df['mfr_short_name'] = df['mfr_name'].map(lambda v: str(v).split()[0]) logger.info(f"transform done")
def process(self, element: typing.Tuple, *args, **kwargs) -> typing.List[typing.Dict]:
    flight_date, airline = element
    flights = api_get_flights(airline,
                              flight_date,
                              api_url=kwargs['api_url'],
                              api_token=kwargs['api_token'])
    logger.info(
        f"flights api call airline={airline}, flight_date={flight_date} returned {len(flights)} flights"
    )
    return flights
def archive_csv(self, input_file):
    """
    Archive a csv in GCS into a date-stamped folder

    :param input_file: str path to file to be archived
    """
    source_bucket = self.storage_client.bucket(self.bucket)
    source_blob = source_bucket.blob(input_file)
    destination_blob_name = f"{self.datetime}/{input_file}"

    logger.info(f"Moving to {destination_blob_name}")
    source_bucket.rename_blob(source_blob, destination_blob_name)
def load_passengers(self, passenger_filename, passenger_output):
    """
    Load the passenger data from a csv in GCS, clean it, add a UID,
    and upload it to BigQuery

    :param passenger_filename: str input file name
    :param passenger_output: str project.dataset.table to save passenger data
    """
    self.passenger_filename = passenger_filename
    self.passenger_output = passenger_output
    people_path = 'gs://{}/{}'.format(self.bucket, passenger_filename)
    logger.info(f"Loading passenger info from {people_path}")
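The body above stops right after building the GCS path. A possible continuation, sketched from the same Spark and BigQuery calls used by load_subtable; hashing only the email column into the uid is an assumption, not the source's actual cleaning logic:

    # sketch only: read the csv, derive a uid, keep the dataframe for later joins,
    # and write it to BigQuery (uid columns are assumed)
    passengers_df = self.sparkql.read.csv(people_path, header=True)
    passengers_df = passengers_df.withColumn('uid', sha2(concat_ws('', 'email'), 256))
    self.passengers_df = passengers_df

    logger.info(f"writing passenger data to {passenger_output}")
    passengers_df.write.format('bigquery') \
        .option('table', passenger_output) \
        .save()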
def __init__(self, bucket):
    """
    Initialize the util with default parameters

    :param bucket: str name of the GCS bucket this util loads data from and
                   uses as temporary storage for Dataproc
    """
    logger.info(f"Starting SparkSession and using {bucket} as our bucket")
    self.sparkql = SparkSession.builder.master('yarn').getOrCreate()
    self.bucket = bucket
    self.sparkql.conf.set('temporaryGcsBucket', bucket)
    self.storage_client = storage.Client()
    self.datetime = f"{datetime.now():%Y%m%d%H%M%S}"
def extract(self): logger.info(f"loading master aircraft file: {self.source_file}") # column names to keep from the source file (other columns are not parsed) keep_columns = [ 'N-NUMBER', 'SERIAL NUMBER', 'MFR MDL CODE', 'ENG MFR MDL', 'YEAR MFR', 'TYPE REGISTRANT', 'NAME', 'STREET', 'STREET2', 'CITY', 'STATE', 'ZIP CODE', 'REGION', 'COUNTRY', 'LAST ACTION DATE', 'CERT ISSUE DATE', 'STATUS CODE', 'AIR WORTH DATE', 'EXPIRATION DATE', ] # specific field parsers (converters) converters = { 'N-NUMBER': self.parse_n_number, 'SERIAL NUMBER': (lambda v: str(v).strip()), 'MFR MDL CODE': (lambda v: str(v).strip()), 'ENG MFR MDL': (lambda v: str(v).strip()), 'YEAR MFR': (lambda v: int(v) if str(v).strip().isdigit() else -1), 'TYPE REGISTRANT': self.parse_registrant_type, 'NAME': (lambda v: str(v).strip()), 'STREET': (lambda v: str(v).strip()), 'STREET2': (lambda v: str(v).strip()), 'CITY': (lambda v: str(v).strip()), 'STATE': (lambda v: str(v).strip()), 'ZIP CODE': self.parse_zipcode, 'REGION': self.parse_region, 'COUNTRY': (lambda v: str(v).strip()), 'LAST ACTION DATE': self.parse_date, 'CERT ISSUE DATE': self.parse_date, 'STATUS CODE': self.parse_status, 'AIR WORTH DATE': self.parse_date, 'EXPIRATION DATE': self.parse_date, } # read csv df = pd.read_csv(self.source_file, header=0, usecols=keep_columns, converters=converters, low_memory=False) self.df = df
def gbq_load(self, table_name, data_file):
    logger.info(f"loading bigquery table: `{table_name}` from {data_file}")

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # Configure the load job for parquet input.
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.PARQUET,
    )

    with open(data_file, "rb") as source_file:
        job = client.load_table_from_file(source_file,
                                          table_name,
                                          job_config=job_config)

    job.result()  # Waits for the job to complete.

    table = client.get_table(table_name)  # get loaded table info
    logger.info(f"loaded {table.num_rows} rows to {table_name}")
def run():
    t0 = now()

    # parse command line options
    known_args, beam_args = runtime_args()

    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        rows = (p
                | beam.io.ReadFromText(known_args.input, skip_header_lines=1)
                | beam.ParDo(BeamReadCSV(header_cols=FLIGHTS_CSV_COLUMNS)))

    logger.info(f"total time: {(now() - t0):,.6f} secs")
def api_get_airlines(api_url=api_url,
                     api_token=api_token,
                     tries=0,
                     timeout=api_timeout) -> typing.List[str]:
    """
    Call the REST API to get the list of available airlines

    :param api_url:
    :param api_token:
    :param tries:
    :param timeout:
    :return:
    """
    global AIRLINES
    if not AIRLINES:
        headers = {'Authorization': f'Bearer {api_token}',
                   'Accept': 'application/json'}
        url = os.path.join(api_url, 'flights/airlines')
        logger.debug(f"Calling airlines REST API (url={url})")
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
            AIRLINES = r.json()['airlines']
            logger.info(f"flight airlines api call success in {r.elapsed.total_seconds():.3f}s. Airlines: {AIRLINES}")
            return AIRLINES
        except requests.exceptions.ConnectTimeout:
            # REST API connection timeout is reached. Most likely we're sending too many
            # requests and need to ease off. Exponentially increase the timeout and retry
            # until the max number of retries is reached.
            logger.warning(f"REST API call to get airlines timed out (timeout={timeout})")
            tries, timeout = tries + 1, timeout * 2
            if tries < api_max_retries:
                # sleep the current thread to back off from submitting too many requests
                logger.debug(f"thread sleep to back off from submitting too many REST API requests. sleep: {timeout}s")
                sleep(timeout)
                logger.warning(f"Increasing timeout to {timeout} and retrying. Retry number={tries}")
                return api_get_airlines(api_url, api_token, tries, timeout)
            else:
                logger.fatal("Max number of API retries is reached. Quitting.")
                sys.exit(1)
        except requests.ConnectionError:
            # cannot establish a connection with the REST API. Most likely the URL is
            # incorrect or the API Flask server is not running.
            logger.fatal(f"Could not establish connection to the REST API to get airlines (url={url})")
            sys.exit(1)
        except requests.exceptions.RequestException as err:
            logger.error("Unknown error while connecting to REST API to get airlines: {}".format(str(err)))
        return AIRLINES
    else:
        return AIRLINES
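Purely to trace the backoff arithmetic (api_timeout and api_max_retries are defined elsewhere in the module; the values 2 and 5 below are assumptions, and the real function sleeps and re-calls the API rather than printing):

    timeout, tries = 2, 0
    while True:
        tries, timeout = tries + 1, timeout * 2
        if tries < 5:
            print(f"retry {tries}: back off for {timeout}s, then call again")
        else:
            print("max retries reached, quitting")
            break
    # retry 1 backs off 4s, retry 2 8s, retry 3 16s, retry 4 32s, then it quits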
def extract(self): logger.info(f"loading master aircraft file: {self.source_file}") # column names to keep from the source file (other columns are not parsed) keep_columns = [ 'N-NUMBER', 'SERIAL NUMBER', 'MFR MDL CODE', 'ENG MFR MDL', 'YEAR MFR', 'TYPE REGISTRANT', 'NAME', 'STREET', 'STREET2', 'CITY', 'STATE', 'ZIP CODE', 'REGION', 'COUNTRY', 'LAST ACTION DATE', 'CERT ISSUE DATE', 'STATUS CODE', 'AIR WORTH DATE', 'EXPIRATION DATE', ] # specific field parsers (converters) # TODO :: # finish the converters below # make sure to parse 'MFR MDL CODE' and 'ENG MFR MDL' as str # parse 'YEAR MFR' as int and everything else should use their own parsers from above converters = { 'MFR MDL CODE': NotImplementedError, 'ENG MFR MDL': NotImplementedError, 'YEAR MFR': NotImplementedError, 'TYPE REGISTRANT': NotImplementedError, 'ZIP CODE': NotImplementedError, 'REGION': NotImplementedError, 'LAST ACTION DATE': NotImplementedError, 'CERT ISSUE DATE': NotImplementedError, 'STATUS CODE': NotImplementedError, 'AIR WORTH DATE': NotImplementedError, 'EXPIRATION DATE': NotImplementedError, } # TODO :: # read csv raise NotImplementedError('YOU FORGOT TO WRITE THIS!')
def gbq_create(self, table_name):
    schema = [
        bigquery.SchemaField('n_number', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('serial_number', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('mfr_mdl_code', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('eng_mfr_mdl', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('mfr_year', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('registrant_type', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('registrant_name', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('street', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('street2', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('city', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('state', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('zip_code', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('region', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('country', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('last_action_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('issue_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('status', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('air_ready_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('expiration_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('mfr_name', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('mfr_short_name', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('model', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('aircraft_type', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('num_engines', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('num_seats', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('weight_class', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('speed', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('eng_mfr_name', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('eng_model', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('eng_type', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('horsepower', 'FLOAT', mode='NULLABLE'),
        bigquery.SchemaField('thrust', 'FLOAT', mode='NULLABLE'),
    ]

    # create a bigquery client
    client = bigquery.Client()

    # delete table if it exists
    logger.debug(f"dropping {table_name} table if it exists")
    client.delete_table(table_name, not_found_ok=True)

    # create a new table
    table = bigquery.Table(table_name, schema=schema)
    table = client.create_table(table)

    # table created
    logger.info(f"bigquery table created: {table_name}")
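A possible manual call sequence tying to_parquet(), gbq_create(), and gbq_load() together. This is a sketch only: it assumes all three are methods on the master file processor (the etl command presumably wires them up through master.load()), and the table and file names are placeholders:

    master.to_parquet('aircraft_master.parquet')                 # write the transformed dataframe
    master.gbq_create('my_project.my_dataset.aircraft_master')   # drop and recreate the table
    master.gbq_load('my_project.my_dataset.aircraft_master',
                    'aircraft_master.parquet')                   # load the parquet file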
def delete_table(self, table_name):
    """
    Delete a table from BigQuery.

    :param table_name: full table name to delete including dataset id (ie: my_dataset.my_table)
    :return: None
    """
    bq = self.client
    try:
        table: Table = bq.get_table(table_name)
        logger.info(
            f"deleting existing bigquery table: {table.project}:{table.dataset_id}.{table.table_id}"
        )
        bq.delete_table(table)
    except google_exceptions.NotFound:
        # table doesn't exist
        logger.debug(f"bigquery table to delete did not exist: {table_name}")
def run(): logger.info("DATA ENGINEERING BOOTCAMP - CHAPTER 1 EPISODE 5") logger.info("FAA Aircraft Dataset ETL Process") # set command line args parser = argparse.ArgumentParser( description='FAA Aircraft Database ETL Process') register_cmdline_args(parser) # process command line input args = parser.parse_args() # execute command target = None if args.command == 'test-engine': # test processing engine file target = EngineTypeFileProcessor(source_file=args.engine_file) elif args.command == 'test-aircraft': # test processing aircraft file target = AircraftTypeFileProcessor(source_file=args.aircraft_file) elif args.command == 'test-master': # test processing master file engine = EngineTypeFileProcessor(source_file=args.engine_file) aircraft = AircraftTypeFileProcessor(source_file=args.aircraft_file) master = AircraftMasterFileProcessor(source_file=args.master_file) master.lookup_aircraft_type(aircraft) master.lookup_engine_type(engine) master.lookup_n_number(['N794JB', 'N518AS', 'N292JB']) target = master elif args.command == 'etl': # extract, transform, and load (etl) all 3 files engine = EngineTypeFileProcessor(source_file=args.engine_file) aircraft = AircraftTypeFileProcessor(source_file=args.aircraft_file) master = AircraftMasterFileProcessor(source_file=args.master_file) master.lookup_aircraft_type(aircraft) master.lookup_engine_type(engine) master.load(output_table=args.output_table, output_file=args.output_file) target = master elif args.command == 'help': parser.print_help() # print df if args.print and target is not None: target.print(sample_size=args.row_count) if args.write_csv and target is not None: target.to_csv(args.output_file)
def run(): logger.info("DATA ENGINEERING BOOTCAMP - CHAPTER 1 EPISODE 5") logger.info("FAA Aircraft Dataset ETL Process") # set command line args parser = argparse.ArgumentParser( description='FAA Aircraft Database ETL Process') register_cmdline_args(parser) # process command line input args = parser.parse_args() # execute command target = None if args.command == 'test-engine': # test processing engine file target = EngineTypeFileProcessor(source_file=args.engine_file) elif args.command == 'help': parser.print_help() # print df if args.print and target is not None: target.print(sample_size=args.row_count)
def run():
    t0 = now()

    # parse command line options
    known_args, beam_args = runtime_args()

    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        rows = (p
                | beam.io.ReadFromText(known_args.input, skip_header_lines=1)
                | beam.ParDo(BeamReadCSV(header_cols=FLIGHTS_CSV_COLUMNS))
                | beam.ParDo(BeamTransformRecords(),
                             date_fmt='%Y-%m-%d',
                             time_fmt='%H%M'))

        # write parquet output files
        output = (rows
                  | beam.io.WriteToParquet(
                      os.path.join(known_args.output, 'flights'),
                      schema=datamodel_flights_parquet_schema(),
                      file_name_suffix='.parquet'))

        # alternative: also write (simple) newline-delimited json output files.
        # json is a very flexible output format for bigquery and other big data tools,
        # but it is slower to write and larger on disk than binary formats such as
        # Parquet, ORC, or Avro. it is handy for smaller files where schema flexibility
        # matters; for larger outputs prefer Avro (fastest writes) or Parquet/ORC
        # (faster reads for analytical queries).
        json_output = (
            rows
            | beam.Map(
                lambda e: {
                    k: v if k != 'flight_date' else v.strftime('%Y-%m-%d')
                    for k, v in e.items()
                })  # convert flight_date back to a string for json serialization
            | beam.Map(lambda e: json.dumps(e))  # json dump each row
            | beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
                                  file_name_suffix='.json'))

    logger.info(f"total time: {(now() - t0):,.6f} secs")
def run_simple():
    t0 = time()

    # parse command line arguments
    known_args, beam_args = runtime_args()

    # pass in the pipeline options
    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        # pre-process: create a list of dates to process and get other side-inputs

        # create a list of flight dates to retrieve from the api
        days = list_dates(start_date=known_args.start_date,
                          end_date=known_args.end_date)

        # get airline iata codes from the api
        airlines = api_get_airlines(api_url=known_args.api_url,
                                    api_token=known_args.api_token)

        # create a beam collection with all days and airlines to get flights for
        input_rows = (p
                      | beam.Create(days)
                      | beam.ParDo(BeamExpandDaysByAirlines(), airlines=airlines))

        # call the flights api to get flights for each record above and
        # run the beam transforms to process the input flights
        flights = (input_rows
                   | beam.ParDo(BeamGetFlights(),
                                api_url=known_args.api_url,
                                api_token=known_args.api_token)
                   | beam.ParDo(BeamTransformFlights()))

        # prepare & write output files
        json_output = (flights
                       | beam.Map(lambda e: json.dumps(e))
                       | beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
                                             file_name_suffix='.json'))

    logger.info("apache beam pipeline done")
    logger.info(f"process completed in {(time() - t0):,.3f} seconds")
def load_subtable(self, csv_filepath, uid_name, uid_col_list, csv_bq, passenger_bq=None):
    """
    Load a supporting table to passengers from GCS and save it in BigQuery.

    :param csv_filepath: str input filename
    :param uid_name: str name to give the UID column
    :param uid_col_list: list of str column names to combine into the UID
    :param csv_bq: str output project.dataset.table where the data will be saved
    :param passenger_bq: str, optional BigQuery table to load passengers from;
                         if None, the previously loaded passengers_df is used
    """
    # Create a generic function that will work for both cards and addresses
    csv_path = 'gs://{}/{}'.format(self.bucket, csv_filepath)
    logger.info(f"Loading csv data from {csv_path}")
    csv_df = self.sparkql.read.csv(csv_path, header=True)

    # TODO :: Create uid from the columns listed in uid_col_list

    if passenger_bq:
        # TODO :: If passenger_bq is not None, load passengers_df from BigQuery as in Episode 3
        pass
    else:
        # TODO :: otherwise reuse the previously loaded self.passengers_df
        pass
def rename_columns(self):
    # rename columns based on the mapping below and convert all column names to lower case
    columns = {
        'TYPE REGISTRANT': 'REGISTRANT TYPE',
        'NAME': 'REGISTRANT NAME',
        'YEAR MFR': 'MFR YEAR',
        'CERT ISSUE DATE': 'ISSUE DATE',
        'STATUS CODE': 'STATUS',
        'AIR WORTH DATE': 'AIR READY DATE',
    }
    df = self.df

    logger.info("renaming master file columns")

    # rename columns based on the mapping rules above
    df.rename(columns=columns, inplace=True, errors='ignore')

    # lowercase column names and replace special characters
    mapper = {
        col: str(col).strip().lower().replace(' ', '_').replace('-', '_')
        for col in list(df.columns)
    }
    df.rename(columns=mapper, inplace=True)
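For concreteness, a quick trace of what the two renaming steps do to one header (values taken from the mapping above):

    col = 'AIR WORTH DATE'
    col = {'AIR WORTH DATE': 'AIR READY DATE'}.get(col, col)        # explicit mapping
    col = col.strip().lower().replace(' ', '_').replace('-', '_')   # lowercase + underscores
    assert col == 'air_ready_date'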
def run_simple():
    t0 = time()

    # parse command line arguments
    known_args, beam_args = runtime_args()

    # BigQuery utility
    bq_utils = BigQueryUtils()

    # pass in the pipeline options
    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        # todo: <<ADD YOUR CODE HERE>>
        # todo: call the beam transforms to call the rest API, transform records, and output them into files
        pass

    # todo: create an external table using the output files and insert records into BigQuery

    logger.info(f"process completed in {(time() - t0):,.3f} seconds")
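One way the TODO body might be filled in, mirroring the completed run_simple() pipeline shown earlier (list_dates, api_get_airlines, and the Beam DoFns are reused from it). The external-table step is only sketched as plain DDL issued through BigQueryUtils.execute(); every dataset, table, and URI name is a placeholder, not part of the exercise:

    # inside the `with beam.Pipeline(options=options) as p:` block
    days = list_dates(start_date=known_args.start_date, end_date=known_args.end_date)
    airlines = api_get_airlines(api_url=known_args.api_url, api_token=known_args.api_token)

    flights = (p
               | beam.Create(days)
               | beam.ParDo(BeamExpandDaysByAirlines(), airlines=airlines)
               | beam.ParDo(BeamGetFlights(),
                            api_url=known_args.api_url,
                            api_token=known_args.api_token)
               | beam.ParDo(BeamTransformFlights()))

    json_output = (flights
                   | beam.Map(lambda e: json.dumps(e))
                   | beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
                                         file_name_suffix='.json'))

    # after the pipeline completes: expose the JSON files as an external table and
    # copy them into a native table (names and URIs below are placeholders)
    bq_utils.execute("""
        CREATE OR REPLACE EXTERNAL TABLE my_dataset.flights_ext
        OPTIONS (format = 'NEWLINE_DELIMITED_JSON',
                 uris = ['gs://my-bucket/beam-output/flights*.json'])
    """)
    bq_utils.execute(
        "CREATE OR REPLACE TABLE my_dataset.flights AS SELECT * FROM my_dataset.flights_ext")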