Example #1
def run():
    external_table_name = 'deb.missing_routes'
    table_name = 'deb.routes'

    bq = BigQueryUtils()

    # create external table
    logger.info(f"creating external table: {external_table_name}")
    # todo: <<ADD YOUR CODE>>. Use the BigQueryUtils class above (Example #2 below shows a completed version).

    logger.info(f"inserting missing routes...")
    # todo: <<ADD YOUR CODE>>. Use the BigQueryUtils class with a "INSERT INTO SELECT" statement

    # (optional) delete external table
    logger.info(f"deleting external table: {external_table_name}")
    bq.delete_table(external_table_name)
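Examples #1 and #2 rely on a BigQueryUtils helper class that isn't reproduced in this section. Below is a minimal sketch of the methods these examples call, assuming the google-cloud-bigquery client library; the real class may differ, and resolving partially qualified table ids such as 'deb.routes' against a default project is left as an assumption.

from google.cloud import bigquery


class BigQueryUtils:
    """Hypothetical sketch of the helper assumed by these examples."""

    def __init__(self):
        self.client = bigquery.Client()

    def execute(self, sql, query_params=None):
        # run a SQL statement and block until the job finishes
        job_config = bigquery.QueryJobConfig(query_parameters=query_params or [])
        return self.client.query(sql, job_config=job_config).result()

    def delete_table(self, table_name):
        # table_name is assumed to be fully qualified ('project.dataset.table')
        self.client.delete_table(table_name, not_found_ok=True)

    def create_external_table(self, table_name, source_uris, schema=None,
                              source_format='CSV', delete_if_exists=False,
                              skip_leading_rows=0):
        if delete_if_exists:
            self.delete_table(table_name)
        config = bigquery.ExternalConfig(source_format)
        config.source_uris = [source_uris] if isinstance(source_uris, str) else source_uris
        if schema:
            config.schema = schema
        if source_format == 'CSV':
            config.options.skip_leading_rows = skip_leading_rows
        table = bigquery.Table(table_name)
        table.external_data_configuration = config
        return self.client.create_table(table)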
Example #2
def run():
    external_table_name = 'deb.missing_routes'
    table_name = 'deb.routes'

    bq = BigQueryUtils()

    # create external table
    logger.info(f"creating external table: {external_table_name}")
    schema = [
        SchemaField('airline', 'string'),
        SchemaField('src', 'string'),
        SchemaField('dest', 'string')
    ]
    bq.create_external_table(
        external_table_name,
        source_uris='gs://deb.gcs.turalabs.com/beam/ch2ep2/output/rejects/missing-routes*.csv',
        schema=schema,
        source_format='CSV',
        delete_if_exists=True,
        skip_leading_rows=1)

    # delete previously inserted rows
    logger.info("deleting previously inserted missing routes...")
    sql = f"DELETE FROM {table_name} WHERE equipment = '-'"
    bq.execute(sql)

    logger.info(f"inserting missing routes...")
    sql = f"""INSERT INTO {table_name}
                SELECT
                  airline,
                  src,
                  dest,
                  NULL as codeshare,
                  1 as stops,
                  '-' as equipment
                FROM {external_table_name}
    """
    bq.execute(sql)

    # (optional) delete external table
    logger.info(f"deleting external table: {external_table_name}")
    bq.delete_table(external_table_name)
Example #3
def run_simple():
    t0 = time()

    # parse command line arguments
    known_args, beam_args = runtime_args()

    # BigQuery Utility
    bq_utils = BigQueryUtils()

    # pass in the pipeline options
    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        # todo: <<ADD YOUR CODE HERE>>
        # todo: call beam transforms to fetch records from the REST API, transform them, and write them out to files
        pass

    # todo: create an external table using the output files and insert records into BigQuery

    logger.info(f"process completed in {(time() - t0):,.3f} seconds")
Example #4
def run():
    t0 = now()
    # parse command line options
    known_args, beam_args = runtime_args()
    # BigQuery utility
    bq_utils = BigQueryUtils()
    # setup apache beam args
    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        # todo: finish writing this code. you can cheat and look at deb.ch2.ep3.answers
        rows = (p
                | beam.io.ReadFromText(known_args.input, skip_header_lines=1)
                | beam.ParDo(BeamReadCSV(header_cols=CSV_COLS))
                | beam.ParDo(BeamTransformRecords(),
                             date_fmt='%Y-%m-%d',
                             time_fmt='%H%M'))

        output = (rows
                  | beam.io.WriteToParquet(
                      os.path.join(known_args.output, 'flights'),
                      schema=get_schema_parquet(),
                      file_name_suffix='.parquet'))

        json_output = (
            rows
            | beam.Map(
                lambda e: {
                    k: v if k != 'flight_date' else v.strftime('%Y-%m-%d')
                    for k, v in e.items()
                })
            | beam.Map(lambda e: json.dumps(e))
            | beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
                                  file_name_suffix='.json'))

    logger.info(f"total time: {(now() - t0):,.6f} seconds")
Example #5
def run():
    t0 = now()

    # parse command line options
    known_args, beam_args = runtime_args()

    # BigQuery utility
    bq_utils = BigQueryUtils()

    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        rows = (p
                | beam.io.ReadFromText(known_args.input, skip_header_lines=1)
                | beam.ParDo(BeamReadCSV(header_cols=FLIGHTS_CSV_COLUMNS))
                | beam.ParDo(BeamTransformRecords(),
                             date_fmt='%Y-%m-%d',
                             time_fmt='%H%M'))

        # load the routes table into a lookup dict
        sql = f"""select airline, src, dest from {known_args.routes_table}"""
        routes = bq_utils.execute_as_dict(sql,
                                          keycols=['airline', 'src', 'dest'])

        # lookup routes
        rows, routes_rejects, missing_routes = (
            rows
            | beam.ParDo(BeamLookupRoute(), routes=routes).with_outputs(
                'rejects', 'missing_routes', main='main'))

        # write parquet output files
        output = (rows
                  | beam.io.WriteToParquet(
                      os.path.join(known_args.output, 'flights'),
                      schema=datamodel_flights_parquet_schema(),
                      file_name_suffix='.parquet'))

        # write missing routes to another output as CSV
        output_routes = (
            missing_routes
            | "gbr" >> beam.GroupByKey()  # calculate distinct missing routes
            | "missing_routes_csv" >> beam.Map(
                lambda e: ','.join(list(e[0]))
            )  # csv output the key (e[0] of key value tuple) which is (airline,src,dest)
            | "missing_routes_out" >> beam.io.WriteToText(
                os.path.join(known_args.output, 'rejects/missing-routes'),
                file_name_suffix='.csv',
                header='airline,src,dest'))

        # alternative: write (simple) newline-delimited json output files.
        #              json is a very flexible output format for bigquery and other big data tools,
        #              but it is much slower to write and larger on disk than binary formats such as
        #              Parquet, ORC, or Avro. it offers schema flexibility for smaller data files;
        #              larger outputs should use Avro, Parquet, or ORC. Avro provides the fastest
        #              writes, while Parquet and ORC give faster reads for analytical queries.

        # output = (rows
        #           | beam.Map(lambda e: {k: v if k != 'flight_date' else v.strftime('%Y-%m-%d') for k, v in e.items()})  # convert flight_date back to string type for json conversion
        #           | beam.Map(lambda e: json.dumps(e))  # json dump row
        #           | beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
        #                                 file_name_suffix='.json')
        #           )

    logger.info("beam pipiline completed.")

    # create bigquery external table and insert into bq flights table
    bq_utils.create_external_table(known_args.flights_ext_table,
                                   source_uris=os.path.join(
                                       known_args.output, "flights*.parquet"),
                                   source_format='PARQUET',
                                   delete_if_exists=True)

    # create and replace existing bigquery flights table
    bq_utils.create_table(known_args.flights_table,
                          schema=datamodel_flights_bigquery_schema(),
                          delete_if_exists=True)

    # insert into table as select (ITAS) statement
    sql = f"""
        INSERT INTO `{known_args.flights_table}`
        SELECT
          a.day_of_week,
          a.flight_date,
          a.airline,
          a.tailnumber,
          a.flight_number,
          a.src,
          a.src_city,
          a.src_state,
          a.dest,
          a.dest_city,
          a.dest_state,
          PARSE_TIME('%H:%M:%S', a.departure_time) as departure_time,
          PARSE_TIME('%H:%M:%S', a.actual_departure_time) as actual_departure_time,
          a.departure_delay,
          a.taxi_out,
          PARSE_TIME('%H:%M:%S', a.wheels_off) as wheels_off,
          PARSE_TIME('%H:%M:%S', a.wheels_on) as wheels_on,
          a.taxi_in,
          PARSE_TIME('%H:%M:%S', a.arrival_time) as arrival_time,
          PARSE_TIME('%H:%M:%S', a.actual_arrival_time) as actual_arrival_time,
          a.arrival_delay,
          a.cancelled,
          a.cancellation_code,
          a.flight_time,
          a.actual_flight_time,
          a.air_time,
          a.flights,
          a.distance,
          a.airline_delay,
          a.weather_delay,
          a.nas_delay,
          a.security_delay,
          a.late_aircraft_delay
          -- , CONCAT(a.flight_date, '_', a.airline, '_', a.flight_number) AS flightDate_airline_flightNumber
        FROM
          `{known_args.flights_ext_table}`  a
        """
    # insert records from the parquet external table into the final bq-managed flights table
    r = bq_utils.execute(sql)

    logger.info(f"total time: {(now() - t0):,.6f} secs")
Example #6
def run_with_lookups():
    t0 = time()

    # parse command line arguments
    known_args, pipeline_args = runtime_args()

    # BigQuery Utility
    bq_utils = BigQueryUtils()

    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        # pre-process: create a list of dates to process and get other side-inputs
        # create a list of flight dates to retrieve from the api
        days = list_dates(start_date=known_args.start_date, end_date=known_args.end_date)

        # get airline iata codes from the api
        airlines = api_get_airlines(api_url=known_args.api_url, api_token=known_args.api_token)

        # read airports into a lookup dict with iata code as dict key
        sql = f"""SELECT iata, city, state, lat, long, tz, utc_offset FROM {known_args.airports_table}"""
        airports = bq_utils.execute_as_dict(sql, keycols='iata')

        # read routes into a lookup dict with (airline, src, dest) as dict key
        # more info about bq parameterized queries: https://cloud.google.com/bigquery/docs/parameterized-queries
        sql = f"""SELECT airline, src, dest FROM {known_args.routes_table} WHERE airline in UNNEST(@airlines)"""
        routes = bq_utils.execute_as_dict(sql, keycols=['airline', 'src', 'dest'],
                                          query_params=[bigquery.ArrayQueryParameter("airlines", "STRING", airlines)])

        # create a beam collection with all days and airlines to get flights for
        input_rows = (p
                      | beam.Create(days)
                      | beam.ParDo(BeamExpandDaysByAirlines(), airlines=airlines)
                      )

        # call flights api to get flights for each record above and
        # call the beam transforms to process the input flights
        flights = (input_rows
                   | beam.ParDo(BeamGetFlights(), api_url=known_args.api_url, api_token=known_args.api_token)
                   | beam.ParDo(BeamTransformFlights())
                   )

        # lookup src/dest airports
        flights, airport_rejects, missing_airports = \
            flights | beam.ParDo(BeamLookupAirport(), airports=airports).with_outputs('rejects', 'missing_airport', main='main')

        # lookup routes
        flights, routes_rejects, missing_routes = \
            flights | beam.ParDo(BeamLookupRoute(), routes=routes).with_outputs('rejects', 'missing_route', main='main')

        # write main flight output. records transformed and lookup checks completed
        (flights
         | "flights_json" >> beam.Map(lambda e: json.dumps(e))
         | "flights_output" >> beam.io.WriteToText(os.path.join(known_args.output, 'flights'), file_name_suffix='.json')
         )

        # write out rejects
        # airport rejects
        (airport_rejects
         | "airport_rejects_json" >> beam.Map(lambda e: json.dumps(e))
         | "airport_rejects_output" >> beam.io.WriteToText(os.path.join(known_args.output, 'rejects/airport_rejects'), file_name_suffix='.json')
         )

        # routes rejects
        (routes_rejects
         | "route_rejects_json" >> beam.Map(lambda e: json.dumps(e))
         | "route_rejects_output" >> beam.io.WriteToText(os.path.join(known_args.output, 'rejects/routes-rejects'), file_name_suffix='.json')
         )

        # missing airports
        (missing_airports
         | "gba" >> beam.GroupByKey()
         | "missing_airports_csv" >> beam.Map(lambda e: str(e[0]))
         | "missing_airport_out" >> beam.io.WriteToText(os.path.join(known_args.output, 'rejects/missing_airports'),
                                                        file_name_suffix='.csv',
                                                        header='iata')
         )

        (missing_routes
         | "gbr" >> beam.GroupByKey()
         | "missing_routes_csv" >> beam.Map(lambda e: ','.join(list(e[0])))
         | "missing_routes_out" >> beam.io.WriteToText(os.path.join(known_args.output, 'rejects/missing_routes'),
                                                       file_name_suffix='.csv',
                                                       header='airline,src,dest')
         )

    logger.info("apache beam pipeline done")

    # create bigquery external table
    logger.info("dropping and creating bigquery external table...")
    bq_utils.create_external_table(known_args.flights_ext_table,
                                   source_uris=os.path.join(known_args.output, "flights*.json"),
                                   schema=FUTURE_FLIGHTS_BIGQUERY_SCHEMA,
                                   delete_if_exists=True)

    # delete existing flights greater than start_date
    sql = f"""DELETE FROM {known_args.flights_table} WHERE flight_date >= '{datetime.strftime(known_args.start_date, '%Y-%m-%d')}'"""
    bq_utils.execute(sql)

    # insert flights records into final table
    sql = f"""INSERT INTO {known_args.flights_table}
                        (SELECT
                          a.day_of_week,
                          a.flight_date,
                          a.airline,
                          a.tailnumber,
                          a.flight_number,
                          a.src,
                          b.city as src_city,
                          b.state as src_state,
                          a.dest,
                          c.city as dest_city,
                          c.state as dest_state,
                          a.departure_time,
                          NULL as actual_departure_time,
                          NULL as departure_delay,
                          NULL as taxi_out,
                          NULL as wheels_off,
                          NULL as wheels_on,
                          NULL as taxi_in,
                          a.arrival_time,
                          NULL as actual_arrival_time,
                          NULL as arrival_delay,
                          False as cancelled,
                          NULL as cancellation_code,
                          a.flight_time,
                          NULL as actual_flight_time,
                          NULL as air_time,
                          1 as flights,
                          a.distance,
                          NULL as airline_delay,
                          NULL as weather_delay,
                          NULL as nas_delay,
                          NULL as security_delay,
                          NULL as late_aircraft_delay
                          -- , CONCAT(a.flight_date, '_', a.airline, '_', a.flight_number) as flightDate_airline_flightNumber
                         FROM {known_args.flights_ext_table} a
                          LEFT JOIN {known_args.airports_table} b on a.src = b.iata
                          LEFT JOIN {known_args.airports_table} c on a.dest = c.iata
                        )"""
    bq_utils.execute(sql)

    logger.info(f"process completed in {(time() - t0):,.3f} seconds")
Example #7
def run_simple():
    t0 = time()

    # parse command line arguments
    known_args, beam_args = runtime_args()

    # BigQuery Utility
    bq_utils = BigQueryUtils()

    # pass in the pipeline options
    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        # pre-process: create a list of dates to process and get other side-inputs
        # create a list of flight dates to retrieve from api
        days = list_dates(start_date=known_args.start_date, end_date=known_args.end_date)

        # get airline iata codes from the api
        airlines = api_get_airlines(api_url=known_args.api_url, api_token=known_args.api_token)

        # create a beam collection with all days and airlines to get flights for
        input_rows = (p
                      | beam.Create(days)
                      | beam.ParDo(BeamExpandDaysByAirlines(), airlines=airlines)
                      )

        # call flights api to get flights for each record above and
        # call the beam transforms to process the input flights
        flights = (input_rows
                   | beam.ParDo(BeamGetFlights(), api_url=known_args.api_url, api_token=known_args.api_token)
                   | beam.ParDo(BeamTransformFlights())
                   )

        # prepare & write output files
        json_output = (flights
                       | beam.Map(lambda e: json.dumps(e))
                       | beam.io.WriteToText(os.path.join(known_args.output, 'flights'), file_name_suffix='.json')
                       )

    logger.info("apache beam pipeline done")

    # create bigquery external table
    logger.info("dropping and creating bigquery external table...")
    bq_utils.create_external_table(known_args.flights_ext_table,
                                   source_uris=os.path.join(known_args.output, "flights*.json"),
                                   schema=FUTURE_FLIGHTS_BIGQUERY_SCHEMA,
                                   delete_if_exists=True)

    # delete existing flights greater than start_date
    sql = f"""DELETE FROM {known_args.flights_table} WHERE flight_date >= '{datetime.strftime(known_args.start_date, '%Y-%m-%d')}'"""
    bq_utils.execute(sql)

    # insert flights records into final table
    sql = f"""INSERT INTO {known_args.flights_table}
                    (SELECT
                      a.day_of_week,
                      a.flight_date,
                      a.airline,
                      a.tailnumber,
                      a.flight_number,
                      a.src,
                      b.city as src_city,
                      b.state as src_state,
                      a.dest,
                      c.city as dest_city,
                      c.state as dest_state,
                      a.departure_time,
                      NULL as actual_departure_time,
                      NULL as departure_delay,
                      NULL as taxi_out,
                      NULL as wheels_off,
                      NULL as wheels_on,
                      NULL as taxi_in,
                      a.arrival_time,
                      NULL as actual_arrival_time,
                      NULL as arrival_delay,
                      False as cancelled,
                      NULL as cancellation_code,
                      a.flight_time,
                      NULL as actual_flight_time,
                      NULL as air_time,
                      1 as flights,
                      a.distance,
                      NULL as airline_delay,
                      NULL as weather_delay,
                      NULL as nas_delay,
                      NULL as security_delay,
                      NULL as late_aircraft_delay
                      -- , CONCAT(a.flight_date, '_', a.airline, '_', a.flight_number) as flightDate_airline_flightNumber
                     FROM {known_args.flights_ext_table} a
                      LEFT JOIN {known_args.airports_table} b on a.src = b.iata
                      LEFT JOIN {known_args.airports_table} c on a.dest = c.iata
                    )"""
    bq_utils.execute(sql)

    logger.info(f"process completed in {(time() - t0):,.3f} seconds")
Example #8
def run():
    t0 = now()

    # parse command line options
    known_args, beam_args = runtime_args()

    # BigQuery utility
    bq_utils = BigQueryUtils()

    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        rows = (p
                | beam.io.ReadFromText(known_args.input, skip_header_lines=1)
                | beam.ParDo(BeamReadCSV(header_cols=FLIGHTS_CSV_COLUMNS))
                | beam.ParDo(BeamTransformRecords(), date_fmt='%Y-%m-%d', time_fmt='%H%M')
                )

        # write parquet output files
        output = (rows
                  | beam.io.WriteToParquet(os.path.join(known_args.output, 'flights'),
                                           schema=datamodel_flights_parquet_schema(), file_name_suffix='.parquet')
                  )

    logger.info("beam pipiline completed.")

    # create bigquery external table and insert into bq flights table
    bq_utils.create_external_table(known_args.flights_ext_table,
                                   source_uris=os.path.join(known_args.output, "flights*.parquet"),
                                   source_format='PARQUET',
                                   delete_if_exists=True)
    # create and replace existing bigquery flights table
    bq_utils.create_table(known_args.flights_table, schema=datamodel_flights_bigquery_schema(), delete_if_exists=True)

    # insert into table as select (ITAS) statement
    sql = f"""
        INSERT INTO `{known_args.flights_table}`
        SELECT
          a.day_of_week,
          a.flight_date,
          a.airline,
          a.tailnumber,
          a.flight_number,
          a.src,
          a.src_city,
          a.src_state,
          a.dest,
          a.dest_city,
          a.dest_state,
          PARSE_TIME('%H:%M:%S', a.departure_time) as departure_time,
          PARSE_TIME('%H:%M:%S', a.actual_departure_time) as actual_departure_time,
          a.departure_delay,
          a.taxi_out,
          PARSE_TIME('%H:%M:%S', a.wheels_off) as wheels_off,
          PARSE_TIME('%H:%M:%S', a.wheels_on) as wheels_on,
          a.taxi_in,
          PARSE_TIME('%H:%M:%S', a.arrival_time) as arrival_time,
          PARSE_TIME('%H:%M:%S', a.actual_arrival_time) as actual_arrival_time,
          a.arrival_delay,
          a.cancelled,
          a.cancellation_code,
          a.flight_time,
          a.actual_flight_time,
          a.air_time,
          a.flights,
          a.distance,
          a.airline_delay,
          a.weather_delay,
          a.nas_delay,
          a.security_delay,
          a.late_aircraft_delay
          -- , CONCAT(a.flight_date, '_', a.airline, '_', a.flight_number) AS flightDate_airline_flightNumber
        FROM
          `{known_args.flights_ext_table}`  a
        """
    # insert records from the parquet external table into the final bq-managed flights table
    r = bq_utils.execute(sql)

    logger.info(f"total time: {(now() - t0):,.6f} secs")