Example #1
# NOTE: the imports & module-level clients below are assumed context for this
# listing; the project-specific helpers referenced throughout (e.g.
# source_bigquery_assets, generate_bigquery_assets, format_event_list,
# get_dict_value, cast_to_unix_timestamp, generate_gcs_file_list,
# generate_backfill_query, safe_convert_list_to_sql_tuple, parse_gspath,
# GetGcsFileList, WriteToPubSub) & the CLI globals used by run() (args, method,
# environment_name, environment_list, category_list, category_name,
# time_part_list, time_part_name) are defined elsewhere in the repository.
import base64
import hashlib
import json
import os
import time

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
from google.cloud import bigquery, storage

client_bq = bigquery.Client()
client_gcs = storage.Client()


def ingest_into_native_bigquery_storage(data, context):
    """ This is the primary function invoked whenever the Cloud Function is triggered.
    It parses the Pub/Sub notification that triggered it by extracting the location of the
    file in Google Cloud Storage (GCS). It subsequently downloads the contents of this file
    from GCS, sanitizes & augments the events within it & finally writes them into native BigQuery storage.
    """

    # Source required datasets & tables:
    bigquery_asset_list = [
        # (dataset, table_name, table_schema, table_partition_column)
        ('logs', f'native_events_{os.environ["ENVIRONMENT"]}', 'logs',
         'event_ds'),
        ('logs', f'native_events_debug_{os.environ["ENVIRONMENT"]}', 'logs',
         'event_ds'),
        ('logs', f'dataflow_backfill_{os.environ["ENVIRONMENT"]}', 'logs',
         'event_ds'),
        ('native', f'events_improbable_{os.environ["ENVIRONMENT"]}',
         'improbable', 'event_timestamp')
    ]

    try:
        table_logs, table_debug, _, table_function = source_bigquery_assets(
            client_bq, bigquery_asset_list)
    except Exception:
        table_logs, table_debug, _, table_function = generate_bigquery_assets(
            client_bq, bigquery_asset_list)

    # Parse payload:
    payload = json.loads(base64.b64decode(data['data']).decode('utf-8'))
    bucket_name, object_location = payload['bucket'], payload['name']
    gspath = f'gs://{bucket_name}/{object_location}'

    # Write log to events_logs_function:
    malformed, failed_insertion = False, False
    errors = client_bq.insert_rows(
        table_logs,
        format_event_list(['parse_initiated'], str,
                          os.environ['FUNCTION_NAME'], gspath))
    if errors:
        print(f'Errors while inserting logs: {str(errors)}')
        failed_insertion = True

    # Get file from GCS:
    bucket = client_gcs.get_bucket(bucket_name)
    try:
        data = bucket.get_blob(object_location).download_as_string().decode(
            'utf8')
    except UnicodeDecodeError:
        print(
            'Automatic decompressive transcoding failed, unzipping content..')
        data = gunzip_bytes_obj(
            bucket.get_blob(object_location).download_as_string()).decode(
                'utf-8')
    except Exception as e:
        raise Exception(
            f'Could not retrieve file gs://{bucket_name}/{object_location} from GCS!'
        ) from e

    # We use generators in order to save memory usage, allowing the Cloud Function to use the smallest capacity template:
    for chunk in generator_chunk(generator_split(data, '\n'), 1000):
        events_batch_function, events_batch_debug = [], []
        for event_tuple in generator_load_json(chunk):
            if event_tuple[0]:
                for event in event_tuple[1]:
                    d = dict()
                    # Sanitize:
                    d['analytics_environment'] = get_dict_value(
                        event, 'analyticsEnvironment', 'analytics_environment')
                    d['event_environment'] = get_dict_value(
                        event, 'eventEnvironment', 'event_environment')
                    d['event_source'] = get_dict_value(event, 'eventSource',
                                                       'event_source')
                    d['session_id'] = get_dict_value(event, 'sessionId',
                                                     'session_id')
                    d['version_id'] = get_dict_value(event, 'versionId',
                                                     'version_id')
                    d['batch_id'] = get_dict_value(event, 'batchId',
                                                   'batch_id')
                    d['event_id'] = get_dict_value(event, 'eventId',
                                                   'event_id')
                    d['event_index'] = get_dict_value(event, 'eventIndex',
                                                      'event_index')
                    d['event_class'] = get_dict_value(event, 'eventClass',
                                                      'event_class')
                    d['event_type'] = get_dict_value(event, 'eventType',
                                                     'event_type')
                    d['player_id'] = get_dict_value(event, 'playerId',
                                                    'player_id')
                    d['event_timestamp'] = cast_to_unix_timestamp(
                        get_dict_value(event, 'eventTimestamp',
                                       'event_timestamp'),
                        ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S %Z'])
                    d['received_timestamp'] = get_dict_value(
                        event, 'receivedTimestamp', 'received_timestamp'
                    )  # This value was set by our endpoint, so we already know it is in unixtime
                    # Augment:
                    d['inserted_timestamp'] = time.time()
                    d['job_name'] = os.environ['FUNCTION_NAME']
                    # Sanitize:
                    d['event_attributes'] = get_dict_value(
                        event, 'eventAttributes', 'event_attributes')
                    events_batch_function.append(d)
            else:
                events_batch_debug.append(event_tuple[1])

        if len(events_batch_function) > 0:
            # Write JSON to events_function:
            errors = client_bq.insert_rows(table_function,
                                           events_batch_function)
            if errors:
                print(f'Errors while inserting events: {str(errors)}')
                failed_insertion = True

        if len(events_batch_debug) > 0:
            # Write non-JSON to events_debug_function:
            errors = client_bq.insert_rows(
                table_debug,
                format_event_list(events_batch_debug, str,
                                  os.environ['FUNCTION_NAME'], gspath))
            if errors:
                print(f'Errors while inserting debug event: {str(errors)}')
                failed_insertion = True
            malformed = True

    # We only `raise` now, after the loop, so that a failure in one chunk does not stop the remaining chunks from being processed:
    if failed_insertion and malformed:
        raise Exception(
            f'Failed to insert records into BigQuery, inspect logs! Non-JSON data present in gs://{bucket_name}/{object_location}'
        )
    if failed_insertion:
        raise Exception(
            'Failed to insert records into BigQuery, inspect logs!')
    if malformed:
        raise Exception(
            f'Non-JSON data present in gs://{bucket_name}/{object_location}')

    return 200
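

# The chunked parsing above relies on small streaming helpers that are not shown
# in this snippet. The sketch below is only an assumption of how they might look,
# inferred from the way they are called; the project's actual implementations may
# differ.
import gzip
import io


def generator_split(data, delimiter):
    # Lazily yield the non-empty segments of `data` split on `delimiter`.
    for segment in data.split(delimiter):
        if segment:
            yield segment


def generator_chunk(iterable, chunk_size):
    # Group any iterable into lists of at most `chunk_size` items.
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


def generator_load_json(lines):
    # Yield (True, [events]) for lines that parse as JSON & (False, raw_line) for
    # lines that do not, so malformed data can be routed to the debug table.
    for line in lines:
        try:
            parsed = json.loads(line)
            yield True, parsed if isinstance(parsed, list) else [parsed]
        except json.JSONDecodeError:
            yield False, line


def gunzip_bytes_obj(bytes_obj):
    # Decompress a gzip'ed bytes object in memory & return the raw bytes.
    with gzip.GzipFile(fileobj=io.BytesIO(bytes_obj), mode='rb') as f:
        return f.read()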


def run():

    client_bq = bigquery.Client.from_service_account_json(args.local_sa_key, location=args.location)
    bigquery_asset_list = [
      ('logs', 'events_logs_function_native', 'event_ds'),
      ('logs', 'events_debug_function_native', 'event_ds'),
      ('logs', 'events_logs_dataflow_backfill', 'event_ds'),
      ('events', 'events_function_native', 'event_timestamp')]
    try:
        source_bigquery_assets(client_bq, bigquery_asset_list)
    except Exception:
        generate_bigquery_assets(client_bq, bigquery_asset_list)

    # https://github.com/apache/beam/blob/master/sdks/python/apache_beam/options/pipeline_options.py
    po = PipelineOptions()
    job_name = 'p1-gcs-to-bq-{method}-backfill-{environment_name}-{event_category}-{event_ds_start}-to-{event_ds_stop}-{event_time}-{ts}'.format(
      method=method,
      environment_name=environment_name,
      event_category=args.event_category.replace('_', '-'),
      event_ds_start=args.event_ds_start,
      event_ds_stop=args.event_ds_stop,
      event_time=time_part_name,
      ts=str(int(time.time())))
    # https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
    pipeline_options = po.from_dictionary({
      'project': args.gcp,
      'staging_location': 'gs://{bucket_name}/data_type=dataflow/batch/staging/{job_name}/'.format(bucket_name=args.bucket_name, job_name=job_name),
      'temp_location': 'gs://{bucket_name}/data_type=dataflow/batch/temp/{job_name}/'.format(bucket_name=args.bucket_name, job_name=job_name),
      'runner': args.execution_environment,  # {DirectRunner, DataflowRunner}
      'setup_file': args.setup_file,
      'service_account_email': 'dataflow-batch@{gcp_project_id}.iam.gserviceaccount.com'.format(gcp_project_id=args.gcp),
      'job_name': job_name,
      'region': args.gcp_region
      })
    pipeline_options.view_as(SetupOptions).save_main_session = True

    p1 = beam.Pipeline(options=pipeline_options)
    fileListGcs = (p1 | 'CreateGcsIterators' >> beam.Create(list(generate_gcs_file_list(args.bucket_name, environment_list, category_list, args.event_ds_start, args.event_ds_stop, time_part_list, args.scale_test_name)))
                      | 'GetGcsFileList' >> beam.ParDo(GetGcsFileList())
                      | 'GcsListPairWithOne' >> beam.Map(lambda x: (x, 1)))

    fileListBq = (p1 | 'ParseBqFileList' >> beam.io.Read(beam.io.BigQuerySource(
                        # "What is already in BQ?"
                        query=generate_backfill_query(
                          args.gcp,
                          method,
                          (safe_convert_list_to_sql_tuple(environment_list), environment_name),
                          (safe_convert_list_to_sql_tuple(category_list), category_name),
                          args.event_ds_start,
                          args.event_ds_stop,
                          (safe_convert_list_to_sql_tuple(time_part_list), time_part_name),
                          args.scale_test_name),
                        use_standard_sql=True))
                     | 'BqListPairWithOne' >> beam.Map(lambda x: (x['gspath'], 1)))


    parseList = ({'fileListGcs': fileListGcs, 'fileListBq': fileListBq}
                 | 'CoGroupByKey' >> beam.CoGroupByKey()
                 | 'UnionMinusIntersect' >> beam.Filter(lambda x: (len(x[1]['fileListGcs']) == 1 and len(x[1]['fileListBq']) == 0))
                 | 'ExtractKeysParseList' >> beam.Map(lambda x: x[0]))

    # Write to BigQuery:
    logsList = (parseList | 'AddParseInitiatedInfo' >> beam.Map(lambda gspath: {'job_name': job_name,
                                                                                'processed_timestamp': time.time(),
                                                                                'batch_id': hashlib.md5(gspath.encode('utf-8')).hexdigest(),
                                                                                'analytics_environment': parse_gspath(gspath, 'analytics_environment='),
                                                                                'event_category': parse_gspath(gspath, 'event_category='),
                                                                                'event_ds': parse_gspath(gspath, 'event_ds='),
                                                                                'event_time': parse_gspath(gspath, 'event_time='),
                                                                                'event': 'parse_initiated',
                                                                                'gspath': gspath})
                          | 'WriteParseInitiated' >> beam.io.WriteToBigQuery(table='events_logs_dataflow_backfill',
                                                                             dataset='logs',
                                                                             project=args.gcp,
                                                                             method='FILE_LOADS',
                                                                             create_disposition=beam.io.gcp.bigquery.BigQueryDisposition.CREATE_IF_NEEDED,
                                                                             write_disposition=beam.io.gcp.bigquery.BigQueryDisposition.WRITE_APPEND,
                                                                             insert_retry_strategy=beam.io.gcp.bigquery_tools.RetryStrategy.RETRY_ON_TRANSIENT_ERROR,
                                                                             schema='job_name:STRING,processed_timestamp:TIMESTAMP,batch_id:STRING,analytics_environment:STRING,event_category:STRING,event_ds:DATE,event_time:STRING,event:STRING,gspath:STRING'))

    # Write to Pub/Sub:
    PDone = (parseList | 'DumpParseListPubSub' >> beam.io.WriteToText('gs://{bucket_name}/data_type=dataflow/batch/output/{job_name}/parselist'.format(bucket_name=args.bucket_name, job_name=job_name))
                       | 'WriteToPubSub' >> beam.ParDo(WriteToPubSub(), job_name, args.topic, args.gcp, args.bucket_name))


    p1.run().wait_until_finish()
    return job_name
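

# parse_gspath() is referenced in the 'AddParseInitiatedInfo' step above but not
# shown here. Assuming the Hive-style object layout this pipeline uses
# (.../analytics_environment=<env>/event_category=<cat>/event_ds=<ds>/event_time=<time>/<file>),
# a minimal sketch could look like the following; the real helper may differ.
def parse_gspath(gspath, key):
    # Return the value that follows `key` (e.g. 'event_ds=') in a partitioned
    # GCS path, or None when the key is not present.
    for part in gspath.split('/'):
        if part.startswith(key):
            return part[len(key):]
    return None


# Example: parse_gspath('gs://bucket/analytics_environment=testing/event_ds=2019-01-01/file.json', 'event_ds=')
# returns '2019-01-01'.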