Example #1
        def load_task(ds, **kwargs):
            client = bigquery.Client()
            job_config = bigquery.LoadJobConfig()
            schema_path = os.path.join(
                dags_folder,
                'resources/stages/raw/schemas/{task}.json'.format(task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            schema = adjust_schema_for_kovan(dag_id, task, schema)
            job_config.schema = schema
            if file_format == 'csv':
                job_config.source_format = bigquery.SourceFormat.CSV
                job_config.skip_leading_rows = 1
            else:
                job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.allow_quoted_newlines = allow_quoted_newlines
            job_config.ignore_unknown_values = True

            export_location_uri = 'gs://{bucket}/export'.format(
                bucket=output_bucket)
            if load_all_partitions:
                uri = '{export_location_uri}/{task}/*.{file_format}'.format(
                    export_location_uri=export_location_uri,
                    task=task,
                    file_format=file_format)
            else:
                uri = '{export_location_uri}/{task}/block_date={ds}/*.{file_format}'.format(
                    export_location_uri=export_location_uri,
                    task=task,
                    ds=ds,
                    file_format=file_format)
            table_ref = client.dataset(dataset_name_raw).table(task)
            load_job = client.load_table_from_uri(uri,
                                                  table_ref,
                                                  job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'

        def seed_task():
            client = bigquery.Client()
            job_config = bigquery.LoadJobConfig()
            schema_path = os.path.join(dags_folder, 'resources/stages/seed/schemas/{task}.json'.format(task=task))
            job_config.schema = read_bigquery_schema_from_file(schema_path)
            job_config.source_format = bigquery.SourceFormat.CSV
            job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.ignore_unknown_values = True

            file_path = os.path.join(dags_folder, 'resources/stages/seed/data/{task}.csv'.format(task=task))
            table_ref = client.dataset(project='blockchain-etl-internal', dataset_id='common').table(task)
            # Open the seed file read-only and close the handle once the upload is done
            with open(file_path, mode='rb') as seed_file:
                load_job = client.load_table_from_file(seed_file, table_ref, job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'
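
These nested callables are designed to be handed to Airflow operators; the surrounding DAG-builder code is not part of the example. A minimal sketch of how load_task and seed_task might be wired up in the enclosing scope, assuming an Airflow 1.x PythonOperator, an existing dag object, and the task variable used above (all of which are assumptions, not part of the original code):

        # Hypothetical wiring -- dag, task and the callables are assumed to be
        # in scope inside the DAG-builder function.
        from airflow.operators.python_operator import PythonOperator

        load_operator = PythonOperator(
            task_id='load_{task}'.format(task=task),
            python_callable=load_task,
            provide_context=True,  # passes ds and the rest of the context into **kwargs
            dag=dag)

        seed_operator = PythonOperator(
            task_id='seed_{task}'.format(task=task),
            python_callable=seed_task,
            dag=dag)
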
Example #3
        def enrich_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = bigquery.Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)

            schema_path = os.path.join(dags_folder, 'resources/stages/enrich/schemas/{task}.json'.format(task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table = bigquery.Table(temp_table_ref, schema=schema)

            description_path = os.path.join(
                dags_folder, 'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = bigquery.QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref

            sql_path = os.path.join(dags_folder, 'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template, template_context)
            print('Enrichment sql:')
            print(sql)

            query_job = client.query(sql, location='US', job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            if load_all_partitions or always_load_all_partitions:
                # Copy temporary table to destination
                copy_job_config = bigquery.CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'
                dest_table_name = task
                dest_table_ref = client.dataset(dataset_name, project=destination_dataset_project_id).table(dest_table_name)
                copy_job = client.copy_table(temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
            else:
                # Merge
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                merge_job_config = bigquery.QueryJobConfig()
                # Finishes faster, query limit for concurrent interactive queries is 50
                merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

                merge_sql_path = os.path.join(
                    dags_folder, 'resources/stages/enrich/sqls/merge/merge_{task}.sql'.format(task=task))
                merge_sql_template = read_file(merge_sql_path)

                merge_template_context = template_context.copy()
                merge_template_context['params']['source_table'] = temp_table_name
                merge_template_context['params']['destination_dataset_project_id'] = destination_dataset_project_id
                merge_template_context['params']['destination_dataset_name'] = dataset_name
                merge_sql = kwargs['task'].render_template('', merge_sql_template, merge_template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql, location='US', job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
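
The submit_bigquery_job helper used throughout these examples is never shown. A plausible minimal sketch, assuming it only waits for the BigQuery job to finish and surfaces errors (the logging details are an assumption):

def submit_bigquery_job(job, configuration):
    # Hypothetical sketch of the undisplayed helper: block until the job
    # completes and log the job's errors before re-raising on failure.
    try:
        logging.info('Submitting job with config: %s', configuration.to_api_repr())
        job.result()  # waits for completion and raises on error
    except Exception:
        logging.error('Job errors: %s', job.errors)
        raise
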
def create_or_update_history_table(bigquery_client,
                                   dataset_name,
                                   history_table_name,
                                   table_definition,
                                   ds,
                                   public_project_id,
                                   public_dataset_name,
                                   internal_project_id,
                                   destination_project_id,
                                   sqls_folder,
                                   parse_all_partitions,
                                   time_func=time.time):
    table_name = table_definition['table']['table_name']

    schema = table_definition['table']['schema']
    parser_type = table_definition['parser'].get('type', 'log')

    schema = read_bigquery_schema_from_dict(schema, parser_type)

    # # # Create a temporary table

    dataset_name_temp = 'parse_temp'
    create_dataset(bigquery_client, dataset_name_temp)
    temp_table_name = 'temp_{table_name}_{milliseconds}' \
        .format(table_name=table_name, milliseconds=int(round(time_func() * 1000)))
    temp_table_ref = bigquery_client.dataset(dataset_name_temp).table(
        temp_table_name)

    temp_table = bigquery.Table(temp_table_ref, schema=schema)

    table_description = table_definition['table']['table_description']
    temp_table.description = table_description
    temp_table.time_partitioning = bigquery.TimePartitioning(
        field='block_timestamp')
    logging.info('Creating table: ' + json.dumps(temp_table.to_api_repr()))
    temp_table = bigquery_client.create_table(temp_table)
    assert temp_table.table_id == temp_table_name

    # # # Query to temporary table

    udf_name = 'parse_{}'.format(table_name)

    selector = abi_to_selector(parser_type, table_definition['parser']['abi'])

    parse_mode = get_parse_mode(HistoryType.HISTORY,
                                parse_all_partitions=parse_all_partitions)
    full_source_table_name = get_source_table(
        parser_type=parser_type,
        parse_mode=parse_mode,
        ds=ds,
        internal_project_id=internal_project_id,
        public_project_id=public_project_id,
        public_dataset_name=public_dataset_name,
        selector=selector)

    sql = generate_parse_sql_template(
        sqls_folder,
        parser_type,
        parse_mode,
        full_source_table_name=full_source_table_name,
        selector=selector,
        internal_project_id=internal_project_id,
        destination_project_id=destination_project_id,
        dataset_name=dataset_name,
        udf_name=udf_name,
        table_definition=table_definition,
        parse_all_partitions=parse_all_partitions,
        ds=ds)
    query(bigquery_client, sql, destination=temp_table_ref)

    # # # Copy / merge to destination

    if parse_all_partitions:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'
        dataset = create_dataset(bigquery_client, dataset_name,
                                 internal_project_id)
        dest_table_ref = dataset.table(history_table_name)
        copy_job = bigquery_client.copy_table(temp_table_ref,
                                              dest_table_ref,
                                              location='US',
                                              job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
        # Need to update the table description explicitly because the copy above
        # won't preserve it if the destination table already exists
        table = bigquery_client.get_table(dest_table_ref)
        table.description = table_description
        table = bigquery_client.update_table(table, ["description"])
        assert table.description == table_description
    else:
        merge_sql = render_merge_template(
            sqls_folder,
            table_schema=schema,
            internal_project_id=internal_project_id,
            dataset_name=dataset_name,
            destination_table_name=history_table_name,
            dataset_name_temp=dataset_name_temp,
            source_table=temp_table_name,
            ds=ds)
        query(bigquery_client, merge_sql)

    # Delete temp table
    bigquery_client.delete_table(temp_table_ref)
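
create_dataset is another helper these examples call but never define. A minimal sketch, assuming it only makes sure the dataset exists and returns it (the signature is inferred from the call sites above):

def create_dataset(client, dataset_name, project=None):
    # Hypothetical sketch: build a dataset reference, create the dataset if it
    # doesn't exist yet, and return it so .table() can be called on the result.
    dataset = bigquery.Dataset(client.dataset(dataset_name, project=project))
    return client.create_dataset(dataset, exists_ok=True)
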
Example #5
def create_or_update_table_from_table_definition(
        bigquery_client, table_definition, ds, source_project_id,
        source_dataset_name, destination_project_id, sqls_folder,
        parse_all_partitions, airflow_task):
    dataset_name = 'ethereum_' + table_definition['table']['dataset_name']
    table_name = table_definition['table']['table_name']
    table_description = table_definition['table']['table_description']
    schema = table_definition['table']['schema']
    parser = table_definition['parser']
    parser_type = parser.get('type', 'log')
    abi = json.dumps(parser['abi'])
    columns = [c.get('name') for c in schema]

    template_context = {}
    template_context['ds'] = ds
    template_context['params'] = {}
    template_context['params']['source_project_id'] = source_project_id
    template_context['params']['source_dataset_name'] = source_dataset_name
    template_context['params']['table_name'] = table_name
    template_context['params']['columns'] = columns
    template_context['params']['parser'] = parser
    template_context['params']['abi'] = abi
    if parser_type == 'log':
        template_context['params']['event_topic'] = abi_to_event_topic(
            parser['abi'])
    elif parser_type == 'trace':
        template_context['params']['method_selector'] = abi_to_method_selector(
            parser['abi'])
    template_context['params']['struct_fields'] = create_struct_string_from_schema(schema)
    template_context['params']['parse_all_partitions'] = parse_all_partitions

    contract_address = parser['contract_address']
    if not contract_address.startswith('0x'):
        contract_address_sql = replace_refs(contract_address, ref_regex,
                                            destination_project_id,
                                            dataset_name)
        template_context['params']['parser']['contract_address_sql'] = contract_address_sql

    # # # Create a temporary table

    dataset_name_temp = 'parse_temp'
    create_dataset(bigquery_client, dataset_name_temp)
    temp_table_name = 'temp_{table_name}_{milliseconds}' \
        .format(table_name=table_name, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = bigquery_client.dataset(dataset_name_temp).table(
        temp_table_name)

    temp_table = bigquery.Table(temp_table_ref,
                                schema=read_bigquery_schema_from_dict(
                                    schema, parser_type))

    temp_table.description = table_description
    temp_table.time_partitioning = bigquery.TimePartitioning(
        field='block_timestamp')
    logging.info('Creating table: ' + json.dumps(temp_table.to_api_repr()))
    temp_table = bigquery_client.create_table(temp_table)
    assert temp_table.table_id == temp_table_name

    # # # Query to temporary table

    job_config = bigquery.QueryJobConfig()
    job_config.priority = bigquery.QueryPriority.INTERACTIVE
    job_config.destination = temp_table_ref
    sql_template = get_parse_sql_template(parser_type, sqls_folder)
    sql = airflow_task.render_template('', sql_template, template_context)
    logging.info(sql)
    query_job = bigquery_client.query(sql,
                                      location='US',
                                      job_config=job_config)
    submit_bigquery_job(query_job, job_config)
    assert query_job.state == 'DONE'

    # # # Copy / merge to destination

    if parse_all_partitions:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'
        dataset = create_dataset(bigquery_client, dataset_name,
                                 destination_project_id)
        dest_table_ref = dataset.table(table_name)
        copy_job = bigquery_client.copy_table(temp_table_ref,
                                              dest_table_ref,
                                              location='US',
                                              job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
        # Need to update the table description explicitly because the copy above
        # won't preserve it if the destination table already exists
        table = bigquery_client.get_table(dest_table_ref)
        table.description = table_description
        table = bigquery_client.update_table(table, ["description"])
        assert table.description == table_description
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = bigquery.QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

        merge_sql_template = get_merge_table_sql_template(sqls_folder)
        merge_template_context = template_context.copy()
        merge_template_context['params']['source_table'] = temp_table_name
        merge_template_context['params']['destination_dataset_project_id'] = destination_project_id
        merge_template_context['params']['destination_dataset_name'] = dataset_name
        merge_template_context['params']['dataset_name_temp'] = dataset_name_temp
        merge_template_context['params']['columns'] = columns
        merge_sql = airflow_task.render_template('', merge_sql_template,
                                                 merge_template_context)
        print('Merge sql:')
        print(merge_sql)
        merge_job = bigquery_client.query(merge_sql,
                                          location='US',
                                          job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    bigquery_client.delete_table(temp_table_ref)
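
The ABI helpers referenced above (abi_to_event_topic, abi_to_method_selector) are likewise not shown. A rough sketch of what they could look like, assuming eth_utils is available and that the SQL templates expect hex strings (both assumptions):

from eth_utils import keccak

def abi_to_event_topic(event_abi):
    # Hypothetical sketch: topic0 is the keccak-256 hash of the canonical
    # event signature, e.g. 'Transfer(address,address,uint256)'.
    # Nested tuple types are not handled here.
    signature = '{}({})'.format(
        event_abi['name'], ','.join(i['type'] for i in event_abi['inputs']))
    return '0x' + keccak(text=signature).hex()

def abi_to_method_selector(function_abi):
    # Hypothetical sketch: the selector is the first 4 bytes (8 hex chars)
    # of the keccak-256 hash of the canonical function signature.
    signature = '{}({})'.format(
        function_abi['name'], ','.join(i['type'] for i in function_abi['inputs']))
    return '0x' + keccak(text=signature).hex()[:8]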