Example #1
def rollback_object(google_key_path, object_type, dataset_name,
                    dataset_backup_name, table_name, table_backup_name):

    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = google_key_path
    client = bigquery.Client()

    if object_type == 'table':
        # Check that the backup table exists before dropping the original table
        dataset_ref = client.dataset(dataset_name)
        dataset = client.get_dataset(dataset_ref)

        backup_table_ref = dataset.table(table_backup_name)

        if table_exists(client, backup_table_ref):
            #Delete old table
            print("Backup table {} exists".format(table_backup_name))
            print("Dropping table {}".format(table_name))
            table_ref = dataset.table(table_name)
            table = client.get_table(table_ref)
            client.delete_table(table)

            #Copy backup table to table with the old name
            print("Copying {} to {}".format(table_backup_name, table_name))
            job_config = bigquery.CopyJobConfig()
            job = client.copy_table(backup_table_ref,
                                    table_ref,
                                    job_config=job_config)  # API request
            job.result()  # Waits for job to complete.

            #Delete backup table
            print("Dropping table {}".format(table_backup_name))
            backup_table = client.get_table(backup_table_ref)
            client.delete_table(backup_table)
Example #2
def copy_table(dataset_id, table_id, new_table_id, project=None):
    """
    Copies a table.If no project is specified, then the currently active project is used.
    """
    bigquery_client = bigquery.Client(project=project)
    dataset_ref = bigquery_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)

    # This sample shows the destination table in the same dataset and project,
    # however, it's possible to copy across datasets and projects. You can
    # also copy multiple source tables into a single destination table by
    # providing additional arguments to `copy_table`.
    destination_table_ref = dataset_ref.table(new_table_id)

    # Create a job to copy the table to the destination table.
    # Start by creating a job configuration
    job_config = bigquery.CopyJobConfig()

    # Configure the job to create the table if it doesn't exist.
    job_config.create_disposition = (
        bigquery.job.CreateDisposition.CREATE_IF_NEEDED)

    copy_job = bigquery_client.copy_table(table_ref,
                                          destination_table_ref,
                                          job_config=job_config)

    log.info('Waiting for job to finish...')
    copy_job.result()
    log.info('Table {} copied to {}.'.format(table_id, new_table_id))
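
The comment in the example above notes that copy_table also accepts multiple source tables. Below is a minimal, hedged sketch of that variant; the project, dataset, and table names are placeholders rather than part of the original example.

from google.cloud import bigquery


def copy_shards_into_one(project, dataset_id, shard_table_ids, dest_table_id):
    """Copy several source tables into a single destination table (illustrative sketch)."""
    client = bigquery.Client(project=project)
    sources = ["{}.{}.{}".format(project, dataset_id, t) for t in shard_table_ids]
    destination = "{}.{}.{}".format(project, dataset_id, dest_table_id)

    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # copy_table accepts either a single source or a sequence of sources.
    job = client.copy_table(sources, destination, job_config=job_config)
    job.result()  # Waits for the copy job to complete.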
Example #3
def _copy_join_parts(client, stable_table, query_jobs):
    total_bytes = sum(query.total_bytes_processed for query in query_jobs)
    if query_jobs[0].dry_run:
        api_repr = json.dumps(query_jobs[0].to_api_repr())
        if len(query_jobs) > 1:
            logging.info(
                f"Would process {total_bytes} bytes: [{api_repr},...]")
            logging.info(
                f"Would copy {len(query_jobs)} results to {stable_table}")
        else:
            logging.info(f"Would process {total_bytes} bytes: {api_repr}")
    else:
        logging.info(
            f"Processed {total_bytes} bytes to populate {stable_table}")
        if len(query_jobs) > 1:
            partition_id = stable_table.table_id.split("$", 1)[1]
            sources = [
                f"{sql_table_id(job.destination)}${partition_id}"
                for job in query_jobs
            ]
            job_config = bigquery.CopyJobConfig(
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)
            copy_job = client.copy_table(sources,
                                         stable_table,
                                         job_config=job_config)
            copy_job.result()
            logging.info(
                f"Copied {len(query_jobs)} results to populate {stable_table}")
            for job in query_jobs:
                client.delete_table(job.destination)
            logging.info(f"Deleted {len(query_jobs)} temporary tables")
Example #4
def test_copy_table_cmek(client, to_delete):
    dataset_id = 'copy_table_cmek_{}'.format(_millis())
    dest_dataset = bigquery.Dataset(client.dataset(dataset_id))
    dest_dataset = client.create_dataset(dest_dataset)
    to_delete.append(dest_dataset)

    # [START bigquery_copy_table_cmek]
    source_dataset = bigquery.DatasetReference('bigquery-public-data',
                                               'samples')
    source_table_ref = source_dataset.table('shakespeare')

    # dataset_id = 'my_dataset'
    dest_dataset_ref = client.dataset(dataset_id)
    dest_table_ref = dest_dataset_ref.table('destination_table')

    # Set the encryption key to use for the destination.
    # TODO: Replace this key with a key you have created in KMS.
    kms_key_name = 'projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}'.format(
        'cloud-samples-tests', 'us-central1', 'test', 'test')
    encryption_config = bigquery.EncryptionConfiguration(
        kms_key_name=kms_key_name)
    job_config = bigquery.CopyJobConfig()
    job_config.destination_encryption_configuration = encryption_config

    job = client.copy_table(source_table_ref,
                            dest_table_ref,
                            job_config=job_config)  # API request
    job.result()  # Waits for job to complete.

    assert job.state == 'DONE'
    dest_table = client.get_table(dest_table_ref)
    assert dest_table.encryption_configuration.kms_key_name == kms_key_name
    # [END bigquery_copy_table_cmek]

    to_delete.insert(0, dest_table)
Example #5
def copy_table_cmek(dest_table_id, orig_table_id, kms_key_name):

    # [START bigquery_copy_table_cmek]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set dest_table_id to the ID of the destination table.
    # dest_table_id = "your-project.your_dataset.your_table_name"

    # TODO(developer): Set orig_table_id to the ID of the original table.
    # orig_table_id = "your-project.your_dataset.your_table_name"

    # Set the encryption key to use for the destination.
    # TODO(developer): Replace this key with a key you have created in KMS.
    # kms_key_name = "projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}".format(
    #     your-project, location, your-ring, your-key
    # )

    job_config = bigquery.CopyJobConfig(
        destination_encryption_configuration=bigquery.EncryptionConfiguration(
            kms_key_name=kms_key_name))
    job = client.copy_table(orig_table_id,
                            dest_table_id,
                            job_config=job_config)
    job.result()  # Wait for the job to complete.

    dest_table = client.get_table(dest_table_id)  # Make an API request.
    if dest_table.encryption_configuration.kms_key_name == kms_key_name:
        print("A copy of the table created")
Example #6
def create_job(client):
    return client.copy_table(
        sources,
        dest,
        bigquery.CopyJobConfig(
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
        ),
    )
Example #7
    def config_job(self, overwrite=True):
        job_config = bigquery.CopyJobConfig()

        if overwrite:
            job_config.write_disposition = 'WRITE_TRUNCATE'
        else:
            job_config.write_disposition = 'WRITE_APPEND'

        return job_config
Example #8
def insert_into_bigquery(client, dataset_id, stg_table_id, master_table_id):
    try:
        print('>>> inserting data from {} to {} table'.format(stg_table_id, master_table_id))
        dataset_ref = client.dataset(dataset_id)
        stg_table_ref = dataset_ref.table(stg_table_id)
        master_table_ref = dataset_ref.table(master_table_id)

        master_rows_number = client.get_table(master_table_ref).num_rows
        print(">>> Number of rows in master table before delete and insert operation = {}".format(master_rows_number))
        print(">>> Deleting the duplicate rows from master table that are also present in staging table.....")
        delete_duplicate_query = """DELETE FROM {}.{} 
           	                  WHERE EXISTS (
                	          SELECT * from {}.{} WHERE 
                                  {}.date = {}.date AND
                                  {}.ViewID = {}.viewID )""".format(dataset_id, master_table_id,
                                                                    dataset_id, stg_table_id,
                                                                    master_table_id, stg_table_id,
                                                                    master_table_id, stg_table_id)
        print(">>> " + delete_duplicate_query)
        delete_query_job = client.query(delete_duplicate_query)
        delete_query_job.result()
        # DML statements don't return rows; the affected-row count comes from
        # num_dml_affected_rows on the finished query job.
        deleted_rows = delete_query_job.num_dml_affected_rows
        print(">>> Number of deleted rows = {}".format(deleted_rows))

        """Insert the data in staging table to master table"""
        print(">>> Inserting data from staging table to master......")
        stg_master_copy_job_config = bigquery.CopyJobConfig()
        stg_master_copy_job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
        stg_master_copy_job_config.schema_update_options = [
            bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
            bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
        ]
        schema_config = []

        for schema in schemas:
            schema_config.append(eval(schema))
            stg_master_copy_job_config.schema = schema_config

        stg_master_copy_job = client.copy_table(
            stg_table_ref,
            master_table_ref,
            location='US',
            job_config=stg_master_copy_job_config
        )
        stg_master_copy_job.result()
        print(">>> Number of rows inserted = {}".format(client.get_table(stg_table_ref).num_rows))
        print(">>> Total number of rows in master table after delete and insert operation = {}".format(
            client.get_table(master_table_ref).num_rows))
    except Exception as e:
        pprint('>>> Error occurred while inserting into the Master BigQuery Table \n Error:::: %s ' % e)
        type_, value_, traceback_ = sys.exc_info()
        pprint(traceback.format_tb(traceback_))
        print(type_, value_)
        sys.exit(1)
Example #9
def copy_table():
    """Copy temporary table to final destination"""

    logging.info("copy table")
    dataset_ref = bigquery.DatasetReference(GCP_PROJECT, DATASET_NAME)
    table_ref = bigquery.TableReference(dataset_ref, TABLE_NAME)
    copyjob_config = bigquery.CopyJobConfig()
    copyjob_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    copyjob_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    bq.copy_table(temp_table_ref, table_ref, job_config=copyjob_config)
Example #10
def deploy_features_table():
    """Copy the UNION temp features table to final destination"""

    logging.info("copy table")
    target_dataset_ref = bigquery.DatasetReference(GCP_PROJECT,
                                                   BQ_TARGET_DATASET)
    target_table_ref = bigquery.TableReference(target_dataset_ref, 'features')
    copyjob_config = bigquery.CopyJobConfig()
    copyjob_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    copyjob_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    bq.copy_table(temp_table_ref, target_table_ref, job_config=copyjob_config)
Example #11
def copy_table(dataset_id, table_id, new_table_id, project):
        bigquery_client = bigquery.Client(project=project)
        dataset_ref = bigquery_client.dataset(dataset_id)
        table_ref = dataset_ref.table(table_id)
        destination_table_ref = dataset_ref.table(new_table_id)
        job_config = bigquery.CopyJobConfig()
        job_config.create_disposition = (bigquery.job.CreateDisposition.CREATE_IF_NEEDED)
        copy_job = bigquery_client.copy_table(table_ref, destination_table_ref, job_config=job_config)
        print('Waiting for job to finish...')
        copy_job.result()
        print('Table {} copied to {}.'.format(table_id, new_table_id))
Example #12
        def merge_task(ds, **kwargs):
            client = bigquery.Client()

            dataset = create_dataset(client, dataset_name, project=destination_dataset_project_id)

            load_all_partitions_for_table = load_all_partitions
            if load_all_partitions_for_table is None:
                table_ref = dataset.table(task)
                table_exists = does_table_exist(client, table_ref)
                load_all_partitions_for_table = not table_exists
                logging.info('load_all_partitions for table {} is set to {}'.format(task, str(load_all_partitions_for_table)))

            if load_all_partitions_for_table:
                # Copy temporary table to destination
                copy_job_config = bigquery.CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'
                dest_table_ref = dataset.table(task)
                temp_table_ref = client.dataset(dataset_name_temp).table(task)
                copy_job = client.copy_table(temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
            else:
                merge_job_config = bigquery.QueryJobConfig()
                # Finishes faster, query limit for concurrent interactive queries is 50
                merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

                merge_sql_path = os.path.join(
                    dags_folder, 'zilliqa_resources/stages/load/sqls/merge.sql'.format(task=task))
                merge_sql_template = read_file(merge_sql_path)

                schema_path = os.path.join(dags_folder, 'zilliqa_resources/stages/load/schemas/{task}.json'.format(task=task))
                schema = read_bigquery_schema_from_file(schema_path)

                merge_template_context = {
                    'ds': ds,
                    'table': task,
                    'destination_dataset_project_id': destination_dataset_project_id,
                    'destination_dataset_name': dataset_name,
                    'dataset_name_temp': dataset_name_temp,
                    'table_schema': schema,
                    'time_partitioning_field': time_partitioning_field,
                }

                merge_sql = kwargs['task'].render_template('', merge_sql_template, merge_template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql, location='US', job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'
Example #13
def create_view(dataset_name, view_name, project, viewSQL):

    f = Figlet(font="slant")
    print(f.renderText(view_name.replace("_", " ")))

    view_description = get_view_description(view_name)

    bigquery_client = bigquery.Client(project=project)

    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = "WRITE_TRUNCATE"

    dataset = bigquery_client.dataset(dataset_name)
    table_ref = dataset.table(view_name)
    table = bigquery.Table(table_ref)
    table.view_query = viewSQL
    table.view_use_legacy_sql = False
    table.description = view_description["metric_description"]

    try:
        # Delete the view if it exists
        bigquery_client.delete_table(table)
        with indent(4, quote="* "):
            puts(colored.blue("Existing view deleted"))
    except Exception as err:
        with indent(4, quote="* "):
            puts(colored.red("View doesn't exist"))

    try:
        # Create the new view
        bigquery_client.create_table(table)
        with indent(4, quote="* "):
            puts(
                colored.blue("View created : {}".format(".".join(
                    [project, dataset_name, view_name]))))
    except Exception as err:
        print(err)
        with indent(4, quote="* "):
            puts(colored.red("Error: Couldn't create view"))
        return False

    view_update = bigquery.Table(
        table_ref, schema=get_schema_from_description(view_description))
    view_update = bigquery_client.update_table(view_update, ["schema"])
    with indent(4, quote="* "):
        puts(colored.blue("Updated the view schema descriptions"))

    return True
Example #14
def test_copy_table(client, to_delete):
    DATASET_ID = 'copy_table_dataset_{}'.format(_millis())

    # [START copy_table]
    source_dataset = bigquery.DatasetReference(
        'bigquery-public-data', 'samples')
    source_table_ref = source_dataset.table('shakespeare')

    dest_dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    dest_dataset = client.create_dataset(dest_dataset)  # API request
    dest_table_ref = dest_dataset.table('destination_table')

    job_config = bigquery.CopyJobConfig()
    job = client.copy_table(
        source_table_ref, dest_table_ref, job_config=job_config)  # API request
    job.result()  # Waits for job to complete.

    assert job.state == 'DONE'
    dest_table = client.get_table(dest_table_ref)  # API request
    assert dest_table.table_id == 'destination_table'
    # [END copy_table]

    to_delete.append(dest_dataset)
    to_delete.insert(0, dest_table)
Example #15
def copy_single_table(bq_client, src_table, dst_table, overwrite):
    """Copy a single day of ga_sessions."""
    job_config = bigquery.CopyJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
        if overwrite else bigquery.WriteDisposition.WRITE_EMPTY)
    try:
        copy_job = bq_client.copy_table(
            sources=src_table,
            destination=dst_table,
            job_config=job_config,
        )
    except exceptions.NotFound:
        print(f"{src_table} not found, copy skipped")
        return

    try:
        copy_job.result()
    except exceptions.Conflict:
        print(f"{dst_table} already exists, copy skipped")
Example #16
def copy_tables_to_public_dataset():
    """copies all tables from internal dataset to BigQuery public dataset

    Service account which is used for access to public dataset project is stored in GCS.

    """

    gcs = storage.Client(project=GCP_PROJECT)

    bucket = gcs.bucket(GCS_BUCKET)
    sa_blob = bucket.blob(SERVICE_ACCOUNT_FILENAME)
    sa_blob.reload()
    f = BytesIO()
    sa_blob.download_to_file(f)
    sa_content = f.getvalue().decode('utf8')
    f.close()
    sa_content = json.loads(sa_content)
    credentials = service_account.Credentials.from_service_account_info(
        sa_content)

    bq = bigquery.Client(project=GCP_PROJECT, credentials=credentials)

    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED

    source_dataset = bigquery.DatasetReference(GCP_PROJECT, BQ_DATASET)
    destination_dataset = bigquery.DatasetReference(BQ_PUBLIC_PROJECT,
                                                    BQ_PUBLIC_DATASET)
    table_refs = []
    tables_res = bq.list_tables(source_dataset)
    for table_ref in tables_res:
        table_refs.append(table_ref)

    for table_ref in table_refs:
        table_name = table_ref.table_id
        source_table = bigquery.TableReference(source_dataset, table_name)
        destination_table = bigquery.TableReference(destination_dataset,
                                                    table_name)
        bq.copy_table([source_table], destination_table, job_config=job_config)
        logging.info(f"copying table {table_name} to public dataset")
Example #17
def copy_tables(client, project_id, target_project_id, source_dataset,
                target_dataset, table_names):
    '''
    Copy the given tables from the source project/dataset to the target project/dataset.

    Args:
    client (google.cloud.bigquery.client.Client): BigQuery client
    project_id (STRING): source GCP project ID
    target_project_id (STRING): target GCP project ID
    source_dataset (STRING): source dataset name
    target_dataset (STRING): target dataset name
    table_names (list): list of table names to copy
    Return:
    None
    '''

    # Prepare the copy job configuration
    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = "WRITE_TRUNCATE"

    # Copy each table in the list
    for table_name in table_names:

        # Build the source and target table names
        source_table = "{}.{}.{}".format(project_id, source_dataset,
                                         table_name)
        target_table = "{}.{}.{}".format(target_project_id, target_dataset,
                                         table_name)
        try:
            # Check that the source table exists; if it doesn't, log it
            table_ref = client.dataset(source_dataset).table(table_name)
            table_check = client.get_table(table_ref)
            # Create the copy job
            copy_job = client.copy_table(source_table,
                                         target_table,
                                         job_config=job_config)
            # Wait for the copy job to finish
            copy_job.result()
        except NotFound:
            logging.info(table_name + " does not exist")
    return
Example #18
    def execute_query_with_schema_and_target(self,
                                             query,
                                             targetDataset,
                                             targetTable,
                                             schemaDataset=None,
                                             schemaTable=None):

        #
        # Reset the table based on schema
        writeDisposition = 'WRITE_TRUNCATE'
        if schemaDataset is not None and schemaTable is not None:
            copyJobConfig = bigquery.CopyJobConfig()
            copyJobConfig.write_disposition = 'WRITE_TRUNCATE'
            copy_job = self.__bigquery_client.copy_table(
                self.__bigquery_client.dataset(schemaDataset).table(
                    schemaTable),
                self.__bigquery_client.dataset(targetDataset).table(
                    targetTable),
                job_config=copyJobConfig)
            writeDisposition = 'WRITE_APPEND'

        #
        # Set job configuration & execute
        job_config = bigquery.QueryJobConfig()
        job_config.destination = self.__bigquery_client.dataset(
            targetDataset).table(targetTable)
        job_config.write_disposition = writeDisposition
        metrics_query_job = self.__bigquery_client.query(query,
                                                         job_config=job_config)
        nRows = metrics_query_job.result().total_rows
        return nRows

    #def write_dataset_to_bigquery(self, pdDataset, targetTableId):


#import os
#os.chdir('~/PycharmProjects')
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="../sportsight-tests.json"
#bqu = BigqueryUtils()
#bqu.execute_query_with_schema_and_target('SELECT lastUpdatedOn  FROM `sportsight-tests.Baseball1.pbp_playoffs_2018` LIMIT 100', 'test_dataset', 'test_table')
Example #19
    def swap_tables(self, schema, table_name):
        project_id = self.connection_config['project_id']
        table_dict = utils.tablename_to_dict(table_name)
        target_table = table_dict.get('table_name')
        temp_table = table_dict.get('temp_table_name')

        # Swap tables and drop the temp table
        table_id = '{}.{}.{}'.format(project_id, schema, target_table)
        temp_table_id = '{}.{}.{}'.format(project_id, schema, temp_table)

        # We can't swap tables in BigQuery, so we copy the temp table over the
        # target table and then delete the temp table
        job_config = bigquery.CopyJobConfig()
        job_config.write_disposition = 'WRITE_TRUNCATE'
        client = self.open_connection()
        replace_job = client.copy_table(temp_table_id,
                                        table_id,
                                        job_config=job_config)
        replace_job.result()

        # delete the temp table
        client.delete_table(temp_table_id)
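
The comment in swap_tables above describes BigQuery's copy-then-delete workaround for the missing atomic table swap. Here is a standalone, hedged sketch of that pattern; the function and table names are placeholders.

from google.cloud import bigquery


def replace_table_with_temp(client, table_id, temp_table_id):
    """Overwrite table_id with the contents of temp_table_id, then drop the temp table."""
    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Copy the temp table over the target table...
    client.copy_table(temp_table_id, table_id, job_config=job_config).result()

    # ...then drop the temp table; not_found_ok avoids failing on a re-run.
    client.delete_table(temp_table_id, not_found_ok=True)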
Example #20
def bq_copy_table(
    source_client=None,
    source_project_id=None,
    target_project_id=None,
    dataset_table_list=None,
):
    """
    Copying custom project.dataset.table between two projects.
    :param source_project_id: GCP Project-Id (type:str)
    :param target_project_id: GCP Project-Id (type:str)
    :param dataset_table_list: [dataset_id.table_id] (type:list)
    :return table_transfer_criterion {0:SUCCESS, 1:FAIL}.
    """
    table_transfer_criterion = 0
    # Table Copy From Source Project To Target Project
    for table_id in dataset_table_list:
        try:
            source_table_id = "{}.{}".format(source_project_id, table_id)
            destination_table_id = "{}.{}".format(target_project_id, table_id)
            job_config = bigquery.CopyJobConfig()
            job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
            job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
            job = source_client.copy_table(
                source_table_id,
                destination_table_id,
                location="US",
                job_config=job_config,
            )
            job.result()
            print("Copy successful from {} to {}".format(
                source_table_id, destination_table_id))
            table_transfer_criterion = 0
        except Exception as error:
            print("Exception occurred for {} at function {}: {}".format(
                table_id, "bq_copy_table", error))
            table_transfer_criterion = table_transfer_criterion + 1
    return table_transfer_criterion
Example #21
def create_or_update_table_from_table_definition(
        bigquery_client, table_definition, ds, source_project_id,
        source_dataset_name, destination_project_id, sqls_folder,
        parse_all_partitions, airflow_task):
    dataset_name = 'ethereum_' + table_definition['table']['dataset_name']
    table_name = table_definition['table']['table_name']
    table_description = table_definition['table']['table_description']
    schema = table_definition['table']['schema']
    parser = table_definition['parser']
    parser_type = parser.get('type', 'log')
    abi = json.dumps(parser['abi'])
    columns = [c.get('name') for c in schema]

    template_context = {}
    template_context['ds'] = ds
    template_context['params'] = {}
    template_context['params']['source_project_id'] = source_project_id
    template_context['params']['source_dataset_name'] = source_dataset_name
    template_context['params']['table_name'] = table_name
    template_context['params']['columns'] = columns
    template_context['params']['parser'] = parser
    template_context['params']['abi'] = abi
    if parser_type == 'log':
        template_context['params']['event_topic'] = abi_to_event_topic(
            parser['abi'])
    elif parser_type == 'trace':
        template_context['params']['method_selector'] = abi_to_method_selector(
            parser['abi'])
    template_context['params'][
        'struct_fields'] = create_struct_string_from_schema(schema)
    template_context['params']['parse_all_partitions'] = parse_all_partitions

    contract_address = parser['contract_address']
    if not contract_address.startswith('0x'):
        contract_address_sql = replace_refs(contract_address, ref_regex,
                                            destination_project_id,
                                            dataset_name)
        template_context['params']['parser'][
            'contract_address_sql'] = contract_address_sql

    # # # Create a temporary table

    dataset_name_temp = 'parse_temp'
    create_dataset(bigquery_client, dataset_name_temp)
    temp_table_name = 'temp_{table_name}_{milliseconds}' \
        .format(table_name=table_name, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = bigquery_client.dataset(dataset_name_temp).table(
        temp_table_name)

    temp_table = bigquery.Table(temp_table_ref,
                                schema=read_bigquery_schema_from_dict(
                                    schema, parser_type))

    temp_table.description = table_description
    temp_table.time_partitioning = bigquery.TimePartitioning(
        field='block_timestamp')
    logging.info('Creating table: ' + json.dumps(temp_table.to_api_repr()))
    temp_table = bigquery_client.create_table(temp_table)
    assert temp_table.table_id == temp_table_name

    # # # Query to temporary table

    job_config = bigquery.QueryJobConfig()
    job_config.priority = bigquery.QueryPriority.INTERACTIVE
    job_config.destination = temp_table_ref
    sql_template = get_parse_sql_template(parser_type, sqls_folder)
    sql = airflow_task.render_template('', sql_template, template_context)
    logging.info(sql)
    query_job = bigquery_client.query(sql,
                                      location='US',
                                      job_config=job_config)
    submit_bigquery_job(query_job, job_config)
    assert query_job.state == 'DONE'

    # # # Copy / merge to destination

    if parse_all_partitions:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'
        dataset = create_dataset(bigquery_client, dataset_name,
                                 destination_project_id)
        dest_table_ref = dataset.table(table_name)
        copy_job = bigquery_client.copy_table(temp_table_ref,
                                              dest_table_ref,
                                              location='US',
                                              job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
        # Need to update the description explicitly because the copy above won't
        # respect it if the destination table already exists
        table = bigquery_client.get_table(dest_table_ref)
        table.description = table_description
        table = bigquery_client.update_table(table, ["description"])
        assert table.description == table_description
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = bigquery.QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

        merge_sql_template = get_merge_table_sql_template(sqls_folder)
        merge_template_context = template_context.copy()
        merge_template_context['params']['source_table'] = temp_table_name
        merge_template_context['params'][
            'destination_dataset_project_id'] = destination_project_id
        merge_template_context['params'][
            'destination_dataset_name'] = dataset_name
        merge_template_context['params'][
            'dataset_name_temp'] = dataset_name_temp
        merge_template_context['params']['columns'] = columns
        merge_sql = airflow_task.render_template('', merge_sql_template,
                                                 merge_template_context)
        print('Merge sql:')
        print(merge_sql)
        merge_job = bigquery_client.query(merge_sql,
                                          location='US',
                                          job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    bigquery_client.delete_table(temp_table_ref)
Example #22
        def enrich_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = bigquery.Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table = bigquery.Table(temp_table_ref, schema=schema)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = bigquery.QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            print('Enrichment sql:')
            print(sql)

            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            if load_all_partitions or always_load_all_partitions:
                # Copy temporary table to destination
                copy_job_config = bigquery.CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'
                dest_table_name = '{task}'.format(task=task)
                dest_table_ref = client.dataset(
                    dataset_name,
                    project=destination_dataset_project_id).table(
                        dest_table_name)
                copy_job = client.copy_table(temp_table_ref,
                                             dest_table_ref,
                                             location='US',
                                             job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
            else:
                # Merge
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                merge_job_config = bigquery.QueryJobConfig()
                # Finishes faster, query limit for concurrent interactive queries is 50
                merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

                merge_sql_path = os.path.join(
                    dags_folder,
                    'resources/stages/enrich/sqls/merge/merge_{task}.sql'.
                    format(task=task))
                merge_sql_template = read_file(merge_sql_path)

                merge_template_context = template_context.copy()
                merge_template_context['params'][
                    'source_table'] = temp_table_name
                merge_template_context['params'][
                    'destination_dataset_project_id'] = destination_dataset_project_id
                merge_template_context['params'][
                    'destination_dataset_name'] = dataset_name
                merge_sql = kwargs['task'].render_template(
                    '', merge_sql_template, merge_template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql,
                                         location='US',
                                         job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
Example #23
def create_or_update_history_table(bigquery_client,
                                   dataset_name,
                                   history_table_name,
                                   table_definition,
                                   ds,
                                   public_project_id,
                                   public_dataset_name,
                                   internal_project_id,
                                   destination_project_id,
                                   sqls_folder,
                                   parse_all_partitions,
                                   time_func=time.time):
    table_name = table_definition['table']['table_name']

    schema = table_definition['table']['schema']
    parser_type = table_definition['parser'].get('type', 'log')

    schema = read_bigquery_schema_from_dict(schema, parser_type)

    # # # Create a temporary table

    dataset_name_temp = 'parse_temp'
    create_dataset(bigquery_client, dataset_name_temp)
    temp_table_name = 'temp_{table_name}_{milliseconds}' \
        .format(table_name=table_name, milliseconds=int(round(time_func() * 1000)))
    temp_table_ref = bigquery_client.dataset(dataset_name_temp).table(
        temp_table_name)

    temp_table = bigquery.Table(temp_table_ref, schema=schema)

    table_description = table_definition['table']['table_description']
    temp_table.description = table_description
    temp_table.time_partitioning = bigquery.TimePartitioning(
        field='block_timestamp')
    logging.info('Creating table: ' + json.dumps(temp_table.to_api_repr()))
    temp_table = bigquery_client.create_table(temp_table)
    assert temp_table.table_id == temp_table_name

    # # # Query to temporary table

    udf_name = 'parse_{}'.format(table_name)

    selector = abi_to_selector(parser_type, table_definition['parser']['abi'])

    parse_mode = get_parse_mode(HistoryType.HISTORY,
                                parse_all_partitions=parse_all_partitions)
    full_source_table_name = get_source_table(
        parser_type=parser_type,
        parse_mode=parse_mode,
        ds=ds,
        internal_project_id=internal_project_id,
        public_project_id=public_project_id,
        public_dataset_name=public_dataset_name,
        selector=selector)

    sql = generate_parse_sql_template(
        sqls_folder,
        parser_type,
        parse_mode,
        full_source_table_name=full_source_table_name,
        selector=selector,
        internal_project_id=internal_project_id,
        destination_project_id=destination_project_id,
        dataset_name=dataset_name,
        udf_name=udf_name,
        table_definition=table_definition,
        parse_all_partitions=parse_all_partitions,
        ds=ds)
    query(bigquery_client, sql, destination=temp_table_ref)

    # # # Copy / merge to destination

    if parse_all_partitions:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'
        dataset = create_dataset(bigquery_client, dataset_name,
                                 internal_project_id)
        dest_table_ref = dataset.table(history_table_name)
        copy_job = bigquery_client.copy_table(temp_table_ref,
                                              dest_table_ref,
                                              location='US',
                                              job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
        # Need to do update description as copy above won't respect the description in case destination table
        # already exists
        table = bigquery_client.get_table(dest_table_ref)
        table.description = table_description
        table = bigquery_client.update_table(table, ["description"])
        assert table.description == table_description
    else:
        merge_sql = render_merge_template(
            sqls_folder,
            table_schema=schema,
            internal_project_id=internal_project_id,
            dataset_name=dataset_name,
            destination_table_name=history_table_name,
            dataset_name_temp=dataset_name_temp,
            source_table=temp_table_name,
            ds=ds)
        query(bigquery_client, merge_sql)

    # Delete temp table
    bigquery_client.delete_table(temp_table_ref)
Example #24
def main(submission_date, dst_table, project, dataset):
    """Run query per app_version."""
    bq_client = bigquery.Client(project=project)

    app_versions = [
        row["app_version"]
        for row in bq_client.query(
            VERSION_QUERY_TEMPLATE.format(
                date=submission_date, project=project, dataset=dataset
            )
        ).result()
    ]

    print(f"Found versions: {app_versions}")

    if len(app_versions) == 0:
        print("Source table empty", file=sys.stderr)
        sys.exit(1)

    sql_path = SQL_BASE_DIR / dst_table / "query.sql"

    query_text = sql_path.read_text()

    # Write to intermediate table to avoid partial writes to destination table
    intermediate_table = f"{project}.analysis.glam_temp_clustered_query_{dst_table}"
    print(f"Writing results to {intermediate_table}")

    for i, app_version in enumerate(app_versions):
        print(f"Querying for app_version {app_version}")

        query_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter(
                    "submission_date", "DATE", str(submission_date)
                ),
                bigquery.ScalarQueryParameter("app_version", "INT64", app_version),
            ],
            clustering_fields=["metric", "channel"],
            destination=intermediate_table,
            default_dataset=f"{project}.{dataset}",
            write_disposition=(
                bigquery.WriteDisposition.WRITE_TRUNCATE
                if i == 0
                else bigquery.WriteDisposition.WRITE_APPEND
            ),
        )

        query_job = bq_client.query(query_text, job_config=query_config)

        # Periodically print so airflow gke operator doesn't think task is dead
        elapsed = 0
        while not query_job.done():
            time.sleep(10)
            elapsed += 10
            if elapsed % 200 == 10:
                print("Waiting on query...")

        print(f"Total elapsed: approximately {elapsed} seconds")

        results = query_job.result()
        print(f"Query job {query_job.job_id} finished")
        print(f"{results.total_rows} rows in {intermediate_table}")

    copy_config = bigquery.CopyJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )

    print(f"Copying {intermediate_table} to {project}.{dataset}.{dst_table}")
    bq_client.copy_table(
        intermediate_table,
        f"{project}.{dataset}.{dst_table}",
        job_config=copy_config,
    ).result()

    print(f"Deleting {intermediate_table}")
    bq_client.delete_table(intermediate_table)
Example #25
        def parse_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment
            template_context['params']['table_name'] = table_name
            template_context['params']['columns'] = columns
            template_context['params']['parser'] = parser
            template_context['params']['abi'] = abi
            if parser_type == 'log':
                template_context['params']['event_topic'] = abi_to_event_topic(parser['abi'])
            elif parser_type == 'trace':
                template_context['params']['method_selector'] = abi_to_method_selector(parser['abi'])
            template_context['params']['struct_fields'] = create_struct_string_from_schema(schema)
            template_context['params']['parse_all_partitions'] = parse_all_partitions
            client = bigquery.Client()

            # # # Create a temporary table

            dataset_name_temp = 'parse_temp'
            create_dataset(client, dataset_name_temp)
            temp_table_name = 'temp_{table_name}_{milliseconds}'\
                .format(table_name=table_name, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)

            temp_table = bigquery.Table(temp_table_ref, schema=read_bigquery_schema_from_dict(schema, parser_type))

            temp_table.description = table_description
            temp_table.time_partitioning = TimePartitioning(field='block_timestamp')
            logging.info('Creating table: ' + json.dumps(temp_table.to_api_repr()))
            temp_table = client.create_table(temp_table)
            assert temp_table.table_id == temp_table_name

            # # # Query to temporary table

            job_config = bigquery.QueryJobConfig()
            job_config.priority = bigquery.QueryPriority.INTERACTIVE
            job_config.destination = temp_table_ref
            sql_template = get_parse_sql_template(parser_type)
            sql = kwargs['task'].render_template('', sql_template, template_context)
            logging.info(sql)
            query_job = client.query(sql, location='US', job_config=job_config)
            submit_bigquery_job(query_job, job_config)
            assert query_job.state == 'DONE'

            # # # Copy / merge to destination

            if parse_all_partitions:
                # Copy temporary table to destination
                copy_job_config = bigquery.CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'
                dest_table_ref = client.dataset(dataset_name, project=parse_destination_dataset_project_id).table(table_name)
                copy_job = client.copy_table(temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
                # Need to update the description explicitly because the copy above
                # won't respect it if the destination table already exists
                table = client.get_table(dest_table_ref)
                table.description = table_description
                table = client.update_table(table, ["description"])
                assert table.description == table_description
            else:
                # Merge
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                merge_job_config = bigquery.QueryJobConfig()
                # Finishes faster, query limit for concurrent interactive queries is 50
                merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

                merge_sql_template = get_merge_table_sql_template()
                merge_template_context = template_context.copy()
                merge_template_context['params']['source_table'] = temp_table_name
                merge_template_context['params']['destination_dataset_project_id'] = parse_destination_dataset_project_id
                merge_template_context['params']['destination_dataset_name'] = dataset_name
                merge_template_context['params']['dataset_name_temp'] = dataset_name_temp
                merge_template_context['params']['columns'] = columns
                merge_sql = kwargs['task'].render_template('', merge_sql_template, merge_template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql, location='US', job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
Example #26
        def enrich_task():
            client = bigquery.Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table = bigquery.Table(temp_table_ref, schema=schema)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = bigquery.QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref
            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path)
            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            # Copy temporary table to destination
            copy_job_config = bigquery.CopyJobConfig()
            copy_job_config.write_disposition = 'WRITE_TRUNCATE'

            all_destination_projects = [(destination_dataset_project_id,
                                         dataset_name)]
            if copy_dataset_project_id is not None and len(copy_dataset_project_id) > 0 \
                    and copy_dataset_name is not None and len(copy_dataset_name) > 0:
                all_destination_projects.append(
                    (copy_dataset_project_id, copy_dataset_name))

            for dest_project, dest_dataset_name in all_destination_projects:
                dest_table_name = '{task}'.format(task=task)
                dest_table_ref = client.dataset(
                    dest_dataset_name,
                    project=dest_project).table(dest_table_name)
                copy_job = client.copy_table(temp_table_ref,
                                             dest_table_ref,
                                             location='US',
                                             job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
Example #27
    number = "_" + str(random.randint(1, 21) * 5)
    temp_table_name = sys.argv[5] + number
    print("Creating temporary table {}".format(temp_table_name))
    temp_table_ref = dataset.table(temp_table_name)
    temp_table = bigquery.Table(temp_table_ref, schema=schema)
    temp_table = client.create_table(temp_table)

    print("Running {}".format(sys.argv[4].format(temp_table_name)))
    query_job = client.query(sys.argv[4].format(temp_table_name))
    iterator = query_job.result()
    print(query_job.state)

    print("Deleting {}".format(sys.argv[5]))
    client.delete_table(table)
    table = None

    print("Creating empty table {}".format(sys.argv[5]))
    table_ref = dataset.table(sys.argv[5])
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)

    print("Copying {} to {}".format(temp_table_name, sys.argv[5]))
    source_table_ref = temp_table_ref
    job_config = bigquery.CopyJobConfig()
    job = client.copy_table(temp_table_ref, table_ref,
                            job_config=job_config)  # API request
    job.result()  # Waits for job to complete.

    print("Deleting {}".format(temp_table_name))
    client.delete_table(temp_table)
Example #28
    def resize(self):
        """
        This is the execute function of this class. It copies the source table
        into the destination table and then copies the destination table into
        itself until it reaches or exceeds the target_rows.
        """
        # How many rows short of our target are we?
        gap = self.target_rows - self.source_table.num_rows

        while gap > 0:  # Copy until we've reached or exceeded target_rows

            # API requests to get the latest table info.
            source_table = self.client.get_table(self.source_table)
            try:
                dest_table = self.client.get_table(self.dest_table_ref)

            except NotFound:
                dest_table = self.client.create_table(
                    bigquery.Table(self.dest_table_ref))

            # Get the latest size of the dest_table.
            # Note that for the first call these properties are None.
            dest_rows = dest_table.num_rows
            dest_bytes = dest_table.num_bytes
            dest_gb = dest_bytes / float(1024**3)

            # Recalculate the gap.
            if dest_rows:
                gap = self.target_rows - dest_rows
            else:
                gap = self.target_rows

            print('{} rows in table of size {} GB, with a target of {}, '
                  'leaving a gap of {}'.format(dest_rows, round(dest_gb, 2),
                                               self.target_rows, gap))

            # Greedily copy the largest of dest_table and source_table into
            # dest_table without going over the target rows. The last query
            # will be a subset of source_table via a limit query.
            if gap < source_table.num_rows:
                # This will be the last copy operation if target_rows is
                # not a power of 2 times the number of rows originally in the
                # source table. It is not a full copy.
                job_config = bigquery.QueryJobConfig()
                # Set the destination table

                job_config.destination = self.dest_table_ref
                job_config.write_disposition = 'WRITE_APPEND'
                job_config.allow_large_results = True

                sql = """
                    SELECT *
                    FROM `{}.{}.{}`
                    LIMIT {}

                """.format(self.project, self.source_table.dataset_id,
                           self.source_table.table_id, gap)

                # API request to BigQuery with query and config defined above.
                query_job = self.client.query(
                    sql,
                    # Location must match that of the dataset(s) referenced in
                    # the query and of the destination table.
                    location=self.location,
                    job_config=job_config)
                # Wait for query_job to finish.
                query_job.result()
            else:
                if source_table.num_rows < dest_table.num_rows < gap:
                    use_as_source_table = self.dest_table_ref
                else:  # source_table.num_rows < gap < dest_table.num_rows
                    use_as_source_table = self.source_table.reference
                copy_config = bigquery.CopyJobConfig()
                copy_config.write_disposition = 'WRITE_APPEND'
                copy_config.create_disposition = 'CREATE_IF_NEEDED'

                copy_job = self.client.copy_table(use_as_source_table,
                                                  self.dest_table_ref,
                                                  job_config=copy_config)
                # Wait for copy_job to finish.
                copy_job.result()