def rollback_object(google_key_path, object_type, dataset_name, dataset_backup_name, table_name, table_backup_name):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = google_key_path
    client = bigquery.Client()

    if object_type == 'table':
        # check if table exists before deleting the other one
        dataset_ref = client.dataset(dataset_name)
        dataset = client.get_dataset(dataset_ref)
        backup_table_ref = dataset.table(table_backup_name)

        if table_exists(client, backup_table_ref):
            # Delete old table
            print("Backup table {} exists".format(table_backup_name))
            print("Dropping table {}".format(table_name))
            table_ref = dataset.table(table_name)
            table = client.get_table(table_ref)
            client.delete_table(table)

            # Copy backup table to table with the old name
            print("Copying {} to {}".format(table_backup_name, table_name))
            job_config = bigquery.CopyJobConfig()
            job = client.copy_table(backup_table_ref, table_ref, job_config=job_config)  # API request
            job.result()  # Waits for job to complete.

            # Delete backup table
            print("Dropping table {}".format(table_backup_name))
            backup_table = client.get_table(backup_table_ref)
            client.delete_table(backup_table)
def copy_table(dataset_id, table_id, new_table_id, project=None):
    """Copies a table.

    If no project is specified, then the currently active project is used.
    """
    bigquery_client = bigquery.Client(project=project)
    dataset_ref = bigquery_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)

    # This sample shows the destination table in the same dataset and project,
    # however, it's possible to copy across datasets and projects. You can
    # also copy multiple source tables into a single destination table by
    # providing additional arguments to `copy_table`.
    destination_table_ref = dataset_ref.table(new_table_id)

    # Create a job to copy the table to the destination table.
    # Start by creating a job configuration.
    job_config = bigquery.CopyJobConfig()

    # Configure the job to create the table if it doesn't exist.
    job_config.create_disposition = (
        bigquery.job.CreateDisposition.CREATE_IF_NEEDED)

    copy_job = bigquery_client.copy_table(
        table_ref, destination_table_ref, job_config=job_config)
    log.info('Waiting for job to finish...')
    copy_job.result()
    log.info('Table {} copied to {}.'.format(table_id, new_table_id))
def _copy_join_parts(client, stable_table, query_jobs):
    total_bytes = sum(query.total_bytes_processed for query in query_jobs)
    if query_jobs[0].dry_run:
        api_repr = json.dumps(query_jobs[0].to_api_repr())
        if len(query_jobs) > 1:
            logging.info(f"Would process {total_bytes} bytes: [{api_repr},...]")
            logging.info(f"Would copy {len(query_jobs)} results to {stable_table}")
        else:
            logging.info(f"Would process {total_bytes} bytes: {api_repr}")
    else:
        logging.info(f"Processed {total_bytes} bytes to populate {stable_table}")
        if len(query_jobs) > 1:
            partition_id = stable_table.table_id.split("$", 1)[1]
            sources = [
                f"{sql_table_id(job.destination)}${partition_id}"
                for job in query_jobs
            ]
            job_config = bigquery.CopyJobConfig(
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)
            copy_job = client.copy_table(sources, stable_table, job_config=job_config)
            copy_job.result()
            logging.info(f"Copied {len(query_jobs)} results to populate {stable_table}")
        for job in query_jobs:
            client.delete_table(job.destination)
        logging.info(f"Deleted {len(query_jobs)} temporary tables")
def test_copy_table_cmek(client, to_delete):
    dataset_id = 'copy_table_cmek_{}'.format(_millis())
    dest_dataset = bigquery.Dataset(client.dataset(dataset_id))
    dest_dataset = client.create_dataset(dest_dataset)
    to_delete.append(dest_dataset)

    # [START bigquery_copy_table_cmek]
    source_dataset = bigquery.DatasetReference(
        'bigquery-public-data', 'samples')
    source_table_ref = source_dataset.table('shakespeare')

    # dataset_id = 'my_dataset'
    dest_dataset_ref = client.dataset(dataset_id)
    dest_table_ref = dest_dataset_ref.table('destination_table')

    # Set the encryption key to use for the destination.
    # TODO: Replace this key with a key you have created in KMS.
    kms_key_name = 'projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}'.format(
        'cloud-samples-tests', 'us-central1', 'test', 'test')
    encryption_config = bigquery.EncryptionConfiguration(
        kms_key_name=kms_key_name)
    job_config = bigquery.CopyJobConfig()
    job_config.destination_encryption_configuration = encryption_config

    job = client.copy_table(
        source_table_ref, dest_table_ref, job_config=job_config)  # API request
    job.result()  # Waits for job to complete.

    assert job.state == 'DONE'
    dest_table = client.get_table(dest_table_ref)
    assert dest_table.encryption_configuration.kms_key_name == kms_key_name
    # [END bigquery_copy_table_cmek]

    to_delete.insert(0, dest_table)
def copy_table_cmek(dest_table_id, orig_table_id, kms_key_name):
    # [START bigquery_copy_table_cmek]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set dest_table_id to the ID of the destination table.
    # dest_table_id = "your-project.your_dataset.your_table_name"

    # TODO(developer): Set orig_table_id to the ID of the original table.
    # orig_table_id = "your-project.your_dataset.your_table_name"

    # Set the encryption key to use for the destination.
    # TODO(developer): Replace this key with a key you have created in KMS.
    # kms_key_name = "projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}".format(
    #     your-project, location, your-ring, your-key
    # )

    job_config = bigquery.CopyJobConfig(
        destination_encryption_configuration=bigquery.EncryptionConfiguration(
            kms_key_name=kms_key_name))

    job = client.copy_table(orig_table_id, dest_table_id, job_config=job_config)
    job.result()  # Wait for the job to complete.

    dest_table = client.get_table(dest_table_id)  # Make an API request.
    if dest_table.encryption_configuration.kms_key_name == kms_key_name:
        print("A copy of the table was created.")
def create_job(client):
    # `sources` and `dest` are expected to be bound in the enclosing scope.
    return client.copy_table(
        sources,
        dest,
        job_config=bigquery.CopyJobConfig(
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
        ),
    )
def config_job(self, overwrite=True):
    job_config = bigquery.CopyJobConfig()
    if overwrite:
        job_config.write_disposition = 'WRITE_TRUNCATE'
    else:
        job_config.write_disposition = 'WRITE_APPEND'
    return job_config
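# Hedged usage sketch (not from the original source): shows how the CopyJobConfig
# returned by config_job above might be handed to Client.copy_table. The client
# and the fully-qualified table IDs are placeholders, not names from the source.
def copy_with_config(client, source_table_id, dest_table_id, job_config):
    # e.g. copy_with_config(client, "proj.ds.src", "proj.ds.dst", obj.config_job(overwrite=True))
    job = client.copy_table(source_table_id, dest_table_id, job_config=job_config)
    job.result()  # block until the copy job completes
    return job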
def insert_into_bigquery(client, dataset_id, stg_table_id, master_table_id):
    try:
        print('>>> inserting data from {} to {} table'.format(stg_table_id, master_table_id))
        dataset_ref = client.dataset(dataset_id)
        stg_table_ref = dataset_ref.table(stg_table_id)
        master_table_ref = dataset_ref.table(master_table_id)

        master_rows_number = client.get_table(master_table_ref).num_rows
        print(">>> Number of rows in master table before delete and insert operation = {}".format(master_rows_number))

        print(">>> Deleting the duplicate rows from master table that are also present in staging table.....")
        delete_duplicate_query = """DELETE FROM {}.{} WHERE EXISTS (
            SELECT * FROM {}.{}
            WHERE {}.date = {}.date AND {}.ViewID = {}.viewID
        )""".format(dataset_id, master_table_id,
                    dataset_id, stg_table_id,
                    master_table_id, stg_table_id,
                    master_table_id, stg_table_id)
        print(">>> " + delete_duplicate_query)
        delete_query_job = client.query(delete_duplicate_query)
        delete_query_job.result()

        retained_row = 0
        for row in delete_query_job.result():
            retained_row += 1
        print(">>> Number of deleted rows = {}".format(master_rows_number - retained_row))

        # Insert the data in the staging table into the master table
        print(">>> Inserting data from staging table to master......")
        stg_master_copy_job_config = bigquery.CopyJobConfig()
        stg_master_copy_job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
        stg_master_copy_job_config.schema_update_options = [
            bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
            bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
        ]
        schema_config = []
        for schema in schemas:
            schema_config.append(eval(schema))
        stg_master_copy_job_config.schema = schema_config

        stg_master_copy_job = client.copy_table(
            stg_table_ref,
            master_table_ref,
            location='US',
            job_config=stg_master_copy_job_config
        )
        stg_master_copy_job.result()

        print(">>> Number of rows inserted = {}".format(client.get_table(stg_table_ref).num_rows))
        print(">>> Total number of rows in master table after delete and insert operation = {}".format(
            client.get_table(master_table_ref).num_rows))
    except Exception as e:
        pprint('>>> Error occurred while inserting in the Master BigQuery Table \n Error:::: %s ' % e)
        type_, value_, traceback_ = sys.exc_info()
        pprint(traceback.format_tb(traceback_))
        print(type_, value_)
        sys.exit(1)
def copy_table():
    """Copy temporary table to final destination"""
    logging.info("copy table")
    dataset_ref = bigquery.DatasetReference(GCP_PROJECT, DATASET_NAME)
    table_ref = bigquery.TableReference(dataset_ref, TABLE_NAME)
    copyjob_config = bigquery.CopyJobConfig()
    copyjob_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    copyjob_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    bq.copy_table(temp_table_ref, table_ref, job_config=copyjob_config)
def deploy_features_table():
    """Copy the UNION temp features table to final destination"""
    logging.info("copy table")
    target_dataset_ref = bigquery.DatasetReference(GCP_PROJECT, BQ_TARGET_DATASET)
    target_table_ref = bigquery.TableReference(target_dataset_ref, 'features')
    copyjob_config = bigquery.CopyJobConfig()
    copyjob_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    copyjob_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    bq.copy_table(temp_table_ref, target_table_ref, job_config=copyjob_config)
def copy_table(dataset_id, table_id, new_table_id, project):
    bigquery_client = bigquery.Client(project=project)
    dataset_ref = bigquery_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    destination_table_ref = dataset_ref.table(new_table_id)

    job_config = bigquery.CopyJobConfig()
    job_config.create_disposition = (
        bigquery.job.CreateDisposition.CREATE_IF_NEEDED)

    copy_job = bigquery_client.copy_table(
        table_ref, destination_table_ref, job_config=job_config)
    print('Waiting for job to finish...')
    copy_job.result()
    print('Table {} copied to {}.'.format(table_id, new_table_id))
def merge_task(ds, **kwargs):
    client = bigquery.Client()

    dataset = create_dataset(client, dataset_name, project=destination_dataset_project_id)

    load_all_partitions_for_table = load_all_partitions
    if load_all_partitions_for_table is None:
        table_ref = dataset.table(task)
        table_exists = does_table_exist(client, table_ref)
        load_all_partitions_for_table = not table_exists
        logging.info('load_all_partitions for table {} is set to {}'.format(task, str(load_all_partitions_for_table)))

    if load_all_partitions_for_table:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dest_table_ref = dataset.table(task)
        temp_table_ref = client.dataset(dataset_name_temp).table(task)
        copy_job = client.copy_table(temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
    else:
        merge_job_config = bigquery.QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

        merge_sql_path = os.path.join(
            dags_folder, 'zilliqa_resources/stages/load/sqls/merge.sql'.format(task=task))
        merge_sql_template = read_file(merge_sql_path)

        schema_path = os.path.join(
            dags_folder, 'zilliqa_resources/stages/load/schemas/{task}.json'.format(task=task))
        schema = read_bigquery_schema_from_file(schema_path)

        merge_template_context = {
            'ds': ds,
            'table': task,
            'destination_dataset_project_id': destination_dataset_project_id,
            'destination_dataset_name': dataset_name,
            'dataset_name_temp': dataset_name_temp,
            'table_schema': schema,
            'time_partitioning_field': time_partitioning_field,
        }
        merge_sql = kwargs['task'].render_template('', merge_sql_template, merge_template_context)
        print('Merge sql:')
        print(merge_sql)

        merge_job = client.query(merge_sql, location='US', job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'
def create_view(dataset_name, view_name, project, viewSQL):
    f = Figlet(font="slant")
    print(f.renderText(view_name.replace("_", " ")))

    view_description = get_view_description(view_name)

    bigquery_client = bigquery.Client(project=project)
    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = "WRITE_TRUNCATE"

    dataset = bigquery_client.dataset(dataset_name)
    table_ref = dataset.table(view_name)
    table = bigquery.Table(table_ref)
    table.view_query = viewSQL
    table.view_use_legacy_sql = False
    table.description = view_description["metric_description"]

    try:
        # Delete the view if it exists
        bigquery_client.delete_table(table)
        with indent(4, quote="* "):
            puts(colored.blue("Existing view deleted"))
    except Exception as err:
        with indent(4, quote="* "):
            puts(colored.red("View doesn't exist"))

    try:
        # Create the new view
        bigquery_client.create_table(table)
        with indent(4, quote="* "):
            puts(colored.blue("View created : {}".format(".".join(
                [project, dataset_name, view_name]))))
    except Exception as err:
        print(err)
        with indent(4, quote="* "):
            puts(colored.red("Error: Couldn't create view"))
        return False

    view_update = bigquery.Table(
        table_ref, schema=get_schema_from_description(view_description))
    view_update = bigquery_client.update_table(view_update, ["schema"])
    with indent(4, quote="* "):
        puts(colored.blue("Updated the view schema descriptions"))
    return True
def test_copy_table(client, to_delete):
    DATASET_ID = 'copy_table_dataset_{}'.format(_millis())
    # [START copy_table]
    source_dataset = bigquery.DatasetReference(
        'bigquery-public-data', 'samples')
    source_table_ref = source_dataset.table('shakespeare')

    dest_dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    dest_dataset = client.create_dataset(dest_dataset)  # API request
    dest_table_ref = dest_dataset.table('destination_table')

    job_config = bigquery.CopyJobConfig()
    job = client.copy_table(
        source_table_ref, dest_table_ref, job_config=job_config)  # API request
    job.result()  # Waits for job to complete.

    assert job.state == 'DONE'
    dest_table = client.get_table(dest_table_ref)  # API request
    assert dest_table.table_id == 'destination_table'
    # [END copy_table]

    to_delete.append(dest_dataset)
    to_delete.insert(0, dest_table)
def copy_single_table(bq_client, src_table, dst_table, overwrite):
    """Copy a single day of ga_sessions."""
    job_config = bigquery.CopyJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
        if overwrite else bigquery.WriteDisposition.WRITE_EMPTY)
    try:
        copy_job = bq_client.copy_table(
            sources=src_table,
            destination=dst_table,
            job_config=job_config,
        )
    except exceptions.NotFound:
        print(f"{src_table} not found, copy skipped")
        return
    try:
        copy_job.result()
    except exceptions.Conflict:
        print(f"{dst_table} already exists, copy skipped")
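# Hedged usage sketch (not part of the original source): the project, dataset and
# date-sharded table names below are made-up placeholders showing how
# copy_single_table above might be driven over a range of ga_sessions shards.
def backfill_ga_sessions(bq_client, dates, overwrite=False):
    for d in dates:  # e.g. ["20240101", "20240102"]
        copy_single_table(
            bq_client,
            src_table=f"src-project.ga_dataset.ga_sessions_{d}",
            dst_table=f"dst-project.ga_backup.ga_sessions_{d}",
            overwrite=overwrite,
        )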
def copy_tables_to_public_dataset():
    """Copies all tables from the internal dataset to the BigQuery public dataset.

    The service account used to access the public dataset project is stored in GCS.
    """
    gcs = storage.Client(project=GCP_PROJECT)
    bucket = gcs.bucket(GCS_BUCKET)
    sa_blob = bucket.blob(SERVICE_ACCOUNT_FILENAME)
    sa_blob.reload()
    f = BytesIO()
    sa_blob.download_to_file(f)
    sa_content = f.getvalue().decode('utf8')
    f.close()
    sa_content = json.loads(sa_content)
    credentials = service_account.Credentials.from_service_account_info(sa_content)
    bq = bigquery.Client(project=GCP_PROJECT, credentials=credentials)

    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED

    source_dataset = bigquery.DatasetReference(GCP_PROJECT, BQ_DATASET)
    destination_dataset = bigquery.DatasetReference(BQ_PUBLIC_PROJECT, BQ_PUBLIC_DATASET)

    table_refs = []
    tables_res = bq.list_tables(source_dataset)
    for table_ref in tables_res:
        table_refs.append(table_ref)

    for table_ref in table_refs:
        table_name = table_ref.table_id
        source_table = bigquery.TableReference(source_dataset, table_name)
        destination_table = bigquery.TableReference(destination_dataset, table_name)
        bq.copy_table([source_table], destination_table, job_config=job_config)
        logging.info(f"copying table {table_name} to public dataset")
def copy_tables(client, project_id, target_project_id, source_dataset, target_dataset, table_names):
    '''Copies the given tables between projects.

    Args:
        client (google.cloud.bigquery.client.Client): BigQuery client
        project_id (STRING): GCP project ID of the copy source
        target_project_id (STRING): GCP project ID of the copy destination
        source_dataset (STRING): dataset name of the copy source
        target_dataset (STRING): dataset name of the copy destination
        table_names (list): list of table names to copy
    Return:
        None
    '''
    # Prepare the copy job
    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = "WRITE_TRUNCATE"

    # Copy each table
    for table_name in table_names:
        # Build the source and destination table names
        source_table = "{}.{}.{}".format(project_id, source_dataset, table_name)
        target_table = "{}.{}.{}".format(target_project_id, target_dataset, table_name)
        try:
            # Check that the source table exists; if not, write it to the log
            table_ref = client.dataset(source_dataset).table(table_name)
            table_check = client.get_table(table_ref)
            # Create the copy job
            copy_job = client.copy_table(source_table, target_table, job_config=job_config)
            # Run the copy job
            copy_job.result()
        except NotFound:
            logging.info(table_name + " does not exist")
    return
def execute_query_with_schema_and_target(self, query, targetDataset, targetTable, schemaDataset=None, schemaTable=None):
    #
    # Reset the table based on schema
    writeDisposition = 'WRITE_TRUNCATE'
    if schemaDataset is not None and schemaTable is not None:
        copyJobConfig = bigquery.CopyJobConfig()
        copyJobConfig.write_disposition = 'WRITE_TRUNCATE'
        copy_job = self.__bigquery_client.copy_table(
            self.__bigquery_client.dataset(schemaDataset).table(schemaTable),
            self.__bigquery_client.dataset(targetDataset).table(targetTable),
            job_config=copyJobConfig)
        writeDisposition = 'WRITE_APPEND'

    #
    # Set job configuration & execute
    job_config = bigquery.QueryJobConfig()
    job_config.destination = self.__bigquery_client.dataset(targetDataset).table(targetTable)
    job_config.write_disposition = writeDisposition
    metrics_query_job = self.__bigquery_client.query(query, job_config=job_config)
    nRows = metrics_query_job.result().total_rows
    return nRows

# def write_dataset_to_bigquery(self, pdDataset, targetTableId):

# import os
# os.chdir('~/PycharmProjects')
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../sportsight-tests.json"
# bqu = BigqueryUtils()
# bqu.execute_query_with_schema_and_target('SELECT lastUpdatedOn FROM `sportsight-tests.Baseball1.pbp_playoffs_2018` LIMIT 100', 'test_dataset', 'test_table')
def swap_tables(self, schema, table_name):
    project_id = self.connection_config['project_id']
    table_dict = utils.tablename_to_dict(table_name)
    target_table = table_dict.get('table_name')
    temp_table = table_dict.get('temp_table_name')

    # Swap tables and drop the temp table
    table_id = '{}.{}.{}'.format(project_id, schema, target_table)
    temp_table_id = '{}.{}.{}'.format(project_id, schema, temp_table)

    # We can't swap tables in BigQuery, so we copy the temp table into the target
    # table and then delete the temp table
    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = 'WRITE_TRUNCATE'
    client = self.open_connection()
    replace_job = client.copy_table(temp_table_id, table_id, job_config=job_config)
    replace_job.result()

    # Delete the temp table
    client.delete_table(temp_table_id)
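# Hedged standalone sketch (not from the original source) of the same
# swap-by-copy pattern used in swap_tables above; the client and the
# fully-qualified table IDs are placeholders for illustration only.
def swap_by_copy(client, temp_table_id, target_table_id):
    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = 'WRITE_TRUNCATE'
    # Overwrite the target with the temp table's contents, then drop the temp table.
    client.copy_table(temp_table_id, target_table_id, job_config=job_config).result()
    client.delete_table(temp_table_id)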
def bq_copy_table(
        source_client=None,
        source_project_id=None,
        target_project_id=None,
        dataset_table_list=None,
):
    """Copies custom project.dataset.table entries between two projects.

    :param source_project_id: GCP Project-Id (type:str)
    :param target_project_id: GCP Project-Id (type:str)
    :param dataset_table_list: [dataset_id.table_id] (type:list)
    :return: table_transfer_criterion, the number of failed copies (0 means SUCCESS).
    """
    table_transfer_criterion = 0

    # Table Copy From Source Project To Target Project
    for table_id in dataset_table_list:
        try:
            source_table_id = "{}.{}".format(source_project_id, table_id)
            destination_table_id = "{}.{}".format(target_project_id, table_id)
            job_config = bigquery.CopyJobConfig()
            job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
            job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
            job = source_client.copy_table(
                source_table_id,
                destination_table_id,
                location="US",
                job_config=job_config,
            )
            job.result()
            print("Copy successful from {} to {}".format(
                source_table_id, destination_table_id))
            table_transfer_criterion = 0
        except Exception as error:
            print("Exception occurred for {} at function {}: {}".format(
                table_id, "bq_copy_table", error))
            table_transfer_criterion = table_transfer_criterion + 1
    return table_transfer_criterion
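# Hedged usage sketch (not in the original source): the project IDs and the
# dataset.table names below are placeholders showing how bq_copy_table above
# might be invoked.
def run_bq_copy_example():
    client = bigquery.Client(project="source-project")
    failures = bq_copy_table(
        source_client=client,
        source_project_id="source-project",
        target_project_id="target-project",
        dataset_table_list=["analytics.events", "analytics.sessions"],
    )
    print("Failed copies: {}".format(failures))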
def create_or_update_table_from_table_definition(
        bigquery_client, table_definition, ds, source_project_id,
        source_dataset_name, destination_project_id, sqls_folder,
        parse_all_partitions, airflow_task):
    dataset_name = 'ethereum_' + table_definition['table']['dataset_name']
    table_name = table_definition['table']['table_name']
    table_description = table_definition['table']['table_description']
    schema = table_definition['table']['schema']
    parser = table_definition['parser']
    parser_type = parser.get('type', 'log')
    abi = json.dumps(parser['abi'])
    columns = [c.get('name') for c in schema]

    template_context = {}
    template_context['ds'] = ds
    template_context['params'] = {}
    template_context['params']['source_project_id'] = source_project_id
    template_context['params']['source_dataset_name'] = source_dataset_name
    template_context['params']['table_name'] = table_name
    template_context['params']['columns'] = columns
    template_context['params']['parser'] = parser
    template_context['params']['abi'] = abi
    if parser_type == 'log':
        template_context['params']['event_topic'] = abi_to_event_topic(parser['abi'])
    elif parser_type == 'trace':
        template_context['params']['method_selector'] = abi_to_method_selector(parser['abi'])
    template_context['params']['struct_fields'] = create_struct_string_from_schema(schema)
    template_context['params']['parse_all_partitions'] = parse_all_partitions

    contract_address = parser['contract_address']
    if not contract_address.startswith('0x'):
        contract_address_sql = replace_refs(
            contract_address, ref_regex, destination_project_id, dataset_name)
        template_context['params']['parser']['contract_address_sql'] = contract_address_sql

    # # # Create a temporary table

    dataset_name_temp = 'parse_temp'
    create_dataset(bigquery_client, dataset_name_temp)
    temp_table_name = 'temp_{table_name}_{milliseconds}' \
        .format(table_name=table_name, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = bigquery_client.dataset(dataset_name_temp).table(temp_table_name)

    temp_table = bigquery.Table(
        temp_table_ref, schema=read_bigquery_schema_from_dict(schema, parser_type))
    temp_table.description = table_description
    temp_table.time_partitioning = bigquery.TimePartitioning(field='block_timestamp')
    logging.info('Creating table: ' + json.dumps(temp_table.to_api_repr()))
    temp_table = bigquery_client.create_table(temp_table)
    assert temp_table.table_id == temp_table_name

    # # # Query to temporary table

    job_config = bigquery.QueryJobConfig()
    job_config.priority = bigquery.QueryPriority.INTERACTIVE
    job_config.destination = temp_table_ref
    sql_template = get_parse_sql_template(parser_type, sqls_folder)
    sql = airflow_task.render_template('', sql_template, template_context)
    logging.info(sql)
    query_job = bigquery_client.query(sql, location='US', job_config=job_config)
    submit_bigquery_job(query_job, job_config)
    assert query_job.state == 'DONE'

    # # # Copy / merge to destination

    if parse_all_partitions:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dataset = create_dataset(bigquery_client, dataset_name, destination_project_id)
        dest_table_ref = dataset.table(table_name)
        copy_job = bigquery_client.copy_table(
            temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'

        # Need to update the description as the copy above won't respect the description
        # in case the destination table already exists
        table = bigquery_client.get_table(dest_table_ref)
        table.description = table_description
        table = bigquery_client.update_table(table, ["description"])
        assert table.description == table_description
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = bigquery.QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

        merge_sql_template = get_merge_table_sql_template(sqls_folder)
        merge_template_context = template_context.copy()
        merge_template_context['params']['source_table'] = temp_table_name
        merge_template_context['params']['destination_dataset_project_id'] = destination_project_id
        merge_template_context['params']['destination_dataset_name'] = dataset_name
        merge_template_context['params']['dataset_name_temp'] = dataset_name_temp
        merge_template_context['params']['columns'] = columns
        merge_sql = airflow_task.render_template('', merge_sql_template, merge_template_context)
        print('Merge sql:')
        print(merge_sql)
        merge_job = bigquery_client.query(merge_sql, location='US', job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    bigquery_client.delete_table(temp_table_ref)
def enrich_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = bigquery.Client()

    # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
    # when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)

    schema_path = os.path.join(
        dags_folder, 'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table = bigquery.Table(temp_table_ref, schema=schema)

    description_path = os.path.join(
        dags_folder, 'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = bigquery.QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref

    sql_path = os.path.join(
        dags_folder, 'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    print('Enrichment sql:')
    print(sql)

    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    if load_all_partitions or always_load_all_partitions:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dest_table_name = '{task}'.format(task=task)
        dest_table_ref = client.dataset(
            dataset_name, project=destination_dataset_project_id).table(dest_table_name)
        copy_job = client.copy_table(
            temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = bigquery.QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

        merge_sql_path = os.path.join(
            dags_folder, 'resources/stages/enrich/sqls/merge/merge_{task}.sql'.format(task=task))
        merge_sql_template = read_file(merge_sql_path)

        merge_template_context = template_context.copy()
        merge_template_context['params']['source_table'] = temp_table_name
        merge_template_context['params']['destination_dataset_project_id'] = destination_dataset_project_id
        merge_template_context['params']['destination_dataset_name'] = dataset_name
        merge_sql = kwargs['task'].render_template('', merge_sql_template, merge_template_context)
        print('Merge sql:')
        print(merge_sql)

        merge_job = client.query(merge_sql, location='US', job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
def create_or_update_history_table(
        bigquery_client, dataset_name, history_table_name, table_definition, ds,
        public_project_id, public_dataset_name, internal_project_id,
        destination_project_id, sqls_folder, parse_all_partitions,
        time_func=time.time):
    table_name = table_definition['table']['table_name']
    schema = table_definition['table']['schema']
    parser_type = table_definition['parser'].get('type', 'log')

    schema = read_bigquery_schema_from_dict(schema, parser_type)

    # # # Create a temporary table

    dataset_name_temp = 'parse_temp'
    create_dataset(bigquery_client, dataset_name_temp)
    temp_table_name = 'temp_{table_name}_{milliseconds}' \
        .format(table_name=table_name, milliseconds=int(round(time_func() * 1000)))
    temp_table_ref = bigquery_client.dataset(dataset_name_temp).table(temp_table_name)

    temp_table = bigquery.Table(temp_table_ref, schema=schema)
    table_description = table_definition['table']['table_description']
    temp_table.description = table_description
    temp_table.time_partitioning = bigquery.TimePartitioning(field='block_timestamp')
    logging.info('Creating table: ' + json.dumps(temp_table.to_api_repr()))
    temp_table = bigquery_client.create_table(temp_table)
    assert temp_table.table_id == temp_table_name

    # # # Query to temporary table

    udf_name = 'parse_{}'.format(table_name)

    selector = abi_to_selector(parser_type, table_definition['parser']['abi'])

    parse_mode = get_parse_mode(HistoryType.HISTORY, parse_all_partitions=parse_all_partitions)
    full_source_table_name = get_source_table(
        parser_type=parser_type,
        parse_mode=parse_mode,
        ds=ds,
        internal_project_id=internal_project_id,
        public_project_id=public_project_id,
        public_dataset_name=public_dataset_name,
        selector=selector)

    sql = generate_parse_sql_template(
        sqls_folder,
        parser_type,
        parse_mode,
        full_source_table_name=full_source_table_name,
        selector=selector,
        internal_project_id=internal_project_id,
        destination_project_id=destination_project_id,
        dataset_name=dataset_name,
        udf_name=udf_name,
        table_definition=table_definition,
        parse_all_partitions=parse_all_partitions,
        ds=ds)
    query(bigquery_client, sql, destination=temp_table_ref)

    # # # Copy / merge to destination

    if parse_all_partitions:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dataset = create_dataset(bigquery_client, dataset_name, internal_project_id)
        dest_table_ref = dataset.table(history_table_name)
        copy_job = bigquery_client.copy_table(
            temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'

        # Need to do update description as copy above won't respect the description in case
        # destination table already exists
        table = bigquery_client.get_table(dest_table_ref)
        table.description = table_description
        table = bigquery_client.update_table(table, ["description"])
        assert table.description == table_description
    else:
        merge_sql = render_merge_template(
            sqls_folder,
            table_schema=schema,
            internal_project_id=internal_project_id,
            dataset_name=dataset_name,
            destination_table_name=history_table_name,
            dataset_name_temp=dataset_name_temp,
            source_table=temp_table_name,
            ds=ds)
        query(bigquery_client, merge_sql)

    # Delete temp table
    bigquery_client.delete_table(temp_table_ref)
def main(submission_date, dst_table, project, dataset):
    """Run query per app_version."""
    bq_client = bigquery.Client(project=project)
    app_versions = [
        row["app_version"]
        for row in bq_client.query(
            VERSION_QUERY_TEMPLATE.format(
                date=submission_date, project=project, dataset=dataset
            )
        ).result()
    ]
    print(f"Found versions: {app_versions}")

    if len(app_versions) == 0:
        print("Source table empty", file=sys.stderr)
        sys.exit(1)

    sql_path = SQL_BASE_DIR / dst_table / "query.sql"
    query_text = sql_path.read_text()

    # Write to intermediate table to avoid partial writes to destination table
    intermediate_table = f"{project}.analysis.glam_temp_clustered_query_{dst_table}"
    print(f"Writing results to {intermediate_table}")

    for i, app_version in enumerate(app_versions):
        print(f"Querying for app_version {app_version}")
        query_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter(
                    "submission_date", "DATE", str(submission_date)
                ),
                bigquery.ScalarQueryParameter("app_version", "INT64", app_version),
            ],
            clustering_fields=["metric", "channel"],
            destination=intermediate_table,
            default_dataset=f"{project}.{dataset}",
            write_disposition=(
                bigquery.WriteDisposition.WRITE_TRUNCATE
                if i == 0
                else bigquery.WriteDisposition.WRITE_APPEND
            ),
        )
        query_job = bq_client.query(query_text, job_config=query_config)

        # Periodically print so the airflow gke operator doesn't think the task is dead
        elapsed = 0
        while not query_job.done():
            time.sleep(10)
            elapsed += 10
            if elapsed % 200 == 10:
                print("Waiting on query...")
        print(f"Total elapsed: approximately {elapsed} seconds")

        results = query_job.result()
        print(f"Query job {query_job.job_id} finished")
        print(f"{results.total_rows} rows in {intermediate_table}")

    copy_config = bigquery.CopyJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    print(f"Copying {intermediate_table} to {project}.{dataset}.{dst_table}")
    bq_client.copy_table(
        intermediate_table,
        f"{project}.{dataset}.{dst_table}",
        job_config=copy_config,
    ).result()

    print(f"Deleting {intermediate_table}")
    bq_client.delete_table(intermediate_table)
def parse_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment
    template_context['params']['table_name'] = table_name
    template_context['params']['columns'] = columns
    template_context['params']['parser'] = parser
    template_context['params']['abi'] = abi
    if parser_type == 'log':
        template_context['params']['event_topic'] = abi_to_event_topic(parser['abi'])
    elif parser_type == 'trace':
        template_context['params']['method_selector'] = abi_to_method_selector(parser['abi'])
    template_context['params']['struct_fields'] = create_struct_string_from_schema(schema)
    template_context['params']['parse_all_partitions'] = parse_all_partitions

    client = bigquery.Client()

    # # # Create a temporary table

    dataset_name_temp = 'parse_temp'
    create_dataset(client, dataset_name_temp)
    temp_table_name = 'temp_{table_name}_{milliseconds}' \
        .format(table_name=table_name, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)

    temp_table = bigquery.Table(
        temp_table_ref, schema=read_bigquery_schema_from_dict(schema, parser_type))
    temp_table.description = table_description
    temp_table.time_partitioning = TimePartitioning(field='block_timestamp')
    logging.info('Creating table: ' + json.dumps(temp_table.to_api_repr()))
    temp_table = client.create_table(temp_table)
    assert temp_table.table_id == temp_table_name

    # # # Query to temporary table

    job_config = bigquery.QueryJobConfig()
    job_config.priority = bigquery.QueryPriority.INTERACTIVE
    job_config.destination = temp_table_ref
    sql_template = get_parse_sql_template(parser_type)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    logging.info(sql)
    query_job = client.query(sql, location='US', job_config=job_config)
    submit_bigquery_job(query_job, job_config)
    assert query_job.state == 'DONE'

    # # # Copy / merge to destination

    if parse_all_partitions:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dest_table_ref = client.dataset(
            dataset_name, project=parse_destination_dataset_project_id).table(table_name)
        copy_job = client.copy_table(
            temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'

        # Need to update the description as the copy above won't respect the description
        # in case the destination table already exists
        table = client.get_table(dest_table_ref)
        table.description = table_description
        table = client.update_table(table, ["description"])
        assert table.description == table_description
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = bigquery.QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

        merge_sql_template = get_merge_table_sql_template()
        merge_template_context = template_context.copy()
        merge_template_context['params']['source_table'] = temp_table_name
        merge_template_context['params']['destination_dataset_project_id'] = parse_destination_dataset_project_id
        merge_template_context['params']['destination_dataset_name'] = dataset_name
        merge_template_context['params']['dataset_name_temp'] = dataset_name_temp
        merge_template_context['params']['columns'] = columns
        merge_sql = kwargs['task'].render_template('', merge_sql_template, merge_template_context)
        print('Merge sql:')
        print(merge_sql)
        merge_job = client.query(merge_sql, location='US', job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
def enrich_task():
    client = bigquery.Client()

    # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
    # when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)

    schema_path = os.path.join(
        dags_folder, 'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table = bigquery.Table(temp_table_ref, schema=schema)

    description_path = os.path.join(
        dags_folder, 'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = bigquery.QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref

    sql_path = os.path.join(
        dags_folder, 'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path)

    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    # Copy temporary table to destination
    copy_job_config = bigquery.CopyJobConfig()
    copy_job_config.write_disposition = 'WRITE_TRUNCATE'

    all_destination_projects = [(destination_dataset_project_id, dataset_name)]
    if copy_dataset_project_id is not None and len(copy_dataset_project_id) > 0 \
            and copy_dataset_name is not None and len(copy_dataset_name) > 0:
        all_destination_projects.append(
            (copy_dataset_project_id, copy_dataset_name))

    for dest_project, dest_dataset_name in all_destination_projects:
        dest_table_name = '{task}'.format(task=task)
        dest_table_ref = client.dataset(
            dest_dataset_name, project=dest_project).table(dest_table_name)
        copy_job = client.copy_table(
            temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
number = "_" + str(random.randint(1, 21) * 5) temp_table_name = sys.argv[5] + number print("Creating temporary table {}".format(temp_table_name)) temp_table_ref = dataset.table(temp_table_name) temp_table = bigquery.Table(temp_table_ref, schema=schema) temp_table = client.create_table(temp_table) print("Running {}".format(sys.argv[4].format(temp_table_name))) query_job = client.query(sys.argv[4].format(temp_table_name)) iterator = query_job.result() print(query_job.state) print("Deleting {}".format(sys.argv[5])) client.delete_table(table) table = None print("Creating empty table {}".format(sys.argv[5])) table_ref = dataset.table(sys.argv[5]) table = bigquery.Table(table_ref, schema=schema) table = client.create_table(table) print("Copying {} to {}".format(temp_table_name, sys.argv[5])) source_table_ref = temp_table_ref job_config = bigquery.CopyJobConfig() job = client.copy_table(temp_table_ref, table_ref, job_config=job_config) # API request job.result() # Waits for job to complete. print("Deleting {}".format(temp_table_name)) client.delete_table(temp_table)
def resize(self):
    """
    This is the execute function of this class. It copies the source
    table into the destination table and then copies the destination
    table into itself until it reaches or exceeds the target_rows.
    """
    # How many rows short of our target are we?
    gap = self.target_rows - self.source_table.num_rows

    while gap > 0:  # Copy until we've reached or exceeded target_rows
        # API requests to get the latest table info.
        source_table = self.client.get_table(self.source_table)
        try:
            dest_table = self.client.get_table(self.dest_table_ref)
        except NotFound:
            dest_table = self.client.create_table(
                bigquery.Table(self.dest_table_ref))

        # Get the latest size of the dest_table.
        # Note that for the first call these properties are None.
        dest_rows = dest_table.num_rows
        dest_bytes = dest_table.num_bytes
        dest_gb = (dest_bytes or 0) / float(1024 ** 3)

        # Recalculate the gap.
        if dest_rows:
            gap = self.target_rows - dest_rows
        else:
            gap = self.target_rows

        print('{} rows in table of size {} GB, with a target of {}, '
              'leaving a gap of {}'.format(dest_rows, round(dest_gb, 2),
                                           self.target_rows, gap))

        # Greedily copy the largest of dest_table and source_table into
        # dest_table without going over the target rows. The last query
        # will be a subset of source_table via a limit query.
        if gap < source_table.num_rows:
            # This will be the last copy operation if target_rows is
            # not a power of 2 times the number of rows originally in the
            # source table. It is not a full copy.
            job_config = bigquery.QueryJobConfig()
            # Set the destination table
            job_config.destination = self.dest_table_ref
            job_config.write_disposition = 'WRITE_APPEND'
            job_config.allow_large_results = True

            sql = """
                SELECT *
                FROM `{}.{}.{}`
                LIMIT {}
            """.format(self.project, self.source_table.dataset_id,
                       self.source_table.table_id, gap)

            # API request to BigQuery with query and config defined above.
            query_job = self.client.query(
                sql,
                # Location must match that of the dataset(s) referenced in
                # the query and of the destination table.
                location=self.location,
                job_config=job_config)
            # Wait for query_job to finish.
            query_job.result()
        else:
            if source_table.num_rows < dest_table.num_rows < gap:
                use_as_source_table = self.dest_table_ref
            else:  # source_table.num_rows < gap < dest_table.num_rows
                use_as_source_table = self.source_table.reference
            copy_config = bigquery.CopyJobConfig()
            copy_config.write_disposition = 'WRITE_APPEND'
            # Create the destination table if it doesn't exist yet.
            copy_config.create_disposition = 'CREATE_IF_NEEDED'
            copy_job = self.client.copy_table(use_as_source_table,
                                              self.dest_table_ref,
                                              job_config=copy_config)
            # Wait for copy_job to finish.
            copy_job.result()