def main():
    import json

    from google.cloud.bigquery import Client

    tables = {}
    with open("/vol/required_tables.txt") as rt:
        table_names = rt.read().split()
    bq_client = Client()
    for table_name in table_names:
        split_table_name = table_name.split(".")
        if len(split_table_name) == 3:
            # Fully-qualified name: project.dataset.table
            dataset_ref = bq_client.dataset(split_table_name[1],
                                            project=split_table_name[0])
        else:
            dataset_ref = bq_client.dataset(split_table_name[0])
        table_ref = dataset_ref.table(split_table_name[-1])
        table = bq_client.get_table(table_ref)
        tables[table_name] = [field.to_api_repr() for field in table.schema]
        if table_name.endswith("*"):
            # Wildcard tables expose the _TABLE_SUFFIX pseudo-column.
            tables[table_name].append({
                "name": "_TABLE_SUFFIX",
                "type": "STRING",
                "mode": "REQUIRED"
            })
    with open("/vol/schema.json", mode="w") as schema:
        schema.write(json.dumps(tables))

def _recreate_table(client: bigquery.Client, table_id: str, schema: list) -> None:
    logger.info(f'Recreating {table_id} table...')
    dataset_ref = client.dataset(DATASET_ID)
    table_ref = dataset_ref.table(table_id)
    table = bigquery.Table(table_ref, schema=schema)
    client.delete_table(table_ref, not_found_ok=True)
    client.create_table(table)

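# A minimal usage sketch for _recreate_table. The table name and schema below
# are illustrative assumptions, and the function still relies on the
# module-level `logger` and `DATASET_ID` defined elsewhere:
def _example_recreate_table():
    client = bigquery.Client()
    schema = [
        bigquery.SchemaField("id", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("created_at", "TIMESTAMP"),
    ]
    # Drops the table if it exists (not_found_ok avoids an error otherwise),
    # then recreates it empty with the new schema.
    _recreate_table(client, "events", schema)
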
def main():
    args = get_sysargs(sys.argv[1:])
    exporter = Exporter()
    jinjafy = Jinjafy('../data/queries/marreco/datajet/')
    client = Client()
    dataset = client.dataset(args.dataset)
    table = dataset.table(args.table)
    for day in range(args.days_init, args.days_end - 1, -1):
        print('processing day: ', day)
        for idx, file_ in enumerate(['productview.sql', 'search.sql', 'purchase.sql']):
            query = build_query(jinjafy, file_,
                                {'dataset': '40663402',
                                 'days_interval': day,
                                 'days_interval_end': day})
            exporter.bq_to_gcs(
                client,
                query,
                {'destination': table,
                 'maximum_bytes_billed': 1000000000000,
                 'write_disposition': 'WRITE_TRUNCATE'},
                {'uri': args.uri.format(day=day, idx=idx),
                 'table': table,
                 'compression': 'GZIP',
                 'destination_format': 'NEWLINE_DELIMITED_JSON'})

def create_view_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()
    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    table = Table(dest_table_ref)

    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    table.view_query = sql

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)

    logging.info('Creating view: ' + json.dumps(table.to_api_repr()))
    try:
        table = client.create_table(table)
    except Conflict:
        # https://cloud.google.com/bigquery/docs/managing-views
        table = client.update_table(table, ['view_query'])
    assert table.table_id == dest_table_name

def delete_views_or_table(client: bigquery.Client, view_or_table_name: str, dataset: str):
    LOGGER.debug("delete_views_or_table: %s", view_or_table_name)
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(view_or_table_name)
    client.delete_table(table_ref)
    LOGGER.info("deleted view or table: %s", view_or_table_name)

def update_or_create_table_from_csv(
        client: bigquery.Client,
        table_name: str,
        source_file: str,
        dataset: str,
        source_schema_file: str,
):
    LOGGER.debug("update_or_create_table_from_csv: %s=%s", table_name, [source_file])
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(table_name)

    job_config = LoadJobConfig()
    job_config.source_format = "CSV"
    job_config.skip_leading_rows = 1
    if Path(source_schema_file).exists():
        job_config.schema = get_table_schema(source_schema_file)
    else:
        job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    with open(source_file, "rb") as source_fp:
        load_job = client.load_table_from_file(
            source_fp, destination=table_ref, job_config=job_config)

    # wait for job to complete
    load_job.result()
    LOGGER.info("updated config table: %s", table_ref.table_id)

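# A usage sketch for update_or_create_table_from_csv; the paths and names are
# hypothetical. If the schema file does not exist, the load falls back to
# schema autodetection, and WRITE_TRUNCATE replaces any previous contents:
def _example_load_csv():
    client = bigquery.Client()
    update_or_create_table_from_csv(
        client,
        table_name="config",
        source_file="data/config.csv",
        dataset="analytics",
        source_schema_file="data/config.schema.json",
    )
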
def create_view_task():
    client = Client()
    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    table = Table(dest_table_ref)

    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    table.view_query = sql

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)

    logging.info('Creating view: ' + json.dumps(table.to_api_repr()))
    try:
        table = client.create_table(table)
    except Conflict:
        # https://cloud.google.com/bigquery/docs/managing-views
        table = client.update_table(table, ['view_query'])
    assert table.table_id == dest_table_name

def load_task():
    client = Client()
    job_config = LoadJobConfig()
    schema_path = os.path.join(
        dags_folder,
        'resources/stages/raw/schemas/{task}.json'.format(task=task))
    job_config.schema = read_bigquery_schema_from_file(schema_path)
    job_config.source_format = SourceFormat.CSV if file_format == 'csv' \
        else SourceFormat.NEWLINE_DELIMITED_JSON
    if file_format == 'csv':
        job_config.skip_leading_rows = 1
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.allow_quoted_newlines = allow_quoted_newlines
    job_config.ignore_unknown_values = True

    export_location_uri = 'gs://{bucket}/export'.format(bucket=output_bucket)
    uri = '{export_location_uri}/{task}/*.{file_format}'.format(
        export_location_uri=export_location_uri,
        task=task,
        file_format=file_format)
    table_ref = client.dataset(dataset_name_raw).table(task)
    load_job = client.load_table_from_uri(uri, table_ref, job_config=job_config)
    submit_bigquery_job(load_job, job_config)
    assert load_job.state == 'DONE'

def does_bigquery_table_exist(client: bigquery.Client, dataset_name: str, table_name: str):
    dataset_ref = client.dataset(dataset_name)
    table_ref = dataset_ref.table(table_name)
    try:
        client.get_table(table_ref)
        return True
    except NotFound:
        return False

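# The existence check above costs one tables.get API call per invocation.
# A usage sketch with hypothetical names:
def _example_check_table():
    client = bigquery.Client()
    if does_bigquery_table_exist(client, "analytics", "events"):
        print("events table exists")
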
@contextlib.contextmanager
def dataset(bq: bigquery.Client, dataset_id: str):
    """Context manager for creating and deleting the BigQuery dataset for a test."""
    try:
        bq.get_dataset(dataset_id)
    except NotFound:
        bq.create_dataset(dataset_id)
    try:
        yield bq.dataset(dataset_id)
    finally:
        # Always clean up the dataset, even if the test body raised.
        bq.delete_dataset(dataset_id, delete_contents=True)

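# A usage sketch for the dataset() context manager (names hypothetical): the
# dataset is created on entry if missing and deleted with its contents on
# exit, whether or not the block raises:
def _example_dataset_lifecycle():
    bq = bigquery.Client()
    with dataset(bq, "tmp_test_dataset") as dataset_ref:
        table_ref = dataset_ref.table("scratch")
        bq.create_table(bigquery.Table(
            table_ref, schema=[bigquery.SchemaField("id", "STRING")]))
    # Leaving the block deletes tmp_test_dataset, including the scratch table.
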
def get_temporary_dataset(client: bigquery.Client):
    """Get a cached reference to the dataset used for server-assigned destinations."""
    global temporary_dataset
    if temporary_dataset is None:
        # look up the dataset used for query results without a destination
        dry_run = bigquery.QueryJobConfig(dry_run=True)
        destination = client.query("SELECT NULL", dry_run).destination
        temporary_dataset = client.dataset(destination.dataset_id, destination.project)
    return temporary_dataset

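# The dry run above is enough for the server to report the anonymous dataset
# it uses for destination-less query results. A sketch of building a
# destination reference in it (the table name is a hypothetical placeholder):
def _example_temporary_destination():
    client = bigquery.Client()
    temp_dataset = get_temporary_dataset(client)
    return temp_dataset.table("anon_example_result")
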
def get_bq_dataset(
        client: bigquery.Client,
        dataset_id: str,
        project_id: str = None,
) -> bigquery.Dataset:
    # If `project_id` is None then the default project of `client` will be used.
    dataset_ref = client.dataset(
        dataset_id, project=project_id)  # type: bigquery.DatasetReference
    # API request
    return client.get_dataset(dataset_ref)  # type: bigquery.Dataset

def does_table_exist(bigquery_client: bigquery.Client, table: str, dataset: str = 'analytics') -> bool:
    """Check if the given table exists in the given BigQuery dataset; return True if so."""
    try:
        table_reference = bigquery_client.dataset(dataset).table(table)
        # get_table raises NotFound rather than returning a falsy value.
        bigquery_client.get_table(table_reference)
        logging.info('Table "{}" in Dataset "{}" already exists in BigQuery.'.format(table, dataset))
        return True
    except NotFound as error:
        logging.warning('Table "{}" does not exist in BigQuery Dataset "{}". Ref: {}.'.format(table, dataset, error))
        return False

def get_bq_table(
        client: bigquery.Client,
        dataset_id: str,
        table_id: str,
        project_id: str = None,
) -> bigquery.Table:
    # If `project_id` is None then the default project of `client` will be used.
    table_ref = client.dataset(dataset_id, project=project_id).table(
        table_id)  # type: bigquery.TableReference
    # API request
    return client.get_table(table_ref)  # type: bigquery.Table

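# get_bq_dataset and get_bq_table build references locally and only touch the
# API on the final get_* call. A usage sketch with hypothetical names:
def _example_inspect_table():
    client = bigquery.Client()
    table = get_bq_table(client, "analytics", "events")
    for field in table.schema:
        print(field.name, field.field_type, field.mode)
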
def get_bigquery_table():
    log = logging.getLogger('get_bigquery_table')
    log.info("get client")
    # Create a client, and get the table that we are interested in:
    bigquery_client = BQClient(project="pocketreplacement")
    log.info("get dataset")
    dataset_name = 'RedditData'
    dataset = bigquery_client.dataset(dataset_name)
    log.info("get table")
    table = dataset.table("subset")
    table.reload()
    return table

def loadjob_with_thread(project_name, dataset_name, table_name, num_thread, sleep_time=None):
    tlist = []
    for _ in range(num_thread):
        client = Client(project_name)
        dataset_ref = Dataset(client.dataset(dataset_name))
        tw = ThreadWrapper(loadjob_infinite,
                           [client, dataset_ref, table_name, sleep_time])
        tw.start()
        tlist.append(tw)
    while len(tlist) > 0:
        # Thread.isAlive() was removed in Python 3.9; is_alive() is the
        # supported spelling.
        tlist = [t for t in tlist if t.is_alive()]
        time.sleep(1)

def delete_table(bq_client: bigquery.Client, dataset_id: str, table_name: str) -> None:
    """Deletes a specified table in BigQuery.

    Args:
        bq_client: bigquery.Client object.
        dataset_id: String holding ID of dataset.
        table_name: String of table name to delete.

    Returns:
        None; deletes a table in BigQuery.
    """
    dataset_ref = bq_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_name)
    bq_client.delete_table(table=table_ref)

def update_or_create_view(client: bigquery.Client, view_name: str, view_query: str, dataset: str):
    LOGGER.debug("update_view: %s=%s", view_name, [view_query])
    dataset_ref = client.dataset(dataset)
    view_ref = dataset_ref.table(view_name)
    view = bigquery.Table(view_ref)
    view.view_query = view_query
    query_job = client.query(get_create_or_replace_view_query(view))
    query_job.result()  # wait for query job to finish
    updated_view = client.get_table(view)
    LOGGER.info("updated or replaced view: %s", updated_view.full_table_id)
    LOGGER.debug("view schema (%s): %s", updated_view.full_table_id, updated_view.schema)

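# update_or_create_view goes through a CREATE OR REPLACE VIEW statement rather
# than the tables API, which makes it idempotent. The real
# get_create_or_replace_view_query is defined elsewhere; a plausible sketch of
# it, for illustration only:
def _sketch_create_or_replace_view_query(view: bigquery.Table) -> str:
    return 'CREATE OR REPLACE VIEW `{}.{}.{}` AS {}'.format(
        view.project, view.dataset_id, view.table_id, view.view_query)
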
def load_bigquery_table_via_bq_apis(bq_client: bigquery.Client, dataset_id,
                                    table_name, imported_data_info, src_uris):
    """
    Load tables using BigQuery Load jobs, using the same configuration as
    BQ DTS ImportedDataInfo
    :return:
    """
    # https://googlecloudplatform.github.io/google-cloud-python/latest/_modules/google/cloud/bigquery/client.html#Client.load_table_from_uri
    # Step 1 - Translate required fields for BigQuery Python SDK
    tgt_tabledef = imported_data_info['table_defs'][0]

    # Step 2 - Create target table if it doesn't exist
    dataset_ref = bq_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_name)
    try:
        bq_client.get_table(table_ref)
    except exceptions.NotFound:
        # Step 2a - Attach schema
        tgt_schema = RPCRecordSchema_to_GCloudSchema(tgt_tabledef['schema'])
        tgt_table = bigquery.Table(table_ref, schema=tgt_schema)

        # Step 2b - Attach description
        tgt_table.description = imported_data_info['destination_table_description']

        # Step 2c - Conditionally set partitioning type
        if '$' in table_name:
            tgt_table.partitioning_type = 'DAY'
            # Strip the '$YYYYMMDD' partition decorator from the table id.
            tgt_table._properties['tableReference']['tableId'], _, _ = \
                table_name.partition('$')

        # Step 2d - Create BigQuery table
        bq_client.create_table(tgt_table)

    # Step 3a - Create BigQuery Load Job ID
    current_datetime = datetime.datetime.utcnow().isoformat()
    raw_job_id = f'{table_name}_{current_datetime}'
    clean_job_id = BQ_JOB_ID_MATCHER.sub('___', raw_job_id)

    # Step 3b - Create BigQuery Job Config
    job_config = DTSTableDefinition_to_BQLoadJobConfig(tgt_tabledef)

    # Step 4 - Execute BigQuery Load Job using Python SDK
    load_job = bq_client.load_table_from_uri(source_uris=src_uris,
                                             destination=table_ref,
                                             job_id=clean_job_id,
                                             job_config=job_config)
    return load_job

def list_tables(client: bigquery.Client, dataset_id: str):
    """
    Lists the tables in project:dataset

    Args:
        client: BQ API client
        dataset_id: dataset to be inspected

    Returns:
        list

    Examples:
        list_tables(client, 'my_dataset')
    """
    dataset_ref = client.dataset(dataset_id)
    return [t.table_id for t in client.list_tables(dataset_ref)]

def _main():
    args = _parse_args()
    _client = Client(args.project)
    _dataset_ref = Dataset(_client.dataset(args.dataset))
    _table = args.table
    if args.cmd == 'create-table':
        create_table(_client, _dataset_ref, _table, args.partitioned)
    elif args.cmd == 'drop-table':
        drop_table(_client, _dataset_ref, _table)
    elif args.cmd == 'loadjob-one':
        loadjob_one(_client, _dataset_ref, _table)
    elif args.cmd == 'loadjob-with-thread':
        loadjob_with_thread(args.project, args.dataset, _table,
                            args.num_thread, args.sleep)

def delete_dataset(client: bigquery.Client,
                   dataset_id: str,
                   delete_contents: bool = True,
                   ):
    """
    Deletes dataset from the current project

    Args:
        client: BQ API client
        dataset_id: dataset to be deleted
        delete_contents: also delete the dataset's tables (default is True)

    Returns:
        None; deletes the dataset
    """
    # TODO: ADD CHECK OF EXISTENCE
    dataset_ref = client.dataset(dataset_id)
    client.delete_dataset(dataset_ref, delete_contents)

def create_bq_dataset(
        client: bigquery.Client,
        dataset_id: str,
        dataset_description: str = None,
) -> bigquery.Dataset:
    """ Create empty dataset. """
    # TODO: validate 'dataset_id'.
    # > Dataset IDs must be alphanumeric (plus underscores) and must be at most 1024 chars long.
    # note: the dual instantiation of a 'Dataset' object is not intuitive.
    dataset = bigquery.Dataset(client.dataset(dataset_id))  # type: bigquery.Dataset
    dataset.description = dataset_description
    # API request
    return client.create_dataset(dataset)  # type: bigquery.Dataset

def create_dataset(client: bigquery.Client,
                   dataset_id: str,
                   location: str = 'EU',
                   description: str = None):
    """
    Creates a dataset with the following reference: project_id:dataset_id

    Args:
        client: BQ API client
        dataset_id: dataset to be created
        location: location of the dataset (default is Europe for legal reasons)
        description: description of the dataset (default is date of creation)

    Returns:
        None; creates the dataset
    """
    # TODO: ADD CHECK OF EXISTENCE
    if description is None:
        # Build the timestamp per call; as a parameter default it would be
        # frozen at import time.
        description = "Creation date: {}".format(datetime.datetime.now())
    dataset_ref = client.dataset(dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = location
    dataset.description = description
    client.create_dataset(dataset)

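# A usage sketch pairing create_dataset and delete_dataset (the dataset name
# is hypothetical); note both still lack the existence checks flagged in
# their TODOs:
def _example_dataset_roundtrip():
    client = bigquery.Client()
    create_dataset(client, "scratch_eu", location="EU")
    # ... create tables and run queries against scratch_eu ...
    delete_dataset(client, "scratch_eu", delete_contents=True)
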
def delete_table(client: bigquery.Client, dataset_id: str, table_id: str):
    """
    Deletes the specified table in the given project:dataset

    Args:
        client: BQ API client
        dataset_id: destination dataset
        table_id: table to be deleted

    Examples:
        delete_table(client, 'my_dataset', 'my_table')
    """
    dataset_ref = client.dataset(dataset_id=dataset_id)
    tables_list = [t.table_id for t in client.list_tables(dataset_ref)]
    if table_id not in tables_list:
        print("THIS TABLE DOES NOT EXIST IN {}:{}".format(client.project, dataset_id))
    else:
        table_ref = dataset_ref.table(table_id)
        client.delete_table(table_ref)

def create_table(client: bigquery.Client, dataset_id: str, table_id: str, schema: list):
    """
    Creates a table according to the given schema in the specified project:dataset

    Args:
        client: BQ API client
        dataset_id: destination dataset
        table_id: table to be created
        schema: schema of the table to be created

    Examples:
        create_table(client, 'my_dataset', 'my_table', my_schema)
    """
    dataset_ref = client.dataset(dataset_id=dataset_id)
    tables_list = [t.table_id for t in client.list_tables(dataset_ref)]
    if table_id in tables_list:
        print("THIS TABLE ALREADY EXISTS IN {}:{}".format(client.project, dataset_id))
    else:
        table_ref = dataset_ref.table(table_id)
        client.create_table(bigquery.Table(table_ref, schema=schema))

def materialize_view(  # pylint: disable=too-many-arguments, too-many-locals
        client: bigquery.Client,
        source_view_name: str,
        destination_table_name: str,
        project: str,
        source_dataset: str,
        destination_dataset: str,
) -> 'MaterializeViewResult':
    query = get_select_all_from_query(source_view_name, project=project, dataset=source_dataset)
    LOGGER.info(
        "materializing view: %s.%s -> %s.%s",
        source_dataset, source_view_name,
        destination_dataset, destination_table_name)
    LOGGER.debug("materialize_view: %s=%s", destination_table_name, [query])

    start = time.perf_counter()
    dataset_ref = client.dataset(destination_dataset)
    destination_table_ref = dataset_ref.table(destination_table_name)

    job_config = QueryJobConfig()
    job_config.destination = destination_table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    query_job = client.query(query, job_config=job_config)
    # getting the result will make sure that the query ran successfully
    result: bigquery.table.RowIterator = query_job.result()
    duration = time.perf_counter() - start

    total_bytes_processed = query_job.total_bytes_processed
    LOGGER.info(
        'materialized view: %s.%s, total rows: %s, %s bytes processed, took: %.3fs',
        source_dataset, source_view_name, result.total_rows,
        total_bytes_processed, duration)
    if LOGGER.isEnabledFor(logging.DEBUG):
        sample_result = list(islice(result, 3))
        LOGGER.debug("sample_result: %s", sample_result)
    return MaterializeViewResult(total_bytes_processed=total_bytes_processed,
                                 total_rows=result.total_rows)

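# A usage sketch for materialize_view; names are hypothetical, and it assumes
# MaterializeViewResult exposes its fields as attributes. WRITE_TRUNCATE means
# reruns simply replace the destination table:
def _example_materialize():
    client = bigquery.Client()
    result = materialize_view(
        client,
        source_view_name="daily_summary_view",
        destination_table_name="daily_summary",
        project="my-project",
        source_dataset="views",
        destination_dataset="materialized",
    )
    print(result.total_rows, result.total_bytes_processed)
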
def enrich_task():
    client = Client()

    # Need to use a temporary table because bq query sets field modes to
    # NULLABLE and descriptions to null when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    # Copy temporary table to destination
    copy_job_config = CopyJobConfig()
    copy_job_config.write_disposition = 'WRITE_TRUNCATE'

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    copy_job = client.copy_table(temp_table_ref, dest_table_ref,
                                 location='US', job_config=copy_job_config)
    submit_bigquery_job(copy_job, copy_job_config)
    assert copy_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)

def get_bq_view_query(client: bigquery.Client, view_name: str, dataset: str):
    dataset_ref = client.dataset(dataset)
    view_ref = dataset_ref.table(view_name)
    view = client.get_table(view_ref)
    return view.view_query

def enrich_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()

    # Need to use a temporary table because bq query sets field modes to
    # NULLABLE and descriptions to null when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref

    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    print('Enrichment sql:')
    print(sql)

    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    if load_all_partitions:
        # Copy temporary table to destination
        copy_job_config = CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dest_table_name = '{task}'.format(task=task)
        dest_table_ref = client.dataset(
            dataset_name,
            project=destination_dataset_project_id).table(dest_table_name)
        copy_job = client.copy_table(temp_table_ref, dest_table_ref,
                                     location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = QueryPriority.INTERACTIVE

        merge_sql_path = os.path.join(
            dags_folder,
            'resources/stages/enrich/sqls/merge_{task}.sql'.format(task=task))
        merge_sql_template = read_file(merge_sql_path)
        template_context['params']['source_table'] = temp_table_name
        merge_sql = kwargs['task'].render_template('', merge_sql_template, template_context)
        print('Merge sql:')
        print(merge_sql)

        merge_job = client.query(merge_sql, location='US', job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)