def create_members_table(apps, schema_editor):
    dataset_ref = client.dataset(settings.BIGQUERY_DATASET_ID)
    # Only define fields that are required and not strings (because
    # non-required strings will be populated automatically)
    schema = [
        bigquery.SchemaField('date', 'DATE', mode='REQUIRED'),
        bigquery.SchemaField('id', 'INT64', mode='REQUIRED'),
        bigquery.SchemaField('joined', 'TIMESTAMP'),
        bigquery.SchemaField('lat', 'FLOAT64'),
        bigquery.SchemaField('lon', 'FLOAT64'),
        bigquery.SchemaField('group_id', 'INT64', mode='REQUIRED'),
        bigquery.SchemaField('group_urlname', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('is_pro_admin', 'BOOL'),
    ]
    table_ref = dataset_ref.table('members')
    table = bigquery.Table(table_ref, schema=schema)
    # define partitioning by date
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY, field='date')
    # define clustering by group urlname
    table.clustering_fields = ['group_urlname']
    client.create_table(table)
def load_data_to_bq(self):
    """Loads or initializes the load job in BQ."""
    # for newest data
    bigquery_client = bigquery.Client(CONFIG['project'])
    # print(bigquery_client)
    destination_dataset_ref = bigquery_client.dataset(CONFIG['dataset'])
    destination_table_ref = destination_dataset_ref.table(
        self.table_destination + '$' + self.date_nodash)

    job_config = bigquery.LoadJobConfig()
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    # using partition by ingestion time
    job_config.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY)

    with open(self.path, 'rb') as f:
        job = bigquery_client.load_table_from_file(f, destination_table_ref,
                                                   job_config=job_config)
        job.result()

    print('----->>>> ' + self.path + ' loaded successfully!')
    os.remove(self.path)
def dest_partitioned_table(request, bq: bigquery.Client, mock_env,
                           dest_dataset) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    table: bigquery.Table = bigquery.Table(
        f"{os.environ.get('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-', '_')}",
        schema=schema,
    )
    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"

    table = bq.create_table(table)

    def teardown():
        bq.delete_table(table, not_found_ok=True)

    request.addfinalizer(teardown)
    return table
def _run_project_sql(bq_client, project, dataset, submission_date, slug): normalized_slug = _bq_normalize_name(slug) date_partition = str(submission_date).replace("-", "") query_config = bigquery.QueryJobConfig( query_parameters=[ bigquery.ScalarQueryParameter("submission_date", "DATE", str(submission_date)), ], use_legacy_sql=False, clustering_fields=["build_id"], default_dataset=f"{project}.{dataset}", time_partitioning=bigquery.TimePartitioning(field="submission_date"), use_query_cache=True, allow_large_results=True, ) for data_type in DATA_TYPES: destination_table = ( f"{project}.{dataset}.{normalized_slug}_{data_type}${date_partition}" ) query_config.destination = destination_table query_config.write_disposition = "WRITE_TRUNCATE" _run_sql_for_data_type(bq_client, project, dataset, normalized_slug, query_config, data_type)
def bq_create_table(self, table_ref, columns=[], data_types=[], mode=[],
                    partition=None):
    if len(columns) == 0 or len(data_types) == 0:
        print("columns and data_types must not be empty")
        return
    if len(columns) != len(data_types):
        print("columns and data_types must have the same length")
        return

    bigquery_client = bigquery.Client()

    # Prepares a reference to the table
    try:
        bigquery_client.get_table(table_ref)
    except Exception as e:
        schema = []
        for i in range(0, len(data_types)):
            schema.append(
                bigquery.SchemaField(columns[i], data_types[i], mode[i]))
        table = bigquery.Table(table_ref, schema=schema)
        if partition is not None:
            table.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                field="process_time",
            )
        table = bigquery_client.create_table(table)
        return_name = "Created table {}".format(table.table_id)
        if partition is not None:
            return_name = "{}, partitioned on column {}".format(
                return_name, table.time_partitioning)
        print(return_name)
def create_table(self):
    """
    Method to create a table in the dataset with schema and data from a csv.
    """
    # creating a reference for the dataset in which the table is present
    dataset_ref = self.client.dataset('dataset_airflow_etl')
    # creating a reference to the table
    table_ref = dataset_ref.table('airflow_table_assignment_divya')
    try:
        # fetching details of the table from the reference
        self.client.get_table(table_ref)
    except NotFound:
        schema = [
            bigquery.SchemaField("state", "STRING", mode="REQUIRED"),
            bigquery.SchemaField("count", "INTEGER", mode="REQUIRED"),
            bigquery.SchemaField("date", "DATE", mode="REQUIRED"),
        ]
        table = bigquery.Table(table_ref, schema=schema)
        table.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field="date",  # name of column to use for partitioning
            expiration_ms=5184000000,  # 60 days
        )
        print("Created table {}, partitioned on column {}".format(
            table.table_id, table.time_partitioning.field))
        table = self.client.create_table(table)
def run_query(self, date, qstring=None):
    if qstring is None:
        qstring = read_string("sql/{}.sql".format(self.config["query"]))
    qparams = self.get_query_params(date)
    qstring = qstring.format(**qparams)

    table_ref = self.client.dataset(
        self.config["params"]["dataset"]).table(self.config["params"]["dest"])

    job_config = bigquery.QueryJobConfig()
    job_config.write_disposition = (
        bigquery.WriteDisposition.WRITE_APPEND
        if self.is_write_append()
        else bigquery.WriteDisposition.WRITE_TRUNCATE)
    if ("allow_field_addition" in self.config
            and self.config["allow_field_addition"]):
        job_config.schema_update_options = [
            bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
        ]
    job_config.destination = table_ref
    if "partition_field" in self.config:
        job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field=self.config["partition_field"],
        )

    query = self.client.query(qstring, job_config=job_config)
    query.result()
    if "create_view_alt" in self.config and self.config["create_view_alt"]:
        self.create_view("_view")
def create_groups_table(apps, schema_editor):
    dataset_ref = client.dataset(settings.BIGQUERY_DATASET_ID)
    # Only define fields that are required and not strings (because
    # non-required strings will be populated automatically)
    schema = [
        bigquery.SchemaField('date', 'DATE', mode='REQUIRED'),
        bigquery.SchemaField('id', 'INT64', mode='REQUIRED'),
        bigquery.SchemaField('urlname', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('created', 'TIMESTAMP'),
        bigquery.SchemaField('lat', 'FLOAT64'),
        bigquery.SchemaField('lon', 'FLOAT64'),
        bigquery.SchemaField('members', 'INT64'),
        bigquery.SchemaField('organizer_id', 'INT64'),
        bigquery.SchemaField('next_event_yes_rsvp_count', 'INT64'),
        bigquery.SchemaField('next_event_time', 'TIMESTAMP'),
        bigquery.SchemaField('category_id', 'INT64'),
        bigquery.SchemaField('meta_category_id', 'INT64'),
    ]
    table_ref = dataset_ref.table('groups')
    table = bigquery.Table(table_ref, schema=schema)
    # define partitioning by date
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY, field='date')
    # define clustering by urlname
    table.clustering_fields = ['urlname']
    client.create_table(table)
def load_new_telemetry_snippet_data(dataset_id, table_name, next_load_date,
                                    end_load_date):
    '''
    Reads csv file from google cloud storage bucket and loads it into bigquery
    :param dataset_id: Name of dataset to be loaded into
    :param table_name: Name of table to be loaded into
    :param next_load_date: Earliest date to be loaded into table_name
    :param end_load_date: Latest date to be loaded into table_name
    :return:
    '''
    while next_load_date < end_load_date:
        # Set dates required for loading new data
        next_load_date = datetime.strftime(next_load_date, '%Y%m%d')
        logging.info(f'snippetTelemetryDailyRetrieve: Starting load for next load date: {next_load_date}')

        client = bigquery.Client(project='ga-mozilla-org-prod-001')
        file = f'gs://snippets-data-transfer/daily-tracking-data/snippets_{next_load_date}.csv'
        load_dataset_id = dataset_id
        load_table_name = table_name
        load_table_suffix = next_load_date
        load_table_id = f'{load_table_name.lower()}_{load_table_suffix}'

        # Configure load job
        dataset_ref = client.dataset(load_dataset_id)
        table_ref = dataset_ref.table(load_table_id)
        load_job_config = bigquery.LoadJobConfig()  # load job call
        load_job_config.schema = [
            bigquery.SchemaField('sendDate', 'DATE'),
            bigquery.SchemaField('messageID', 'STRING'),
            bigquery.SchemaField('releaseChannel', 'STRING'),
            bigquery.SchemaField('locale', 'STRING'),
            bigquery.SchemaField('countryCode', 'STRING'),
            bigquery.SchemaField('os', 'STRING'),
            bigquery.SchemaField('version', 'STRING'),
            bigquery.SchemaField('impressions', 'INTEGER'),
            bigquery.SchemaField('clicks', 'INTEGER'),
            bigquery.SchemaField('blocks', 'INTEGER')
        ]  # Define schema
        load_job_config.source_format = bigquery.SourceFormat.CSV
        load_job_config.skip_leading_rows = 1
        load_job_config.max_bad_records = 20
        load_job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field='sendDate',
        )
        # Options are WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY
        load_job_config.write_disposition = 'WRITE_APPEND'

        load_job = client.load_table_from_uri(
            file,
            table_ref,
            location='US',
            job_config=load_job_config)
        assert load_job.job_type == 'load'
        load_job.result()  # Waits for the query to finish
        logging.info(f'snippetTelemetryDailyRetrieve: File {file} loaded to table {table_ref.path}')

        # Set next_load_date
        next_load_date = datetime.strptime(next_load_date, '%Y%m%d') + timedelta(1)
    return
def create_table(client, table):
    schema = client.schema_from_json(get_schema_file_object())
    table = bigquery.table.Table(table, schema)
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY, field="created_at"
    )
    return client.create_table(table)
def get_table_ref(self, name, project, dataset, partition=None, schema=None,
                  clustering_fields=None):
    tbb = bigquery.Table("{}.{}.{}".format(project, dataset.dataset_id, name),
                         schema=schema)
    if clustering_fields is not None and schema is not None:
        clustering_fields = list(
            map(lambda x: x.split(':')[0], clustering_fields))
        tbb.clustering_fields = clustering_fields
    if partition is not None:
        if partition['type'] == 'time':
            tbb.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                require_partition_filter=False)
        elif partition['type'] == 'range':
            tbb.range_partitioning = bigquery.RangePartitioning(
                bigquery.PartitionRange(partition['start'], partition['end'],
                                        partition['interval']),
                partition['field'])
    return tbb
def dest_partitioned_table_allow_jagged(bq: bigquery.Client, dest_dataset,
                                        monkeypatch) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    if os.getenv('GCP_PROJECT') is None:
        monkeypatch.setenv("GCP_PROJECT", bq.project)

    extra_field_for_jagged_row_test = bigquery.schema.SchemaField(
        "extra_jagged_row_test_column", "STRING")
    schema.append(extra_field_for_jagged_row_test)

    table: bigquery.Table = bigquery.Table(
        f"{os.getenv('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-', '_')}",
        schema=schema,
    )
    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"

    table = bq.create_table(table)
    return table
def __init__(
    self, project_id, credentials, table_name, table_schema, csv_path='csv'
):
    self.project_id = project_id
    self.credentials = credentials
    self.table_name = table_name
    self.table_schema = table_schema
    self.csv_path = csv_path

    # 1 year of data should always be ok
    self.default_table_expiration = 24 * 60 * 60 * 365
    self.date_col_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="date",
        expiration_ms=self.default_table_expiration * 1000,
    )
    self.uploader = BigQueryUploader(
        project_id=self.project_id,
        dataset_id=os.getenv("BIGQUERY_DATASET"),
        tmp_folder=self.csv_path,
        credentials=self.credentials
    )
def client_load_partitioned_table(client, table_id):
    # [START bigquery_load_table_partitioned]
    from google.cloud import bigquery

    # TODO(developer): Construct a BigQuery client object.
    # client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name"

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("post_abbr", "STRING"),
            bigquery.SchemaField("date", "DATE"),
        ],
        skip_leading_rows=1,
        time_partitioning=bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field="date",  # Name of the column to use for partitioning.
            expiration_ms=7776000000,  # 90 days.
        ),
    )
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states-by-date.csv"

    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Wait for the job to complete.

    table = client.get_table(table_id)
    print("Loaded {} rows to table {}".format(table.num_rows, table_id))
def load_csv_to_bq(project_name, dataset_name, table_name, uri):
    client = bigquery.Client()
    table = client.get_table(project_name + '.' + dataset_name + '.' + table_name)

    if 'detailed' in table_name:
        job_config = bigquery.LoadJobConfig(
            schema=[bigquery.SchemaField("brand", "STRING"),
                    bigquery.SchemaField("headline", "STRING"),
                    bigquery.SchemaField("review_date", "DATE"),
                    bigquery.SchemaField("review_content", "STRING"),
                    bigquery.SchemaField("review_votes", "INT64"),
                    bigquery.SchemaField("review_stars", "INT64"),
                    bigquery.SchemaField("ingestion_date", "DATETIME")],
            skip_leading_rows=1,
            field_delimiter='|',
            time_partitioning=bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                field='ingestion_date'))
    else:
        job_config = bigquery.LoadJobConfig(
            schema=table.schema,
            skip_leading_rows=1,
            field_delimiter='|')

    load_job = client.load_table_from_uri(uri, table, job_config=job_config)
    try:
        load_job.result()
        print("Loaded {} rows to {}.{}".format(load_job.output_rows,
                                               dataset_name, table_name))
    except BadRequest:
        for error in load_job.errors:
            print('ERROR: {}'.format(error['message']))
def execute_transformation_query(bq_client):
    """Executes transformation query to a new destination table.

    Args:
        bq_client: Object representing a reference to a BigQuery Client
    """
    dataset_ref = bq_client.get_dataset(bigquery.DatasetReference(
        project=config.billing_project_id,
        dataset_id=config.output_dataset_id))
    table_ref = dataset_ref.table(config.output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.time_partitioning = bigquery.TimePartitioning(
        field='usage_start_time',
        expiration_ms=None)

    sql = Template(file_to_string(config.sql_file_path))
    sql = sql.safe_substitute(
        billing_table=config.billing_project_id + '.' +
        config.billing_dataset_id + '.' + config.billing_table_name,
        allocation_method=config.allocation_method)

    logging.info('Attempting query on all dates...')
    # Execute Query
    query_job = bq_client.query(sql, job_config=job_config)
    query_job.result()  # Waits for the query to finish
    logging.info('Transformation query complete. All partitions are updated.')
def create_functional_alltypes_parted_table(bqclient,
                                            functional_alltypes_parted_table):
    table = bigquery.Table(functional_alltypes_parted_table)
    table.schema = [
        bigquery.SchemaField("index", "INTEGER"),
        bigquery.SchemaField("Unnamed_0", "INTEGER"),
        bigquery.SchemaField("id", "INTEGER"),
        bigquery.SchemaField("bool_col", "BOOLEAN"),
        bigquery.SchemaField("tinyint_col", "INTEGER"),
        bigquery.SchemaField("smallint_col", "INTEGER"),
        bigquery.SchemaField("int_col", "INTEGER"),
        bigquery.SchemaField("bigint_col", "INTEGER"),
        bigquery.SchemaField("float_col", "FLOAT"),
        bigquery.SchemaField("double_col", "FLOAT"),
        bigquery.SchemaField("date_string_col", "STRING"),
        bigquery.SchemaField("string_col", "STRING"),
        bigquery.SchemaField("timestamp_col", "TIMESTAMP"),
        bigquery.SchemaField("year", "INTEGER"),
        bigquery.SchemaField("month", "INTEGER"),
    ]
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY)
    table.require_partition_filter = False
    bqclient.create_table(table, exists_ok=True)
    return table
def write_to_bq(self, date, messages):
    bq_client = bigquery.Client(project=self.project)
    table_name = f"messages_v{self.version}"
    if self.table_prefix is not None:
        table_name = f"{self.table_prefix}_{table_name}"

    load_config = bigquery.LoadJobConfig(
        time_partitioning=bigquery.TimePartitioning(
            field="load_date",
            require_partition_filter=True,
        ),
        create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        schema_update_options=bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
    )
    load_job = bq_client.load_table_from_json(
        messages,
        destination=f"{self.bq_dataset}.{table_name}${date.replace('-', '')}",
        job_config=load_config,
    )
    load_job.result()
def create_empty_table(self, project_id, dataset_id, table_id, schema,
                       partition_field=None, expiration=None):
    """Create table function.

    Create a table in the project_id/dataset at BigQuery.
    If the table already exists it will be overwritten.

    Args:
        project_id (str): BigQuery project id.
        dataset_id (str): Dataset in which to create the table.
        table_id (str): Name of the table to create.
        schema (list or str): Schema of the table to create.
        partition_field (str): Optional column to partition the table on.
        expiration (int): Optional partition expiration in days.

    Returns:
        bool: True for success, Raises an error otherwise.
    """
    table_ref = self.bigquery_client.dataset(dataset_id, project_id).table(table_id)
    if isinstance(schema, list):
        schema = self._get_schema_from_json(schema)
    elif isinstance(schema, str):
        schema = self._get_schema_from_str(schema)

    table = bigquery.Table(table_ref, schema=schema)
    if partition_field:
        expiration_ms = expiration * 24 * 60 * 60 * 1000 if expiration else None
        partitioning = bigquery.TimePartitioning(
            field=partition_field, expiration_ms=expiration_ms)
        table.time_partitioning = partitioning
    table_ref = self.bigquery_client.create_table(table)
def execute_query(bq_client: bigquery.Client, env_vars: dict,
                  query_path: object, output_table_name: str,
                  time_partition: bool) -> None:
    """Executes transformation query to a new destination table.

    Args:
        bq_client: bigquery.Client object
        env_vars: Dictionary of key: value, where value is environment variable
        query_path: Object representing location of SQL query to execute
        output_table_name: String representing name of table that holds output
        time_partition: Boolean indicating whether to time-partition output
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(project=bq_client.project,
                                  dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Time Partitioning table is only needed for final output query
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            field='usage_start_time', expiration_ms=None)

    logging.info('Attempting query...')
    # Execute Query
    query_job = bq_client.query(query=render_template(query_path, env_vars),
                                job_config=job_config)
    query_job.result()  # Waits for the query to finish
def tables(bq, dataset, generated_test):
    # load tables into dataset
    for table in generated_test.tables.values():
        destination = f"{dataset.dataset_id}.{table.name}"
        job_config = bigquery.LoadJobConfig(
            default_dataset=dataset,
            source_format=table.source_format,
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        )

        if table.schema is None:
            # autodetect schema if not provided
            job_config.autodetect = True
        else:
            job_config.schema = table.schema
            # look for time_partitioning_field in provided schema
            for field in job_config.schema:
                if field.description == "time_partitioning_field":
                    job_config.time_partitioning = bigquery.TimePartitioning(
                        field=field.name
                    )
                    break  # stop because there can only be one time partitioning field

        with open(table.source_path, "rb") as file_obj:
            job = bq.load_table_from_file(file_obj, destination,
                                          job_config=job_config)

        try:
            job.result()
        except BadRequest:
            print(job.errors)
            raise
def exist_dataset_table(client, table_id, dataset_id, project_id, schema,
                        clustering_fields=None):
    try:
        dataset_ref = "{}.{}".format(project_id, dataset_id)
        client.get_dataset(dataset_ref)  # Make an API request.
    except NotFound:
        dataset_ref = "{}.{}".format(project_id, dataset_id)
        dataset = bigquery.Dataset(dataset_ref)
        dataset.location = "US"
        dataset = client.create_dataset(dataset)  # Make an API request.
        logger.info("Created dataset {}.{}".format(client.project,
                                                   dataset.dataset_id))

    try:
        table_ref = "{}.{}.{}".format(project_id, dataset_id, table_id)
        client.get_table(table_ref)  # Make an API request.
    except NotFound:
        table_ref = "{}.{}.{}".format(project_id, dataset_id, table_id)
        table = bigquery.Table(table_ref, schema=schema)
        table.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field="date"
        )
        if clustering_fields is not None:
            table.clustering_fields = clustering_fields
        table = client.create_table(table)  # Make an API request.
        logger.info("Created table {}.{}.{}".format(table.project,
                                                    table.dataset_id,
                                                    table.table_id))
    return 'ok'
def test_create_partitioned_table(client, to_delete):
    dataset_id = "create_table_partitioned_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = client.create_dataset(dataset_ref)
    to_delete.append(dataset)

    # [START bigquery_create_table_partitioned]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # project = client.project
    # dataset_ref = bigquery.DatasetReference(project, 'my_dataset')

    table_ref = dataset_ref.table("my_partitioned_table")
    schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
        bigquery.SchemaField("date", "DATE"),
    ]
    table = bigquery.Table(table_ref, schema=schema)
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="date",  # name of column to use for partitioning
        expiration_ms=7776000000,  # 90 days
    )

    table = client.create_table(table)

    print("Created table {}, partitioned on column {}".format(
        table.table_id, table.time_partitioning.field))
    # [END bigquery_create_table_partitioned]

    assert table.time_partitioning.type_ == "DAY"
    assert table.time_partitioning.field == "date"
    assert table.time_partitioning.expiration_ms == 7776000000
def run_query(self, date, autodetect=False):
    dataset_ref = self.client.dataset(self.config["params"]["dataset"])
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = (
        bigquery.WriteDisposition.WRITE_APPEND
        if self.is_write_append()
        else bigquery.WriteDisposition.WRITE_TRUNCATE
    )
    # don't do autodetect after schema created, may have errors on STRING/INTEGER
    job_config.autodetect = autodetect
    job_config.source_format = FILETYPES[self.config["filetype"]]
    if "partition_field" in self.config:
        job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field=self.config["partition_field"],
        )
    uri = "gs://%s" % self.config["params"]["src"].format(start_date=date)

    load_job = self.client.load_table_from_uri(
        uri,
        dataset_ref.table(self.config["params"]["dest"]),
        location=self.config["params"]["location"],
        job_config=job_config,
    )
    log.info("Starting job {}".format(load_job.job_id))

    load_job.result()  # Waits for table load to complete.
    log.info("Job finished.")

    destination_table = self.client.get_table(
        dataset_ref.table(self.config["params"]["dest"])
    )
    log.info("Loaded {} rows.".format(destination_table.num_rows))
def stripe_import(
    api_key: Optional[str],
    date: Optional[datetime],
    before_date: Optional[datetime],
    table: Optional[str],
    format_resources: bool,
    resource: Type[ListableAPIResource],
):
    """Import Stripe data into BigQuery."""
    if resource is stripe.Event and not date:
        click.echo("must specify --date for --resource=Event")
        sys.exit(1)
    if table and date:
        table = f"{table}${date:%Y%m%d}"
    with (TemporaryFile(mode="w+b") if table else sys.stdout.buffer) as file_obj:
        filtered_schema = FilteredSchema(resource)
        has_rows = False
        for row in _get_rows(api_key, date, before_date, resource):
            if format_resources or (table and api_key):
                row = filtered_schema.format_row(row)
            file_obj.write(ujson.dumps(row).encode("UTF-8"))
            file_obj.write(b"\n")
            has_rows = True
        if not has_rows:
            click.echo(f"no {filtered_schema.type}s returned")
            sys.exit(1)
        elif table:
            if file_obj.writable():
                file_obj.seek(0)
            warnings.filterwarnings("ignore", module="google.auth._default")
            job_config = bigquery.LoadJobConfig(
                clustering_fields=["created"],
                ignore_unknown_values=False,
                schema=filtered_schema.filtered,
                source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
                time_partitioning=bigquery.TimePartitioning(field="created"),
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
            )
            if "$" in table:
                job_config.schema_update_options = [
                    bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
                ]
            job = bigquery.Client().load_table_from_file(
                file_obj=file_obj,
                destination=table,
                job_config=job_config,
            )
            try:
                click.echo(f"Waiting for {job.job_id}", file=sys.stderr)
                job.result()
            except Exception as e:
                click.echo(f"{job.job_id} failed: {e}", file=sys.stderr)
                for error in job.errors or ():
                    message = error.get("message")
                    if message and message != getattr(e, "message", None):
                        click.echo(message, file=sys.stderr)
                sys.exit(1)
            else:
                click.echo(f"{job.job_id} succeeded", file=sys.stderr)
def set_defaults_save_job_config(job_config):
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    job_config.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        # If ``field`` is not set, the table is partitioned by the pseudo column ``_PARTITIONTIME``.
        field=None)
    return job_config
def create_timestamp_table(bqclient, timestamp_table):
    table = bigquery.Table(timestamp_table)
    table.schema = [
        bigquery.SchemaField("my_timestamp_parted_col", "DATE"),
        bigquery.SchemaField("string_col", "STRING"),
        bigquery.SchemaField("int_col", "INTEGER"),
    ]
    table.time_partitioning = bigquery.TimePartitioning(
        field="my_timestamp_parted_col")
    bqclient.create_table(table, exists_ok=True)
def setting_options(load_job_config, schema, config, kind_name, update_date):
    """
    Uses config.json and schema.json fetched from GCS to configure the
    BigQuery load job.

    Args:
        load_job_config (google.cloud.bigquery.job.LoadJobConfig): Configuration used for the load into BigQuery
        config (dict): BigQuery load settings read from GCS
        schema (dict): BigQuery schema read from GCS
        kind_name (STRING): Kind of data
        update_date (STRING): Date the file was delivered
    Returns:
        job_config (google.cloud.bigquery.job.LoadJobConfig): Configuration used for the load into BigQuery
        dataset_id (STRING): BigQuery dataset ID
        table_id (STRING): BigQuery table ID
    """
    new_schema = list()

    # Import the schema file
    for i in range(len(schema)):
        try:
            new_schema.append(
                bigquery.SchemaField(schema[i]["name"], schema[i]["type"],
                                     schema[i]["mode"]))
        except:
            new_schema.append(
                bigquery.SchemaField(schema[i]["name"], schema[i]["type"]))

    # Assign the schema to the load job config
    load_job_config.schema = new_schema

    # Import the settings from config.json
    load_job_config.source_format = config["sourceFormat"]
    load_job_config.autodetect = config["autodetect"]
    load_job_config.create_disposition = config["createDisposition"]
    load_job_config.write_disposition = config["writeDisposition"]
    load_job_config.skip_leading_rows = config["skipLeadingRows"]
    load_job_config.null_marker = config["nullMarker"]
    load_job_config.encoding = config["encoding"]
    load_job_config.max_bad_records = config["maxBadRecords"]
    load_job_config.allow_quoted_newlines = config["allowQuotedNewlines"]
    load_job_config.allow_jagged_rows = config["allowJaggedRows"]
    load_job_config.quote_character = config["quote"]
    load_job_config.ignore_unknown_values = config["ignoreUnknownValues"]

    # Get the BigQuery dataset and table IDs
    dataset_id = config["destinationTable"]["datasetId"]
    table_id = config["destinationTable"]["tableId"]

    # For "transaction" data a partition is created, so configure it here.
    if (kind_name == "transaction"):
        table_id = "{}${}".format(table_id, update_date)
        load_job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY)

    return load_job_config, dataset_id, table_id
def init_big_query_uploader(project_id, dataset_id):
    uploader = BigQueryUploader(project_id, dataset_id)
    bigquery_ready = uploader.is_dataset_ready()
    if not bigquery_ready:
        print("Unable to connect to dataset {}, project {}, quitting".format(
            dataset_id, project_id))
        return None

    # Create tables if not exist
    date_col_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="date",
        # expiration_ms=BQ_STORAGE_DATA_EXPIRATION_MS,
    )
    tables_and_schemas = {
        "browsers": bq_schema.browsers(),
        "browser_users": bq_schema.browser_users(),
        "aggregated_browser_days": bq_schema.aggregated_browser_days(),
        "aggregated_browser_days_tags": bq_schema.aggregated_browser_days_tags(),
        "aggregated_browser_days_categories": bq_schema.aggregated_browser_days_categories(),
        "aggregated_browser_days_referer_mediums": bq_schema.aggregated_browser_days_referer_mediums(),
        "aggregated_user_days": bq_schema.aggregated_user_days(),
        "aggregated_user_days_tags": bq_schema.aggregated_user_days_tags(),
        "aggregated_user_days_categories": bq_schema.aggregated_user_days_categories(),
        "aggregated_user_days_referer_mediums": bq_schema.aggregated_user_days_referer_mediums(),
    }

    # tables partitioned by 'date' column
    for table_name, table_schema in tables_and_schemas.items():
        if not uploader.table_exists(table_name):
            uploader.create_table(table_name, table_schema, date_col_partitioning)

    # tables partitioned by 'time' column
    time_col_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="time",
        # expiration_ms=BQ_STORAGE_DATA_EXPIRATION_MS,
    )
    if not uploader.table_exists('events'):
        uploader.create_table('events', bq_schema.events(), time_col_partitioning)

    return uploader
def _run_project_sql(bq_client, project, dataset, submission_date, slug): normalized_slug = _bq_normalize_name(slug) date_partition = str(submission_date).replace("-", "") destination_table = f"{project}.{dataset}.{normalized_slug}${date_partition}" print("destination_table", destination_table) init_sql_path = Path( os.path.join(OUTPUT_DIR, dataset, normalized_slug, INIT_FILENAME)) query_sql_path = Path( os.path.join(OUTPUT_DIR, dataset, normalized_slug, QUERY_FILENAME)) view_sql_path = Path( os.path.join(OUTPUT_DIR, dataset, normalized_slug, VIEW_FILENAME)) query_config = bigquery.QueryJobConfig( query_parameters=[ bigquery.ScalarQueryParameter("submission_date", "DATE", str(submission_date)), ], use_legacy_sql=False, clustering_fields=["build_id"], destination=destination_table, default_dataset=f"{project}.{dataset}", time_partitioning=bigquery.TimePartitioning(field="submission_date"), write_disposition="WRITE_TRUNCATE", use_query_cache=True, allow_large_results=True, ) init_query_text = init_sql_path.read_text() query_text = query_sql_path.read_text() view_text = view_sql_path.read_text() # Wait for init to complete before running queries init_query_job = bq_client.query(init_query_text) view_query_job = bq_client.query(view_text) results = init_query_job.result() query_job = bq_client.query(query_text, job_config=query_config) # Periodically print so airflow gke operator doesn't think task is dead elapsed = 0 while not query_job.done(): time.sleep(10) elapsed += 10 if elapsed % 200 == 10: print("Waiting on query...") print(f"Total elapsed: approximately {elapsed} seconds") results = query_job.result() print(f"Query job {query_job.job_id} finished") print(f"{results.total_rows} rows in {destination_table}") # Add a view once the derived table is generated. view_query_job.result()