def update_or_create_table_from_csv(
    client: bigquery.Client,
    table_name: str,
    source_file: str,
    dataset: str,
    source_schema_file: str,
):
    LOGGER.debug("update_or_create_table_from_csv: %s=%s", table_name, [source_file])
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(table_name)
    job_config = LoadJobConfig()
    job_config.source_format = "CSV"
    job_config.skip_leading_rows = 1
    if Path(source_schema_file).exists():
        job_config.schema = get_table_schema(source_schema_file)
    else:
        job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    with open(source_file, "rb") as source_fp:
        load_job = client.load_table_from_file(
            source_fp, destination=table_ref, job_config=job_config
        )
    # wait for job to complete
    load_job.result()
    LOGGER.info("updated config table: %s", table_ref.table_id)
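
# Hedged usage sketch, not from the source: assumes the module-level names this
# function relies on (LOGGER, Path, LoadJobConfig, get_table_schema) are defined
# elsewhere, and uses application-default credentials; the dataset, table, and
# file names below are hypothetical.
def _example_update_or_create_table_from_csv():
    from google.cloud import bigquery

    client = bigquery.Client()
    update_or_create_table_from_csv(
        client,
        table_name="config",
        source_file="config.csv",
        dataset="my_dataset",
        source_schema_file="config_schema.json",
    )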
def load_from_newline_delimited_json(
    client: bigquery.Client,
    filepath: pathlib.Path,
    project_id: str,
    dataset_id: str,
    table_id: str,
):
    full_table_id = f"{project_id}.{dataset_id}.{table_id}"
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.schema = [
        bigquery.SchemaField("id", enums.SqlTypeNames.STRING),
        bigquery.SchemaField("user_id", enums.SqlTypeNames.INTEGER),
        bigquery.SchemaField("login_time", enums.SqlTypeNames.TIMESTAMP),
        bigquery.SchemaField("logout_time", enums.SqlTypeNames.TIMESTAMP),
        bigquery.SchemaField("ip_address", enums.SqlTypeNames.STRING),
    ]
    with open(filepath, "rb") as json_file:
        load_job = client.load_table_from_file(
            json_file, full_table_id, job_config=job_config
        )

    # Wait for load job to finish.
    load_job.result()
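
# Hedged usage sketch, not from the source: writes one record shaped like the
# explicit schema above to a temporary JSONL file and loads it; the project,
# dataset, table, and record values are made up.
def _example_load_from_newline_delimited_json():
    import json
    import pathlib
    import tempfile

    from google.cloud import bigquery

    record = {
        "id": "session-1",
        "user_id": 42,
        "login_time": "2024-01-01T08:00:00Z",
        "logout_time": "2024-01-01T09:30:00Z",
        "ip_address": "203.0.113.7",
    }
    with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as tmp:
        tmp.write(json.dumps(record) + "\n")

    load_from_newline_delimited_json(
        bigquery.Client(),
        pathlib.Path(tmp.name),
        project_id="my-project",
        dataset_id="my_dataset",
        table_id="sessions",
    )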
def load_local_file(client: bigquery.Client,
                    filename: str,
                    table_ref: bigquery.table.TableReference,
                    job_config: bigquery.LoadJobConfig,
                    # job_id: str = str(datetime.datetime.now()).replace(' ', ''),
                    ):
    """Load a local file into a BigQuery table and wait for the job to finish.

    Args:
        client: Authenticated BigQuery client used to submit the load job.
        filename: Path to the local file to upload.
        table_ref: Reference to the destination table.
        job_config: Load job configuration (source format, schema, etc.).

    Returns:
        None. Raises if the load job fails; asserts the job reached DONE.

    Examples:
        See the usage sketch below.
    """
    with open(filename, 'rb') as source_file:
        load_job = client.load_table_from_file(
            file_obj=source_file,
            destination=table_ref,
            # job_id=job_id,
            job_id_prefix='llf-',
            job_config=job_config)  # API request

    tic = time.time()
    print('Starting job {} at {}'.format(load_job.job_id, tic))
    load_job.result()
    print('Job took {} seconds'.format(time.time() - tic))
    assert load_job.state == 'DONE'
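
# Hedged usage sketch, not from the source: the caller builds the table
# reference and job configuration; the project, dataset, table, and file names
# here are hypothetical.
def _example_load_local_file():
    from google.cloud import bigquery

    client = bigquery.Client()
    table_ref = bigquery.TableReference.from_string('my-project.my_dataset.my_table')
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True,
    )
    load_local_file(client, 'data.csv', table_ref, job_config)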
def load_tables(bq: bigquery.Client, dataset: bigquery.Dataset, tables: Iterable[Table]):
    """Load tables for a test."""
    for table in tables:
        destination = dataset.table(table.name)
        job_config = bigquery.LoadJobConfig(
            source_format=table.source_format,
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        )

        if table.schema is None:
            # autodetect schema if not provided
            job_config.autodetect = True
        else:
            job_config.schema = table.schema
            # look for time_partitioning_field in provided schema
            for field in job_config.schema:
                if field.description == "time_partitioning_field":
                    job_config.time_partitioning = bigquery.TimePartitioning(
                        field=field.name
                    )
                    break  # stop because there can only be one time partitioning field

        if isinstance(table.source_path, str):
            with open(table.source_path, "rb") as file_obj:
                job = bq.load_table_from_file(file_obj, destination, job_config=job_config)
        else:
            file_obj = BytesIO()
            for row in load(*table.source_path):
                file_obj.write(json.dumps(row, default=default_encoding).encode() + b"\n")
            file_obj.seek(0)
            job = bq.load_table_from_file(file_obj, destination, job_config=job_config)

        try:
            job.result()
        except BadRequest:
            # print the first 5 rows for debugging errors
            for row in job.errors[:5]:
                print(row)
            raise
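
# Hedged usage sketch, not from the source: the real Table type is defined
# elsewhere in the test suite; FakeTable below is a stand-in with only the
# attributes load_tables() reads (name, source_format, schema, source_path),
# and the dataset and file path are hypothetical.
def _example_load_tables():
    from collections import namedtuple

    from google.cloud import bigquery

    FakeTable = namedtuple("FakeTable", "name source_format schema source_path")

    bq = bigquery.Client()
    dataset = bq.get_dataset("my_dataset")
    load_tables(
        bq,
        dataset,
        [
            FakeTable(
                name="events",
                source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
                schema=None,  # None triggers schema autodetection
                source_path="tests/data/events.ndjson",
            )
        ],
    )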
def scalars_extreme_table(bigquery_client: bigquery.Client, project_id: str, dataset_id: str):
    schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json")
    job_config = bigquery.LoadJobConfig()
    job_config.schema = schema
    job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON
    full_table_id = f"{project_id}.{dataset_id}.scalars_extreme"
    with open(DATA_DIR / "scalars_extreme.jsonl", "rb") as data_file:
        job = bigquery_client.load_table_from_file(data_file, full_table_id, job_config=job_config)
    job.result()
    yield full_table_id
    bigquery_client.delete_table(full_table_id)
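
# Hedged sketch, not from the source: this generator is shaped like a pytest
# fixture (setup, yield the table id, teardown). Assuming it is registered as a
# fixture in the test suite along with bigquery_client, a test could consume it
# like this; the assertion is a placeholder for illustration only.
def test_scalars_extreme_table_exists(bigquery_client, scalars_extreme_table):
    table = bigquery_client.get_table(scalars_extreme_table)
    assert table.num_rows >= 0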
def write_rows(client: bigquery.Client, table: str, rows: List[TableRow]) -> None:
    """Write the test results to the specified BigQuery table."""
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.job.WriteDisposition.WRITE_APPEND,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )
    data_str = "\n".join(json.dumps(asdict(row), ensure_ascii=False) for row in rows)
    data_file = io.BytesIO(data_str.encode(encoding="utf-8"))
    load_job = client.load_table_from_file(
        data_file,
        destination=table,
        job_config=job_config,
    )
    # Wait for load job to complete; raises an exception if the job failed.
    load_job.result()
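
# Hedged usage sketch, not from the source: TableRow is defined elsewhere; the
# stand-in dataclass below only illustrates that asdict() expects dataclass
# rows. The destination table id is hypothetical and, since the job appends
# without a schema, the table is assumed to already exist.
def _example_write_rows():
    from dataclasses import dataclass

    from google.cloud import bigquery

    @dataclass
    class FakeRow:
        name: str
        passed: bool

    client = bigquery.Client()
    write_rows(client, "my-project.my_dataset.results", [FakeRow("test_load", True)])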
def load_sample_data(
    full_table_id: str,
    bigquery_client: bigquery.Client,
    bigquery_schema: List[bigquery.SchemaField],
    filename: str = "sample.json",
):
    sample_config = bigquery.LoadJobConfig()
    sample_config.destination_table_description = (
        "A sample table containing most data types."
    )
    sample_config.schema = bigquery_schema
    sample_config.time_partitioning = bigquery.TimePartitioning(field="timestamp")
    sample_config.clustering_fields = ["integer", "string"]
    sample_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    sample_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    with open(DATA_DIR / filename, "rb") as data_file:
        return bigquery_client.load_table_from_file(
            data_file,
            full_table_id,
            job_config=sample_config,
        )
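
# Hedged usage sketch, not from the source: load_sample_data() returns the load
# job without waiting, so the caller blocks on result(). The table id is
# hypothetical, and the schema below is a minimal stand-in that includes the
# partitioning field ("timestamp") and clustering fields ("integer", "string")
# the job config requires.
def _example_load_sample_data():
    from google.cloud import bigquery

    client = bigquery.Client()
    schema = [
        bigquery.SchemaField("timestamp", "TIMESTAMP"),
        bigquery.SchemaField("integer", "INTEGER"),
        bigquery.SchemaField("string", "STRING"),
    ]
    job = load_sample_data("my-project.my_dataset.sample", client, schema)
    job.result()  # wait for the load to finish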
def upload_table_data(client: bigquery.Client, tableRef: bigquery.Table, fusionFile: str) -> bigquery.LoadJob:
    """Given the client, BigQuery table target, and data, upload the data"""
    with open(fusionFile, mode='rb') as file:
        job = client.load_table_from_file(file, tableRef)
        return job
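
# Hedged usage sketch, not from the source: upload_table_data() uses the default
# load job configuration (CSV source, append) and returns without waiting, so
# the destination table is assumed to exist already and the caller blocks on
# result(); the table and file names are hypothetical.
def _example_upload_table_data():
    from google.cloud import bigquery

    client = bigquery.Client()
    table = client.get_table('my-project.my_dataset.fusion')
    job = upload_table_data(client, table, 'fusion_export.csv')
    job.result()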