def write_sentences_to_bq(self):
    dataset_ref = self.client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    uri = "gs://dataproc-7e10897a-5391-4ea0-b815-f6e72cf284f7-asia-east1/data/contents/sentences/part-00000-a04d02d9-794e-4546-a9af-38dbb086452f-c000.json"
    try:
        load_job = self.client.load_table_from_uri(
            uri, dataset_ref.table("sentences"),
            job_config=job_config)  # API request
    except Exception as e:
        print(e)
    else:
        # result() belongs in an else branch: in the original finally block it
        # raised NameError whenever load_table_from_uri itself failed.
        print(load_job.result())  # Waits for table load to complete.
        print("Job finished.")
def write_contents_to_bq(self):
    dataset_ref = self.client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    uri = "gs://dataproc-7e10897a-5391-4ea0-b815-f6e72cf284f7-asia-east1/data/contents/data_modified.json/part-00000-3873cd9e-dd6c-4e9f-a94c-6362a265d946-c000.json"
    try:
        load_job = self.client.load_table_from_uri(
            uri, dataset_ref.table("contents"),
            job_config=job_config)  # API request
    except Exception as e:
        print(e)
    else:
        # Same fix as above: calling result() in a finally block raised
        # NameError when the load job was never created.
        print(load_job.result())  # Waits for table load to complete.
        print("Job finished.")
def truncate(self):
    """Truncate the table."""
    # BigQuery does not support truncate natively, so we will "load" an empty
    # dataset with a write disposition of "truncate".
    table_ref = get_table_ref(self.db.client, self.table)
    bq_table = self.db.client.get_table(table_ref)
    # BigQuery wants the schema when we load the data, so we will grab it
    # from the table.
    job_config = bigquery.LoadJobConfig()
    job_config.schema = bq_table.schema
    empty_table = Table([])
    self.db.copy(empty_table, self.table, if_exists='truncate',
                 job_config=job_config)
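# `get_table_ref` above is called but not defined in this snippet. A minimal
# sketch of a compatible helper, assuming `table` is a "dataset.table" string;
# the name comes from the call site, the body is an assumption, not the
# original implementation:
def get_table_ref(client, table):
    # Split a "dataset.table" identifier and build a TableReference.
    dataset_id, table_id = table.split(".")
    return client.dataset(dataset_id).table(table_id)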
def load_to_bigquery(filename, dataset_id, table_id):
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    with open(filename, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_ref,
                                          job_config=job_config)
    job.result()  # Waits for table load to complete.
    print("Loaded {} rows into {}:{}.".format(job.output_rows, dataset_id,
                                              table_id))
def save_run_results_in_bq(project_id, dbt_project_name, run_results_path):
    """Load a run_results JSON file into BigQuery.

    As a first step, check whether the table already exists in the schema
    and create it if not. To fit the BQ schema, the field metadata.env
    (a JSON object) must be serialised, and results.message must be converted
    to a string, because depending on the task it can be an integer or a
    string.
    """
    table_id = f"{project_id}.{dbt_project_name}.{DBT_RUN_RESULTS_TABLE}"
    client = bigquery.Client(project=project_id)
    check_run_results_table(client, table_id)
    run_results = {}
    if run_results_path.startswith("gs://"):
        storage_client = storage.Client()
        bucket, path = _parse_gcs_url(run_results_path)
        bucket = storage_client.get_bucket(bucket)
        blob = bucket.blob(path)
        run_results = json.loads(blob.download_as_string())
    else:
        with open(run_results_path) as run_results_file:
            run_results = json.load(run_results_file)
    if run_results["metadata"]["env"]:
        run_results["metadata"]["env"] = json.dumps(
            run_results["metadata"]["env"])
    else:
        del run_results["metadata"]["env"]
    for item in run_results["results"]:
        item["message"] = str(item["message"])
    data_as_file = io.StringIO(json.dumps(run_results))
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)
    job = client.load_table_from_file(data_as_file, table_id,
                                      job_config=job_config)
    try:
        result = job.result()  # Waits for table load to complete.
        logging.info("Pushed {} rows into run_results table".format(
            result.output_rows))
    except Exception:
        logging.info(f"Error loading run_results to BigQuery: {job.errors}")
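# `_parse_gcs_url` above is used but not defined in this snippet. A minimal
# sketch of a compatible helper that splits "gs://bucket/path" into
# (bucket, path); the body is an assumption, not the original:
from urllib.parse import urlparse

def _parse_gcs_url(gcs_url):
    # "gs://bucket/path/to/blob" -> ("bucket", "path/to/blob")
    parsed = urlparse(gcs_url)
    return parsed.netloc, parsed.path.lstrip("/")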
def load_comp_boss(self, client, dataset_id, load_data):
    table_id = "{}.{}".format(dataset_id, "comp_boss")
    schema = [
        bq.SchemaField("component_id", "STRING", mode="NULLABLE"),
        bq.SchemaField("component_type_id", "STRING", mode="NULLABLE"),
        bq.SchemaField("type", "STRING", mode="NULLABLE"),
        bq.SchemaField("connection_type_id", "STRING", mode="NULLABLE"),
        bq.SchemaField("outside_shape", "STRING", mode="NULLABLE"),
        bq.SchemaField("base_type", "STRING", mode="NULLABLE"),
        bq.SchemaField("height_over_tube", "STRING", mode="NULLABLE"),
        bq.SchemaField("bolt_pattern_long", "STRING", mode="NULLABLE"),
        bq.SchemaField("bolt_pattern_wide", "STRING", mode="NULLABLE"),
        bq.SchemaField("groove", "STRING", mode="NULLABLE"),
        bq.SchemaField("base_diameter", "STRING", mode="NULLABLE"),
        bq.SchemaField("shoulder_diameter", "STRING", mode="NULLABLE"),
        bq.SchemaField("unique_feature", "STRING", mode="NULLABLE"),
        bq.SchemaField("orientation", "STRING", mode="NULLABLE"),
        bq.SchemaField("weight", "STRING", mode="NULLABLE")
    ]
    client.delete_table(table=table_id, not_found_ok=True)
    table = bq.Table(table_id, schema=schema)
    client.create_table(table, exists_ok=True)
    if load_data:
        dataset_ref = client.dataset(dataset_id)
        table_ref = dataset_ref.table(table_id)
        job_config = bq.LoadJobConfig()
        job_config.source_format = bq.SourceFormat.CSV
        job_config.skip_leading_rows = 1
        job_config.autodetect = False
        job_config.schema_update_options = []
        job_config.max_bad_records = 0
        with open(self.data_dir + "comp_boss.csv", "rb") as source_file:
            job = client.load_table_from_file(file_obj=source_file,
                                              destination=table_id,
                                              job_config=job_config)
        job.result()
        print("Loaded {} rows into {}:{}.".format(job.output_rows, dataset_id,
                                                  table_id))
def persist_JSON(json_dict, dataset_id, table_id):
    """
    Persists the provided dictionary as a SINGLE row with nested and repeated
    columns into BigQuery. If unfamiliar with nested and repeated columns,
    refer to https://cloud.google.com/bigquery/docs/nested-repeated.

    :param json_dict: a SINGLE object definition to be persisted.
    :param dataset_id: The Id of an EXISTING BigQuery dataset.
    :param table_id: The Id of the BigQuery table where the JSON is to be
        persisted. If the table doesn't exist, it will be created.
    :return: None
    """
    from google.cloud import bigquery
    import os

    _tempFile = "tmp.json"
    save_new_line_delimited_JSON([json_dict], _tempFile)
    client = bigquery.Client()
    dataset_ref = client.dataset(dataset_id)
    dataset_location = client.get_dataset(dataset_ref).location
    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.autodetect = True
    # Reuse the temp-file variable instead of a duplicated 'tmp.json' literal.
    with open(_tempFile, 'rb') as source_file:
        job = client.load_table_from_file(
            source_file,
            table_ref,
            location=dataset_location,  # Must match the destination dataset location.
            job_config=job_config)  # API request
    job.result()  # Waits for table load to complete.
    print('Loaded {} rows into {}:{}.'.format(job.output_rows, dataset_id,
                                              table_id))
    os.remove(_tempFile)  # Delete temp json file.
def load_data():
    client = bigquery.Client()
    dataset_id = 'landing'
    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    # Allows updates with any new schema/field additions. Optional extra
    # (required in this case).
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.schema_update_options = [
        bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
    ]
    job_config.schema = [
        bigquery.SchemaField("title", "STRING"),
        bigquery.SchemaField("subreddit", "STRING"),
        bigquery.SchemaField("score", "INTEGER"),
        bigquery.SchemaField("id", "STRING"),
        bigquery.SchemaField("url", "STRING"),
        bigquery.SchemaField("comms_num", "INTEGER"),
        bigquery.SchemaField("created", "DATE"),
    ]
    job_config.skip_leading_rows = 1
    # The source format defaults to CSV, so the line below is optional.
    job_config.source_format = bigquery.SourceFormat.CSV
    uri = "gs://dubai_source_api/source-api"
    # API request
    try:
        load_job = client.load_table_from_uri(uri,
                                              dataset_ref.table("dubai_posts"),
                                              job_config=job_config)
    except Exception as e:
        print('Failed to load data due to an unexpected error.')
        print('System abort!')
        print(str(e))
        sys.exit()
    else:
        print("Starting job {}".format(load_job.job_id))
        # Waits for table load to complete.
        load_job.result()
        print("Job finished.")
        destination_table = client.get_table(dataset_ref.table("dubai_posts"))
        table = "landing.dubai_posts"
        print("Loaded {} rows into {}.".format(destination_table.num_rows,
                                               table))
def bq_load_csv(url):
    import logging
    from google.cloud import bigquery

    logging.info('*** In bq_load_csv *** {}'.format(MESSAGE_FROM_PUBSUB))
    # Schema code here.
    schema_input = bq_schema()
    bigquery_client = bigquery.Client()
    dataset_ref = bigquery_client.dataset('test')
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1
    if 'ABI-L1b' in MESSAGE_FROM_PUBSUB['name']:
        table_ref = dataset_ref.table('abi_l1b_radiance')
        table = bigquery.Table(table_ref, schema=schema_input['abi_l1b_radiance'])
    elif 'ABI-L2-CMIP' in MESSAGE_FROM_PUBSUB['name']:
        table_ref = dataset_ref.table('abi_l2_cmip')
        table = bigquery.Table(table_ref, schema=schema_input['abi_l2_cmip'])
    elif 'ABI-L2-MCMIP' in MESSAGE_FROM_PUBSUB['name']:
        table_ref = dataset_ref.table('abi_l2_mcmip')
        table = bigquery.Table(table_ref, schema=schema_input['abi_l2_mcmip'])
    elif 'GLM-L2-LCFA' in MESSAGE_FROM_PUBSUB['name']:
        table_ref = dataset_ref.table('glm_l2_lcfa')
        table = bigquery.Table(table_ref, schema=schema_input['glm_l2_lcfa'])
    else:
        # Without this guard, table_ref would be unbound for unknown products.
        raise ValueError('Unknown product in message: {}'.format(
            MESSAGE_FROM_PUBSUB['name']))
    if not if_table_exists(bigquery_client, table_ref):
        table = bigquery_client.create_table(table)
    load_job = bigquery_client.load_table_from_uri(url, table_ref,
                                                   job_config=job_config)
    assert load_job.job_type == 'load'
    load_job.result()  # Waits for table load to complete.
    assert load_job.state == 'DONE'
    logging.info('**** BigQuery job completed ****')
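# `if_table_exists` above is called but not defined in this snippet. A
# minimal sketch under the assumption that it simply probes for the table;
# the name and signature come from the call site, the body is an assumption:
from google.api_core.exceptions import NotFound

def if_table_exists(client, table_ref):
    # get_table raises NotFound when the table does not exist.
    try:
        client.get_table(table_ref)
        return True
    except NotFound:
        return False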
def prep_bq_fs_and_fv(
    bq_source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = "test_ingestion"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = (
        1000 * 60 * 60 * 24 * 14)  # 2 weeks in milliseconds
    client.update_dataset(dataset, ["default_table_expiration_ms"])
    df = create_dataset()
    job_config = bigquery.LoadJobConfig()
    table_ref = (f"{gcp_project}.{bigquery_dataset}."
                 f"{bq_source_type}_correctness_{int(time.time())}")
    query = f"SELECT * FROM `{table_ref}`"
    job = client.load_table_from_dataframe(df, table_ref,
                                           job_config=job_config)
    job.result()
    bigquery_source = BigQuerySource(
        table_ref=table_ref if bq_source_type == "table" else None,
        query=query if bq_source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={"ts_1": "ts", "id": "driver_id"},
    )
    fv = get_feature_view(bigquery_source)
    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{uuid.uuid4()}",
            provider="gcp",
        )
        fs = FeatureStore(config=config)
        fs.apply([fv])
        yield fs, fv
def load_table(client, table_id):
    """Load local CSV file to an external BigQuery table.

    Args:
        client (google.cloud.bigquery.client.Client): BigQuery client object.
        table_id (str): Desired table id in BigQuery.

    Returns:
        None
    """
    path_absolute_output = os.path.join(
        os.environ['RESULTS_DATA_LOCAL'],
        '{}.csv'.format(os.environ['SCRIPT']))
    # Set the LoadJobConfig for creating the table.
    job_config = bigquery.LoadJobConfig()
    job_config.skip_leading_rows = 1
    job_config.create_disposition = 'CREATE_IF_NEEDED'
    # Define the schema from the CSV header row.
    with open(path_absolute_output) as f:
        headers = f.readline().replace('"', '').replace('\n', '').split(',')
    schema = []
    for i in headers:
        if i in ['province']:
            schema.append(bigquery.SchemaField(i, 'STRING'))
        elif i == 'ts_load':
            schema.append(bigquery.SchemaField(i, 'TIMESTAMP'))
        else:
            schema.append(bigquery.SchemaField(i, 'INTEGER'))
    job_config.schema = schema
    # Load the table; the context manager closes the file handle afterwards.
    with open(path_absolute_output, 'rb') as source_file:
        load_job = client.load_table_from_file(
            source_file, table_id, job_config=job_config)
    # Waits for table load to complete and raises errors, like BadRequest if
    # the file in combination with the schema has errors.
    result_load_job = load_job.result()
    if result_load_job is not None and result_load_job.error_result is not None:
        raise Exception('Load job result errors: {}.'.format(
            result_load_job.error_result))
def load_tables(bq: bigquery.Client, dataset: bigquery.Dataset,
                tables: Iterable[Table]):
    """Load tables for a test."""
    for table in tables:
        destination = dataset.table(table.name)
        job_config = bigquery.LoadJobConfig(
            source_format=table.source_format,
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        )
        if table.schema is None:
            # Autodetect the schema if not provided.
            job_config.autodetect = True
        else:
            job_config.schema = table.schema
            # Look for time_partitioning_field in the provided schema.
            for field in job_config.schema:
                if field.description == "time_partitioning_field":
                    job_config.time_partitioning = bigquery.TimePartitioning(
                        field=field.name)
                    break  # Stop: there can only be one time partitioning field.
        if isinstance(table.source_path, str):
            with open(table.source_path, "rb") as file_obj:
                job = bq.load_table_from_file(file_obj, destination,
                                              job_config=job_config)
        else:
            file_obj = BytesIO()
            for row in load(*table.source_path):
                file_obj.write(
                    json.dumps(row, default=default_encoding).encode() + b"\n")
            file_obj.seek(0)
            job = bq.load_table_from_file(file_obj, destination,
                                          job_config=job_config)
        try:
            job.result()
        except BadRequest:
            # Print the first 5 errors for debugging.
            for row in job.errors[:5]:
                print(row)
            raise
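# `default_encoding` above is a json.dumps fallback that is not defined in
# this snippet. A plausible sketch that stringifies dates and other
# non-JSON-serializable values; the body is an assumption, not the original:
import datetime

def default_encoding(obj):
    # Render dates/datetimes in ISO format; fall back to str() otherwise.
    if isinstance(obj, (datetime.date, datetime.datetime)):
        return obj.isoformat()
    return str(obj)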
def main():
    args = parser.parse_args()
    client = bigquery.Client(args.project)
    experiments = get_experiments()
    destination_table = (
        f"{args.project}.{args.destination_dataset}.{args.destination_table}")
    bq_schema = (
        bigquery.SchemaField("experimenter_slug", "STRING"),
        bigquery.SchemaField("normandy_slug", "STRING"),
        bigquery.SchemaField("type", "STRING"),
        bigquery.SchemaField("status", "STRING"),
        bigquery.SchemaField("start_date", "DATE"),
        bigquery.SchemaField("end_date", "DATE"),
        bigquery.SchemaField("proposed_enrollment", "INTEGER"),
        bigquery.SchemaField("reference_branch", "STRING"),
        bigquery.SchemaField("is_high_population", "BOOL"),
        bigquery.SchemaField(
            "branches",
            "RECORD",
            mode="REPEATED",
            fields=[
                bigquery.SchemaField("slug", "STRING"),
                bigquery.SchemaField("ratio", "INTEGER"),
            ],
        ),
    )
    # LoadJobConfig has no `destination` property; the destination is passed
    # to load_table_from_json below instead.
    job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")
    job_config.schema = bq_schema
    converter = cattr.Converter()
    converter.register_unstructure_hook(
        datetime.datetime,
        lambda d: datetime.datetime.strftime(d, format="%Y-%m-%d"))
    client.load_table_from_json(converter.unstructure(experiments),
                                destination_table,
                                job_config=job_config).result()
    print(f"Loaded {len(experiments)} experiments")
def cloud_storage_to_table(self, bucket_name, filename, dataset_id, table_id,
                           job_config=None, import_format="csv",
                           location="US", **kwargs):
    """Extract table from GoogleStorage and send to BigQuery."""
    self.create_table(dataset_id, table_id)
    dataset_ref = self._client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = job_config if job_config else bigquery.LoadJobConfig()
    job_config.source_format = self.FILE_FORMATS.get(import_format)
    return self._client.load_table_from_uri(
        "gs://{}/{}".format(bucket_name, filename),
        table_ref,
        job_config=job_config,
        location=location,
        **kwargs
    ).result()
def load_data(self, gcs_file_path):
    """Loads sample data to sample table.

    Args:
        gcs_file_path: Cloud storage path for CSV.
    """
    destination_table = self._client.get_table(self._table_ref)
    if destination_table.num_rows < 1:
        job_config = bigquery.LoadJobConfig()
        job_config.autodetect = True
        job_config.skip_leading_rows = 1
        job_config.source_format = bigquery.SourceFormat.CSV
        load_job = self._client.load_table_from_uri(
            gcs_file_path, self._table_ref, job_config=job_config)
        load_job.result()  # Waits for table load to complete.
        logger.info('Finished loading data. Job id: "%s"', load_job.job_id)
    else:
        logger.info('Required data already exists in "%s".',
                    destination_table.table_id)
def loaddatafromfiletotable(bqclient, newdatasetname, newtablename):
    dataset_ref = newdatasetname
    table = newtablename.table_id
    table_ref = "{}.{}".format(dataset_ref, table)
    jobconfig = bigquery.LoadJobConfig()  # Initialize job configuration.
    jobconfig.source_format = 'CSV'
    jobconfig.skip_leading_rows = 1
    # "JOB_WRITE_TRUNCATE" is not a valid disposition; the API expects
    # "WRITE_TRUNCATE".
    jobconfig.write_disposition = "WRITE_TRUNCATE"
    job = bqclient.load_table_from_uri(loadFilePath, table_ref,
                                       job_config=jobconfig)
    result = job.result()  # Wait for the job to complete.
    print(" job-type = {}".format(job.job_type))
    print("Load job status")
    print(result.state)
    print("Load job statistics")
def load_data_from_gcs(dataset_id, table_id, source):
    bigquery_client = bigquery.Client()
    dataset_ref = bigquery_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = False
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1
    # Other options: WRITE_EMPTY, WRITE_TRUNCATE.
    job_config.write_disposition = 'WRITE_APPEND'
    job = bigquery_client.load_table_from_uri(source, table_ref,
                                              job_config=job_config)
    job.result()  # Waits for job to complete.
    print('Loaded {} rows into {}:{}.'.format(job.output_rows, dataset_id,
                                              table_id))
def _upload_2_bigquery(self, file_path, table_id):
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True,
        allow_quoted_newlines=True,
        write_disposition="WRITE_TRUNCATE",
    )
    with open(file_path, "rb") as source_file:
        job = self.client.load_table_from_file(source_file, table_id,
                                               job_config=job_config)
    job.result()  # Waits for the job to complete.
    table = self.client.get_table(table_id)  # Make an API request.
    print(
        f"Loaded {table.num_rows} rows and {len(table.schema)} columns to {table_id}"
    )
def update_bq_table(uri, fn, table_ref, table_schema):
    """Loads data from a GCS bucket into a BigQuery table."""
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_APPEND"
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    # Use the explicit schema rather than autodetect.
    job_config.autodetect = False
    job_config.schema = table_schema
    orig_rows = bq_client.get_table(table_ref).num_rows
    load_job = bq_client.load_table_from_uri(uri + fn, table_ref,
                                             job_config=job_config)  # API request
    print("Starting job {}".format(load_job.job_id))
    load_job.result()  # Waits for table load to complete.
    destination_table = bq_client.get_table(table_ref)
    print('Loaded {} rows into {}:{}.'.format(
        destination_table.num_rows - orig_rows, 'sumo', table_ref.table_id))
def load_task():
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig()
    schema_path = os.path.join(
        dags_folder,
        'resources/stages/raw/schemas/{task}.json'.format(task=task))
    job_config.schema = read_bigquery_schema_from_file(schema_path)
    job_config.source_format = (
        bigquery.SourceFormat.CSV if file_format == 'csv'
        else bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)
    if file_format == 'csv':
        job_config.skip_leading_rows = 1
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.allow_quoted_newlines = allow_quoted_newlines
    job_config.ignore_unknown_values = True
    export_location_uri = 'gs://{bucket}/export'.format(bucket=output_bucket)
    uri = '{export_location_uri}/{task}/*.{file_format}'.format(
        export_location_uri=export_location_uri,
        task=task,
        file_format=file_format)
    table_ref = client.dataset(dataset_name_raw).table(task)
    load_job = client.load_table_from_uri(uri, table_ref,
                                          job_config=job_config)
    submit_bigquery_job(load_job, job_config)
    assert load_job.state == 'DONE'
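# `read_bigquery_schema_from_file` (used here and in the later load_task) is
# not defined in these snippets. A minimal sketch, assuming the schema files
# use the standard BigQuery JSON schema format; the body is an assumption:
import json
from google.cloud import bigquery

def read_bigquery_schema_from_file(schema_path):
    # Parse a JSON schema file into a list of SchemaField objects.
    with open(schema_path) as schema_file:
        fields = json.load(schema_file)
    return [bigquery.SchemaField.from_api_repr(field) for field in fields]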
def upload_csv(table_id):
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True,
    )
    job_config.write_disposition = 'WRITE_APPEND'
    # The Python client uses snake_case; assigning to `schemaUpdateOptions`
    # would be silently ignored.
    job_config.schema_update_options = [
        'ALLOW_FIELD_ADDITION',
        'ALLOW_FIELD_RELAXATION'
    ]
    with open("interview.csv", "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id,
                                          job_config=job_config)
    job.result()  # Waits for the job to complete.
    table = client.get_table(table_id)  # Make an API request.
    print("Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id))
def job_config(self):
    job_config = bigquery.LoadJobConfig()
    job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
    job_config.source_format = bigquery.job.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.autodetect = True
    if self._write_disposition == "truncate":
        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    elif self._write_disposition == "append":
        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_APPEND
    else:
        raise Exception("Unknown BigQuery write disposition")
    if self._partition_column:
        job_config.time_partitioning = bigquery.table.TimePartitioning(
            bigquery.table.TimePartitioningType.DAY, self._partition_column
        )
    return job_config
def upload_to_bq(gs_path, table_name, source_format='csv'):
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig()
    if source_format == 'csv':
        job_config.source_format = bigquery.SourceFormat.CSV
    elif source_format == 'json':
        job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.write_disposition = 'WRITE_TRUNCATE'
    if source_format == 'csv':
        job_config.skip_leading_rows = 1
    dataset_ref = client.dataset('covid19')
    dataset = bigquery.Dataset(dataset_ref)
    load_job = client.load_table_from_uri(
        gs_path, dataset_ref.table(table_name), job_config=job_config)
    try:
        load_job.result()
    except google.api_core.exceptions.BadRequest:
        raise ValueError(load_job.errors)
def runme():
    dataset_id = 'IanTest'
    table_id = 'ia'
    path1 = os.path.dirname(os.path.realpath(__file__))
    parentPath = os.path.dirname(path1)
    # type = sys.argv[1]
    # file = os.path.join(parentPath, "store", type + ".pkl")
    filename = os.path.join(parentPath, "store", "adviser.csv")
    creds = os.path.join(parentPath, "creds", "backlogger_bq.json")
    client = bigquery.Client.from_service_account_json(creds)
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    # The properties are snake_case; assigning to `WriteDisposition` /
    # `CreateDisposition` (as the original did) is silently ignored.
    job_config.write_disposition = "WRITE_TRUNCATE"
    job_config.create_disposition = "CREATE_IF_NEEDED"
    with open(filename, "rb") as source_file:
        job = client.load_table_from_file(
            source_file,
            table_ref,
            # location="EU",  # Must match the destination dataset location.
            job_config=job_config
        )  # API request
    try:
        job.result()  # Waits for table load to complete.
    except Exception:  # Avoid a bare except; still report the job errors.
        for er in job.errors:
            print(er)
    print("Loaded {} rows into {}:{}.".format(job.output_rows, dataset_id,
                                              table_id))
def load_task(**context):
    dags_folder = os.environ.get("DAGS_FOLDER", "/home/airflow/gcs/dags")
    schema_path = os.path.join(
        dags_folder,
        "resources/stages/load/schemas/{schema}.json".format(schema="annual_npp"),
    )
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig()
    job_config.schema = read_bigquery_schema_from_file(schema_path)
    job_config.source_format = bigquery.SourceFormat.PARQUET
    job_config.write_disposition = "WRITE_TRUNCATE"
    job_config.ignore_unknown_values = True
    job_config.clustering_fields = [
        "geography",
        "geography_polygon",
    ]
    job_config.range_partitioning = RangePartitioning(
        field="year",
        range_=PartitionRange(start=1900, end=2100, interval=1),
    )
    execution_date = context["execution_date"]
    load_table_name = "{table}${partition}".format(
        table=destination_table_name,
        partition=execution_date.strftime("%Y"))
    table_ref = create_dataset(
        client,
        destination_dataset_name,
        project=destination_dataset_project_id,
    ).table(load_table_name)
    load_uri = "gs://{bucket}/{prefix}/annual_npp/parquet/{date}.parquet".format(
        bucket=output_bucket,
        prefix=output_path_prefix,
        date=execution_date.strftime("%Y_%m_%d"),
    )
    load_job = client.load_table_from_uri(
        load_uri,
        table_ref,
        job_config=job_config,
    )
    submit_bigquery_job(load_job, job_config)
    assert load_job.state == "DONE"
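# `submit_bigquery_job` (used here and in the earlier load_task) is not
# defined in these snippets. A minimal sketch of a compatible wrapper that
# waits for the job and surfaces its errors; the body is an assumption, not
# the original implementation:
def submit_bigquery_job(job, job_config):
    # Block until the load finishes; re-raise with the job's error details.
    try:
        job.result()
    except Exception:
        raise Exception('BigQuery job failed: {}'.format(job.errors))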
def load_csv(uri, table_id, schema):
    job_config = bigquery.LoadJobConfig(
        schema=schema,
        skip_leading_rows=1,
        source_format=bigquery.SourceFormat.CSV,
        # If the table already exists, drop and recreate it.
        write_disposition='WRITE_TRUNCATE',
    )
    load_job = client.load_table_from_uri(uri, table_id,
                                          job_config=job_config)
    try:
        load_job.result()
    except BadRequest:
        # Report each load error; the original shadowed the exception
        # variable `e` with the loop variable.
        for error in load_job.errors:
            print('ERROR: {}'.format(error['message']))
    table = client.get_table(table_id)
    print("Loaded {} rows to table {}".format(table.num_rows, table_id))
def load_data_to_bigquery(project):
    dataset = 'fashion_mnist'
    train = 'train'
    test = 'test'
    # Create dataset, train and test tables.
    client = bigquery.Client(project=project)
    client.create_dataset(dataset)
    train_table = client.create_table(f'{project}.{dataset}.{train}')
    test_table = client.create_table(f'{project}.{dataset}.{test}')
    # Set job configuration. The schema is explicit, so autodetect is left
    # off; the original set both, which is contradictory.
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.field_delimiter = "|"
    job_config.schema = [
        bigquery.schema.SchemaField("image_raw", "STRING"),
        bigquery.schema.SchemaField("label", "INTEGER"),
    ]
    # Load train data to BigQuery. (Avoid shadowing the `input` builtin.)
    with open(_train_file, "rb") as train_file:
        train_load = client.load_table_from_file(train_file, train_table,
                                                 job_config=job_config)
    # Load test data to BigQuery.
    with open(_test_file, "rb") as test_file:
        test_load = client.load_table_from_file(test_file, test_table,
                                                job_config=job_config)
    # The loading is an async operation. Wait for it to finish.
    train_load.result()
    test_load.result()
    print(
        f'Loaded {train_load.output_rows} rows from {_train_file} into {project}.{dataset}.{train}'
    )
    print(
        f'Loaded {test_load.output_rows} rows from {_test_file} into {project}.{dataset}.{test}'
    )
def load_functional_alltypes_parted_data(
        request, bqclient, create_functional_alltypes_parted_table):
    if request.config.getoption("--no-refresh-dataset"):
        return
    table = create_functional_alltypes_parted_table
    load_config = bigquery.LoadJobConfig()
    load_config.write_disposition = "WRITE_TRUNCATE"
    load_config.skip_leading_rows = 1  # Skip the header row.
    filepath = download_file(
        "{}/functional_alltypes.csv".format(TESTING_DATA_URI))
    with open(filepath.name, "rb") as csvfile:
        job = bqclient.load_table_from_file(
            csvfile, table, job_config=load_config,
        ).result()
    if job.error_result:
        print("error")
def cloudsql_to_bigquery(request):
    request_obj = request.get_json(silent=True)
    job_done = False
    if request_obj and request_obj['message'] == os.getenv('TASK_TO_EXECUTE'):
        # Connect to Cloud SQL.
        connection = pymysql.connect(
            unix_socket=os.getenv('CONNECTION_STRING'),
            user=os.getenv('USERNAME'),
            password=os.getenv('PASSWORD'),
            db=os.getenv('DATABASE'),
            cursorclass=pymysql.cursors.DictCursor)
        print('Connection:', connection)
        # Query to read data from the Cloud SQL table.
        sql_query = 'SELECT * FROM <sql_table_name>'
        # Create a dataframe by reading the table data.
        dataframe = pb.read_sql(sql_query, connection,
                                index_col='<col_name>Ex:id')
        print(dataframe.head(5))
        # Connect to the BigQuery client.
        bq_client = bigquery.Client()
        bq_dataset = bq_client.dataset('<bq_dataset_name>')
        bq_table_name = bq_dataset.table('<bq_table_name>')
        # Create the job configuration.
        bq_job_config = bigquery.LoadJobConfig()
        bq_job_config.autodetect = True
        # Alternatives: WRITE_APPEND, WRITE_EMPTY, WRITE_DISPOSITION_UNSPECIFIED.
        bq_job_config.write_disposition = 'WRITE_TRUNCATE'
        # Load the data into the BigQuery table.
        write_data = bq_client.load_table_from_dataframe(
            dataframe, bq_table_name, job_config=bq_job_config)
        write_data.result()  # Wait for the job to finish.
        # Print the write job.
        print('Running task {}'.format(write_data))
        job_done = True
    # Return a valid response.
    if job_done:
        return ('Success', 200)
    else:
        return ('Failed', 500)
def gcs_to_gbq():
    # Connect to Google Cloud Storage and set the bucket.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(config.gcp['bucket_name'])
    blobs = bucket.list_blobs()
    client = bigquery.Client()
    # Connect to Google BigQuery and define the BigQuery options.
    dataset_ref = client.dataset(config.gcp['dataset_id'])
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.allow_quoted_newlines = True
    job_config.allow_jagged_rows = True
    job_config.autodetect = False
    job_config.skip_leading_rows = 1
    job_config.field_delimiter = "|"
    job_config.source_format = bigquery.SourceFormat.CSV
    # Iterate through all files of the defined Google Cloud Storage bucket.
    for blob in blobs:
        # Set the dynamic URL for the current Cloud Storage file and the
        # BigQuery schema file.
        uri = 'gs://' + config.gcp['bucket_name'] + '/' + blob.name
        file_name_json = blob.name.replace(".csv", ".json")
        file_name_json_path = os.path.join(config.general['csv_cache_folder'],
                                           file_name_json)
        # Load the JSON file for the schema and set the schema fields for
        # BigQuery. (Use a context manager so the file handle is closed.)
        with open(file_name_json_path) as input_json:
            input_json_config = json.load(input_json)
        job_config.schema = [
            bigquery.SchemaField(item["name"], item["type"])
            for item in input_json_config["fields"]
        ]
        # Set the dynamic table name for the Google BigQuery table.
        table_name = blob.name.replace(".csv", "")
        # Create a new BigQuery table / replace the existing one.
        load_job = client.load_table_from_uri(uri,
                                              dataset_ref.table(table_name),
                                              job_config=job_config)
        assert load_job.job_type == 'load'
        load_job.result()  # Waits for table load to complete.
        assert load_job.state == 'DONE'
    # Send a Slack message when the load job is done.
    send_message_to_slack(
        ':bigquery: Backend Data Sync: Update DWH Data OK :thumbs-up-green:',
        config.slack_hooks['slack_channel_1'])