def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # since views cannot get exported directly, write data into a temporary table
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
        """
    )
    job.result()

    destination_uri = f"gs://{bucket}/{table}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(
        table_ref, destination_uri, location="US", job_config=job_config
    )
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, table, storage_client)
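# A minimal usage sketch for _export_table above. The project, dataset, table,
# and bucket names are hypothetical placeholders, and _convert_ndjson_to_json
# is assumed to be defined elsewhere in the same module.
export_client = bigquery.Client(project="my-project")
export_storage_client = storage.Client(project="my-project")
_export_table(
    client=export_client,
    project_id="my-project",
    dataset_id="analysis",
    table="experiment_daily",
    bucket="export-bucket",
    storage_client=export_storage_client,
)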
def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    target_path: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # since views cannot get exported directly, write data into a temporary table
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
        WHERE analysis_basis = 'enrollments'
        """
    )
    # todo: once experimenter supports different analysis_bases, remove filter
    job.result()

    # add a random string to the identifier to prevent collision errors if there
    # happen to be multiple instances running that export data for the same experiment
    tmp = "".join(random.choices(string.ascii_lowercase, k=8))
    destination_uri = f"gs://{bucket}/{target_path}/{table}-{tmp}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(
        table_ref, destination_uri, location="US", job_config=job_config
    )
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, target_path, table, storage_client, tmp)
def extract_rows(
    table_name: str = table_name,
    bucket_name: str = dest_bucket_name,
    path: str = dest_path,
    diff_type: str = diff_type,
    dest_data_project: str = dest_data_project,
    dest_dataset_name: str = dest_dataset_name,
    client: bigquery.Client = dest_client,
):
    """Extract the diff table for the given diff type to GCS as newline-delimited JSON."""
    # note: the keyword is print_header (not printHeader), and it only affects CSV exports
    job_config = bigquery.ExtractJobConfig(
        print_header=False, destination_format="NEWLINE_DELIMITED_JSON"
    )
    diff_type_val = DiffType[diff_type].value
    destination_uri = f"gs://{bucket_name}/{path}/{table_name}/{diff_type_val}/*"
    dataset_ref = bigquery.DatasetReference(dest_data_project, dest_dataset_name)
    table_ref = dataset_ref.table(f"{table_name}_{diff_type_val}")
    extract_job = client.extract_table(
        table_ref, destination_uri, job_config=job_config
    )  # API request
    print(f"The write destination is: {destination_uri}")
    try:
        extract_job.result()
    except GoogleCloudError as err:
        print(f"There was a {type(err)}")
        print(err)
def query_to_dataframe(
    query: str,
    bigquery_client: bigquery.Client = None,
    storage_client: storage.Client = None,
    project: str = None,
    templocation: str = None,
) -> pd.DataFrame:
    """
    Run a query job on BigQuery and return the result in Pandas DataFrame format

    Args:
        query: BigQuery query, e.g. "SELECT * FROM dataset.table"
        bigquery_client: BigQuery client to use; created for `project` if not given
        storage_client: Cloud Storage client to use; created for `project` if not given
        project: Google Cloud project id
        templocation: Google Cloud Storage location to store intermediate files,
            must start with "gs://"

    Returns:
        Pandas DataFrame of the query result
    """
    if isinstance(templocation, str) and not templocation.startswith("gs://"):
        raise RuntimeError('templocation must start with "gs://"')

    if bigquery_client is None:
        bigquery_client = bigquery.Client(project=project)
    if project is None:
        project = bigquery_client.project

    query_job = bigquery_client.query(query, project=project)
    query_job_state = ""
    while not query_job.done():
        if query_job.state != query_job_state:
            print(f"Query status: {query_job.state}")
            query_job_state = query_job.state
        time.sleep(5)
    if query_job.state != query_job_state:
        print(f"Query status: {query_job.state}")
    if query_job.exception():
        raise query_job.exception()

    if not templocation:
        templocation = get_default_templocation(bigquery_client, project=project)
    if templocation.endswith("/"):
        templocation = templocation[:-1]

    # export the query result to GCS as Avro, then read it back with fastavro
    destination_uri = (
        f"{templocation}/bq-{datetime.now(pytz.utc).strftime('%Y%m%dT%H%M%SZ')}.avro"
    )
    extract_job_config = bigquery.job.ExtractJobConfig(destination_format="AVRO")
    extract_job = bigquery_client.extract_table(
        query_job.destination, destination_uri, job_config=extract_job_config
    )
    while not extract_job.done():
        time.sleep(5)
    if extract_job.exception():
        raise extract_job.exception()

    if not storage_client:
        storage_client = storage.Client(project=project)

    print("Reading query result into DataFrame")
    bucket_name, blob_name = (
        destination_uri.split("/")[2],
        "/".join(destination_uri.split("/")[3:]),
    )
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.get_blob(blob_name)
    downloaded_avro_filename = tempfile.NamedTemporaryFile().name
    blob.download_to_filename(downloaded_avro_filename)
    with open(downloaded_avro_filename, "rb") as avro_file:
        avro_reader = fastavro.reader(avro_file)
        df = pd.DataFrame.from_records(avro_reader)
    return df
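# A minimal usage sketch for query_to_dataframe above. The query, project id,
# and templocation are hypothetical placeholders; if templocation is omitted,
# get_default_templocation is assumed to be defined elsewhere in this module.
df = query_to_dataframe(
    "SELECT name, value FROM dataset.table LIMIT 1000",
    project="my-project",
    templocation="gs://my-temp-bucket/bq-exports",
)
print(df.head())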