Example #1
def main(event, context):
    """Entrypoint for Cloud Function"""

    data = base64.b64decode(event['data'])
    upstream_bq_dts_obj = json.loads(data)
    error = upstream_bq_dts_obj.get('errorStatus')
    if error:
        logging.error(
            RuntimeError(f"Error in upstream query job: {error['message']}."))
    else:
        project_id = get_env('PROJECT_ID')
        dataset_id = upstream_bq_dts_obj['destinationDatasetId']
        table_name = upstream_bq_dts_obj['params'][
            'destination_table_name_template']
        schedule_time = upstream_bq_dts_obj['scheduleTime']

        bq_client = bigquery.Client(client_info=CLIENT_INFO)

        dataset_ref = bigquery.DatasetReference.from_string(
            dataset_id, default_project=project_id)
        table_ref = dataset_ref.table(table_name)
        destination_uri = get_destination_uri(schedule_time)
        extract_config = bigquery.ExtractJobConfig(
            compression=get_env('COMPRESSION'),
            destination_format=get_env('DEST_FMT'),
            field_delimiter=get_env('FIELD_DELIMITER'),
            use_avro_logical_types=get_env('USE_AVRO_TYPES'))
        bq_client.extract_table(table_ref,
                                destination_uri,
                                job_id_prefix="email_export_",
                                job_config=extract_config)
        print(
            f"Exporting {project_id}:{dataset_id}.{table_name} to {destination_uri}"
        )
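Example #1 also relies on get_env, get_destination_uri and CLIENT_INFO, which are defined elsewhere in its module. A minimal sketch of what the two helpers might look like (hypothetical implementations; the BUCKET_NAME environment variable is an assumption, not part of the original):

import os


def get_env(name):
    """Read a required configuration value from the environment."""
    value = os.environ.get(name)
    if value is None:
        raise KeyError(f"Missing required environment variable: {name}")
    return value


def get_destination_uri(schedule_time):
    """Build a date-partitioned GCS wildcard URI from the transfer run's scheduleTime."""
    run_date = schedule_time.split("T")[0].replace("-", "")  # "2021-01-01T12:00:00Z" -> "20210101"
    return f"gs://{get_env('BUCKET_NAME')}/{run_date}/export_*.csv.gz"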
Example #2
def _extract_bq_table(project_id, dataset_id, table_id, bucket_name,
                      facturation_project_id):
    work_directory = str(uuid.uuid4())
    facturation_project_id = facturation_project_id or project_id

    # Prepare extract job
    client = bigquery.Client(project=facturation_project_id)
    dataset_ref = client.dataset(dataset_id, project=project_id)
    table_ref = dataset_ref.table(table_id)
    gs_uri = "gs://{}/{}/part_*.csv.gz".format(bucket_name, work_directory)
    extract_conf = bigquery.ExtractJobConfig()
    extract_conf.compression = 'GZIP'
    extract_conf.destination_format = 'CSV'
    extract_conf.print_header = False

    # Ensure bucket exists
    location = client.get_dataset(dataset_ref).location
    _ensure_bucket(project_id, bucket_name, location)

    print('Extracting table %s to %s' % (table_ref, gs_uri))
    extract_job = client.extract_table(table_ref,
                                       gs_uri,
                                       job_config=extract_conf)
    extract_job.result()
    _check_job_status(extract_job)
    return work_directory
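Example #2 calls _ensure_bucket and _check_job_status, which are not shown. A hedged sketch of what _ensure_bucket might do with the google-cloud-storage client (an assumption, not the original implementation):

from google.cloud import storage
from google.cloud.exceptions import NotFound


def _ensure_bucket(project_id, bucket_name, location):
    """Create the bucket in the dataset's location if it does not already exist."""
    storage_client = storage.Client(project=project_id)
    try:
        storage_client.get_bucket(bucket_name)
    except NotFound:
        storage_client.create_bucket(bucket_name, location=location)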
def _upload_table_to_gcs(
    table,
    bucket,
    gcs_path,
    experiment_slug,
    table_name,
    source_project,
    client,
    storage_client,
):
    """Export the provided table reference to GCS as JSON."""
    # add a random string to the identifier to prevent collision errors if there
    # happen to be multiple instances running that export data for the same experiment
    tmp = "".join(random.choices(string.ascii_lowercase, k=8))
    destination_uri = (
        f"gs://{bucket}/{gcs_path}/{experiment_slug}_{table_name}_{tmp}.ndjson"
    )

    print(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(table,
                                       destination_uri,
                                       location="US",
                                       job_config=job_config)
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, gcs_path, experiment_slug, table_name,
                            storage_client, tmp)
Example #4
def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # since views cannot be exported directly, write the data into a temporary table
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
    """
    )

    job.result()

    destination_uri = f"gs://{bucket}/{table}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(
        table_ref, destination_uri, location="US", job_config=job_config
    )
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, table, storage_client)
Example #5
    def _publish_table_as_json(self, result_table):
        """Export the `result_table` data as JSON to Cloud Storage."""
        prefix = (f"api/{self.api_version}/tables/{self.dataset}/"
                  f"{self.table}/{self.version}/files/")

        if self.date is not None:
            # if date exists, then query is incremental and newest results are exported
            prefix += f"{self.date}/"

        logging.info(
            f"""Export JSON for {result_table} to {self.stage_gcs_path}""")

        table_ref = self.client.get_table(result_table)

        job_config = bigquery.ExtractJobConfig()
        job_config.destination_format = "NEWLINE_DELIMITED_JSON"

        # "*" makes sure that files larger than 1GB get split up into JSON files
        # files are written to a stage directory first
        destination_uri = (f"gs://{self.target_bucket}/" +
                           self.stage_gcs_path + "*.ndjson")
        extract_job = self.client.extract_table(table_ref,
                                                destination_uri,
                                                location="US",
                                                job_config=job_config)
        extract_job.result()

        self._gcp_convert_ndjson_to_json(prefix)
def save_table_to_storage(bq_table_id,
                          bq_project_id='freestyle-libre-app',
                          bq_dataset_id='tmp',
                          cred_file=None):
    print('saving file to storage...')
    PROJECT_ID = bq_project_id
    CREDENTIALS = get_credentials(cred_file)

    yesterday_dt = datetime.datetime.today() - datetime.timedelta(days=1)
    yesterday_str = datetime.datetime.strftime(yesterday_dt, '%Y%m%d')

    gcs_bucket = 'adcpipeline.appspot.com/digital-exhibit/data/fsll/nfc_scan_success'
    gcs_filename = '{filename}.json'.format(filename='scan_data')
    gcs_destination_uri = 'gs://{}/{}'.format(gcs_bucket, gcs_filename)

    bq_client = bigquery.Client(project=PROJECT_ID, credentials=CREDENTIALS)

    dataset_ref = bq_client.dataset(bq_dataset_id)
    table_ref = dataset_ref.table(bq_table_id)

    extract_config = bigquery.ExtractJobConfig()
    # extract_config.compression = 'NONE'
    extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'

    extract_job = bq_client.extract_table(table_ref,
                                          gcs_destination_uri,
                                          job_id_prefix='exhibit',
                                          job_config=extract_config,
                                          location='US')  # API request
    extract_job.result()  # Waits for job to complete.
    print('Exported {}.{}.{} to {}'.format(bq_project_id, bq_dataset_id,
                                           bq_table_id, gcs_destination_uri))
    return extract_job
Example #7
    def table_to_cloud_storage(self,
                               dataset_id,
                               table_id,
                               bucket_name,
                               filename,
                               job_config=None,
                               export_format="csv",
                               compression_format="gz",
                               location="US",
                               **kwargs):
        """Extract a table from BigQuery and send to GoogleStorage"""
        complete_filename = self._complete_filename(filename, export_format,
                                                    compression_format)

        destination_uri = "gs://{}/{}".format(bucket_name, complete_filename)
        table = self._client.dataset(dataset_id).table(table_id)

        job_config = job_config if job_config else bigquery.ExtractJobConfig()

        job_config.compression = self.COMPRESSION_FORMATS.get(
            compression_format)
        job_config.destination_format = self.FILE_FORMATS.get(export_format)

        return self._client.extract_table(table,
                                          destination_uri,
                                          location=location,
                                          job_config=job_config,
                                          **kwargs).result()
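Example #7 references class attributes (COMPRESSION_FORMATS, FILE_FORMATS) and a _complete_filename helper that are not shown. A plausible sketch of what they might contain, with hypothetical values rather than the original definitions:

    # Hypothetical lookup tables mapping short names to BigQuery API constants.
    COMPRESSION_FORMATS = {"gz": "GZIP", None: None}
    FILE_FORMATS = {"csv": "CSV", "json": "NEWLINE_DELIMITED_JSON", "avro": "AVRO"}

    def _complete_filename(self, filename, export_format, compression_format):
        """Append format and compression extensions, e.g. report -> report.csv.gz."""
        name = "{}.{}".format(filename, export_format)
        if compression_format:
            name = "{}.{}".format(name, compression_format)
        return name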
Example #8
def bq_to_bucket_tsv(src_table, project, dataset, bucket_name, bucket_file, do_batch, do_header):
    """
    Export a BigQuery table to a TSV file in a Cloud Storage bucket
    """
    client = bigquery.Client()
    destination_uri = "gs://{}/{}".format(bucket_name, bucket_file)
    dataset_ref = client.dataset(dataset, project=project)
    table_ref = dataset_ref.table(src_table)

    job_config = bigquery.ExtractJobConfig()
    if do_batch:
        job_config.priority = bigquery.QueryPriority.BATCH
    location = 'US'
    job_config.field_delimiter = '\t'
    job_config.print_header = do_header

    extract_job = client.extract_table(table_ref, destination_uri, location=location, job_config=job_config)

    # Poll until the extract job is done
    job_state = 'NOT_STARTED'
    while job_state != 'DONE':
        extract_job = client.get_job(extract_job.job_id, location=location)
        print('Job {} is currently in state {}'.format(extract_job.job_id, extract_job.state))
        job_state = extract_job.state
        if job_state != 'DONE':
            time.sleep(5)
    print('Job {} is done'.format(extract_job.job_id))

    extract_job = client.get_job(extract_job.job_id, location=location)
    if extract_job.error_result is not None:
        print('Error result!! {}'.format(extract_job.error_result))
        return False
    return True
Example #9
    def execute(self, *args):
        super().execute()

        valid = EssentialParameters(
            self.__class__.__name__, [self._tblname, self._bucket, self._dest_dir]
        )
        valid()

        os.makedirs(self._dest_dir, exist_ok=True)

        if isinstance(self._credentials, str):
            self._logger.warning(
                (
                    "DeprecationWarning: "
                    "In the near future, "
                    "the `credentials` will be changed to accept only dictionary types. "
                )
            )
            key_filepath = self._credentials
        else:
            key_filepath = self._source_path_reader(self._credentials)
        gbq_client = BigQuery.get_bigquery_client(key_filepath)
        gbq_ref = gbq_client.dataset(self._dataset).table(self._tblname)

        gcs_client = Gcs.get_gcs_client(key_filepath)
        gcs_bucket = gcs_client.bucket(self._bucket)

        ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
        path = "%s-%s" % ("".join(random.choices(string.ascii_letters, k=8)), ymd_hms)
        prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

        """
        gcs dir -> gs://{bucket_name}
                       /{dataset_name}/{table_name}
                       /{XXXXXXXX}-{yyyyMMddHHmmssSSS}/*.csv.gz
        """
        if self._filename:
            dest_gcs = "gs://%s/%s/%s*.csv.gz" % (self._bucket, prefix, self._filename)
        else:
            dest_gcs = "gs://%s/%s/*.csv.gz" % (self._bucket, prefix)

        # job config settings
        job_config = bigquery.ExtractJobConfig()
        job_config.compression = bigquery.Compression.GZIP
        job_config.destination_format = bigquery.DestinationFormat.CSV

        # Execute the extract job.
        job = gbq_client.extract_table(
            gbq_ref, dest_gcs, job_config=job_config, location=self._location
        )
        job.result()

        # Download from gcs
        for blob in gcs_client.list_blobs(gcs_bucket, prefix=prefix):
            dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
            blob.download_to_filename(dest)

        # Cleanup temporary files
        for blob in gcs_client.list_blobs(gcs_bucket, prefix=prefix):
            blob.delete()
    def GBQTableToGCS(self, **kwargs):
        self.bucket = kwargs.get('bucket', None)
        self.destination = kwargs.get('destination', None)
        self.dataset = kwargs.get('dataset', None)
        self.table = kwargs.get('table', None)
        self.field_delimiter = kwargs.get('field_delimiter', ',')

        client = bigquery.Client()

        self.destination_uri = f'gs://{self.bucket}/{self.destination}'

        self.dataset_ref = client.dataset(self.dataset)
        self.table_ref = self.dataset_ref.table(self.table)

        job_config = bigquery.ExtractJobConfig()
        job_config.field_delimiter = self.field_delimiter

        if helpers.isLocationArgVersion():
            extract_job = client.extract_table(self.table_ref, self.destination_uri, location='US', job_config=job_config)
            extract_job.result()
        else:
            extract_job = client.extract_table(self.table_ref, self.destination_uri, job_config=job_config)
            extract_job.result()

        return extract_job
def export_table(bq_client, table_ref, dest_uri, dest_fmt):
    """ Run the extract job to export the give table to the given destination and wait for completion"""
    job_config = bigquery.ExtractJobConfig(destination_format=dest_fmt)
    extract_job = bq_client.extract_table(table_ref,
                                          dest_uri,
                                          location='US',
                                          job_config=job_config)
    extract_job.result()
    logging.info("Exported %s to %s", table_ref.table_id, dest_uri)
Example #12
def get_extract_config(file_name):
    jc = bq.ExtractJobConfig()

    compression = bq.job.Compression.GZIP if fnmatch(file_name, '*.gz') \
        else bq.job.Compression.NONE

    jc.compression = compression
    jc.destination_format = bq.job.DestinationFormat.CSV

    return jc
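A short usage sketch for get_extract_config, assuming bq is an alias for google.cloud.bigquery and using placeholder table and bucket names:

from google.cloud import bigquery as bq

client = bq.Client()
file_name = "events-*.csv.gz"
extract_config = get_extract_config(file_name)  # GZIP is selected because of the .gz suffix
client.extract_table(
    "my-project.my_dataset.events",
    "gs://my-bucket/exports/" + file_name,
    job_config=extract_config,
).result()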
Example #13
    def config_job(self,
                   destination_format='CSV',
                   field_delimiter=',',
                   print_header=True):
        job_config = bigquery.ExtractJobConfig()

        job_config.destination_format = destination_format
        job_config.field_delimiter = field_delimiter
        job_config.print_header = print_header

        return job_config
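A usage sketch for config_job; `exporter` stands for an instance of the surrounding class, and the table and bucket names are placeholders:

from google.cloud import bigquery

job_config = exporter.config_job(field_delimiter="|", print_header=False)
bigquery.Client().extract_table("my-project.my_dataset.my_table",
                                "gs://my-bucket/exports/my_table-*.csv",
                                job_config=job_config).result()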
Example #14
    def read_file(bucket, storage_client=storage_client, bq_client=bq_client):
        ds_ref = bq_client.dataset('pysearchml')
        bq_client.create_dataset(ds_ref, exists_ok=True)

        table_id = 'es_docs'
        table_ref = ds_ref.table(table_id)

        bucket_obj = storage_client.bucket(bucket)
        if not bucket_obj.exists():
            bucket_obj.create()

        # Query GA data
        query_path = PATH / f'{args.model_name}' / 'ga_data.sql'
        query = open(str(query_path)).read()
        print(query)
        job_config = bigquery.QueryJobConfig()
        job_config.destination = f'{bq_client.project}.pysearchml.{table_id}'
        job_config.maximum_bytes_billed = 10 * (1024 ** 3)
        job_config.write_disposition = 'WRITE_TRUNCATE'
        job = bq_client.query(query, job_config=job_config)
        job.result()

        # export BigQuery table to GCS
        destination_uri = f'gs://{bucket}/es_docs.gz'

        extract_config = bigquery.ExtractJobConfig()
        extract_config.compression = 'GZIP'
        extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'
        job = bq_client.extract_table(table_ref, destination_uri,
                                      job_config=extract_config)
        job.result()

        # Download data
        blob = bucket_obj.blob('es_docs.gz')
        file_obj = gzip.io.BytesIO()
        blob.download_to_file(file_obj)

        file_obj.seek(0)

        c = 0
        for row in gzip.GzipFile(fileobj=file_obj, mode='rb'):
            row = json.loads(row)
            yield {
                '_index': index,
                '_source': row,
                '_id': row['sku']
            }
            c += 1
            if not c % 1000:
                print(c)

        # Delete BQ Table
        bq_client.delete_table(table_ref)
Example #15
 def _dataset_to_bucket_job(self, dataset_to_bucket_config):
     config = dataset_to_bucket_config
     source = self._build_table_id(config.data_name)
     job_config = bigquery.ExtractJobConfig()
     job_config.compression = 'GZIP'
     destination_uri = (self._blob_uri_prefix + config.data_name +
                        '-*.csv.gz')
     job_config.field_delimiter = self._separator
     job = self._bq_client.extract_table(source=source,
                                         destination_uris=destination_uri,
                                         job_config=job_config)
     return job
Example #16
 def bqtable2gs(self, dataset_name, table_name, bucket, gspath, file_name,
                file_format=CSV, compression=False, ext='.gzip'):
     bq_client = self.client(self.project_id)
     job_config = bigquery.ExtractJobConfig()
     job_config.destination_format = file_format
     dataset_ref = bq_client.dataset(dataset_name, project=self.bq_project_id)
     if compression:
         job_config.compression = 'GZIP'
         file_name = file_name + ext
     destination_uri = 'gs://' + path.join(bucket, gspath, file_name)
     extract_job = \
         bq_client.extract_table(dataset_ref.table(table_name), destination_uri,
                                 job_config=job_config)
     return extract_job.result(), destination_uri
Example #17
    def _extract(self):

        job_config = bigquery.ExtractJobConfig(
            compression="GZIP",
            destination_format="CSV",
        )

        name = os.path.join(self.gcs_prefix, self.table_ref.table_id)
        job = self.bq_client.extract_table(self.table_ref, f'gs://{self.bucket}/{name}*.csv.gz', job_config=job_config)

        log.info("Waiting for BigQuery Table Extract job to finish...")
        result = job.result()
        # result.destination_uri_file_counts
        log.info("BigQuery job finished.")
Example #18
    def execute(self, *args):
        super().execute()

        valid = EssentialParameters(
            self.__class__.__name__,
            [self._tblname, self._bucket, self._dest_dir])
        valid()

        os.makedirs(self._dest_dir, exist_ok=True)

        gbq_client = bigquery.Client.from_service_account_json(
            self._credentials)
        gbq_ref = gbq_client.dataset(self._dataset).table(self._tblname)

        gcs_client = storage.Client.from_service_account_json(
            self._credentials)
        gcs_bucket = gcs_client.get_bucket(self._bucket)

        ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
        path = "%s-%s" % ("".join(random.choices(string.ascii_letters,
                                                 k=8)), ymd_hms)
        prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

        # gcs dir -> gs://{bucket_name}/{dataset_name}/{table_name}/{XXXXXXXX}-{yyyyMMddHHmmssSSS}/*.csv.gz
        if self._filename:
            dest_gcs = "gs://%s/%s/%s*.csv.gz" % (self._bucket, prefix,
                                                  self._filename)
        else:
            dest_gcs = "gs://%s/%s/*.csv.gz" % (self._bucket, prefix)

        # job config settings
        job_config = bigquery.ExtractJobConfig()
        job_config.compression = bigquery.Compression.GZIP
        job_config.destination_format = bigquery.DestinationFormat.CSV

        # Execute the extract job.
        job = gbq_client.extract_table(gbq_ref,
                                       dest_gcs,
                                       job_config=job_config,
                                       location=self._location)
        job.result()

        # Download from gcs
        for blob in gcs_bucket.list_blobs(prefix=prefix):
            dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
            blob.download_to_filename(dest)

        # Cleanup temporary files
        for blob in gcs_bucket.list_blobs(prefix=prefix):
            blob.delete()
def store_table(table_ref, client, destination_format, compression,
                destination_uri):
    job_config = bq.ExtractJobConfig()
    job_config.compression = bq.Compression.GZIP if compression else bq.Compression.NONE
    job_config.destination_format = destination_format

    extension = "." + destination_format.split("_")[-1].lower()
    if compression:
        extension += ".gz"
    destination_uri = destination_uri + extension
    with msg.loading():
        client.extract_table(source=table_ref,
                             destination_uris=destination_uri,
                             job_config=job_config).result()
    msg.good("Table stored 🚀")
Example #20
def main(validation_init_date, validation_end_date, bucket, destination):
    # Remove everything and deletes destination folder to receive new files.
    rmtree(destination, ignore_errors=True)
    os.makedirs(destination, exist_ok=True)

    storage_client = storage.Client()
    bq_client = bigquery.Client()

    ds_ref = bq_client.dataset('pysearchml')

    table_id = str(uuid.uuid4().hex)
    table_ref = ds_ref.table(table_id)

    # Query GA data
    query_path = PATH / 'validation.sql'
    query = open(str(query_path)).read()
    query = query.format(validation_init_date=validation_init_date,
                         validation_end_date=validation_end_date)

    job_config = bigquery.QueryJobConfig()
    job_config.destination = f'{bq_client.project}.pysearchml.{table_id}'
    job_config.maximum_bytes_billed = 10 * (1024**3)
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = bq_client.query(query, job_config=job_config)
    job.result()

    # export BigQuery table to GCS
    # bucket will be set in accordance to which validation dataset is referenced, i.e.,
    # whether regular validation or validation for the training dataset.
    destination_uri = f"gs://{bucket}/validation*.gz"

    extract_config = bigquery.ExtractJobConfig()
    extract_config.compression = 'GZIP'
    extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'
    job = bq_client.extract_table(table_ref,
                                  destination_uri,
                                  job_config=extract_config)
    job.result()

    # Download data
    bucket_obj = storage_client.bucket(bucket.split('/')[0])
    blobs = bucket_obj.list_blobs(prefix=bucket.partition('/')[-1])
    for blob in blobs:
        blob.download_to_filename(f"{destination}/{blob.name.split('/')[-1]}")
        blob.delete()

    # delete BQ table
    bq_client.delete_table(table_ref)
Example #21
def table_to_gcs(dataset: str,
                 table: str,
                 uri: str,
                 gzip: bool = True,
                 delete_first: bool = True):
    """Load a file from google cloud storage into BigQuery

    Parameters
    ----------
    dataset: str
        The Bigquery dataset
    table: str
        The Bigquery table
    uri: str
        The google cloud storage uri (``gs://....``)
    gzip: bool
        Compress output with gzip or not
    """
    client = bigquery.Client()

    destination_uri = uri
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(table)

    logging.info("Exporting {}.{} to {}".format(dataset, table,
                                                destination_uri))

    job_config = bigquery.ExtractJobConfig()

    if gzip:
        job_config.compression = bigquery.Compression.GZIP
        job_config.destination_format = bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON

    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        # Location must match that of the source table.
        location="US",
        job_config=job_config)  # API request
    try:
        extract_job.result()  # Waits for table load to complete.
        logging.info("Extract completed")
    except google.api_core.exceptions.BadRequest:
        logging.error(f"extract failed")
        logging.error(extract_job.errors)

    logging.info("Exported {}.{} to {}".format(dataset, table,
                                               destination_uri))
Example #22
def export_table_to_gcs(table_ref, table_name):
    """
    Export the given table to GCS
    :param table_ref: the table to export
    :param table_name: name of the table, used in the log message
    """
    client = bigquery.Client()
    destination_uri = data_dir
    job_config = bigquery.ExtractJobConfig(print_header=False)
    extract_job = client.extract_table(table_ref,
                                       destination_uri,
                                       location="US",
                                       job_config=job_config)
    extract_job.result()

    print("Exported {}:{}.{} to {}".format(project_id, dataset_id, table_name,
                                           destination_uri))
Example #23
    def export_to_gcs(self, table_ref, client, dst_uri):
        from google.cloud import bigquery

        job_config = bigquery.ExtractJobConfig()

        # use delimiter which never exists in data
        job_config.field_delimiter = '\t'
        job_config.print_header = False

        extract_job = client.extract_table(
            table_ref,
            # this naming rule has to be identical to datalake-hive for future usage efficiency
            dst_uri + "/000000_0",
            location='US',  # API request
            job_config=job_config)

        extract_job.result()  # Waits for job to complete.
Example #24
    def run(self):
        logging.info(
            'Started BigQueryToFileJob with SQL {} and output filename {}'.
            format(self.sql, self.output_filename))

        # Query
        # TODO: Make sure it works with different locations.

        random_name = random_string(10)
        destination_table = self.bigquery_client.dataset(
            self.temp_bigquery_dataset).table(random_name)
        query_job_config = bigquery.QueryJobConfig()
        query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
        query_job_config.destination = destination_table

        query_job = self.bigquery_client.query(self.sql,
                                               job_config=query_job_config)

        submit_bigquery_job(query_job, query_job_config)
        assert query_job.state == 'DONE'

        # Export
        # TODO: Allow exporting to multiple files in case output is bigger than 1GB.

        bucket = self.temp_bucket
        filename = random_name + '.json'
        object_name = filename
        destination_uri = "gs://{}/{}".format(bucket, object_name)
        extract_job_config = bigquery.ExtractJobConfig()
        extract_job_config.priority = bigquery.QueryPriority.INTERACTIVE
        extract_job_config.destination_format = bigquery.job.DestinationFormat.NEWLINE_DELIMITED_JSON

        extract_job = self.bigquery_client.extract_table(
            destination_table, destination_uri, job_config=extract_job_config)
        submit_bigquery_job(extract_job, extract_job_config)
        assert extract_job.state == 'DONE'

        # Delete the BigQuery table

        self.bigquery_client.delete_table(destination_table)

        # Download

        download_from_gcs(bucket, object_name, self.output_filename)
        delete_in_gcs(bucket, object_name)
Example #25
 def export_table(self, table, path, location='US'):
     if path[-1] != '/':
         path = path + '/'
     self.rm(path)
     destination_uri = "gs://" + str(Path(path, 'data-*.csv.gz'))
     dataset_id = table.split('.')[0]
     table_id = table.split('.')[-1]
     dataset_ref = bigquery.DatasetReference(project=self.project,
                                             dataset_id=dataset_id)
     table_ref = dataset_ref.table(table_id)
     job_config = bigquery.ExtractJobConfig()
     job_config.compression = bigquery.Compression.GZIP
     extract_job = self.client.extract_table(table_ref,
                                             destination_uri,
                                             location=location,
                                             job_config=job_config)
     extract_job.result()  # Waits for job to complete
     return self.ls(path)
Example #26
    def _extract_to_blobs(self, source_table):
        # Returns list of blobs
        # 1: EXTRACT
        extract_job_config = bigquery.ExtractJobConfig(
            compression="GZIP", destination_format="CSV"
        )
        extract_prefix = "staging/{}_{}".format(source_table, uuid.uuid4().hex)
        extract_destination_uri = "gs://{}/{}-*.csv.gz".format(
            self.staging_bucket, extract_prefix
        )
        extract_job = bq_client.extract_table(
            source_table, extract_destination_uri, job_config=extract_job_config
        )  # API request
        extract_job.result()  # Waits for job to complete.
        logger.info("Exported {} to {}".format(source_table, extract_destination_uri))

        # 2: LIST BLOBS
        storage_client = storage.Client()
        bucket = storage_client.bucket(self.staging_bucket)
        return bucket.list_blobs(prefix=extract_prefix)
Example #27
def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    target_path: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # since views cannot be exported directly, write the data into a temporary table
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
        WHERE analysis_basis = 'enrollments'
    """
    )  # todo: once experimenter supports different analysis_bases, remove filter

    job.result()

    # add a random string to the identifier to prevent collision errors if there
    # happen to be multiple instances running that export data for the same experiment
    tmp = "".join(random.choices(string.ascii_lowercase, k=8))
    destination_uri = f"gs://{bucket}/{target_path}/{table}-{tmp}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id,
                                            job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(table_ref,
                                       destination_uri,
                                       location="US",
                                       job_config=job_config)
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, target_path, table, storage_client, tmp)
Example #28
    def export_table_to_cloud_storage_async(
        self, source_table_dataset_ref: bigquery.DatasetReference,
        source_table_id: str, destination_uri: str,
        destination_format: bigquery.DestinationFormat
    ) -> Optional[bigquery.ExtractJob]:
        if not self.table_exists(source_table_dataset_ref, source_table_id):
            logging.error("Table [%s] does not exist in dataset [%s]",
                          source_table_id, str(source_table_dataset_ref))
            return None

        table_ref = source_table_dataset_ref.table(source_table_id)

        job_config = bigquery.ExtractJobConfig()
        job_config.destination_format = destination_format

        return self.client.extract_table(
            table_ref,
            destination_uri,
            # Location must match that of the source table.
            location=self.LOCATION,
            job_config=job_config)
Example #29
def extract_tables(FROM_DATASET):
    # Extract all tables in a dataset to a Cloud Storage bucket.
    print('Extracting {}:{} to Cloud Storage bucket {}'.format(
        FROM_PROJECT, FROM_DATASET, FROM_BUCKET))

    tables = list(bq_client.list_tables(bq_client.dataset(FROM_DATASET)))
    extract_jobs = []
    for table in tables:
        job_config = bigquery.ExtractJobConfig()
        job_config.destination_format = bigquery.DestinationFormat.AVRO
        extract_job = bq_client.extract_table(
            table.reference,
            ['gs://{}/{}.avro'.format(FROM_BUCKET, table.table_id)],
            location=FROM_LOCATION,  # Available in 0.32.0 library.
            job_config=job_config)  # Starts the extract job.
        extract_jobs.append(extract_job)

    for job in extract_jobs:
        job.result()

    return tables
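Example #29 relies on module-level globals that are not shown; a sketch of how they might be defined, with placeholder values:

from google.cloud import bigquery

FROM_PROJECT = "source-project"   # project that owns the dataset being extracted
FROM_BUCKET = "my-export-bucket"  # Cloud Storage bucket that receives the Avro files
FROM_LOCATION = "US"              # must match the location of the source dataset
bq_client = bigquery.Client(project=FROM_PROJECT)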
Example #30
def export_to_cloud_storage(dataset_ref: bigquery.dataset.DatasetReference,
                            bucket: str, view: bqview.BigQueryView,
                            state_code: str):
    """Exports the table corresponding to the given view to the bucket.

    Extracts the entire table and exports in JSON format to the given bucket in
    Cloud Storage.

    This is a synchronous function that waits for the query job to complete
    before returning.

    Args:
        dataset_ref: The dataset where the view and table exist.
        bucket: The bucket in Cloud Storage where the export should go.
        view: The view whose corresponding table to export.
        state_code: The state code of the data being exported.
    """
    source_tablename = _table_name_for_view(view, state_code)

    if table_exists(dataset_ref, source_tablename):
        destination_filename = _destination_filename_for_view(view, state_code)
        destination_uri = "gs://{}/{}".format(bucket, destination_filename)

        table_ref = dataset_ref.table(source_tablename)

        job_config = bigquery.ExtractJobConfig()
        job_config.destination_format = \
            bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON

        extract_job = client().extract_table(
            table_ref,
            destination_uri,
            # Location must match that of the source table.
            location=LOCATION,
            job_config=job_config)
        # Waits for job to complete
        extract_job.result()
    else:
        logging.error("Table [%s] does not exist in dataset [%s]",
                      source_tablename, str(dataset_ref))