Example 1
        def load_task():
            client = Client()
            job_config = LoadJobConfig()
            schema_path = os.path.join(
                dags_folder,
                'resources/stages/raw/schemas/{task}.json'.format(task=task))
            job_config.schema = read_bigquery_schema_from_file(schema_path)
            job_config.source_format = SourceFormat.CSV if file_format == 'csv' else SourceFormat.NEWLINE_DELIMITED_JSON
            if file_format == 'csv':
                job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.allow_quoted_newlines = allow_quoted_newlines
            job_config.ignore_unknown_values = True

            export_location_uri = 'gs://{bucket}/export'.format(
                bucket=output_bucket)
            uri = '{export_location_uri}/{task}/*.{file_format}'.format(
                export_location_uri=export_location_uri,
                task=task,
                file_format=file_format)
            table_ref = client.dataset(dataset_name_raw).table(task)
            load_job = client.load_table_from_uri(uri,
                                                  table_ref,
                                                  job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'
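
Example 1 depends on a helper, read_bigquery_schema_from_file, that is not shown. A minimal sketch of such a helper, assuming the JSON schema file holds a list of objects with name, type, and optional mode/description keys (the real file layout may differ):

import json

from google.cloud import bigquery


def read_bigquery_schema_from_file(schema_path):
    # Assumed layout: a JSON list of {"name", "type", "mode", "description"} objects.
    with open(schema_path) as schema_file:
        fields = json.load(schema_file)
    return [
        bigquery.SchemaField(field['name'],
                             field['type'],
                             mode=field.get('mode', 'NULLABLE'),
                             description=field.get('description'))
        for field in fields
    ]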
Example 2
def load_batches(gcs_client: storage.Client, bq_client: bigquery.Client,
                 gsurl: str, load_config: bigquery.LoadJobConfig, job_id: str,
                 table: bigquery.TableReference):
    """orchestrate 1 or more load jobs based on number of URIs and total byte
    size of objects at gsurl"""
    batches = get_batches_for_gsurl(gcs_client, gsurl)
    jobs: List[Tuple[bigquery.TableReference, bigquery.LoadJob]] = []
    for batch in batches:
        # None is passed to destination parameter below because the load_config
        # object contains the destination information
        job: bigquery.LoadJob = bq_client.load_table_from_uri(
            batch, None, job_config=load_config, job_id=job_id)
        jobs.append((table, job))
        logging.log_bigquery_job(
            job, table,
            f"Submitted asynchronous bigquery load job: {job.job_id}")
    start_poll_for_errors = time.monotonic()
    # Check if job failed quickly
    while (time.monotonic() - start_poll_for_errors
           < constants.WAIT_FOR_JOB_SECONDS):
        for table_ref, job in jobs:
            job.reload(client=bq_client)
            check_for_bq_job_and_children_errors(bq_client, job, table_ref)
        time.sleep(constants.JOB_POLL_INTERVAL_SECONDS)
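
Example 2 relies on a project-specific helper, check_for_bq_job_and_children_errors, that is not shown. A minimal sketch of the kind of check it could perform, assuming only the job's own error attributes need to be inspected (the real helper may also walk child jobs):

from google.cloud import bigquery


def check_for_bq_job_and_children_errors(
        bq_client: bigquery.Client, job: bigquery.LoadJob,
        table: bigquery.TableReference) -> None:
    # Hypothetical sketch: fail fast if the load job has recorded any errors.
    # The real helper presumably uses bq_client to inspect child jobs as well.
    if job.error_result:
        raise RuntimeError(
            f"load job {job.job_id} into {table} failed: {job.error_result}; "
            f"all errors: {job.errors}")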
Example 3
def load_gcs_file(client: bigquery.Client,
                  uri: str,
                  table_ref: bigquery.table.TableReference,
                  job_config: bigquery.LoadJobConfig,
                  # job_id: str = str(datetime.datetime.now()).replace(' ', ''),
                  ):
    """

    Args:
        client:
        uri:
        table_ref:
        job_config:
        job_id:

    Returns:

    Examples:

    """

    load_job = client.load_table_from_uri(
        source_uris=uri,
        destination=table_ref,
        # job_id=job_id,
        job_id_prefix='lgs-',
        job_config=job_config,
    )

    tic = time.time()
    print('Starting job {}'.format(load_job.job_id))
    load_job.result()
    print('Job took {} seconds'.format(time.time() - tic))
    assert load_job.state == 'DONE'
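
A possible call site for load_gcs_file, with placeholder project, dataset, table, and bucket names (all hypothetical):

from google.cloud import bigquery

client = bigquery.Client()
table_ref = bigquery.TableReference.from_string('my-project.my_dataset.my_table')
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,
)
load_gcs_file(client, 'gs://my-bucket/data/*.csv', table_ref, job_config)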
Example 4
def ingest_by_uri(cl: bigquery.Client, destination: str, uri: str,
                  schema: List[Any]) -> bigquery.LoadJob:
    c = bigquery.LoadJobConfig(
        source_format="NEWLINE_DELIMITED_JSON",
        write_disposition="WRITE_TRUNCATE",
        schema=schema
    )
    return cl.load_table_from_uri(uri, destination=destination, job_config=c)
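
Because ingest_by_uri returns the LoadJob without waiting for it, a caller typically blocks on result(). For example (placeholder names and schema):

from google.cloud import bigquery

client = bigquery.Client()
schema = [
    bigquery.SchemaField('id', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('payload', 'STRING'),
]
job = ingest_by_uri(client, 'my-project.my_dataset.events',
                    'gs://my-bucket/events/*.json', schema)
job.result()  # blocks until the load finishes and raises on failure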
Example 5
def load_bigquery_table_via_bq_apis(bq_client: bigquery.Client, dataset_id,
                                    table_name, imported_data_info, src_uris):
    """
    Load tables using BigQuery Load jobs, using the same configuration as BQ DTS ImportedDataInfo
    :return:
    """
    # https://googlecloudplatform.github.io/google-cloud-python/latest/_modules/google/cloud/bigquery/client.html#Client.load_table_from_uri
    # Step 1 - Translate required fields for BigQuery Python SDK
    tgt_tabledef = imported_data_info['table_defs'][0]

    # Step 2 - Create target table if it doesn't exist
    dataset_ref = bq_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_name)
    try:
        bq_client.get_table(table_ref)
    except exceptions.NotFound:
        # Step 2a - Attach schema
        tgt_schema = RPCRecordSchema_to_GCloudSchema(tgt_tabledef['schema'])
        tgt_table = bigquery.Table(table_ref, schema=tgt_schema)

        # Step 2b - Attach description
        tgt_table.description = imported_data_info[
            'destination_table_description']

        # Step 2c - Conditionally set partitioning type
        if '$' in table_name:
            tgt_table.partitioning_type = 'DAY'
            tgt_table._properties['tableReference'][
                'tableId'], _, _ = table_name.partition('$')

        # Step 2d - Create BigQuery table
        bq_client.create_table(tgt_table)

    # Step 3a - Create BigQuery Load Job ID
    current_datetime = datetime.datetime.utcnow().isoformat()
    raw_job_id = f'{table_name}_{current_datetime}'
    clean_job_id = BQ_JOB_ID_MATCHER.sub('___', raw_job_id)

    # Step 3b - Create BigQuery Job Config
    job_config = DTSTableDefinition_to_BQLoadJobConfig(tgt_tabledef)

    # Step 4 - Execute BigQuery Load Job using Python SDK
    load_job = bq_client.load_table_from_uri(source_uris=src_uris,
                                             destination=table_ref,
                                             job_id=clean_job_id,
                                             job_config=job_config)

    return load_job
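
BQ_JOB_ID_MATCHER is defined elsewhere in that project. Since BigQuery job IDs may only contain letters, digits, underscores, and dashes, a plausible (assumed) definition is a regex matching any disallowed character so it can be substituted out:

import re

# Assumed definition: match any character that is not valid in a BigQuery job ID
# (letters, digits, underscores, and dashes are allowed) so sub() can replace it.
BQ_JOB_ID_MATCHER = re.compile(r'[^a-zA-Z0-9_-]')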
Example 6
def load_stage(dst_dataset: Dataset, bq_client: Client, bucket_name: str,
               gcs_client: storage.Client) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: reference to destination dataset object
    :param bq_client: a BigQuery client object
    :param bucket_name: the location in GCS containing the vocabulary files
    :param gcs_client: a Cloud Storage client object
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name))

    table_blobs = [_filename_to_table_name(blob.name) for blob in blobs]
    missing_blobs = [
        table for table in VOCABULARY_TABLES if table not in table_blobs
    ]
    if missing_blobs:
        raise RuntimeError(
            f'Bucket {bucket_name} is missing files for tables {missing_blobs}'
        )

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        # ignore any non-vocabulary files
        if table_name not in VOCABULARY_TABLES:
            continue
        destination = dst_dataset.table(table_name)
        safe_schema = safe_schema_for(table_name)
        job_config = LoadJobConfig()
        job_config.schema = safe_schema
        job_config.skip_leading_rows = 1
        job_config.field_delimiter = FIELD_DELIMITER
        job_config.max_bad_records = MAX_BAD_RECORDS
        job_config.source_format = 'CSV'
        job_config.quote_character = ''
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(source_uri,
                                                 destination,
                                                 job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
def load_folder(dst_dataset: str, bq_client: BQClient, bucket_name: str,
                prefix: str, gcs_client: GCSClient,
                hpo_id: str) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: Identifies the destination dataset
    :param bq_client: a BigQuery client object
    :param bucket_name: the bucket in GCS containing the archive files
    :param prefix: prefix of the filepath URI
    :param gcs_client: a Cloud Storage client object
    :param hpo_id: Identifies the HPO site
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name, prefix=prefix))

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        if table_name not in AOU_REQUIRED:
            LOGGER.debug(f'Skipping file for {table_name}')
            continue
        schema = get_table_schema(table_name)
        hpo_table_name = f'{hpo_id}_{table_name}'
        fq_hpo_table = f'{bq_client.project}.{dst_dataset}.{hpo_table_name}'
        destination = Table(fq_hpo_table, schema=schema)
        destination = bq_client.create_table(destination)
        job_config = LoadJobConfig()
        job_config.schema = schema
        job_config.skip_leading_rows = 1
        job_config.source_format = 'CSV'
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(
            source_uri,
            destination,
            job_config=job_config,
            job_id_prefix=f"{__file__.split('/')[-1].split('.')[0]}_")
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
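
Both functions in Example 6 use a helper, _filename_to_table_name, that is not shown. A minimal sketch, assuming each blob is named <table_name>.csv (possibly under a folder prefix):

import os


def _filename_to_table_name(blob_name: str) -> str:
    # Hypothetical helper: drop any folder prefix and the file extension,
    # then lowercase to match the table naming used above.
    base_name = os.path.basename(blob_name)
    table_name, _ = os.path.splitext(base_name)
    return table_name.lower()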