Code Example #1
def update_or_create_table_from_csv(
    client: bigquery.Client,
    table_name: str,
    source_file: str,
    dataset: str,
    source_schema_file: str,
):
    LOGGER.debug("update_or_create_table_from_csv: %s=%s", table_name,
                 [source_file])
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(table_name)

    job_config = LoadJobConfig()
    job_config.source_format = "CSV"
    job_config.skip_leading_rows = 1
    if Path(source_schema_file).exists():
        job_config.schema = get_table_schema(source_schema_file)
    else:
        job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    with open(source_file, "rb") as source_fp:
        load_job = client.load_table_from_file(source_fp,
                                               destination=table_ref,
                                               job_config=job_config)

    # wait for job to complete
    load_job.result()

    LOGGER.info("updated config table: %s", table_ref.table_id)
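
A minimal invocation sketch for the helper above, assuming the module-level LOGGER, get_table_schema, LoadJobConfig, and Path imports exist as implied; the dataset, table, and file paths below are placeholders.

from google.cloud import bigquery

client = bigquery.Client()  # uses application-default credentials
update_or_create_table_from_csv(
    client,
    table_name="config_table",
    source_file="data/config_table.csv",
    dataset="my_dataset",
    source_schema_file="data/config_table.schema.json",  # autodetect is used if this file is missing
)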
Code Example #2
def load_from_newline_delimited_json(
    client: bigquery.Client,
    filepath: pathlib.Path,
    project_id: str,
    dataset_id: str,
    table_id: str,
):
    full_table_id = f"{project_id}.{dataset_id}.{table_id}"
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.schema = [
        bigquery.SchemaField("id", enums.SqlTypeNames.STRING),
        bigquery.SchemaField("user_id", enums.SqlTypeNames.INTEGER),
        bigquery.SchemaField("login_time", enums.SqlTypeNames.TIMESTAMP),
        bigquery.SchemaField("logout_time", enums.SqlTypeNames.TIMESTAMP),
        bigquery.SchemaField("ip_address", enums.SqlTypeNames.STRING),
    ]

    with open(filepath, "rb") as json_file:
        load_job = client.load_table_from_file(json_file,
                                               full_table_id,
                                               job_config=job_config)

    # Wait for load job to finish.
    load_job.result()
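
A sketch of how this loader might be called, assuming a newline-delimited JSON file whose keys match the schema above; the file path and identifiers are placeholders.

import pathlib
from google.cloud import bigquery

# login_events.jsonl contains one JSON object per line, e.g.:
# {"id": "a1", "user_id": 42, "login_time": "2021-01-01T09:00:00Z",
#  "logout_time": "2021-01-01T10:00:00Z", "ip_address": "203.0.113.7"}
client = bigquery.Client()
load_from_newline_delimited_json(
    client,
    filepath=pathlib.Path("login_events.jsonl"),
    project_id="my-project",
    dataset_id="analytics",
    table_id="login_events",
)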
Code Example #3
def load_local_file(client: bigquery.Client,
                    filename: str,
                    table_ref: bigquery.table.TableReference,
                    job_config: bigquery.LoadJobConfig,
                    # job_id: str = str(datetime.datetime.now()).replace(' ', ''),
                    ):
    """
    Args:
        client:
        filename:
        table_ref:
        job_config:

    Returns:

    Examples:

    """

    with open(filename, 'rb') as source_file:
        load_job = client.load_table_from_file(
            file_obj=source_file,
            destination=table_ref,
            # job_id=job_id,
            job_id_prefix='llf-',
            job_config=job_config)  # API request
    tic = time.time()
    print('Starting job {} at {}'.format(load_job.job_id, tic))
    load_job.result()
    print('Job took {} seconds'.format(time.time() - tic))
    assert load_job.state == 'DONE'
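
Since the caller supplies both the table reference and the job configuration, a usage sketch looks like the following; the project, dataset, and table ids are placeholders, and the CSV settings are just one possible configuration.

from google.cloud import bigquery

client = bigquery.Client()
table_ref = bigquery.DatasetReference("my-project", "my_dataset").table("my_table")
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,
)
load_local_file(client, "data.csv", table_ref, job_config)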
Code Example #4
File: sql_test.py Project: teonbrooks/bigquery-etl
def load_tables(bq: bigquery.Client, dataset: bigquery.Dataset,
                tables: Iterable[Table]):
    """Load tables for a test."""
    for table in tables:
        destination = dataset.table(table.name)
        job_config = bigquery.LoadJobConfig(
            source_format=table.source_format,
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        )

        if table.schema is None:
            # autodetect schema if not provided
            job_config.autodetect = True
        else:
            job_config.schema = table.schema
            # look for time_partitioning_field in provided schema
            for field in job_config.schema:
                if field.description == "time_partitioning_field":
                    job_config.time_partitioning = bigquery.TimePartitioning(
                        field=field.name)
                    break  # stop because there can only be one time partitioning field

        if isinstance(table.source_path, str):
            with open(table.source_path, "rb") as file_obj:
                job = bq.load_table_from_file(file_obj,
                                              destination,
                                              job_config=job_config)
        else:
            file_obj = BytesIO()
            for row in load(*table.source_path):
                file_obj.write(
                    json.dumps(row, default=default_encoding).encode() + b"\n")
            file_obj.seek(0)
            job = bq.load_table_from_file(file_obj,
                                          destination,
                                          job_config=job_config)

        try:
            job.result()
        except BadRequest:
            # print the first 5 errors for debugging
            for error in job.errors[:5]:
                print(error)
            raise
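
The partitioning convention used above, flagging the partition column through its SchemaField description, can be illustrated in isolation. A minimal sketch under that assumption; the field names are placeholders.

from google.cloud import bigquery

schema = [
    bigquery.SchemaField("submission_date", "DATE", description="time_partitioning_field"),
    bigquery.SchemaField("client_id", "STRING"),
]
job_config = bigquery.LoadJobConfig(schema=schema)
for field in job_config.schema:
    if field.description == "time_partitioning_field":
        job_config.time_partitioning = bigquery.TimePartitioning(field=field.name)
        break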
Code Example #5
File: conftest.py Project: kalona/python-bigquery
def scalars_extreme_table(bigquery_client: bigquery.Client, project_id: str,
                          dataset_id: str):
    schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json")
    job_config = bigquery.LoadJobConfig()
    job_config.schema = schema
    job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON
    full_table_id = f"{project_id}.{dataset_id}.scalars_extreme"
    with open(DATA_DIR / "scalars_extreme.jsonl", "rb") as data_file:
        job = bigquery_client.load_table_from_file(data_file,
                                                   full_table_id,
                                                   job_config=job_config)
    job.result()
    yield full_table_id
    bigquery_client.delete_table(full_table_id)
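
This is a pytest fixture (it yields the full table id and deletes the table afterwards), so tests consume it by parameter name. A sketch of a consuming test, assuming the function is registered with @pytest.fixture in conftest.py.

def test_scalars_extreme_row_count(bigquery_client, scalars_extreme_table):
    table = bigquery_client.get_table(scalars_extreme_table)
    assert table.num_rows > 0  # scalars_extreme.jsonl is expected to contain at least one row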
Code Example #6
def write_rows(client: bigquery.Client, table: str, rows: List[TableRow]) -> None:
    """Write the test results to the specified BigQuery table."""

    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.job.WriteDisposition.WRITE_APPEND,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )

    data_str = "\n".join(json.dumps(asdict(row), ensure_ascii=False) for row in rows)
    data_file = io.BytesIO(data_str.encode(encoding="utf-8"))

    load_job = client.load_table_from_file(
        data_file, destination=table, job_config=job_config,
    )

    # Wait for load job to complete; raises an exception if the job failed.
    load_job.result()
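
TableRow and asdict come from the surrounding module (asdict presumably from dataclasses). A sketch of a caller, with a hypothetical dataclass standing in for the real TableRow and a placeholder table id.

from dataclasses import dataclass
from google.cloud import bigquery

@dataclass
class TableRow:  # hypothetical shape; the real definition lives in the source project
    test_name: str
    passed: bool

client = bigquery.Client()
write_rows(client, "my-project.results.test_runs", [TableRow(test_name="smoke", passed=True)])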
Code Example #7
def load_sample_data(
    full_table_id: str,
    bigquery_client: bigquery.Client,
    bigquery_schema: List[bigquery.SchemaField],
    filename: str = "sample.json",
):
    sample_config = bigquery.LoadJobConfig()
    sample_config.destination_table_description = (
        "A sample table containing most data types."
    )
    sample_config.schema = bigquery_schema
    sample_config.time_partitioning = bigquery.TimePartitioning(field="timestamp")
    sample_config.clustering_fields = ["integer", "string"]
    sample_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    sample_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    with open(DATA_DIR / filename, "rb") as data_file:
        return bigquery_client.load_table_from_file(
            data_file, full_table_id, job_config=sample_config,
        )
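
Unlike most of the examples here, this helper returns the LoadJob without waiting for it, so the caller decides when to block. A sketch of that, assuming DATA_DIR contains sample.json and using a placeholder table id; the schema must include the timestamp, integer, and string columns referenced by the partitioning and clustering settings.

from google.cloud import bigquery

client = bigquery.Client()
job = load_sample_data(
    "my-project.my_dataset.sample_table",
    client,
    bigquery_schema=[
        bigquery.SchemaField("timestamp", "TIMESTAMP"),
        bigquery.SchemaField("integer", "INTEGER"),
        bigquery.SchemaField("string", "STRING"),
    ],
)
job.result()  # block until the load finishes; raises if the job failed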
Code Example #8
File: ft2bq.py Project: tehhowch/apps-script
def upload_table_data(client: bigquery.Client, tableRef: bigquery.Table,
                      fusionFile: str) -> bigquery.LoadJob:
    """Given the client, BigQuery table target, and data, upload the data"""
    with open(fusionFile, mode='rb') as file:
        job = client.load_table_from_file(file, tableRef)
    return job
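
As with the previous example, the returned job is not awaited, and no LoadJobConfig is supplied, so BigQuery's defaults apply. A minimal caller sketch; the table id and file name are placeholders, and the destination table is assumed to already exist with a matching schema.

from google.cloud import bigquery

client = bigquery.Client()
table = client.get_table("my-project.my_dataset.fusion_table")
job = upload_table_data(client, table, "fusion_export.csv")
job.result()  # wait for completion; raises if the load failed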