Code example #1
def test_read_rows_to_dataframe_with_wide_table(client, project_id):
    # Use a wide table to boost the chance of getting a large message size.
    # https://github.com/googleapis/python-bigquery-storage/issues/78
    read_session = types.ReadSession()
    read_session.table = "projects/{}/datasets/{}/tables/{}".format(
        "bigquery-public-data", "geo_census_tracts", "us_census_tracts_national"
    )
    read_session.data_format = types.DataFormat.ARROW

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    stream = session.streams[0].name

    read_rows_stream = client.read_rows(stream)

    # fetch the first page of rows
    pages_iter = iter(read_rows_stream.rows(session).pages)
    some_rows = next(pages_iter)

    assert all(len(row["tract_geom"].as_py()) > 0 for row in some_rows)
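The `some_rows` object yielded by the pages iterator is a `ReadRowsPage`. A minimal follow-on sketch (not part of the original test), assuming the page exposes `to_dataframe()` as in the handwritten reader module, shows how the same page could be materialized as a pandas DataFrame:

# Sketch only: materialize the fetched page as a pandas DataFrame and check
# that the wide geometry column survived the round trip.
page_df = some_rows.to_dataframe()
assert not page_df.empty
assert "tract_geom" in page_df.columns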
Code example #2
def test_rows_no_schema_set_raises_type_error(mut, class_under_test,
                                              mock_gapic_client, monkeypatch):
    reader = class_under_test([], mock_gapic_client, "", 0, {})
    read_session = types.ReadSession()

    with pytest.raises(TypeError):
        reader.rows(read_session)
Code example #3
def test_ingestion_time_partitioned_table(
    client, project_id, ingest_partition_table_ref, bq_client, data_format
):
    data = [{"shape": "cigar", "altitude": 1200}, {"shape": "disc", "altitude": 750}]
    destination = _to_bq_table_ref(
        ingest_partition_table_ref, partition_suffix="$20190809"
    )
    bq_client.load_table_from_json(data, destination).result()

    data = [
        {"shape": "sphere", "altitude": 3500},
        {"shape": "doughnut", "altitude": 100},
    ]
    destination = _to_bq_table_ref(
        ingest_partition_table_ref, partition_suffix="$20190810"
    )
    bq_client.load_table_from_json(data, destination).result()

    data = [
        {"shape": "elephant", "altitude": 1},
        {"shape": "rocket", "altitude": 12700},
    ]
    destination = _to_bq_table_ref(
        ingest_partition_table_ref, partition_suffix="$20190811"
    )
    bq_client.load_table_from_json(data, destination).result()

    read_session = types.ReadSession()
    read_session.table = ingest_partition_table_ref
    read_session.data_format = data_format
    read_session.read_options.row_restriction = "DATE(_PARTITIONTIME) = '2019-08-10'"

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    assert session.streams  # there should be some data to fetch

    stream = session.streams[0].name

    rows = list(client.read_rows(stream).rows(session))
    assert len(rows) == 2

    if data_format == types.DataFormat.AVRO:
        actual_items = {(row["shape"], row["altitude"]) for row in rows}
    else:
        assert data_format == types.DataFormat.ARROW
        actual_items = {(row["shape"].as_py(), row["altitude"].as_py()) for row in rows}

    expected_items = {("sphere", 3500), ("doughnut", 100)}
    assert actual_items == expected_items
Code example #4
def test_session_to_dataframe(capsys, clients):
    from google.cloud.bigquery_storage import types

    bqclient, bqstorageclient = clients
    your_project_id = bqclient.project

    # [START bigquerystorage_pandas_tutorial_all]
    # [START bigquerystorage_pandas_tutorial_read_session]
    project_id = "bigquery-public-data"
    dataset_id = "new_york_trees"
    table_id = "tree_species"
    table = f"projects/{project_id}/datasets/{dataset_id}/tables/{table_id}"

    # Select columns to read with read options. If no read options are
    # specified, the whole table is read.
    read_options = types.ReadSession.TableReadOptions(
        selected_fields=["species_common_name", "fall_color"])

    parent = "projects/{}".format(your_project_id)

    requested_session = types.ReadSession(
        table=table,
        # This API can also deliver data serialized in Apache Avro format.
        # This example leverages Apache Arrow.
        data_format=types.DataFormat.ARROW,
        read_options=read_options,
    )
    read_session = bqstorageclient.create_read_session(
        parent=parent,
        read_session=requested_session,
        max_stream_count=1,
    )

    # This example reads from only a single stream. Read from multiple streams
    # to fetch data faster. Note that the session may not contain any streams
    # if there are no rows to read.
    stream = read_session.streams[0]
    reader = bqstorageclient.read_rows(stream.name)

    # Parse all Arrow blocks and create a dataframe. This call requires a
    # session, because the session contains the schema for the row blocks.
    dataframe = reader.to_dataframe(read_session)
    print(dataframe.head())
    # [END bigquerystorage_pandas_tutorial_read_session]
    # [END bigquerystorage_pandas_tutorial_all]

    out, _ = capsys.readouterr()
    assert "species_common_name" in out
Code example #5
def test_basic_nonfiltered_read(client, project_id, table_with_data_ref, data_format):
    read_session = types.ReadSession()
    read_session.table = table_with_data_ref
    read_session.data_format = data_format

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )
    stream = session.streams[0].name

    rows = list(client.read_rows(stream).rows(session))

    assert len(rows) == 5  # all table rows
Code example #6
def test_filtered_rows_read(client, project_id, table_with_data_ref):
    read_session = types.ReadSession()
    read_session.table = table_with_data_ref
    read_session.data_format = types.DataFormat.AVRO
    read_session.read_options.row_restriction = "age >= 50"

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )
    stream = session.streams[0].name

    rows = list(client.read_rows(stream).rows(session))

    assert len(rows) == 2
Code example #7
def test_read_rows_as_rows_full_table(
    client, project_id, small_table_reference, data_format, expected_schema_type
):

    read_session = types.ReadSession()
    read_session.table = small_table_reference
    read_session.data_format = data_format

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )
    stream = session.streams[0].name

    rows = list(client.read_rows(stream).rows(session))

    assert len(rows) > 0
Code example #8
def test_column_partitioned_table(
    client, project_id, col_partition_table_ref, bq_client
):
    data = [
        {"description": "Tracking established.", "occurred": "2017-02-15"},
        {"description": "Look, a solar eclipse!", "occurred": "2018-02-15"},
        {"description": "Fake solar eclipse reported.", "occurred": "2018-02-15"},
        {"description": "1 day after false eclipse report.", "occurred": "2018-02-16"},
        {"description": "1 year after false eclipse report.", "occurred": "2019-02-15"},
    ]

    destination = _to_bq_table_ref(col_partition_table_ref)
    bq_client.load_table_from_json(data, destination).result()

    # Read from the table with a partition filter specified, and verify that
    # only the expected data is returned.

    read_session = types.ReadSession()
    read_session.table = col_partition_table_ref
    read_session.data_format = types.DataFormat.AVRO
    read_session.read_options.row_restriction = "occurred = '2018-02-15'"

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    assert session.streams  # there should be some data to fetch

    stream = session.streams[0].name

    rows = list(client.read_rows(stream).rows(session))
    assert len(rows) == 2

    expected_descriptions = ("Look, a solar eclipse!", "Fake solar eclipse reported.")
    for row in rows:
        assert row["occurred"] == dt.date(2018, 2, 15)
        assert row["description"] in expected_descriptions
Code example #9
def test_create_read_session(mock_transport, client_under_test):
    assert client_under_test._transport is mock_transport  # sanity check

    table = "projects/{}/datasets/{}/tables/{}".format("data-project-id",
                                                       "dataset_id",
                                                       "table_id")

    read_session = types.ReadSession()
    read_session.table = table

    client_under_test.create_read_session(parent="projects/other-project",
                                          read_session=read_session)

    expected_session_arg = types.CreateReadSessionRequest(
        parent="projects/other-project", read_session=read_session)
    rpc_callable = mock_transport._wrapped_methods[
        mock_transport.create_read_session]
    rpc_callable.assert_called_once_with(expected_session_arg,
                                         metadata=mock.ANY,
                                         retry=mock.ANY,
                                         timeout=mock.ANY)
Code example #10
def test_column_selection_read(client, project_id, table_with_data_ref, data_format):

    read_session = types.ReadSession()
    read_session.table = table_with_data_ref
    read_session.data_format = data_format
    read_session.read_options.selected_fields.append("first_name")
    read_session.read_options.selected_fields.append("age")

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )
    stream = session.streams[0].name

    rows = list(client.read_rows(stream).rows(session))

    for row in rows:
        assert sorted(row.keys()) == ["age", "first_name"]
Code example #11
def test_resuming_read_from_offset(
    client, project_id, data_format, local_shakespeare_table_reference
):

    read_session = types.ReadSession()
    read_session.table = local_shakespeare_table_reference
    read_session.data_format = data_format

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    assert session.streams  # there should be data available

    stream = session.streams[0].name

    read_rows_stream = client.read_rows(stream)

    # fetch the first two batches of rows
    rows_iter = iter(read_rows_stream)
    some_rows = next(rows_iter)
    more_rows = next(rows_iter)

    # fetch the rest of the rows using the stream offset
    offset = some_rows.row_count + more_rows.row_count
    remaining_rows_count = sum(
        1 for _ in client.read_rows(stream, offset=offset).rows(session)
    )

    # verify that the counts match
    expected_len = 164656  # total rows in shakespeare table
    actual_len = remaining_rows_count + some_rows.row_count + more_rows.row_count
    assert actual_len == expected_len
Code example #12
def test_snapshot(client, project_id, table_with_data_ref, bq_client):
    before_new_data = types.Timestamp()
    before_new_data.GetCurrentTime()

    # load additional data into the table
    new_data = [
        {u"first_name": u"NewGuyFoo", u"last_name": u"Smith", u"age": 46},
        {u"first_name": u"NewGuyBar", u"last_name": u"Jones", u"age": 30},
    ]

    destination = _to_bq_table_ref(table_with_data_ref)
    bq_client.load_table_from_json(new_data, destination).result()

    # read data using the timestamp before the additional data load

    read_session = types.ReadSession()
    read_session.table = table_with_data_ref
    read_session.table_modifiers.snapshot_time = before_new_data
    read_session.data_format = types.DataFormat.AVRO

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )
    stream = session.streams[0].name

    rows = list(client.read_rows(stream).rows(session))

    # verify that only the data before the timestamp was returned
    assert len(rows) == 5  # all initial records

    for row in rows:
        assert "NewGuy" not in row["first_name"]  # no new records
Code example #13
def _generate_arrow_read_session(arrow_schema):
    return types.ReadSession(
        arrow_schema={
            "serialized_schema": arrow_schema.serialize().to_pybytes()
        })
Code example #14
def _generate_avro_read_session(avro_schema_json):
    schema = json.dumps(avro_schema_json)
    return types.ReadSession(avro_schema={"schema": schema})
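A hypothetical usage sketch for the two helpers above; the pyarrow schema, the Avro record definition, and the field names are made up for illustration:

import pyarrow

# Fake Arrow-based session for unit tests.
arrow_session = _generate_arrow_read_session(
    pyarrow.schema(
        [
            pyarrow.field("name", pyarrow.string()),
            pyarrow.field("age", pyarrow.int64()),
        ]
    )
)

# Equivalent fake Avro-based session.
avro_session = _generate_avro_read_session(
    {
        "type": "record",
        "name": "__root__",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "age", "type": "long"},
        ],
    }
)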
Code example #15
import os

from google.cloud.bigquery_storage import BigQueryReadClient
from google.cloud.bigquery_storage import types

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
    r'D:\medium\example-apis\key\key_bqsa.json'
)
project_id = 'medium-sandbox'

# Setting the client
client = BigQueryReadClient()

# Selecting the table
table = "projects/{}/datasets/{}/tables/{}".format(
    "bigquery-public-data", "world_bank_global_population",
    "population_by_country")

requested_session = types.ReadSession()
requested_session.table = table

# This API can deliver data serialized in Apache Arrow or Avro format.
# This example uses Apache Arrow.
requested_session.data_format = types.DataFormat.ARROW

# Selecting the columns and applying a filter
requested_session.read_options.selected_fields = [
    "country", "year_1960", "year_1970", "year_1980", "year_1990", "year_2000",
    "year_2010", "year_2018"
]
requested_session.read_options.row_restriction = 'country_code = "PER"'

parent = "projects/{}".format(project_id)
session = client.create_read_session(
    parent=parent,
    read_session=requested_session,
    max_stream_count=1,
)
Code example #16
def main(project_id="your-project-id", snapshot_millis=0):
    # [START bigquerystorage_quickstart]
    from google.cloud.bigquery_storage import BigQueryReadClient
    from google.cloud.bigquery_storage import types

    # TODO(developer): Set the project_id variable.
    # project_id = 'your-project-id'
    #
    # The read session is created in this project. This project can be
    # different from that which contains the table.

    client = BigQueryReadClient()

    # This example reads baby name data from the public datasets.
    table = "projects/{}/datasets/{}/tables/{}".format("bigquery-public-data",
                                                       "usa_names",
                                                       "usa_1910_current")

    requested_session = types.ReadSession()
    requested_session.table = table
    # This API can also deliver data serialized in Apache Arrow format.
    # This example leverages Apache Avro.
    requested_session.data_format = types.DataFormat.AVRO

    # We limit the output columns to a subset of those allowed in the table,
    # and set a simple filter to only report names from the state of
    # Washington (WA).
    requested_session.read_options.selected_fields = [
        "name", "number", "state"
    ]
    requested_session.read_options.row_restriction = 'state = "WA"'

    # Set a snapshot time if it's been specified.
    if snapshot_millis > 0:
        snapshot_time = types.Timestamp()
        snapshot_time.FromMilliseconds(snapshot_millis)
        requested_session.table_modifiers.snapshot_time = snapshot_time

    parent = "projects/{}".format(project_id)
    session = client.create_read_session(
        parent=parent,
        read_session=requested_session,
        # We'll use only a single stream for reading data from the table. However,
        # if you wanted to fan out multiple readers you could do so by having a
        # reader process each individual stream.
        max_stream_count=1,
    )
    reader = client.read_rows(session.streams[0].name)

    # The read stream contains blocks of Avro-encoded bytes. The rows() method
    # uses the fastavro library to parse these blocks as an iterable of Python
    # dictionaries. Install fastavro with the following command:
    #
    # pip install google-cloud-bigquery-storage[fastavro]
    rows = reader.rows(session)

    # Do any local processing by iterating over the rows. The
    # google-cloud-bigquery-storage client reconnects to the API after any
    # transient network errors or timeouts.
    names = set()
    states = set()

    for row in rows:
        names.add(row["name"])
        states.add(row["state"])

    print("Got {} unique names in states: {}".format(len(names),
                                                     ", ".join(states)))
Code example #17
def test_decoding_data_types(
    client, project_id, all_types_table_ref, bq_client, data_format
):
    data = [
        {
            u"string_field": u"Price: € 9.95.",
            u"bytes_field": bigquery._helpers._bytes_to_json(b"byteees"),
            u"int64_field": -1085,
            u"float64_field": -42.195,
            u"numeric_field": "1.4142",
            u"bool_field": True,
            u"geography_field": '{"type": "Point", "coordinates": [-49.3028, 69.0622]}',
            u"person_struct_field": {u"name": u"John", u"age": 42},
            u"timestamp_field": 1565357902.017896,  # 2019-08-09T13:38:22.017896
            u"date_field": u"1995-03-17",
            u"time_field": u"16:24:51",
            u"datetime_field": u"2005-10-26T19:49:41",
            u"string_array_field": [u"foo", u"bar", u"baz"],
        }
    ]

    # Explicit schema is needed to recognize bytes_field as BYTES, and not STRING.
    # Since partial schemas are not supported in load_table_from_json(), a full
    # schema needs to be specified.
    schema = [
        bigquery.SchemaField("string_field", "STRING"),
        bigquery.SchemaField("bytes_field", "BYTES"),
        bigquery.SchemaField("int64_field", "INT64"),
        bigquery.SchemaField("float64_field", "FLOAT64"),
        bigquery.SchemaField("numeric_field", "NUMERIC"),
        bigquery.SchemaField("bool_field", "BOOL"),
        bigquery.SchemaField("geography_field", "GEOGRAPHY"),
        bigquery.SchemaField(
            "person_struct_field",
            "STRUCT",
            fields=(
                bigquery.SchemaField("name", "STRING"),
                bigquery.SchemaField("age", "INT64"),
            ),
        ),
        bigquery.SchemaField("timestamp_field", "TIMESTAMP"),
        bigquery.SchemaField("date_field", "DATE"),
        bigquery.SchemaField("time_field", "TIME"),
        bigquery.SchemaField("datetime_field", "DATETIME"),
        bigquery.SchemaField("string_array_field", "STRING", mode="REPEATED"),
    ]

    job_config = bigquery.LoadJobConfig(schema=schema)
    destination = _to_bq_table_ref(all_types_table_ref)
    bq_client.load_table_from_json(data, destination, job_config=job_config).result()

    read_session = types.ReadSession()
    read_session.table = all_types_table_ref
    read_session.data_format = data_format

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    assert session.streams  # there should be data available

    stream = session.streams[0].name

    if data_format == types.DataFormat.AVRO:
        rows = list(client.read_rows(stream).rows(session))
    else:
        assert data_format == types.DataFormat.ARROW
        rows = [
            {key: value.as_py() for key, value in row_dict.items()}
            for row_dict in client.read_rows(stream).rows(session)
        ]

    expected_result = {
        u"string_field": u"Price: € 9.95.",
        u"bytes_field": b"byteees",
        u"int64_field": -1085,
        u"float64_field": -42.195,
        u"numeric_field": decimal.Decimal("1.4142"),
        u"bool_field": True,
        u"geography_field": "POINT(-49.3028 69.0622)",
        u"person_struct_field": {u"name": u"John", u"age": 42},
        u"timestamp_field": dt.datetime(2019, 8, 9, 13, 38, 22, 17896, tzinfo=pytz.UTC),
        u"date_field": dt.date(1995, 3, 17),
        u"time_field": dt.time(16, 24, 51),
        u"string_array_field": [u"foo", u"bar", u"baz"],
    }

    result_copy = copy.copy(rows[0])
    del result_copy["datetime_field"]
    assert result_copy == expected_result

    # Compare datetime separately, AVRO and PYARROW return different object types,
    # although they should both represent the same value.
    # TODO: when fixed, change assertion to assert a datetime instance!
    expected_pattern = re.compile(r"2005-10-26( |T)19:49:41")
    assert expected_pattern.match(str(rows[0]["datetime_field"]))