Esempio n. 1
0
def iter_results(
    bigquery_client: bigquery.Client,
    query: str,
    job_config: QueryJobConfig,
    df_cleaner: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
) -> Generator[pd.Series, None, None]:
    """
    Page through the results of a query and yield each row as a pandas Series

    Args:
        bigquery_client (bigquery.Client): The BigQuery client
        query (str): The query to run
        job_config (QueryJobConfig): The BigQuery job config
        df_cleaner (Optional[Callable]): Optional transform applied to each
            DataFrame page before its rows are yielded

    Yields:
        pd.Series: One result row at a time
    """

    # Run the query and block until it completes so the destination
    # table is guaranteed to exist before we read from it.
    query_job = bigquery_client.query(query, job_config=job_config)
    query_job.result()

    # Get reference to destination table
    destination = bigquery_client.get_table(query_job.destination)

    # Stream the destination table in 10k-row pages rather than
    # materializing the full result set in memory at once.
    rows = bigquery_client.list_rows(destination, page_size=10000)

    for df in rows.to_dataframe_iterable():
        if df_cleaner is not None:
            df = df_cleaner(df)

        # The positional index from iterrows() is not needed.
        for _, row in df.iterrows():
            yield row
Esempio n. 2
0
def test_list_rows_scalars_extreme(bigquery_client: bigquery.Client,
                                   scalars_extreme_table: str):
    """Extreme scalar values round-trip through list_rows unchanged."""
    all_rows = sorted(
        bigquery_client.list_rows(scalars_extreme_table),
        key=lambda r: r["rowindex"],
    )

    first = all_rows[0]
    assert first["bool_col"]  # True
    expected_values = {
        "bytes_col": b"\r\n",
        "date_col": datetime.date(9999, 12, 31),
        "datetime_col": datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
        "geography_col": "POINT(-135 90)",
        "int64_col": 9223372036854775807,
        "numeric_col": decimal.Decimal(f"9.{'9' * 37}E+28"),
        "bignumeric_col": decimal.Decimal(f"9.{'9' * 75}E+37"),
        "float64_col": float("Inf"),
        "string_col": "Hello, World",
        "time_col": datetime.time(23, 59, 59, 999999),
        "timestamp_col": datetime.datetime(
            9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc),
    }
    for column, expected in expected_values.items():
        assert first[column] == expected

    # Row 4 holds NULL in every column except its own index.
    null_row = all_rows[4]
    for column, value in null_row.items():
        if column == "rowindex":
            assert value == 4
        else:
            assert value is None
Esempio n. 3
0
def test_list_rows_scalars(bigquery_client: bigquery.Client,
                           scalars_table: str):
    """Ordinary scalar values round-trip through list_rows unchanged."""
    def by_index(r):
        return r["rowindex"]

    fetched = sorted(bigquery_client.list_rows(scalars_table), key=by_index)

    first_row = fetched[0]
    assert first_row["bool_col"]  # True
    assert first_row["bytes_col"] == b"Hello, World!"
    assert first_row["date_col"] == datetime.date(2021, 7, 21)
    assert first_row["datetime_col"] == datetime.datetime(
        2021, 7, 21, 11, 39, 45)
    assert first_row["geography_col"] == "POINT(-122.0838511 37.3860517)"
    assert first_row["int64_col"] == 123456789
    assert first_row["numeric_col"] == decimal.Decimal("1.23456789")
    assert first_row["bignumeric_col"] == decimal.Decimal(
        "10.111213141516171819")
    assert first_row["float64_col"] == 1.25
    assert first_row["string_col"] == "Hello, World!"
    assert first_row["time_col"] == datetime.time(11, 41, 43, 76160)
    assert first_row["timestamp_col"] == datetime.datetime(
        2021, 7, 21, 17, 43, 43, 945289, tzinfo=datetime.timezone.utc)

    # Row 1 is all NULLs apart from the row index itself.
    null_row = fetched[1]
    for name in null_row.keys():
        if name == "rowindex":
            assert null_row[name] == 1
        else:
            assert null_row[name] is None
Esempio n. 4
0
def test_list_rows_page_size(bigquery_client: bigquery.Client, table_id: str):
    """list_rows honors page_size: full pages followed by one partial page."""
    total_rows = 7
    rows_per_page = 3
    full_pages, remainder = divmod(total_rows, rows_per_page)

    payload = []
    for idx in range(total_rows):
        payload.append({"string_col": "item%d" % idx, "rowindex": idx})
    bigquery_client.load_table_from_json(payload, table_id).result()

    iterator = bigquery_client.list_rows(
        table_id,
        selected_fields=[
            bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING)
        ],
        page_size=rows_per_page,
    )
    page_iter = iterator.pages

    # Every full page carries exactly page_size rows; the final page
    # carries whatever remains.
    for _ in range(full_pages):
        assert next(page_iter).num_items == rows_per_page
    assert next(page_iter).num_items == remainder
Esempio n. 5
0
def test_list_rows_empty_table(bigquery_client: bigquery.Client,
                               table_id: str):
    """Listing rows of a table with no data yields an empty RowIterator."""
    from google.cloud.bigquery.table import RowIterator

    empty_table = bigquery_client.create_table(table_id)

    # It's a bit silly to list rows for an empty table, but this does
    # happen as the result of a DDL query from an IPython magic command.
    result = bigquery_client.list_rows(empty_table)
    assert isinstance(result, RowIterator)
    assert list(result) == []
 def _read_from_bigquery(
         self, bqclient: bigquery.Client) -> Tuple[int, Generator]:
     """Read the pre-built unique (tweet_id, text_tokens) table from BigQuery.

     Args:
         bqclient: Authenticated BigQuery client.

     Returns:
         Tuple of (total row count, iterable of DataFrame pages).

     Raises:
         RuntimeError: When TESTING is set. The message (in Japanese) says
             inference on test data is not supported yet: a deduplicated
             (tweet_id, text_tokens) table must be created first and the
             table name below updated to point at it.
     """
     if TESTING:
         raise RuntimeError(
             """テストデータ用の推論には未対応。先に tweet_id, text_tokens のペアでユニークをとったテーブルを作って、↓のテーブル名を書き換える必要がある"""
         )
     # Query used to build the table read below (kept for reference):
     #
     #     select tweet_id, any_value(text_tokens) as text_tokens
     #     from (
     #         select tweet_id, any_value(text_tokens) as text_tokens
     #         from `recsys2020.training`
     #         group by tweet_id
     #         union all
     #         select tweet_id, any_value(text_tokens) as text_tokens
     #         from `recsys2020.val_20200418`
     #         group by tweet_id
     #     )
     #     group by tweet_id
     #
     # Cap the row count while debugging to keep iteration fast.
     max_rows = 10000 if self.debugging else None
     row_iterator = bqclient.list_rows(
         f"{PROJECT_ID}.recsys2020.tmp_unique_tweet_tokens_val_20200418",
         max_results=max_rows)
     return row_iterator.total_rows, row_iterator.to_dataframe_iterable()