import datetime
import decimal
from typing import Callable, Generator, Optional, Tuple

import pandas as pd
from google.cloud import bigquery
from google.cloud.bigquery import QueryJobConfig, enums


def iter_results(
    bigquery_client: bigquery.Client,
    query: str,
    job_config: QueryJobConfig,
    df_cleaner: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
) -> Generator[pd.Series, None, None]:
    """
    Page through the results of a query and yield each row as a pandas Series.

    Args:
        bigquery_client (bigquery.Client): The BigQuery client
        query (str): The query to run
        job_config (QueryJobConfig): The BigQuery job config
        df_cleaner (Callable, optional): A function applied to each DataFrame
            page before its rows are yielded

    Returns:
        Generator[pd.Series, None, None]: A generator of pandas Series
    """
    query_job = bigquery_client.query(query, job_config=job_config)
    query_job.result()  # Wait for the query to finish.

    # Get a reference to the destination table and page through it.
    destination = bigquery_client.get_table(query_job.destination)
    rows = bigquery_client.list_rows(destination, page_size=10000)
    for df in rows.to_dataframe_iterable():
        if df_cleaner is not None:
            df = df_cleaner(df)
        for _, row in df.iterrows():
            yield row

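
# A minimal usage sketch for iter_results, assuming default application
# credentials. The public dataset query and the drop_nulls helper below are
# illustrative, not part of the snippet above.
def drop_nulls(df: pd.DataFrame) -> pd.DataFrame:
    # Example cleaner: drop rows with any NULL column before yielding.
    return df.dropna()


def example_iter_results() -> None:
    client = bigquery.Client()
    # An empty config is enough here; without an explicit destination table,
    # BigQuery stores results in an anonymous temporary table that
    # query_job.destination still points at.
    config = QueryJobConfig()
    query = (
        "SELECT name, number "
        "FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 100"
    )
    for row in iter_results(client, query, config, df_cleaner=drop_nulls):
        print(row["name"], row["number"])
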
def test_list_rows_scalars_extreme(
    bigquery_client: bigquery.Client, scalars_extreme_table: str
):
    rows = sorted(
        bigquery_client.list_rows(scalars_extreme_table),
        key=lambda row: row["rowindex"],
    )
    row = rows[0]
    assert row["bool_col"]  # True
    assert row["bytes_col"] == b"\r\n"
    assert row["date_col"] == datetime.date(9999, 12, 31)
    assert row["datetime_col"] == datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
    assert row["geography_col"] == "POINT(-135 90)"
    assert row["int64_col"] == 9223372036854775807
    assert row["numeric_col"] == decimal.Decimal(f"9.{'9' * 37}E+28")
    assert row["bignumeric_col"] == decimal.Decimal(f"9.{'9' * 75}E+37")
    assert row["float64_col"] == float("Inf")
    assert row["string_col"] == "Hello, World"
    assert row["time_col"] == datetime.time(23, 59, 59, 999999)
    assert row["timestamp_col"] == datetime.datetime(
        9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc
    )

    nullrow = rows[4]
    for column, value in nullrow.items():
        if column == "rowindex":
            assert value == 4
        else:
            assert value is None

def test_list_rows_scalars(bigquery_client: bigquery.Client, scalars_table: str):
    rows = sorted(
        bigquery_client.list_rows(scalars_table),
        key=lambda row: row["rowindex"],
    )
    row = rows[0]
    assert row["bool_col"]  # True
    assert row["bytes_col"] == b"Hello, World!"
    assert row["date_col"] == datetime.date(2021, 7, 21)
    assert row["datetime_col"] == datetime.datetime(2021, 7, 21, 11, 39, 45)
    assert row["geography_col"] == "POINT(-122.0838511 37.3860517)"
    assert row["int64_col"] == 123456789
    assert row["numeric_col"] == decimal.Decimal("1.23456789")
    assert row["bignumeric_col"] == decimal.Decimal("10.111213141516171819")
    assert row["float64_col"] == 1.25
    assert row["string_col"] == "Hello, World!"
    assert row["time_col"] == datetime.time(11, 41, 43, 76160)
    assert row["timestamp_col"] == datetime.datetime(
        2021, 7, 21, 17, 43, 43, 945289, tzinfo=datetime.timezone.utc
    )

    nullrow = rows[1]
    for column, value in nullrow.items():
        if column == "rowindex":
            assert value == 1
        else:
            assert value is None

def test_list_rows_page_size(bigquery_client: bigquery.Client, table_id: str):
    num_items = 7
    page_size = 3
    num_pages, num_last_page = divmod(num_items, page_size)

    to_insert = [
        {"string_col": "item%d" % i, "rowindex": i} for i in range(num_items)
    ]
    bigquery_client.load_table_from_json(to_insert, table_id).result()

    # list_rows returns a RowIterator, not a DataFrame.
    rows = bigquery_client.list_rows(
        table_id,
        selected_fields=[
            bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING)
        ],
        page_size=page_size,
    )
    pages = rows.pages

    # All full pages hold page_size items; the final page holds the remainder.
    for i in range(num_pages):
        page = next(pages)
        assert page.num_items == page_size
    page = next(pages)
    assert page.num_items == num_last_page

def test_list_rows_empty_table(bigquery_client: bigquery.Client, table_id: str):
    from google.cloud.bigquery.table import RowIterator

    table = bigquery_client.create_table(table_id)

    # It's a bit silly to list rows for an empty table, but this does
    # happen as the result of a DDL query from an IPython magic command.
    rows = bigquery_client.list_rows(table)
    assert isinstance(rows, RowIterator)
    assert tuple(rows) == ()

def _read_from_bigquery(
    self, bqclient: bigquery.Client
) -> Tuple[int, Generator]:
    # TESTING, PROJECT_ID, and self.debugging are defined elsewhere in the
    # module this method belongs to.
    if TESTING:
        raise RuntimeError(
            "Inference on the test data is not supported yet. You first need "
            "to build a table deduplicated on (tweet_id, text_tokens) pairs "
            "and point the table name below at it."
        )
    # The deduplicated table referenced below was created with:
    #
    #   select tweet_id, any_value(text_tokens) as text_tokens
    #   from (
    #       select tweet_id, any_value(text_tokens) as text_tokens
    #       from `recsys2020.training`
    #       group by tweet_id
    #       union all
    #       select tweet_id, any_value(text_tokens) as text_tokens
    #       from `recsys2020.val_20200418`
    #       group by tweet_id
    #   )
    #   group by tweet_id
    max_rows = 10000 if self.debugging else None
    row_iterator = bqclient.list_rows(
        f"{PROJECT_ID}.recsys2020.tmp_unique_tweet_tokens_val_20200418",
        max_results=max_rows,
    )
    return row_iterator.total_rows, row_iterator.to_dataframe_iterable()
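

# A minimal consumption sketch for _read_from_bigquery, assuming `loader` is
# an instance of the class that defines it. The loader name and the progress
# printing below are illustrative, not part of the method above.
def example_consume(loader, bqclient: bigquery.Client) -> None:
    total_rows, df_iterable = loader._read_from_bigquery(bqclient)
    processed = 0
    for df in df_iterable:
        # Each df is one page of rows with tweet_id and text_tokens columns.
        processed += len(df)
        print(f"processed {processed}/{total_rows} rows")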