Beispiel #1
def _parse_data(client, job, index_col=None, col_order=None):
    """
    Iterate through the query results and piece together the
    final DataFrame. Builds a DataFrame for each page of
    results, then concatenates them together when finished.
    To save memory, numpy record arrays are used to build the
    per-page DataFrames.

    Parameters
    ----------
    client: An instance of bq.Client
    job: dict-like job info for a completed query; must provide
        job['configuration']['query']['destinationTable']
    index_col: str (optional)
        Name of result column to use for index in results DataFrame
    col_order: list() (optional)
        List of BigQuery column names in the desired order for results
        DataFrame

    Returns
    -------
    df: pandas DataFrame
        DataFrame representing results of query

    Raises
    ------
    InvalidColumnOrder:
        Raised if 'index_col' is not a result column, or if 'col_order'
        doesn't match the columns of the returned DataFrame
    BigqueryInterfaceError:
        Raised when the server returns fewer rows than it announced, or
        omits the page token while rows are still outstanding

    Notes
    -----
    This function relies on Google being consistent with their
    pagination API. The paging loop is adapted from
    bigquery_client's Client.ReadTableRows(); those APIs have undergone
    large amounts of change, so treat the loop as fragile.
    """
    # Function-scope imports keep this block self-contained.
    import logging

    from pandas import DataFrame

    log = logging.getLogger(__name__)

    # BigQuery type -> numpy dtype. INTEGER is read as float because numpy
    # integer arrays cannot hold missing values; the floats are downcast at
    # the end when no NaN is present. TIMESTAMP needs the explicit
    # nanosecond unit ('M8[ns]'); it seems to be buggy without it.
    # Anything not listed falls back to `object`.
    dtype_map = {
        'INTEGER': np.dtype(float),
        'FLOAT': np.dtype(float),
        'TIMESTAMP': 'M8[ns]'
    }

    # We first need the schema to get information about the columns of
    # our dataframe.
    table_dict = job['configuration']['query']['destinationTable']
    fields = client.GetTableSchema(table_dict)['fields']

    # Get the schema into a format usable to create our dataframe.
    col_types = [field['type'] for field in fields]
    # Note the encoding... numpy doesn't like titles that are UTF8, which is
    # the return type from the API.
    col_names = [field['name'].encode('ascii', 'ignore') for field in fields]
    # It would be nice to use 'str' types, but BigQuery doesn't have a fixed
    # length in mind - it just maxes out at 64k - hence the `object` default.
    col_dtypes = [dtype_map.get(col_type, object) for col_type in col_types]

    # Initially, no page token is set
    page_token = None

    # Most of Google's client APIs default total_rows to sys.maxint, but
    # that value is cast to an unsigned integer somewhere downstream and
    # can overflow depending on the Python build (seen on OS X). Use the
    # 32-bit sys.maxint instead; this stays flexible if the server-side
    # maximum page size changes.
    total_rows = 2147483647

    # Keep track of rows read
    row_count = 0

    # Keep our page DataFrames until the end when we concatenate them
    dataframe_list = []

    # Iterate over all rows, one page per request
    while row_count < total_rows:
        data = client.apiclient.tabledata().list(
            maxResults=total_rows - row_count,
            pageToken=page_token,
            **table_dict).execute()

        # If there are more results than will fit on a page,
        # you will receive a token for the next page.
        page_token = data.get('pageToken', None)

        # How many rows are there across all pages?
        total_rows = min(total_rows, int(data['totalRows']))
        # 'rows' is absent from the response when the page is empty
        raw_page = data.get('rows', [])
        page_array = _parse_page(raw_page, col_names, col_types, col_dtypes)

        row_count += len(page_array)
        if total_rows > 0:
            # Floor division keeps the percentage an integer on both
            # Python 2 and Python 3.
            completed = (100 * row_count) // total_rows
            log.info('Remaining Rows: ' + str(total_rows - row_count) +
                     '(' + str(completed) + '% Complete)')
        else:
            log.info('No Rows')

        dataframe_list.append(DataFrame(page_array))

        # Fail loudly instead of looping forever when the server stops
        # handing out pages before all announced rows were received.
        # NOTE(review): the table reference in the messages follows
        # bigquery_client.Client.ReadTableRows() - confirm against the
        # installed bigquery_client version.
        if not page_token and row_count != total_rows:
            raise bigquery_client.BigqueryInterfaceError(
                'PageToken missing for %r' %
                (bigquery_client.ApiClientHelper.TableReference.Create(
                    **table_dict), ))
        if not raw_page and row_count != total_rows:
            raise bigquery_client.BigqueryInterfaceError(
                'Not enough rows returned by server for %r' %
                (bigquery_client.ApiClientHelper.TableReference.Create(
                    **table_dict), ))

    # Build final dataframe
    final_df = concat(dataframe_list, ignore_index=True)

    # Reindex the DataFrame on the provided column
    if index_col is not None:
        if index_col not in col_names:
            raise InvalidColumnOrder(
                'Index column "{0}" does not exist in DataFrame.'.format(
                    index_col))
        final_df.set_index(index_col, inplace=True)
        # set_index() drops the column from the frame, so it must not take
        # part in the column-order validation below.
        col_names.remove(index_col)

    # Change the order of columns in the DataFrame based on provided list
    if col_order is not None:
        if sorted(col_order) != sorted(col_names):
            raise InvalidColumnOrder(
                'Column order does not match this DataFrame.')
        final_df = final_df[col_order]

    # Downcast floats to integers and objects to booleans
    # if there are no NaN's. This is presently due to a
    # limitation of numpy in handling missing data.
    final_df._data = final_df._data.downcast(dtypes='infer')
    return final_df
Beispiel #2
def _parse_data(client, job, index_col=None, col_order=None):
    """
    Iterate through the query results and piece together the
    final DataFrame. Builds a DataFrame for each page of
    results, then concatenates them together when finished.
    To save memory, numpy record arrays are used to build the
    per-page DataFrames.

    Parameters
    ----------
    client: An instance of bq.Client
    job: dict-like job info for a completed query; must provide
        job['configuration']['query']['destinationTable'] and
        job['jobReference']
    index_col: str (optional)
        Name of result column to use for index in results DataFrame
    col_order: list() (optional)
        List of BigQuery column names in the desired order for results
        DataFrame

    Returns
    -------
    df: pandas DataFrame
        DataFrame representing results of query

    Raises
    ------
    InvalidColumnOrder:
        Raised if 'index_col' is not a result column, or if 'col_order'
        doesn't match the columns of the returned DataFrame
    BigqueryError:
        Raised if the API reports the job as not complete
    BigqueryInterfaceError:
        Raised when the server returns fewer rows than it announced

    Notes
    -----
    This function relies on Google being consistent with their
    pagination API. The paging loop is adapted from bigquery_client's
    Client._JobTableReader._ReadOnePage; those APIs have undergone
    large amounts of change, so treat the loop as fragile.
    """
    # Function-scope imports keep this block self-contained.
    import logging

    from pandas import DataFrame

    log = logging.getLogger(__name__)

    # BigQuery type -> numpy dtype. INTEGER is read as float because numpy
    # integer arrays cannot hold missing values; the floats are downcast at
    # the end when no NaN is present. TIMESTAMP needs the explicit
    # nanosecond unit ('M8[ns]'); it seems to be buggy without it.
    # Anything not listed falls back to `object`.
    dtype_map = {
        'INTEGER': np.dtype(float),
        'FLOAT': np.dtype(float),
        'TIMESTAMP': 'M8[ns]'
    }

    # We first need the schema to get information about the columns of
    # our dataframe.
    table_dict = job['configuration']['query']['destinationTable']
    fields = client.GetTableSchema(table_dict)['fields']

    # Get the schema into a format usable to create our dataframe.
    col_types = [field['type'] for field in fields]
    # Note the encoding... numpy doesn't like titles that are UTF8, which
    # is the return type from the API.
    col_names = [field['name'].encode('ascii', 'ignore') for field in fields]
    # It would be nice to use 'str' types, but BigQuery doesn't have a fixed
    # length in mind - it just maxes out at 64k - hence the `object` default.
    col_dtypes = [dtype_map.get(col_type, object) for col_type in col_types]

    # TODO: Enable Reading From Table,
    # see Client._TableTableReader._ReadOnePage

    # Initially, no page token is set
    page_token = None

    # This number is the current max results per page
    max_rows = bigquery_client._MAX_ROWS_PER_REQUEST

    # How many rows in result set? Initialize to max_rows so the loop runs
    # at least once; the real value comes from the first response.
    total_rows = max_rows

    # This is the starting row for a particular page...
    # is ignored if page_token is present, though
    # it may be useful if we wish to implement SQL like LIMITs
    # with minimums
    start_row = 0

    # Keep our page DataFrames until the end when we concatenate them
    dataframe_list = []

    current_job = job['jobReference']

    # Iterate over all rows, one page per request
    while start_row < total_rows:
        # Setup the parameters for getQueryResults() API Call
        kwds = dict(current_job)
        kwds['maxResults'] = max_rows
        # Sets the timeout to 0 because we assume the table is already ready.
        # This is because our previous call to Query() is synchronous
        # and will block until it's actually done
        kwds['timeoutMs'] = 0
        # Use start row if there's no page_token ... in other words, the
        # user requested to start somewhere other than the beginning...
        # presently this is not a parameter to read_gbq(), but it will be
        # added eventually.
        if page_token:
            kwds['pageToken'] = page_token
        else:
            kwds['startIndex'] = start_row
        data = client.apiclient.jobs().getQueryResults(**kwds).execute()
        if not data['jobComplete']:
            raise BigqueryError('Job was not completed, or was invalid')

        # How many rows are there across all pages?
        # Note: This is presently the only reason we don't just use
        # _ReadOnePage() directly
        total_rows = int(data['totalRows'])

        page_token = data.get('pageToken', None)
        # 'rows' is absent from the response when the page is empty
        raw_page = data.get('rows', [])
        page_array = _parse_page(raw_page, col_names, col_types, col_dtypes)

        start_row += len(raw_page)
        if total_rows > 0:
            # Floor division keeps the percentage an integer on both
            # Python 2 and Python 3.
            completed = (100 * start_row) // total_rows
            log.info('Remaining Rows: ' + str(total_rows - start_row) +
                     '(' + str(completed) + '% Complete)')
        else:
            log.info('No Rows')

        dataframe_list.append(DataFrame(page_array))

        # Did we get enough rows? The upstream client stopped checking for
        # this, but it is still a good idea: without it an empty final page
        # would make this loop spin forever.
        if not page_token and not raw_page and start_row != total_rows:
            raise bigquery_client.BigqueryInterfaceError(
                'Not enough rows returned by server. Expected: {0} Rows, But '
                'Received {1}'.format(total_rows, start_row))

    # Build final dataframe
    final_df = concat(dataframe_list, ignore_index=True)

    # Reindex the DataFrame on the provided column
    if index_col is not None:
        if index_col not in col_names:
            raise InvalidColumnOrder(
                'Index column "{0}" does not exist in DataFrame.'.format(
                    index_col))
        final_df.set_index(index_col, inplace=True)
        # set_index() drops the column from the frame, so it must not take
        # part in the column-order validation below.
        col_names.remove(index_col)

    # Change the order of columns in the DataFrame based on provided list
    if col_order is not None:
        if sorted(col_order) != sorted(col_names):
            raise InvalidColumnOrder(
                'Column order does not match this DataFrame.')
        final_df = final_df[col_order]

    # Downcast floats to integers and objects to booleans
    # if there are no NaN's. This is presently due to a
    # limitation of numpy in handling missing data.
    final_df._data = final_df._data.downcast(dtypes='infer')
    return final_df