Example #1
def _parse_data(client, job, index_col=None, col_order=None):
    """
	Iterate through the query results and piece together the
	final DataFrame. Builds a DataFrame for each page of
	results, then concatenates them together when finished.
	To save memory, we use numpy record arrays to build these
	DataFrames.

	Parameters
	----------
	client: An instance of bq.Client
	job: An array containing the job info for a completed query
	index_col: str (optional)
		Name of result column to use for index in results DataFrame
	col_order: list() (optional)
		List of BigQuery column names in the desired order for results
		DataFrame

	Returns
	-------
	df: pandas DataFrame
		DataFrame representing results of query

	Raises:
	------
	InvalidColumnOrder:
		Raised if 'col_order' parameter doesn't match returned DataFrame
	BigqueryError:
		Raised by bigquery_client if a Google API error is encountered


	Notes:
	-----
	This script relies on Google being consistent with their 
	pagination API. We are using the most flexible iteration method
	that we could find in the bq.py/bigquery_client.py API's, but 
	these have undergone large amounts of change recently.

	We have encountered bugs with this functionality, see:
	http://stackoverflow.com/questions/19145587/bq-py-not-paging-results
	"""

    # dtype Map -
    # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing
    dtype_map = {
        'INTEGER': np.dtype(float),
        'FLOAT': np.dtype(float),
        'TIMESTAMP': 'M8[ns]'
    }  # TIMESTAMP parsing seems to be buggy without the nanosecond indicator

    # We first need the schema to get information about the columns of
    # our dataframe.

    table_dict = job['configuration']['query']['destinationTable']
    fields = client.GetTableSchema(table_dict)['fields']

    # Get the schema into a format usable to create our dataframe
    col_dtypes = []
    col_types = []
    col_names = []

    # TODO: Do this in one clean step
    for field in fields:
        col_types.append(field['type'])
        # Note the encoding... numpy doesn't like titles that are UTF8,
        # which is the return type from the API
        col_names.append(field['name'].encode('ascii', 'ignore'))
        # Note, it would be nice to use 'str' types, but BigQuery doesn't
        # have a fixed length in mind - just maxes out at 64k
        col_dtypes.append(dtype_map.get(field['type'], object))

    # How many columns are there
    num_columns = len(col_names)

    # Iterate over the result rows.
    # Since Google's API now requires pagination of results,
    # we do that here. The following is repurposed from
    # bigquery_client.py  :: Client.ReadTableRows()

    # Initially, no page token is set
    page_token = None

    # Most of Google's client API's allow one to set total_rows in case
    # the user only wants the first 'n' results from a query. Typically
    # they set this to sys.maxint by default, but this caused problems
    # during testing - specifically on OS X. It appears that at some
    # point in bigquery_client.py, there is an attempt to cast this value
    # to an unsigned integer. Depending on the python install,
    # sys.maxint may exceed the limitations of unsigned integers.
    #
    # See:
    # https://code.google.com/p/google-bigquery-tools/issues/detail?id=14

    # This is a hardcoded value for the 32-bit sys.maxint, per
    # the above note. Theoretically, we could simply use
    # 100,000 (or whatever the current max page size is),
    # but this is more flexible in the event of an API change
    total_rows = 2147483647

    # Keep track of rows read
    row_count = 0

    # Keep our page DataFrames until the end when we
    # concatenate them
    dataframe_list = list()

    # Iterate over all rows
    while row_count < total_rows:
        data = client.apiclient.tabledata().list(
            maxResults=total_rows - row_count,
            pageToken=page_token,
            **table_dict).execute()

        # If there are more results than will fit on a page,
        # you will receive a token for the next page.
        page_token = data.get('pageToken', None)

        # How many rows are there across all pages?
        total_rows = min(total_rows, int(data['totalRows']))
        raw_page = data.get('rows', [])
        page_array = _parse_page(raw_page, col_names, col_types, col_dtypes)

        row_count += len(page_array)
        if total_rows > 0:
            completed = (100 * row_count) / total_rows
            logging.info('Remaining Rows: ' + str(total_rows - row_count) +
                         ' (' + str(completed) + '% Complete)')
        else:
            logging.info('No Rows')

        dataframe_list.append(DataFrame(page_array))

        # Raise an error if the server response looks inconsistent
        if not page_token and row_count != total_rows:
            raise bigquery_client.BigqueryInterfaceError(
                'PageToken missing for %r' %
                (bigquery_client.ApiClientHelper.TableReference.Create(
                    **table_dict), ))
        if not raw_page and row_count != total_rows:
            raise bigquery_client.BigqueryInterfaceError(
                'Not enough rows returned by server for %r' %
                (bigquery_client.ApiClientHelper.TableReference.Create(
                    **table_dict), ))

    # Build final dataframe
    final_df = concat(dataframe_list, ignore_index=True)

    # Reindex the DataFrame on the provided column
    if index_col is not None:
        if index_col in col_names:
            final_df.set_index(index_col, inplace=True)
            col_names.remove(index_col)
        else:
            raise InvalidColumnOrder(
                'Index column "{0}" does not exist in DataFrame.'.format(
                    index_col))

    # Change the order of columns in the DataFrame based on provided list
    if col_order is not None:
        if sorted(col_order) == sorted(col_names):
            final_df = final_df[col_order]
        else:
            raise InvalidColumnOrder(
                'Column order does not match this DataFrame.')

    # Downcast floats to integers and objects to booleans
    # if there are no NaN's. This is presently due to a
    # limitation of numpy in handling missing data.
    final_df._data = final_df._data.downcast(dtypes='infer')
    return final_df
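
Both listings are meant to be driven by a read_gbq-style wrapper that runs the query and hands the completed job info to _parse_data(). The sketch below is a minimal, hypothetical usage example: it assumes bq.Client.Get() returns an authenticated bq.py client and that Client.Query() blocks until the query job finishes and returns the job-info dict used above; the wrapper name and the sample table are placeholders, not part of the listing.

import bq  # google-bigquery-tools client library


def read_gbq_sketch(query, index_col=None, col_order=None):
    """Run a BigQuery SQL statement and return the results as a DataFrame."""
    client = bq.Client.Get()   # assumed: builds an authenticated bq.py client
    job = client.Query(query)  # assumed: synchronous; returns completed job info
    return _parse_data(client, job, index_col=index_col, col_order=col_order)


# Placeholder table name; any readable table will do
# df = read_gbq_sketch(
#     'SELECT word, word_count FROM [publicdata:samples.shakespeare] LIMIT 10',
#     index_col='word')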
Example #2
def _parse_data(client, job, index_col=None, col_order=None):
    """
    Iterate through the query results and piece together the
    final DataFrame. Builds a DataFrame for each page of
    results, then concatenates them together when finished.
    To save memory, we use numpy record arrays to build these
    DataFrames.

    Parameters
    ----------
    client: An instance of bq.Client
    job: An array containing the job info for a completed query
    index_col: str (optional)
        Name of result column to use for index in results DataFrame
    col_order: list() (optional)
        List of BigQuery column names in the desired order for results
        DataFrame

    Returns
    -------
    df: pandas DataFrame
        DataFrame representing results of query

    Raises:
    ------
    InvalidColumnOrder:
        Raised if 'col_order' parameter doesn't match returned DataFrame
    BigqueryError:
        Raised by bigquery_client if a Google API error is encountered


    Notes:
    -----
    This script relies on Google being consistent with their
    pagination API. We are using the most flexible iteration method
    that we could find in the bq.py/bigquery_client.py API's, but
    these have undergone large amounts of change recently.

    We have encountered bugs with this functionality, see:
    http://stackoverflow.com/questions/19145587/bq-py-not-paging-results
    """

    # dtype Map -
    # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing
    dtype_map = {
        'INTEGER': np.dtype(float),
        'FLOAT': np.dtype(float),
        'TIMESTAMP': 'M8[ns]'
    }  # TIMESTAMP parsing seems to be buggy without the nanosecond indicator

    # We first need the schema to get information about the columns of
    # our dataframe.

    table_dict = job['configuration']['query']['destinationTable']
    fields = client.GetTableSchema(table_dict)['fields']

    # Get the schema into a format usable to create our dataframe
    col_dtypes = []
    col_types = []
    col_names = []

    # TODO: Do this in one clean step
    for field in fields:
        col_types.append(field['type'])
        # Note the encoding... numpy doesn't like titles that are UTF8, which
        # is the return type from the API
        col_names.append(field['name'].encode('ascii', 'ignore'))
        # Note, it would be nice to use 'str' types, but BigQuery doesn't have
        # a fixed length in mind - just maxes out at 64k
        col_dtypes.append(dtype_map.get(field['type'], object))

    # How many columns are there
    num_columns = len(col_names)

    # Iterate over the result rows.
    # Since Google's API now requires pagination of results,
    # we do that here. The following is repurposed from
    # bigquery_client.py  :: Client._JobTableReader._ReadOnePage

    # TODO: Enable Reading From Table,
    # see Client._TableTableReader._ReadOnePage

    # Initially, no page token is set
    page_token = None

    # This number is the current max results per page
    max_rows = bigquery_client._MAX_ROWS_PER_REQUEST

    # How many rows in result set? Initialize to max_rows
    total_rows = max_rows

    # This is the starting row for a particular page...
    # is ignored if page_token is present, though
    # it may be useful if we wish to implement SQL like LIMITs
    # with minimums
    start_row = 0

    # Keep our page DataFrames until the end when we concatenate them
    dataframe_list = list()

    current_job = job['jobReference']

    # Iterate over all rows
    while start_row < total_rows:
        # Setup the parameters for getQueryResults() API Call
        kwds = dict(current_job)
        kwds['maxResults'] = max_rows
        # Sets the timeout to 0 because we assume the table is already ready.
        # This is because our previous call to Query() is synchronous
        # and will block until it's actually done
        kwds['timeoutMs'] = 0
        # Use start row if there's no page_token ... in other words, the
        # user requested to start somewhere other than the beginning...
        # presently this is not a parameter to read_gbq(), but it will be
        # added eventually.
        if page_token:
            kwds['pageToken'] = page_token
        else:
            kwds['startIndex'] = start_row
        data = client.apiclient.jobs().getQueryResults(**kwds).execute()
        if not data['jobComplete']:
            raise BigqueryError('Job was not completed, or was invalid')

        # How many rows are there across all pages?
        # Note: This is presently the only reason we don't just use
        # _ReadOnePage() directly
        total_rows = int(data['totalRows'])

        page_token = data.get('pageToken', None)
        raw_page = data.get('rows', [])
        page_array = _parse_page(raw_page, col_names, col_types, col_dtypes)

        start_row += len(raw_page)
        if total_rows > 0:
            completed = (100 * start_row) / total_rows
            logger.info('Remaining Rows: ' + str(total_rows - start_row) +
                        ' (' + str(completed) + '% Complete)')
        else:
            logger.info('No Rows')

        dataframe_list.append(DataFrame(page_array))

        # Did we get enough rows? Note: gbq.py stopped checking for this
        # but we felt it was still a good idea.
        if not page_token and not raw_page and start_row != total_rows:
            raise bigquery_client.BigqueryInterfaceError(
                'Not enough rows returned by server. Expected {0} rows, but '
                'received {1}.'.format(total_rows, start_row))

    # Build final dataframe
    final_df = concat(dataframe_list, ignore_index=True)

    # Reindex the DataFrame on the provided column
    if index_col is not None:
        if index_col in col_names:
            final_df.set_index(index_col, inplace=True)
            col_names.remove(index_col)
        else:
            raise InvalidColumnOrder(
                'Index column "{0}" does not exist in DataFrame.'.format(
                    index_col))

    # Change the order of columns in the DataFrame based on provided list
    if col_order is not None:
        if sorted(col_order) == sorted(col_names):
            final_df = final_df[col_order]
        else:
            raise InvalidColumnOrder(
                'Column order does not match this DataFrame.')

    # Downcast floats to integers and objects to booleans
    # if there are no NaN's. This is presently due to a
    # limitation of numpy in handling missing data.
    final_df._data = final_df._data.downcast(dtypes='infer')
    return final_df
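
Both versions delegate the per-page work to a _parse_page() helper that is not shown on this page. The sketch below is a hypothetical reconstruction based only on how the helper is called above and on the BigQuery v2 row format ({'f': [{'v': ...}]}); it is not the actual implementation, and _parse_entry_sketch and its missing-value handling are illustrative assumptions.

import numpy as np


def _parse_page_sketch(raw_page, col_names, col_types, col_dtypes):
    """Build a numpy record array for one page of BigQuery rows."""
    # col_names are assumed to be plain strings; col_dtypes are the values
    # chosen from dtype_map above.
    page_row_count = len(raw_page)
    page_array = np.zeros((page_row_count,),
                          dtype=list(zip(col_names, col_dtypes)))
    for row_num, raw_row in enumerate(raw_page):
        entries = raw_row.get('f', [])
        for col_num, field_type in enumerate(col_types):
            field_value = _parse_entry_sketch(entries[col_num].get('v', ''),
                                              field_type)
            page_array[row_num][col_num] = field_value
    return page_array


def _parse_entry_sketch(field_value, field_type):
    """Convert one raw BigQuery cell value into a Python/numpy value."""
    if field_value is None or field_value == 'null':
        # Represent missing data in a form each column dtype can hold
        if field_type in ('INTEGER', 'FLOAT'):
            return np.nan
        if field_type == 'TIMESTAMP':
            return np.datetime64('NaT')
        return None
    if field_type in ('INTEGER', 'FLOAT'):
        return float(field_value)
    if field_type == 'TIMESTAMP':
        # TIMESTAMP cells arrive as seconds since the epoch (a float string)
        return np.datetime64(int(float(field_value) * 1e9), 'ns')
    if field_type == 'BOOLEAN':
        return field_value == 'true'
    return field_value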