def raw_query(con, querystr, last_modified, dest=None, max_rows=cfg.MAX_ROWS,
              fetch=cfg.FETCH_BY_DEFAULT, overwrite_method='fail'):
    '''executes a query and returns the results or a result sample as a
    pandas df and the destination table as a dict

    INPUTS:
        querystr (str): query to execute
        last_modified: timestamp used by the connection's query cache to
            decide whether a previously cached result is still valid
        dest (dict): specify destination table for output of query (if None,
            BQ creates a temporary (24hr) table)
        max_rows (int): max number of rows that the con will return in the
            results
        fetch (bool): if True, fetch the full resultset locally, otherwise
            return only a sample of the first 5 rows
        overwrite_method (str): 'append', 'overwrite', or anything else
            (treated as 'fail') -- controls the BigQuery write disposition
            for the destination table
    OUTPUTS:
        result (pandas dataframe): dataframe containing the query results or
            first 5 rows of resultset (if fetch==False)
        destinationtable (dict): remote table that contains the query results
        exceeds_max_rows (bool): True when the remote resultset holds more
            than max_rows rows (always False when fetch==False)
    '''
    exists = con._check_query(querystr, fetch, last_modified)
    if exists:
        # cached result is still valid -- skip running the query entirely
        return con._fetch_from_cache(querystr)

    # map the user-facing overwrite_method onto BigQuery's writeDisposition;
    # any unrecognized value falls back to the safe 'fail' behavior
    dispositions = {'append': 'WRITE_APPEND', 'overwrite': 'WRITE_TRUNCATE'}
    write_disposition = dispositions.get(overwrite_method, 'WRITE_EMPTY')

    query_response = run_query(con, querystr, destination_table=dest,
                               write_disposition=write_disposition)
    source = query_response['configuration']['query']['destinationTable']

    if fetch:
        fields, data = fetch_query(con, query_response, start_row=0,
                                   max_rows=max_rows)
        df = bqresult_2_df(fields, data)
        con._cache_query(querystr, df, source, fetch)
        # BUG FIX: the BigQuery REST API returns numRows as a STRING, so the
        # original `numRows > max_rows` compared str to int (always True on
        # py2, TypeError on py3). Cast to int for a meaningful comparison.
        num_rows = int(
            con.client._apiclient.tables().get(**source).execute()['numRows'])
        exceeds_max_rows = num_rows > max_rows
    else:
        # only pull a small head sample (5 rows) locally
        fields, data = fetch_query(con, query_response, start_row=0,
                                   max_rows=5)
        df = bqresult_2_df(fields, data)
        exceeds_max_rows = False
        con._cache_query(querystr, df, source, fetch)

    return df, source, exceeds_max_rows
def apply(self, func, col=None, columns=None, max_rows=cfg.MAX_ROWS,
          fetch=True, dest=None, chunksize=10000):
    '''idea is to (in a majorly hacky way) allow arbitrary python "udfs"
    by pulling each chunk of rows locally, applying the python function,
    and writing the transformed column back to bq'''
    # TODO make work and allow user to provide arguments to function
    if col is None:
        col = self.active_col
    mod_col = col + '_mod'
    offset = 0
    # re-check len(self) each pass in case the remote row count changes
    while offset < len(self):
        # pull the next chunk of rows down locally
        schema, rows = self.con.client.ReadSchemaAndRows(
            bqutil.dictify(self.remote), start_row=offset,
            max_rows=chunksize)
        chunk = bqresult_2_df(schema, rows)
        chunk[mod_col] = chunk[col].apply(func)
        if dest is None:
            # default destination table name, derived from the source column
            dest = self.remote + '_mod_%s' % col
        # keep only the transformed column and append it to the remote table
        _, _ = write_df_to_remote(self.con, chunk[[mod_col]],
                                  overwrite_method='append',
                                  **bqutil.dictify(dest))
        offset += chunksize
    if not self._check_write(dest):
        warnings.warn('remote writing of UDF apply function failed')
    return BQDF(self.con, dest)
def raw_query(con, querystr, last_modified, dest=None, max_rows=cfg.MAX_ROWS,
              fetch=cfg.FETCH_BY_DEFAULT, overwrite_method='fail'):
    '''executes a query and returns the results or a result sample as a
    pandas df and the destination table as a dict

    INPUTS:
        querystr (str): query to execute
        last_modified: timestamp consulted by the connection's query cache
        dest (dict): specify destination table for output of query (if None,
            BQ creates a temporary (24hr) table)
        max_rows (int): max number of rows that the con will return in the
            results
        fetch (bool): if True, fetch the full resultset locally, otherwise
            return only a sample of the first 5 rows
        overwrite_method (str): 'append', 'overwrite', or anything else
            (treated as fail) for the destination write disposition
    OUTPUTS:
        result (pandas dataframe): dataframe containing the query results or
            first 5 rows of resultset (if fetch==False)
        destinationtable (dict): remote table that contains the query results
        exceeds_max_rows (bool): whether the remote result is larger than
            max_rows (always False when fetch==False)
    '''
    cached = con._check_query(querystr, fetch, last_modified)
    if cached:
        # valid cached copy exists; no need to touch BigQuery
        return con._fetch_from_cache(querystr)

    # translate overwrite_method into BigQuery's write disposition
    if overwrite_method == 'append':
        disposition = 'WRITE_APPEND'
    elif overwrite_method == 'overwrite':
        disposition = 'WRITE_TRUNCATE'
    else:
        disposition = 'WRITE_EMPTY'

    response = run_query(con, querystr, destination_table=dest,
                         write_disposition=disposition)
    table = response['configuration']['query']['destinationTable']

    # fetch either the full resultset or just a 5-row head sample
    sample_size = max_rows if fetch else 5
    fields, data = fetch_query(con, response, start_row=0,
                               max_rows=sample_size)
    results = bqresult_2_df(fields, data)
    con._cache_query(querystr, results, table, fetch)

    if fetch:
        remote_count = con.client._apiclient.tables().get(
            **table).execute()['numRows']
        exceeds_max_rows = remote_count > max_rows
    else:
        exceeds_max_rows = False
    return results, table, exceeds_max_rows
def slice(self, start=0, end=10):
    '''materialize rows [start, end) as a new remote table and return a
    BQDF wrapping it.'''
    # NOTE need to fit slice locally
    # see if there is a bigquery way to do this
    row_count = end - start
    schema, rows = self.con.client.ReadSchemaAndRows(
        bqutil.dictify(self.remote), start_row=start, max_rows=row_count)
    local_df = bqresult_2_df(schema, rows)
    dest = self.remote + '_slice_%sto%s' % (start, end)
    _ = write_df_to_remote(self.con, local_df, **bqutil.dictify(dest))
    if not self._check_write(dest):
        warnings.warn('failed to write new slice to bigquery')
    sliced = BQDF(self.con, dest)
    sliced.refresh()
    return sliced
def slice(self, start=0, end=10):
    '''pull rows start..end-1 down locally, push them to a fresh remote
    table, and return a refreshed BQDF pointing at that table.'''
    # NOTE need to fit slice locally
    # see if there is a bigquery way to do this
    fields, data = self.con.client.ReadSchemaAndRows(
        bqutil.dictify(self.remote), start_row=start,
        max_rows=end - start)
    subset = bqresult_2_df(fields, data)
    destination = self.remote + '_slice_%sto%s' % (start, end)
    _ = write_df_to_remote(self.con, subset, **bqutil.dictify(destination))
    if not self._check_write(destination):
        warnings.warn('failed to write new slice to bigquery')
    result = BQDF(self.con, destination)
    result.refresh()
    return result
def apply(self, func, col=None, columns=None, max_rows=cfg.MAX_ROWS,
          fetch=True, dest=None, chunksize=10000):
    '''idea is to (in a majorly hacky way) allow arbitrary python "udfs":
    stream the table down in chunks, run the python function on each chunk,
    then write the result column back to bq'''
    # TODO make work and allow user to provide arguments to function
    if col is None:
        col = self.active_col
    startrow = 0
    while startrow < len(self):
        fields, data = self.con.client.ReadSchemaAndRows(
            bqutil.dictify(self.remote), start_row=startrow,
            max_rows=chunksize)
        ndf = bqresult_2_df(fields, data)
        # apply the python "udf" to this chunk of the active column
        ndf[col + '_mod'] = ndf[col].apply(func)
        if dest is None:
            # no destination given: derive one from the source table + column
            dest = self.remote + '_mod_%s' % col
        # drop everything but the new column before appending remotely
        ndf = ndf[[col + '_mod']]
        _, _ = write_df_to_remote(self.con, ndf, overwrite_method='append',
                                  **bqutil.dictify(dest))
        startrow += chunksize
    if not self._check_write(dest):
        warnings.warn('remote writing of UDF apply function failed')
    combined_df = BQDF(self.con, dest)
    return combined_df