def apply(self, func, col=None, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None, chunksize=10000):
    '''apply an arbitrary python "udf" to a column (majorly hacky):
    stream the table down locally in chunks, apply func with pandas,
    and append the results back to a table on bigquery

    INPUTS:
        func (callable): python function applied to each value of col
        col (str): column to transform; defaults to the active column
        columns, max_rows, fetch: currently unused; kept for interface
            compatibility  # TODO(review): wire these up or drop them
        dest: destination table reference; defaults to
            '<remote>_mod_<col>'
        chunksize (int): rows pulled per round trip
    OUTPUTS:
        BQDF instance wrapping the destination table
    '''
    # TODO make work and allow user to provide arguments to function
    if col is None:
        col = self.active_col
    if dest is None:
        # resolve the default destination BEFORE the loop: the original
        # code set it inside the loop, so an empty table left dest=None
        # and the _check_write / BQDF calls below received None
        dest = self.remote + '_mod_%s' % col
    startrow = 0
    while startrow < len(self):
        fields, data = self.con.client.ReadSchemaAndRows(
            bqutil.dictify(self.remote), start_row=startrow,
            max_rows=chunksize)
        ndf = bqresult_2_df(fields, data)
        ndf[col + '_mod'] = ndf[col].apply(func)
        # only the transformed column is written back
        ndf = ndf[[col + '_mod']]
        _, _ = write_df_to_remote(self.con, ndf, overwrite_method='append',
                                  **bqutil.dictify(dest))
        startrow += chunksize
    if not self._check_write(dest):
        warnings.warn('remote writing of UDF apply function failed')
    combined_df = BQDF(self.con, dest)
    return combined_df
def groupby_apply(self, groupingcol, func, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None):
    '''same as apply (python udf hack) but for groups
    analogous to df.groupby('col').apply(myfunc)
    # TODO make work and allow user to provide arguments
    groups data by grouping column and performs requested operations on
    other columns
    INPUTS:
        groupingcol (str): column to group on
        func (python function): takes arbitrary python function that
            acts on all data in a group
        columns (list): list of column names to touch with function
    OUTPUTS:
        ndf: BQDF instance for result
    '''
    # NOTE(review): the dest parameter is immediately discarded here and
    # rebuilt from the first group's query — confirm whether callers
    # should be able to choose the destination
    dest = None
    if columns is None:
        columns = self.columns  # BUG FIX: was "columns is self.columns", a no-op comparison
    for group in self.unique(groupingcol):
        # BUG FIX: the original was missing the % operator, which made
        # this *call* the string literal and raise TypeError
        # NOTE(review): group is interpolated unquoted — string-valued
        # grouping columns will produce invalid SQL; confirm column type
        group_query = "SELECT %s FROM %s WHERE %s == %s" % (
            ', '.join(columns), self.tablename, groupingcol, group)
        ndf = self.query(group_query, fetch=True, dest=dest)
        applied_ndf = func(ndf.local)
        if dest is None:
            # first iteration: materialize the query once to obtain a
            # remote table reference to append every group's result to
            gdf = self.query(group_query, fetch=True, dest=None)
            dest = gdf.remote
        _, _ = write_df_to_remote(
            self.con, applied_ndf, overwrite_method='append',
            **bqutil.dictify(dest))
    if not self._check_write(dest):
        warnings.warn(
            'remote writing of UDF groupby-apply function failed')
    gdf = BQDF(self.con, '%s' % dest)
    return gdf
def slice(self, start=0, end=10):
    '''materialize rows [start, end) of this table as a new BQDF
    # NOTE need to fit slice locally
    # see if there is a bigquery way to do this
    '''
    nrows = end - start
    fields, data = self.con.client.ReadSchemaAndRows(
        bqutil.dictify(self.remote), start_row=start, max_rows=nrows)
    local_df = bqresult_2_df(fields, data)
    dest = self.remote + '_slice_%sto%s' % (start, end)
    _ = write_df_to_remote(self.con, local_df, **bqutil.dictify(dest))
    if not self._check_write(dest):
        warnings.warn('failed to write new slice to bigquery')
    sliced = BQDF(self.con, dest)
    sliced.refresh()
    return sliced
def slice(self, start=0, end=10):
    '''pull rows start..end-1 down locally and write them back to
    bigquery as a fresh table, returning a BQDF over it
    # NOTE need to fit slice locally
    # see if there is a bigquery way to do this
    '''
    table_ref = bqutil.dictify(self.remote)
    fields, data = self.con.client.ReadSchemaAndRows(
        table_ref, start_row=start, max_rows=end - start)
    frame = bqresult_2_df(fields, data)
    dest = self.remote + '_slice_%sto%s' % (start, end)
    _ = write_df_to_remote(self.con, frame, **bqutil.dictify(dest))
    if not self._check_write(dest):
        warnings.warn('failed to write new slice to bigquery')
    result = BQDF(self.con, dest)
    result.refresh()
    return result
def apply(self, func, col=None, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None, chunksize=10000):
    '''apply an arbitrary python "udf" to a column (majorly hacky):
    pull each chunk of rows locally, apply the python function, then
    write the transformed column back to bq

    INPUTS:
        func (callable): python function applied to each value of col
        col (str): column to transform; defaults to the active column
        columns, max_rows, fetch: currently unused; kept for interface
            compatibility  # TODO(review): wire these up or drop them
        dest: destination table reference; defaults to
            '<remote>_mod_<col>'
        chunksize (int): rows pulled per round trip
    OUTPUTS:
        BQDF instance wrapping the destination table
    '''
    # TODO make work and allow user to provide arguments to function
    if col is None:
        col = self.active_col
    if dest is None:
        # resolve the default destination BEFORE looping — previously it
        # was assigned inside the loop, so an empty table left dest=None
        # when it reached _check_write and BQDF below
        dest = self.remote + '_mod_%s' % col
    startrow = 0
    while startrow < len(self):
        fields, data = self.con.client.ReadSchemaAndRows(
            bqutil.dictify(self.remote), start_row=startrow,
            max_rows=chunksize)
        ndf = bqresult_2_df(fields, data)
        ndf[col + '_mod'] = ndf[col].apply(func)
        # write back only the transformed column
        ndf = ndf[[col + '_mod']]
        _, _ = write_df_to_remote(
            self.con, ndf, overwrite_method='append',
            **bqutil.dictify(dest))
        startrow += chunksize
    if not self._check_write(dest):
        warnings.warn('remote writing of UDF apply function failed')
    combined_df = BQDF(self.con, dest)
    return combined_df
def groupby_apply(self, groupingcol, func, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None):
    '''same as apply (python udf hack) but for groups
    analogous to df.groupby('col').apply(myfunc)
    # TODO make work and allow user to provide arguments
    groups data by grouping column and performs requested operations on
    other columns
    INPUTS:
        groupingcol (str): column to group on
        func (python function): takes arbitrary python function that
            acts on all data in a group
        columns (list): list of column names to touch with function
    OUTPUTS:
        ndf: BQDF instance for result
    '''
    # NOTE(review): the incoming dest argument is overwritten here and
    # re-derived from the first group query — confirm intent
    dest = None
    if columns is None:
        columns = self.columns  # BUG FIX: was "columns is self.columns", which compared and discarded the result
    for group in self.unique(groupingcol):
        # BUG FIX: original lacked the % operator, so the format string
        # was invoked as a callable and raised TypeError
        # NOTE(review): group is interpolated without quoting — invalid
        # SQL for string-typed grouping columns; verify column type
        group_query = "SELECT %s FROM %s WHERE %s == %s" % (
            ', '.join(columns), self.tablename, groupingcol, group)
        ndf = self.query(group_query, fetch=True, dest=dest)
        applied_ndf = func(ndf.local)
        if dest is None:
            # first pass: run the query once more to obtain a remote
            # table to append all subsequent group results onto
            gdf = self.query(group_query, fetch=True, dest=None)
            dest = gdf.remote
        _, _ = write_df_to_remote(
            self.con, applied_ndf, overwrite_method='append',
            **bqutil.dictify(dest))
    if not self._check_write(dest):
        warnings.warn(
            'remote writing of UDF groupby-apply function failed')
    gdf = BQDF(self.con, '%s' % dest)
    return gdf