Example #1
0
 def apply(self,
           func,
           col=None,
           columns=None,
           max_rows=cfg.MAX_ROWS,
           fetch=True,
           dest=None,
           chunksize=10000):
     '''Apply an arbitrary python function ("UDF" hack) to one column.

     Pulls rows of the remote table locally in chunks of *chunksize*,
     applies *func* elementwise to column *col*, and appends the results
     (as a single column named '<col>_mod') to a destination table on
     BigQuery.

     INPUTS:
         func (python function): applied elementwise to each value of col
         col (str): column to transform; defaults to self.active_col
         columns: unused in this method -- TODO confirm / remove
         max_rows: unused in this method -- TODO confirm / remove
         fetch: unused in this method -- TODO confirm / remove
         dest: destination table spec; defaults to '<remote>_mod_<col>'
         chunksize (int): rows fetched per round trip
     OUTPUTS:
         combined_df: BQDF instance wrapping the destination table
     '''
     # TODO make work and allow user to provide arguments to function
     if col is None:
         col = self.active_col
     startrow = 0
     while startrow < len(self):
         # fetch the next chunk of rows from the remote table
         fields, data = self.con.client.ReadSchemaAndRows(
             bqutil.dictify(self.remote),
             start_row=startrow,
             max_rows=chunksize)
         ndf = bqresult_2_df(fields, data)
         ndf[col + '_mod'] = ndf[col].apply(func)
         if dest is None:
             dest = self.remote + '_mod_%s' % col
         # keep only the transformed column before writing back
         ndf = ndf[[col + '_mod']]
         _, _ = write_df_to_remote(self.con,
                                   ndf,
                                   overwrite_method='append',
                                   **bqutil.dictify(dest))
         startrow += chunksize
     if not self._check_write(dest):
         warnings.warn('remote writing of UDF apply function failed')
     combined_df = BQDF(self.con, dest)
     return combined_df
Example #2
0
 def groupby_apply(self, groupingcol, func, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None):
     ''' same as apply (python udf hack) but for groups analogous to df.groupby('col').apply(myfunc)

     groups data by grouping column and applies func to each group's data,
     appending the results to a destination table on BigQuery
     INPUTS:
         groupingcol (str): column to group on
         func (python function): takes arbitrary python function that acts on all data in a group
         columns (list): list of column names to touch with function
         dest: destination table spec; defaults to the remote table
               created when querying the first group
     OUTPUTS:
        gdf: BQDF instance for result
     '''
     # TODO make work and allow user to provide arguments to func
     if columns is None:
         # BUG FIX: was `columns is self.columns`, a no-op identity
         # comparison that left columns as None
         columns = self.columns
     for group in self.unique(groupingcol):
         # BUG FIX: the `%` formatting operator was missing, so the string
         # literal was being *called* (TypeError); also BigQuery SQL uses
         # `=` for equality, not `==`.
         # NOTE(review): assumes group values need no quoting -- confirm
         # for string-typed grouping columns
         group_query = "SELECT %s FROM %s WHERE %s = %s" % (
             ', '.join(columns), self.tablename, groupingcol, group)
         ndf = self.query(group_query, fetch=True, dest=dest)
         applied_ndf = func(ndf.local)
         if dest is None:
             # first group: reuse the remote table the query just created
             # as the append target (was a redundant second self.query call;
             # BUG FIX: also removed `dest = None` which clobbered the
             # caller-supplied dest parameter)
             dest = ndf.remote
         _, _ = write_df_to_remote(
             self.con, applied_ndf, overwrite_method='append', **bqutil.dictify(dest))
     if not self._check_write(dest):
         warnings.warn(
             'remote writing of UDF groupby-apply function failed')
     gdf = BQDF(self.con, '%s' % dest)
     return gdf
Example #3
0
 def slice(self, start=0, end=10):
     '''Return a BQDF wrapping rows [start, end) of the remote table.

     NOTE: the slice is pulled down locally and written back to a new
     remote table; check whether BigQuery offers a server-side way to
     do this instead.
     '''
     nrows = end - start
     schema, rows = self.con.client.ReadSchemaAndRows(
         bqutil.dictify(self.remote), start_row=start, max_rows=nrows)
     local_df = bqresult_2_df(schema, rows)
     dest = self.remote + '_slice_%sto%s' % (start, end)
     _ = write_df_to_remote(self.con, local_df, **bqutil.dictify(dest))
     if not self._check_write(dest):
         warnings.warn('failed to write new slice to bigquery')
     sliced = BQDF(self.con, dest)
     sliced.refresh()
     return sliced
Example #4
0
 def slice(self, start=0, end=10):
     '''Return a BQDF wrapping rows [start, end) of the remote table.

     Fetches the requested rows locally, writes them to a new remote
     table named '<remote>_slice_<start>to<end>', and returns a BQDF
     pointing at that table.
     '''
     # NOTE need to fit slice locally
     # see if there is a bigquery way to do this
     fields, data = self.con.client.ReadSchemaAndRows(bqutil.dictify(
         self.remote),
                                                      start_row=start,
                                                      max_rows=end - start)
     ndf = bqresult_2_df(fields, data)
     dest = self.remote + '_slice_%sto%s' % (start, end)
     _ = write_df_to_remote(self.con, ndf, **bqutil.dictify(dest))
     if not self._check_write(dest):
         warnings.warn('failed to write new slice to bigquery')
     ndf = BQDF(self.con, dest)
     # refresh so the returned BQDF reflects the newly written table
     ndf.refresh()
     return ndf
Example #5
0
 def apply(self, func, col=None, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None, chunksize=10000):
     '''Hacky python "UDF": stream the remote table locally in chunks,
     apply *func* elementwise to column *col* (default: self.active_col),
     and append the transformed column ('<col>_mod') back to BigQuery.
     Returns a BQDF wrapping the destination table.'''
     # TODO make work and allow user to provide arguments to function
     target_col = col if col is not None else self.active_col
     for offset in range(0, len(self), chunksize):
         fields, data = self.con.client.ReadSchemaAndRows(
             bqutil.dictify(self.remote), start_row=offset, max_rows=chunksize)
         chunk = bqresult_2_df(fields, data)
         chunk[target_col + '_mod'] = chunk[target_col].apply(func)
         if dest is None:
             dest = self.remote + '_mod_%s' % target_col
         # write back only the transformed column, appending per chunk
         _, _ = write_df_to_remote(
             self.con, chunk[[target_col + '_mod']],
             overwrite_method='append', **bqutil.dictify(dest))
     if not self._check_write(dest):
         warnings.warn('remote writing of UDF apply function failed')
     return BQDF(self.con, dest)
Example #6
0
 def groupby_apply(self,
                   groupingcol,
                   func,
                   columns=None,
                   max_rows=cfg.MAX_ROWS,
                   fetch=True,
                   dest=None):
     ''' same as apply (python udf hack) but for groups analogous to df.groupby('col').apply(myfunc)

     groups data by grouping column and applies func to each group's data,
     appending the results to a destination table on BigQuery
     INPUTS:
         groupingcol (str): column to group on
         func (python function): takes arbitrary python function that acts on all data in a group
         columns (list): list of column names to touch with function
         dest: destination table spec; defaults to the remote table
               created when querying the first group
     OUTPUTS:
        gdf: BQDF instance for result
     '''
     # TODO make work and allow user to provide arguments to func
     if columns is None:
         # BUG FIX: was `columns is self.columns`, a no-op identity
         # comparison that left columns as None
         columns = self.columns
     for group in self.unique(groupingcol):
         # BUG FIX: the `%` formatting operator was missing, so the string
         # literal was being *called* (TypeError); also BigQuery SQL uses
         # `=` for equality, not `==`.
         # NOTE(review): assumes group values need no quoting -- confirm
         # for string-typed grouping columns
         group_query = "SELECT %s FROM %s WHERE %s = %s" % (
             ', '.join(columns), self.tablename, groupingcol, group)
         ndf = self.query(group_query, fetch=True, dest=dest)
         applied_ndf = func(ndf.local)
         if dest is None:
             # first group: reuse the remote table the query just created
             # as the append target (was a redundant second self.query call;
             # BUG FIX: also removed `dest = None` which clobbered the
             # caller-supplied dest parameter)
             dest = ndf.remote
         _, _ = write_df_to_remote(self.con,
                                   applied_ndf,
                                   overwrite_method='append',
                                   **bqutil.dictify(dest))
     if not self._check_write(dest):
         warnings.warn(
             'remote writing of UDF groupby-apply function failed')
     gdf = BQDF(self.con, '%s' % dest)
     return gdf