Example #1
0
 def describe(self):
     '''Replicate df.describe() for the numeric columns of the table.

     Walks the table schema and, for every field whose type string contains
     'INT', 'LONG' or 'FLOAT', calls count, min, percentiles, max, mean, std
     and mode on that column, in that order.

     Returns:
         pandas DataFrame with one column per numeric field, indexed by the
         measure names in `rows`.
     '''
     # TODO this is super inefficient. investigate percentile options.
     with bqutil.Mask_Printing():
         fields = self.table_schema()
     rows = [
         'count', 'min', '25th percentile', '50th percentile',
         '75th percentile', 'max', 'mean', 'std', 'mode'
     ]
     numeric_markers = ('INT', 'LONG', 'FLOAT')
     # Order must line up with `rows`; self.percentiles is presumably the
     # one call that yields several values (25th/50th/75th) -- TODO confirm.
     measures = [
         self.count, self.min, self.percentiles, self.max, self.mean,
         self.std, self.mode
     ]
     describe_data = {}
     for f in fields:
         if not any(marker in f['type'] for marker in numeric_markers):
             continue
         column = []
         for func in measures:
             result = func(f['name'])
             try:
                 # Multi-valued results are flattened into the column.
                 column.extend(result)
             except TypeError:
                 # Scalars are not iterable; store them as a single cell.
                 # Only TypeError is caught so real failures still surface.
                 column.append(result)
         describe_data[f['name']] = column
     return pd.DataFrame(data=describe_data, index=rows)
Example #2
0
 def join(self,
          df2,
          on=None,
          left_on=None,
          right_on=None,
          how='LEFT',
          dest=None,
          inplace=True):
     '''Join this table with the table referenced by df2.

     Columns present on both sides are selected twice, qualified as
     `df1.<name>` and `df2.<name>`; all other columns are selected by
     their bare name.

     Args:
         df2: object exposing `columns` and `tablename` for the right side.
         on: join key column name used for both sides when left_on is not
             given.
         left_on: join key column in this table.
         right_on: join key column in df2.
         how: join type keyword placed in front of JOIN in the query.
         dest: destination forwarded to query(); replaced by self.remote
             when inplace is True.
         inplace: when True, the query is run with
             overwrite_method='overwrite' against self.remote and this
             object is refreshed; when False, overwrite_method='fail' is
             used and the query result is returned.

     Returns:
         None when inplace is True, otherwise whatever self.query() returns.

     Raises:
         ValueError: if a join key cannot be determined for both sides.
     '''
     if left_on is None:
         left_on, right_on = on, on
     # Fail fast instead of sending "ON df1.None=df2.None" to the backend.
     if left_on is None or right_on is None:
         raise ValueError(
             "join requires either 'on' or both 'left_on' and 'right_on'")
     if inplace:
         dest = self.remote
         overwrite_method = 'overwrite'
     else:
         overwrite_method = 'fail'
     # Keep the shared columns in this table's column order so the generated
     # SQL is identical from run to run (set iteration order is not).
     right_names = set(df2.columns)
     dups = [c for c in self.columns if c in right_names]
     dup_names = set(dups)
     fulldups = [
         prefix + c for c in dups for prefix in ('df1.', 'df2.')
     ]
     allcols = [
         c for c in self.columns + df2.columns + fulldups
         if c not in dup_names
     ]
     join_query = "SELECT %s FROM %s df1 %s JOIN %s df2 ON df1.%s=df2.%s" % (
         ', '.join(allcols), self.tablename, how, df2.tablename, left_on,
         right_on)
     with bqutil.Mask_Printing():
         ndf = self.query(join_query,
                          fetch=self.fetched,
                          dest=dest,
                          overwrite_method=overwrite_method)
     if inplace:
         self.refresh()
     else:
         return ndf
Example #3
0
 def query(self,
           querystr,
           fetch=cfg.FETCH_BY_DEFAULT,
           dest=None,
           fill=True,
           overwrite_method='fail'):
     '''Execute an arbitrary query on the associated table.

     Stores `fetch` on self.fetched, runs `querystr` through raw_query and
     wraps the source it reports in a new BQDF. The new object's `local`
     is set to the raw_query output and its `fetched` to `fetch`.

     Returns:
         The newly created BQDF.
     '''
     self.fetched = fetch
     raw_options = dict(dest=dest,
                        fetch=fetch,
                        overwrite_method=overwrite_method)
     with bqutil.Mask_Printing():
         local_data, source_table, over_limit = raw_query(
             self.con, querystr, self.last_modified, **raw_options)
         source_name = '%s' % bqutil.stringify(source_table)
         result = BQDF(self.con, source_name, fill=fill)
         result.local, result.fetched = local_data, fetch
     if over_limit:
         # TODO figure how why exceeds_max isn't behaving as expected.
         # Intended warning: "Number of rows in remote table exceeds bqdf
         # object's max_rows. Only max_rows have been fetched locally"
         pass
     return result
Example #4
0
 def topk(self, k, col=None, fetch=True, dest=None):
     '''Query the top k values of a column together with their counts.

     Builds a `SELECT TOP(col, k) col, COUNT(*) as count` query against
     self.tablename and runs it through self.query().

     Args:
         k: number of values requested from TOP().
         col: column name; defaults to self.active_col when None.
         fetch: forwarded to query().
         dest: forwarded to query().

     Returns:
         Whatever self.query() returns for the TOP query.
     '''
     if col is None:
         col = self.active_col
     top_query = "SELECT TOP(%s, %s) %s, COUNT(*) as count FROM %s" % (
         col, k, col, self.tablename)
     with bqutil.Mask_Printing():
         # Honor the caller's fetch/dest instead of silently ignoring them.
         return self.query(top_query, fetch=fetch, dest=dest)
Example #5
0
 def __len__(self):
     '''Length of table (# of rows).

     Reads 'numRows' from self.resource when that key is present;
     otherwise runs a COUNT(*) query against self.tablename.

     Returns:
         The row count as an int.
     '''
     try:
         return int(self.resource['numRows'])
     except KeyError:
         with bqutil.Mask_Printing():
             output, _, _ = raw_query(
                 self.con, 'SELECT COUNT(*) FROM %s' % self.tablename,
                 self.last_modified)
         # Coerce like the fast path does: the raw cell may not be a plain
         # int, and len() requires an integer.
         return int(output.values[0][0])
Example #6
0
 def unique(self, col=None, fetch=True):
     '''Return the distinct values of a column.

     Groups the table by the column (self.active_col when `col` is None),
     clears the active column, and returns the `.values` of that column
     from the query result's `local` data.
     '''
     target = self.active_col if col is None else col
     group_sql = "SELECT %s FROM %s GROUP BY %s" % (
         target, self.tablename, target)
     with bqutil.Mask_Printing():
         grouped = self.query(group_sql, fetch=fetch)
     self._clear_active_col()
     distinct_column = grouped.local[target]
     return distinct_column.values
Example #7
0
 def values(self, col=None):
     '''Return the values of a single column.

     Selects the column (self.active_col when `col` is None) from
     self.tablename via raw_query with fetch=True and returns the
     `.values` of that column from the output.
     '''
     target = self.active_col if col is None else col
     with bqutil.Mask_Printing():
         frame, _source, _exceeds_max = raw_query(
             self.con,
             "SELECT %s FROM %s" % (target, self.tablename),
             self.last_modified,
             fetch=True)
     selected = frame[target]
     return selected.values
Example #8
0
 def _head(self):
     '''Return the raw_query output for a `LIMIT 5` select on the table.'''
     with bqutil.Mask_Printing():
         preview, _source, _exceeds_max = raw_query(
             self.con,
             "SELECT * FROM %s LIMIT 5" % self.tablename,
             self.last_modified)
     return preview