def describe(self):
    '''replicates df.describe() by returning a dataframe with summary
    measures for each numeric column

    A column is treated as numeric when its schema type contains 'INT',
    'LONG' or 'FLOAT'; every other column is skipped.

    Returns a pandas DataFrame with one column per numeric table column,
    indexed by the measure names listed in `rows`.

    NOTE(review): there are nine row labels but seven measure functions,
    so this relies on self.percentiles yielding three values (25th, 50th,
    75th) and the other measures yielding one each -- confirm.
    '''
    # TODO this is super inefficient. investigate percentile options.
    with bqutil.Mask_Printing():
        fields = self.table_schema()
    rows = [
        'count', 'min', '25th percentile', '50th percentile',
        '75th percentile', 'max', 'mean', 'std', 'mode'
    ]
    # Order must line up with `rows` above.
    measures = [
        self.count, self.min, self.percentiles, self.max, self.mean,
        self.std, self.mode
    ]
    numeric_markers = ('INT', 'LONG', 'FLOAT')
    describe_data = {}
    for f in fields:
        if not any(marker in f['type'] for marker in numeric_markers):
            continue
        column = []
        for func in measures:
            result = func(f['name'])
            try:
                # Multi-valued measures contribute several rows at once.
                column.extend(result)
            except TypeError:
                # Non-iterable (scalar) result: a single row. Only
                # TypeError is caught so real failures and
                # KeyboardInterrupt are no longer silently swallowed.
                column.append(result)
        describe_data[f['name']] = column
    return pd.DataFrame(data=describe_data, index=rows)
def join(self, df2, on=None, left_on=None, right_on=None, how='LEFT', dest=None, inplace=True):
    '''join this table (aliased df1) with the table behind df2 (aliased df2)

    on:       key column used for both sides when left_on is not given
    left_on:  key column on this table; when None, both keys are taken
              from `on` (any right_on passed is replaced as well)
    right_on: key column on df2
    how:      join type keyword placed before JOIN in the query
    dest:     destination passed to self.query; replaced by self.remote
              when inplace is true
    inplace:  when true the result overwrites self.remote, self.refresh()
              is called and nothing is returned; otherwise the object
              returned by self.query is returned and an existing
              destination is not overwritten ('fail')

    Columns present in both tables are selected once per side, as
    df1.<name> and df2.<name>.
    '''
    if inplace:
        dest = self.remote
    overwrite_method = 'overwrite' if inplace else 'fail'

    if left_on is None:
        left_on = on
        right_on = on

    # Names that occur in both tables must be qualified with their alias.
    shared = list(set(self.columns).intersection(set(df2.columns)))
    qualified = []
    for name in shared:
        qualified.append('df1.' + name)
        qualified.append('df2.' + name)

    selected = []
    for candidate in self.columns + df2.columns + qualified:
        if candidate not in shared:
            selected.append(candidate)

    select_clause = ', '.join(selected)
    join_query = "SELECT %s FROM %s df1 %s JOIN %s df2 ON df1.%s=df2.%s" % (
        select_clause, self.tablename, how, df2.tablename, left_on, right_on)

    with bqutil.Mask_Printing():
        joined = self.query(
            join_query,
            fetch=self.fetched,
            dest=dest,
            overwrite_method=overwrite_method)

    if not inplace:
        return joined
    self.refresh()
def query(self, querystr, fetch=cfg.FETCH_BY_DEFAULT, dest=None, fill=True, overwrite_method='fail'):
    '''run an arbitrary query string and wrap the outcome in a new BQDF

    querystr:         query text handed to raw_query unchanged
    fetch:            forwarded to raw_query; also recorded on self.fetched
                      and on the returned object
    dest:             forwarded to raw_query
    fill:             forwarded to the BQDF constructor
    overwrite_method: forwarded to raw_query

    Returns a BQDF built from the source reported by raw_query, with its
    `local` attribute set to raw_query's output.
    '''
    self.fetched = fetch
    raw_options = dict(
        dest=dest, fetch=fetch, overwrite_method=overwrite_method)
    with bqutil.Mask_Printing():
        output, source, exceeds_max = raw_query(
            self.con, querystr, self.last_modified, **raw_options)
        source_name = '%s' % bqutil.stringify(source)
        result_df = BQDF(self.con, source_name, fill=fill)
        result_df.local = output
        result_df.fetched = fetch
    if exceeds_max:
        # TODO figure how why exceeds_max isn't behaving as expected
        # print "Number of rows in remote table exceeds bqdf object's
        # max_rows. Only max_rows have been fetched locally"
        pass
    return result_df
def topk(self, k, col=None, fetch=True, dest=None):
    '''run a TOP(col, k) query with COUNT(*) on the table

    k:     number passed as the second argument of TOP()
    col:   column to rank; defaults to self.active_col when None
    fetch: forwarded to self.query (previously ignored; always True)
    dest:  forwarded to self.query (previously ignored)

    Returns the object produced by self.query for that statement.
    '''
    if col is None:
        col = self.active_col
    top_query = "SELECT TOP(%s, %s) %s, COUNT(*) as count FROM %s" % (
        col, k, col, self.tablename)
    with bqutil.Mask_Printing():
        # Honour the caller's fetch/dest instead of hard-coding fetch=True;
        # the defaults reproduce the old behaviour.
        ndf = self.query(top_query, fetch=fetch, dest=dest)
    return ndf
def __len__(self):
    '''length of table (# of rows)

    Reads 'numRows' from self.resource when that key exists; otherwise
    runs a COUNT(*) query against the table. Both paths return a plain
    int.
    '''
    try:
        return int(self.resource['numRows'])
    except KeyError:
        with bqutil.Mask_Printing():
            output, _source, _exceeds_max = raw_query(
                self.con, 'SELECT COUNT(*) FROM %s' % self.tablename,
                self.last_modified)
        # Cast like the primary path: len() requires a real integer, and
        # the raw cell taken from output.values may be a non-int scalar.
        return int(output.values[0][0])
def unique(self, col=None, fetch=True):
    '''distinct values of one column, obtained with a GROUP BY query

    col:   column to inspect; falls back to self.active_col when None
    fetch: forwarded to self.query

    Clears the active column, then returns the `.values` of that column
    from the query result's `local` attribute.
    '''
    target = self.active_col if col is None else col
    unique_query = "SELECT %s FROM %s GROUP BY %s" % (
        target, self.tablename, target)
    with bqutil.Mask_Printing():
        grouped = self.query(unique_query, fetch=fetch)
    self._clear_active_col()
    column_data = grouped.local[target]
    return column_data.values
def values(self, col=None):
    '''fetch every value of a single column

    col: column to select; falls back to self.active_col when None

    Returns the `.values` of that column from raw_query's output.
    '''
    target = self.active_col if col is None else col
    with bqutil.Mask_Printing():
        fetched = raw_query(
            self.con,
            "SELECT %s FROM %s" % (target, self.tablename),
            self.last_modified,
            fetch=True)
        # raw_query yields (output, source, exceeds_max); only output is used
        output, _source, _exceeds_max = fetched
    return output[target].values
def _head(self):
    '''run a LIMIT 5 select on the table and return raw_query's output'''
    preview_query = "SELECT * FROM %s LIMIT 5" % (self.tablename)
    with bqutil.Mask_Printing():
        preview, _source, _exceeds_max = raw_query(
            self.con, preview_query, self.last_modified)
    return preview