''' pybq.bqdf: defines BQDF class which provides a vaguely pandas-esque
interface with a specific bigquery table
Note: some of this may become obsolete when https://github.com/pydata/pandas/blob/master/pandas/io/gbq.py
is fully developed and stable (for now gbq only provides basic read/write api)
'''
import pandas as pd
import numpy as np
import sys
import itertools
import bqutil
import bqviz
from time import time, sleep
import cfg
import warnings
import scipy.stats
import copy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
from core import run_query, fetch_query, Connection, create_column_from_values, bqresult_2_df, write_df_to_remote
# TODO:
# don't allow pandas column names that will cause trouble in the bq query
# (no commas, any other constraints?)
# think about the efficiency/pricing tradeoffs this approach makes. In many
# cases, what could be a larger (and possibly more efficient) single query is
# broken down into intermediate steps. But this can in principle have an
# efficiency advantage, since intermediate computations (e.g. a mean of some
# column) can be cached and reused for different higher-level operations. Need
# to understand pricing specifics more to optimize this.
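# A minimal usage sketch (illustrative only): assumes a Connection object from
# pybq.core and an existing table 'mydataset.mytable' with a numeric 'score'
# column and a string 'user' column; these names are placeholders, not part of
# this module.
#
#   from core import Connection
#   con = Connection(project_id='my-project')
#   df = BQDF(con, 'mydataset.mytable')
#   df.mean('score')                          # single aggregate, returned locally
#   df.where('score > 90')                    # new BQDF pointing at the filtered rows
#   df.groupby('user', [('score', 'mean')])   # new BQDF with one row per user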
##########################################################################
# ####################### BigQuery DataFrame Class #######################
##########################################################################
class BQDF(object):
    '''Reference to a bigquery table that provides quick and easy access
    to basic features of the table. Aims to replicate some of the basic functionality
    of pandas dataframe operations and interfaces, returning data in dataframe form'''
def __init__(self, con, tablename, max_rows=cfg.MAX_ROWS, fill=True, bucket=cfg.STORAGE_BUCKET):
'''initialize a reference to table'''
self.con = con
if ":" not in tablename:
tablename = "%s:%s" %(con.project_id, tablename)
self.tablename = '[%s]' % tablename
self.remote = tablename
print self.remote
self.resource = None
self.allstring = "SELECT * FROM [%s] LIMIT 1" % tablename
        self.local = None
        self.max_rows = max_rows
        self.fetched = False
        self.active_col = None
        self.bucket = bucket
        self.hidden = []
if fill:
self.resource = self.get_resource(self.remote)
self.local = self._head()
########################################
# ###### MAIN TABLE ACCESS #######
########################################
def __getitem__(self, index):
if isinstance(index, list):
return self.query('SELECT %s FROM %s' % (', '.join(index), self.tablename), fetch=self.fetched)
elif isinstance(index, tuple):
            raise ValueError(
                "Trying to access with a tuple. Did you mean %s?" % list(index))
elif index in self.columns:
self._set_active_col(index)
return self
elif isinstance(index, int):
return self._get_nth_row(index)
def __delitem__(self, colname):
'''Delete column from table'''
if colname not in self.columns:
raise NameError("%s is not a column in table" % colname)
newcols = [col for col in self.columns if col != colname]
        newdf = self.query('SELECT %s from %s' % (', '.join(
            newcols), self.tablename), fetch=False, dest=self.remote, overwrite_method='overwrite')
self.refresh()
def query(self, querystr, fetch=cfg.FETCH_BY_DEFAULT, dest=None, fill=True, overwrite_method='fail'):
        '''execute any arbitrary query on the associated table'''
self.fetched = fetch
with bqutil.Mask_Printing():
output, source, exceeds_max = raw_query(
self.con, querystr, self.last_modified, dest=dest, fetch=fetch, overwrite_method=overwrite_method)
new_bqdf = BQDF(self.con, '%s' % bqutil.stringify(source), fill=fill)
new_bqdf.local = output
new_bqdf.fetched = fetch
if exceeds_max:
            pass  # TODO figure out why exceeds_max isn't behaving as expected
# print "Number of rows in remote table exceeds bqdf object's
# max_rows. Only max_rows have been fetched locally"
return new_bqdf
def rename(self, rename_columns, inplace=False):
'''rename columns'''
# NOTE: involves full table scan
if inplace:
dest = self.remote
else:
dest = None
        cols = [col for col in self.columns if col not in rename_columns]
        renames = ["%s as %s" % (key, rename_columns[key])
                   for key in rename_columns]
        querystr = "SELECT %s FROM %s" % (
            ', '.join(renames + cols), self.tablename)
newdf = self.query(
querystr, fetch=self.fetched, dest=dest, overwrite_method='overwrite')
if not inplace:
return newdf
else:
self.refresh()
def add_col(self, column=None, content=1, replace=False, inplace=True):
'''add new column to the table '''
# TODO think about alternative implementation (major constraint: can't change existing rows of a table)
# TODO warn/prompt for permission to perform these more costly
# operations?
if column in self.columns:
if replace:
del self[column]
else:
raise NameError("%s is already a column in table" % column)
newremote, length = self._get_remote_reference(content, column)
# with bqutil.Mask_Printing():
df = self._add_col_join(newremote, column, inplace)
if not inplace:
return df
else:
self.refresh()
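    # Example (illustrative; 'constant_one', 'score', and 'score_z' are placeholder names):
    #   df.add_col(column='constant_one', content=1)               # constant-valued column
    #   df.add_col(column='score_z', content=df.zscore('score'))   # from a one-column BQDF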
def replace(self, column=None, content=1):
'''adds a column that replaces an existing column'''
self.add_col(column, content, replace=True)
def slice(self, start=0, end=10):
# NOTE need to fit slice locally
# see if there is a bigquery way to do this
fields, data = self.con.client.ReadSchemaAndRows(
bqutil.dictify(self.remote), start_row=start, max_rows=end - start)
ndf = bqresult_2_df(fields, data)
dest = self.remote + '_slice_%sto%s' % (start, end)
_ = write_df_to_remote(self.con, ndf, **bqutil.dictify(dest))
if not self._check_write(dest):
warnings.warn('failed to write new slice to bigquery')
ndf = BQDF(self.con, dest)
ndf.refresh()
return ndf
def save(self, project_id, dataset_id, table_id):
        '''copy table (useful for saving a bqdf currently pointing to a temporary table)'''
raise NotImplementedError
########################################
# ###### FILTER, GROUP, APPLY #######
########################################
def where(self, *args, **kwargs):
'''returns data filtered by where statements
INPUTS:
args (str): str specifying WHERE clause (e.g. score > 95, name == 'Amy')
kwargs: fetch (bool): specifies whether to fetch the data locally, dest (dict): specifies destination for results table
OUTPUTS:
ndf: BQDF instance for result
'''
fetch = kwargs.get('fetch', True)
dest = kwargs.get('dest', None)
columns = kwargs.get('columns', self.columns)
filter_query = "SELECT %s FROM %s WHERE %s" % (', '.join(columns),
self.tablename, _create_where_statement(args))
ndf = self.query(filter_query, fetch=fetch, dest=dest)
return ndf
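    # Example (illustrative; 'score' and 'name' are placeholder column names;
    # multiple args are combined into a single WHERE clause):
    #   passed = df.where("score > 95", "name == 'Amy'", columns=['name', 'score'])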
def groupby(self, groupingcol, operations, max_rows=cfg.MAX_ROWS, fetch=True, dest=None):
'''groups data by grouping column and performs requested operations on other columns
INPUTS:
groupingcol (str): column to group on
operations (list): list of tuples where tuple[0] are columns and tuple[1] are strings representing operations on those columns
OUTPUTS:
ndf: BQDF instance for result
'''
opmap = {'mean': 'AVG', 'std': 'STDDEV', 'sum': 'SUM',
'min': 'MIN', 'max': 'MAX', 'count': 'COUNT'}
operationpairs = []
for (key, val) in operations:
if val == 'sem':
                operationpairs.append(
                    "STDDEV(%s)/SQRT(COUNT(%s)) %s_sem " % (key, key, key))  # TODO figure out nan handling
else:
operationpairs.append(
"%s(%s) %s_%s " % (opmap[val], key, key, val))
grouping_query = "SELECT %s, %s FROM %s GROUP BY %s" % (
groupingcol, ', '.join(operationpairs), self.tablename, groupingcol)
ndf = self.query(grouping_query, fetch=fetch, dest=dest)
return ndf
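    # Example (illustrative; 'user' and 'score' are placeholder column names):
    #   df.groupby('user', [('score', 'mean'), ('score', 'sem'), ('score', 'count')])
    # yields one row per user with columns score_mean, score_sem, and score_count.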
def apply(self, func, col=None, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None, chunksize=10000):
        '''allows arbitrary python "UDFs" (in a majorly hacky way) by pulling each chunk of rows locally, applying the python function, then writing the results back to bq'''
# TODO make work and allow user to provide arguments to function
if col is None:
col = self.active_col
startrow = 0
while startrow < len(self):
fields, data = self.con.client.ReadSchemaAndRows(
bqutil.dictify(self.remote), start_row=startrow, max_rows=chunksize)
ndf = bqresult_2_df(fields, data)
ndf[col + '_mod'] = ndf[col].apply(func)
if dest is None:
dest = self.remote + '_mod_%s' % col
ndf = ndf[[col + '_mod']]
_, _ = write_df_to_remote(
self.con, ndf, overwrite_method='append', **bqutil.dictify(dest))
startrow += chunksize
if not self._check_write(dest):
warnings.warn('remote writing of UDF apply function failed')
combined_df = BQDF(self.con, dest)
return combined_df
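    # Example (illustrative; 'score' is a placeholder column name):
    #   squared = df.apply(lambda v: v ** 2, col='score', chunksize=50000)
    # writes a 'score_mod' column to a new table and returns a BQDF pointing at it.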
def groupby_apply(self, groupingcol, func, columns=None, max_rows=cfg.MAX_ROWS, fetch=True, dest=None):
''' same as apply (python udf hack) but for groups analogous to df.groupby('col').apply(myfunc)
# TODO make work and allow user to provide arguments
groups data by grouping column and performs requested operations on other columns
INPUTS:
groupingcol (str): column to group on
func (python function): takes arbitrary python function that acts on all data in a group
columns (list): list of column names to touch with function
OUTPUTS:
ndf: BQDF instance for result
'''
dest = None
        if columns is None:
            columns = self.columns
        for group in self.unique(groupingcol):
            group_query = "SELECT %s FROM %s WHERE %s == %s" % (
                ', '.join(columns), self.tablename, groupingcol, group)
ndf = self.query(group_query, fetch=True, dest=dest)
applied_ndf = func(ndf.local)
if dest is None:
gdf = self.query(group_query, fetch=True, dest=None)
dest = gdf.remote
_, _ = write_df_to_remote(
self.con, applied_ndf, overwrite_method='append', **bqutil.dictify(dest))
if not self._check_write(dest):
warnings.warn(
'remote writing of UDF groupby-apply function failed')
gdf = BQDF(self.con, '%s' % dest)
return gdf
def join(self, df2, on=None, left_on=None, right_on=None, how='LEFT', dest=None, inplace=True):
'''joins table with table referenced in df2 and optionally returns result'''
if inplace:
dest = self.remote
overwrite_method = 'overwrite'
else:
overwrite_method = 'fail'
if left_on is None:
left_on, right_on = on, on
dups = list(set(self.columns).intersection(set(df2.columns)))
fulldups = [x for j in [['df1.' + i, 'df2.' + i]
for i in dups] for x in j]
allcols = [
c for c in self.columns + df2.columns + fulldups if c not in dups]
join_query = "SELECT %s FROM %s df1 %s JOIN %s df2 ON df1.%s=df2.%s" % (', '.join(allcols),
self.tablename, how, df2.tablename, left_on, right_on)
with bqutil.Mask_Printing():
ndf = self.query(
join_query, fetch=self.fetched, dest=dest, overwrite_method=overwrite_method)
if inplace:
self.refresh()
else:
return ndf
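    # Example (illustrative; 'user_id' is a placeholder join key):
    #   combined = df.join(other_df, on='user_id', how='LEFT', inplace=False)
    # duplicate column names are kept as df1.<name> / df2.<name> in the result.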
def add_index(self, colname='index', inplace=False):
'''add an index (can be used to join and add new columns)'''
# TODO any way make this not involve full table scan for all columns?
if inplace:
dest = self.remote
overwrite_method = 'overwrite'
else:
dest = None
overwrite_method = 'fail'
ndf = self.query('SELECT ROW_NUMBER() OVER() %s, * from %s' % (colname,
self.tablename), fetch=self.fetched, dest=dest, overwrite_method=overwrite_method)
if inplace:
self.refresh()
else:
return ndf
def dropna(self, dest=None, overwrite_method='fail'):
filters = ["%s IS NOT NULL" % c for c in self.columns]
filterstr = ' AND '.join(filters)
ndf = self.query('SELECT * FROM %s WHERE %s' % (self.tablename, filterstr),
fetch=self.fetched, dest=dest, overwrite_method=overwrite_method)
return ndf
    def split_unstack(self, col=None, delimiter=' ', fetch=cfg.FETCH_BY_DEFAULT):
        if col is None:
            col = self.active_col
        ndf = self.query('SELECT SPLIT(%s, "%s") as split from %s' %
                         (col, delimiter, self.tablename), fetch=fetch, fill=False)
return ndf
    def sort_by(self, col, desc=True, columns=None, fetch=cfg.FETCH_BY_DEFAULT):
if columns is None:
columns = self.columns
if desc:
querystr = 'SELECT %s from %s ORDER BY %s DESC' % (
', '.join(columns), self.tablename, col)
else:
querystr = 'SELECT %s from %s ORDER BY %s' % (
', '.join(columns), self.tablename, col)
ndf = self.query(querystr, fetch=fetch, fill=False)
return ndf
########################################
# ###### PAIR OPERATIONS #######
########################################
def corr(self, col1, col2):
'''compute pearson correlation between two columns'''
ndf = self.query('SELECT CORR(%s, %s) from %s' %
(col1, col2, self.tablename), fetch=False, fill=False)
return ndf.local.values[0][0]
def cos(self, col1, col2):
'''compute cosine similarity between two columns'''
# cos-theta = dot(col1, col2) / ||col1|| * ||col2||)
dot = self.dot(col1, col2)
mag = self.query('SELECT SQRT(SUM(POW(%s,2))) * SQRT(SUM(POW(%s,2))) from %s WHERE %s is not NULL and %s is not NULL' %
(col1, col2, self.tablename, col1, col2), fetch=False, fill=False)
return dot / mag.local.values[0][0]
def dot(self, col1, col2):
'''compute dot product between two columns'''
ndf = self.query('SELECT SUM(%s * %s) from %s WHERE %s is not NULL and %s is not NULL' %
(col1, col2, self.tablename, col1, col2), fetch=False, fill=False)
return ndf.local.values[0][0]
def euclidean(self, col1, col2):
'''compute euclidean distance between two columns'''
ndf = self.query('SELECT SQRT(SUM(POW((%s-%s),2))) from %s WHERE %s is not NULL and %s is not NULL' %
(col1, col2, self.tablename, col1, col2), fetch=False, fill=False)
return ndf.local.values[0][0]
def contingency_mat(self, col1, col2, dest=None):
'''creates a contingency matrix for col1 and col2'''
contingency_query = "SELECT %s, %s, COUNT(*) count FROM %s GROUP BY %s, %s" % (
col1, col2, self.tablename, col1, col2)
ndf = self.query(contingency_query, fetch=True, dest=dest).local
return ndf.unstack(col2)
########################################
# ###### SERIES OPERATIONS #######
########################################
def subtract(self, col1, col2, fetch=cfg.FETCH_BY_DEFAULT):
ndf = self.query('SELECT %s - %s as diff from %s' %
(col1, col2, self.tablename), fetch=fetch, fill=False)
return ndf
def add(self, col1, col2, fetch=cfg.FETCH_BY_DEFAULT):
ndf = self.query('SELECT %s + %s as sum from %s' %
(col1, col2, self.tablename), fetch=fetch, fill=False)
return ndf
def divide(self, col1, col2, fetch=cfg.FETCH_BY_DEFAULT):
ndf = self.query('SELECT %s / %s as div from %s' %
(col1, col2, self.tablename), fetch=fetch, fill=False)
return ndf
def multiply(self, col1, col2, fetch=cfg.FETCH_BY_DEFAULT):
ndf = self.query('SELECT %s * %s as product from %s' %
(col1, col2, self.tablename), fetch=fetch, fill=False)
return ndf
def abs(self, col=None, fetch=cfg.FETCH_BY_DEFAULT):
'''compute absolute value of the column'''
if col is None:
col = self.active_col
ndf = self.query('SELECT ABS(%s) as abs from %s' %
(col, self.tablename), fetch=fetch, fill=False)
self._clear_active_col()
return ndf
def sqrt(self, col=None, fetch=cfg.FETCH_BY_DEFAULT):
'''compute square root of the column'''
if col is None:
col = self.active_col
ndf = self.query('SELECT SQRT(%s) as sqrt from %s' %
(col, self.tablename), fetch=fetch, fill=False)
self._clear_active_col()
return ndf
def round(self, col=None, dig=0, fetch=cfg.FETCH_BY_DEFAULT):
'''round column to specified digit'''
if col is None:
col = self.active_col
ndf = self.query('SELECT ROUND(%s, %s) as round from %s' %
(col, dig, self.tablename), fetch=fetch, fill=False)
self._clear_active_col()
return ndf
def pow(self, col=None, power=2, fetch=cfg.FETCH_BY_DEFAULT):
'''compute column values to specified power'''
if col is None:
col = self.active_col
ndf = self.query('SELECT pow(%s, %s) as pow from %s' %
(col, power, self.tablename), fetch=fetch, fill=False)
self._clear_active_col()
return ndf
def log(self, col=None, base='e', fetch=cfg.FETCH_BY_DEFAULT):
'''compute log of the column values'''
if col is None:
col = self.active_col
logs = {'e': 'ln', 2: 'log2', 10: 'log10'}
try:
func = logs[base]
except KeyError:
raise NameError("log base %s is not supported" % base)
ndf = self.query('SELECT %s(%s) as log from %s' %
(func, col, self.tablename), fetch=fetch, fill=False)
self._clear_active_col()
return ndf
def zscore(self, col=None, fetch=cfg.FETCH_BY_DEFAULT):
'''compute zscore of the column'''
if col is None:
col = self.active_col
avg = self.query('SELECT AVG(%s) from %s' %
(col, self.tablename), fetch=False).local.iloc[0, 0]
std = self.query('SELECT STDDEV(%s) from %s' %
(col, self.tablename), fetch=False).local.iloc[0, 0]
ndf = self.query('SELECT (%s-%s)/%s zscore from %s' %
(col, avg, std, self.tablename), fetch=fetch)
self._clear_active_col()
return ndf
def lower(self, col=None, fetch=cfg.FETCH_BY_DEFAULT):
if col is None:
col = self.active_col
ndf = self.query('SELECT LOWER(%s) as lower from %s' %
(col, self.tablename), fetch=fetch, fill=False)
return ndf
def upper(self, col=None, fetch=cfg.FETCH_BY_DEFAULT):
if col is None:
col = self.active_col
ndf = self.query('SELECT UPPER(%s) as upper from %s' %
(col, self.tablename), fetch=fetch, fill=False)
return ndf
def replace_str(self, string_to_replace, replacement_string, col=None, fetch=cfg.FETCH_BY_DEFAULT):
'''replace string_to_replace with replacement_string'''
if col is None:
col = self.active_col
ndf = self.query('SELECT REPLACE(%s, "%s", "%s") as replace from %s' %
(col, string_to_replace, replacement_string, self.tablename), fetch=fetch, fill=False)
return ndf
def str_index(self, search_string, col=None, fetch=cfg.FETCH_BY_DEFAULT):
        '''get 1-based index of first occurrence of search_string in the column (0 if not present)'''
if col is None:
col = self.active_col
ndf = self.query('SELECT INSTR(%s, "%s") as position from %s' %
(col, search_string, self.tablename), fetch=fetch, fill=False)
return ndf
########################################
# ###### AGGREGATE FUNCTIONS #######
########################################
def count(self, col=None):
'''return count of non-null entries in column'''
return self._simple_agg(col=col, operator='COUNT')
def min(self, col=None):
'''return minimum value of column'''
return self._simple_agg(col=col, operator='MIN')
def max(self, col=None):
'''return maximum value of column'''
return self._simple_agg(col=col, operator='MAX')
def mean(self, col=None):
'''return mean of column'''
return self._simple_agg(col=col, operator='AVG')
def sum(self, col=None):
'''return sum of column'''
return self._simple_agg(col=col, operator='SUM')
def std(self, col=None):
'''return standard deviation of column'''
return self._simple_agg(col=col, operator='STDDEV')
def sem(self, col=None):
'''return standard error of the mean of column'''
if col is None:
col = self.active_col
ndf = self.query('SELECT STDDEV(%s)/SQRT(COUNT(%s)) from %s' %
(col, col, self.tablename), fetch=False, fill=False)
self._clear_active_col()
return ndf.local.values[0][0]
def mode(self, col=None):
'''return mode of column (if multiple, returns first listed)'''
if col is None:
col = self.active_col
ndf = self.query('SELECT %s, COUNT(%s) as frequency from %s GROUP BY %s ORDER BY frequency DESC' % (
col, col, self.tablename, col), fetch=False)
self._clear_active_col()
return ndf.local.iloc[0, 0]
def median(self, col=None):
raise NotImplementedError
def percentiles(self, col=None):
        '''returns 25th, 50th, and 75th percentiles of column'''
if col is None:
col = self.active_col
        ndf = self.query(
            'SELECT QUANTILES(%s, 5) from %s' % (col, self.tablename), fetch=False)
try:
perc_25 = ndf.local.iloc[1, 0]
perc_50 = ndf.local.iloc[2, 0]
perc_75 = ndf.local.iloc[3, 0]
except IndexError:
perc_25, perc_50, perc_75 = np.nan, np.nan, np.nan
self._clear_active_col()
return perc_25, perc_50, perc_75
########################################
# ###### STATISTICS #######
########################################
# TODO: write tests for all stats and handle nan values correctly
def ttest_1samp(self, col, nullhypothesis=0):
#(mean-nullhyp)/(std/sqrt(n))
ndf = self.query(
'SELECT AVG(%s) as mean, STDDEV(%s) as std, COUNT(%s) as n from %s' % (col, col, col, self.tablename), fetch=False)
n = ndf.local.ix[0, 'n']
t = (ndf.local.ix[0, 'mean'] - nullhypothesis) / \
(ndf.local.ix[0, 'std'] / np.sqrt(n))
df = n - 1
pval = scipy.stats.t.sf(np.abs(t), df) * 2
return t, df, pval
def ttest_ind(self, col1, col2):
# pooled variance ttest
        # (mean_1 - mean_2) / (std_12 * sqrt(1/n_1 + 1/n_2))
        # std_12 = sqrt(((n_1-1)*std_1^2 + (n_2-1)*std_2^2) / (n_1 + n_2 - 2))
ndf = self.query(
'SELECT AVG(%s) as mean_1, STDDEV(%s) as std_1, COUNT(%s) as n_1, AVG(%s) as mean_2, STDDEV(%s) as std_2, COUNT(%s) as n_2 from %s' % (col1, col1, col1, col2, col2, col2, self.tablename), fetch=False)
df = ndf.local.ix[0, 'n_1'] + ndf.local.ix[0, 'n_2'] - 2
        std_12 = np.sqrt(((ndf.local.ix[0, 'n_1'] - 1) * ndf.local.ix[0, 'std_1'] ** 2 + (
            ndf.local.ix[0, 'n_2'] - 1) * ndf.local.ix[0, 'std_2'] ** 2) / df)
        t = (ndf.local.ix[0, 'mean_1'] - ndf.local.ix[0, 'mean_2']) / \
            (std_12 *
             np.sqrt(1.0 / ndf.local.ix[0, 'n_1'] + 1.0 / ndf.local.ix[0, 'n_2']))
pval = scipy.stats.t.sf(np.abs(t), df) * 2
return t, df, pval
def ttest_rel(self, col1, col2):
#(meandiff)/(std_diff/sqrt(n))
ndf = self.query(
'SELECT AVG(diff) as mean_diff, STDDEV(diff) as std_diff, COUNT(diff) as n from (SELECT %s-%s as diff FROM %s)' % (col1, col2, self.tablename), fetch=False)
n = ndf.local.ix[0, 'n']
t = ndf.local.ix[0, 'mean_diff'] / \
(ndf.local.ix[0, 'std_diff'] / np.sqrt(n))
df = n - 1
pval = scipy.stats.t.sf(np.abs(t), df) * 2
return t, df, pval
def chi_square(self, col1, col2):
col1levels = self[col1].unique()
col2levels = self[col2].unique()
df = (len(col1levels) - 1) * (len(col2levels) - 1)
n = len(self)
combos = []
chi = 0
for c1 in col1levels:
for c2 in col2levels:
if (c1, c2) not in combos:
                    combos.append((c1, c2))
for level1, level2 in combos:
n1 = len(self.where('%s = %s' % (col1, level1), columns=[col1]))
n2 = len(self.where('%s = %s' % (col2, level2), columns=[col2]))
            expected = float(n1 * n2) / n
observed = len(
self.where('%s==%s' % (col1, level1), '%s==%s' % (col2, level2), columns=[col1]))
new = ((observed - expected) ** 2) / expected
chi += new
        pval = scipy.stats.chi2.sf(chi, df)
return chi, df, pval
def binomial(self, col, success=1, p=.5):
successes = self.where('%s = %s' % (col, success))
n = len(successes)
k = len(self)
pval = scipy.stats.binom_test(n, k, p)
return float(n) / k, pval
def pearsonr(self, col1, col2):
r = self.corr(col1, col2)
n = len(self[[col1, col2]].dropna())
        df = n - 2
t = r * np.sqrt(df) / np.sqrt(1 - r ** 2)
pval = scipy.stats.t.sf(np.abs(t), df) * 2
return r, df, pval
def onewayanova(self, valuecol, factor):
raise NotImplementedError
def twowayanova(self, valuecol, factor1, factor2):
raise NotImplementedError
def rmanova(self, valuecol, withinfactor, betweenfactor):
raise NotImplementedError
    def linear_regression(self, y, xcols):
        if len(xcols) == 1:
            output = self._simple_linear_regression(y, xcols[0])
            return output
        raise NotImplementedError("multiple regression is not yet supported")
    def _simple_linear_regression(self, y, x):
        querystr = ("SELECT (AVG(%s*%s) - AVG(%s)*AVG(%s))/VARIANCE(%s) as slope, "
                    "AVG(%s) - ((AVG(%s*%s) - AVG(%s)*AVG(%s))/VARIANCE(%s)) * AVG(%s) as intercept "
                    "FROM %s") % (x, y, x, y, x, y, x, y, x, y, x, x, self.tablename)
        ndf = self.query(querystr, fetch=False)
        return ndf
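    # The query above uses the closed-form least-squares solution for one predictor:
    #   slope = cov(x, y) / var(x) = (E[xy] - E[x]E[y]) / var(x)
    #   intercept = E[y] - slope * E[x]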
########################################
# ##### DATETIME FUNCTIONALITY #######
########################################
# TODO
# easy time windowing
# dal, wal, etc.
########################################
# ##### BASIC EXPLORATION FUNCS #######
########################################
def _head(self):
with bqutil.Mask_Printing():
output, source, _ = raw_query(
self.con, "SELECT * FROM %s LIMIT 5" % (self.tablename), self.last_modified)
return output
def head(self):
return self.local.head()
@property
def values(self, col=None):
'''return values from single column'''
if col is None:
col = self.active_col
with bqutil.Mask_Printing():
output, source, exceeds_max = raw_query(
self.con, "SELECT %s FROM %s" % (col, self.tablename), self.last_modified, fetch=True)
return output[col].values
@property
def columns(self):
'''returns list of column names from table'''
return [f['name'] for f in self.resource['schema']['fields'] if f['name']]
def table_schema(self):
'''prints datatypes and other settings for each column'''
fields = self.resource['schema']['fields']
for f in fields:
others = [
"%s-%s" % (key, val) for key, val in f.items() if key not in ['type', 'name']]
print "%s (%s) : %s" % (f['name'], f['type'], ', '.join(others))
return fields
def describe(self):
'''replicates df.describe() by returning a dataframe with summary measures for each numeric column'''
# TODO this is super inefficient. investigate percentile options.
with bqutil.Mask_Printing():
fields = self.table_schema()
describe_data = {}
rows = ['count', 'min', '25th percentile', '50th percentile',
'75th percentile', 'max', 'mean', 'std', 'mode']
for f in fields:
if 'INT' in f['type'] or 'LONG' in f['type'] or 'FLOAT' in f['type']:
column = []
for func in [self.count, self.min, self.percentiles, self.max, self.mean, self.std, self.mode]:
result = func(f['name'])
try:
column.extend(result)
except:
column.append(result)
describe_data[f['name']] = column
return pd.DataFrame(data=describe_data, index=rows)
def unique(self, col=None, fetch=True):
'''find unique values in the requested column'''
if col is None:
col = self.active_col
unique_query = "SELECT %s FROM %s GROUP BY %s" % (
col, self.tablename, col)
with bqutil.Mask_Printing():
ndf = self.query(unique_query, fetch=fetch)
self._clear_active_col()
return ndf.local[col].values
def topk(self, k, col=None, fetch=True, dest=None):
if col is None:
col = self.active_col
top_query = "SELECT TOP(%s, %s) %s, COUNT(*) as count FROM %s" % (col,
k, col, self.tablename)
with bqutil.Mask_Printing():
ndf = self.query(top_query, fetch=True)
return ndf
def sample(self, uniquecol, p=.10, columns=None):
'''generate a random sample of the data'''
if columns is None:
columns = self.columns
modval = int(round(1 / p))
        querystr = 'SELECT %s FROM %s WHERE HASH(STRING(%s)) %% %s == 0' % (
            ', '.join(columns), self.tablename, uniquecol, modval)
return self.query(querystr)
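    # Sampling note: HASH(STRING(uniquecol)) % modval == 0 keeps roughly a
    # 1/modval fraction of the distinct uniquecol values, so p is approximate
    # and all rows sharing a sampled value are kept together.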
########################################
# ###### VISUALIZATION #######
########################################
def corr_mat(self, plot=True):
'''compute correlation matrix between all numeric table columns'''
numerics = [col for col in self.columns if self.local[
col].dtype in (np.int64, np.float64)]
mat = pd.DataFrame(columns=numerics, index=numerics, data=1)
for col1, col2 in itertools.combinations(numerics, 2):
r = self.corr(col1, col2)
mat.loc[col1, col2] = r
mat.loc[col2, col1] = r
if plot:
bqviz.plot_matrix(mat)
return mat
def qqplot(self, col=None):
if col is None:
col = self.active_col
raise NotImplementedError
def gridplot(self):
return bqviz._gridplot(self)
def plot(self, grouping_col, value_col, kind='bar', ax=None):
'''plots the mean of value_col (Y), broken down by grouping_col (X) and returns plot axis'''
plotdf = self.groupby(
grouping_col, [(value_col, 'mean'), (value_col, 'std'), (value_col, 'count')])
return bqviz._plot_grouped_data(plotdf.local, value_col, grouping_col, kind=kind)
def hist(self, col=None, bins=20, ax=None):
'''plots a histogram of the desired column, returns the df used for plotting'''
if col is None:
col = self.active_col
binbreaks = self._get_binbreaks(col, bins=bins)
countstr = _create_full_str(col, binbreaks, kind='count')
querystr = 'SELECT %s FROM %s' % (countstr, self.tablename)
ndf = self.query(querystr)
bqviz.plot_hist(ndf.local, col, ax=ax)
self._clear_active_col()
return ndf.local.T
def scatter(self, x=None, y=None, bins=200, ax=None):
        '''plots a scatter plot of x vs y (downsampled if data.size > bins), returns the series used for plotting'''
        if self.__len__() > bins:
            binbreaks = self._get_binbreaks(x, bins=bins)
            meanstr = _create_full_str(x, binbreaks, kind='mean', ycol=y)
            scatterdf = self.query('SELECT %s FROM %s' % (meanstr, self.tablename))
            countstr = _create_full_str(x, binbreaks, kind='count')
            counts = self.query('SELECT %s FROM %s' % (countstr, self.tablename)).local.T[0]
            # NOTE: assumes _create_full_str also supports kind='std' for per-bin spread
            errorstr = _create_full_str(x, binbreaks, kind='std', ycol=y)
            error = self.query('SELECT %s FROM %s' % (errorstr, self.tablename)).local.T[0]
            sems = [s / np.sqrt(c) for s, c in zip(error, counts)]
            plotdf = bqviz.plot_scatter(
                scatterdf.local, x, y, ax=ax, downsampled=True, error=sems, counts=counts)
else:
querystr = 'SELECT %s, %s FROM %s' % (x, y, self.tablename)
scatterdf = self.query(querystr)
plotdf = bqviz.plot_scatter(
scatterdf.local, x, y, ax=ax, downsampled=False)
self._clear_active_col()
return plotdf
########################################
# ###### TABLE PROPERTIES #######
########################################
@property
def size(self):
'''returns size of the table (# rows, # columns)'''
return (int(self.resource['numRows']), len(self.resource['schema']['fields']))
@property
def last_modified(self):
self.resource = self.get_resource(self.remote)
print bqutil.convert_timestamp(self.resource['lastModifiedTime'])
return float(self.resource['lastModifiedTime'])
@property
def creation_time(self):
'''Creation time for the table'''
print bqutil.convert_timestamp(self.resource['creationTime'])
@property
def expiration_time(self):
'''Expiration time for the table'''
try:
print bqutil.convert_timestamp(self.resource['expirationTime'])
except KeyError:
warnings.warn("No expiration set")
def __len__(self):
'''length of table (# of rows)'''
try:
return int(self.resource['numRows'])
except KeyError:
with bqutil.Mask_Printing():
output, source, exceeds_max = raw_query(
self.con, 'SELECT COUNT(*) FROM %s' % self.tablename, self.last_modified)
return output.values[0][0]
def footprint(self):
        '''check size of table in megabytes'''
return float(self.resource['numBytes']) / 1048576
########################################
# ###### UTILITIES #######
########################################
def flush(self):
'''flush cache (will not affect bigquery cache, only affects local caching to prevent excessive network burden)'''
self.con.flush_cache()
self.resource = self.get_resource(self.remote)
def get_resource(self, remote):
'''fetch info about remote table'''
return self.con.client._apiclient.tables().get(**bqutil.dictify(remote)).execute()
def refresh(self):
'''refresh the local state of the table'''
if self.fetched:
self.fetch()
else:
self.local = self._head()
def fetch(self):
        '''fetch the table contents locally (rewrites the remote table with the columns in bqdf.columns)'''
ndf = self.query('select %s from %s' % (', '.join(
self.columns), self.tablename), fetch=True, dest=self.remote, overwrite_method='overwrite')
self.fetched = True
self.local = ndf.local
########################################
# ###### INTERNAL METHODS #######
########################################
def _get_remote_reference(self, content, column):
'''get reference to a remote table containing new column content'''
if isinstance(content, BQDF):
if len(content.columns) > 1:
raise ValueError(
"trying to add multiple column bqdf as single column.")
colname = content.columns[0]
tempdf = self.query('SELECT %s as %s from %s' %
(colname, column, content.tablename))
length = len(tempdf)
newremote = tempdf.remote
else:
length = len(self)
newremote = create_column_from_values(
self.con, column, content, self.remote, length=length)
return newremote, length
def _check_write(self, newremote, timeout=10):
        '''wait until a newly created table has finished inserting (returns True once rows are visible, False on timeout)'''
loaded = False
start_time = time()
elapsed_time = 0
while not loaded:
if elapsed_time < timeout:
resource = bqutil.get_table_resource(
self.con, bqutil.dictify(newremote))
# won't contain this attribute while actively streaming
# insertions
if 'numRows' in resource:
if int(resource['numRows']) > 0:
return True
elapsed_time = time() - start_time
sleep(.5)
else:
return False
def _set_active_col(self, col):
'''sets the "active column" to use for subsequent operation'''
self.active_col = col
return self
def _clear_active_col(self):
'''clears the active column'''
self.active_col = None
def _limit_columns(self, columns, fetch=cfg.FETCH_BY_DEFAULT, dest=None):
        '''create new bqdf limited to these columns'''
ndf = self.query('select %s from %s' %
(', '.join(columns), self.tablename), fetch=fetch)
return ndf
def _simple_agg(self, col=None, operator='COUNT'):
# TODO figure out nan handling
if col is None:
col = self.active_col
ndf = self.query('SELECT %s(%s) from %s' %
(operator, col, self.tablename), fetch=False, fill=False)
self._clear_active_col()
return ndf.local.values[0][0]
def _get_binbreaks(self, col, bins=20):
'''computes breakpoints for binning of data in column'''
maxval = self.max(col)
minval = self.min(col)
interval = float(maxval - minval) / bins
binbreaks = [minval]
val = minval
for i in range(bins):
val = val + interval
binbreaks.append(val)
return binbreaks
def _get_nth_row(self, n):
        fields, data = self.con.client.ReadSchemaAndRows(
            bqutil.dictify(self.remote), start_row=n, max_rows=1)
result = {f['name']: d for f, d in zip(fields, data[0])}
return result
def _add_col_join(self, newdfname, newcol, inplace):
allcols = [col for col in self.columns + [newcol] if col != 'index']
querystr = 'SELECT %s FROM (SELECT ROW_NUMBER() OVER() as index, * FROM %s) as tb1 JOIN(SELECT ROW_NUMBER() OVER() as index, * from %s) tb2 on tb1.index==tb2.index' % (
', '.join(allcols), newdfname, self.tablename)
if inplace:
dest = self.remote
else:
dest = None
print querystr
df, _, _ = raw_query(self.con, querystr, self.last_modified,
dest=dest, overwrite_method='overwrite', fetch=self.fetched)
return df
##########################################################################
############################################ SUPPLEMENTAL FUNCTIONS ######
##########################################################################
def raw_query(con, querystr, last_modified, dest=None, max_rows=cfg.MAX_ROWS, fetch=cfg.FETCH_BY_DEFAULT, overwrite_method='fail'):
'''executes a query and returns the results or a result sample as a pandas df and the destination table as a dict
INPUTS:
querystr (str):
dest (dict): specify destination table for output of query (if None, BQ creates a temporary (24hr) table)
max_rows (int): max number of rows that the con will return in the results
fetch (bool): if True, fetch the full resultset locally, otherwise return only a sample of the first 5 rows
OUTPUTS:
    result (pandas dataframe): dataframe containing the full query results (if fetch==True) or just the first 5 rows of the resultset
destinationtable (dict): remote table that contains the query results
'''
exists = con._check_query(querystr, fetch, last_modified)
if overwrite_method == 'append':
write_disposition = 'WRITE_APPEND'
elif overwrite_method == 'overwrite':
write_disposition = 'WRITE_TRUNCATE'
else:
write_disposition = 'WRITE_EMPTY'