def main():
    """Explore the public NOAA GOES-16 dataset on BigQuery.

    Lists the tables, prints the radiance table's schema preview, and
    estimates the cost of a bounding-box query over the continental US.
    """
    helper = BigQueryHelper(active_project="bigquery-public-data",
                            dataset_name="noaa_goes16")

    # Discover what the dataset contains before querying it.
    helper.list_tables()
    helper.table_schema('abi_l1b_radiance')
    print(helper.head("abi_l1b_radiance", num_rows=10))

    # Scenes whose footprint overlaps a lat/lon bounding box.
    query = """
        SELECT dataset_name, platform_id, scene_id
        FROM `bigquery-public-data.noaa_goes16.abi_l1b_radiance`
        WHERE geospatial_westbound_longitude<120
          and geospatial_eastbound_longitude>75
          and geospatial_northbound_latitude<50
          and geospatial_southbound_latitude>30
    """
    # Dry-run estimate only; no bytes are actually billed here.
    print("Query size in GB is %f " % helper.estimate_query_size(query))
class TestBQHelper(unittest.TestCase):
    """Integration tests for BigQueryHelper against live public BigQuery datasets.

    Requires network access and BigQuery credentials; most tests run real
    (tiny) queries against `bigquery-public-data.openaq`.
    """

    def setUp(self):
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Query randomized so it won't hit the cache across multiple test runs
        self.randomizable_query = """
            SELECT value FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE value = {0}"""

    def test_list_tables(self):
        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])

    def test_list_schema(self):
        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')), 11)

    def test_estimate_query_size(self):
        self.assertIsInstance(self.my_bq.estimate_query_size(self.query), float)

    def test_query_to_pandas(self):
        self.assertIsInstance(self.my_bq.query_to_pandas(self.query), DataFrame)

    def test_query_safe_passes(self):
        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query), DataFrame)

    def test_query_safe_fails(self):
        # Different query must be used for this test to ensure we don't hit the
        # cache and end up passing by testing a query that would use zero bytes.
        fail_query = self.randomizable_query.format(random())
        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))

    def test_head(self):
        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)

    # Fixed typo in the method name ("useage" -> "usage"); the `test_` prefix
    # is preserved so unittest discovery still picks it up.
    def test_usage_tracker(self):
        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        with self.assertRaises(BadRequest):
            self.my_bq.query_to_pandas("Not a valid query")

    def test_list_nested_schema(self):
        # github_repos.commits has a nested/repeated schema; 33 flattened fields.
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
import pandas as pd
import bq_helper
from bq_helper import BigQueryHelper

# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package

# Two equivalent helpers bound to the CMS Medicare public dataset.
medicare = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                    dataset_name="cms_medicare")
bq_assistant = BigQueryHelper("bigquery-public-data", "cms_medicare")

# Enumerate the dataset's tables, then preview 15 rows of the
# 2015 inpatient charges table.
bq_assistant.list_tables()
bq_assistant.head("inpatient_charges_2015", num_rows=15)
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go

# Render plotly figures inline in the notebook.
init_notebook_mode(connected=True)

# NOTE(review): `plt` is not imported in this view — presumably
# matplotlib.pyplot is imported elsewhere; confirm.
plt.rcParams['figure.figsize'] = (12, 5)

from google.cloud import bigquery
from bq_helper import BigQueryHelper

client = bigquery.Client()
# Helper bound to the public bitcoin blockchain dataset.
bq_assistant = BigQueryHelper("bigquery-public-data", "bitcoin_blockchain")


def satoshi_to_bitcoin(satoshi):
    """Convert a satoshi amount to bitcoin (1 BTC == 100,000,000 satoshi)."""
    return float(float(satoshi) / float(100000000))


# Small sample of the transactions table for inspection.
test_data = bq_assistant.head("transactions")

# NOTE(review): `DataSaver` is not defined in this view — confirm it is
# provided elsewhere in the notebook.
bq = DataSaver(bq_assistant)


def Create_Bar_plotly(list_of_tuples, items_to_show=40, title=""):
    """Build a plotly bar chart from (label, value) pairs.

    NOTE(review): the truncation to `items_to_show` is commented out below,
    so that parameter currently has no effect.
    """
    #list_of_tuples=list_of_tuples[:items_to_show]
    data = [
        go.Bar(x=[val[0] for val in list_of_tuples],
               y=[val[1] for val in list_of_tuples])
    ]
    layout = go.Layout(
        title=title,
        # tickangle=290 rotates labels so long names don't overlap.
        xaxis=dict(autotick=False, tickangle=290),
    )
    # NOTE(review): `fig` is constructed but neither returned nor plotted
    # within this view — the function body may continue beyond this chunk;
    # confirm before assuming the figure is discarded.
    fig = go.Figure(data=data, layout=layout)
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from google.cloud import bigquery
import bq_helper
from bq_helper import BigQueryHelper

# Helpers bound to the Stack Overflow public dataset.
stackOverflow = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                         dataset_name="stackoverflow")
bq_assistant = BigQueryHelper("bigquery-public-data", "stackoverflow")
bq_assistant.list_tables()

# All tables exported below.
tables = [
    'badges', 'comments', 'post_history', 'post_links', 'posts_answers',
    'posts_moderator_nomination', 'posts_orphaned_tag_wiki',
    'posts_privilege_wiki', 'posts_questions', 'posts_tag_wiki',
    'posts_tag_wiki_excerpt', 'posts_wiki_placeholder',
    'stackoverflow_posts', 'tags', 'users', 'votes'
]

# Dump a 5000-row sample of every table to a local CSV file.
for table_name in tables:
    sample = bq_assistant.head(table_name, num_rows=5000)
    sample.to_csv('stackoverflow_' + table_name + '.csv')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from google.cloud import bigquery
from bq_helper import BigQueryHelper

# Helper bound to the EPA historical air-quality public dataset.
bq_assist = BigQueryHelper(active_project='bigquery-public-data',
                           dataset_name='epa_historical_air_quality')

# Quick look at what the dataset holds.
bq_assist.list_tables()
bq_assist.head('temperature_daily_summary')

# Yearly average temperature; Fahrenheit -> Celsius conversion done in SQL.
query = """
    SELECT
        EXTRACT (YEAR FROM date_local) AS Year,
        AVG ((arithmetic_mean - 32.0)/ 1.80) AS avg_temp_celcius
    FROM `bigquery-public-data.epa_historical_air_quality.temperature_daily_summary`
    GROUP BY Year
    Order BY Year
"""
avg_temp = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)

# Yearly average carbon-monoxide reading.
query = """
    SELECT
        EXTRACT (YEAR FROM date_local) AS Year,
        AVG (arithmetic_mean) AS avg_co
    FROM `bigquery-public-data.epa_historical_air_quality.co_daily_summary`
    GROUP BY Year
    Order BY Year
"""
avg_co = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)