def main():
    """Walk through the NOAA GOES-16 public dataset with ``BigQueryHelper``.

    Calls ``list_tables()`` and ``table_schema('abi_l1b_radiance')`` (their
    return values are discarded), prints the first ten rows of
    ``abi_l1b_radiance``, and finally prints the value reported by
    ``estimate_query_size`` for a bounding-box query over that table.
    """
    goes16 = BigQueryHelper(dataset_name="noaa_goes16",
                            active_project="bigquery-public-data")

    # Results of these two calls are intentionally not captured.
    goes16.list_tables()
    goes16.table_schema('abi_l1b_radiance')

    preview = goes16.head("abi_l1b_radiance", num_rows=10)
    print(preview)

    # Scenes whose geospatial bounds fall inside the lon/lat box below.
    query = """
    SELECT dataset_name, platform_id, scene_id FROM `bigquery-public-data.noaa_goes16.abi_l1b_radiance` WHERE geospatial_westbound_longitude<120 and geospatial_eastbound_longitude>75 and geospatial_northbound_latitude<50 and geospatial_southbound_latitude>30
    """

    estimated_gb = goes16.estimate_query_size(query)
    print("Query size in GB is %f " % estimated_gb)
Beispiel #2
0
class TestBQHelper(unittest.TestCase):
    # Integration-style checks of BigQueryHelper against public datasets.
    # Comments are used instead of method docstrings so unittest's verbose
    # test descriptions stay exactly as before.

    def setUp(self):
        # A fresh helper for every test, bound to the public OpenAQ dataset.
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Template whose placeholder is filled with a random value so the
        # query won't hit the cache across multiple test runs.
        self.randomizable_query = """
            SELECT value FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE value = {0}"""

    def test_list_tables(self):
        table_names = self.my_bq.list_tables()
        self.assertEqual(table_names, ['global_air_quality'])

    def test_list_schema(self):
        schema = self.my_bq.table_schema('global_air_quality')
        self.assertEqual(len(schema), 11)

    def test_estimate_query_size(self):
        estimate = self.my_bq.estimate_query_size(self.query)
        self.assertIsInstance(estimate, float)

    def test_query_to_pandas(self):
        frame = self.my_bq.query_to_pandas(self.query)
        self.assertIsInstance(frame, DataFrame)

    def test_query_safe_passes(self):
        frame = self.my_bq.query_to_pandas_safe(self.query)
        self.assertIsInstance(frame, DataFrame)

    def test_query_safe_fails(self):
        # A never-before-seen query is required here: a cached query would
        # use zero bytes and slip under even this tiny scan limit.
        fail_query = self.randomizable_query.format(random())
        outcome = self.my_bq.query_to_pandas_safe(fail_query, 10**-10)
        self.assertIsNone(outcome)

    def test_head(self):
        preview = self.my_bq.head('global_air_quality')
        self.assertIsInstance(preview, DataFrame)

    def test_useage_tracker(self):
        # Run an uncached query, then check the usage counter is non-zero.
        uncached_query = self.randomizable_query.format(random())
        self.my_bq.query_to_pandas(uncached_query)
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        self.assertRaises(BadRequest, self.my_bq.query_to_pandas,
                          "Not a valid query")

    def test_list_nested_schema(self):
        # github_repos.commits is expected to report 33 schema entries.
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        commits_schema = nested_helper.table_schema('commits')
        self.assertEqual(len(commits_schema), 33)
Beispiel #3
0
import pandas as pd
import bq_helper
from bq_helper import BigQueryHelper
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
# Two helper handles on the same public CMS Medicare dataset. `medicare` is
# not used in the lines below; `bq_assistant` drives the exploration.
medicare = BigQueryHelper(active_project="bigquery-public-data",
                          dataset_name="cms_medicare")
bq_assistant = BigQueryHelper(active_project="bigquery-public-data",
                              dataset_name="cms_medicare")
bq_assistant.list_tables()
# Kept as a bare trailing expression so a notebook cell still renders it.
bq_assistant.head("inpatient_charges_2015", num_rows=15)
Beispiel #4
0
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode before any figure is drawn.
init_notebook_mode(connected=True)
# Default matplotlib figure size for the session.
# NOTE(review): `plt` is not imported in the visible lines -- presumably
# matplotlib.pyplot from an earlier cell; confirm.
plt.rcParams['figure.figsize'] = (12, 5)
from google.cloud import bigquery
from bq_helper import BigQueryHelper
# Raw BigQuery client; it is not referenced again in the visible lines.
client = bigquery.Client()
# Helper bound to the public bitcoin_blockchain dataset; used below.
bq_assistant = BigQueryHelper("bigquery-public-data", "bitcoin_blockchain")


def satoshi_to_bitcoin(satoshi):
    """Convert an amount in satoshi to bitcoin.

    ``satoshi`` may be anything ``float()`` accepts (number or numeric
    string). The result is always a ``float`` equal to the input divided
    by 100,000,000.
    """
    satoshi_per_bitcoin = 100000000.0
    return float(satoshi) / satoshi_per_bitcoin


# Preview of the "transactions" table, using head()'s default row count.
test_data = bq_assistant.head("transactions")

# NOTE(review): DataSaver is neither defined nor imported in the visible
# lines -- presumably a project wrapper around the helper; confirm.
bq = DataSaver(bq_assistant)


def Create_Bar_plotly(list_of_tuples, items_to_show=40, title=""):
    """Build a plotly bar figure from a sequence of indexable pairs.

    Element ``[0]`` of every entry becomes an x value and element ``[1]``
    the matching bar height. ``title`` is passed to the layout.

    ``items_to_show`` is accepted but currently unused: truncating the
    input to that many entries is disabled. In the visible code the figure
    is only constructed -- it is neither displayed nor returned.

    NOTE(review): newer plotly releases reportedly reject ``autotick`` in
    favour of ``tickmode`` -- confirm against the installed version.
    """
    x_values = [entry[0] for entry in list_of_tuples]
    bar_heights = [entry[1] for entry in list_of_tuples]
    bar_trace = go.Bar(x=x_values, y=bar_heights)

    # Show every tick label, rotated so long labels do not overlap.
    x_axis = dict(autotick=False, tickangle=290)
    layout = go.Layout(title=title, xaxis=x_axis)

    fig = go.Figure(data=[bar_trace], layout=layout)
Beispiel #5
0
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from google.cloud import bigquery

import bq_helper
from bq_helper import BigQueryHelper
# Two helper handles on the public Stack Overflow dataset. `stackOverflow`
# is not used in the lines below; `bq_assistant` does the work.
stackOverflow = BigQueryHelper(active_project="bigquery-public-data",
                               dataset_name="stackoverflow")

bq_assistant = BigQueryHelper(active_project="bigquery-public-data",
                              dataset_name="stackoverflow")
bq_assistant.list_tables()

# Tables to export, one CSV per table.
tables = [
    'badges',
    'comments',
    'post_history',
    'post_links',
    'posts_answers',
    'posts_moderator_nomination',
    'posts_orphaned_tag_wiki',
    'posts_privilege_wiki',
    'posts_questions',
    'posts_tag_wiki',
    'posts_tag_wiki_excerpt',
    'posts_wiki_placeholder',
    'stackoverflow_posts',
    'tags',
    'users',
    'votes',
]

# Request head(num_rows=5000) for each table and write it to
# stackoverflow_<table>.csv in the current working directory.
for t in tables:
    tag = bq_assistant.head(t, num_rows=5000)
    csv_name = 'stackoverflow_' + t + '.csv'
    tag.to_csv(csv_name)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from google.cloud import bigquery
from bq_helper import BigQueryHelper

# Helper bound to the public EPA historical air-quality dataset.
bq_assist = BigQueryHelper(active_project='bigquery-public-data',
                           dataset_name='epa_historical_air_quality')
bq_assist.list_tables()

# Peek at the daily temperature table before querying it.
bq_assist.head('temperature_daily_summary')

# Yearly average temperature. The (x - 32.0) / 1.80 expression is the
# Fahrenheit-to-Celsius formula -- presumably arithmetic_mean is stored in
# degrees Fahrenheit; confirm against the table schema.
query = """ SELECT EXTRACT (YEAR FROM date_local)  AS Year,
                   AVG ((arithmetic_mean - 32.0)/ 1.80)  AS avg_temp_celcius
            FROM  `bigquery-public-data.epa_historical_air_quality.temperature_daily_summary` 
            GROUP BY  Year
            Order BY  Year
"""
# Run through the "safe" entry point with max_gb_scanned=10.
# NOTE(review): presumably the result is None when the limit is exceeded,
# so callers should check before using avg_temp; confirm.
avg_temp = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)

# Yearly average of arithmetic_mean from the daily CO table. `query` is
# rebound here, so the temperature SQL above is no longer available by name.
query = """ SELECT EXTRACT (YEAR FROM date_local)  AS Year,
                   AVG (arithmetic_mean) AS avg_co
            FROM  `bigquery-public-data.epa_historical_air_quality.co_daily_summary` 
            GROUP BY  Year
            Order BY  Year
"""
# Same max_gb_scanned=10 limit as the temperature query.
avg_co = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)