Example #1
import unittest
from random import random

from pandas import DataFrame
from google.api_core.exceptions import BadRequest

from bq_helper import BigQueryHelper


class TestBQHelper(unittest.TestCase):
    def setUp(self):
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Query randomized so it won't hit the cache across multiple test runs
        self.randomizable_query = """
            SELECT value FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE value = {0}"""

    def test_list_tables(self):
        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])

    def test_list_schema(self):
        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')),
                         11)

    def test_estimate_query_size(self):
        self.assertIsInstance(self.my_bq.estimate_query_size(self.query),
                              float)

    def test_query_to_pandas(self):
        self.assertIsInstance(self.my_bq.query_to_pandas(self.query),
                              DataFrame)

    def test_query_safe_passes(self):
        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query),
                              DataFrame)

    def test_query_safe_fails(self):
        # Different query must be used for this test to ensure we don't hit the
        # cache and end up passing by testing a query that would use zero bytes.
        fail_query = self.randomizable_query.format(random())
        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))

    def test_head(self):
        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)

    def test_usage_tracker(self):
        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        with self.assertRaises(BadRequest):
            self.my_bq.query_to_pandas("Not a valid query")

    def test_list_nested_schema(self):
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
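
# A minimal entry point for running this suite directly (assuming the file is
# saved as, e.g., test_bq_helper.py -- the filename is hypothetical):
if __name__ == '__main__':
    unittest.main()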
Example #2
import pandas as pd
# https://github.com/SohierDane/BigQuery_Helper
from bq_helper import BigQueryHelper

bq_assistant = BigQueryHelper("bigquery-public-data", "github_repos")

QUERY = """
        SELECT *
        FROM `bigquery-public-data.github_repos.languages`
        """
df = bq_assistant.query_to_pandas_safe(QUERY)


def proc_language(row):
    return list(map(lambda x: x['name'], row['language']))


df['language'] = df.apply(proc_language, axis=1)

df.to_csv('language_proccessed.csv')

from ast import literal_eval

count = {}
byte_count = {}


def stats(row):
    global count
    global byte_count
    # Assumed completion (the original is cut off here): tally how often each
    # language name appears. Populating byte_count would require keeping the
    # 'bytes' field, which proc_language above discards.
    for lang in literal_eval(row['language']):
        count[lang] = count.get(lang, 0) + 1
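

# Hypothetical usage of stats: lists round-trip through CSV as strings (hence
# literal_eval above), so re-read the processed file and report top languages.
df = pd.read_csv('language_proccessed.csv')
df.apply(stats, axis=1)
print(sorted(count.items(), key=lambda kv: kv[1], reverse=True)[:10])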
Example #3
import pandas as pd
import matplotlib.pyplot as plt
from urllib import request
from bq_helper import BigQueryHelper

# `train` and `descrips` are DataFrames created earlier in the source kernel.
train = train.merge(descrips, how='left', on='LabelName')
train.head(9)

open_images = BigQueryHelper(active_project="bigquery-public-data",
                             dataset_name="open_images")
query = """
            SELECT image_id, original_url 
            FROM `bigquery-public-data.open_images.images` 
            WHERE image_id IN UNNEST(['0199bc3e1db115d0',
                                      '4fa8054781a4c382',
                                      '51c5d8d5d9cd87ca',
                                      '9ec02b5c0315fcd1',
                                      'b37f763ae67d0888',
                                      'ddcb4b7478e9917b'])
        """
urls = open_images.query_to_pandas_safe(query)

boxes = pd.read_csv('../input/train_bounding_boxes.csv')
boxes = boxes[boxes.ImageID.isin(
    urls.image_id.tolist())].sort_values('ImageID')


imlist = urls.image_id.tolist()
files = ['../input/openimages-support/ims/{}.jpg'.format(i) for i in imlist]
fig, ax = plt.subplots()
fig.set_size_inches((15, 15))
ax.set_axis_off()
for n, (file, image) in enumerate(zip(files, imlist)):
    a = fig.add_subplot(2, 3, n + 1)
    req = request.urlopen(urls.original_url[n])
    # Assumed completion (the original is cut off here): render each image.
    a.imshow(plt.imread(req, format='jpg'))
    a.set_axis_off()
Example #4
# Pulls data from the 4 TB GitHub dataset on BigQuery:
# https://www.kaggle.com/github/github-repos
# Create a kernel at
# https://www.kaggle.com/mrisdal/safely-analyzing-github-projects-popular-licenses
# and run this code there rather than locally.

import pandas as pd
from bq_helper import BigQueryHelper
bq_assistant = BigQueryHelper("bigquery-public-data", "github_repos")

QUERY = """
        SELECT author, subject, repo_name
        FROM `bigquery-public-data.github_repos.commits`
        LIMIT 250000
        """

print(f"Using {bq_assistant.estimate_query_size(QUERY)} GB / 5 TB Limit")

df = bq_assistant.query_to_pandas_safe(QUERY, max_gb_scanned=125)
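# query_to_pandas_safe returns None rather than a DataFrame when the estimated
# scan exceeds max_gb_scanned (see the tests in Example #1), so it is worth
# guarding before touching the result:
if df is None:
    raise RuntimeError('Query would scan more than 125 GB and was skipped.')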
print(df.head())
print(
    f'Size of dataframe: {int(df.memory_usage(index=True, deep=True).sum()) / 2**20} MB'
)

df.to_csv('commits.csv')
Example #5
QUERY = """
    SELECT
        country, 
        avg(value) as o3_avg_value
    FROM
      `bigquery-public-data.openaq.global_air_quality`
    WHERE
      pollutant = 'o3'
      AND country != 'NL'
      AND unit = 'µg/m³'
    GROUP BY country
    ORDER BY o3_avg_value ASC
        """

df_all = bq_assistant.query_to_pandas_safe(QUERY)


QUERY = """
    SELECT
        country, 
        avg(value) as no2_avg_value
    FROM
      `bigquery-public-data.openaq.global_air_quality`
    WHERE
      pollutant = 'no2'
      AND unit = 'µg/m³'
    GROUP BY country
    ORDER BY no2_avg_value ASC
        """
Example #6
from bq_helper import BigQueryHelper
from google.cloud import bigquery

# Before executing this code, export your service-account key:
#   export GOOGLE_APPLICATION_CREDENTIALS="path to your key.json"

client = bigquery.Client(project='Oceans')

bq_assistant = BigQueryHelper(active_project="bigquery-public-data",
                              dataset_name="noaa_icoads")

bq_assistant.list_tables()
bq_assistant.table_schema('icoads_core_2005')

query = """SELECT latitude, longitude, wind_direction_true, wind_speed, present_weather, sea_level_pressure, air_temperature,  wetbulb_temperature, sea_surface_temp,
            total_cloud_amount, cloud_height, wave_direction, wave_period, wave_height, swell_direction, swell_period, swell_height, timestamp
            FROM `bigquery-public-data.noaa_icoads.icoads_core_2005`
                        """

# Execute the query.
res = bq_assistant.query_to_pandas_safe(query)

# Save the result.
res.to_csv("resultat_total_20052.csv", sep=',')
Example #7
from bq_helper import BigQueryHelper

# The beginning of this snippet is truncated in the source; only enough is
# reconstructed here to make the query below syntactically complete.
query = """
SELECT
  to_address,
  value,
  receipt_contract_address,
  receipt_status,
  block_number
FROM
  `bigquery-public-data.ethereum_blockchain.transactions` AS transactions
WHERE TRUE
  AND block_timestamp < '2017-01-01 00:00:00'
"""
# This establishes an authenticated session and prepares a reference to the dataset that lives in BigQuery.
bq_assistant = BigQueryHelper("bigquery-public-data", "ethereum_blockchain")
print("estimated query data size: " +
      str(bq_assistant.estimate_query_size(query)))

df = bq_assistant.query_to_pandas_safe(query, max_gb_scanned=60)

df.to_csv(
    "/home/yueduan/yueduan/postdoc_study/ether_transactions_before2017.csv")

Example #8

# Import dataset from BigQuery.
from google.cloud import bigquery
from bq_helper import BigQueryHelper
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('ggplot')
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

# This establishes an authenticated session and prepares a reference to the
# dataset that lives in BigQuery.
bq_assist = BigQueryHelper(active_project='bigquery-public-data',
                           dataset_name='epa_historical_air_quality')
bq_assist.list_tables()

bq_assist.head('temperature_daily_summary')

query = """ SELECT EXTRACT (YEAR FROM date_local)  AS Year,
                   AVG ((arithmetic_mean - 32.0)/ 1.80)  AS avg_temp_celcius
            FROM  `bigquery-public-data.epa_historical_air_quality.temperature_daily_summary` 
            GROUP BY  Year
            Order BY  Year
"""
avg_temp = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)

query = """ SELECT EXTRACT (YEAR FROM date_local)  AS Year,
                   AVG (arithmetic_mean) AS avg_co
            FROM  `bigquery-public-data.epa_historical_air_quality.co_daily_summary` 
            GROUP BY  Year
            Order BY  Year
"""
avg_co = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)

concat = pd.concat([avg_temp, avg_co.avg_co], axis=1)

fig, ax = plt.subplots(figsize=(16, 5))

color = 'tab:blue'
ax.bar(concat.Year, concat.avg_temp_celsius, color=color)
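# Assumed continuation (the original is cut off here): label the axes and
# overlay the CO average on a twin y-axis so both series share the Year axis.
ax.set_xlabel('Year')
ax.set_ylabel('Average temperature (°C)', color=color)
ax2 = ax.twinx()
ax2.plot(concat.Year, concat.avg_co, color='tab:red')
ax2.set_ylabel('Average CO', color='tab:red')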
Example #9
# NOTES:
#   -   You need Google Cloud authentication:
#           https://cloud.google.com/docs/authentication/getting-started#auth-cloud-implicit-python
#   -   The path to the JSON auth file you get from Google must be set in the
#       GOOGLE_APPLICATION_CREDENTIALS environment variable.
#       -   This means you have to set it every time you start a new shell
#           session.
#   -   bq_helper abstracts running BigQuery queries through Google Cloud for
#       an easier time:
#           https://github.com/SohierDane/BigQuery_Helper
from bq_helper import BigQueryHelper
import pandas as pd

print('Connecting to the NOAA GSOD dataset.')
bq_assist = BigQueryHelper("bigquery-public-data", "noaa_gsod")
print('Successfully connected.')

start_yr = 2010
end_yr = 2019

for year in range(start_yr, end_yr):  # range is end-exclusive: 2010-2018
    query = """
        SELECT stn, year, mo AS month, da AS day, temp, slp
        FROM `bigquery-public-data.noaa_gsod.gsod{}`
        WHERE wban = '23174' OR wban = '23129' OR wban = '93134'
        """.format(year)

    df_weather = bq_assist.query_to_pandas_safe(query, max_gb_scanned=5)

    filename = 'socal_weather_{}.csv'.format(year)

    df_weather.to_csv(filename, index=False)

    print("Saved {}".format(filename))