Example #1
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json"

import plotly.graph_objects as go
from plotly.offline import plot

from bq_helper import BigQueryHelper

client = BigQueryHelper('bigquery-public-data', 'covid19_italy')

query = """ select 
                    * 
            from bigquery-public-data.covid19_italy.data_by_region 
        """
query_for_bar = """ select 
                    date, hospitalized_patients_symptoms
            from bigquery-public-data.covid19_italy.data_by_region 
            where region_name="Toscana"
                    
        """
query_for_pie = """ select 
                    region_name, SUM(hospitalized_patients_symptoms) AS sick
            
                    from bigquery-public-data.covid19_italy.data_by_region 
            
                    group by region_name
                """
query_for_scatter = """ select 
                                date, AVG(total_hospitalized_patients) as hospitalized
                                
                        from bigquery-public-data.covid19_italy.data_by_region 
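
# Not in the original snippet, which stops at the query definitions: a minimal
# sketch that runs one of them through the helper and renders it offline.
df_bar = client.query_to_pandas(query_for_bar)
plot({"data": [go.Bar(x=df_bar["date"], y=df_bar["hospitalized_patients_symptoms"])],
      "layout": {"title": "Hospitalized patients with symptoms in Toscana"}})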
Example #2
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import os
from plotly.offline import plot

from bq_helper import BigQueryHelper
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"

bq_assistant = BigQueryHelper('bigquery-public-data', 'san_francisco')

QUERY = """
        SELECT unique_key, descript, dayofweek, category FROM `bigquery-public-data.san_francisco.sfpd_incidents`
        LIMIT 100
        """
df = bq_assistant.query_to_pandas(QUERY)
print(df.head(4))

trace1 = go.Scatter(
    x=df['unique_key'].index,
    y=df['category'],
    mode="lines",
    name="category",
)
layout = dict(
    title='',
    xaxis=dict(title='unique_key'),
    yaxis=dict(title='category'),
)
fig = dict(data=[trace1], layout=layout)
plot(fig)
Example #3
from google.cloud import bigquery
from bq_helper import BigQueryHelper

# sample query from:
# https://cloud.google.com/bigquery/public-data/openaq#which_10_locations_have_had_the_worst_air_quality_this_month_as_measured_by_high_pm10
QUERY = """
        SELECT location, city, country, value, timestamp
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE pollutant = "pm10" AND timestamp > "2017-04-01"
        ORDER BY value DESC
        LIMIT 1000
        """

# The quick way to execute this query is to use [the bq_helper library](https://github.com/SohierDane/BigQuery_Helper):

# In[7]:

bq_assistant = BigQueryHelper('bigquery-public-data', 'openaq')
df = bq_assistant.query_to_pandas(QUERY)
df.head(3)

# But what is bq_helper actually doing under the hood? Let's replicate the same process through the core BigQuery API to find out.

# In[8]:

client = bigquery.Client()
query_job = client.query(QUERY)
rows = list(query_job.result(timeout=30))
for row in rows[:3]:
    print(row)

# The outputs look reasonable, but what's this storage format?
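
# A sketch answering the question above: each item returned by the core API is
# a google.cloud.bigquery.table.Row, a tuple-like object that also supports
# dict-style access by column name.
row = rows[0]
print(type(row))        # <class 'google.cloud.bigquery.table.Row'>
print(row['location'])  # access by column name
print(row[0])           # or by position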
Example #4
import pandas as pd
from bq_helper import BigQueryHelper

print("export GOOGLE_APPLICATION_CREDENTIALS=\"/Users/davidwu/Downloads/cs221-project-1b5850653193.json\"")

bq_assistant = BigQueryHelper("bigquery-public-data", "github_repos")
#all_tables = bq_assistant.list_tables()
#for table in all_tables:
#    print(table)
#    print(str(bq_assistant.table_schema(table)))

QUERY = """
        SELECT *
        FROM `bigquery-public-data.github_repos.languages`
        """
#print(bq_assistant.estimate_query_size(QUERY))
print("HELLO")
df = bq_assistant.query_to_pandas_safe(QUERY)
print("HELLO")
df.to_csv("all_project_languages.txt", sep='\t')
Example #5
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"

import pandas as pd
from bq_helper import BigQueryHelper
import plotly.graph_objs as go
from plotly.offline import plot

bq_assistant = BigQueryHelper('bigquery-public-data',
                              'epa_historical_air_quality')

QUERY = """
        SELECT `state_code`, `date_local`, `mdl`, `parameter_name`
        FROM `bigquery-public-data.epa_historical_air_quality.co_hourly_summary`
        LIMIT 1000
        """

df = bq_assistant.query_to_pandas(QUERY)

state_code_count = df.groupby(['state_code'])['parameter_name'].count()
date_local_count = df.groupby(['date_local'])['parameter_name'].count()
mdl_count = df.groupby(['mdl'])['parameter_name'].count()
trace1 = go.Scatter(x=state_code_count.index, y=state_code_count.values)
trace2 = go.Pie(labels=date_local_count.index, values=date_local_count.values)

trace3 = go.Bar(x=mdl_count.index, y=mdl_count.values)

layout1 = go.Layout(xaxis=dict(title='state_code'),
                    yaxis=dict(title='parameter_name'))
layout2 = go.Layout(title='data', xaxis=dict(title=''), yaxis=dict(title=''))
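
# Sketch (the original stops before rendering): wrap one of the traces in a
# figure and draw it with the offline plotter imported above.
plot(go.Figure(data=[trace1], layout=layout1))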
Example #6
def GLquery(query="query1",
            country="US",
            start_year="1790",
            end_year="2019",
            min_count="100",
            kind_code="B2",
            cpc_code="A63F",
            assignee="President and Fellows of Harvard College",
            keyword="internet of things",
            budget=1000,
            output_to_csv=False,
            plot=False,
            ask_before_running=False):
    '''

///////////////////////////////////////////////////////////////////////////////

FUNCTION PARAMETERS

GLquery takes the following arguments:

- query (string): choose one of the options listed in the next section (default is "query1")

- country (string) = format "LL" (default is "US")

- start_year (string) = format "YYYY" (default is "1790")

- end_year (string) = format "YYYY" (default is "2019")

- min_count (string) = minimum threshold (default is "100")

- kind_code (string) = format "LN", see kind codes here: https://www.cas.org/support/documentation/references/patkind (default is "B2")

- cpc_code (string) = format "LNNL", see CPC codes here: https://www.uspto.gov/web/patents/classification/cpc/html/cpc.html (default is "A63F")

- assignee (string) = any format (case sensitive) (default is "President and Fellows of Harvard College")

- keyword (string) = any format (case sensitive) (default is "internet of things")

- budget (number) = any number, in GB (default is 1000); queries estimated above 'budget' will not run, and queries below it scan only the amount they actually need, not the full 'budget'

- output_to_csv (True or False) = output results as .csv file to your current working directory (default is False)

- plot (True or False) = plot results as .pdf file to your current working directory (default is False) (implemented for queries 1, 2, 3 and 10)

- ask_before_running (True or False) = given a query size (in GB), asks user input before running the query (default is False)

///////////////////////////////////////////////////////////////////////////////

QUERIES

As of June/2019, GLquery can perform the following queries:

- query1: Number of patent applications by country (takes no inputs)

- query2: Number of patents published between years X and Y by country (necessary arguments: start_year and end_year)

- query3: Number of patents granted to country Z between years X and Y (necessary arguments: start_year, end_year and country)

- query4: Which patents country Z published between years X and Y? (necessary arguments: start_year, end_year, country, kind_code)

- query5: Most common patenting technology areas by year (takes no inputs)

- query6: Most common patenting technology areas in country Z between years X and Y (necessary arguments: start_year, end_year and country)

- query7: Inventors with over N patents by country (necessary arguments: min_count)

- query8: Patent landscaping of technology T between years X and Y" (necessary arguments: start_year, end_year, min_count and cpc_code) (this query returns patents, firms and countries associated with technology T)

- query9: Which firms is assignee A citing in their patents? (necessary arguments: assignee)

- query10: Number of patent applications with keyword K in country Z (necessary arguments: country and keyword)

///////////////////////////////////////////////////////////////////////////////


    '''

    global my_output_path

    patents = bq_helper.BigQueryHelper(active_project="patents-public-data",
                                       dataset_name="patents")
    bq_assistant = BigQueryHelper("patents-public-data", "patents")

    print("Country selected: {}.".format(country))
    print("Period selected: from {} to {}.".format(start_year, end_year))

    def replace(string, substitutions):
        substrings = sorted(substitutions, key=len, reverse=True)
        regex = re.compile('|'.join(map(re.escape, substrings)))
        return regex.sub(lambda match: substitutions[match.group(0)], string)
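
    # For instance (hypothetical values): replace("year >= XXXX", {"XXXX": "2000"})
    # returns "year >= 2000". query_selector below uses it to splice the
    # function arguments into the SQL templates.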

    def query_selector(query):
        switcher = {
            "query1":
            """
            -- "Number of publications by country"
            
            SELECT COUNT(*) AS cnt, country_code
            FROM (
                SELECT ANY_VALUE(country_code) AS country_code
                FROM `patents-public-data.patents.publications`
                GROUP BY application_number)
            GROUP BY country_code
            ORDER BY cnt DESC
            """,
            "query2":
            replace(
                """
            -- "Number of patents published between years X and Y by country"
            
            SELECT country_code,
            COUNT(DISTINCT publication_number) AS publications
            FROM `patents-public-data.patents.publications`
            WHERE publication_date >= XXXX0000 
            AND publication_date < YYYY0000
            AND application_kind = 'A'
            GROUP BY country_code
            ORDER BY publications DESC
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year
                }),
            "query3":
            replace(
                """
            -- "Number of patents granted to country Z between years X and Y"
            
            SELECT FLOOR(publication_date/10000) as datum,
            COUNT(DISTINCT publication_number) as publications
            FROM `patents-public-data.patents.publications`
            WHERE country_code = 'ZZZZ'
            AND publication_date >= XXXX0000
            AND publication_date <= YYYY0000
            AND application_kind = 'A'
            GROUP BY datum, application_kind
            ORDER BY application_kind, datum
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "ZZZZ": country
                }),
            "query4":
            replace(
                """
            -- "Which patents country Z published between years X and Y"
            
            SELECT publication_number
            FROM `patents-public-data.patents.publications`
            WHERE country_code = 'ZZZZ'
            AND publication_date >= XXXX0000
            AND publication_date <= YYYY0000
            AND application_kind = 'A'
            AND kind_code = 'BBBB'
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "ZZZZ": country,
                    "BBBB": kind_code
                }),
            "query5":
            """
            -- "Most common patenting technology areas by year"
            
            CREATE TEMPORARY FUNCTION highest_moving_avg(yearcnt ARRAY<STRUCT<filing_year INT64, cnt INT64>>)
            RETURNS STRUCT<filing_year INT64, avg INT64>
            LANGUAGE js AS \"""
            let avg = 0;
            let a = 1.0;
            let highest = {filing_year: -1, avg: -1};
            for (let x of yearcnt) {
                    avg = a * x.cnt + (1 - a) * avg;
                    if (avg > highest.avg) {
                            highest = {filing_year: x.filing_year, avg: avg};}
                    }
                    return highest;
                \""";
                
            WITH patent_cpcs AS (
                SELECT cd.parents,
                CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year
                FROM (
                    SELECT ANY_VALUE(cpc) AS cpc, ANY_VALUE(filing_date) AS filing_date
                    FROM `patents-public-data.patents.publications`
                    WHERE application_number != ""
                    GROUP BY application_number), UNNEST(cpc) AS cpcs
                JOIN `patents-public-data.cpc.definition` cd ON cd.symbol = cpcs.code
                WHERE cpcs.first = TRUE AND filing_date > 0)

            SELECT c.title_full, cpc_group, best_year.*
            FROM (
                SELECT cpc_group, highest_moving_avg(ARRAY_AGG(STRUCT<filing_year INT64, cnt INT64>(filing_year, cnt) ORDER BY filing_year ASC)) AS best_year
                FROM (
                    SELECT cpc_group, filing_year, COUNT(*) AS cnt
                    FROM (
                        SELECT cpc_parent AS cpc_group, filing_year
                        FROM patent_cpcs, UNNEST(parents) AS cpc_parent)
                    GROUP BY cpc_group, filing_year
                    ORDER BY filing_year DESC, cnt DESC)
                GROUP BY cpc_group)
            JOIN `patents-public-data.cpc.definition` c ON cpc_group = c.symbol
            WHERE c.level = 5
            ORDER BY best_year.filing_year ASC;
            """,
            "query6":
            replace(
                """
            -- "Most common patenting technology areas in country Z between years X and Y"
            
            CREATE TEMPORARY FUNCTION highest_moving_avg(yearcnt ARRAY<STRUCT<filing_year INT64, cnt INT64>>)
            RETURNS STRUCT<filing_year INT64, avg INT64>
            LANGUAGE js AS \"""
            let avg = 0;
            let a = 1.0;
            let highest = {filing_year: -1, avg: -1};
            for (let x of yearcnt) {
                    avg = a * x.cnt + (1 - a) * avg;
                    if (avg > highest.avg) {
                            highest = {filing_year: x.filing_year, avg: avg};}
                    }
                    return highest;
                \""";
                
            WITH patent_cpcs AS (
                SELECT cd.parents,
                CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year
                FROM (
                    SELECT ANY_VALUE(cpc) AS cpc, ANY_VALUE(filing_date) AS filing_date
                    FROM `patents-public-data.patents.publications`
                    WHERE application_number != ""
                    AND country_code = 'ZZZZ'
                    AND grant_date >= XXXX0000
                    AND grant_date <= YYYY0000
                    GROUP BY application_number), UNNEST(cpc) AS cpcs
                JOIN `patents-public-data.cpc.definition` cd ON cd.symbol = cpcs.code
                WHERE cpcs.first = TRUE AND filing_date > 0)

            SELECT c.title_full, cpc_group, best_year.*
            FROM (
                SELECT cpc_group, highest_moving_avg(ARRAY_AGG(STRUCT<filing_year INT64, cnt INT64>(filing_year, cnt) ORDER BY filing_year ASC)) AS best_year
                FROM (
                    SELECT cpc_group, filing_year, COUNT(*) AS cnt
                    FROM (
                        SELECT cpc_parent AS cpc_group, filing_year
                        FROM patent_cpcs, UNNEST(parents) AS cpc_parent)
                    GROUP BY cpc_group, filing_year
                    ORDER BY filing_year DESC, cnt DESC)
                GROUP BY cpc_group)
            JOIN `patents-public-data.cpc.definition` c ON cpc_group = c.symbol
            WHERE c.level = 5 
            ORDER BY best_year.filing_year ASC;
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "ZZZZ": country
                }),
            "query7":
            replace(
                """
            -- "Inventors with over N patents by country"
            
            WITH temp1 AS (
            SELECT DISTINCT PUB.country_code,
            PUB.application_number AS patent_number, inventor_name
            FROM `patents-public-data.patents.publications` PUB
            CROSS JOIN
            UNNEST(PUB.inventor) AS inventor_name
            WHERE PUB.grant_date >= 17900000
            AND PUB.country_code IS NOT NULL
            AND PUB.application_number IS NOT NULL
            AND PUB.inventor IS NOT NULL)
            SELECT * FROM (
            SELECT temp1.country_code AS country, temp1.inventor_name AS inventor,
            COUNT(temp1.patent_number) AS count_of_patents
            FROM temp1
            GROUP BY temp1.country_code, temp1.inventor_name)
            WHERE count_of_patents >= NNNN
            """, {"NNNN": min_count}),
            "query8":
            replace(
                """
            -- "Patent landscaping of technology T between years X and Y" 
            
            SELECT SUM(year_cnt) AS total_count, assignee_name,
            ARRAY_AGG(STRUCT<cnt INT64, filing_year INT64, countries STRING>(year_cnt, filing_year, countries) ORDER BY year_cnt DESC LIMIT 1)[SAFE_ORDINAL(1)] AS Number_of_patents_under_this_CPC_code_Peak_year_Top_countries
            FROM (
                SELECT SUM(year_country_cnt) AS year_cnt, assignee_name, filing_year, STRING_AGG(country_code ORDER BY year_country_cnt DESC LIMIT 5) AS countries
                FROM (
                    SELECT COUNT(*) AS year_country_cnt, a.name AS assignee_name, CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year, apps.country_code
                    FROM (
                        SELECT ANY_VALUE(assignee_harmonized) AS assignee_harmonized, ANY_VALUE(filing_date) AS filing_date, ANY_VALUE(country_code) AS country_code
                        FROM `patents-public-data.patents.publications` AS pubs
                        WHERE (SELECT MAX(TRUE) FROM UNNEST(pubs.cpc) AS c WHERE REGEXP_CONTAINS(c.code, "TTTT"))
                        AND publication_date >= XXXX0000
                        AND publication_date <= YYYY0000
                        GROUP BY application_number) AS apps, UNNEST(assignee_harmonized) AS a
                    WHERE filing_date > 0
                    GROUP BY a.name, filing_year, country_code)
                GROUP BY assignee_name, filing_year)
            GROUP BY assignee_name
            ORDER BY total_count DESC
            LIMIT NNNN
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "NNNN": min_count,
                    "TTTT": cpc_code
                }),
            "query9":
            replace(
                """
            -- "Which firms is assignee A citing in their patents?"
            
            SELECT citing_assignee,
            COUNT(*) AS num_cites, citing_cpc_subclass, cpcdef.title_full AS citing_cpc_title
            FROM (
                SELECT pubs.publication_number AS citing_publication_number, cite.publication_number AS cited_publication_number, citing_assignee_s.name AS citing_assignee, SUBSTR(cpcs.code, 0, 4) AS citing_cpc_subclass
                FROM `patents-public-data.patents.publications` AS pubs, UNNEST(citation) AS cite, UNNEST(assignee_harmonized) AS citing_assignee_s, UNNEST(cpc) AS cpcs
                WHERE cpcs.first = TRUE) AS pubs
                JOIN (
                    SELECT publication_number AS cited_publication_number, cited_assignee_s.name AS cited_assignee
                    FROM `patents-public-data.patents.publications`, UNNEST(assignee_harmonized) AS cited_assignee_s) AS refs ON pubs.cited_publication_number = refs.cited_publication_number
                JOIN `patents-public-data.cpc.definition` AS cpcdef ON cpcdef.symbol = citing_cpc_subclass
                WHERE cited_assignee = "AAAA" AND citing_assignee != "AAAA"
                GROUP BY cited_assignee, citing_assignee, citing_cpc_subclass, cpcdef.title_full
                ORDER BY num_cites DESC
                """, {"AAAA": assignee}),
            "query10":
            replace(
                """
            --  Number of patent applications with keyword K in country Z             
            
            WITH Patent_Matches AS (
                SELECT PARSE_DATE('%Y%m%d', SAFE_CAST(ANY_VALUE(patentsdb.filing_date) AS STRING)) AS Patent_Filing_Date, patentsdb.application_number AS Patent_Application_Number,
                ANY_VALUE(abstract_info.text) AS Patent_Title,
                ANY_VALUE(abstract_info.language) AS Patent_Title_Language
                FROM `patents-public-data.patents.publications` AS patentsdb,
                UNNEST(abstract_localized) AS abstract_info
                WHERE
                LOWER(abstract_info.text) LIKE '%KKKK%'
                AND patentsdb.country_code = 'ZZZZ'
                GROUP BY Patent_Application_Number),
            Date_Series_Table AS (
                SELECT day, 0 AS Number_of_Patents
                FROM UNNEST (GENERATE_DATE_ARRAY(
                    (SELECT MIN(Patent_Filing_Date) FROM Patent_Matches),
                    (SELECT MAX(Patent_Filing_Date) FROM Patent_Matches))) AS day)
            SELECT SAFE_CAST(FORMAT_DATE('%Y-%m',Date_Series_Table.day) AS STRING) AS Patent_Date_YearMonth, COUNT(Patent_Matches.Patent_Application_Number) AS Number_of_Patent_Applications
            FROM Patent_Matches
            RIGHT JOIN Date_Series_Table
            ON Patent_Matches.Patent_Filing_Date = Date_Series_Table.day
            GROUP BY Patent_Date_YearMonth
            ORDER BY Patent_Date_YearMonth
            """, {
                    "ZZZZ": country,
                    "KKKK": keyword
                }),
        }
        return switcher.get(query, "Invalid query")

    print("Estimated query size: {} GB.".format(
        bq_assistant.estimate_query_size(query_selector(query))))

    def kenvelo(question, answer="no"):

        range_of_choices = {
            "yes": True,
            "y": True,
            "": True,
            "no": False,
            "n": False
        }

        if answer is None:
            prompt = "[Y/N]"

        elif answer == "yes":
            prompt = "[Y/N]"

        elif answer == "no":
            prompt = "[Y/N]"

        else:
            raise ValueError("Answer '%s' is invalid.")

        while True:
            sys.stdout.write(question + prompt)
            choice = input().lower()

            if answer is not None and choice == '':
                return range_of_choices[answer]

            elif choice in range_of_choices:
                return range_of_choices[choice]

            else:
                sys.stdout.write(
                    "Please answer any variation of 'yes' or 'no'. ")

    while ask_before_running is False or kenvelo("Run query? "):

        def plot_query1():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='country_code',
                        y='cnt',
                        data=patents.query_to_pandas_safe(
                            query_selector("query1"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query1"))))
            plt.title("Number of publications by country",
                      loc='left',
                      fontsize=24,
                      style='oblique')
            plt.ylabel('# of publications (log)', fontsize=14)
            plt.xlabel('Country', fontsize=14)
            plt.yscale('log')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query1" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plot_query2():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='country_code',
                        y='publications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query2"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query2"))))
            plt.title(
                "Number of patents published between years {} and {} by country"
                .format(start_year, end_year),
                loc='left',
                fontsize=24,
                style='oblique')
            plt.ylabel('# of publications')
            plt.xlabel('Country')
            plt.yscale('log')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query2" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plot_query3():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='datum',
                        y='publications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query3"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query3"))))
            plt.title(
                "Number of patents granted to country {} between years {} and {}"
                .format(country, start_year, end_year),
                loc='left',
                fontsize=24,
                style='oblique')
            plt.ylabel('# of patents', fontsize=14)
            plt.xlabel('')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query3" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plot_query10():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='Patent_Date_YearMonth',
                        y='Number_of_Patent_Applications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query10"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query10"))))
            plt.title(
                "Number of patent applications of technology {} in country {}".
                format(keyword, country),
                loc='left',
                fontsize=24,
                style='oblique')
            plt.ylabel('# of applications', fontsize=14)
            plt.xlabel('Date', fontsize=14, rotation=45)
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query10" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plotter(query):
            if query == "query1":
                plot_query1()
                query = "query2"

            elif query == "query2":
                plot_query2()
                query = "query3"

            elif query == "query3":
                plot_query3()
                query = "query10"

            elif query == "query10":
                plot_query10()

            else:
                print('\033[1m' +
                      "Sorry, this query doesn't output graphs (yet)." +
                      '\033[0m')

        temp_path = Path(my_output_path)

        if output_to_csv and plot:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(
                        query_selector(query)))).to_csv(
                            Path(temp_path, query + '.csv'),
                            index=False,
                            encoding="utf-8"), plotter(query)

        if output_to_csv:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(
                        query_selector(query)))).to_csv(Path(
                            temp_path, query + '.csv'),
                                                        index=False,
                                                        encoding="utf-8")

        if plot:
            return plotter(query)

        else:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(query_selector(query))))

        if kenvelo("You will use",
                   bq_assistant.estimate_query_size(query_selector(query)),
                   "GB in this query. Continue? ") is False:
            break
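
# Example usage (hypothetical values; assumes Google credentials are configured
# as in the other snippets): run query3 for US patents granted between 2000 and
# 2010 and write query3.csv to the output directory.
my_output_path = "./"
GLquery(query="query3", country="US", start_year="2000", end_year="2010",
        output_to_csv=True)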
Example #7
#!/usr/bin/env python
# coding: utf-8

# In[2]:

import bq_helper
from bq_helper import BigQueryHelper
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "Cloud Final-1f49f4274b69.json"
bq_assistant = BigQueryHelper("bigquery-public-data", "stackoverflow")
QUERY_1 = "SELECT ID,TITLE FROM `bigquery-public-data.stackoverflow.posts_questions` WHERE ID = 57804"
QUERY = "SELECT q.id, q.title, q.body, q.tags, a.body as answers, a.score FROM `bigquery-public-data.stackoverflow.posts_questions` AS q INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a ON q.id = a.parent_id WHERE q.tags LIKE '%python%' LIMIT 10000"

df = bq_assistant.query_to_pandas(QUERY)

# In[3]:

import findspark

findspark.init()

import pyspark  # only run after findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# In[5]:

file_save_path = os.getcwd() + "/dataset/big_query.parquet"
data_frame = spark.createDataFrame(df)
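
# The snippet ends here; a plausible final step (an assumption, using the path
# prepared above) is persisting the Spark frame to parquet:
data_frame.write.parquet(file_save_path)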
Example #8
from bq_helper import BigQueryHelper
from google.cloud import bigquery

# Before executing this code you have to export your key: export GOOGLE_APPLICATION_CREDENTIALS="path to your key.json"

client = bigquery.Client(project='Oceans')

bq_assistant = BigQueryHelper(active_project="bigquery-public-data",
                              dataset_name="noaa_icoads")

bq_assistant.list_tables()
bq_assistant.table_schema('icoads_core_2005')

query = """SELECT latitude, longitude, wind_direction_true, wind_speed, present_weather, sea_level_pressure, air_temperature,  wetbulb_temperature, sea_surface_temp,
            total_cloud_amount, cloud_height, wave_direction, wave_period, wave_height, swell_direction, swell_period, swell_height, timestamp
            FROM `bigquery-public-data.noaa_icoads.icoads_core_2005`
                        """

#execute the query.
res = bq_assistant.query_to_pandas_safe(query)

#save the result.
res.to_csv("resultat_total_20052.csv", sep=',')
Example #9
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json"

from bq_helper import BigQueryHelper

client = BigQueryHelper('bigquery-public-data', 'stackoverflow')

QUERY = """
    SELECT * FROM `bigquery-public-data.stackoverflow.comments` 
    
    """

print(client.estimate_query_size(QUERY))  #13.3 GB

bq_assistant = BigQueryHelper('bigquery-public-data', 'new_york')

QUERY = """
        SELECT * FROM `bigquery-public-data.new_york.citibike_trips`
        """

print(bq_assistant.estimate_query_size(QUERY))  #4.6 GB
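
# Follow-up sketch: act on the estimate by capping the scan with
# query_to_pandas_safe, which returns None instead of running anything that
# would scan more than max_gb_scanned (here the 4.6 GB query is skipped).
df = bq_assistant.query_to_pandas_safe(QUERY, max_gb_scanned=1)
print("query skipped" if df is None else df.head())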
Example #10
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="keys.json"

import pandas as pd
from bq_helper import BigQueryHelper
import plotly.graph_objs as go
from plotly.offline import plot

bq_assistant = BigQueryHelper('bigquery-public-data', 'epa_historical_air_quality')


QUERY = """
        SELECT 'country-name','date-local','sample-measurement', 'method-code'
        FROM  'bigquery-public-data.epa_historical_air_quality.epa_hap_hourly_summary'
        LIMIT 100
        """


df = bq_assistant.query_to_pandas(QUERY)

trace1 = go.Scatter(
    x=df.date_local,
    y=df.sample_measurement,
    mode='lines+markers',
    name='Observation percent depending on state'
)
Example #11
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from google.cloud import bigquery

import bq_helper
from bq_helper import BigQueryHelper
stackOverflow = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                         dataset_name="stackoverflow")

bq_assistant = BigQueryHelper("bigquery-public-data", "stackoverflow")
bq_assistant.list_tables()
tables = [
    'badges', 'comments', 'post_history', 'post_links', 'posts_answers',
    'posts_moderator_nomination', 'posts_orphaned_tag_wiki',
    'posts_privilege_wiki', 'posts_questions', 'posts_tag_wiki',
    'posts_tag_wiki_excerpt', 'posts_wiki_placeholder', 'stackoverflow_posts',
    'tags', 'users', 'votes'
]
for t in tables:
    tag = bq_assistant.head(t, num_rows=5000)
    tag.to_csv('stackoverflow_' + t + '.csv')
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from google.cloud import bigquery
from bq_helper import BigQueryHelper

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# import os
# print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

#some queries to extract the data from the hacker news dataset
#I end up getting chunks from the tables, as chunks >1 GB tend to fail even if I can theoretically get up to 5 GB
bq_assistant = BigQueryHelper("bigquery-public-data", "hacker_news")
#pandas_client = bigquery.Client()
#bq_assistant.list_tables()

#Gets the 'stories' table into a csv file
#QUERY1 = """
#    SELECT
#        *
#    FROM
#        `bigquery-public-data.hacker_news.stories`
#    """
#bq_assistant.estimate_query_size(QUERY1)
#df = bq_assistant.query_to_pandas_safe(QUERY)
#df.to_csv('stories_test.csv', index = False)

#Gets the 'comments' table into a csv file
Example #13
query = """
SELECT 
  block_timestamp,
  from_address,
  to_address,
  value,
  receipt_contract_address,
  receipt_status,
  block_number
FROM
  `bigquery-public-data.ethereum_blockchain.transactions` AS transactions
WHERE TRUE
  AND block_timestamp < '2017-01-01 00:00:00'
"""
# This establishes an authenticated session and prepares a reference to the dataset that lives in BigQuery.
bq_assistant = BigQueryHelper("bigquery-public-data", "ethereum_blockchain")
print("estimated query data size: " +
      str(bq_assistant.estimate_query_size(query)))

df = bq_assistant.query_to_pandas_safe(query, max_gb_scanned=60)

df.to_csv(
    "/home/yueduan/yueduan/postdoc_study/ether_transactions_before2017.csv")

# # Import dataset from big query
# from google.cloud import bigquery
# from bq_helper import BigQueryHelper
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# #%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from google.cloud import bigquery
from bq_helper import BigQueryHelper

bq_assist = BigQueryHelper(active_project='bigquery-public-data',
                           dataset_name='epa_historical_air_quality')
bq_assist.list_tables()

bq_assist.head('temperature_daily_summary')

query = """ SELECT EXTRACT (YEAR FROM date_local)  AS Year,
                   AVG ((arithmetic_mean - 32.0)/ 1.80)  AS avg_temp_celsius
            FROM  `bigquery-public-data.epa_historical_air_quality.temperature_daily_summary` 
            GROUP BY  Year
            ORDER BY  Year
"""
avg_temp = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)

query = """ SELECT EXTRACT (YEAR FROM date_local)  AS Year,
                   AVG (arithmetic_mean) AS avg_co
            FROM  `bigquery-public-data.epa_historical_air_quality.co_daily_summary` 
            GROUP BY  Year
            ORDER BY  Year
"""
avg_co = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)
Example #15
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"

import pandas as pd
from bq_helper import BigQueryHelper

bq_assistant = BigQueryHelper('bigquery-public-data', 'new_york')

QUERY = """
        SELECT * FROM `bigquery-public-data.new_york.citibike_trips`
        """

print(bq_assistant.estimate_query_size(QUERY))  #4.5 GB

QUERY = """
        SELECT * FROM `bigquery-public-data.new_york.citibike_trips`
        LIMIT 100
        """

df = bq_assistant.query_to_pandas(QUERY)

print(df.head(10))
Example #16
import bq_helper
from bq_helper import BigQueryHelper
stackOverflow = bq_helper.BigQueryHelper(active_project="bigquery-public-data", dataset_name="stackoverflow")
bq_assistant = BigQueryHelper("bigquery-public-data", "stackoverflow")

tabelas = bq_assistant.list_tables()
esquemas_tabelas = {}

query = """
SELECT Year, Tag, Total, Percent_Questions_with_Answers
FROM (SELECT EXTRACT(YEAR FROM a.creation_date) as Year, t.tag_name as Tag, COUNT(1) as Total, ROUND(100 * SUM(IF(a.answer_count > 0, 1, 0)) / COUNT(*), 1) AS Percent_Questions_with_Answers
FROM `bigquery-public-data.stackoverflow.posts_questions` a RIGHT JOIN `bigquery-public-data.stackoverflow.tags` t ON t.tag_name in UNNEST(SPLIT(a.tags,'|'))
GROUP BY Year, Tag
HAVING
  Year > 2019 AND Year < 2021
ORDER BY
  Total DESC
LIMIT 20)
ORDER BY Percent_Questions_with_Answers DESC
"""
response = stackOverflow.query_to_pandas_safe(query, max_gb_scanned=20)
response.head(20)

query1 = "SELECT tag_name as Assunto, count as Num_perguntas FROM `bigquery-public-data.stackoverflow.tags` order BY count DESC"
response1 = stackOverflow.query_to_pandas_safe(query1, max_gb_scanned=20)
response1.head(20)

query2 = """SELECT
  Day_of_Week,
  COUNT(1) AS Num_Questions,
  SUM(answered_in_1h) AS Num_Answered_in_1H,
Example #18
import unittest
from random import random

from pandas import DataFrame
from google.api_core.exceptions import BadRequest
from bq_helper import BigQueryHelper


class TestBQHelper(unittest.TestCase):
    def setUp(self):
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Query randomized so it won't hit the cache across multiple test runs
        self.randomizable_query = """
            SELECT value FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE value = {0}"""

    def test_list_tables(self):
        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])

    def test_list_schema(self):
        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')),
                         11)

    def test_estimate_query_size(self):
        self.assertIsInstance(self.my_bq.estimate_query_size(self.query),
                              float)

    def test_query_to_pandas(self):
        self.assertIsInstance(self.my_bq.query_to_pandas(self.query),
                              DataFrame)

    def test_query_safe_passes(self):
        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query),
                              DataFrame)

    def test_query_safe_fails(self):
        # Different query must be used for this test to ensure we don't hit the
        # cache and end up passing by testing a query that would use zero bytes.
        fail_query = self.randomizable_query.format(random())
        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))

    def test_head(self):
        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)

    def test_useage_tracker(self):
        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        with self.assertRaises(BadRequest):
            self.my_bq.query_to_pandas("Not a valid query")

    def test_list_nested_schema(self):
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
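
# Not in the original snippet: a standard entry point so the tests above can be
# run directly.
if __name__ == '__main__':
    unittest.main()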
Example #19

import pandas as pd
from bq_helper import BigQueryHelper

# ### Images
# Let's use the data to annotate some train images and see what we're trying to do. You'll see there is some complexity here, both with the labels and the boxes.

# In[ ]:

train = pd.read_csv('../input/train_human_labels.csv',
                    usecols=['ImageID', 'LabelName'])

descrips = pd.read_csv('../input/class-descriptions.csv',
                       names=['LabelName', 'Description'])
train = train.merge(descrips, how='left', on='LabelName')
train.head(9)

open_images = BigQueryHelper(active_project="bigquery-public-data",
                             dataset_name="open_images")
query = """
            SELECT image_id, original_url 
            FROM `bigquery-public-data.open_images.images` 
            WHERE image_id IN UNNEST(['0199bc3e1db115d0',
                                      '4fa8054781a4c382',
                                      '51c5d8d5d9cd87ca',
                                      '9ec02b5c0315fcd1',
                                      'b37f763ae67d0888',
                                      'ddcb4b7478e9917b'])
        """
urls = open_images.query_to_pandas_safe(query)

boxes = pd.read_csv('../input/train_bounding_boxes.csv')
boxes = boxes[boxes.ImageID.isin(
    urls.image_id.tolist())].sort_values('ImageID')
Example #20
def test_list_nested_schema(self):
    nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
    self.assertEqual(len(nested_helper.table_schema('commits')), 33)
Example #21
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="keys.json"

import pandas as pd
from bq_helper import BigQueryHelper
import plotly.graph_objs as go
from plotly.offline import plot

bq_assistant = BigQueryHelper('bigquery-public-data', 'epa_historical_air_quality')


QUERY = """
        SELECT state_code, sample_duration, date_local,  units_of_measure  FROM bigquery-public-data.epa_historical_air_quality.lead_daily_summary
        LIMIT 100
        """

_sample_duration_count = df.groupby(['date_local'])['sample_duration'].count()
_units_of_measure_count = df.groupby(['date_local'])['units_of_measure'].count()
_code_count = df.groupby(['date_local'])['code_count'].count()
trace1 = go.Bar(
    x=_sample_duration_count.index,
    y=_sample_duration_count.values
)
trace2 = go.Pie(
    labels=_units_of_measure_count.index,
    values=_units_of_measure_count.values
)
trace3 = go.Scatter(
    x=_code_count.index,
    y=_code_count.values,
    mode='lines'
)
Example #22
# https://github.com/SohierDane/BigQuery_Helper

from bq_helper import BigQueryHelper
bq_assistant = BigQueryHelper("bigquery-public-data", "github_repos")
QUERY = """
        SELECT message
        FROM `bigquery-public-data.github_repos.commits`
        WHERE LENGTH(message) > 6 AND LENGTH(message) <= 20
        LIMIT 2000
        """
bq_assistant.estimate_query_size(QUERY)
bq_assistant.list_tables()

# Any results you write to the current directory are saved as output.
Example #23
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json"

import pandas as pd
from bq_helper import BigQueryHelper

client = BigQueryHelper('bigquery-public-data', 'stackoverflow')

query = "SELECT " \
        "title, answer_count " \
        "FROM " \
        "bigquery-public-data.stackoverflow.stackoverflow_posts " \
        "ORDER BY " \
        "answer_count DESC " \
        "LIMIT 2"

df = client.query_to_pandas(query)
print(df)
Example #24
import matplotlib.pyplot as plt
import seaborn as sns
from bq_helper import BigQueryHelper

plt.style.use('fivethirtyeight')

# import plotly
import plotly
import plotly.offline as py

py.init_notebook_mode(connected=True)
import plotly.tools as tls
import plotly.graph_objs as go
import plotly.figure_factory as fig_fact

plotly.tools.set_config_file(world_readable=True, sharing='public')
bq_assistant = BigQueryHelper("bigquery-public-data", "openaq")

get_ipython().run_line_magic('matplotlib', 'inline')

# # TL;DR
# > A country plot of air pollution. Hover over the desired location, and it will show you the average value of some dangerous elements

# In[ ]:

QUERY = """
    SELECT
        country, 
        avg(value) as o3_avg_value
    FROM
      `bigquery-public-data.openaq.global_air_quality`
    WHERE
      pollutant = 'o3'
    GROUP BY
      country
    """
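
# Sketch of the country plot the TL;DR above describes (not in the original,
# which breaks off mid-query). Note: OpenAQ stores 2-letter country codes,
# while go.Choropleth expects ISO-3 codes or country names by default, so a
# real plot needs a mapping step that this sketch omits.
df_o3 = bq_assistant.query_to_pandas_safe(QUERY)
fig = go.Figure(go.Choropleth(locations=df_o3['country'],
                              z=df_o3['o3_avg_value'],
                              colorbar_title='avg O3 value'))
py.iplot(fig)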
Example #25
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"

import pandas
from bq_helper import BigQueryHelper
import plotly.graph_objs as go
from plotly.offline import plot

bq_assistant = BigQueryHelper("bigquery-public-data", "san_francisco")

QUERY = """
        SELECT `trip_id`, `duration_sec`, `start_station_name`, `subscriber_type` 
        FROM `bigquery-public-data.san_francisco.bikeshare_trips`
        LIMIT 15
        """

df = bq_assistant.query_to_pandas(QUERY)

trace1 = go.Scatter(x=df.trip_id.index,
                    y=df.duration_sec.values,
                    mode="lines",
                    name="trip duration by trip")
layoutScatter = dict(title="placeholder",
                     xaxis=dict(title="tripnumber"),
                     yaxis=dict(title="tripduration"))
tracescatter = dict(data=[trace1], layout=layoutScatter)
plot(tracescatter)

numberofcustomers = df.groupby(["subscriber_type"])["trip_id"].count()
Example #26
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"

import pandas as pd
from bq_helper import BigQueryHelper
import plotly.graph_objs as go
from plotly.offline import plot

bq_assistant = BigQueryHelper('bigquery-public-data', 'epa_historical_air_quality')

QUERY = """
        SELECT county_code, sample_duration, date_local, observation_count  FROM `bigquerry-public-data._pm10_daily_summary`
        LIMIT 100
        """

df = bq_assistant.query_to_pandas(QUERY)

local_count_1 = df.groupby(['county_code'])['observation_count'].count()
local_count_2 = df.groupby(['county_code'])['observation_count'].count()
local_count_3 = df.groupby(['county_code'])['observation_count'].count()

trace3 = go.Scatter(x=local_count_3.index,
                    y=local_count_3.values,
                    marker=dict(color='rgba(225, 174, 255, 0.5)'))
trace2 = go.Scatter(x=local_count_2.index,
                    y=local_count_2.values,
                    marker=dict(color='rgba(225, 174, 255, 0.5)'))

trace1 = go.Bar(x=local_count_1.index,
                y=local_count_1.values,
                marker=dict(color='rgba(225, 174, 255, 0.5)'))
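
# Sketch (the snippet stops before rendering): draw the bar trace with the
# offline plotter imported above.
plot(dict(data=[trace1]))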
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json"

from bq_helper import BigQueryHelper
client = BigQueryHelper('bigquery-public-data', 'usa_names')

import plotly.graph_objects as go
from plotly.offline import plot

query = "select name, number from bigquery-public-data.usa_names.usa_1910_current limit 150"

df = client.query_to_pandas(query)

x = df['name']
y = df['number']

name = go.Scatter(x=x, y=y, mode="lines")

layout = go.Layout(title='Scatter')

fig = go.Figure(data=[name], layout=layout)

plot(fig)
Example #28
from bq_helper import BigQueryHelper


def main():
    path = '../data/accidents'
    bq_assistant = BigQueryHelper('bigquery-public-data',
                                  'nhtsa_traffic_fatalities')

    QUERY = """
        SELECT
            a.state_number,
            a.consecutive_number,
            veh.vehicle_number,
            state_name,
            a.hour_of_crash AS hour_of_day,
            CASE
                WHEN day_of_week = 1 THEN 'Sunday'
                WHEN day_of_week = 2 THEN 'Monday'
                WHEN day_of_week = 3 THEN 'Tuesday'
                WHEN day_of_week = 4 THEN 'Wednesday'
                WHEN day_of_week = 5 THEN 'Thursday'
                WHEN day_of_week = 6 THEN 'Friday'
                WHEN day_of_week = 7 THEN 'Saturday'
            END day_of_week,
            CASE
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 1 THEN 'January'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 2 THEN 'February'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 3 THEN 'March'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 4 THEN 'April'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 5 THEN 'May'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 6 THEN 'June'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 7 THEN 'July'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 8 THEN 'August'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 9 THEN 'September'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 10 THEN 'October'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 11 THEN 'November'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 12 THEN 'December'
            END month,
            EXTRACT(YEAR FROM a.timestamp_of_crash) AS year,
            land_use_name AS land_use,
            national_highway_system,
            route_signing_name AS roadway_type,
            type_of_intersection AS intersection,
            light_condition_name AS light_condition,
            atmospheric_conditions_1_name AS atmospheric_conditions,
            latitude,
            longitude,
            manner_of_collision_name AS manner_of_collision,
            number_of_vehicle_forms_submitted_all AS num_vehicles,
            number_of_persons_not_in_motor_vehicles_in_transport_mvit AS num_nonmotorists,
            number_of_persons_in_motor_vehicles_in_transport_mvit AS num_motorists,
            number_of_fatalities AS num_fatalities,
            number_of_drunk_drivers AS num_drunk_drivers,
            body_type_name AS body_type,
            vehicle_model_year AS vehicle_year,
            rollover,
            contributing_circumstances_motor_vehicle_name AS vehicle_conditions,
            previous_dwi_convictions,
            previous_speeding_convictions,
            speeding_related,
            speed_limit,
            drivers_vision_obscured_by_name AS driver_vision_obscured
        FROM
            `bigquery-public-data.nhtsa_traffic_fatalities.accident_{0}` AS a
        JOIN
            `bigquery-public-data.nhtsa_traffic_fatalities.vehicle_{0}` AS veh
            ON a.consecutive_number=veh.consecutive_number
        JOIN
            `bigquery-public-data.nhtsa_traffic_fatalities.factor_{0}` AS f
            ON a.consecutive_number=f.consecutive_number AND veh.vehicle_number=f.vehicle_number
        JOIN
            `bigquery-public-data.nhtsa_traffic_fatalities.vision_{0}` AS v
            ON a.consecutive_number=v.consecutive_number AND veh.vehicle_number=v.vehicle_number
        ORDER BY consecutive_number, vehicle_number
    """

    data_2015 = bq_assistant.query_to_pandas(QUERY.format(2015))
    data_2016 = bq_assistant.query_to_pandas(QUERY.format(2016))

    data_2015.to_csv(f'{path}/accident_driver_data_2015.csv', index=False)
    data_2016.to_csv(f'{path}/accident_driver_data_2016.csv', index=False)
Example #29
import logging

from bq_helper import BigQueryHelper

_logger = logging.getLogger(__name__)


def create_github_bq_client():
    """Create the object for BigQueryHelper"""
    gh_archive = BigQueryHelper(active_project="githubarchive", dataset_name="day")
    _logger.info('Setting up BQ Client')
    return gh_archive
Example #30
#
# NOTES:
#   -   Need a Google Cloud authentication:
#           https://cloud.google.com/docs/authentication/getting-started#auth-cloud-implicit-python
#   -   The path to the json auth file you get from Google auth needs to be
#       set in the GOOGLE_APPLICATION_CREDENTIALS environment variable.
#       -   This means you're going to have to set it every time you start a
#           new shell session.
#   -   bq_helper abstracts retrieving big queries through google cloud for
#       an easier time:
#           https://github.com/SohierDane/BigQuery_Helper
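#   -   Alternatively (a sketch; the key path below is illustrative), the
#       variable can be set from Python for the current process only:
#           import os
#           os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/key.json"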
from bq_helper import BigQueryHelper
import pandas as pd

print('Retrieving NOAA GSOD data.')
bq_assist = BigQueryHelper("bigquery-public-data", "noaa_gsod")
print('Successfully retrieved data.')

start_yr = 2010
end_yr = 2019

for year in range(start_yr, end_yr):
    query = "SELECT stn, year, mo AS month, da AS day, temp, slp FROM `bigquery-public-data.noaa_gsod.gsod{}` WHERE wban='23174' OR wban='23129' OR wban='93134' ".format(year)

    df_weather = bq_assist.query_to_pandas_safe(query, max_gb_scanned=5)

    filename = 'socal_weather_{}.csv'.format(year)

    df_weather.to_csv(filename, index=False)

    print("Saved {}".format(filename))