예제 #1
0
def read_bq_query(query,
                  activate_project="bigquery-public-data",
                  dataset_name="openaq"):
    """Execute a SQL query against a public BigQuery dataset.

    Args:
        query: SQL string to execute.
        activate_project: project hosting the dataset. (Parameter name kept
            for backward compatibility; bq_helper's own keyword is spelled
            ``active_project``.)
        dataset_name: dataset inside the project.

    Returns:
        The result of ``query_to_pandas_safe`` — a pandas DataFrame, or
        whatever bq_helper returns when its scan-size guard cancels the
        query (presumably None; verify against bq_helper docs).
    """
    # Bind a fresh helper to the requested project/dataset pair.
    helper = bq_helper.BigQueryHelper(active_project=activate_project,
                                      dataset_name=dataset_name)
    # query_to_pandas_safe refuses to run queries above its GB budget.
    return helper.query_to_pandas_safe(query)
예제 #2
0
def load_data():
    """Return a BigQueryHelper bound to the public ``usa_names`` dataset.

    Note: despite the name, this fetches no rows — it only constructs
    the helper object that later code uses to issue queries.
    """
    # Bind the helper to the public usa_names dataset.
    dataset = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                       dataset_name="usa_names")
    return dataset
예제 #3
0
# ## Example: What are all the U.S. cities in the OpenAQ dataset?
# ___
#
# Now that you've got the basics down, let's work through an example with a real dataset. Today we're going to be working with the OpenAQ dataset, which has information on air quality around the world. (The data in it should be current: it's updated weekly.)
#
# To help get you situated, I'm going to run through a complete query first. Then it will be your turn to get started running your queries!
#
# First, I'm going to set up everything we need to run queries and take a quick peek at what tables are in our database.

# In[ ]:

# import package with helper functions
import bq_helper

# create a helper object for this dataset (project + dataset pair)
open_aq = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="openaq")

# print all the tables in this dataset (there's only one!)
open_aq.list_tables()

# I'm going to take a peek at the first couple of rows to help me see what sort of data is in this dataset.

# In[ ]:

# print the first couple rows of the "global_air_quality" table
open_aq.head("global_air_quality")

# Great, everything looks good! Now that I'm set up, I'm going to put together a query. I want to select all the values from the "city" column for the rows where the "country" column is "us" (for "United States").
#
# > **What's up with the triple quotation marks (""")?** These tell Python that everything inside them is a single string, even though we have line breaks in it. The line breaks aren't necessary, but they do make it much easier to read your query.
예제 #4
0
# Your code goes here :)

# import package with helper functions
import bq_helper

# create a helper object for this dataset
accidents = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                     dataset_name="nhtsa_traffic_fatalities")

# print couple of rows from the table accident_2015
accidents.head("accident_2015")

# query for the first question: number of accidents per hour of the day,
# most accident-heavy hour first
query_for_first_question = """SELECT COUNT(consecutive_number),
                              EXTRACT(HOUR FROM timestamp_of_crash)
                              FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2015`
                              GROUP BY EXTRACT(HOUR FROM timestamp_of_crash)
                              ORDER BY COUNT(consecutive_number) DESC
                           """

# run the query (query_to_pandas_safe guards against oversized scans)
accidents_by_hour = accidents.query_to_pandas_safe(query_for_first_question)

# print the dataframe
accidents_by_hour

# library for plotting
import matplotlib.pyplot as plt

# make a plot to show that our data is, actually, sorted:
# (f0_ is presumably the auto-generated name of the unaliased COUNT
# column — verify against the returned dataframe)
plt.plot(accidents_by_hour.f0_)
예제 #5
0
# Set up feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.sql.ex5 import *

# import package with helper functions
import bq_helper

# create a helper object for this dataset
# (the original was missing the closing parenthesis on this call,
# which made the whole cell a SyntaxError)
chicago_taxi_helper = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                               dataset_name="chicago_taxi_trips")

# Your code here to find the table name
chicago_taxi_helper.list_tables()

# write the table name as a string below
table_name = 'taxi_trips'

q_1.check()

# your code here
chicago_taxi_helper.head(table_name)

# number of taxi trips per year, in chronological order
rides_per_year_query = """
SELECT EXTRACT(year FROM trip_start_timestamp) AS year,COUNT(1) num_trips
FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
GROUP BY year
ORDER BY year
"""

rides_per_year_result = chicago_taxi_helper.query_to_pandas_safe(rides_per_year_query)
예제 #6
0
from google.cloud import bigquery
import bq_helper
import os

# replace with path to JSON file containing Google Service Account private key
os.environ[
    "GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/jason.wang/Documents/Analytics Projects/EDA-Google-PK.json"

# helper bound to the public usa_names dataset
names = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                 dataset_name="usa_names")
# one row per (year, gender, name) with `number` summed over the
# table's remaining columns
query = """SELECT year, gender, name, sum(number) as number FROM `bigquery-public-data.usa_names.usa_1910_current` GROUP BY year, gender, name"""
agg_names = names.query_to_pandas_safe(query)
# persist the aggregate locally for offline analysis
agg_names.to_csv("names.csv")
예제 #7
0
# Set up feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.sql.ex4 import *

# import package with helper functions
import bq_helper

# create a helper object for this dataset
education_data = bq_helper.BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="world_bank_intl_education")

education_data.head('international_education')

# Your Code Here

# average of `value` per country over 2010-2017 for indicator
# SE.XPD.TOTL.GD.ZS (presumably education spending as % of GDP —
# verify against the World Bank indicator catalog), highest first
country_spend_pct_query = """
SELECT country_name,AVG(value) avg_ed_spending_pct
FROM `bigquery-public-data.world_bank_intl_education.international_education`
WHERE year >= 2010 and year <= 2017 and indicator_code = 'SE.XPD.TOTL.GD.ZS'
GROUP BY country_name
ORDER BY AVG(value) DESC
"""

country_spending_results = education_data.query_to_pandas_safe(
    country_spend_pct_query)

print(country_spending_results.head())
q_1.check()
예제 #8
0
    with open(file, 'w') as f:
        f.write(html_str)
        f.close()
    string="""<div id='html_string2'><iframe src='"""+\
           file+"""' height="""+str(height+20)+\
           """ width="""+str(width+20)+"""></iframe></div>"""
    display(HTML(string))


# Commented out IPython magic to ensure Python compatibility.
# %run numpy_sage.py

dhtml('Example #1')

import bq_helper
# helper bound to the public OpenAQ air-quality dataset
open_aq = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="openaq")
open_aq.head("global_air_quality", 3).T

# average measured value per (country, pollutant), restricted to rows
# reported in µg/m³ so the averages are on a single unit
my_query = """
SELECT country,pollutant,AVG(value) as avg_value
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE unit='µg/m³' 
GROUP BY country,pollutant
ORDER BY country,pollutant
"""
pollutants = open_aq.query_to_pandas_safe(my_query)
pollutants.head(10).T

import pandas as pd, pylab as pl
# empty country x pollutant frame, presumably to be filled in by later
# (out-of-view) code
df_pollutants = pd.DataFrame(index=list(set(pollutants['country'])),
                             columns=list(set(pollutants['pollutant'])))
예제 #9
0
# coding: utf-8

# # SQL Scavenger Hunt: Day 5
# ## Example: How many files are covered by each license?

# In[ ]:

import bq_helper

# In[ ]:

# helper bound to the public GitHub repos dataset
github_repos = bq_helper.BigQueryHelper(active_project='bigquery-public-data',
                                        dataset_name='github_repos')

# In[ ]:

# count sample files per license by joining each file's repo to the
# licenses table on repo_name
query = """
SELECT licenses.license, COUNT(files.id) AS num_files
FROM `bigquery-public-data.github_repos.sample_files` AS files
JOIN `bigquery-public-data.github_repos.licenses` AS licenses -- JOIN is the same as INNER JOIN
    ON licenses.repo_name = files.repo_name
GROUP BY license
ORDER BY num_files DESC
"""

# cap the scan at 10 GB — this join reads a lot of data
license_counts = github_repos.query_to_pandas_safe(query, max_gb_scanned=10)

# In[ ]:

license_counts.shape
예제 #10
0
import bq_helper

# helper bound to the public baseball dataset
baseball = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                    dataset_name="baseball")

# columns to pull from the games_wide table
col = [
    'gameId', 'startTime', 'gameStatus', 'attendance', 'dayNight',
    'durationMinutes', 'awayTeamName', 'homeTeamName', 'venueName',
    'venueSurface', 'venueCity', 'venueState', 'homeFinalRuns',
    'homeFinalHits', 'homeFinalErrors', 'awayFinalRuns', 'awayFinalHits',
    'awayFinalErrors', 'inningNumber', 'inningHalf', 'description',
    'atBatEventSequenceNumber', 'outcomeDescription', 'hitterLastName',
    'hitterBatHand', 'pitcherLastName', 'pitcherThrowHand',
    'pitchTypeDescription', 'pitchSpeed', 'balls', 'strikes', 'outs'
]
# build the comma-separated SELECT list idiomatically (the original
# appended in a loop and then sliced off the trailing comma)
col_field = ','.join(col)
print(col_field)
query = """SELECT %s
            FROM `bigquery-public-data.baseball.games_wide`
             """
games = baseball.query_to_pandas_safe(query % col_field)
print(games)
# persist the pull locally
games.to_csv('new_games.csv', mode='w')
예제 #11
0
def GLquery(query="query1",
            country="US",
            start_year="1790",
            end_year="2019",
            min_count="100",
            kind_code="B2",
            cpc_code="A63F",
            assignee="President and Fellows of Harvard College",
            keyword="internet of things",
            budget=1000,
            output_to_csv=False,
            plot=False,
            ask_before_running=False):
    '''

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

FUNCTION PARAMETERS

GLquery takes the following arguments:

- query (string): choose among one of the options in the next section (see below) (default is "query1")

- country (string) = format "LL" (default is "US")

- start_year (string) = format "YYYY" (default is "1790")

- end_year (string) = format "YYYY" (default is "2019")

- min_count (string) = minimum threshold (default is "100")

- kind_code (string) = format "LN", see kind codes here: https://www.cas.org/support/documentation/references/patkind (default is "B2")

- cpc_code (string) = formant "LNNL", see CPC codes here: https://www.uspto.gov/web/patents/classification/cpc/html/cpc.html (default is "A63F")

- assignee (string) = any format (case sensitive) (default is "President and Fellows of Harvard College")

- keyword (string) = any format (case sensitive) (default is "internet of things")

- budget (number) = any number (default is 1.000 GB) (queries above value 'budget' will not run; queries below 'budget' will use only the minimum amount of memory necessary to run the query, not the full value of 'budget')

- output_to_csv (True or False) = output results as .csv file to your current working directory (default is False)

- plot (True of False) = plot results as .pdf file to your current working directory (default is False) (implemented for queries 1, 2, 3, 10)

- ask_before_running (True or False) = given a query size (in GB), asks user input before running the query (default is False)

///////////////////////////////////////////////////////////////////////////////

QUERIES

As of June/2019, GLquery can perform the following queries:

- query1: Number of patent applications by country (takes no inputs)

- query2: Number of patents published between years X and Y by country (necessary arguments: start_year and end_year)

- query3: Number of patents published to country Z between years X and Y (necessary arguments: start_year, end_year and country)

- query4: Which patents country Z published between years X and Y? (necessary arguments: start_year, end_year, country, kind_code)

- query5: Most common patenting technology areas by year (takes no inputs)

- query6: Most common patenting technology areas in country Z between years X and Y (necessary arguments: start_year, end_year and country)

- query7: Inventors with over N patents by country (necessary arguments: min_count)

- query8: Patent landscaping of technology T between years X and Y" (necessary arguments: start_year, end_year, min_count and cpc_code) (this query returns patents, firms and countries associated with technology T)

- query9: Which firms is assignee A citing in their patents? (necessary arguments: assignee)

- query10: Number of patent applications with keyword K in country Z (necessary arguments: country and keyword)

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


    '''

    # Output directory for the .csv / .pdf artifacts; must be defined at
    # module level by the surrounding script before calling GLquery.
    global my_output_path

    # Two helpers over the same dataset: `patents` runs the queries,
    # `bq_assistant` is only used for scan-size estimates.
    # NOTE(review): BigQueryHelper is referenced unqualified on the second
    # line, so the module presumably does `from bq_helper import
    # BigQueryHelper` somewhere out of view — verify.
    patents = bq_helper.BigQueryHelper(active_project="patents-public-data",
                                       dataset_name="patents")
    bq_assistant = BigQueryHelper("patents-public-data", "patents")

    print("Country selected: {}.".format(country))
    print("Period selected: from {} to {}.".format(start_year, end_year))

    def replace(string, substitutions):
        # Apply every key->value substitution of `substitutions` to
        # `string` in a single regex pass (longest keys first so longer
        # placeholders are not clobbered by shorter ones).
        substrings = sorted(substitutions, key=len, reverse=True)
        regex = re.compile('|'.join(map(re.escape, substrings)))
        return regex.sub(lambda match: substitutions[match.group(0)], string)

    def query_selector(query):
        # Map a query name to its SQL text, with the XXXX/YYYY/ZZZZ/...
        # placeholders filled in from GLquery's arguments.
        switcher = {
            "query1":
            """
            -- "Number of publications by country"
            
            SELECT COUNT(*) AS cnt, country_code
            FROM (
                SELECT ANY_VALUE(country_code) AS country_code
                FROM `patents-public-data.patents.publications`
                GROUP BY application_number)
            GROUP BY country_code
            ORDER BY cnt DESC
            """,
            "query2":
            replace(
                """
            -- "Number of patents published between years X and Y by country"
            
            SELECT country_code,
            COUNT(DISTINCT publication_number) AS publications
            FROM `patents-public-data.patents.publications`
            WHERE publication_date >= XXXX0000 
            AND publication_date < YYYY0000
            AND application_kind = 'A'
            GROUP BY country_code
            ORDER BY publications DESC
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year
                }),
            "query3":
            replace(
                """
            -- "Number of patents granted to country Z between years X and Y"
            
            SELECT FLOOR(publication_date/10000) as datum,
            COUNT(DISTINCT publication_number) as publications
            FROM `patents-public-data.patents.publications`
            WHERE country_code = 'ZZZZ'
            AND publication_date >= XXXX0000
            AND publication_date <= YYYY0000
            AND application_kind = 'A'
            GROUP BY datum, application_kind
            ORDER BY application_kind, datum
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "ZZZZ": country
                }),
            "query4":
            replace(
                """
            -- "Which patents country Z published between years X and Y"
            
            SELECT publication_number
            FROM `patents-public-data.patents.publications`
            WHERE country_code = 'ZZZZ'
            AND publication_date >= XXXX0000
            AND publication_date <= YYYY0000
            AND application_kind = 'A'
            AND kind_code = 'BBBB'
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "ZZZZ": country,
                    "BBBB": kind_code
                }),
            "query5":
            """
            -- "Most common patenting technology areas by year"
            
            CREATE TEMPORARY FUNCTION highest_moving_avg(yearcnt ARRAY<STRUCT<filing_year INT64, cnt INT64>>)
            RETURNS STRUCT<filing_year INT64, avg INT64>
            LANGUAGE js AS \"""
            let avg = 0;
            let a = 1.0;
            let highest = {filing_year: -1, avg: -1};
            for (let x of yearcnt) {
                    avg = a * x.cnt + (1 - a) * avg;
                    if (avg > highest.avg) {
                            highest = {filing_year: x.filing_year, avg: avg};}
                    }
                    return highest;
                \""";
                
            WITH patent_cpcs AS (
                SELECT cd.parents,
                CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year
                FROM (
                    SELECT ANY_VALUE(cpc) AS cpc, ANY_VALUE(filing_date) AS filing_date
                    FROM `patents-public-data.patents.publications`
                    WHERE application_number != ""
                    GROUP BY application_number), UNNEST(cpc) AS cpcs
                JOIN `patents-public-data.cpc.definition` cd ON cd.symbol = cpcs.code
                WHERE cpcs.first = TRUE AND filing_date > 0)

            SELECT c.title_full, cpc_group, best_year.*
            FROM (
                SELECT cpc_group, highest_moving_avg(ARRAY_AGG(STRUCT<filing_year INT64, cnt INT64>(filing_year, cnt) ORDER BY filing_year ASC)) AS best_year
                FROM (
                    SELECT cpc_group, filing_year, COUNT(*) AS cnt
                    FROM (
                        SELECT cpc_parent AS cpc_group, filing_year
                        FROM patent_cpcs, UNNEST(parents) AS cpc_parent)
                    GROUP BY cpc_group, filing_year
                    ORDER BY filing_year DESC, cnt DESC)
                GROUP BY cpc_group)
            JOIN `patents-public-data.cpc.definition` c ON cpc_group = c.symbol
            WHERE c.level = 5
            ORDER BY best_year.filing_year ASC;
            """,
            "query6":
            replace(
                """
            -- "Most common patenting technology areas in country Z between years X and Y"
            
            CREATE TEMPORARY FUNCTION highest_moving_avg(yearcnt ARRAY<STRUCT<filing_year INT64, cnt INT64>>)
            RETURNS STRUCT<filing_year INT64, avg INT64>
            LANGUAGE js AS \"""
            let avg = 0;
            let a = 1.0;
            let highest = {filing_year: -1, avg: -1};
            for (let x of yearcnt) {
                    avg = a * x.cnt + (1 - a) * avg;
                    if (avg > highest.avg) {
                            highest = {filing_year: x.filing_year, avg: avg};}
                    }
                    return highest;
                \""";
                
            WITH patent_cpcs AS (
                SELECT cd.parents,
                CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year
                FROM (
                    SELECT ANY_VALUE(cpc) AS cpc, ANY_VALUE(filing_date) AS filing_date
                    FROM `patents-public-data.patents.publications`
                    WHERE application_number != ""
                    AND country_code = 'ZZZZ'
                    AND grant_date >= XXXX0000
                    AND grant_date <= YYYY0000
                    GROUP BY application_number), UNNEST(cpc) AS cpcs
                JOIN `patents-public-data.cpc.definition` cd ON cd.symbol = cpcs.code
                WHERE cpcs.first = TRUE AND filing_date > 0)

            SELECT c.title_full, cpc_group, best_year.*
            FROM (
                SELECT cpc_group, highest_moving_avg(ARRAY_AGG(STRUCT<filing_year INT64, cnt INT64>(filing_year, cnt) ORDER BY filing_year ASC)) AS best_year
                FROM (
                    SELECT cpc_group, filing_year, COUNT(*) AS cnt
                    FROM (
                        SELECT cpc_parent AS cpc_group, filing_year
                        FROM patent_cpcs, UNNEST(parents) AS cpc_parent)
                    GROUP BY cpc_group, filing_year
                    ORDER BY filing_year DESC, cnt DESC)
                GROUP BY cpc_group)
            JOIN `patents-public-data.cpc.definition` c ON cpc_group = c.symbol
            WHERE c.level = 5 
            ORDER BY best_year.filing_year ASC;
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "ZZZZ": country
                }),
            # NOTE(review): the literal 1790000 below looks like a malformed
            # YYYYMMDD date (1790-01-01 would be 17900000) — verify.
            "query7":
            replace(
                """
            -- "Inventors with over N patents by country"
            
            WITH temp1 AS (
            SELECT DISTINCT PUB.country_code,
            PUB.application_number AS patent_number, inventor_name
            FROM `patents-public-data.patents.publications` PUB
            CROSS JOIN
            UNNEST(PUB.inventor) AS inventor_name
            WHERE PUB.grant_date >= 1790000
            AND PUB.country_code IS NOT NULL
            AND PUB.application_number IS NOT NULL
            AND PUB.inventor IS NOT NULL)
            SELECT * FROM (
            SELECT temp1.country_code AS country, temp1.inventor_name AS inventor,
            COUNT(temp1.patent_number) AS count_of_patents
            FROM temp1
            GROUP BY temp1.country_code, temp1.inventor_name)
            WHERE count_of_patents >= NNNN
            """, {"NNNN": min_count}),
            "query8":
            replace(
                """
            -- "Patent landscaping of technology T between years X and Y" 
            
            SELECT SUM(year_cnt) AS total_count, assignee_name,
            ARRAY_AGG(STRUCT<cnt INT64, filing_year INT64, countries STRING>(year_cnt, filing_year, countries) ORDER BY year_cnt DESC LIMIT 1)[SAFE_ORDINAL(1)] AS Number_of_patents_under_this_CPC_code_Peak_year_Top_countries
            FROM (
                SELECT SUM(year_country_cnt) AS year_cnt, assignee_name, filing_year, STRING_AGG(country_code ORDER BY year_country_cnt DESC LIMIT 5) AS countries
                FROM (
                    SELECT COUNT(*) AS year_country_cnt, a.name AS assignee_name, CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year, apps.country_code
                    FROM (
                        SELECT ANY_VALUE(assignee_harmonized) AS assignee_harmonized, ANY_VALUE(filing_date) AS filing_date, ANY_VALUE(country_code) AS country_code
                        FROM `patents-public-data.patents.publications` AS pubs
                        WHERE (SELECT MAX(TRUE) FROM UNNEST(pubs.cpc) AS c WHERE REGEXP_CONTAINS(c.code, "TTTT"))
                        AND publication_date >= XXXX0000
                        AND publication_date <= YYYY0000
                        GROUP BY application_number) AS apps, UNNEST(assignee_harmonized) AS a
                    WHERE filing_date > 0
                    GROUP BY a.name, filing_year, country_code)
                GROUP BY assignee_name, filing_year)
            GROUP BY assignee_name
            ORDER BY total_count DESC
            LIMIT NNNN
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "NNNN": min_count,
                    "TTTT": cpc_code
                }),
            "query9":
            replace(
                """
            -- "Which firms is assignee A citing in their patents?"
            
            SELECT citing_assignee,
            COUNT(*) AS num_cites, citing_cpc_subclass, cpcdef.title_full AS citing_cpc_title
            FROM (
                SELECT pubs.publication_number AS citing_publication_number, cite.publication_number AS cited_publication_number, citing_assignee_s.name AS citing_assignee, SUBSTR(cpcs.code, 0, 4) AS citing_cpc_subclass
                FROM `patents-public-data.patents.publications` AS pubs, UNNEST(citation) AS cite, UNNEST(assignee_harmonized) AS citing_assignee_s, UNNEST(cpc) AS cpcs
                WHERE cpcs.first = TRUE) AS pubs
                JOIN (
                    SELECT publication_number AS cited_publication_number, cited_assignee_s.name AS cited_assignee
                    FROM `patents-public-data.patents.publications`, UNNEST(assignee_harmonized) AS cited_assignee_s) AS refs ON pubs.cited_publication_number = refs.cited_publication_number
                JOIN `patents-public-data.cpc.definition` AS cpcdef ON cpcdef.symbol = citing_cpc_subclass
                WHERE cited_assignee = "AAAA" AND citing_assignee != "AAAA"
                GROUP BY cited_assignee, citing_assignee, citing_cpc_subclass, cpcdef.title_full
                ORDER BY num_cites DESC
                """, {"AAAA": assignee}),
            "query10":
            replace(
                """
            --  Number of patent applications with keyword K in country Z             
            
            WITH Patent_Matches AS (
                SELECT PARSE_DATE('%Y%m%d', SAFE_CAST(ANY_VALUE(patentsdb.filing_date) AS STRING)) AS Patent_Filing_Date, patentsdb.application_number AS Patent_Application_Number,
                ANY_VALUE(abstract_info.text) AS Patent_Title,
                ANY_VALUE(abstract_info.language) AS Patent_Title_Language
                FROM `patents-public-data.patents.publications` AS patentsdb,
                UNNEST(abstract_localized) AS abstract_info
                WHERE
                LOWER(abstract_info.text) LIKE '%KKKK%'
                AND patentsdb.country_code = 'ZZZZ'
                GROUP BY Patent_Application_Number),
            Date_Series_Table AS (
                SELECT day, 0 AS Number_of_Patents
                FROM UNNEST (GENERATE_DATE_ARRAY(
                    (SELECT MIN(Patent_Filing_Date) FROM Patent_Matches),
                    (SELECT MAX(Patent_Filing_Date) FROM Patent_Matches))) AS day)
            SELECT SAFE_CAST(FORMAT_DATE('%Y-%m',Date_Series_Table.day) AS STRING) AS Patent_Date_YearMonth, COUNT(Patent_Matches.Patent_Application_Number) AS Number_of_Patent_Applications
            FROM Patent_Matches
            RIGHT JOIN Date_Series_Table
            ON Patent_Matches.Patent_Filing_Date = Date_Series_Table.day
            GROUP BY Patent_Date_YearMonth
            ORDER BY Patent_Date_YearMonth
            """, {
                    "ZZZZ": country,
                    "KKKK": keyword
                }),
        }
        return switcher.get(query, "Invalid query")

    print("Estimated query size: {} GB.".format(
        bq_assistant.estimate_query_size(query_selector(query))))

    def kenvelo(question, answer="no"):
        # Prompt the user with a yes/no question on stdin and return
        # True/False; an empty response falls back to `answer`.

        range_of_choices = {
            "yes": True,
            "y": True,
            "": True,
            "no": False,
            "n": False
        }

        if answer is None:
            prompt = "[Y/N]"

        elif answer == "yes":
            prompt = "[Y/N]"

        elif answer == "no":
            prompt = "[Y/N]"

        else:
            raise ValueError("Answer '%s' is invalid.")

        while True:
            sys.stdout.write(question + prompt)
            choice = input().lower()

            if answer is not None and choice == '':
                return range_of_choices[answer]

            elif choice in range_of_choices:
                return range_of_choices[choice]

            else:
                sys.stdout.write(
                    "Please answer any variation of 'yes' or 'no'. ")

    # NOTE(review): every branch inside this "loop" returns, so it runs at
    # most one iteration; with ask_before_running=False the condition is
    # simply True for that single pass.
    while ask_before_running is False or kenvelo("Run query? "):

        def plot_query1():
            # Bar chart of publication counts per country (log y scale),
            # saved as <my_output_path>query1.pdf.
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='country_code',
                        y='cnt',
                        data=patents.query_to_pandas_safe(
                            query_selector("query1"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query1"))))
            plt.title("Number of publications by country",
                      loc='left',
                      fontsize=24,
                      style='oblique')
            plt.ylabel('# of publications (log)', fontsize=14)
            plt.xlabel('Country', fontsize=14)
            plt.yscale('log')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query1" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plot_query2():
            # Bar chart of publications per country in the selected period,
            # saved as <my_output_path>query2.pdf.
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='country_code',
                        y='publications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query2"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query2"))))
            plt.title(
                "Number of patents published between years {} and {} by country"
                .format(start_year, end_year),
                loc='left',
                fontsize=24,
                style='oblique')
            plt.ylabel('# of publications')
            plt.xlabel('Country')
            plt.yscale('log')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query2" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plot_query3():
            # Bar chart of yearly grants to the selected country,
            # saved as <my_output_path>query3.pdf.
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='datum',
                        y='publications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query3"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query3"))))
            plt.title(
                "Number of patents granted to country {} between years {} and {}"
                .format(country, start_year, end_year),
                loc='left',
                fontsize=24,
                style='oblique')
            plt.ylabel('# of patents', fontsize=14)
            plt.xlabel('')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query3" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plot_query10():
            # Bar chart of monthly keyword-matching applications,
            # saved as <my_output_path>query10.pdf.
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='Patent_Date_YearMonth',
                        y='Number_of_Patent_Applications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query10"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query10"))))
            plt.title(
                "Number of patent applications of technology {} in country {}".
                format(keyword, country),
                loc='left',
                fontsize=24,
                style='oblique')
            plt.ylabel('# of applications', fontsize=14)
            plt.xlabel('Date', fontsize=14, rotation=45)
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query10" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plotter(query):
            # Dispatch to the plot function for `query`, if one exists.
            # NOTE(review): the `query = "queryN"` reassignments below only
            # rebind the local name and have no further effect — verify
            # whether chained plotting was intended.
            if query == "query1":
                plot_query1()
                query = "query2"

            elif query == "query2":
                plot_query2()
                query = "query3"

            elif query == "query3":
                plot_query3()
                query = "query10"

            elif query == "query10":
                plot_query10()

            else:
                print('\033[1m' +
                      "Sorry, this query doesn't output graphs (yet)." +
                      '\033[0m')

        temp_path = Path(my_output_path)

        # Run the query (capped at min(budget, estimated size) GB) and
        # dispatch on the requested output combination.
        if output_to_csv and plot:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(
                        query_selector(query)))).to_csv(
                            Path(temp_path, query + '.csv'),
                            index=False,
                            encoding="utf-8"), plotter(query)

        if output_to_csv:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(
                        query_selector(query)))).to_csv(Path(
                            temp_path, query + '.csv'),
                                                        index=False,
                                                        encoding="utf-8")

        if plot:
            return plotter(query)

        else:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(query_selector(query))))

        # NOTE(review): unreachable — every branch above returns. It also
        # passes three positional arguments to kenvelo(), which accepts at
        # most two, so it would raise TypeError even if reached.
        if kenvelo("You will use",
                   bq_assistant.estimate_query_size(query_selector(query)),
                   "GB in this query. Continue? ") is False:
            break
예제 #12
0
# Set up feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.sql.ex1 import *

# create a helper object for our bigquery dataset
import bq_helper
chicago_crime = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                         dataset_name="chicago_crime")
print("Setup Complete")

table_list = chicago_crime.list_tables(
)  # Write the code you need here to figure out the answer

num_tables = len(
    table_list)  # store the answer as num_tables and then run this cell

q_1.check()

crime_schema = chicago_crime.table_schema(
    'crime')  # Write the code to figure out the answer

# count schema fields whose type is TIMESTAMP (the boolean comparison
# sums as 0/1 per field)
num_timestamp_fields = sum(
    crime_schema.type == 'TIMESTAMP')  # put your answer here

q_2.check()

crime_table = chicago_crime.head(
    'crime'
)  # Write the code here to explore the data so you can find the answer
예제 #13
0
This is a temporary script file.
"""
"""Download NOAA GSOD daily weather summaries, one CSV file per year."""
import numpy as np
import pandas as pd
import bq_helper
from google.cloud import bigquery
from apiclient import discovery
import os

# Point the Google client libraries at the service-account key file.
# NOTE(review): the original also called
# GoogleCredentials.get_application_default() here, but GoogleCredentials
# was never imported (guaranteed NameError) and the result was never used,
# so that line has been removed.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'C:\\Users\\lukek\\Desktop\\SQL\\lschlab2weather-495aae5d3687.json'

# create a helper object for our bigquery dataset
bqh = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                               dataset_name="noaa_gsod")

# build and run a series of queries to get annual temperatures for the US
# WARNING: each year takes 5+ mins to run and the resultant dataset is about 100MB!

START_YEAR = 2009
END_YEAR = 2009

# END_YEAR is inclusive.  The original `range(START_YEAR, END_YEAR)` was an
# off-by-one that made the loop empty whenever START_YEAR == END_YEAR, so
# nothing was ever downloaded.
for year in range(START_YEAR, END_YEAR + 1):
    # Each gsod<year> table holds one calendar year of daily station data.
    query = "SELECT * FROM `bigquery-public-data.noaa_gsod.gsod{}`".format(year)

    # query_to_pandas_safe refuses to run if the scan would exceed 5 GB.
    df_wthr = bqh.query_to_pandas_safe(query, max_gb_scanned=5)
    filename = 'US_weather_{}.csv'.format(year)
    df_wthr.to_csv(filename, index=False)
    print("Saved {}".format(filename))
예제 #14
0
# Silence warning noise and keep pandas previews short.
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 10

# google-cloud client library for issuing BigQuery queries
from google.cloud import bigquery

import bq_helper

# BigQueryHelper wraps a dataset and returns query results as DataFrames.
from bq_helper import BigQueryHelper

#-------------------------------------------------------------------------

# Helper bound to the EPA historical air-quality public dataset.
bigData = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="epa_historical_air_quality")

# Show every table available in this dataset.
bigData.list_tables()

# THE FOLLOWING ANALYSIS WILL BE CARRIED OUT FOR ALL THE DISTINCT COUNTIES
# IN THE US, BASED ON CATEGORIES FOR EACH OF THE POLLUTANTS.

# Fetch the schema of each relevant daily-summary table.  Tuple unpacking
# consumes the generator left to right, so the fetch order is identical to
# the original one-assignment-per-line version.
CO, SO2, O3, NO2, PM10, PM25, PM25n = (
    bigData.table_schema(table) for table in (
        'co_daily_summary',
        'so2_daily_summary',
        'o3_daily_summary',
        'no2_daily_summary',
        'pm10_daily_summary',
        'pm25_frm_daily_summary',
        'pm25_nonfrm_daily_summary',
    ))
# Bring in the bq_helper package for lightweight BigQuery access.
import bq_helper

# Helper object bound to the public Hacker News dataset.
hacker_news = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                       dataset_name="hacker_news")

# Which tables does the dataset expose?
hacker_news.list_tables()

# Column names and types of the "full" table.
hacker_news.table_schema("full")

# Peek at the first few rows of "full".
hacker_news.head("full")

# First ten values from the "by" column only.
hacker_news.head("full", selected_columns="by", num_rows=10)

# Select the score of every row in hacker_news.full whose type is "job".
query = """SELECT score
            FROM `bigquery-public-data.hacker_news.full`
            WHERE type = "job" """

# Estimate (in GB) how much data the query would scan.
hacker_news.estimate_query_size(query)

# Only run if the scan stays under 0.1 GB.
hacker_news.query_to_pandas_safe(query, max_gb_scanned=0.1)

# Run with the default 1 GB safety cap and keep the resulting DataFrame.
job_post_scores = hacker_news.query_to_pandas_safe(query)

# Mean score across all job posts.
job_post_scores["score"].mean()

# save our dataframe as a .csv
# NOTE(review): the actual to_csv(...) call appears to have been lost in
# extraction — only this comment survives.
예제 #16
0
"""Explore the CMS Medicare public dataset via bq_helper."""
import pandas as pd
import bq_helper
from bq_helper import BigQueryHelper
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package

# Helper bound to the cms_medicare public dataset.
medicare = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                    dataset_name="cms_medicare")
# The original built a second, identical BigQueryHelper for the very same
# project/dataset; one object suffices, so keep the second name as an alias
# (both `medicare` and `bq_assistant` remain available to later code).
bq_assistant = medicare

# List the tables, then preview 15 rows of the 2015 inpatient charges.
bq_assistant.list_tables()
bq_assistant.head("inpatient_charges_2015", num_rows=15)
예제 #17
0
import bq_helper
from bq_helper import BigQueryHelper
stackOverflow = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                         dataset_name="stackoverflow")
import pandas as pd
import numpy as np

import urllib.request
from lxml import html, etree

import os
import sys
import math
sys.path.append('qtester')
from libs.my_progress_bar import MyBar
import libs.my_paths as mp
from datetime import date, timedelta


def get_init_data(begin_time: date = None,
                  end_time: date = None) -> pd.DataFrame:
    """Build the WHERE clause filtering Stack Overflow rows by creation_date.

    - both bounds given: year/month/day each constrained with BETWEEN;
    - only begin_time given: match that exact date;
    - neither given: default to October 2019.

    NOTE(review): the visible body only assembles `where_str`; the code that
    runs the query and returns the DataFrame promised by the annotation
    appears to have been lost in extraction, so the function currently
    returns None.
    """
    where_str = "WHERE EXTRACT(YEAR FROM creation_date) = 2019 AND EXTRACT(MONTH FROM creation_date) = 10"

    if begin_time is not None and end_time is not None:
        where_str = f"""WHERE (EXTRACT(YEAR FROM creation_date) BETWEEN {begin_time.year} AND {end_time.year}) AND 
        (EXTRACT(MONTH FROM creation_date) BETWEEN {begin_time.month} AND {end_time.month}) AND 
        (EXTRACT(DAY FROM creation_date) BETWEEN {begin_time.day} AND {end_time.day})"""
    elif begin_time is not None:
        # BUG FIX: the DAY predicate was missing its closing parenthesis,
        # producing invalid SQL whenever only begin_time was supplied.
        where_str = f"""WHERE (EXTRACT(YEAR FROM creation_date) = {begin_time.year}) AND 
        (EXTRACT(MONTH FROM creation_date) = {begin_time.month}) AND 
        (EXTRACT(DAY FROM creation_date) = {begin_time.day})"""
예제 #18
0
# Notebook fragment: first look at the OpenAQ air-quality dataset.
# NOTE(review): `pd` and `get_ipython()` are not defined in this fragment;
# it relies on earlier notebook cells for the pandas import / IPython shell.
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 10  # keep DataFrame previews short
get_ipython().run_line_magic('matplotlib', 'inline')

# ### Bigquery
# BigQuery is a RESTful web service that enables interactive analysis of massively large datasets working in conjunction with Google Storage. It is an Infrastructure as a Service that may be used complementarily with MapReduce.

# In[135]:

# bq_helper: Kaggle's convenience wrapper for querying BigQuery datasets
import bq_helper

# Helper object bound to the OpenAQ public dataset
openAQ = bq_helper.BigQueryHelper(active_project='bigquery-public-data',
                                  dataset_name='openaq')
# List the tables in the dataset
openAQ.list_tables()

# In[136]:

# Schema (column names and types) of the global_air_quality table
openAQ.table_schema('global_air_quality')

# ### Table display

# In[137]:

# Preview the first rows of the table
openAQ.head('global_air_quality')

# In[138]:
예제 #19
0
# Query the GitHub public dataset on BigQuery: sizes of binary files and
# the most-watched sample repositories.
import bq_helper
import os
import numpy as np
import pandas as pd
from google.cloud.bigquery.client import Client
import json
# Point the Google client at the local service-account key, then build a
# raw BigQuery client alongside the bq_helper wrapper.
os.environ['GOOGLE_APPLICATION_CREDENTIALS']='CA675-ede38ecf6679.json'
bq_client = Client()

# Helper bound to the github_repos public dataset.
github_repos = bq_helper.BigQueryHelper(active_project= "bigquery-public-data", 
                                       dataset_name = "github_repos")


# Sizes (bytes) of up to 5000 binary files in github_repos.contents.
query1= """SELECT size
            FROM `bigquery-public-data.github_repos.contents`
            WHERE binary = True
            LIMIT 5000
        """
# Dry-run estimate of GB scanned; the return value is discarded here
# (in a notebook it would simply be displayed).
github_repos.estimate_query_size(query1)
# query_to_pandas_safe aborts if the scan would exceed max_gb_scanned.
github_repo_sizes = github_repos.query_to_pandas_safe(query1, max_gb_scanned=2.34)
# Bytes-to-MB conversion factor — presumably used further down; unused in
# this visible fragment.
BYTES_PER_MB = 2**20

# The 2000 most-watched repositories from the sample_repos table.
query9 ="""
        SELECT repo_name, watch_count
        FROM `bigquery-public-data.github_repos.sample_repos`
        ORDER BY watch_count DESC 
        LIMIT 2000
        """
data=github_repos.estimate_query_size(query9)  # estimated GB for query9
github_repo_trending_repos = github_repos.query_to_pandas_safe(query9)