def read_bq_query(query, active_project="bigquery-public-data", dataset_name="openaq"):
    """Helper function to execute different queries against a BigQuery dataset."""
    # initiate the BigQuery helper object
    open_aq = bq_helper.BigQueryHelper(active_project=active_project,
                                       dataset_name=dataset_name)
    query_result = open_aq.query_to_pandas_safe(query)
    return query_result
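# Usage sketch for read_bq_query -- the query below is illustrative and
# assumes the default openaq dataset:
us_cities = read_bq_query(
    """SELECT DISTINCT city
       FROM `bigquery-public-data.openaq.global_air_quality`
       WHERE country = 'US'
    """)
print(us_cities.head())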
def load_data():
    # Read the dataset from BigQuery
    dataset = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                       dataset_name="usa_names")
    # Earlier CSV-based version, kept for reference:
    # df = pd.read_csv(
    #     "https://query.data.world/s/6joi7hjgjmwifhl2clpldwm36xmvmx")
    # df["REPORTDATETIME"] = pd.to_datetime(
    #     df["REPORTDATETIME"], infer_datetime_format=True)
    # df["Day"] = df["REPORTDATETIME"].dt.day
    # df["Month"] = df["REPORTDATETIME"].dt.month
    # df["Hour"] = df["REPORTDATETIME"].dt.hour
    return dataset
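# Example usage sketch of load_data:
usa_names = load_data()
print(usa_names.list_tables())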
# ## Example: What are all the U.S. cities in the OpenAQ dataset?
# ___
#
# Now that you've got the basics down, let's work through an example with a
# real dataset. Today we're going to be working with the OpenAQ dataset, which
# has information on air quality around the world. (The data in it should be
# current: it's updated weekly.)
#
# To help get you situated, I'm going to run through a complete query first.
# Then it will be your turn to get started running your queries!
#
# First, I'm going to set up everything we need to run queries and take a
# quick peek at what tables are in our database.

# In[ ]:

# import package with helper functions
import bq_helper

# create a helper object for this dataset
open_aq = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="openaq")

# print all the tables in this dataset (there's only one!)
open_aq.list_tables()

# I'm going to take a peek at the first couple of rows to help me see what
# sort of data is in this dataset.

# In[ ]:

# print the first couple rows of the "global_air_quality" table
open_aq.head("global_air_quality")

# Great, everything looks good! Now that I'm set up, I'm going to put together
# a query. I want to select all the values from the "city" column for the rows
# where the "country" column is "us" (for "United States").
#
# > **What's up with the triple quotation marks (""")?** These tell Python
# > that everything inside them is a single string, even though we have line
# > breaks in it. The line breaks aren't necessary, but they do make it much
# > easier to read your query.
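# In[ ]:

# The query the text describes -- a sketch matching the walkthrough: select
# every value in the "city" column where the "country" column is "us".
query = """SELECT city
           FROM `bigquery-public-data.openaq.global_air_quality`
           WHERE country = 'us'
        """
us_cities = open_aq.query_to_pandas_safe(query)
us_cities.city.value_counts().head()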
# Your code goes here :)

# import package with helper functions
import bq_helper

# create a helper object for this dataset
accidents = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                     dataset_name="nhtsa_traffic_fatalities")

# print a couple of rows from the table accident_2015
accidents.head("accident_2015")

# query for the first question
query_for_first_question = """SELECT COUNT(consecutive_number),
                                     EXTRACT(HOUR FROM timestamp_of_crash)
                              FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2015`
                              GROUP BY EXTRACT(HOUR FROM timestamp_of_crash)
                              ORDER BY COUNT(consecutive_number) DESC
                           """

# run the query
accidents_by_hour = accidents.query_to_pandas_safe(query_for_first_question)

# print the dataframe
accidents_by_hour

# library for plotting
import matplotlib.pyplot as plt

# make a plot to show that our data is, actually, sorted:
plt.plot(accidents_by_hour.f0_)
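# A hedged variant of the same query with column aliases, so the result comes
# back with readable names instead of the auto-generated f0_ / f1_:
query_with_aliases = """SELECT COUNT(consecutive_number) AS num_accidents,
                               EXTRACT(HOUR FROM timestamp_of_crash) AS hour
                        FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2015`
                        GROUP BY hour
                        ORDER BY num_accidents DESC
                     """
accidents_by_hour_named = accidents.query_to_pandas_safe(query_with_aliases)
plt.plot(accidents_by_hour_named.num_accidents)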
# Set up feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.sql.ex5 import *

# import package with helper functions
import bq_helper

# create a helper object for this dataset
chicago_taxi_helper = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                               dataset_name="chicago_taxi_trips")

# Your code here to find the table name
chicago_taxi_helper.list_tables()

# write the table name as a string below
table_name = 'taxi_trips'

q_1.check()

# your code here
chicago_taxi_helper.head(table_name)

rides_per_year_query = """
                       SELECT EXTRACT(year FROM trip_start_timestamp) AS year,
                              COUNT(1) AS num_trips
                       FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                       GROUP BY year
                       ORDER BY year
                       """

rides_per_year_result = chicago_taxi_helper.query_to_pandas_safe(rides_per_year_query)
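# Quick visual check of the result -- a sketch assuming matplotlib is available:
import matplotlib.pyplot as plt
plt.bar(rides_per_year_result.year, rides_per_year_result.num_trips)
plt.xlabel("Year")
plt.ylabel("Number of taxi trips")
plt.show()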
from google.cloud import bigquery
import bq_helper
import os

# replace with path to JSON file containing Google Service Account private key
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \
    "/Users/jason.wang/Documents/Analytics Projects/EDA-Google-PK.json"

names = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                 dataset_name="usa_names")

query = """SELECT year, gender, name, SUM(number) AS number
           FROM `bigquery-public-data.usa_names.usa_1910_current`
           GROUP BY year, gender, name
        """

agg_names = names.query_to_pandas_safe(query)
agg_names.to_csv("names.csv")
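# Optional sanity check (a sketch): read the export back and confirm it has
# the expected columns and row count.
import pandas as pd
check = pd.read_csv("names.csv")
print(check.columns.tolist(), len(check))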
# Set up feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.sql.ex4 import *

# import package with helper functions
import bq_helper

# create a helper object for this dataset
education_data = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                          dataset_name="world_bank_intl_education")

education_data.head('international_education')

# Your Code Here
country_spend_pct_query = """
                          SELECT country_name, AVG(value) AS avg_ed_spending_pct
                          FROM `bigquery-public-data.world_bank_intl_education.international_education`
                          WHERE year >= 2010 AND year <= 2017
                                AND indicator_code = 'SE.XPD.TOTL.GD.ZS'
                          GROUP BY country_name
                          ORDER BY AVG(value) DESC
                          """

country_spending_results = education_data.query_to_pandas_safe(country_spend_pct_query)
print(country_spending_results.head())

q_1.check()
with open(file, 'w') as f:
    f.write(html_str)

string = """<div id='html_string2'><iframe src='""" + \
         file + """' height=""" + str(height + 20) + \
         """ width=""" + str(width + 20) + """></iframe></div>"""
display(HTML(string))

# Commented out IPython magic to ensure Python compatibility.
# %run numpy_sage.py

dhtml('Example #1')

import bq_helper
open_aq = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="openaq")
open_aq.head("global_air_quality", 3).T

my_query = """
           SELECT country, pollutant, AVG(value) AS avg_value
           FROM `bigquery-public-data.openaq.global_air_quality`
           WHERE unit = 'µg/m³'
           GROUP BY country, pollutant
           ORDER BY country, pollutant
           """
pollutants = open_aq.query_to_pandas_safe(my_query)
pollutants.head(10).T

import pandas as pd, pylab as pl
df_pollutants = pd.DataFrame(index=list(set(pollutants['country'])),
                             columns=list(set(pollutants['pollutant'])))
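# An alternative sketch for building the same country x pollutant grid in one
# step, using pandas' pivot_table instead of filling an empty frame:
df_pollutants = pollutants.pivot_table(index='country',
                                       columns='pollutant',
                                       values='avg_value')
df_pollutants.head().T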
# coding: utf-8

# # SQL Scavenger Hunt: Day 5
# ## Example: How many files are covered by each license?

# In[ ]:

import bq_helper

# In[ ]:

github_repos = bq_helper.BigQueryHelper(active_project='bigquery-public-data',
                                        dataset_name='github_repos')

# In[ ]:

query = """
        SELECT licenses.license, COUNT(files.id) AS num_files
        FROM `bigquery-public-data.github_repos.sample_files` AS files
        JOIN `bigquery-public-data.github_repos.licenses` AS licenses -- JOIN is the same as INNER JOIN
            ON licenses.repo_name = files.repo_name
        GROUP BY license
        ORDER BY num_files DESC
        """
license_counts = github_repos.query_to_pandas_safe(query, max_gb_scanned=10)

# In[ ]:

license_counts.shape
import bq_helper

baseball = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                    dataset_name="baseball")

col = [
    'gameId', 'startTime', 'gameStatus', 'attendance', 'dayNight',
    'durationMinutes', 'awayTeamName', 'homeTeamName', 'venueName',
    'venueSurface', 'venueCity', 'venueState', 'homeFinalRuns',
    'homeFinalHits', 'homeFinalErrors', 'awayFinalRuns', 'awayFinalHits',
    'awayFinalErrors', 'inningNumber', 'inningHalf', 'description',
    'atBatEventSequenceNumber', 'outcomeDescription', 'hitterLastName',
    'hitterBatHand', 'pitcherLastName', 'pitcherThrowHand',
    'pitchTypeDescription', 'pitchSpeed', 'balls', 'strikes', 'outs'
]

# build the comma-separated column list for the SELECT clause
col_field = ','.join(col)
print(col_field)

query = """SELECT %s
           FROM `bigquery-public-data.baseball.games_wide`
        """

games = baseball.query_to_pandas_safe(query % col_field)
print(games)
games.to_csv('new_games.csv', mode='w')
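# A hedged pre-flight check: estimate how much data (in GB) the query above
# scans, mirroring the guard query_to_pandas_safe applies internally.
print(baseball.estimate_query_size(query % col_field))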
def GLquery(query="query1",
            country="US",
            start_year="1790",
            end_year="2019",
            min_count="100",
            kind_code="B2",
            cpc_code="A63F",
            assignee="President and Fellows of Harvard College",
            keyword="internet of things",
            budget=1000,
            output_to_csv=False,
            plot=False,
            ask_before_running=False):
    '''
    ///////////////////////////////////////////////////////////////////////////

    FUNCTION PARAMETERS

    GLquery takes the following arguments:

    - query (string): choose among one of the options in the next section
      (see below) (default is "query1")
    - country (string) = format "LL" (default is "US")
    - start_year (string) = format "YYYY" (default is "1790")
    - end_year (string) = format "YYYY" (default is "2019")
    - min_count (string) = minimum threshold (default is "100")
    - kind_code (string) = format "LN", see kind codes here:
      https://www.cas.org/support/documentation/references/patkind
      (default is "B2")
    - cpc_code (string) = format "LNNL", see CPC codes here:
      https://www.uspto.gov/web/patents/classification/cpc/html/cpc.html
      (default is "A63F")
    - assignee (string) = any format (case sensitive)
      (default is "President and Fellows of Harvard College")
    - keyword (string) = any format (case sensitive)
      (default is "internet of things")
    - budget (number) = any number (default is 1000 GB)
      (queries above value 'budget' will not run; queries below 'budget' will
      use only the minimum amount of memory necessary to run the query, not
      the full value of 'budget')
    - output_to_csv (True or False) = output results as a .csv file to your
      current working directory (default is False)
    - plot (True or False) = plot results as a .pdf file to your current
      working directory (default is False) (implemented for queries 1, 2, 3
      and 10)
    - ask_before_running (True or False) = given a query size (in GB), asks
      for user input before running the query (default is False)

    ///////////////////////////////////////////////////////////////////////////

    QUERIES

    As of June/2019, GLquery can perform the following queries:

    - query1: Number of patent applications by country (takes no inputs)
    - query2: Number of patents published between years X and Y by country
      (necessary arguments: start_year and end_year)
    - query3: Number of patents published to country Z between years X and Y
      (necessary arguments: start_year, end_year and country)
    - query4: Which patents country Z published between years X and Y?
      (necessary arguments: start_year, end_year, country, kind_code)
    - query5: Most common patenting technology areas by year (takes no inputs)
    - query6: Most common patenting technology areas in country Z between
      years X and Y (necessary arguments: start_year, end_year and country)
    - query7: Inventors with over N patents by country
      (necessary arguments: min_count)
    - query8: Patent landscaping of technology T between years X and Y
      (necessary arguments: start_year, end_year, min_count and cpc_code)
      (this query returns patents, firms and countries associated with
      technology T)
    - query9: Which firms is assignee A citing in their patents?
      (necessary arguments: assignee)
    - query10: Number of patent applications with keyword K in country Z
      (necessary arguments: country and keyword)

    ///////////////////////////////////////////////////////////////////////////
    '''
    global my_output_path

    patents = bq_helper.BigQueryHelper(active_project="patents-public-data",
                                       dataset_name="patents")
    bq_assistant = BigQueryHelper("patents-public-data", "patents")

    print("Country selected: {}.".format(country))
    print("Period selected: from {} to {}.".format(start_year, end_year))

    def replace(string, substitutions):
        # substitute every placeholder (XXXX, YYYY, ...) in a single regex pass
        substrings = sorted(substitutions, key=len, reverse=True)
        regex = re.compile('|'.join(map(re.escape, substrings)))
        return regex.sub(lambda match: substitutions[match.group(0)], string)

    def query_selector(query):
        switcher = {
            "query1": """
                -- "Number of publications by country"
                SELECT COUNT(*) AS cnt, country_code
                FROM (
                    SELECT ANY_VALUE(country_code) AS country_code
                    FROM `patents-public-data.patents.publications`
                    GROUP BY application_number)
                GROUP BY country_code
                ORDER BY cnt DESC
                """,
            "query2": replace("""
                -- "Number of patents published between years X and Y by country"
                SELECT country_code, COUNT(DISTINCT publication_number) AS publications
                FROM `patents-public-data.patents.publications`
                WHERE publication_date >= XXXX0000 AND publication_date < YYYY0000
                    AND application_kind = 'A'
                GROUP BY country_code
                ORDER BY publications DESC
                """, {"XXXX": start_year, "YYYY": end_year}),
            "query3": replace("""
                -- "Number of patents granted to country Z between years X and Y"
                SELECT FLOOR(publication_date/10000) AS datum,
                       COUNT(DISTINCT publication_number) AS publications
                FROM `patents-public-data.patents.publications`
                WHERE country_code = 'ZZZZ'
                    AND publication_date >= XXXX0000 AND publication_date <= YYYY0000
                    AND application_kind = 'A'
                GROUP BY datum, application_kind
                ORDER BY application_kind, datum
                """, {"XXXX": start_year, "YYYY": end_year, "ZZZZ": country}),
            "query4": replace("""
                -- "Which patents country Z published between years X and Y"
                SELECT publication_number
                FROM `patents-public-data.patents.publications`
                WHERE country_code = 'ZZZZ'
                    AND publication_date >= XXXX0000 AND publication_date <= YYYY0000
                    AND application_kind = 'A'
                    AND kind_code = 'BBBB'
                """, {"XXXX": start_year, "YYYY": end_year, "ZZZZ": country,
                      "BBBB": kind_code}),
            "query5": """
                -- "Most common patenting technology areas by year"
                CREATE TEMPORARY FUNCTION highest_moving_avg(yearcnt ARRAY<STRUCT<filing_year INT64, cnt INT64>>)
                RETURNS STRUCT<filing_year INT64, avg INT64>
                LANGUAGE js AS \"""
                    let avg = 0;
                    let a = 1.0;
                    let highest = {filing_year: -1, avg: -1};
                    for (let x of yearcnt) {
                        avg = a * x.cnt + (1 - a) * avg;
                        if (avg > highest.avg) {
                            highest = {filing_year: x.filing_year, avg: avg};}
                    }
                    return highest;
                \""";
                WITH patent_cpcs AS (
                    SELECT cd.parents,
                           CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year
                    FROM (
                        SELECT ANY_VALUE(cpc) AS cpc, ANY_VALUE(filing_date) AS filing_date
                        FROM `patents-public-data.patents.publications`
                        WHERE application_number != ""
                        GROUP BY application_number),
                        UNNEST(cpc) AS cpcs
                    JOIN `patents-public-data.cpc.definition` cd ON cd.symbol = cpcs.code
                    WHERE cpcs.first = TRUE AND filing_date > 0)
                SELECT c.title_full, cpc_group, best_year.*
                FROM (
                    SELECT cpc_group,
                           highest_moving_avg(ARRAY_AGG(STRUCT<filing_year INT64, cnt INT64>(filing_year, cnt) ORDER BY filing_year ASC)) AS best_year
                    FROM (
                        SELECT cpc_group, filing_year, COUNT(*) AS cnt
                        FROM (
                            SELECT cpc_parent AS cpc_group, filing_year
                            FROM patent_cpcs, UNNEST(parents) AS cpc_parent)
                        GROUP BY cpc_group, filing_year
                        ORDER BY filing_year DESC, cnt DESC)
                    GROUP BY cpc_group)
                JOIN `patents-public-data.cpc.definition` c ON cpc_group = c.symbol
                WHERE c.level = 5
                ORDER BY best_year.filing_year ASC;
                """,
            "query6": replace("""
                -- "Most common patenting technology areas in country Z between years X and Y"
                CREATE TEMPORARY FUNCTION highest_moving_avg(yearcnt ARRAY<STRUCT<filing_year INT64, cnt INT64>>)
                RETURNS STRUCT<filing_year INT64, avg INT64>
                LANGUAGE js AS \"""
                    let avg = 0;
                    let a = 1.0;
                    let highest = {filing_year: -1, avg: -1};
                    for (let x of yearcnt) {
                        avg = a * x.cnt + (1 - a) * avg;
                        if (avg > highest.avg) {
                            highest = {filing_year: x.filing_year, avg: avg};}
                    }
                    return highest;
                \""";
                WITH patent_cpcs AS (
                    SELECT cd.parents,
                           CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year
                    FROM (
                        SELECT ANY_VALUE(cpc) AS cpc, ANY_VALUE(filing_date) AS filing_date
                        FROM `patents-public-data.patents.publications`
                        WHERE application_number != ""
                            AND country_code = 'ZZZZ'
                            AND grant_date >= XXXX0000 AND grant_date <= YYYY0000
                        GROUP BY application_number),
                        UNNEST(cpc) AS cpcs
                    JOIN `patents-public-data.cpc.definition` cd ON cd.symbol = cpcs.code
                    WHERE cpcs.first = TRUE AND filing_date > 0)
                SELECT c.title_full, cpc_group, best_year.*
                FROM (
                    SELECT cpc_group,
                           highest_moving_avg(ARRAY_AGG(STRUCT<filing_year INT64, cnt INT64>(filing_year, cnt) ORDER BY filing_year ASC)) AS best_year
                    FROM (
                        SELECT cpc_group, filing_year, COUNT(*) AS cnt
                        FROM (
                            SELECT cpc_parent AS cpc_group, filing_year
                            FROM patent_cpcs, UNNEST(parents) AS cpc_parent)
                        GROUP BY cpc_group, filing_year
                        ORDER BY filing_year DESC, cnt DESC)
                    GROUP BY cpc_group)
                JOIN `patents-public-data.cpc.definition` c ON cpc_group = c.symbol
                WHERE c.level = 5
                ORDER BY best_year.filing_year ASC;
                """, {"XXXX": start_year, "YYYY": end_year, "ZZZZ": country}),
            "query7": replace("""
                -- "Inventors with over N patents by country"
                WITH temp1 AS (
                    SELECT DISTINCT PUB.country_code,
                           PUB.application_number AS patent_number,
                           inventor_name
                    FROM `patents-public-data.patents.publications` PUB
                    CROSS JOIN UNNEST(PUB.inventor) AS inventor_name
                    WHERE PUB.grant_date >= 17900000
                        AND PUB.country_code IS NOT NULL
                        AND PUB.application_number IS NOT NULL
                        AND PUB.inventor IS NOT NULL)
                SELECT *
                FROM (
                    SELECT temp1.country_code AS country,
                           temp1.inventor_name AS inventor,
                           COUNT(temp1.patent_number) AS count_of_patents
                    FROM temp1
                    GROUP BY temp1.country_code, temp1.inventor_name)
                WHERE count_of_patents >= NNNN
                """, {"NNNN": min_count}),
            "query8": replace("""
                -- "Patent landscaping of technology T between years X and Y"
                SELECT SUM(year_cnt) AS total_count, assignee_name,
                       ARRAY_AGG(STRUCT<cnt INT64, filing_year INT64, countries STRING>(year_cnt, filing_year, countries) ORDER BY year_cnt DESC LIMIT 1)[SAFE_ORDINAL(1)] AS Number_of_patents_under_this_CPC_code_Peak_year_Top_countries
                FROM (
                    SELECT SUM(year_country_cnt) AS year_cnt, assignee_name, filing_year,
                           STRING_AGG(country_code ORDER BY year_country_cnt DESC LIMIT 5) AS countries
                    FROM (
                        SELECT COUNT(*) AS year_country_cnt, a.name AS assignee_name,
                               CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year,
                               apps.country_code
                        FROM (
                            SELECT ANY_VALUE(assignee_harmonized) AS assignee_harmonized,
                                   ANY_VALUE(filing_date) AS filing_date,
                                   ANY_VALUE(country_code) AS country_code
                            FROM `patents-public-data.patents.publications` AS pubs
                            WHERE (SELECT MAX(TRUE) FROM UNNEST(pubs.cpc) AS c
                                   WHERE REGEXP_CONTAINS(c.code, "TTTT"))
                                AND publication_date >= XXXX0000
                                AND publication_date <= YYYY0000
                            GROUP BY application_number) AS apps,
                            UNNEST(assignee_harmonized) AS a
                        WHERE filing_date > 0
                        GROUP BY a.name, filing_year, country_code)
                    GROUP BY assignee_name, filing_year)
                GROUP BY assignee_name
                ORDER BY total_count DESC
                LIMIT NNNN
                """, {"XXXX": start_year, "YYYY": end_year, "NNNN": min_count,
                      "TTTT": cpc_code}),
            "query9": replace("""
                -- "Which firms is assignee A citing in their patents?"
                SELECT citing_assignee, COUNT(*) AS num_cites, citing_cpc_subclass,
                       cpcdef.title_full AS citing_cpc_title
                FROM (
                    SELECT pubs.publication_number AS citing_publication_number,
                           cite.publication_number AS cited_publication_number,
                           citing_assignee_s.name AS citing_assignee,
                           SUBSTR(cpcs.code, 0, 4) AS citing_cpc_subclass
                    FROM `patents-public-data.patents.publications` AS pubs,
                         UNNEST(citation) AS cite,
                         UNNEST(assignee_harmonized) AS citing_assignee_s,
                         UNNEST(cpc) AS cpcs
                    WHERE cpcs.first = TRUE) AS pubs
                JOIN (
                    SELECT publication_number AS cited_publication_number,
                           cited_assignee_s.name AS cited_assignee
                    FROM `patents-public-data.patents.publications`,
                         UNNEST(assignee_harmonized) AS cited_assignee_s) AS refs
                    ON pubs.cited_publication_number = refs.cited_publication_number
                JOIN `patents-public-data.cpc.definition` AS cpcdef
                    ON cpcdef.symbol = citing_cpc_subclass
                WHERE cited_assignee = "AAAA" AND citing_assignee != "AAAA"
                GROUP BY cited_assignee, citing_assignee, citing_cpc_subclass,
                         cpcdef.title_full
                ORDER BY num_cites DESC
                """, {"AAAA": assignee}),
            "query10": replace("""
                -- Number of patent applications with keyword K in country Z
                WITH Patent_Matches AS (
                    SELECT PARSE_DATE('%Y%m%d', SAFE_CAST(ANY_VALUE(patentsdb.filing_date) AS STRING)) AS Patent_Filing_Date,
                           patentsdb.application_number AS Patent_Application_Number,
                           ANY_VALUE(abstract_info.text) AS Patent_Title,
                           ANY_VALUE(abstract_info.language) AS Patent_Title_Language
                    FROM `patents-public-data.patents.publications` AS patentsdb,
                         UNNEST(abstract_localized) AS abstract_info
                    WHERE LOWER(abstract_info.text) LIKE '%KKKK%'
                        AND patentsdb.country_code = 'ZZZZ'
                    GROUP BY Patent_Application_Number),
                Date_Series_Table AS (
                    SELECT day, 0 AS Number_of_Patents
                    FROM UNNEST(GENERATE_DATE_ARRAY(
                        (SELECT MIN(Patent_Filing_Date) FROM Patent_Matches),
                        (SELECT MAX(Patent_Filing_Date) FROM Patent_Matches))) AS day)
                SELECT SAFE_CAST(FORMAT_DATE('%Y-%m', Date_Series_Table.day) AS STRING) AS Patent_Date_YearMonth,
                       COUNT(Patent_Matches.Patent_Application_Number) AS Number_of_Patent_Applications
                FROM Patent_Matches
                RIGHT JOIN Date_Series_Table
                    ON Patent_Matches.Patent_Filing_Date = Date_Series_Table.day
                GROUP BY Patent_Date_YearMonth
                ORDER BY Patent_Date_YearMonth
                """, {"ZZZZ": country, "KKKK": keyword}),
        }
        return switcher.get(query, "Invalid query")

    print("Estimated query size: {} GB.".format(
        bq_assistant.estimate_query_size(query_selector(query))))

    def kenvelo(question, answer="no"):
        # simple yes/no prompt; an empty response falls back to 'answer'
        range_of_choices = {"yes": True, "y": True, "": True,
                            "no": False, "n": False}
        if answer is None or answer in ("yes", "no"):
            prompt = "[Y/N]"
        else:
            raise ValueError("Answer '%s' is invalid." % answer)
        while True:
            sys.stdout.write(question + prompt)
            choice = input().lower()
            if answer is not None and choice == '':
                return range_of_choices[answer]
            elif choice in range_of_choices:
                return range_of_choices[choice]
            else:
                sys.stdout.write("Please answer any variation of 'yes' or 'no'. ")

    while ask_before_running is False or kenvelo("Run query? "):

        def plot_query1():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='country_code', y='cnt',
                        data=patents.query_to_pandas_safe(
                            query_selector("query1"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query1"))))
            plt.title("Number of publications by country",
                      loc='left', fontsize=24, style='oblique')
            plt.ylabel('# of publications (log)', fontsize=14)
            plt.xlabel('Country', fontsize=14)
            plt.yscale('log')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query1" + '.pdf',
                        orientation='landscape', bbox_inches='tight')
            plt.show()

        def plot_query2():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='country_code', y='publications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query2"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query2"))))
            plt.title("Number of patents published between years {} and {} by country"
                      .format(start_year, end_year),
                      loc='left', fontsize=24, style='oblique')
            plt.ylabel('# of publications')
            plt.xlabel('Country')
            plt.yscale('log')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query2" + '.pdf',
                        orientation='landscape', bbox_inches='tight')
            plt.show()

        def plot_query3():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='datum', y='publications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query3"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query3"))))
            plt.title("Number of patents granted to country {} between years {} and {}"
                      .format(country, start_year, end_year),
                      loc='left', fontsize=24, style='oblique')
            plt.ylabel('# of patents', fontsize=14)
            plt.xlabel('')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query3" + '.pdf',
                        orientation='landscape', bbox_inches='tight')
            plt.show()

        def plot_query10():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='Patent_Date_YearMonth', y='Number_of_Patent_Applications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query10"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query10"))))
            plt.title("Number of patent applications of technology {} in country {}"
                      .format(keyword, country),
                      loc='left', fontsize=24, style='oblique')
            plt.ylabel('# of applications', fontsize=14)
            plt.xlabel('Date', fontsize=14, rotation=45)
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query10" + '.pdf',
                        orientation='landscape', bbox_inches='tight')
            plt.show()

        def plotter(query):
            if query == "query1":
                plot_query1()
            elif query == "query2":
                plot_query2()
            elif query == "query3":
                plot_query3()
            elif query == "query10":
                plot_query10()
            else:
                print('\033[1m' + "Sorry, this query doesn't output graphs (yet)."
                      + '\033[0m')

        temp_path = Path(my_output_path)

        if output_to_csv and plot:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(budget, bq_assistant.estimate_query_size(
                    query_selector(query)))).to_csv(
                        Path(temp_path, query + '.csv'),
                        index=False, encoding="utf-8"), plotter(query)
        if output_to_csv:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(budget, bq_assistant.estimate_query_size(
                    query_selector(query)))).to_csv(
                        Path(temp_path, query + '.csv'),
                        index=False, encoding="utf-8")
        if plot:
            return plotter(query)
        else:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(budget, bq_assistant.estimate_query_size(
                    query_selector(query))))

        if kenvelo("You will use {} GB in this query. Continue? ".format(
                bq_assistant.estimate_query_size(query_selector(query)))) is False:
            break
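# Example invocation sketch. This assumes the module-level imports GLquery
# relies on (re, sys, Path, bq_helper, BigQueryHelper, seaborn, matplotlib)
# and the my_output_path global are already in place:
result = GLquery(query="query3", country="US",
                 start_year="2000", end_year="2010")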
# Set up feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.sql.ex1 import *

# create a helper object for our bigquery dataset
import bq_helper
chicago_crime = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                         dataset_name="chicago_crime")
print("Setup Complete")

# Write the code you need here to figure out the answer
table_list = chicago_crime.list_tables()
# store the answer as num_tables and then run this cell
num_tables = len(table_list)
q_1.check()

# Write the code to figure out the answer
crime_schema = chicago_crime.table_schema('crime')
# put your answer here
num_timestamp_fields = sum(crime_schema.type == 'TIMESTAMP')
q_2.check()

# Write the code here to explore the data so you can find the answer
crime_table = chicago_crime.head('crime')
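# A hedged follow-up: list the TIMESTAMP columns by name to sanity-check
# num_timestamp_fields (assumes table_schema returned a dataframe with
# 'name' and 'type' columns, as the comparison above implies).
print(crime_schema.loc[crime_schema.type == 'TIMESTAMP', 'name'])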
This is a temporary script file.
"""

import numpy as np
import pandas as pd
import bq_helper
from google.cloud import bigquery
from apiclient import discovery
from oauth2client.client import GoogleCredentials
import os

# point at the service-account key before requesting default credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \
    'C:\\Users\\lukek\\Desktop\\SQL\\lschlab2weather-495aae5d3687.json'
credentials = GoogleCredentials.get_application_default()

# create a helper object for our bigquery dataset
bqh = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                               dataset_name="noaa_gsod")

# build and run a series of queries to get annual temperatures for the US
# WARNING: each year takes 5+ mins to run and the resultant dataset is about 100MB!
START_YEAR = 2009
END_YEAR = 2009
for year in range(START_YEAR, END_YEAR + 1):  # +1 so END_YEAR itself is included
    query = "SELECT * FROM `bigquery-public-data.noaa_gsod.gsod{}`".format(year)
    df_wthr = bqh.query_to_pandas_safe(query, max_gb_scanned=5)
    filename = 'US_weather_{}.csv'.format(year)
    df_wthr.to_csv(filename, index=False)
    print("Saved {}".format(filename))
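# A sketch for combining the per-year files after the loop (assumes the CSVs
# above were written to the working directory):
import glob
df_all = pd.concat((pd.read_csv(f) for f in glob.glob('US_weather_*.csv')),
                   ignore_index=True)
print(df_all.shape)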
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 10

# google bigquery library for querying data
from google.cloud import bigquery
import bq_helper
# BigQueryHelper for converting query results directly to a dataframe
from bq_helper import BigQueryHelper
# -------------------------------------------------------------------------

# create a helper object for this dataset
bigData = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="epa_historical_air_quality")

# print all the tables in this dataset
bigData.list_tables()

# THE FOLLOWING ANALYSIS WILL BE CARRIED OUT FOR ALL THE DISTINCT COUNTIES IN
# THE US, BASED ON CATEGORIES FOR EACH OF THE POLLUTANTS

# create schema of all the relevant tables available in the dataset
CO = bigData.table_schema('co_daily_summary')
SO2 = bigData.table_schema('so2_daily_summary')
O3 = bigData.table_schema('o3_daily_summary')
NO2 = bigData.table_schema('no2_daily_summary')
PM10 = bigData.table_schema('pm10_daily_summary')
PM25 = bigData.table_schema('pm25_frm_daily_summary')
PM25n = bigData.table_schema('pm25_nonfrm_daily_summary')
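# A hedged example query against one of the tables above; county_name and
# arithmetic_mean follow the EPA daily-summary schema, but verify them against
# the table_schema output before relying on this sketch.
co_query = """SELECT county_name, AVG(arithmetic_mean) AS avg_co
              FROM `bigquery-public-data.epa_historical_air_quality.co_daily_summary`
              GROUP BY county_name
              ORDER BY avg_co DESC
           """
co_by_county = bigData.query_to_pandas_safe(co_query, max_gb_scanned=5)
print(co_by_county.head())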
# import our bq_helper package
import bq_helper

# create a helper object for our bigquery dataset
hacker_news = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                       dataset_name="hacker_news")

# print a list of all the tables in the hacker_news dataset
hacker_news.list_tables()

# print information on all the columns in the "full" table
# in the hacker_news dataset
hacker_news.table_schema("full")

# preview the first couple lines of the "full" table
hacker_news.head("full")

# preview the first ten entries in the by column of the full table
hacker_news.head("full", selected_columns="by", num_rows=10)

# this query looks in the full table in the hacker_news
# dataset, then gets the score column from every row where
# the type column has "job" in it.
query = """SELECT score
           FROM `bigquery-public-data.hacker_news.full`
           WHERE type = "job"
        """

# check how big this query will be
hacker_news.estimate_query_size(query)

# only run this query if it's less than 100 MB
hacker_news.query_to_pandas_safe(query, max_gb_scanned=0.1)

# check out the scores of job postings (if the
# query is smaller than 1 gig)
job_post_scores = hacker_news.query_to_pandas_safe(query)

# average score for job posts
job_post_scores.score.mean()

# save our dataframe as a .csv
job_post_scores.to_csv("job_post_scores.csv")
import pandas as pd
import bq_helper
from bq_helper import BigQueryHelper
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package

medicare = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                    dataset_name="cms_medicare")
bq_assistant = BigQueryHelper("bigquery-public-data", "cms_medicare")
bq_assistant.list_tables()

bq_assistant.head("inpatient_charges_2015", num_rows=15)
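# A small follow-up sketch: average covered charges by state from the table
# previewed above. The column names are assumptions based on the cms_medicare
# inpatient schema; check them against the head() output first.
query = """SELECT provider_state, AVG(average_covered_charges) AS avg_covered
           FROM `bigquery-public-data.cms_medicare.inpatient_charges_2015`
           GROUP BY provider_state
           ORDER BY avg_covered DESC
        """
state_charges = medicare.query_to_pandas_safe(query)
print(state_charges.head())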
import bq_helper
from bq_helper import BigQueryHelper

stackOverflow = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                         dataset_name="stackoverflow")

import pandas as pd
import numpy as np
import urllib.request
from lxml import html, etree
import os
import sys
import math

sys.path.append('qtester')

from libs.my_progress_bar import MyBar
import libs.my_paths as mp

from datetime import date, timedelta


def get_init_data(begin_time: date = None, end_time: date = None) -> pd.DataFrame:
    where_str = ("WHERE EXTRACT(YEAR FROM creation_date) = 2019 "
                 "AND EXTRACT(MONTH FROM creation_date) = 10")
    if begin_time is not None and end_time is not None:
        where_str = f"""WHERE (EXTRACT(YEAR FROM creation_date)
                            BETWEEN {begin_time.year} AND {end_time.year})
                        AND (EXTRACT(MONTH FROM creation_date)
                            BETWEEN {begin_time.month} AND {end_time.month})
                        AND (EXTRACT(DAY FROM creation_date)
                            BETWEEN {begin_time.day} AND {end_time.day})"""
    elif begin_time is not None:
        where_str = f"""WHERE (EXTRACT(YEAR FROM creation_date) = {begin_time.year})
                        AND (EXTRACT(MONTH FROM creation_date) = {begin_time.month})
                        AND (EXTRACT(DAY FROM creation_date) = {begin_time.day})"""
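    # A hedged completion of the function: the table and columns below are
    # assumptions about what the rest of this script queries, chosen from the
    # public stackoverflow dataset.
    query = f"""SELECT id, title, creation_date
                FROM `bigquery-public-data.stackoverflow.posts_questions`
                {where_str}
             """
    return stackOverflow.query_to_pandas_safe(query, max_gb_scanned=10)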
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 10
get_ipython().run_line_magic('matplotlib', 'inline')

# ### Bigquery
# BigQuery is a RESTful web service that enables interactive analysis of
# massively large datasets working in conjunction with Google Storage. It is
# an Infrastructure as a Service that may be used complementarily with
# MapReduce.

# In[135]:

# Customized query helper function used exclusively in Kaggle
import bq_helper
# Helper object
openAQ = bq_helper.BigQueryHelper(active_project='bigquery-public-data',
                                  dataset_name='openaq')
# List of tables
openAQ.list_tables()

# In[136]:

# Schema
openAQ.table_schema('global_air_quality')

# ### Table display

# In[137]:

openAQ.head('global_air_quality')

# In[138]:
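# A plausible next cell (a sketch): average pollutant value by country, using
# the table previewed above.
query = """SELECT country, AVG(value) AS avg_value
           FROM `bigquery-public-data.openaq.global_air_quality`
           GROUP BY country
        """
country_avg = openAQ.query_to_pandas_safe(query)
country_avg.head()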
import bq_helper
import os
import numpy as np
import pandas as pd
from google.cloud.bigquery.client import Client
import json

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'CA675-ede38ecf6679.json'
bq_client = Client()

github_repos = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                        dataset_name="github_repos")

query1 = """SELECT size
            FROM `bigquery-public-data.github_repos.contents`
            WHERE binary = True
            LIMIT 5000
         """
github_repos.estimate_query_size(query1)
github_repo_sizes = github_repos.query_to_pandas_safe(query1, max_gb_scanned=2.34)

BYTES_PER_MB = 2**20

query9 = """
         SELECT repo_name, watch_count
         FROM `bigquery-public-data.github_repos.sample_repos`
         ORDER BY watch_count DESC
         LIMIT 2000
         """
data = github_repos.estimate_query_size(query9)
github_repo_trending_repos = github_repos.query_to_pandas_safe(query9)
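# Quick look at the results -- a sketch assuming both queries returned data:
print(github_repo_sizes['size'].mean() / BYTES_PER_MB)  # mean binary-file size in MB
print(github_repo_trending_repos.head())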