def main():
    noaa_goes = BigQueryHelper(active_project="bigquery-public-data",
                               dataset_name="noaa_goes16")
    noaa_goes.list_tables()

    noaa_goes.table_schema('abi_l1b_radiance')
    print(noaa_goes.head("abi_l1b_radiance", num_rows=10))

    query = """
    SELECT dataset_name, platform_id, scene_id FROM `bigquery-public-data.noaa_goes16.abi_l1b_radiance` WHERE geospatial_westbound_longitude<120 and geospatial_eastbound_longitude>75 and geospatial_northbound_latitude<50 and geospatial_southbound_latitude>30
    """

    print("Query size in GB is %f " % noaa_goes.estimate_query_size(query))
Esempio n. 2
0
class TestBQHelper(unittest.TestCase):
    def setUp(self):
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Query randomized so it won't hit the cache across multiple test runs
        self.randomizable_query = """
            SELECT value FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE value = {0}"""

    def test_list_tables(self):
        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])

    def test_list_schema(self):
        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')),
                         11)

    def test_estimate_query_size(self):
        self.assertIsInstance(self.my_bq.estimate_query_size(self.query),
                              float)

    def test_query_to_pandas(self):
        self.assertIsInstance(self.my_bq.query_to_pandas(self.query),
                              DataFrame)

    def test_query_safe_passes(self):
        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query),
                              DataFrame)

    def test_query_safe_fails(self):
        # Different query must be used for this test to ensure we don't hit the
        # cache and end up passing by testing a query that would use zero bytes.
        fail_query = self.randomizable_query.format(random())
        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))

    def test_head(self):
        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)

    def test_useage_tracker(self):
        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        with self.assertRaises(BadRequest):
            self.my_bq.query_to_pandas("Not a valid query")

    def test_list_nested_schema(self):
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
Esempio n. 3
0
#

#
# ### Lets find the addresses who have the most number of bitcoins

# In[ ]:

q = """
SELECT  o.output_pubkey_base58, sum(o.output_satoshis) as output_sum from 
    `bigquery-public-data.bitcoin_blockchain.transactions`JOIN
    UNNEST(outputs) as o 
    where o.output_pubkey_base58 not in (select i.input_pubkey_base58
    from UNNEST(inputs) as i)
    group by o.output_pubkey_base58 order by output_sum desc limit 1000
"""
print(str(round((bq_assistant.estimate_query_size(q)), 2)) + str(" GB"))

results2 = bq.Run_Query(q, max_gb_scanned=70)
results2["output_sum"] = results2["output_sum"].apply(
    lambda x: float(x / 100000000))

# In[ ]:

import matplotlib.pyplot as plt
plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
objects = results2["output_pubkey_base58"][:10]
y_pos = np.arange(len(objects))
performance = results2["output_sum"][:10]
plt.bar(y_pos, performance, align='center', alpha=0.5)
Esempio n. 4
0
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

from bq_helper import BigQueryHelper
bq_assistant = BigQueryHelper("bigquery-public-data", "github_repos")
QUERY = """
        SELECT message
        FROM `bigquery-public-data.github_repos.commits`
        WHERE LENGTH(message) > 6 AND LENGTH(message) <= 20
        LIMIT 2000
        """
bq_assistant.estimate_query_size(QUERY)
bq_assistant.listtables()

# Any results you write to the current directory are saved as output.
Esempio n. 5
0
def GLquery(query="query1",
            country="US",
            start_year="1790",
            end_year="2019",
            min_count="100",
            kind_code="B2",
            cpc_code="A63F",
            assignee="President and Fellows of Harvard College",
            keyword="internet of things",
            budget=1000,
            output_to_csv=False,
            plot=False,
            ask_before_running=False):
    '''

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

FUNCTION PARAMETERS

GLquery takes the following arguments:

- query (string): choose among one of the options in the next section (see below) (default is "query1")

- country (string) = format "LL" (default is "US")

- start_year (string) = format "YYYY" (default is "1790")

- end_year (string) = format "YYYY" (default is "2019")

- min_count (string) = minimum threshold (default is "100")

- kind_code (string) = format "LN", see kind codes here: https://www.cas.org/support/documentation/references/patkind (default is "B2")

- cpc_code (string) = formant "LNNL", see CPC codes here: https://www.uspto.gov/web/patents/classification/cpc/html/cpc.html (default is "A63F")

- assignee (string) = any format (case sensitive) (default is "President and Fellows of Harvard College")

- keyword (string) = any format (case sensitive) (default is "internet of things")

- budget (number) = any number (default is 1.000 GB) (queries above value 'budget' will not run; queries below 'budget' will use only the minimum amount of memory necessary to run the query, not the full value of 'budget')

- output_to_csv (True or False) = output results as .csv file to your current working directory (default is False)

- plot (True of False) = plot results as .pdf file to your current working directory (default is False) (implemented for queries 1, 2, 3, 10)

- ask_before_running (True or False) = given a query size (in GB), asks user input before running the query (default is False)

///////////////////////////////////////////////////////////////////////////////

QUERIES

As of June/2019, GLquery can perform the following queries:

- query1: Number of patent applications by country (takes no inputs)

- query2: Number of patents published between years X and Y by country (necessary arguments: start_year and end_year)

- query3: Number of patents published to country Z between years X and Y (necessary arguments: start_year, end_year and country)

- query4: Which patents country Z published between years X and Y? (necessary arguments: start_year, end_year, country, kind_code)

- query5: Most common patenting technology areas by year (takes no inputs)

- query6: Most common patenting technology areas in country Z between years X and Y (necessary arguments: start_year, end_year and country)

- query7: Inventors with over N patents by country (necessary arguments: min_count)

- query8: Patent landscaping of technology T between years X and Y" (necessary arguments: start_year, end_year, min_count and cpc_code) (this query returns patents, firms and countries associated with technology T)

- query9: Which firms is assignee A citing in their patents? (necessary arguments: assignee)

- query10: Number of patent applications with keyword K in country Z (necessary arguments: country and keyword)

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


    '''

    global my_output_path

    patents = bq_helper.BigQueryHelper(active_project="patents-public-data",
                                       dataset_name="patents")
    bq_assistant = BigQueryHelper("patents-public-data", "patents")

    print("Country selected: {}.".format(country))
    print("Period selected: from {} to {}.".format(start_year, end_year))

    def replace(string, substitutions):
        substrings = sorted(substitutions, key=len, reverse=True)
        regex = re.compile('|'.join(map(re.escape, substrings)))
        return regex.sub(lambda match: substitutions[match.group(0)], string)

    def query_selector(query):
        switcher = {
            "query1":
            """
            -- "Number of publications by country"
            
            SELECT COUNT(*) AS cnt, country_code
            FROM (
                SELECT ANY_VALUE(country_code) AS country_code
                FROM `patents-public-data.patents.publications`
                GROUP BY application_number)
            GROUP BY country_code
            ORDER BY cnt DESC
            """,
            "query2":
            replace(
                """
            -- "Number of patents published between years X and Y by country"
            
            SELECT country_code,
            COUNT(DISTINCT publication_number) AS publications
            FROM `patents-public-data.patents.publications`
            WHERE publication_date >= XXXX0000 
            AND publication_date < YYYY0000
            AND application_kind = 'A'
            GROUP BY country_code
            ORDER BY publications DESC
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year
                }),
            "query3":
            replace(
                """
            -- "Number of patents granted to country Z between years X and Y"
            
            SELECT FLOOR(publication_date/10000) as datum,
            COUNT(DISTINCT publication_number) as publications
            FROM `patents-public-data.patents.publications`
            WHERE country_code = 'ZZZZ'
            AND publication_date >= XXXX0000
            AND publication_date <= YYYY0000
            AND application_kind = 'A'
            GROUP BY datum, application_kind
            ORDER BY application_kind, datum
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "ZZZZ": country
                }),
            "query4":
            replace(
                """
            -- "Which patents country Z published between years X and Y"
            
            SELECT publication_number
            FROM `patents-public-data.patents.publications`
            WHERE country_code = 'ZZZZ'
            AND publication_date >= XXXX0000
            AND publication_date <= YYYY0000
            AND application_kind = 'A'
            AND kind_code = 'BBBB'
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "ZZZZ": country,
                    "BBBB": kind_code
                }),
            "query5":
            """
            -- "Most common patenting technology areas by year"
            
            CREATE TEMPORARY FUNCTION highest_moving_avg(yearcnt ARRAY<STRUCT<filing_year INT64, cnt INT64>>)
            RETURNS STRUCT<filing_year INT64, avg INT64>
            LANGUAGE js AS \"""
            let avg = 0;
            let a = 1.0;
            let highest = {filing_year: -1, avg: -1};
            for (let x of yearcnt) {
                    avg = a * x.cnt + (1 - a) * avg;
                    if (avg > highest.avg) {
                            highest = {filing_year: x.filing_year, avg: avg};}
                    }
                    return highest;
                \""";
                
            WITH patent_cpcs AS (
                SELECT cd.parents,
                CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year
                FROM (
                    SELECT ANY_VALUE(cpc) AS cpc, ANY_VALUE(filing_date) AS filing_date
                    FROM `patents-public-data.patents.publications`
                    WHERE application_number != ""
                    GROUP BY application_number), UNNEST(cpc) AS cpcs
                JOIN `patents-public-data.cpc.definition` cd ON cd.symbol = cpcs.code
                WHERE cpcs.first = TRUE AND filing_date > 0)

            SELECT c.title_full, cpc_group, best_year.*
            FROM (
                SELECT cpc_group, highest_moving_avg(ARRAY_AGG(STRUCT<filing_year INT64, cnt INT64>(filing_year, cnt) ORDER BY filing_year ASC)) AS best_year
                FROM (
                    SELECT cpc_group, filing_year, COUNT(*) AS cnt
                    FROM (
                        SELECT cpc_parent AS cpc_group, filing_year
                        FROM patent_cpcs, UNNEST(parents) AS cpc_parent)
                    GROUP BY cpc_group, filing_year
                    ORDER BY filing_year DESC, cnt DESC)
                GROUP BY cpc_group)
            JOIN `patents-public-data.cpc.definition` c ON cpc_group = c.symbol
            WHERE c.level = 5
            ORDER BY best_year.filing_year ASC;
            """,
            "query6":
            replace(
                """
            -- "Most common patenting technology areas in country Z between years X and Y"
            
            CREATE TEMPORARY FUNCTION highest_moving_avg(yearcnt ARRAY<STRUCT<filing_year INT64, cnt INT64>>)
            RETURNS STRUCT<filing_year INT64, avg INT64>
            LANGUAGE js AS \"""
            let avg = 0;
            let a = 1.0;
            let highest = {filing_year: -1, avg: -1};
            for (let x of yearcnt) {
                    avg = a * x.cnt + (1 - a) * avg;
                    if (avg > highest.avg) {
                            highest = {filing_year: x.filing_year, avg: avg};}
                    }
                    return highest;
                \""";
                
            WITH patent_cpcs AS (
                SELECT cd.parents,
                CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year
                FROM (
                    SELECT ANY_VALUE(cpc) AS cpc, ANY_VALUE(filing_date) AS filing_date
                    FROM `patents-public-data.patents.publications`
                    WHERE application_number != ""
                    AND country_code = 'ZZZZ'
                    AND grant_date >= XXXX0000
                    AND grant_date <= YYYY0000
                    GROUP BY application_number), UNNEST(cpc) AS cpcs
                JOIN `patents-public-data.cpc.definition` cd ON cd.symbol = cpcs.code
                WHERE cpcs.first = TRUE AND filing_date > 0)

            SELECT c.title_full, cpc_group, best_year.*
            FROM (
                SELECT cpc_group, highest_moving_avg(ARRAY_AGG(STRUCT<filing_year INT64, cnt INT64>(filing_year, cnt) ORDER BY filing_year ASC)) AS best_year
                FROM (
                    SELECT cpc_group, filing_year, COUNT(*) AS cnt
                    FROM (
                        SELECT cpc_parent AS cpc_group, filing_year
                        FROM patent_cpcs, UNNEST(parents) AS cpc_parent)
                    GROUP BY cpc_group, filing_year
                    ORDER BY filing_year DESC, cnt DESC)
                GROUP BY cpc_group)
            JOIN `patents-public-data.cpc.definition` c ON cpc_group = c.symbol
            WHERE c.level = 5 
            ORDER BY best_year.filing_year ASC;
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "ZZZZ": country
                }),
            "query7":
            replace(
                """
            -- "Inventors with over N patents by country"
            
            WITH temp1 AS (
            SELECT DISTINCT PUB.country_code,
            PUB.application_number AS patent_number, inventor_name
            FROM `patents-public-data.patents.publications` PUB
            CROSS JOIN
            UNNEST(PUB.inventor) AS inventor_name
            WHERE PUB.grant_date >= 1790000
            AND PUB.country_code IS NOT NULL
            AND PUB.application_number IS NOT NULL
            AND PUB.inventor IS NOT NULL)
            SELECT * FROM (
            SELECT temp1.country_code AS country, temp1.inventor_name AS inventor,
            COUNT(temp1.patent_number) AS count_of_patents
            FROM temp1
            GROUP BY temp1.country_code, temp1.inventor_name)
            WHERE count_of_patents >= NNNN
            """, {"NNNN": min_count}),
            "query8":
            replace(
                """
            -- "Patent landscaping of technology T between years X and Y" 
            
            SELECT SUM(year_cnt) AS total_count, assignee_name,
            ARRAY_AGG(STRUCT<cnt INT64, filing_year INT64, countries STRING>(year_cnt, filing_year, countries) ORDER BY year_cnt DESC LIMIT 1)[SAFE_ORDINAL(1)] AS Number_of_patents_under_this_CPC_code_Peak_year_Top_countries
            FROM (
                SELECT SUM(year_country_cnt) AS year_cnt, assignee_name, filing_year, STRING_AGG(country_code ORDER BY year_country_cnt DESC LIMIT 5) AS countries
                FROM (
                    SELECT COUNT(*) AS year_country_cnt, a.name AS assignee_name, CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year, apps.country_code
                    FROM (
                        SELECT ANY_VALUE(assignee_harmonized) AS assignee_harmonized, ANY_VALUE(filing_date) AS filing_date, ANY_VALUE(country_code) AS country_code
                        FROM `patents-public-data.patents.publications` AS pubs
                        WHERE (SELECT MAX(TRUE) FROM UNNEST(pubs.cpc) AS c WHERE REGEXP_CONTAINS(c.code, "TTTT"))
                        AND publication_date >= XXXX0000
                        AND publication_date <= YYYY0000
                        GROUP BY application_number) AS apps, UNNEST(assignee_harmonized) AS a
                    WHERE filing_date > 0
                    GROUP BY a.name, filing_year, country_code)
                GROUP BY assignee_name, filing_year)
            GROUP BY assignee_name
            ORDER BY total_count DESC
            LIMIT NNNN
            """, {
                    "XXXX": start_year,
                    "YYYY": end_year,
                    "NNNN": min_count,
                    "TTTT": cpc_code
                }),
            "query9":
            replace(
                """
            -- "Which firms is assignee A citing in their patents?"
            
            SELECT citing_assignee,
            COUNT(*) AS num_cites, citing_cpc_subclass, cpcdef.title_full AS citing_cpc_title
            FROM (
                SELECT pubs.publication_number AS citing_publication_number, cite.publication_number AS cited_publication_number, citing_assignee_s.name AS citing_assignee, SUBSTR(cpcs.code, 0, 4) AS citing_cpc_subclass
                FROM `patents-public-data.patents.publications` AS pubs, UNNEST(citation) AS cite, UNNEST(assignee_harmonized) AS citing_assignee_s, UNNEST(cpc) AS cpcs
                WHERE cpcs.first = TRUE) AS pubs
                JOIN (
                    SELECT publication_number AS cited_publication_number, cited_assignee_s.name AS cited_assignee
                    FROM `patents-public-data.patents.publications`, UNNEST(assignee_harmonized) AS cited_assignee_s) AS refs ON pubs.cited_publication_number = refs.cited_publication_number
                JOIN `patents-public-data.cpc.definition` AS cpcdef ON cpcdef.symbol = citing_cpc_subclass
                WHERE cited_assignee = "AAAA" AND citing_assignee != "AAAA"
                GROUP BY cited_assignee, citing_assignee, citing_cpc_subclass, cpcdef.title_full
                ORDER BY num_cites DESC
                """, {"AAAA": assignee}),
            "query10":
            replace(
                """
            --  Number of patent applications with keyword K in country Z             
            
            WITH Patent_Matches AS (
                SELECT PARSE_DATE('%Y%m%d', SAFE_CAST(ANY_VALUE(patentsdb.filing_date) AS STRING)) AS Patent_Filing_Date, patentsdb.application_number AS Patent_Application_Number,
                ANY_VALUE(abstract_info.text) AS Patent_Title,
                ANY_VALUE(abstract_info.language) AS Patent_Title_Language
                FROM `patents-public-data.patents.publications` AS patentsdb,
                UNNEST(abstract_localized) AS abstract_info
                WHERE
                LOWER(abstract_info.text) LIKE '%KKKK%'
                AND patentsdb.country_code = 'ZZZZ'
                GROUP BY Patent_Application_Number),
            Date_Series_Table AS (
                SELECT day, 0 AS Number_of_Patents
                FROM UNNEST (GENERATE_DATE_ARRAY(
                    (SELECT MIN(Patent_Filing_Date) FROM Patent_Matches),
                    (SELECT MAX(Patent_Filing_Date) FROM Patent_Matches))) AS day)
            SELECT SAFE_CAST(FORMAT_DATE('%Y-%m',Date_Series_Table.day) AS STRING) AS Patent_Date_YearMonth, COUNT(Patent_Matches.Patent_Application_Number) AS Number_of_Patent_Applications
            FROM Patent_Matches
            RIGHT JOIN Date_Series_Table
            ON Patent_Matches.Patent_Filing_Date = Date_Series_Table.day
            GROUP BY Patent_Date_YearMonth
            ORDER BY Patent_Date_YearMonth
            """, {
                    "ZZZZ": country,
                    "KKKK": keyword
                }),
        }
        return switcher.get(query, "Invalid query")

    print("Estimated query size: {} GB.".format(
        bq_assistant.estimate_query_size(query_selector(query))))

    def kenvelo(question, answer="no"):

        range_of_choices = {
            "yes": True,
            "y": True,
            "": True,
            "no": False,
            "n": False
        }

        if answer is None:
            prompt = "[Y/N]"

        elif answer == "yes":
            prompt = "[Y/N]"

        elif answer == "no":
            prompt = "[Y/N]"

        else:
            raise ValueError("Answer '%s' is invalid.")

        while True:
            sys.stdout.write(question + prompt)
            choice = input().lower()

            if answer is not None and choice == '':
                return range_of_choices[answer]

            elif choice in range_of_choices:
                return range_of_choices[choice]

            else:
                sys.stdout.write(
                    "Please answer any variation of 'yes' or 'no'. ")

    while ask_before_running is False or kenvelo("Run query? "):

        def plot_query1():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='country_code',
                        y='cnt',
                        data=patents.query_to_pandas_safe(
                            query_selector("query1"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query1"))))
            plt.title("Number of publications by country",
                      loc='left',
                      fontsize=24,
                      style='oblique')
            plt.ylabel('# of publications (log)', fontsize=14)
            plt.xlabel('Country', fontsize=14)
            plt.yscale('log')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query1" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plot_query2():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='country_code',
                        y='publications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query2"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query2"))))
            plt.title(
                "Number of patents published between years {} and {} by country"
                .format(start_year, end_year),
                loc='left',
                fontsize=24,
                style='oblique')
            plt.ylabel('# of publications')
            plt.xlabel('Country')
            plt.yscale('log')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query2" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plot_query3():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='datum',
                        y='publications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query3"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query3"))))
            plt.title(
                "Number of patents granted to country {} between years {} and {}"
                .format(country, start_year, end_year),
                loc='left',
                fontsize=24,
                style='oblique')
            plt.ylabel('# of patents', fontsize=14)
            plt.xlabel('')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query3" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plot_query10():
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='Patent_Date_YearMonth',
                        y='Number_of_Patent_Applications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query10"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query10"))))
            plt.title(
                "Number of patent applications of technology {} in country {}".
                format(keyword, country),
                loc='left',
                fontsize=24,
                style='oblique')
            plt.ylabel('# of applications', fontsize=14)
            plt.xlabel('Date', fontsize=14, rotation=45)
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query10" + '.pdf',
                        orientation='landscape',
                        bbox_inches='tight')
            plt.show()

        def plotter(query):
            if query == "query1":
                plot_query1()
                query = "query2"

            elif query == "query2":
                plot_query2()
                query = "query3"

            elif query == "query3":
                plot_query3()
                query = "query10"

            elif query == "query10":
                plot_query10()

            else:
                print('\033[1m' +
                      "Sorry, this query doesn't output graphs (yet)." +
                      '\033[0m')

        temp_path = Path(my_output_path)

        if output_to_csv and plot:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(
                        query_selector(query)))).to_csv(
                            Path(temp_path, query + '.csv'),
                            index=False,
                            encoding="utf-8"), plotter(query)

        if output_to_csv:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(
                        query_selector(query)))).to_csv(Path(
                            temp_path, query + '.csv'),
                                                        index=False,
                                                        encoding="utf-8")

        if plot:
            return plotter(query)

        else:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(query_selector(query))))

        if kenvelo("You will use",
                   bq_assistant.estimate_query_size(query_selector(query)),
                   "GB in this query. Continue? ") is False:
            break
Esempio n. 6
0
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"

import pandas as pd
from bq_helper import BigQueryHelper

bq_assistant = BigQueryHelper('bigquery-public-data', 'new_york')

QUERY = """
        SELECT * FROM `bigquery-public-data.new_york.citibike_trips`
        """

print(bq_assistant.estimate_query_size(QUERY))  #4.5 GB

QUERY = """
        SELECT * FROM `bigquery-public-data.new_york.citibike_trips`
        LIMIT 100
        """

df = bq_assistant.query_to_pandas(QUERY)

print(df.head(10))
Esempio n. 7
0
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json"

from bq_helper import BigQueryHelper

client = BigQueryHelper('bigquery-public-data', 'stackoverflow')

QUERY = """
    SELECT * FROM `bigquery-public-data.stackoverflow.comments` 
    
    """

print(client.estimate_query_size(QUERY))  #13.3 GB

bq_assistant = BigQueryHelper('bigquery-public-data', 'new_york')

QUERY = """
        SELECT * FROM `bigquery-public-data.new_york.citibike_trips`
        """

print(bq_assistant.estimate_query_size(QUERY))  #4.6 GB
Esempio n. 8
0
  block_timestamp,
  from_address,
  to_address,
  value,
  receipt_contract_address,
  receipt_status,
  block_number
FROM
  `bigquery-public-data.ethereum_blockchain.transactions` AS transactions
WHERE TRUE
  AND block_timestamp < '2017-01-01 00:00:00'
"""
# This establishes an authenticated session and prepares a reference to the dataset that lives in BigQuery.
bq_assistant = BigQueryHelper("bigquery-public-data", "ethereum_blockchain")
print("estimated query data size: " +
      str(bq_assistant.estimate_query_size(query)))

df = bq_assistant.query_to_pandas_safe(query, max_gb_scanned=60)

df.to_csv(
    "/home/yueduan/yueduan/postdoc_study/ether_transactions_before2017.csv")

# # Import dataset from big query
# from google.cloud import bigquery
# from bq_helper import BigQueryHelper
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# #%matplotlib inline
# plt.style.use('ggplot')
# sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})