Ejemplo n.º 1
0
def main():
    path = '../data/states'
    bq_assistant = BigQueryHelper('bigquery-public-data', 'nhtsa_traffic_fatalities')

    """
    Per State Overview
        - Fatal accidents per state
        - Accident Fatalities per state
        - Fatalities / Accident per state
    """

    QUERY = """
        SELECT
            state_number,
            state_name,
            COUNT(consecutive_number) AS accidents,
            SUM(number_of_fatalities) AS fatalities,
            SUM(number_of_fatalities) / COUNT(consecutive_number) AS fatalities_per_accident
        FROM
            `bigquery-public-data.nhtsa_traffic_fatalities.accident_{0}`
        GROUP BY state_number, state_name
        ORDER BY fatalities_per_accident DESC
    """

    accident_fatalities_2015 = bq_assistant.query_to_pandas(QUERY.format(2015))
    accident_fatalities_2015['year'] = 2015
    accident_fatalities_2016 = bq_assistant.query_to_pandas(QUERY.format(2016))
    accident_fatalities_2016['year'] = 2016

    accident_fatalities_2015.to_csv(f'{path}/accident_fatalities_2015.csv', index=False)
    accident_fatalities_2016.to_csv(f'{path}/accident_fatalities_2016.csv', index=False)
Ejemplo n.º 2
0
class TestBQHelper(unittest.TestCase):
    def setUp(self):
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Query randomized so it won't hit the cache across multiple test runs
        self.randomizable_query = """
            SELECT value FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE value = {0}"""

    def test_list_tables(self):
        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])

    def test_list_schema(self):
        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')),
                         11)

    def test_estimate_query_size(self):
        self.assertIsInstance(self.my_bq.estimate_query_size(self.query),
                              float)

    def test_query_to_pandas(self):
        self.assertIsInstance(self.my_bq.query_to_pandas(self.query),
                              DataFrame)

    def test_query_safe_passes(self):
        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query),
                              DataFrame)

    def test_query_safe_fails(self):
        # Different query must be used for this test to ensure we don't hit the
        # cache and end up passing by testing a query that would use zero bytes.
        fail_query = self.randomizable_query.format(random())
        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))

    def test_head(self):
        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)

    def test_useage_tracker(self):
        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        with self.assertRaises(BadRequest):
            self.my_bq.query_to_pandas("Not a valid query")

    def test_list_nested_schema(self):
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
Ejemplo n.º 3
0
def main():
    path = '../data/persons'
    bq_assistant = BigQueryHelper('bigquery-public-data',
                                  'nhtsa_traffic_fatalities')

    QUERY = """
        SELECT
            p.state_number,
            p.consecutive_number,
            p.vehicle_number,
            p.person_number,
            person_type_name AS person_type,
            age,
            sex,
            injury_severity_name AS injury_severity,
            land_use_name AS land_use,
            functional_system_name AS trafficway_type,
            manner_of_collision_name AS manner_of_collision,
            body_type_name AS body_type,
            rollover,
            seating_position_name AS seating_position,
            ejection_name AS ejection,
            restraint_system_helmet_use_name AS safety_equipment_use,
            air_bag_deployed_name AS air_bag_deployed,
            non_motorist_safety_equipment_use
        FROM
            `bigquery-public-data.nhtsa_traffic_fatalities.person_{0}` AS p
        LEFT JOIN
            `bigquery-public-data.nhtsa_traffic_fatalities.safetyeq_{0}` AS s
            ON p.consecutive_number=s.consecutive_number AND p.vehicle_number=s.vehicle_number AND p.person_number=s.person_number
        ORDER BY
            consecutive_number, vehicle_number, person_number
    """

    data_2015 = bq_assistant.query_to_pandas(QUERY.format(2015))
    data_2016 = bq_assistant.query_to_pandas(QUERY.format(2016))

    data_2015.to_csv(f'{path}/person_data_2015.csv', index=False)
    data_2016.to_csv(f'{path}/person_data_2016.csv', index=False)
Ejemplo n.º 4
0
import pandas as pd
from bq_helper import BigQueryHelper
# import plotly.graph_objs as go
# from plotly.offline import plot

bq_assistant = BigQueryHelper('bigquery-public-data', 'san_francisco')

QUERY = """

        SELECT call_number, unit_id, call_type, battalion FROM `bigquery-public-data.san_francisco.sffd_service_calls`

        LIMIT 10000

        """

df = bq_assistant.query_to_pandas(QUERY)

print(df.head(3))
# trace1 = go.Scatter(
#
#                     )
#
# trace2 = go.Pie(
#
#                     )
#
# trace3 = go.Bar(
#
# )
# data = [trace1]
#
Ejemplo n.º 5
0
import plotly.graph_objs as go
from plotly.offline import plot

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"

bq_assistant = BigQueryHelper('bigquery-public-data', 'epa_historical_air_quality.no2_daily_summary')


QUERY = """
        SELECT sample_duration, date_local, units_of_measure, county_name
        FROM bigquery-public-data.epa_historical_air_quality.no2_daily_summary
        LIMIT 100
        """


df: pandas.core.frame.DataFrame = bq_assistant.query_to_pandas(QUERY)

df_group_by_county_name = df.groupby(['county_name'])
df.num_of_samples = df_group_by_county_name["sample_duration"].count()

trace1 = go.Scatter(
    x = df.num_of_samples.index,
    y = df.num_of_samples.values,
    mode = "lines+markers",
    name = "num of samples"
)



trace2 = go.Pie(
    labels=df.num_of_samples.index,
Ejemplo n.º 6
0
QUERY2016 = """
    SELECT
        pollutant.county_name AS County, AVG(pollutant.aqi) AS AvgAQI_pollutant
    FROM
      `bigquery-public-data.epa_historical_air_quality.pollutant_daily_summary` as pollutant
    WHERE
      pollutant.poc = 1
      AND EXTRACT(YEAR FROM pollutant.date_local) = 2016
    GROUP BY 
      pollutant.county_name
"""

df_2016 = None
for elem_g in pollutants:
    query = QUERY2016.replace("pollutant", elem_g)
    temp = bq_assistant.query_to_pandas(query).set_index('County')
    df_2016 = pd.concat([df_2016, temp], axis=1, join='outer')
df_2016 = df_2016.apply(lambda x: x.fillna(x.mean()), axis=0)

df_2016.sample(10, random_state=42)

# Okay so now we have the 2016 measures for every county in the US, what can we do about it to retrieve some useful insights ? Well, **clustering** is a natural answer in the context !
# Let's see what t-SNE will give us here :

# In[ ]:

from sklearn.manifold import TSNE
X_tsne = TSNE(n_components=2, n_iter=2000, perplexity=35,
              random_state=5).fit_transform(df_2016)
df_tsne = pd.DataFrame(X_tsne)
df_tsne['County'] = list(df_2016.index)
Ejemplo n.º 7
0
get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns
plt.style.use('fivethirtyeight')

# #### Lets make a query to get a total sum of bitcoins  transacted for every day.

# In[ ]:

q = """
SELECT  TIMESTAMP_MILLIS((timestamp - MOD(timestamp,
          86400000))) as Timestamp, sum(o.output_satoshis) as output_price from 
    `bigquery-public-data.bitcoin_blockchain.transactions`JOIN
    UNNEST(outputs) as o group by timestamp
"""
print(str(round((bq_assistant.estimate_query_size(q)), 2)) + str(" GB"))
results3 = bq_assistant.query_to_pandas(q)
results3["output_price"] = results3["output_price"].apply(
    lambda x: float(x / 100000000))
results3 = results3.sort_values(by="Timestamp")
results3.head()

# ### In this query we will find the total count of transactions per day

# In[ ]:

q = """
SELECT  TIMESTAMP_MILLIS((timestamp - MOD(timestamp,
          86400000))) as Timestamp , count(Timestamp) as output_count from 
    `bigquery-public-data.bitcoin_blockchain.transactions` group by timestamp
"""
print(str(round((bq_assistant.estimate_query_size(q)), 2)) + str(" GB"))
Ejemplo n.º 8
0
import plotly.graph_objects as go
from plotly.offline import plot

from bq_helper import BigQueryHelper
client = BigQueryHelper('bigquery-public-data','stackoverflow')


query = """ select 
                    *
            from bigquery-public-data.stackoverflow.stackoverflow_posts 
            
         
            limit 1000
        """

df = client.query_to_pandas(query)

print(df.head(10))

df_filter = df[['id', 'title', 'comment_count', 'answer_count']][ (df.comment_count>=2) & (df.answer_count>=1) ]

print(df_filter.comment_count.max())

print(df_filter[['comment_count']].min())

print(df_filter.head(10))
print(df_filter.size)
print(df_filter.count)


# trace = go.Scatter(x = df.creation_date, y= df.count_of_posts)
Ejemplo n.º 9
0
import pandas as pd
import numpy as np
from google.cloud import bigquery
import os
from bq_helper import BigQueryHelper

os.environ[
    "GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\Users\ntang\Downloads\My Project-66567def43dc.json"
bq_assistant = BigQueryHelper("bigquery-public-data",
                              "epa_historical_air_quality")
pollutants = ['o3', 'no2']

QUERY = """
    SELECT
        pollutant.latitude AS Lat, pollutant.longitude AS Lon, pollutant.sample_measurement AS pollutant, pollutant.date_local as Date, pollutant.time_local as Time
    FROM
      `bigquery-public-data.epa_historical_air_quality.pollutant_hourly_summary` as pollutant
    WHERE
      pollutant.poc = 1
      AND EXTRACT(YEAR FROM pollutant.date_local) BETWEEN 2013 AND 2017
"""

df = None
for elem_g in pollutants:
    query = QUERY.replace("pollutant", elem_g)
    temp = bq_assistant.query_to_pandas(query)
    df = pd.concat([df, temp], axis=1, join='outer')
Ejemplo n.º 10
0
        }
    }
    es.indices.create(index='chicago-taxis', body=body)

    # query for chicago taxi records using BigQuery
    chicago_taxis = BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="chicago_taxi_trips")
    i = 0
    succeeded = 0
    failed = 0
    while True:
        # we need to iterate through the table because a single query is too large
        # we'll query 5000 records at a time
        query = "SELECT * from `bigquery-public-data.chicago_taxi_trips.taxi_trips` LIMIT 5000 OFFSET {}".format(
            5000 * i)
        df = chicago_taxis.query_to_pandas(query)

        if df.shape[0] == 0:
            break

        # drop any records without a pickup or dropoff location
        df.dropna(subset=[
            'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
            'dropoff_longitude'
        ],
                  how='any',
                  inplace=True)

        # convert the data frame to a list of dicts
        L = df.fillna('').to_dict(orient='records')
Ejemplo n.º 11
0
def main():
    path = '../data/accidents'
    bq_assistant = BigQueryHelper('bigquery-public-data',
                                  'nhtsa_traffic_fatalities')

    QUERY = """
        SELECT
            a.state_number,
            a.consecutive_number,
            veh.vehicle_number,
            state_name,
            a.hour_of_crash AS hour_of_day,
            CASE
                WHEN day_of_week = 1 THEN 'Sunday'
                WHEN day_of_week = 2 THEN 'Monday'
                WHEN day_of_week = 3 THEN 'Tuesday'
                WHEN day_of_week = 4 THEN 'Wednesday'
                WHEN day_of_week = 5 THEN 'Thursday'
                WHEN day_of_week = 6 THEN 'Friday'
                WHEN day_of_week = 7 THEN 'Saturday'
            END day_of_week,
            CASE
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 1 THEN 'January'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 2 THEN 'February'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 3 THEN 'March'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 4 THEN 'April'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 5 THEN 'May'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 6 THEN 'June'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 7 THEN 'July'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 8 THEN 'August'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 9 THEN 'September'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 10 THEN 'October'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 11 THEN 'November'
                WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 12 THEN 'December'
            END month,
            EXTRACT(YEAR FROM a.timestamp_of_crash) AS year,
            land_use_name AS land_use,
            national_highway_system,
            route_signing_name AS roadway_type,
            type_of_intersection AS intersection,
            light_condition_name AS light_condition,
            atmospheric_conditions_1_name AS atmospheric_conditions,
            latitude,
            longitude,
            manner_of_collision_name AS manner_of_collision,
            number_of_vehicle_forms_submitted_all AS num_vehicles,
            number_of_persons_not_in_motor_vehicles_in_transport_mvit AS num_nonmotorists,
            number_of_persons_in_motor_vehicles_in_transport_mvit AS num_motorists,
            number_of_fatalities AS num_fatalities,
            number_of_drunk_drivers AS num_drunk_drivers,
            body_type_name AS body_type,
            vehicle_model_year AS vehicle_year,
            rollover,
            contributing_circumstances_motor_vehicle_name AS vehicle_conditions,
            previous_dwi_convictions,
            previous_speeding_convictions,
            speeding_related,
            speed_limit,
            drivers_vision_obscured_by_name AS driver_vision_obscured,
        FROM
            `bigquery-public-data.nhtsa_traffic_fatalities.accident_{0}` AS a
        JOIN
            `bigquery-public-data.nhtsa_traffic_fatalities.vehicle_{0}` AS veh
            ON a.consecutive_number=veh.consecutive_number
        JOIN
            `bigquery-public-data.nhtsa_traffic_fatalities.factor_{0}` AS f
            ON a.consecutive_number=f.consecutive_number AND veh.vehicle_number=f.vehicle_number
        JOIN
            `bigquery-public-data.nhtsa_traffic_fatalities.vision_{0}` AS v
            ON a.consecutive_number=v.consecutive_number AND veh.vehicle_number=v.vehicle_number
        ORDER BY consecutive_number, vehicle_number
    """

    data_2015 = bq_assistant.query_to_pandas(QUERY.format(2015))
    data_2016 = bq_assistant.query_to_pandas(QUERY.format(2016))

    data_2015.to_csv(f'{path}/accident_driver_data_2015.csv', index=False)
    data_2016.to_csv(f'{path}/accident_driver_data_2016.csv', index=False)
Ejemplo n.º 12
0
                    from bigquery-public-data.covid19_italy.data_by_region 
            
                    group by region_name
                """
query_for_scatter = """ select 
                                date, AVG(total_hospitalized_patients) as hospitalized
                                
                        from bigquery-public-data.covid19_italy.data_by_region 
                        
                        group by 
                                date
                        order by
                                date
                    """

df = client.query_to_pandas(query)

# 3.1 BAR >><<
df_for_bar = client.query_to_pandas(query_for_bar)

bar = go.Bar(x=df_for_bar.date, y=df_for_bar.hospitalized_patients_symptoms)

bar_layout = go.Layout(title='Toscana',
                       xaxis=dict(title='Date'),
                       yaxis=dict(title='Amount'))

fig_bar = go.Figure(data=[bar], layout=bar_layout)

plot(fig_bar, filename='bar_file')

# 3.2 Pie >>><<<