def get_query_job_config():
    return bigquery.QueryJobConfig()
def promotion_prediction_(project_id, dataset_id, area, mechanic): # Load client client = bigquery.Client() job_config = bigquery.QueryJobConfig() logger.info("Filtering on promotion mechanic {a}...".format(a=mechanic)) promotion_pred_sql = """ WITH temp_aggr_promo AS ( SELECT sku_root_id, description, area, section, category, subcategory, segment, brand_name, eroskibrand_flag, eroskibrand_label, wealthy_range_flag, flag_healthy, innovation_flag, tourism_flag, local_flag, regional_flag, no_hipermercados_stores, no_supermercados_stores, no_gasolineras_stores, no_comercio_electronico_stores, no_otros_negocio_stores, no_plataformas_stores, no_other_stores, no_impacted_stores, no_impacted_regions, AVG(avg_store_size) AS avg_store_size, promo_id, promo_year, promo_mechanic, promo_mechanic_description as Promo_mechanic_en, name, type, start_date, end_date, customer_profile_type, marketing_type, duration AS duration_days, MAX(includes_weekend) AS includes_weekend, campaign_start_day, campaign_start_month, campaign_start_quarter, campaign_start_week, leaflet_cover, leaflet_priv_space, in_leaflet_flag, in_gondola_flag, in_both_leaflet_gondola_flag, discount_depth, discount_depth_rank, CASE WHEN change_flag in (1,2) THEN 'promotion' ELSE 'post_promotion' END AS period, SUM(tt_discount) AS p_discount, SUM(sale_amt_bl) AS p_sale_bl, SUM(sale_qty_bl) AS p_qty_bl, SUM(margin_amt_bl) AS p_margin_bl, SUM(tt_sale_amt) AS p_sale_amt, SUM(tt_sale_qty) AS p_sale_qty, SUM(tt_margin_amt) AS p_margin_amt, SUM(inc_sale_amt) AS p_cal_inc_sale_amt, SUM(inc_sale_qty) AS p_cal_inc_sale_qty, SUM(inc_margin_amt) AS p_cal_inc_margin_amt, SAFE_DIVIDE(SUM(inc_sale_amt),no_impacted_stores) AS p_cal_inc_sale_amt_per_store, SAFE_DIVIDE(SUM(inc_sale_qty),no_impacted_stores) AS p_cal_inc_sale_qty_per_store, SAFE_DIVIDE(SUM(inc_margin_amt),no_impacted_stores) AS p_cal_inc_margin_amt_per_store, SAFE_DIVIDE(SUM(inc_sale_amt),SUM(sale_amt_bl)) AS p_cal_perc_inc_sale_amt, SAFE_DIVIDE(SUM(inc_sale_qty),SUM(sale_qty_bl)) AS p_cal_perc_inc_sale_qty, SAFE_DIVIDE(SUM(inc_margin_amt),SUM(margin_amt_bl)) AS p_cal_perc_inc_margin, SUM(avg_bline_sale) AS p_avg_sale_bl, SUM(avg_bline_qty) AS p_avg_qty_bl, SUM(avg_bline_margin) AS p_avg_margin_bl, SUM(avg_bl_inc_sale) AS p_cal_inc_avg_sale, SUM(avg_bl_inc_qty) AS p_cal_inc_avg_qty, SUM(avg_bl_inc_margin) AS p_cal_avg_margin, SAFE_DIVIDE(SUM(avg_bl_inc_sale),no_impacted_stores) AS p_cal_inc_avg_sale_per_store, SAFE_DIVIDE(SUM(avg_bl_inc_qty),no_impacted_stores) AS p_cal_inc_avg_qty_per_store, SAFE_DIVIDE(SUM(avg_bl_inc_margin),no_impacted_stores) AS p_cal_avg_margin_per_store, SAFE_DIVIDE(SUM(avg_bl_inc_sale),SUM(avg_bline_sale)) AS p_cal_perc_inc_avg_sale_amt, SAFE_DIVIDE(SUM(avg_bl_inc_qty),SUM(avg_bline_qty)) AS p_cal_perc_inc_avg_sale_qty, SAFE_DIVIDE(SUM(avg_bl_inc_margin),SUM(avg_bline_margin)) AS p_cal_perc_inc_avg_margin FROM `gum-eroski-dev.baseline.baseline_promo` WHERE promo_mechanic IN {m} AND area = "{a}" GROUP BY sku_root_id, description, area, section, category, subcategory, segment, brand_name, eroskibrand_flag, eroskibrand_label, wealthy_range_flag, flag_healthy, innovation_flag, tourism_flag, local_flag, regional_flag, no_hipermercados_stores, no_supermercados_stores, no_gasolineras_stores, no_comercio_electronico_stores, no_otros_negocio_stores, no_plataformas_stores, no_other_stores, no_impacted_stores, no_impacted_regions, promo_id, promo_year, promo_mechanic, promo_mechanic_description, name, type, start_date, end_date, customer_profile_type, marketing_type, duration, 
campaign_start_day, campaign_start_month, campaign_start_quarter, campaign_start_week, leaflet_cover, leaflet_priv_space, in_leaflet_flag, in_gondola_flag, in_both_leaflet_gondola_flag, discount_depth, discount_depth_rank, period ), temp_aggr_promo_f AS ( SELECT * EXCEPT (eroskibrand_flag, eroskibrand_label, wealthy_range_flag), CASE WHEN eroskibrand_label IS NOT NULL THEN eroskibrand_label WHEN wealthy_range_flag = 'N' THEN 'Normal' WHEN wealthy_range_flag = 'S' THEN 'Premium' ELSE NULL END AS brand_price_label FROM temp_aggr_promo ) SELECT * FROM temp_aggr_promo_f WHERE discount_depth IS NOT NULL AND promo_mechanic IS NOT NULL AND sku_root_id IS NOT NULL AND segment IS NOT NULL AND period IN ('promotion') ORDER BY sku_root_id, promo_id, promo_year, period """.format(m="(\'" + "\',\'".join(str(x) for x in mechanic) + "\')", a=area) # Create a dictionary to loop over all destination tables and scripts tables = {'prediction_train_input': promotion_pred_sql} job_config.write_disposition = "WRITE_TRUNCATE" for key in tables: # Set the destination table table_ref = client.dataset(dataset_id).table(key) job_config.destination = table_ref # Start the query, passing in the extra configuration. query_job = client.query( tables[key], # Location must match that of the dataset(s) referenced in the query # and of the destination table. location='europe-west3', job_config=job_config) # API request - starts the query query_job.result() # Waits for the query to finish logger.info("Completed writing {a} table...".format(a=key))
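# Usage sketch for the function above: `mechanic` must be an iterable of promotion
# mechanic codes, because it is joined into a quoted SQL IN (...) list, and `area` is
# interpolated into the area filter. The dataset name, area and mechanic codes below
# are hypothetical placeholders, not values from the original pipeline.
promotion_prediction_(
    project_id="gum-eroski-dev",     # project referenced by the query above
    dataset_id="promo_prediction",   # hypothetical destination dataset
    area="ALIMENTACION",             # hypothetical area filter value
    mechanic=["NP", "DTO"],          # hypothetical promo mechanic codes
)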
sql += " FROM cte" sql += " GROUP BY " sql += " pfafid_30spfaf06" # In[7]: #sql += " stddev_riverdischarge_m_30spfaf06 / nullif(stddev_riverdischarge_m_30spfaf06,0) AS cv_riverdischarge_m_30spfaf06" # In[8]: sql # In[9]: job_config = bigquery.QueryJobConfig() table_ref = client.dataset(BQ_OUTPUT_DATASET_NAME).table(BQ_OUTPUT_TABLE_NAME) job_config.destination = table_ref if TESTING: job_config.dry_run = True job_config.use_query_cache = False # In[10]: query_job = client.query(query=sql, location="US", job_config=job_config) # In[11]: query_job.result(timeout=120)
def fetch_results( spark, start_date, end_date, channel=None, min_firefox_version="53", project_id="moz-fx-data-shared-prod", dataset_id="analysis", table_id="graphics_telemetry_dashboard_tmp", ): channel_filter = "" if channel is not None: channel_filter = "normalized_channel = '{}' AND".format(channel) query = """ -- This function uses mozfun.hist.extract to tolerate compact string encodings -- and then turns the parsed struct back into a JSON string to maintain compatibility -- with the existing logic that expects JSON blobs. -- See https://bugzilla.mozilla.org/show_bug.cgi?id=1657724 CREATE TEMP FUNCTION hist_to_json(h STRING) AS ( IF(h IS NULL, NULL, FORMAT('{"bucket_count":%d,"histogram_type":%d,"sum":%d,"range":[%d,%d],"values":{%s}}', mozfun.hist.extract(h).bucket_count, mozfun.hist.extract(h).histogram_type, mozfun.hist.extract(h).sum, mozfun.hist.extract(h).range[SAFE_OFFSET(0)], mozfun.hist.extract(h).range[SAFE_OFFSET(1)], ARRAY_TO_STRING(ARRAY( SELECT FORMAT('"%d":%d', key, value) FROM UNNEST(mozfun.hist.extract(h).`values`)), ','))) ); -- Extra wrapper function for dealing with keyed histograms. CREATE TEMP FUNCTION keyed_hist_to_json(h ANY TYPE) AS ( ARRAY(SELECT AS STRUCT key, hist_to_json(value) AS value FROM UNNEST(h)) ); WITH sample AS (select client_id, creation_date, additional_properties, environment.build.version as environment__build__version, environment.build.build_id as environment__build__build_id, environment.system.memory_mb as environment__system__memory_mb, environment.system.is_wow64 as environment__system__is_wow64, environment.system.cpu as environment__system__cpu, environment.system.os.name as environment__system__os__name, environment.system.os.version as environment__system__os__version, environment.system.os.service_pack_major as environment__system__os__service_pack_major, environment.system.gfx.adapters as environment__system__gfx__adapters, payload.info.revision as payload__info__revision, environment.system.gfx as environment__system__gfx, environment.system.gfx.monitors as environment__system__gfx__monitors, environment.build.architecture as environment__build__architecture, environment.system.gfx.features as environment__system__gfx__features, hist_to_json(payload.histograms.DEVICE_RESET_REASON) as payload__histograms__DEVICE_RESET_REASON, hist_to_json(payload.histograms.GRAPHICS_SANITY_TEST) as payload__histograms__GRAPHICS_SANITY_TEST, hist_to_json(payload.histograms.GRAPHICS_SANITY_TEST_REASON) as payload__histograms__GRAPHICS_SANITY_TEST_REASON, hist_to_json(payload.histograms.GRAPHICS_DRIVER_STARTUP_TEST) as payload__histograms__GRAPHICS_DRIVER_STARTUP_TEST, hist_to_json(payload.histograms.CANVAS_WEBGL_SUCCESS) as payload__histograms__CANVAS_WEBGL_SUCCESS, hist_to_json(payload.histograms.CANVAS_WEBGL2_SUCCESS) as payload__histograms__CANVAS_WEBGL2_SUCCESS, hist_to_json(payload.histograms.PLUGIN_DRAWING_MODEL) as payload__histograms__PLUGIN_DRAWING_MODEL, hist_to_json(payload.histograms.MEDIA_DECODER_BACKEND_USED) as payload__histograms__MEDIA_DECODER_BACKEND_USED, hist_to_json(payload.processes.content.histograms.DEVICE_RESET_REASON) as payload__processes__content__histograms__DEVICE_RESET_REASON, hist_to_json(payload.processes.content.histograms.GRAPHICS_SANITY_TEST) as payload__processes__content__histograms__GRAPHICS_SANITY_TEST, hist_to_json(payload.processes.content.histograms.GRAPHICS_SANITY_TEST_REASON) as payload__processes__content__histograms__GRAPHICS_SANITY_TEST_REASON, 
hist_to_json(payload.processes.content.histograms.GRAPHICS_DRIVER_STARTUP_TEST) as payload__processes__content__histograms__GRAPHICS_DRIVER_STARTUP_TEST, hist_to_json(payload.processes.content.histograms.CANVAS_WEBGL_SUCCESS) as payload__processes__content__histograms__CANVAS_WEBGL_SUCCESS, hist_to_json(payload.processes.content.histograms.CANVAS_WEBGL2_SUCCESS) as payload__processes__content__histograms__CANVAS_WEBGL2_SUCCESS, hist_to_json(payload.processes.content.histograms.PLUGIN_DRAWING_MODEL) as payload__processes__content__histograms__PLUGIN_DRAWING_MODEL, hist_to_json(payload.processes.content.histograms.MEDIA_DECODER_BACKEND_USED) as payload__processes__content__histograms__MEDIA_DECODER_BACKEND_USED, keyed_hist_to_json(payload.keyed_histograms.D3D11_COMPOSITING_FAILURE_ID) as payload__keyed_histograms__D3D11_COMPOSITING_FAILURE_ID, keyed_hist_to_json(payload.keyed_histograms.OPENGL_COMPOSITING_FAILURE_ID) as payload__keyed_histograms__OPENGL_COMPOSITING_FAILURE_ID, keyed_hist_to_json(payload.keyed_histograms.CANVAS_WEBGL_ACCL_FAILURE_ID) as payload__keyed_histograms__CANVAS_WEBGL_ACCL_FAILURE_ID, keyed_hist_to_json(payload.keyed_histograms.CANVAS_WEBGL_FAILURE_ID) as payload__keyed_histograms__CANVAS_WEBGL_FAILURE_ID, keyed_hist_to_json(payload.processes.content.keyed_histograms.D3D11_COMPOSITING_FAILURE_ID) as payload__processes__content__keyed_histograms__D3D11_COMPOSITING_FAILURE_ID, keyed_hist_to_json(payload.processes.content.keyed_histograms.OPENGL_COMPOSITING_FAILURE_ID) as payload__processes__content__keyed_histograms__OPENGL_COMPOSITING_FAILURE_ID, keyed_hist_to_json(payload.processes.content.keyed_histograms.CANVAS_WEBGL_ACCL_FAILURE_ID) as payload__processes__content__keyed_histograms__CANVAS_WEBGL_ACCL_FAILURE_ID, keyed_hist_to_json(payload.processes.content.keyed_histograms.CANVAS_WEBGL_FAILURE_ID) as payload__processes__content__keyed_histograms__CANVAS_WEBGL_FAILURE_ID from `moz-fx-data-shared-prod.telemetry_stable.main_v4` where date(submission_timestamp) >= '{start_date}' AND date(submission_timestamp) <= '{end_date}' AND normalized_app_name = 'Firefox' AND {channel_filter} CAST(SPLIT(application.version, '.')[OFFSET(0)] AS INT64) > {min_firefox_version} AND -- NOTE: fixed fraction corresponding to 0.0003 sample_id = 42 AND MOD(CAST(RAND()*10 AS INT64), 10) < 3), distinct_client_ids AS (SELECT distinct(client_id) FROM sample), -- Retain only the first seen documents for each client ID , base AS (SELECT * FROM sample JOIN distinct_client_ids USING (client_id)), numbered_duplicates AS (SELECT *, ROW_NUMBER() OVER (PARTITION BY client_id) AS _n FROM base) -- -- Retain only one document for each ID. SELECT * EXCEPT(_n) FROM numbered_duplicates WHERE _n = 1 """.format( start_date=start_date.strftime(FORMAT_DS), end_date=end_date.strftime(FORMAT_DS), channel_filter=channel_filter, min_firefox_version=min_firefox_version, ) bq = bigquery.Client() table_ref = bq.dataset(dataset_id, project=project_id).table(table_id) job_config = bigquery.QueryJobConfig() job_config.destination = table_ref job_config.write_disposition = "WRITE_TRUNCATE" query_job = bq.query(query, job_config=job_config) # Wait for query execution result = query_job.result() return (spark.read.format("bigquery").option("project", project_id).option( "dataset", query_job.destination.dataset_id).option( "table", query_job.destination.table_id).load().rdd)
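# Call sketch for fetch_results above (dates and channel are illustrative; `spark` is an
# existing SparkSession and FORMAT_DS is assumed to be the date format used by the query).
from datetime import date

rdd = fetch_results(
    spark,
    start_date=date(2020, 8, 1),
    end_date=date(2020, 8, 7),
    channel="release",  # pass None to skip the channel filter
)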
def run_authorized_view_tutorial(override_values={}): # Note to user: This is a group email for testing purposes. Replace with # your own group email address when running this code. analyst_group_email = "*****@*****.**" # [START bigquery_authorized_view_tutorial] # Create a source dataset # [START bigquery_avt_create_source_dataset] from google.cloud import bigquery client = bigquery.Client() source_dataset_id = "github_source_data" # [END bigquery_authorized_view_tutorial] # [END bigquery_avt_create_source_dataset] # To facilitate testing, we replace values with alternatives # provided by the testing harness. source_dataset_id = override_values.get("source_dataset_id", source_dataset_id) # [START bigquery_authorized_view_tutorial] # [START bigquery_avt_create_source_dataset] source_dataset = bigquery.Dataset(client.dataset(source_dataset_id)) # Specify the geographic location where the dataset should reside. source_dataset.location = "US" source_dataset = client.create_dataset(source_dataset) # API request # [END bigquery_avt_create_source_dataset] # Populate a source table # [START bigquery_avt_create_source_table] source_table_id = "github_contributors" job_config = bigquery.QueryJobConfig() job_config.destination = source_dataset.table(source_table_id) sql = """ SELECT commit, author, committer, repo_name FROM `bigquery-public-data.github_repos.commits` LIMIT 1000 """ query_job = client.query( sql, # Location must match that of the dataset(s) referenced in the query # and of the destination table. location="US", job_config=job_config, ) # API request - starts the query query_job.result() # Waits for the query to finish # [END bigquery_avt_create_source_table] # Create a separate dataset to store your view # [START bigquery_avt_create_shared_dataset] shared_dataset_id = "shared_views" # [END bigquery_authorized_view_tutorial] # [END bigquery_avt_create_shared_dataset] # To facilitate testing, we replace values with alternatives # provided by the testing harness. 
shared_dataset_id = override_values.get("shared_dataset_id", shared_dataset_id) # [START bigquery_authorized_view_tutorial] # [START bigquery_avt_create_shared_dataset] shared_dataset = bigquery.Dataset(client.dataset(shared_dataset_id)) shared_dataset.location = "US" shared_dataset = client.create_dataset(shared_dataset) # API request # [END bigquery_avt_create_shared_dataset] # Create the view in the new dataset # [START bigquery_avt_create_view] shared_view_id = "github_analyst_view" view = bigquery.Table(shared_dataset.table(shared_view_id)) sql_template = """ SELECT commit, author.name as author, committer.name as committer, repo_name FROM `{}.{}.{}` """ view.view_query = sql_template.format(client.project, source_dataset_id, source_table_id) view = client.create_table(view) # API request # [END bigquery_avt_create_view] # Assign access controls to the dataset containing the view # [START bigquery_avt_shared_dataset_access] # analyst_group_email = '*****@*****.**' access_entries = shared_dataset.access_entries access_entries.append( bigquery.AccessEntry("READER", "groupByEmail", analyst_group_email)) shared_dataset.access_entries = access_entries shared_dataset = client.update_dataset(shared_dataset, ["access_entries"]) # API request # [END bigquery_avt_shared_dataset_access] # Authorize the view to access the source dataset # [START bigquery_avt_source_dataset_access] access_entries = source_dataset.access_entries access_entries.append( bigquery.AccessEntry(None, "view", view.reference.to_api_repr())) source_dataset.access_entries = access_entries source_dataset = client.update_dataset(source_dataset, ["access_entries"]) # API request
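# Example invocation of the tutorial above; as noted at the top of the function, the
# analyst group email placeholder must be replaced with a group you control first.
run_authorized_view_tutorial()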
def load_new_snippet_data(dataset_id, table_name, next_load_date, end_load_date): ''' Queries different snippet related GA properties and loads results to a permanent table in bigquery :param dataset_id: Name of dataset to be loaded into :param table_name: Name of table to be loaded into :param next_load_date: Earliest date to be loaded into table_name :param end_load_date: Latest date to be loaded into table_name :return: ''' while next_load_date < end_load_date: # Set dates required for loading new data next_load_date = datetime.strftime(next_load_date, '%Y%m%d') logging.info( f'{job_name}: Starting load for next load date: {next_load_date}') client = bigquery.Client(project='ga-mozilla-org-prod-001') load_dataset_id = dataset_id load_table_name = table_name load_table_suffix = next_load_date load_table_id = f'{load_table_name.lower()}_{load_table_suffix}' # Set Sample Size Multiplier sample_rate_change_date = datetime.strptime( '20171031', '%Y%m%d') # date sampling changed from 1% to 0.1% if datetime.strptime(next_load_date, '%Y%m%d') < sample_rate_change_date: sample_multiplier = 100 else: sample_multiplier = 1000 # Configure load job dataset_ref = client.dataset(load_dataset_id) table_ref = dataset_ref.table(load_table_id) load_job_config = bigquery.QueryJobConfig() # load job call load_job_config.schema = [ bigquery.SchemaField('date', 'DATE'), bigquery.SchemaField('snippetID', 'STRING'), bigquery.SchemaField('country', 'STRING'), bigquery.SchemaField('site', 'STRING'), bigquery.SchemaField('impression', 'INTEGER'), bigquery.SchemaField('snippetBlocked', 'INTEGER'), bigquery.SchemaField('clicks', 'INTEGER'), bigquery.SchemaField('otherSnippetInteractions', 'INTEGER'), bigquery.SchemaField('sessions', 'INTEGER'), bigquery.SchemaField('addonInstallsTotal', 'INTEGER'), bigquery.SchemaField('addonInstallsGoalComp', 'INTEGER'), bigquery.SchemaField('themeInstallsTotal', 'INTEGER'), bigquery.SchemaField('themeInstallsGoalComp', 'INTEGER'), bigquery.SchemaField('donations', 'INTEGER'), bigquery.SchemaField('name', 'STRING'), bigquery.SchemaField('campaign', 'STRING'), bigquery.SchemaField('category', 'STRING'), bigquery.SchemaField('url', 'STRING'), bigquery.SchemaField('body', 'STRING') ] # Define schema load_job_config.time_partitioning = bigquery.TimePartitioning( type_=bigquery.TimePartitioningType.DAY, field='date', ) load_job_config.write_disposition = 'WRITE_TRUNCATE' # Options are WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY load_job_config.destination = table_ref sql = f""" WITH impressionData AS( SELECT visitData.date, visitData.snippetID, visitData.country, visitData.eventCategory, -- Get statistics for top 3 events. 
All other = other CASE WHEN eventCategory = 'impression' THEN COUNT(DISTINCT(fullVisitorId)) ELSE 0 END AS impression, CASE WHEN eventCategory = 'snippet-blocked' THEN COUNT(DISTINCT(fullVisitorId)) ELSE 0 END AS snippetBlocked, CASE WHEN eventCategory = 'click' OR eventCategory = 'button-click' THEN COUNT(DISTINCT(fullVisitorId)) ELSE 0 END AS clicks, CASE WHEN eventCategory NOT IN('impression','snippet-blocked', 'click','button-click') THEN COUNT(DISTINCT(fullVisitorId)) ELSE 0 END AS other FROM ( SELECT date, geoNetwork.country, fullVisitorId, eventInfo.eventAction AS snippetID, eventInfo.eventCategory FROM `ga-mozilla-org-prod-001.125230768.ga_sessions_*`, UNNEST (hits) AS hits WHERE _TABLE_SUFFIX = '{load_table_suffix}' GROUP BY 1,2,3,4,5) AS visitData GROUP BY 1,2,3,4 ORDER BY 4 DESC), -- Pull data from addons.mozilla.org addonsData AS(SELECT date AS date, trafficSource.keyword AS snippetID, geoNetwork.country AS country, SUM(totals.visits) AS sessions, SUM((SELECT SUM(DISTINCT IF (REGEXP_CONTAINS(hits.eventInfo.eventCategory, '^AMO (?:Addon|Theme|Addon / Theme) Installs$') AND hits.eventInfo.eventAction = 'addon',1,0)) FROM UNNEST(hits) hits)) AS sessionsInstallingAddons, SUM((SELECT SUM(IF (REGEXP_CONTAINS(hits.eventInfo.eventCategory, '^AMO (?:Addon|Theme|Addon / Theme) Installs$') AND hits.eventInfo.eventAction = 'addon',1,0)) FROM UNNEST(hits) hits)) AS totalAddonsInstalled, SUM((SELECT SUM(DISTINCT IF (REGEXP_CONTAINS(hits.eventInfo.eventCategory, '^AMO (?:Addon|Theme|Addon / Theme) Installs$') AND hits.eventInfo.eventAction = 'theme',1,0)) FROM UNNEST(hits) hits)) AS sessionsInstallingThemes, SUM((SELECT SUM(IF (REGEXP_CONTAINS(hits.eventInfo.eventCategory, '^AMO (?:Addon|Theme|Addon / Theme) Installs$') AND hits.eventInfo.eventAction = 'theme',1,0)) FROM UNNEST(hits) hits)) AS totalThemesInstalled FROM `ga-mozilla-org-prod-001.67693596.ga_sessions_*` WHERE _TABLE_SUFFIX = '{load_table_suffix}' AND trafficSource.medium = 'snippet' GROUP BY 1,2,3 ORDER BY 2 ASC, 4 DESC), -- Pull data from mozilla.org mozorgData AS( SELECT date as date, trafficSource.keyword as snippetID, geoNetwork.country as country, SUM(totals.visits) AS sessions FROM `ga-mozilla-org-prod-001.65789850.ga_sessions_*` WHERE _TABLE_SUFFIX = '{load_table_suffix}' AND trafficSource.medium = 'snippet' GROUP By 1,2,3 ORDER BY 4 DESC ), -- Pull data from blog.mozilla.org blogData AS( SELECT date as date, trafficSource.keyword as snippetID, geoNetwork.country as country, SUM(totals.visits) AS sessions FROM `ga-mozilla-org-prod-001.66602784.ga_sessions_*` WHERE _TABLE_SUFFIX = '{load_table_suffix}' AND trafficSource.medium = 'snippet' GROUP By 1,2,3 ORDER BY 4 DESC ), -- Pull data from testpilot.firefox.com testPilotData AS( SELECT date as date, trafficSource.keyword as snippetID, geoNetwork.country as country, SUM(totals.visits) AS sessions FROM `ga-mozilla-org-prod-001.106368739.ga_sessions_*` WHERE _TABLE_SUFFIX = '{load_table_suffix}' AND trafficSource.medium = 'snippet' GROUP By 1,2,3 ORDER BY 4 DESC ), -- Pull data from developer.mozilla.org developerData AS( SELECT date as date, trafficSource.keyword as snippetID, geoNetwork.country as country, SUM(totals.visits) AS sessions FROM `ga-mozilla-org-prod-001.66726481.ga_sessions_*` WHERE _TABLE_SUFFIX = '{load_table_suffix}' AND trafficSource.medium = 'snippet' GROUP By 1,2,3 ORDER BY 4 DESC ), -- Pull data from support.mozilla.org sumoData AS( SELECT date as date, trafficSource.keyword as snippetID, geoNetwork.country as country, SUM(totals.visits) AS sessions FROM 
`ga-mozilla-org-prod-001.65912487.ga_sessions_*` WHERE _TABLE_SUFFIX = '{load_table_suffix}' AND trafficSource.medium = 'snippet' GROUP By 1,2,3 ORDER BY 4 DESC ), -- Pull data from hacks.mozilla.org hacksData AS( SELECT date as date, trafficSource.keyword as snippetID, geoNetwork.country as country, SUM(totals.visits) AS sessions FROM `ga-mozilla-org-prod-001.65887927.ga_sessions_*` WHERE _TABLE_SUFFIX = '{load_table_suffix}' AND trafficSource.medium = 'snippet' GROUP By 1,2,3 ORDER BY 4 DESC ), -- Pull data from donate.mozilla.org donateData AS( SELECT date AS date, trafficSource.keyword AS snippetID, geoNetwork.country AS country, SUM(totals.visits) AS sessions, SUM((SELECT SUM(DISTINCT IF(REGEXP_CONTAINS(page.pagePath, '/thank-you/'),1,0)) FROM UNNEST(hits) )) AS donations FROM `ga-mozilla-org-prod-001.105783219.ga_sessions_*` WHERE _TABLE_SUFFIX = '{load_table_suffix}' AND trafficSource.medium = 'snippet' GROUP BY 1,2,3 ORDER BY 2 ASC,4 DESC ), aggregates as ( -- Aggregate by date, snippetID, country and site SELECT PARSE_DATE('%Y%m%d', impressions.date) as date, impressions.snippetID, impressions.country, 'snippets tracking' as site, SUM(impressions.impression)*{sample_multiplier} AS impression, SUM(impressions.snippetBlocked)*{sample_multiplier} AS snippetBlocked, SUM(impressions.clicks)*{sample_multiplier} AS clicks, SUM(impressions.other)*{sample_multiplier} as otherSnippetInteractions, NULL as sessions, NULL as addonInstallsTotal, NULL as addonInstallsGoalComp, NULL as themeInstallsTotal, NULL as themeInstallsGoalComp, NULL as donations FROM impressionData as impressions GROUP By 1,2,3,4 -- Join addons data UNION ALL SELECT PARSE_DATE('%Y%m%d', addonsData.date) as date, addonsData.snippetID, addonsData.country, 'addons.mozilla.org' as site, NULL as impression, NULL as snippetBlocked, NULL as clicks, NULL as otherSnippetInteractions, SUM(addonsData.sessions) as sessions, SUM(addonsData.totalAddonsInstalled) as addonInstallsTotal, SUM(addonsData.sessionsInstallingAddons) as addonInstallsGoalComp, SUM(addonsData.totalThemesInstalled) as themeInstallsTotal, SUM(addonsData.sessionsInstallingThemes) as themeInstallsGoalComp, NULL as donations FROM addonsData GROUP BY 1,2,3,4 -- Join mozilla.org data UNION ALL SELECT PARSE_DATE('%Y%m%d', mozorgData.date) as date, mozorgData.snippetID, mozorgData.country, 'mozilla.org' as site, NULL as impression, NULL as snippetBlocked, NULL as clicks, NULL as otherSnippetInteractions, SUM(mozorgData.sessions) as sessions, NULL as addonInstallsTotal, NULL as addonInstallsGoalComp, NULL as themeInstallsTotal, NULL as themeInstallsGoalComp, NULL as donations FROM mozorgData GROUP BY 1,2,3,4 -- Join blog.mozilla.org data UNION ALL SELECT PARSE_DATE('%Y%m%d', blogData.date) as date, blogData.snippetID, blogData.country, 'blog.mozilla.org' as site, NULL as impression, NULL as snippetBlocked, NULL as clicks, NULL as otherSnippetInteractions, SUM(blogData.sessions) as sessions, NULL as addonInstallsTotal, NULL as addonInstallsGoalComp, NULL as themeInstallsTotal, NULL as themeInstallsGoalComp, NULL as donations FROM blogData GROUP BY 1,2,3,4 -- Join testpilot.firefox.com data UNION ALL SELECT PARSE_DATE('%Y%m%d', testPilotData.date) as date, testPilotData.snippetID, testPilotData.country, 'testpilot.firefox.com' as site, NULL as impression, NULL as snippetBlocked, NULL as clicks, NULL as otherSnippetInteractions, SUM(testPilotData.sessions) as sessions, NULL as addonInstallsTotal, NULL as addonInstallsGoalComp, NULL as themeInstallsTotal, NULL as 
themeInstallsGoalComp, NULL as donations FROM testPilotData GROUP BY 1,2,3,4 -- Join developer.mozilla.org data UNION ALL SELECT PARSE_DATE('%Y%m%d', developerData.date) as date, developerData.snippetID, developerData.country, 'developer.mozilla.org' as site, NULL as impression, NULL as snippetBlocked, NULL as clicks, NULL as otherSnippetInteractions, SUM(developerData.sessions) as sessions, NULL as addonInstallsTotal, NULL as addonInstallsGoalComp, NULL as themeInstallsTotal, NULL as themeInstallsGoalComp, NULL as donations FROM developerData GROUP BY 1,2,3,4 -- Join support.mozilla.org data UNION ALL SELECT PARSE_DATE('%Y%m%d', sumoData.date) as date, sumoData.snippetID, sumoData.country, 'support.mozilla.org' as site, NULL as impression, NULL as snippetBlocked, NULL as clicks, NULL as otherSnippetInteractions, SUM(sumoData.sessions) as sessions, NULL as addonInstallsTotal, NULL as addonInstallsGoalComp, NULL as themeInstallsTotal, NULL as themeInstallsGoalComp, NULL as donations FROM sumoData GROUP BY 1,2,3,4 -- Join hacks.mozilla.org data UNION ALL SELECT PARSE_DATE('%Y%m%d', hacksData.date) as date, hacksData.snippetID, hacksData.country, 'hacks.mozilla.org' as site, NULL as impression, NULL as snippetBlocked, NULL as clicks, NULL as otherSnippetInteractions, SUM(hacksData.sessions) as sessions, NULL as addonInstallsTotal, NULL as addonInstallsGoalComp, NULL as themeInstallsTotal, NULL as themeInstallsGoalComp, NULL as donations FROM hacksData GROUP BY 1,2,3,4 -- Join donate.mozilla.org data UNION ALL SELECT PARSE_DATE('%Y%m%d', donateData.date) as date, donateData.snippetID, donateData.country, 'donate.mozilla.org' as site, NULL as impression, NULL as snippetBlocked, NULL as clicks, NULL as otherSnippetInteractions, SUM(donateData.sessions) as sessions, NULL as addonInstallsTotal, NULL as addonInstallsGoalComp, NULL as themeInstallsTotal, NULL as themeInstallsGoalComp, SUM(donateData.donations) as donations FROM donateData GROUP BY 1,2,3,4 -- Join telemetry tracking data UNION ALL SELECT sendDate, messageID, countryCode, 'telemetry tracking' as site, SUM(impressions) as impression, SUM(blocks) as snippetBlocked, SUM(clicks) as clicks, NULL as other, NULL as sessions, NULL as addonInstallsTotal, NULL as addonInstallsGoalComp, NULL as themeInstallsTotal, NULL as themeInstallsGoalComp, NULL as donations FROM `ga-mozilla-org-prod-001.snippets.snippets_telemetry_tracking_*` WHERE _TABLE_SUFFIX = '{load_table_suffix}' GROUP BY 1,2,3,4), metaData as ( SELECT * FROM `ga-mozilla-org-prod-001.snippets.snippets_metadata`) SELECT aggregates.*, metaData.name, metaData.campaign, metaData.category, metaData.url, metaData.body FROM aggregates LEFT JOIN metaData ON aggregates.snippetID = metaData.ID """ # Run Load Job query_job = client.query( sql, # Location must match that of the dataset(s) referenced in the query # and of the destination table. location='US', job_config=load_job_config) # API request - starts the query query_job.result() # Waits for the query to finish logging.info( f'{job_name}: Query results loaded to table {table_ref.path}') # Set next_load_date next_load_date = datetime.strptime(next_load_date, '%Y%m%d') + timedelta(1) return
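# Call sketch for the loader above (dataset and table names are hypothetical): the
# while-loop walks one day at a time from next_load_date up to, but not including,
# end_load_date, writing one date-suffixed table per day.
from datetime import datetime

load_new_snippet_data(
    dataset_id="snippets",            # hypothetical destination dataset
    table_name="snippets_ga",         # hypothetical destination table prefix
    next_load_date=datetime(2019, 3, 1),
    end_load_date=datetime(2019, 3, 8),
)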
def main(project, source_dataset, destination_dataset, create_table, backfill, dryrun): """Generate queries and optionally create the tables in BigQuery.""" client = bigquery.Client(project=project) exported_tables = [ table.table_id for table in client.list_tables(source_dataset) if table.table_type == "TABLE" ] tables_by_dimension = defaultdict(list) opt_in_metrics = set() # group table names by the dimension it is grouped by for table_name in exported_tables: if table_name.endswith("_total"): dimension = None else: metric, dimension = table_name.split("_by_") if dimension.startswith("opt_in"): opt_in_metrics.add(metric) dimension = dimension.replace("opt_in_", "") tables_by_dimension[dimension].append(table_name) for dimension, table_names in tables_by_dimension.items(): qualified_table_names = [ f"`{project}.{source_dataset}.{table_name}`" for table_name in table_names ] if dimension is not None: fields = f"date, app_name, {dimension}" table_name = f"metrics_by_{dimension}" metrics = [table_name.split("_by_")[0] for table_name in table_names] else: fields = "date, app_name" table_name = "metrics_total" metrics = [table_name.split("_total")[0] for table_name in table_names] join_clauses = [ JOIN_TEMPLATE.format(table=table_name, fields=fields) for table_name in qualified_table_names[1:] ] # add _opt_in to opt-in metrics fields_to_add_opt_in = [ metric for metric in metrics if metric in opt_in_metrics ] excepted_fields = ",".join(fields_to_add_opt_in) additional_fields = [ f"{name} AS {name}_opt_in" for name in fields_to_add_opt_in if name != "rate" ] # rename rate column to opt_in_rate and if "rate" in metrics: additional_fields.append("rate AS opt_in_rate") query_text = QUERY_TEMPLATE.format( excepted_fields=excepted_fields, additional_fields=", ".join(additional_fields), first_table=qualified_table_names[0], joined_tables="\n".join(join_clauses), filter="date=@date", ) query_path = os.path.join(SQL_DIR, destination_dataset, table_name, "query.sql") if not os.path.exists(os.path.dirname(query_path)): os.makedirs(os.path.dirname(query_path)) with open(query_path, "w") as f: print(f"Writing {query_path}") f.write(reformat(query_text)) f.write("\n") if create_table: query_text = QUERY_TEMPLATE.format( excepted_fields=excepted_fields, additional_fields=", ".join(additional_fields), first_table=qualified_table_names[0], joined_tables="\n".join(join_clauses), filter="TRUE" if backfill else "FALSE", ) schema_update_options = ( [] if backfill else [bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION] ) job_config = bigquery.QueryJobConfig( use_legacy_sql=False, dry_run=dryrun, destination=f"{project}.{destination_dataset}.{table_name}", schema_update_options=schema_update_options, time_partitioning=bigquery.TimePartitioning(field="date"), create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE if backfill else bigquery.WriteDisposition.WRITE_APPEND, ) print(f"Creating table {table_name}") query_job = client.query(query_text, job_config) if not dryrun: query_job.result()
def update_with_bq_access(self): # cron status status = "" # delete all records in resource and resource_access table status += deleteAllRecordInTable("resource") status += deleteAllRecordInTable("resource_access") # return string with concatenated SQL insert result return_string = "" # Instantiates a client bigquery_client = bigquery.Client() # BQ Total Bytes Billed to report to status total_bytes_billed = 0 # loop through multiple course ids, 20 at a time # (This is set by the CRON_BQ_IN_LIMIT from settings) for data_warehouse_course_ids in split_list( Course.objects.get_supported_courses(), settings.CRON_BQ_IN_LIMIT): # query to retrieve all file access events for one course # There is no catch if this query fails, event_store.events needs to exist final_bq_query = [] for k, query_obj in settings.RESOURCE_ACCESS_CONFIG.items(): final_bq_query.append(query_obj['query']) final_bq_query = " UNION ALL ".join(final_bq_query) data_warehouse_course_ids_short = [ db_util.incremented_id_to_canvas_id(id) for id in data_warehouse_course_ids ] logger.debug(final_bq_query) logger.debug(data_warehouse_course_ids) query_params = [ bigquery.ArrayQueryParameter('course_ids', 'STRING', data_warehouse_course_ids), bigquery.ArrayQueryParameter('course_ids_short', 'STRING', data_warehouse_course_ids_short), bigquery.ScalarQueryParameter( 'canvas_data_id_increment', 'INT64', settings.CANVAS_DATA_ID_INCREMENT) ] job_config = bigquery.QueryJobConfig() job_config.query_parameters = query_params # Location must match that of the dataset(s) referenced in the query. bq_query = bigquery_client.query(final_bq_query, location='US', job_config=job_config) #bq_query.result() resource_access_df = bq_query.to_dataframe() total_bytes_billed += bq_query.total_bytes_billed logger.debug("df row number=" + str(resource_access_df.shape[0])) # drop duplicates resource_access_df.drop_duplicates( ["resource_id", "user_id", "access_time"], keep='first', inplace=True) logger.debug("after drop duplicates, df row number=" + str(resource_access_df.shape[0])) logger.debug(resource_access_df) # Because we're pulling all the data down into one query we need to manipulate it a little bit # Make a copy of the access dataframe resource_df = resource_access_df.copy(deep=True) # Drop out the columns user and access time from resource data frame resource_df.drop(["user_id", "access_time"], axis=1, inplace=True) # Drop out the duplicates resource_df.drop_duplicates(["resource_id", "course_id"], inplace=True) # Rename the column resource_id to id resource_df.rename(columns={"resource_id": "id"}, inplace=True) # Drop out the columns resource_type, course_id, name from the resource_access resource_access_df.drop(["resource_type", "name", "course_id"], axis=1, inplace=True) # Drop the columns where there is a Na value resource_access_df_drop_na = resource_access_df.dropna() logger.info( f"{len(resource_access_df) - len(resource_access_df_drop_na)} / {len(resource_access_df)} rows were dropped because of NA" ) # First update the resource table # write to MySQL try: resource_df.to_sql(con=engine, name='resource', if_exists='append', index=False) except Exception as e: logger.exception("Error running to_sql on table resource") raise try: resource_access_df_drop_na.to_sql(con=engine, name='resource_access', if_exists='append', index=False) except Exception as e: logger.exception( "Error running to_sql on table resource_access") raise return_string += str(resource_access_df_drop_na.shape[0] ) + " rows for courses " + ",".join( map(str, 
data_warehouse_course_ids)) + "\n" logger.info(return_string) total_tbytes_billed = total_bytes_billed / 1024 / 1024 / 1024 / 1024 # $5 per TB as of Feb 2019 https://cloud.google.com/bigquery/pricing total_tbytes_price = round(5 * total_tbytes_billed, 2) status += ( f"TBytes billed for BQ: {total_tbytes_billed} = ${total_tbytes_price}\n" ) return status
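# Worked example of the billing arithmetic above: 0.25 TB billed * $5/TB = $1.25,
# which is what gets appended to the status string.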
lengthArg = sys.argv[2]

query = """
    SELECT * FROM `@dataset` LIMIT @length
"""
query_params = [
    bigquery.ScalarQueryParameter('dataset', 'STRING', str(dataArg)),
    bigquery.ScalarQueryParameter('length', 'INT64', int(lengthArg))
]
config_obj = bigquery.QueryJobConfig()
config_obj.query_parameters = query_params

def run_query():
    # init BQ client
    client = bigquery.Client()
    ## construct query
    query_job = client.query(
        query,
        location='US',
        job_config=config_obj  # client.query() takes `job_config`, not `config_obj`
    )
    # query_job = client.query("""
    # SELECT
    # *
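# Note: BigQuery query parameters substitute values only, never identifiers, so the
# `@dataset` placeholder inside backticks above is not replaced with a table name.
# A sketch of one workaround (dataArg is assumed to hold a trusted, fully-qualified
# table name): format the identifier into the SQL text and keep LIMIT as a parameter.
safe_query = """
    SELECT *
    FROM `{}`
    LIMIT @length
""".format(dataArg)

safe_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter('length', 'INT64', int(lengthArg)),
    ]
)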
def get_data_bigquery(spark, chunk_start, chunk_end): bq = bigquery.Client() filtered_data_sql = f""" WITH rank_per_client AS ( SELECT *, ROW_NUMBER() OVER (PARTITION BY client_id ORDER BY submission_timestamp DESC) AS rn FROM `moz-fx-data-shared-prod.telemetry_stable.main_v4` WHERE sample_id = 42 AND DATE(submission_timestamp)>='{chunk_start.strftime('%Y-%m-%d')}' AND DATE(submission_timestamp)<'{chunk_end.strftime('%Y-%m-%d')}' ), latest_per_client AS( SELECT * FROM rank_per_client WHERE rn=1 ) SELECT environment.build.architecture AS browser_arch, environment.system.os.name AS os_name, environment.system.os.version AS os_version, environment.system.memory_mb, coalesce(environment.system.is_wow64, FALSE) AS is_wow64, environment.system.gfx.adapters[OFFSET(0)].vendor_id AS gfx0_vendor_id, environment.system.gfx.adapters[OFFSET(0)].device_id AS gfx0_device_id, IF(ARRAY_LENGTH(environment.system.gfx.monitors)>0, environment.system.gfx.monitors[OFFSET(0)].screen_width, 0) AS screen_width, IF(ARRAY_LENGTH(environment.system.gfx.monitors)>0, environment.system.gfx.monitors[OFFSET(0)].screen_height, 0) AS screen_height, environment.system.cpu.cores AS cpu_cores, environment.system.cpu.vendor AS cpu_vendor, environment.system.cpu.speed_m_hz AS cpu_speed, 'Shockwave Flash' IN (SELECT name FROM UNNEST(environment.addons.active_plugins)) AS has_flash FROM latest_per_client WHERE environment.system.cpu.speed_m_hz IS NOT NULL """ print("Query is: " + filtered_data_sql) TABLE_PROJECT = "moz-fx-data-derived-datasets" TABLE_DATASET = "analysis" TABLE_NAME = "hardware_report_filtered_data" table_ref = bq.dataset(TABLE_DATASET, project=TABLE_PROJECT).table(TABLE_NAME) job_config = bigquery.QueryJobConfig() job_config.destination = table_ref job_config.write_disposition = "WRITE_TRUNCATE" query_job = bq.query(filtered_data_sql, job_config=job_config) # # Wait for query execution query_job.result() filtered_data_df = (spark.read.format("bigquery").option( "parallelism", 200).option("table", f"{TABLE_PROJECT}.{TABLE_DATASET}.{TABLE_NAME}").load()) # Defined to keep compatibility with AWS implementation, # they're 0 here since these describe longitudinal data quality broken_ratio = 0 inactive_ratio = 0 return (filtered_data_df.rdd, broken_ratio, inactive_ratio)
def gen_job_config():
    job_config = bigquery.QueryJobConfig()
    job_config.use_legacy_sql = False
    return job_config
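# Example use of the helper above (the query text is illustrative); recent versions of
# google-cloud-bigquery already default to standard SQL, so the flag mainly documents intent.
client = bigquery.Client()
rows = client.query("SELECT 1 AS n", job_config=gen_job_config()).result()
print(list(rows))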
def resize(self): """ This is the execute function of this class. It copies the source table into the destination table and then copies the destination table into itself until it reaches or exceeds the target_rows. """ # How many rows short of our target are we? gap = self.target_rows - self.source_table.num_rows while gap > 0: # Copy until we've reached or exceeded target_rows # API requests to get the latest table info. source_table = self.client.get_table(self.source_table) try: dest_table = self.client.get_table(self.dest_table_ref) except NotFound: dest_table = self.client.create_table( bigquery.Table(self.dest_table_ref)) # Get the latest size of the dest_table. # Note that for the first call these properties are None. dest_rows = dest_table.num_rows dest_bytes = dest_table.num_bytes dest_gb = dest_bytes / float(1024**3) # Recalculate the gap. if dest_rows: gap = self.target_rows - dest_rows else: gap = self.target_rows print(('{} rows in table of size {} GB, with a target of {}, ' 'leaving a gap of {}'.format(dest_rows, round(dest_gb, 2), self.target_rows, gap))) # Greedily copy the largest of dest_table and source_table into # dest_table without going over the target rows. The last query # will be a subset of source_table via a limit query. if gap < source_table.num_rows: # This will be the last copy operation if target_rows is # not a power of 2 times the number of rows originally in the # source table. It is not a full copy. job_config = bigquery.QueryJobConfig() # Set the destination table job_config.destination = self.dest_table_ref job_config.write_disposition = 'WRITE_APPEND' job_config.allow_large_results = True sql = """ SELECT * FROM `{}.{}.{}` LIMIT {} """.format(self.project, self.source_table.dataset_id, self.source_table.table_id, gap) # API request to BigQuery with query and config defined above. query_job = self.client.query( sql, # Location must match that of the dataset(s) referenced in # the query and of the destination table. location=self.location, job_config=job_config) # Wait for query_job to finish. query_job.result() else: if source_table.num_rows < dest_table.num_rows < gap: use_as_source_table = self.dest_table_ref else: # source_table.num_rows < gap < dest_table.num_rows use_as_source_table = self.source_table.reference copy_config = bigquery.CopyJobConfig() copy_config.write_disposition = 'WRITE_APPEND' copy_job = self.client.copy_table(use_as_source_table, self.dest_table_ref, job_config=copy_config) # Wait for copy_job to finish. copy_job.result()
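# Rough cost model for the loop above: each full self-copy approximately doubles
# dest_table, so reaching target_rows takes about ceil(log2(target_rows / source_rows))
# copy jobs plus, when target_rows is not reached exactly, one final LIMIT query.
import math

def estimated_copy_jobs(source_rows, target_rows):
    """Sketch of the expected number of copy operations, under the doubling assumption."""
    if source_rows <= 0 or target_rows <= source_rows:
        return 0
    return math.ceil(math.log2(target_rows / source_rows))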
def main(): """Process deletion requests.""" args = parser.parse_args() if args.partition_limit is not None and not args.dry_run: parser.print_help() print("ERROR: --partition-limit specified without --dry-run") if args.start_date is None: args.start_date = args.end_date - timedelta(days=14) source_condition = ( f"DATE(submission_timestamp) >= '{args.start_date}' " f"AND DATE(submission_timestamp) < '{args.end_date}'" ) client_q = ClientQueue(args.billing_projects, args.parallelism) client = client_q.default_client states = {} if args.state_table: state_table_exists = False try: client.get_table(args.state_table) state_table_exists = True except NotFound: if not args.dry_run: client.create_table( bigquery.Table( args.state_table, [ bigquery.SchemaField("task_id", "STRING"), bigquery.SchemaField("job_id", "STRING"), bigquery.SchemaField("job_created", "TIMESTAMP"), bigquery.SchemaField("start_date", "DATE"), bigquery.SchemaField("end_date", "DATE"), ], ) ) state_table_exists = True if state_table_exists: states = dict( client.query( reformat( f""" SELECT task_id, job_id, FROM `{args.state_table}` WHERE start_date = '{args.start_date}' AND end_date = '{args.end_date}' ORDER BY job_created """ ) ).result() ) if args.environment == "telemetry": with ThreadPool(args.parallelism) as pool: glean_targets = find_glean_targets(pool, client) experiment_analysis_targets = find_experiment_analysis_targets(pool, client) targets_with_sources = chain( DELETE_TARGETS.items(), glean_targets.items(), experiment_analysis_targets.items(), ) elif args.environment == "pioneer": with ThreadPool(args.parallelism) as pool: targets_with_sources = find_pioneer_targets( pool, client, study_projects=args.pioneer_study_projects ).items() tasks = [ task for target, sources in targets_with_sources if args.table_filter(target.table) for task in delete_from_table( client=client, target=replace(target, project=args.target_project or target.project), sources=[ replace(source, project=args.source_project or source.project) for source in (sources if isinstance(sources, tuple) else (sources,)) ], source_condition=source_condition, dry_run=args.dry_run, read_only=args.read_only, priority=args.priority, start_date=args.start_date, end_date=args.end_date, max_single_dml_bytes=args.max_single_dml_bytes, partition_limit=args.partition_limit, state_table=args.state_table, states=states, ) ] if not tasks: logging.error("No tables selected") parser.exit(1) # ORDER BY partition_sort_key DESC, sql_table_id ASC # https://docs.python.org/3/howto/sorting.html#sort-stability-and-complex-sorts tasks.sort(key=lambda task: sql_table_id(task.table)) tasks.sort(key=attrgetter("partition_sort_key"), reverse=True) with ThreadPool(args.parallelism) as pool: if args.task_table and not args.dry_run: # record task information try: client.get_table(args.task_table) except NotFound: table = bigquery.Table( args.task_table, [ bigquery.SchemaField("task_id", "STRING"), bigquery.SchemaField("start_date", "DATE"), bigquery.SchemaField("end_date", "DATE"), bigquery.SchemaField("target", "STRING"), bigquery.SchemaField("target_rows", "INT64"), bigquery.SchemaField("target_bytes", "INT64"), bigquery.SchemaField("source_bytes", "INT64"), ], ) table.time_partitioning = bigquery.TimePartitioning() client.create_table(table) sources = list(set(source for task in tasks for source in task.sources)) source_bytes = { source: job.total_bytes_processed for source, job in zip( sources, pool.starmap( client.query, [ ( reformat( f""" SELECT {source.field} FROM 
`{sql_table_id(source)}` WHERE {source_condition} """ ), bigquery.QueryJobConfig(dry_run=True), ) for source in sources ], chunksize=1, ), ) } step = 10000 # max 10K rows per insert for start in range(0, len(tasks), step): end = start + step BigQueryInsertError.raise_if_present( errors=client.insert_rows_json( args.task_table, [ { "task_id": get_task_id(task.table, task.partition_id), "start_date": args.start_date.isoformat(), "end_date": args.end_date.isoformat(), "target": sql_table_id(task.table), "target_rows": task.table.num_rows, "target_bytes": task.table.num_bytes, "source_bytes": sum( map(source_bytes.get, task.sources) ), } for task in tasks[start:end] ], ) ) results = pool.map( client_q.with_client, (task.func for task in tasks), chunksize=1 ) jobs_by_table = defaultdict(list) for i, job in enumerate(results): jobs_by_table[tasks[i].table].append(job) bytes_processed = rows_deleted = 0 for table, jobs in jobs_by_table.items(): table_bytes_processed = sum(job.total_bytes_processed or 0 for job in jobs) bytes_processed += table_bytes_processed table_id = sql_table_id(table) if args.dry_run: logging.info(f"Would scan {table_bytes_processed} bytes from {table_id}") else: table_rows_deleted = sum(job.num_dml_affected_rows or 0 for job in jobs) rows_deleted += table_rows_deleted logging.info( f"Scanned {table_bytes_processed} bytes and " f"deleted {table_rows_deleted} rows from {table_id}" ) if args.dry_run: logging.info(f"Would scan {bytes_processed} in total") else: logging.info( f"Scanned {bytes_processed} and deleted {rows_deleted} rows in total" )
def BQPreprocess(cpu, date_generated, client, query_fit, loadfrom="elastic"): bq_client = client job_config = bigquery.QueryJobConfig() datalist = [] datalist_hist = [] logger.info("Starting data fetch iterative...") for ndate in date_generated: tframe = getBig(ndate.strftime("%Y-%m-%d"), query_fit) if tframe is not None: if not tframe.empty: if loadfrom.strip().lower() == 'datastore': X_split = np.array_split(tframe, 5) logger.info("loading history data from datastore...") logger.info("Len of X_split for batch load: %d", len(X_split)) logger.info("Appending history data...") for ix in range(len(X_split)): # ~ loading history """ here we either merge this with tframe, or build a separate dataframe """ logger.info("processing batch-%d", ix) # https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas' logger.info("creating list history data...") #lhistory = list(X_split[ix]["user_id"].head(1000).map(str) + "_" + X_split[ix]["topic_id"].head(1000).map(str)) lhistory = list(X_split[ix]["user_id"].map(str) + "_" + X_split[ix]["topic_id"].map(str)) logger.info("call history data...") h_frame = mh.loadDSHistory(lhistory) # me = os.getpid() # kill_proc_tree(me) logger.info("done collecting history data, appending now...") for m in h_frame: if m is not None: if len(m) > 0: datalist_hist.append(pd.DataFrame(m)) del h_frame del lhistory logger.info("Appending training data...") datalist.append(tframe) elif loadfrom.strip().lower() == 'elastic': X_split = np.array_split(tframe, 50) logger.info("loading history data from elastic...") logger.info("Len of X_split for batch load: %d", len(X_split)) logger.info("Appending history data...") for ix in range(len(X_split)): lhistory = list(X_split[ix]["user_id"].map(str) + "_" + X_split[ix]["topic_id"].map(str)) logger.info("call %d history data...", len(lhistory)) inside_data = mh.loadESHistory(lhistory, es, esindex_name='fitted_hist_index', estype_name='fitted_hist_type') if inside_data is not None: # split back the user_id and topic_id inside_data[['user_id','topic_id']] = inside_data.uid_topid.str.split('_', expand=True) inside_data = inside_data[["user_id","topic_id", "pt_posterior_x_Nt", "smoothed_pt_posterior", "p0_cat_ci", "sigma_Nt"]] logger.info("Appending %d data into datalist_hist..", len(inside_data)) datalist_hist.append(inside_data) del inside_data else: logger.info("inside_data is None...") logger.info("Appending training data...") datalist.append(tframe) else: logger.info("Unknown source selected!") break else: logger.info("tframe for date: %s is empty", ndate.strftime("%Y-%m-%d")) logger.info("len datalist: %d", len(datalist)) logger.info("All data fetch iterative done!!") return datalist, datalist_hist
def query(self, request_body, page_size): query = request_body['query'] jobConfig = request_body['jobConfig'] dryRunOnly = request_body['dryRunOnly'] # process flags processed_flags = { support_flag: jobConfig[support_flag] for support_flag in SUPPORTED_JOB_CONFIG_FLAGS if support_flag in jobConfig } if 'params' in processed_flags: processed_flags['query_parameters'] = _helpers.to_query_parameters( processed_flags['params']) if 'maximum_bytes_billed' in processed_flags and\ processed_flags['maximum_bytes_billed'] is None: del processed_flags['maximum_bytes_billed'] if 'use_legacy_sql' in processed_flags and\ not isinstance(processed_flags['use_legacy_sql'], bool): raise ValueError( 'use_legacy_sql shoud be boolean, instead received {}'.format( processed_flags['use_legacy_sql'])) if 'destination_table' in processed_flags: processed_flags['destination'] = processed_flags[ 'destination_table'] del processed_flags['destination_table'] # dry run, will throw exception if fail dry_run_job_config = bigquery.QueryJobConfig(**processed_flags) dry_run_job_config.dry_run = True dry_run_job_config.use_query_cache = False try: with PagedQueryHandler.client_lock: if 'project' in jobConfig and jobConfig['project'] is not None: PagedQueryHandler.client.project = jobConfig['project'] else: PagedQueryHandler.client.project = PagedQueryHandler.orig_project dry_run_job = PagedQueryHandler.client.query( query, job_config=dry_run_job_config) PagedQueryHandler.client.project = PagedQueryHandler.orig_project except Exception as err: if hasattr(err, 'errors'): raise Exception(err.errors[0]['message']) else: raise Exception(err) total_bytes_processed = dry_run_job.total_bytes_processed if dryRunOnly: job_id = 'dry_run' if dry_run_job.job_id is None else dry_run_job.job_id yield dry_run_job, job_id yield { 'content': json.dumps(None), 'labels': json.dumps(None), 'bytesProcessed': json.dumps(total_bytes_processed) } return # actual run job_config = bigquery.QueryJobConfig(**processed_flags) # need synchronization since all query handler share the same client with PagedQueryHandler.client_lock: if 'project' in jobConfig and jobConfig['project'] is not None: PagedQueryHandler.client.project = jobConfig['project'] else: PagedQueryHandler.client.project = PagedQueryHandler.orig_project query_job = PagedQueryHandler.client.query(query, job_config=job_config) PagedQueryHandler.client.project = PagedQueryHandler.orig_project if query_job.error_result is not None: raise Exception(query_job.error_result) yield query_job, query_job.job_id # send contents en = query_job.result(page_size) schema_fields = format_preview_fields(en.schema) duration = (query_job.ended - query_job.started).total_seconds() for page in en.pages: if page.num_items > USE_PARALLEL_THRESH: content = parallel_format_preview_rows(page, en.schema, pool=self.pool) else: content = format_preview_rows(page, en.schema) response = { 'content': json.dumps(content), 'labels': schema_fields, 'bytesProcessed': total_bytes_processed, 'project': query_job.project, 'duration': duration, } yield response
def execute(self, data): #setup client = self.client bigquery = self.bigquery datetime = self.datetime pytz = self.pytz time = self.time name = data.get("titleName") emails = data.get("emails") query = data.get("query") table = "" # # create a dataset first if needed dataset_main = self.make_dataset() table_id = "{}.{}".format(dataset_main, name) # # create external table if (self.env.get("create_external_table")): try: # Configure the external data source dataset_id = dataset_main table_id = "{}.{}".format(dataset_main, query) schema = [ bigquery.SchemaField("name", "STRING"), bigquery.SchemaField("post_abbr", "STRING"), ] table = bigquery.Table(table_id, schema=schema) external_config = bigquery.ExternalConfig("CSV") external_config.source_uris = [ "gs://cloud-samples-data/bigquery/us-states/us-states.csv" ] external_config.options.skip_leading_rows = 1 # optionally skip header row table.external_data_configuration = external_config # Create a permanent table linked to the GCS file table = client.create_table(table) # API request # Example query to find states starting with 'W' sql = 'SELECT * FROM `{}` WHERE name LIKE "W%"'.format( table_id) query_job = client.query(sql) # API request w_states = list(query_job) # Waits for query to finish return "There are {} states with names starting with W. we pulled the data from us-states.csv in cloud storage".format( len(w_states)) except BaseException as e: print('my custom error\n') print(e.__class__.__name__) print('\n') print(e) return 'an error occured check the output from the backend' # # create temp external table elif (self.env.get("create_temp_external_table")): try: schema = ["filename", "name"] # Configure the external data source and query job. external_config = bigquery.ExternalConfig("CSV") external_config.source_uris = [ "gs://cloud-samples-data/bigquery/us-states/us-states.csv" ] external_config.schema = [ bigquery.SchemaField("name", "STRING"), bigquery.SchemaField("post_abbr", "STRING"), ] external_config.options.skip_leading_rows = 1 table_id = "usa_states" job_config = bigquery.QueryJobConfig( table_definitions={table_id: external_config}) # Example query to find states starting with 'W'. sql = """ SELECT _FILE_NAME AS {},{} FROM `{}` WHERE name LIKE "W%" """.format(schema[0], schema[1], table_id) query_job = client.query( sql, job_config=job_config) # Make an API request. query_job.result() return json.dumps({ "schema": [{ "field": x } for x in schema], "data": [ # Row values can be accessed by field name or index. { schema[0]: row[schema[0]], schema[1]: row[schema[1]] } for row in query_job ] }) except BaseException as e: print('my custom error\n') print(e.__class__.__name__) print('\n') print(e) return 'an error occured check the output from the backend' # # drive create external table elif (self.env.get("drive_create_external_table")): try: dataset_id = dataset_main # Configure the external data source. dataset = client.get_dataset(dataset_id) table_id = query schema = [ bigquery.SchemaField("name", "STRING"), bigquery.SchemaField("post_abbr", "STRING"), ] table = bigquery.Table(dataset.table(table_id), schema=schema) external_config = bigquery.ExternalConfig("GOOGLE_SHEETS") # Use a shareable link or grant viewing access to the email address you # used to authenticate with BigQuery (this example Sheet is public). 
sheet_url = ( "https://docs.google.com/spreadsheets/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing" ) external_config.source_uris = [sheet_url] external_config.options.skip_leading_rows = 1 # Optionally skip header row. external_config.options.range = ( "us-states!A20:B49" # Optionally set range of the sheet to query from. ) table.external_data_configuration = external_config # Create a permanent table linked to the Sheets file. table = client.create_table(table) # Make an API request. # Example query to find states starting with "W". sql = 'SELECT * FROM `{}.{}` WHERE name LIKE "W%"'.format( dataset_id, table_id) query_job = client.query(sql) # Make an API request. # Wait for the query to complete. w_states = list(query_job) return "There are {} states with names starting with W in the selected range. this data came from google drive".format( len(w_states)) except BaseException as e: print('my custom error\n') print(e.__class__.__name__) print('\n') print(e) return 'an error occured check the output from the backend' # # drive create temp external table elif (self.env.get("drive_create_temp_external_table")): try: schema = ["name", "post_abbr"] # Configure the external data source and query job. external_config = bigquery.ExternalConfig("GOOGLE_SHEETS") sheet_url = ( "https://docs.google.com/spreadsheets" "/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing" ) external_config.source_uris = [sheet_url] external_config.schema = [ bigquery.SchemaField("name", "STRING"), bigquery.SchemaField("post_abbr", "STRING"), ] external_config.options.skip_leading_rows = 1 # Optionally skip header row. external_config.options.range = ( "us-states!A20:B49" # Optionally set range of the sheet to query from. ) table_id = "usa_states" job_config = bigquery.QueryJobConfig( table_definitions={table_id: external_config}) # Example query to find states starting with 'W'. sql = """ SELECT * FROM `{}` WHERE name LIKE "W%" """.format(table_id) query_job = client.query( sql, job_config=job_config) # Make an API request. query_job.result() [print(row) for row in query_job] return json.dumps({ "schema": [{ "field": x } for x in schema], "data": [ # Row values can be accessed by field name or index. { schema[0]: row[schema[0]], schema[1]: row[schema[1]] } for row in query_job ] }) except BaseException as e: print('my custom error\n') print(e.__class__.__name__) print('\n') print(e) return 'an error occured check the output from the backend' # return "Check the backend env dictionary you did set it so the backend didnt do anything"
def submit(self, sql, create, dml=None): """ Submit the sql query to create a de-identified table. :param sql: The sql to send. :param create: a flag to identify if this query should create a new table or append to an existing table. :param dml: boolean flag identifying if a statement is a dml statement """ dml = False if dml is None else dml table_name = self.get_tablename() client = bq.Client.from_service_account_json(self.private_key) # # Let's make sure the out dataset exists datasets = list(client.list_datasets()) found = np.sum( [1 for dataset in datasets if dataset.dataset_id == self.odataset]) if not found: dataset = bq.Dataset(client.dataset(self.odataset)) client.create_dataset(dataset) # create the output table if create: LOGGER.info('creating new table:\t%s', self.tablename) bq_utils.create_standard_table(self.tablename, self.tablename, drop_existing=True, dataset_id=self.odataset) write_disposition = bq_consts.WRITE_EMPTY else: write_disposition = bq_consts.WRITE_APPEND LOGGER.info('appending results to table:\t%s', self.tablename) job = bq.QueryJobConfig() job.priority = self.priority job.dry_run = True dml_job = None if not dml: job.destination = client.dataset(self.odataset).table( self.tablename) job.use_query_cache = True job.allow_large_results = True job.write_disposition = write_disposition if self.partition: job._properties['timePartitioning'] = {'type': 'DAY'} job._properties['clustering'] = {'field': 'person_id'} else: # create a copy of the job config to use if the dry-run passes dml_job = copy(job) LOGGER.info( 'submitting a dry-run for:\t%s\t\tpriority:\t%s\t\tpartition:\t%s', self.get_tablename(), self.priority, self.partition) logpath = os.path.join(self.logpath, self.idataset) try: os.makedirs(logpath) except OSError: # log path already exists and we don't care pass try: response = client.query(sql, location='US', job_config=job) except Exception: LOGGER.exception( 'dry run query failed for:\t%s\n' '\t\tSQL:\t%s\n' '\t\tjob config:\t%s', self.get_tablename(), sql, job) else: if response.state == 'DONE': if dml_job: job = dml_job job.dry_run = False LOGGER.info('dry-run passed. submitting query for execution.') response = client.query(sql, location='US', job_config=job) LOGGER.info( 'submitted a %s job for table:\t%s\t\tstatus:\t%s\t\tvalue:\t%s', 'bigquery', table_name, 'pending', response.job_id) self.wait(client, response.job_id)
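# The submit() above writes timePartitioning and clustering through the private
# job._properties dict. This is a hedged sketch of the same dry-run-then-execute flow
# using only public QueryJobConfig fields; dataset_id/table_id are assumed arguments
# and the "person_id" clustering column is carried over from the snippet above.
from google.cloud import bigquery


def dry_run_then_execute(client, sql, dataset_id, table_id):
    # Dry run first: validates the SQL and reports bytes without billing anything.
    dry_job = client.query(
        sql, job_config=bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    )
    print("dry run ok, would process {} bytes".format(dry_job.total_bytes_processed))

    # Real run into a day-partitioned, clustered destination table.
    job_config = bigquery.QueryJobConfig(
        destination=bigquery.DatasetReference(client.project, dataset_id).table(table_id),
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        time_partitioning=bigquery.TimePartitioning(type_=bigquery.TimePartitioningType.DAY),
        clustering_fields=["person_id"],
    )
    return client.query(sql, job_config=job_config).result()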
def test_validate_hmac_sha256(sql):
    """Validate hmac_sha256."""
    job_config = bigquery.QueryJobConfig(use_legacy_sql=False)
    job = bigquery.Client().query(sql, job_config=job_config)
    job.result()
def main(submission_date, dst_table, project, tmp_project, dataset): """Run query per app_version.""" bq_client = bigquery.Client(project=project) app_versions = [ row["app_version"] for row in bq_client.query( VERSION_QUERY_TEMPLATE.format(date=submission_date, project=project, dataset=dataset)).result() ] print(f"Found versions: {app_versions}") if len(app_versions) == 0: print("Source table empty", file=sys.stderr) sys.exit(1) sql_path = SQL_BASE_DIR / dst_table / "query.sql" query_text = sql_path.read_text() # Write to intermediate table to avoid partial writes to destination table if tmp_project is None: tmp_project = project intermediate_table = f"{tmp_project}.analysis.glam_temp_clustered_query_{dst_table}" print(f"Writing results to {intermediate_table}") for i, app_version in enumerate(app_versions): print(f"Querying for app_version {app_version}") query_config = bigquery.QueryJobConfig( query_parameters=[ bigquery.ScalarQueryParameter("submission_date", "DATE", str(submission_date)), bigquery.ScalarQueryParameter("app_version", "INT64", app_version), ], clustering_fields=["metric", "channel"], destination=intermediate_table, default_dataset=f"{project}.{dataset}", write_disposition=(bigquery.WriteDisposition.WRITE_TRUNCATE if i == 0 else bigquery.WriteDisposition.WRITE_APPEND), ) query_job = bq_client.query(query_text, job_config=query_config) # Periodically print so airflow gke operator doesn't think task is dead elapsed = 0 while not query_job.done(): time.sleep(10) elapsed += 10 if elapsed % 200 == 10: print("Waiting on query...") print(f"Total elapsed: approximately {elapsed} seconds") results = query_job.result() print(f"Query job {query_job.job_id} finished") print(f"{results.total_rows} rows in {intermediate_table}") copy_config = bigquery.CopyJobConfig( write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, ) print(f"Copying {intermediate_table} to {project}.{dataset}.{dst_table}") bq_client.copy_table( intermediate_table, f"{project}.{dataset}.{dst_table}", job_config=copy_config, ).result() print(f"Deleting {intermediate_table}") bq_client.delete_table(intermediate_table)
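# Hedged sketch of the stage-then-copy pattern used in main() above: results are
# written to a scratch table first, then copied over the destination so readers never
# see a partially written table. tmp_table and final_table are assumed to be
# fully-qualified "project.dataset.table" strings supplied by the caller.
from google.cloud import bigquery


def write_via_intermediate_table(client, sql, tmp_table, final_table):
    # Materialize the query into the scratch table.
    client.query(
        sql,
        job_config=bigquery.QueryJobConfig(
            destination=tmp_table,
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        ),
    ).result()

    # Replace the real table only after the query has succeeded, then clean up.
    copy_config = bigquery.CopyJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
    )
    client.copy_table(tmp_table, final_table, job_config=copy_config).result()
    client.delete_table(tmp_table, not_found_ok=True)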
def test_ncaa_tutorial(delete_dataset): # [START bqml_ncaa_tutorial_create_dataset] dataset = bigquery.Dataset(client.dataset('bqml_tutorial')) dataset.location = 'US' client.create_dataset(dataset) # [END bqml_ncaa_tutorial_create_dataset] # Create the tables used by the tutorial # Note: the queries are saved to a file. This should be updated to use the # saved queries once the library supports running saved queries. query_filepath_to_table_name = { 'feature_input_query.sql': 'cume_games', 'training_data_query.sql': 'wide_games' } resources_directory = os.path.join(os.path.dirname(__file__), 'resources') for query_filepath, table_name in query_filepath_to_table_name.items(): table_ref = dataset.table(table_name) job_config = bigquery.QueryJobConfig() job_config.destination = table_ref query_filepath = os.path.join( resources_directory, query_filepath) sql = io.open(query_filepath, 'r', encoding='utf-8').read() client.query(sql, job_config=job_config).result() # [START bqml_ncaa_tutorial_create_model] sql = """ CREATE OR REPLACE MODEL `bqml_tutorial.ncaa_model` OPTIONS ( model_type='linear_reg', data_split_eval_fraction=0.1, max_iteration=50 ) AS SELECT * EXCEPT ( game_id, season, scheduled_date, total_three_points_made, total_three_points_att), total_three_points_att as label FROM `bqml_tutorial.wide_games` WHERE # remove the game to predict game_id != 'f1063e80-23c7-486b-9a5e-faa52beb2d83' """ df = client.query(sql).to_dataframe() print(df) # [END bqml_ncaa_tutorial_create_model] # [START bqml_ncaa_tutorial_get_training_statistics] sql = """ SELECT * FROM ML.TRAINING_INFO(MODEL `bqml_tutorial.ncaa_model`) """ df = client.query(sql).to_dataframe() print(df) # [END bqml_ncaa_tutorial_get_training_statistics] # [START bqml_ncaa_tutorial_evaluate_model] sql = """ WITH eval_table AS ( SELECT *, total_three_points_att AS label FROM `bqml_tutorial.wide_games` ) SELECT * FROM ML.EVALUATE(MODEL `bqml_tutorial.ncaa_model`, TABLE eval_table) """ df = client.query(sql).to_dataframe() print(df) # [END bqml_ncaa_tutorial_evaluate_model] # [START bqml_ncaa_tutorial_predict_outcomes] sql = """ WITH game_to_predict AS ( SELECT * FROM `bqml_tutorial.wide_games` WHERE game_id='f1063e80-23c7-486b-9a5e-faa52beb2d83' ) SELECT truth.game_id AS game_id, total_three_points_att, predicted_total_three_points_att FROM ( SELECT game_id, predicted_label AS predicted_total_three_points_att FROM ML.PREDICT(MODEL `bqml_tutorial.ncaa_model`, table game_to_predict) ) AS predict JOIN ( SELECT game_id, total_three_points_att AS total_three_points_att FROM game_to_predict) AS truth ON predict.game_id = truth.game_id """ df = client.query(sql).to_dataframe() print(df)
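# Hedged sketch of the table-setup loop in the tutorial above, with the query files
# read through a context manager so the handles are closed. client, dataset and the
# filename-to-table mapping are assumed to be the same objects built in the test.
import io
import os

from google.cloud import bigquery


def create_tables_from_sql_files(client, dataset, query_filepath_to_table_name, resources_directory):
    for query_filename, table_name in query_filepath_to_table_name.items():
        job_config = bigquery.QueryJobConfig(destination=dataset.table(table_name))
        with io.open(os.path.join(resources_directory, query_filename), "r", encoding="utf-8") as fh:
            sql = fh.read()
        client.query(sql, job_config=job_config).result()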
def update_with_bq_access(request): # Instantiates a client bigquery_client = bigquery.Client() datasets = list(bigquery_client.list_datasets()) project = bigquery_client.project # list all datasets if datasets: logger.debug('Datasets in project {}:'.format(project)) for dataset in datasets: # API request(s) logger.debug('\t{}'.format(dataset.dataset_id)) # choose the right dataset if ("learning_datasets" == dataset.dataset_id): # list all tables dataset_ref = bigquery_client.dataset(dataset.dataset_id) tables = list( bigquery_client.list_tables(dataset_ref)) # API request(s) for table in tables: if ("enriched_events" == table.table_id): logger.debug('\t{}'.format("found table")) # query to retrieve all file access events for one course query = 'select CAST(SUBSTR(JSON_EXTRACT_SCALAR(event, "$.object.id"), 35) AS STRING) AS FILE_ID, ' \ 'SUBSTR(JSON_EXTRACT_SCALAR(event, "$.membership.member.id"), 29) AS USER_ID, ' \ 'datetime(EVENT_TIME) as ACCESS_TIME ' \ 'FROM learning_datasets.enriched_events ' \ 'where JSON_EXTRACT_SCALAR(event, "$.edApp.id") = \'http://umich.instructure.com/\' ' \ 'and event_type = \'NavigationEvent\' ' \ 'and JSON_EXTRACT_SCALAR(event, "$.object.name") = \'attachment\' ' \ 'and JSON_EXTRACT_SCALAR(event, "$.action") = \'NavigatedTo\' ' \ 'and JSON_EXTRACT_SCALAR(event, "$.membership.member.id") is not null ' \ 'and SUBSTR(JSON_EXTRACT_SCALAR(event, "$.group.id"),31) = @course_id ' logger.debug(query) query_params = [ bigquery.ScalarQueryParameter( 'course_id', 'STRING', UDW_COURSE_ID), ] job_config = bigquery.QueryJobConfig() job_config.query_parameters = query_params # Location must match that of the dataset(s) referenced in the query. df = bigquery_client.query( query, location='US', job_config=job_config).to_dataframe( ) # API request - starts the query logger.debug("df row number=" + str(df.shape[0])) # drop duplicates df.drop_duplicates( ["FILE_ID", "USER_ID", "ACCESS_TIME"], keep='first', inplace=True) logger.debug("after drop duplicates, df row number=" + str(df.shape[0])) # write to MySQL df.to_sql(con=engine, name='FILE_ACCESS', if_exists='append', index=False) else: logger.debug( '{} project does not contain any datasets.'.format(project)) return HttpResponse("loaded file access info: inserted " + str(df.shape[0]) + " rows.")
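# Hedged sketch of the parameterized-query step in update_with_bq_access(): the course
# id is bound as @course_id rather than interpolated into the SQL string. The query
# text and the MySQL write are assumed to be the same as above and are omitted here.
from google.cloud import bigquery


def file_access_events_df(bigquery_client, query, course_id):
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("course_id", "STRING", course_id)
        ]
    )
    # Location must match that of the dataset(s) referenced in the query.
    df = bigquery_client.query(query, location="US", job_config=job_config).to_dataframe()
    return df.drop_duplicates(["FILE_ID", "USER_ID", "ACCESS_TIME"], keep="first")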
Creates analytics.movie Bigquery table
"""
from google.cloud import bigquery
from google.oauth2 import service_account

key_path = "../../credentials/edgart-experiments-67ca4ddbda73.json"
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
client = bigquery.Client(
    credentials=credentials,
    project=credentials.project_id,
)

table_id = "{}.analytics.rating".format(credentials.project_id)
job_config = bigquery.QueryJobConfig(
    destination=table_id,
    write_disposition='WRITE_TRUNCATE',
)

sql = """
WITH ml_rating AS (
    SELECT m.tconst, TRUNC(AVG(rating), 1) as rating, COUNT(1) as num_votes
    FROM `edgart-experiments.ml.ratings` r
    JOIN `edgart-experiments.ml.links` l ON l.movieId=r.movieId
    JOIN `edgart-experiments.analytics.movie` m
        ON CAST(REPLACE(m.tconst, 'tt', '') AS INT64)=l.imdbId
    GROUP BY m.tconst
)
SELECT
    m.tconst,
    imdb.averageRating as imdb_rating,
def get_averages_by_addon_from_bigquery(today, exclude=None): """This function is used to compute the 'hotness' score of each add-on (see also `update_addon_hotness()` cron task). It returns a dict with top-level keys being add-on GUIDs and values being dicts containing average values.""" client = create_client() one_week_date = today - timedelta(days=7) four_weeks_date = today - timedelta(days=28) query = f""" WITH this_week AS ( SELECT addon_id, AVG(dau) AS avg_this_week FROM `{get_amo_stats_dau_view_name()}` WHERE submission_date >= @one_week_date GROUP BY addon_id), three_weeks_before_this_week AS ( SELECT addon_id, AVG(dau) AS avg_three_weeks_before FROM `{get_amo_stats_dau_view_name()}` WHERE submission_date BETWEEN @four_weeks_date AND @one_week_date GROUP BY addon_id) SELECT * FROM this_week JOIN three_weeks_before_this_week USING (addon_id) """ query_parameters = [ bigquery.ScalarQueryParameter('one_week_date', 'DATE', one_week_date), bigquery.ScalarQueryParameter('four_weeks_date', 'DATE', four_weeks_date), ] if exclude and len(exclude) > 0: query = f'{query} WHERE addon_id NOT IN UNNEST(@excluded_addon_ids)' query_parameters.append( bigquery.ArrayQueryParameter('excluded_addon_ids', 'STRING', exclude) ) rows = client.query( query, job_config=bigquery.QueryJobConfig(query_parameters=query_parameters), ).result() return { row['addon_id']: { 'avg_this_week': row['avg_this_week'], 'avg_three_weeks_before': row['avg_three_weeks_before'], } for row in rows if row['addon_id'] }
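# Hedged sketch of the optional-exclusion branch above: an ARRAY query parameter is
# passed to NOT IN UNNEST(...), so the GUID list never has to be escaped into the SQL.
# The query text is assumed to already contain the @excluded_addon_ids placeholder.
from google.cloud import bigquery


def run_query_excluding_addons(client, query, excluded_guids):
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("excluded_addon_ids", "STRING", excluded_guids)
        ]
    )
    return list(client.query(query, job_config=job_config).result())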
def submit(self, sql): """ """ table_name = self.get_tablename() client = bq.Client.from_service_account_json(self.private_key) # # Let's make sure the out dataset exists datasets = list(client.list_datasets()) found = np.sum([1 for dataset in datasets if dataset.dataset_id == self.odataset]) if not found: dataset = bq.Dataset(client.dataset(self.odataset)) client.create_dataset(dataset) # create the output table bq_utils.create_standard_table(self.tablename, self.tablename, drop_existing=True, dataset_id=self.odataset) job = bq.QueryJobConfig() job.destination = client.dataset(self.odataset).table(self.tablename) job.use_query_cache = True job.allow_large_results = True if self.partition: job._properties['timePartitioning'] = {'type': 'DAY'} job._properties['clustering'] = {'field': 'person_id'} job.priority = self.priority job.dry_run = True self.log(module='submit-job', subject=self.get_tablename(), action='dry-run', value={'priority': self.priority, 'parition': self.partition}) logpath = os.path.join(self.logpath, self.idataset) try: os.makedirs(logpath) except OSError: # log path already exists and we don't care pass r = client.query(sql, location='US', job_config=job) if r.errors is None and r.state == 'DONE': job.dry_run = False r = client.query(sql, location='US', job_config=job) self.log(module='submit', subject=self.get_tablename(), action='submit-job', table=table_name, status='pending', value=r.job_id, object='bigquery') self.wait(client, r.job_id) # self.finalize(client) # # At this point we must try to partition the table else: self.log(module='submit', subject=self.get_tablename(), action='submit-job', table=table_name, status='error', value=r.errors) print (r.errors)
def promotion_prediction_res(project_id, dataset_id): # Load client client = bigquery.Client() job_config = bigquery.QueryJobConfig() promo_update = """ WITH prelim AS (SELECT CAST(pred.p_cal_inc_sale_qty AS NUMERIC) AS p_cal_inc_sale_qty, CAST(pred.prediction_interval AS NUMERIC) AS prediction_interval, CAST(pred.prediction_error_perc AS NUMERIC) AS prediction_error_perc, pred.sku_root_id,pred.description, pred.area, pred.section, pred.category, pred.subcategory, pred.segment, pred.brand_name, pred.brand_price_label, pred.flag_healthy, pred.innovation_flag, pred.tourism_flag, pred.local_flag, pred.regional_flag, CAST(pred.no_hipermercados_stores AS INT64) AS no_hipermercados_stores, CAST(pred.no_supermercados_stores AS INT64) AS no_supermercados_stores, CAST(pred.no_gasolineras_stores AS INT64) AS no_gasolineras_stores, CAST(pred.no_comercio_electronico_stores AS INT64) AS no_comercio_electronico_stores, CAST(pred.no_otros_negocio_stores AS INT64) AS no_otros_negocio_stores, CAST(pred.no_plataformas_stores AS INT64) AS no_plataformas_stores, CAST(pred.no_other_stores AS INT64) AS no_other_stores, CAST(pred.no_impacted_stores AS INT64) AS no_impacted_stores, CAST(pred.no_impacted_regions AS INT64) AS no_impacted_regions, CAST(pred.avg_store_size AS NUMERIC) AS avg_store_size, CAST(pred.type AS STRING) AS type, pred.customer_profile_type, pred.marketing_type, CAST(pred.duration_days AS INT64) AS duration_days, pred.includes_weekend, pred.campaign_start_day, pred.campaign_start_month , CAST(pred.campaign_start_quarter AS INT64) AS campaign_start_quarter, CAST(pred.campaign_start_week AS INT64) AS campaign_start_week, CAST(pred.leaflet_cover AS INT64) AS leaflet_cover, CAST(pred.leaflet_priv_space AS INT64) AS leaflet_priv_space, CAST(pred.in_leaflet_flag AS INT64) AS in_leaflet_flag, CAST(pred.in_gondola_flag AS INT64) AS in_gondola_flag, CAST(pred.in_both_leaflet_gondola_flag AS INT64) AS in_both_leaflet_gondola_flag, CAST(pred.p_qty_bl AS NUMERIC) AS p_qty_bl, pred.promo_mechanic, pred.Promo_mechanic_en , pred.discount_depth, CAST(pred.promoted_in_past AS NUMERIC) as promoted_in_past, std_price.margin_per_unit as std_margin_per_unit, std_price.std_price_per_unit as std_price_per_unit, CAST(pred.p_qty_bl AS NUMERIC)*std_price.std_price_per_unit as p_sale_bl, CAST(pred.p_qty_bl AS NUMERIC)*std_price.margin_per_unit as p_margin_bl, std_price.cost_per_unit as cost_price, (CAST(discount_depth_rank AS NUMERIC)/100) as equivalent_discount, (1-(CAST(discount_depth_rank AS NUMERIC)/100))*std_price.std_price_per_unit as effective_discount_price_per_unit, CAST(pred.p_cal_inc_sale_qty AS NUMERIC)*(1-(CAST(discount_depth_rank AS NUMERIC)/100))*std_price.std_price_per_unit as p_cal_inc_sale_amt, (CAST(pred.p_cal_inc_sale_qty AS NUMERIC)*(1-(CAST(discount_depth_rank AS NUMERIC)/100))*std_price.std_price_per_unit) - (CAST(pred.p_cal_inc_sale_qty AS NUMERIC)*(std_price.cost_per_unit)) as p_cal_inc_margin_amt FROM `gum-eroski-dev.prediction_results.prediction_promotion_results` pred LEFT JOIN `gum-eroski-dev.ETL.aggregate_std_price_margin` std_price on std_price.sku_root_id = pred.sku_root_id ) SELECT *, SAFE_DIVIDE(p_cal_inc_sale_qty, p_qty_bl) AS perc_uplift_qty, SAFE_DIVIDE(p_cal_inc_sale_amt, p_sale_bl) AS perc_uplift_amt, SAFE_DIVIDE(p_cal_inc_margin_amt, p_margin_bl) AS perc_uplift_margin FROM prelim pred """ promotion_pred_sql = """ SELECT avg(p_cal_inc_sale_qty) as avg_p_cal_inc_sale_qty, avg(p_cal_inc_sale_amt) as avg_p_cal_inc_sale_amt, avg(p_cal_inc_margin_amt) as avg_p_cal_inc_margin_amt, 
avg(perc_uplift_qty) as avg_perc_uplift_qty, avg(perc_uplift_amt) as avg_perc_uplift_amt, avg(perc_uplift_margin) as avg_perc_uplift_margin, sum(p_cal_inc_sale_qty) as sum_p_cal_inc_sale_qty, sum(p_cal_inc_sale_amt) as sum_p_cal_inc_sale_amt, sum(p_cal_inc_margin_amt) as sum_p_cal_inc_margin_amt, avg(prediction_interval) as avg_prediction_interval, avg(prediction_error_perc) as avg_prediction_error_perc, area, section, category, subcategory, brand_name, promo_mechanic, Promo_mechanic_en, discount_depth, count(distinct sku_root_id) as no_skus_in_brand_cat, max(promoted_in_past) as promoted_in_past FROM `gum-eroski-dev.prediction_results.prediction_promotion_results` group by area, section, category, subcategory, brand_name, promo_mechanic, Promo_mechanic_en, discount_depth """ # Create a disctionary to loop over all destination tables and scripts tables = { 'prediction_promotion_results': promo_update, 'prediction_promotion_results_cat_brand': promotion_pred_sql } job_config.write_disposition = "WRITE_TRUNCATE" for key in tables: # Set the destination table table_ref = client.dataset(dataset_id).table(key) job_config.destination = table_ref # Start the query, passing in the extra configuration. query_job = client.query( tables[key], # Location must match that of the dataset(s) referenced in the query # and of the destination table. location='europe-west3', job_config=job_config) # API request - starts the query query_job.result() # Waits for the query to finish logger.info("Completed writing {a} table...".format(a=key))
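# Hedged sketch of the destination-table loop above. The original mutates one shared
# QueryJobConfig inside the loop; building a fresh config per table keeps each job's
# settings independent. The dataset id and the {table_name: sql} dict are assumed to
# be the same values used in promotion_prediction_res().
from google.cloud import bigquery


def materialize_tables(client, dataset_id, tables, location="europe-west3"):
    for table_name, sql in tables.items():
        job_config = bigquery.QueryJobConfig(
            destination=bigquery.DatasetReference(client.project, dataset_id).table(table_name),
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        )
        client.query(sql, location=location, job_config=job_config).result()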
def run( self, query: str = None, query_params: List[tuple] = None, project: str = None, location: str = "US", dry_run_max_bytes: int = None, credentials: dict = None, dataset_dest: str = None, table_dest: str = None, to_dataframe: bool = False, job_config: dict = None, ): """ Run method for this Task. Invoked by _calling_ this Task within a Flow context, after initialization. Args: - query (str, optional): a string of the query to execute - query_params (list[tuple], optional): a list of 3-tuples specifying BigQuery query parameters; currently only scalar query parameters are supported. See [the Google documentation](https://cloud.google.com/bigquery/docs/parameterized-queries#bigquery-query-params-python) for more details on how both the query and the query parameters should be formatted - project (str, optional): the project to initialize the BigQuery Client with; if not provided, will default to the one inferred from your credentials - location (str, optional): location of the dataset that will be queried; defaults to "US" - dry_run_max_bytes (int, optional): if provided, the maximum number of bytes the query is allowed to process; this will be determined by executing a dry run and raising a `ValueError` if the maximum is exceeded - credentials (dict, optional): a JSON document containing Google Cloud credentials. You should provide these at runtime with an upstream Secret task. If not provided, Prefect will first check `context` for `GCP_CREDENTIALS` and lastly will use default Google client logic. - dataset_dest (str, optional): the optional name of a destination dataset to write the query results to, if you don't want them returned; if provided, `table_dest` must also be provided - table_dest (str, optional): the optional name of a destination table to write the query results to, if you don't want them returned; if provided, `dataset_dest` must also be provided - to_dataframe (bool, optional): if provided, returns the results of the query as a pandas dataframe instead of a list of `bigquery.table.Row` objects. 
Defaults to False - job_config (dict, optional): an optional dictionary of job configuration parameters; note that the parameters provided here must be pickleable (e.g., dataset references will be rejected) Raises: - ValueError: if the `query` is `None` - ValueError: if only one of `dataset_dest` / `table_dest` is provided - ValueError: if the query will execeed `dry_run_max_bytes` Returns: - list: a fully populated list of Query results, with one item per row """ # check for any argument inconsistencies if query is None: raise ValueError("No query provided.") if sum([dataset_dest is None, table_dest is None]) == 1: raise ValueError( "Both `dataset_dest` and `table_dest` must be provided if writing to a " "destination table.") # create client client = get_bigquery_client(project=project, credentials=credentials) # setup jobconfig job_config = bigquery.QueryJobConfig(**job_config) if query_params is not None: hydrated_params = [ bigquery.ScalarQueryParameter(*qp) for qp in query_params ] job_config.query_parameters = hydrated_params # perform dry_run if requested if dry_run_max_bytes is not None: old_info = dict(dry_run=job_config.dry_run, use_query_cache=job_config.use_query_cache) job_config.dry_run = True job_config.use_query_cache = False self.logger.debug("Performing a dry run...") query_job = client.query(query, location=location, job_config=job_config) if query_job.total_bytes_processed > dry_run_max_bytes: msg = ( "Query will process {0} bytes which is above the set maximum of {1} " "for this task.").format(query_job.total_bytes_processed, dry_run_max_bytes) raise ValueError(msg) job_config.dry_run = old_info["dry_run"] job_config.use_query_cache = old_info["use_query_cache"] # if writing to a destination table if dataset_dest is not None: table_ref = client.dataset(dataset_dest).table(table_dest) job_config.destination = table_ref query_job = client.query(query, location=location, job_config=job_config) # if returning the results as a dataframe if to_dataframe: return query_job.result().to_dataframe() # else if returning as a list of bigquery.table.Row objects (default) else: return list(query_job.result())
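# Hedged sketch of the dry_run_max_bytes guard described in the docstring above: a dry
# run reports total_bytes_processed without billing, and the real query only runs if
# the estimate stays under the caller's budget. Names here are illustrative.
from google.cloud import bigquery


def run_if_under_budget(client, query, max_bytes, location="US"):
    probe = client.query(
        query,
        location=location,
        job_config=bigquery.QueryJobConfig(dry_run=True, use_query_cache=False),
    )
    if probe.total_bytes_processed > max_bytes:
        raise ValueError(
            "Query will process {0} bytes which is above the set maximum of {1} "
            "for this task.".format(probe.total_bytes_processed, max_bytes)
        )
    return list(client.query(query, location=location).result())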
def load_data(spark, date_from, date_to): """Load a set of aggregated metrics for the provided timeframe. Returns Spark dataframe containing preaggregated user counts per various dimensions. Args: date_from: Start date (inclusive) date_to: End date (exclusive) """ bq = bigquery.Client() query = """ WITH rank_per_client AS ( SELECT *, ROW_NUMBER() OVER (PARTITION BY client_id ORDER BY submission_timestamp DESC) AS rn FROM `moz-fx-data-shared-prod.telemetry_stable.main_v4` WHERE DATE(submission_timestamp) >= @date_from AND DATE(submission_timestamp) < @date_to ), latest_per_client_all AS ( SELECT * FROM rank_per_client WHERE rn=1 ), latest_per_client AS ( SELECT environment.build.architecture AS browser_arch, COALESCE(environment.system.os.name, 'Other') AS os_name, COALESCE( IF (environment.system.os.name IN ('Linux', 'Darwin'), CONCAT(REGEXP_EXTRACT(environment.system.os.version, r"^[0-9]+"), '.x'), environment.system.os.version), 'Other') AS os_version, environment.system.memory_mb, coalesce(environment.system.is_wow64, FALSE) AS is_wow64, IF (ARRAY_LENGTH(environment.system.gfx.adapters)>0, environment.system.gfx.adapters[OFFSET(0)].vendor_id, NULL) AS gfx0_vendor_id, IF (ARRAY_LENGTH(environment.system.gfx.adapters)>0, environment.system.gfx.adapters[OFFSET(0)].device_id, NULL) AS gfx0_device_id, IF (ARRAY_LENGTH(environment.system.gfx.monitors)>0, environment.system.gfx.monitors[OFFSET(0)].screen_width, 0) AS screen_width, IF (ARRAY_LENGTH(environment.system.gfx.monitors)>0, environment.system.gfx.monitors[OFFSET(0)].screen_height, 0) AS screen_height, environment.system.cpu.cores AS cpu_cores, environment.system.cpu.vendor AS cpu_vendor, environment.system.cpu.speed_m_hz AS cpu_speed, 'Shockwave Flash' IN ( SELECT name FROM UNNEST(environment.addons.active_plugins) ) AS has_flash FROM latest_per_client_all ), transformed AS ( SELECT browser_arch, CONCAT(os_name, '-', os_version) AS os, COALESCE(SAFE_CAST(ROUND(memory_mb / 1024.0) AS INT64), 0) AS memory_gb, is_wow64, gfx0_vendor_id, gfx0_device_id, CONCAT(CAST(screen_width AS STRING), 'x', CAST(screen_height AS STRING)) AS resolution, cpu_cores, cpu_vendor, cpu_speed, has_flash FROM latest_per_client ), by_dimensions AS ( SELECT *, count(*) AS count FROM transformed GROUP BY browser_arch, os, memory_gb, is_wow64, gfx0_vendor_id, gfx0_device_id, resolution, cpu_cores, cpu_vendor, cpu_speed, has_flash ) select * from by_dimensions """ job_config = bigquery.QueryJobConfig(query_parameters=[ bigquery.ScalarQueryParameter("date_from", "DATE", date_from), bigquery.ScalarQueryParameter("date_to", "DATE", date_to), ]) hardware_by_dimensions_query_job = bq.query(query, job_config=job_config) hardware_by_dimensions_query_job.result() hardware_by_dimensions_df = (spark.read.format("bigquery").option( "project", hardware_by_dimensions_query_job.destination.project).option( "dataset", hardware_by_dimensions_query_job.destination.dataset_id).option( "table", hardware_by_dimensions_query_job.destination.table_id).load()) return hardware_by_dimensions_df
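# load_data() above relies on the fact that a query submitted without an explicit
# destination still materializes its results, and the job exposes that table through
# query_job.destination; the Spark BigQuery connector then reads it by name. A small,
# hedged helper illustrating just that hand-off (the function name is illustrative).
from google.cloud import bigquery


def results_table_of(query_job):
    query_job.result()  # wait for the query so the destination table exists
    dest = query_job.destination
    return dest.project, dest.dataset_id, dest.table_id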
### -------------------------------------------------------------------------- ### RIDES PER YEAR ### -------------------------------------------------------------------------- # Query to select the number of rides per year, sorted by year rides_per_year_query = """ SELECT EXTRACT(YEAR FROM trip_start_timestamp) AS year, COUNT(1) AS num_trips FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` GROUP BY year ORDER BY year """ # Set up the query (cancel the query if it would use too much of # your quota, with the limit set to 10 GB) safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10) rides_per_year_query_job = client.query(rides_per_year_query, job_config=safe_config) # API request - run the query, and convert the results to a pandas DataFrame rides_per_year_result = rides_per_year_query_job.to_dataframe() ### -------------------------------------------------------------------------- ### RIDES PER MONTH ### -------------------------------------------------------------------------- # Query to select the number of rides per month in 2017 rides_per_month_query = """ SELECT EXTRACT(MONTH FROM trip_start_timestamp) AS month, COUNT(1) AS num_trips FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2017
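# Hedged sketch of the byte cap used above: maximum_bytes_billed makes BigQuery refuse
# (and not bill) a job that would scan more than the limit. The exception type shown
# is how such failures typically surface in the Python client; treat it as an
# assumption rather than a guarantee.
from google.api_core.exceptions import BadRequest
from google.cloud import bigquery


def run_with_byte_cap(client, query, max_bytes_billed=10**10):
    safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=max_bytes_billed)
    try:
        return client.query(query, job_config=safe_config).to_dataframe()
    except BadRequest as exc:
        print("Query rejected by the byte cap: {}".format(exc))
        return None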
def runtest(self): """Run.""" query_name = self.fspath.dirpath().basename query = read( f"{self.fspath.dirname.replace('tests', 'sql')}/query.sql") expect = load(self.fspath.strpath, "expect") tables: Dict[str, Table] = {} views: Dict[str, str] = {} # generate tables for files with a supported table extension for resource in next(os.walk(self.fspath))[2]: if "." not in resource: continue # tables require an extension table_name, extension = resource.rsplit(".", 1) if table_name.endswith(".schema") or table_name in ( "expect", "query_params", ): continue # not a table if extension in TABLE_EXTENSIONS or extension in ("yaml", "json"): if extension in TABLE_EXTENSIONS: source_format = TABLE_EXTENSIONS[extension] source_path = os.path.join(self.fspath.strpath, resource) else: source_format = TABLE_EXTENSIONS["ndjson"] source_path = (self.fspath.strpath, table_name) if "." in table_name: # combine project and dataset name with table name original, table_name = ( table_name, table_name.replace(".", "_").replace("-", "_"), ) query = query.replace(original, table_name) tables[table_name] = Table(table_name, source_format, source_path) elif extension == "sql": if "." in table_name: # combine project and dataset name with table name original, table_name = ( table_name, table_name.replace(".", "_").replace("-", "_"), ) query = query.replace(original, table_name) views[table_name] = read(self.fspath.strpath, resource) # rewrite all udfs as temporary query = parse_udf.persistent_udf_as_temp(query) dataset_id = "_".join(self.fspath.strpath.split(os.path.sep)[-3:]) if "CIRCLE_BUILD_NUM" in os.environ: dataset_id += f"_{os.environ['CIRCLE_BUILD_NUM']}" bq = bigquery.Client() with dataset(bq, dataset_id) as default_dataset: load_tables(bq, default_dataset, tables.values()) load_views(bq, default_dataset, views) # configure job res_table = bigquery.TableReference(default_dataset, query_name) job_config = bigquery.QueryJobConfig( default_dataset=default_dataset, destination=res_table, query_parameters=get_query_params(self.fspath.strpath), use_legacy_sql=False, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, ) # run query job = bq.query(query, job_config=job_config) result = list(coerce_result(*job.result())) result.sort(key=lambda row: json.dumps(row, sort_keys=True)) expect.sort(key=lambda row: json.dumps(row, sort_keys=True)) print_and_test(expect, result)
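# Hedged sketch of the job configuration built in runtest(): default_dataset lets the
# generated test SQL refer to bare table names loaded into the scratch dataset, and
# each run overwrites its own results table. Argument names are illustrative; params
# would come from get_query_params() as above.
from google.cloud import bigquery


def run_generated_test_query(bq, query, default_dataset, result_table_name, params):
    job_config = bigquery.QueryJobConfig(
        default_dataset=default_dataset,
        destination=bigquery.TableReference(default_dataset, result_table_name),
        query_parameters=params,
        use_legacy_sql=False,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    return list(bq.query(query, job_config=job_config).result())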
def update_from_cloud_storage(args): os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = args.google_key_path client = bigquery.Client() bucket_name = "igenie-tweets" blob_name = "historical/{}.json".format("tweets-raw") GS_URL = 'gs://{}/{}'.format(bucket_name, blob_name) external_config = bigquery.ExternalConfig("NEWLINE_DELIMITED_JSON") external_config.autodetect = True external_config.source_uris = [GS_URL] job_config = bigquery.QueryJobConfig() job_config.table_definitions = {"temp": external_config} file_name = "tweets-enriched.json" QUERY = ('SELECT id,' 'id_str,' 'constituent,' 'text,' 'coordinates,' 'created_at,' 'favorited,' 'place,' 'lang,' 'metadata,' 'retweeted,' 'entities.hashtags,' 'entities.symbols,' 'source,' 'user.time_zone,' 'user.location,' 'user.friends_count,' 'user.followers_count,' 'favorite_count,' 'retweet_count,' 'geo ,' 'search_term ' 'FROM `temp`') TIMEOUT = 100 # in seconds query_job = client.query( QUERY, job_config=job_config) # API request - starts the query assert query_job.state == 'RUNNING' # Waits for the query to finish iterator = query_job.result(timeout=TIMEOUT) with open(file_name, "a") as f: for row in iterator: # Included attributes result = {} result["id"] = row.id result['id_str'] = row.id_str result['text'] = row.text result['coordinates'] = row.coordinates result['favorited'] = row.favorited result['place'] = row.place result['lang'] = row.lang result['metadata'] = row.metadata result['retweeted'] = row.retweeted result['entities_hashtags'] = row["entities.hashtags"] result['entities_symbols'] = row["entities.symbols"] result['source'] = row.source result['user_time_zone'] = row["user.time_zone"] result['user_location'] = row["user.location"] result['user_friends_count'] = row["user.friends_count"] result['user_followers_count'] = row["user.followers_count"] result['favorite_count'] = row.favorite_count result['retweet_count'] = row.retweet_count result['geo'] = row.geo result['search_term'] = row.search_term # Extra attributes # constituent_id, constituent_name constituent_id, constituent_name = get_constituent_id_name( row.constituent) result['constituent_id'] = constituent_id result['constituent_name'] = constituent_name # created at - date result['date'] = datetime.strptime(row.created_at, '%a %b %d %H:%M:%S %z %Y') if not row.relevance: result["relevance"] = -1 else: result["relevance"] = row.relevance # sentiment score result["sentiment_score"] = get_nltk_sentiment(row.text) update_tags(result) f.write(json.dumps(result, cls=MongoEncoder) + '\n')
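# Hedged sketch of the external-table setup at the top of update_from_cloud_storage():
# newline-delimited JSON in GCS is queried in place with schema autodetection, under
# the temporary name `temp` that the SELECT above refers to. The GCS URI and SQL are
# assumed to be passed in by the caller.
from google.cloud import bigquery


def query_ndjson_in_gcs(client, gcs_uri, sql, timeout=100):
    external_config = bigquery.ExternalConfig("NEWLINE_DELIMITED_JSON")
    external_config.autodetect = True
    external_config.source_uris = [gcs_uri]

    job_config = bigquery.QueryJobConfig(table_definitions={"temp": external_config})
    query_job = client.query(sql, job_config=job_config)
    return query_job.result(timeout=timeout)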