Example #1
def get_query_job_config():
    return bigquery.QueryJobConfig()
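
A minimal usage sketch for a factory helper like this, assuming only that google-cloud-bigquery is installed and default credentials are available; the SELECT 1 query and the use_query_cache option are illustrative, not part of the original example:

from google.cloud import bigquery


def get_query_job_config():
    return bigquery.QueryJobConfig()


client = bigquery.Client()
job_config = get_query_job_config()
job_config.use_query_cache = False  # any job-level option can be set on the returned config

query_job = client.query("SELECT 1 AS x", job_config=job_config)
for row in query_job.result():
    print(row.x)
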
Example #2
def promotion_prediction_(project_id, dataset_id, area, mechanic):

    # Load client
    client = bigquery.Client()

    job_config = bigquery.QueryJobConfig()

    logger.info("Filtering on promotion mechanic {a}...".format(a=mechanic))

    promotion_pred_sql = """
         WITH
        temp_aggr_promo AS (
        SELECT
          sku_root_id,
          description,
          area,
          section,
          category,
          subcategory,
          segment,
          brand_name,
          eroskibrand_flag,
          eroskibrand_label,
          wealthy_range_flag,
          flag_healthy,
          innovation_flag,
          tourism_flag,
          local_flag,
          regional_flag,
          no_hipermercados_stores,
          no_supermercados_stores,
          no_gasolineras_stores,
          no_comercio_electronico_stores,
          no_otros_negocio_stores,
          no_plataformas_stores,
          no_other_stores,
          no_impacted_stores,
          no_impacted_regions,
          AVG(avg_store_size) AS avg_store_size,
          promo_id,
          promo_year,
          promo_mechanic,
          promo_mechanic_description as Promo_mechanic_en,
          name,
          type,
          start_date,
          end_date,
          customer_profile_type,
          marketing_type,
          duration AS duration_days,
          MAX(includes_weekend) AS includes_weekend,
          campaign_start_day,
          campaign_start_month,
          campaign_start_quarter,
          campaign_start_week,
          leaflet_cover,
          leaflet_priv_space,
          in_leaflet_flag,
          in_gondola_flag,
          in_both_leaflet_gondola_flag,
          discount_depth,
          discount_depth_rank,
          CASE
            WHEN change_flag in (1,2) THEN 'promotion'
          ELSE
          'post_promotion'
        END
          AS period,
          SUM(tt_discount) AS p_discount,
          SUM(sale_amt_bl) AS p_sale_bl,
          SUM(sale_qty_bl) AS p_qty_bl,
          SUM(margin_amt_bl) AS p_margin_bl,
          SUM(tt_sale_amt) AS p_sale_amt,
          SUM(tt_sale_qty) AS p_sale_qty,
          SUM(tt_margin_amt) AS p_margin_amt,
          SUM(inc_sale_amt) AS p_cal_inc_sale_amt,
          SUM(inc_sale_qty) AS p_cal_inc_sale_qty,
          SUM(inc_margin_amt) AS p_cal_inc_margin_amt,
          SAFE_DIVIDE(SUM(inc_sale_amt),no_impacted_stores) AS p_cal_inc_sale_amt_per_store,
          SAFE_DIVIDE(SUM(inc_sale_qty),no_impacted_stores) AS p_cal_inc_sale_qty_per_store,
          SAFE_DIVIDE(SUM(inc_margin_amt),no_impacted_stores) AS p_cal_inc_margin_amt_per_store,
          SAFE_DIVIDE(SUM(inc_sale_amt),SUM(sale_amt_bl)) AS p_cal_perc_inc_sale_amt,
          SAFE_DIVIDE(SUM(inc_sale_qty),SUM(sale_qty_bl)) AS p_cal_perc_inc_sale_qty,
          SAFE_DIVIDE(SUM(inc_margin_amt),SUM(margin_amt_bl)) AS p_cal_perc_inc_margin,
          SUM(avg_bline_sale) AS p_avg_sale_bl,
          SUM(avg_bline_qty) AS p_avg_qty_bl,
          SUM(avg_bline_margin) AS p_avg_margin_bl,
          SUM(avg_bl_inc_sale) AS p_cal_inc_avg_sale,
          SUM(avg_bl_inc_qty) AS p_cal_inc_avg_qty,
          SUM(avg_bl_inc_margin) AS p_cal_avg_margin,
          SAFE_DIVIDE(SUM(avg_bl_inc_sale),no_impacted_stores) AS p_cal_inc_avg_sale_per_store,
          SAFE_DIVIDE(SUM(avg_bl_inc_qty),no_impacted_stores) AS p_cal_inc_avg_qty_per_store,
          SAFE_DIVIDE(SUM(avg_bl_inc_margin),no_impacted_stores) AS p_cal_avg_margin_per_store,
          SAFE_DIVIDE(SUM(avg_bl_inc_sale),SUM(avg_bline_sale)) AS p_cal_perc_inc_avg_sale_amt,
          SAFE_DIVIDE(SUM(avg_bl_inc_qty),SUM(avg_bline_qty)) AS p_cal_perc_inc_avg_sale_qty,
          SAFE_DIVIDE(SUM(avg_bl_inc_margin),SUM(avg_bline_margin)) AS p_cal_perc_inc_avg_margin
        FROM
          `gum-eroski-dev.baseline.baseline_promo`
        WHERE
          promo_mechanic IN {m}
          AND area = "{a}"
        GROUP BY
          sku_root_id,
          description,
          area,
          section,
          category,
          subcategory,
          segment,
          brand_name,
          eroskibrand_flag,
          eroskibrand_label,
          wealthy_range_flag,
          flag_healthy,
          innovation_flag,
          tourism_flag,
          local_flag,
          regional_flag,
          no_hipermercados_stores,
          no_supermercados_stores,
          no_gasolineras_stores,
          no_comercio_electronico_stores,
          no_otros_negocio_stores,
          no_plataformas_stores,
          no_other_stores,
          no_impacted_stores,
          no_impacted_regions,
          promo_id,
          promo_year,
          promo_mechanic,
          promo_mechanic_description,
          name,
          type,
          start_date,
          end_date,
          customer_profile_type,
          marketing_type,
          duration,
          campaign_start_day,
          campaign_start_month,
          campaign_start_quarter,
          campaign_start_week,
          leaflet_cover,
          leaflet_priv_space,
          in_leaflet_flag,
          in_gondola_flag,
          in_both_leaflet_gondola_flag,
          discount_depth,
          discount_depth_rank,
          period ),
        temp_aggr_promo_f AS (
        SELECT
          * EXCEPT (eroskibrand_flag,
            eroskibrand_label,
            wealthy_range_flag),
          CASE
            WHEN eroskibrand_label IS NOT NULL THEN eroskibrand_label
            WHEN wealthy_range_flag = 'N' THEN 'Normal'
            WHEN wealthy_range_flag = 'S' THEN 'Premium'
          ELSE
          NULL
        END
          AS brand_price_label
        FROM
          temp_aggr_promo )
      SELECT
        *
      FROM
        temp_aggr_promo_f
      WHERE
        discount_depth IS NOT NULL
        AND promo_mechanic IS NOT NULL
        AND sku_root_id IS NOT NULL
        AND segment IS NOT NULL
        AND period IN ('promotion')
      ORDER BY
        sku_root_id,
        promo_id,
        promo_year,
        period
        """.format(m="(\'" + "\',\'".join(str(x) for x in mechanic) + "\')",
                   a=area)

    # Create a dictionary to loop over all destination tables and scripts
    tables = {'prediction_train_input': promotion_pred_sql}

    job_config.write_disposition = "WRITE_TRUNCATE"
    for key in tables:

        # Set the destination table
        table_ref = client.dataset(dataset_id).table(key)
        job_config.destination = table_ref

        # Start the query, passing in the extra configuration.
        query_job = client.query(
            tables[key],
            # Location must match that of the dataset(s) referenced in the query
            # and of the destination table.
            location='europe-west3',
            job_config=job_config)  # API request - starts the query

        query_job.result()  # Waits for the query to finish
        logger.info("Completed writing {a} table...".format(a=key))
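
The mechanic and area filters above are interpolated into the SQL text with str.format. As a hedged alternative, the same filter could be expressed with query parameters, which sidesteps the quoting logic; the table and column names below come from the example, while the parameter values are placeholders:

from google.cloud import bigquery

client = bigquery.Client()

sql = """
    SELECT sku_root_id, promo_id, promo_mechanic
    FROM `gum-eroski-dev.baseline.baseline_promo`
    WHERE promo_mechanic IN UNNEST(@mechanics)
      AND area = @area
"""
job_config = bigquery.QueryJobConfig()
job_config.query_parameters = [
    bigquery.ArrayQueryParameter("mechanics", "STRING", ["10", "20"]),  # placeholder values
    bigquery.ScalarQueryParameter("area", "STRING", "ALIMENTACION"),    # placeholder value
]

rows = client.query(sql, job_config=job_config).result()
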
Example #3
sql += " FROM cte"
sql += " GROUP BY "
sql += " pfafid_30spfaf06"

# In[7]:

#sql +=  " stddev_riverdischarge_m_30spfaf06 / nullif(stddev_riverdischarge_m_30spfaf06,0) AS cv_riverdischarge_m_30spfaf06"

# In[8]:

sql

# In[9]:

job_config = bigquery.QueryJobConfig()
table_ref = client.dataset(BQ_OUTPUT_DATASET_NAME).table(BQ_OUTPUT_TABLE_NAME)
job_config.destination = table_ref

if TESTING:
    job_config.dry_run = True
    job_config.use_query_cache = False

# In[10]:

query_job = client.query(query=sql, location="US", job_config=job_config)

# In[11]:

query_job.result(timeout=120)
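
With TESTING enabled, dry_run means the job never executes and only returns a scan estimate. A small sketch of how that estimate might be inspected before a real run; the SELECT 1 query stands in for the sql string assembled in the cells above:

from google.cloud import bigquery

client = bigquery.Client()

job_config = bigquery.QueryJobConfig()
job_config.dry_run = True
job_config.use_query_cache = False

sql = "SELECT 1 AS x"  # stand-in for the assembled query

query_job = client.query(sql, location="US", job_config=job_config)

# A dry run only estimates how many bytes the query would scan.
print("This query would process {} bytes.".format(query_job.total_bytes_processed))
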
Example #4
def fetch_results(
    spark,
    start_date,
    end_date,
    channel=None,
    min_firefox_version="53",
    project_id="moz-fx-data-shared-prod",
    dataset_id="analysis",
    table_id="graphics_telemetry_dashboard_tmp",
):
    channel_filter = ""
    if channel is not None:
        channel_filter = "normalized_channel = '{}' AND".format(channel)

    query = """
    -- This function uses mozfun.hist.extract to tolerate compact string encodings
    -- and then turns the parsed struct back into a JSON string to maintain compatibility
    -- with the existing logic that expects JSON blobs.
    -- See https://bugzilla.mozilla.org/show_bug.cgi?id=1657724
    CREATE TEMP FUNCTION hist_to_json(h STRING) AS (
    IF(h IS NULL, NULL,
       FORMAT('{"bucket_count":%d,"histogram_type":%d,"sum":%d,"range":[%d,%d],"values":{%s}}',
         mozfun.hist.extract(h).bucket_count,
         mozfun.hist.extract(h).histogram_type,
         mozfun.hist.extract(h).sum,
         mozfun.hist.extract(h).range[SAFE_OFFSET(0)],
         mozfun.hist.extract(h).range[SAFE_OFFSET(1)],
         ARRAY_TO_STRING(ARRAY(
           SELECT
             FORMAT('"%d":%d', key, value)
           FROM
             UNNEST(mozfun.hist.extract(h).`values`)), ',')))
    );

    -- Extra wrapper function for dealing with keyed histograms.
    CREATE TEMP FUNCTION keyed_hist_to_json(h ANY TYPE) AS (
      ARRAY(SELECT AS STRUCT key, hist_to_json(value) AS value FROM UNNEST(h))
    );

    WITH sample AS
    (select client_id,
        creation_date,
        additional_properties,
        environment.build.version                                                 as environment__build__version,
        environment.build.build_id                                                as environment__build__build_id,
        environment.system.memory_mb                                              as environment__system__memory_mb,
        environment.system.is_wow64                                               as environment__system__is_wow64,
        environment.system.cpu                                                    as environment__system__cpu,
        environment.system.os.name                                                as environment__system__os__name,
        environment.system.os.version                                             as environment__system__os__version,
        environment.system.os.service_pack_major                                  as environment__system__os__service_pack_major,
        environment.system.gfx.adapters                                           as environment__system__gfx__adapters,
        payload.info.revision                                                     as payload__info__revision,
        environment.system.gfx                                                    as environment__system__gfx,
        environment.system.gfx.monitors                                           as environment__system__gfx__monitors,
        environment.build.architecture                                            as environment__build__architecture,
        environment.system.gfx.features                                           as environment__system__gfx__features,
        hist_to_json(payload.histograms.DEVICE_RESET_REASON)                                    as payload__histograms__DEVICE_RESET_REASON,
        hist_to_json(payload.histograms.GRAPHICS_SANITY_TEST)                                   as payload__histograms__GRAPHICS_SANITY_TEST,
        hist_to_json(payload.histograms.GRAPHICS_SANITY_TEST_REASON)                            as payload__histograms__GRAPHICS_SANITY_TEST_REASON,
        hist_to_json(payload.histograms.GRAPHICS_DRIVER_STARTUP_TEST)                           as payload__histograms__GRAPHICS_DRIVER_STARTUP_TEST,
        hist_to_json(payload.histograms.CANVAS_WEBGL_SUCCESS)                                   as payload__histograms__CANVAS_WEBGL_SUCCESS,
        hist_to_json(payload.histograms.CANVAS_WEBGL2_SUCCESS)                                  as payload__histograms__CANVAS_WEBGL2_SUCCESS,
        hist_to_json(payload.histograms.PLUGIN_DRAWING_MODEL)                                   as payload__histograms__PLUGIN_DRAWING_MODEL,
        hist_to_json(payload.histograms.MEDIA_DECODER_BACKEND_USED)                             as payload__histograms__MEDIA_DECODER_BACKEND_USED,
        hist_to_json(payload.processes.content.histograms.DEVICE_RESET_REASON)                  as payload__processes__content__histograms__DEVICE_RESET_REASON,
        hist_to_json(payload.processes.content.histograms.GRAPHICS_SANITY_TEST)                 as payload__processes__content__histograms__GRAPHICS_SANITY_TEST,
        hist_to_json(payload.processes.content.histograms.GRAPHICS_SANITY_TEST_REASON)          as payload__processes__content__histograms__GRAPHICS_SANITY_TEST_REASON,
        hist_to_json(payload.processes.content.histograms.GRAPHICS_DRIVER_STARTUP_TEST)         as payload__processes__content__histograms__GRAPHICS_DRIVER_STARTUP_TEST,
        hist_to_json(payload.processes.content.histograms.CANVAS_WEBGL_SUCCESS)                 as payload__processes__content__histograms__CANVAS_WEBGL_SUCCESS,
        hist_to_json(payload.processes.content.histograms.CANVAS_WEBGL2_SUCCESS)                as payload__processes__content__histograms__CANVAS_WEBGL2_SUCCESS,
        hist_to_json(payload.processes.content.histograms.PLUGIN_DRAWING_MODEL)                 as payload__processes__content__histograms__PLUGIN_DRAWING_MODEL,
        hist_to_json(payload.processes.content.histograms.MEDIA_DECODER_BACKEND_USED)           as payload__processes__content__histograms__MEDIA_DECODER_BACKEND_USED,
        keyed_hist_to_json(payload.keyed_histograms.D3D11_COMPOSITING_FAILURE_ID)                     as payload__keyed_histograms__D3D11_COMPOSITING_FAILURE_ID,
        keyed_hist_to_json(payload.keyed_histograms.OPENGL_COMPOSITING_FAILURE_ID)                    as payload__keyed_histograms__OPENGL_COMPOSITING_FAILURE_ID,
        keyed_hist_to_json(payload.keyed_histograms.CANVAS_WEBGL_ACCL_FAILURE_ID)                     as payload__keyed_histograms__CANVAS_WEBGL_ACCL_FAILURE_ID,
        keyed_hist_to_json(payload.keyed_histograms.CANVAS_WEBGL_FAILURE_ID)                          as payload__keyed_histograms__CANVAS_WEBGL_FAILURE_ID,
        keyed_hist_to_json(payload.processes.content.keyed_histograms.D3D11_COMPOSITING_FAILURE_ID)   as payload__processes__content__keyed_histograms__D3D11_COMPOSITING_FAILURE_ID,
        keyed_hist_to_json(payload.processes.content.keyed_histograms.OPENGL_COMPOSITING_FAILURE_ID)  as payload__processes__content__keyed_histograms__OPENGL_COMPOSITING_FAILURE_ID,
        keyed_hist_to_json(payload.processes.content.keyed_histograms.CANVAS_WEBGL_ACCL_FAILURE_ID)   as payload__processes__content__keyed_histograms__CANVAS_WEBGL_ACCL_FAILURE_ID,
        keyed_hist_to_json(payload.processes.content.keyed_histograms.CANVAS_WEBGL_FAILURE_ID)        as payload__processes__content__keyed_histograms__CANVAS_WEBGL_FAILURE_ID
        from `moz-fx-data-shared-prod.telemetry_stable.main_v4` where
        date(submission_timestamp) >= '{start_date}' AND date(submission_timestamp) <= '{end_date}' AND
        normalized_app_name = 'Firefox' AND
        {channel_filter}
        CAST(SPLIT(application.version, '.')[OFFSET(0)] AS INT64) > {min_firefox_version} AND
        -- NOTE: fixed fraction corresponding to 0.0003
        sample_id = 42 AND
        MOD(CAST(RAND()*10 AS INT64), 10) < 3),

    distinct_client_ids AS (SELECT distinct(client_id) FROM sample),

    -- Retain only the first seen documents for each client ID
    base AS (SELECT * FROM sample JOIN distinct_client_ids USING (client_id)),

    numbered_duplicates AS (SELECT *, ROW_NUMBER() OVER (PARTITION BY client_id) AS _n FROM base)
    --
    -- Retain only one document for each ID.
    SELECT
      * EXCEPT(_n)
    FROM
      numbered_duplicates
    WHERE
      _n = 1
    """.format(
        start_date=start_date.strftime(FORMAT_DS),
        end_date=end_date.strftime(FORMAT_DS),
        channel_filter=channel_filter,
        min_firefox_version=min_firefox_version,
    )

    bq = bigquery.Client()
    table_ref = bq.dataset(dataset_id, project=project_id).table(table_id)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = "WRITE_TRUNCATE"

    query_job = bq.query(query, job_config=job_config)

    # Wait for query execution
    result = query_job.result()

    return (spark.read.format("bigquery").option("project", project_id).option(
        "dataset", query_job.destination.dataset_id).option(
            "table", query_job.destination.table_id).load().rdd)
Example #5
def run_authorized_view_tutorial(override_values={}):
    # Note to user: This is a group email for testing purposes. Replace with
    # your own group email address when running this code.
    analyst_group_email = "*****@*****.**"

    # [START bigquery_authorized_view_tutorial]
    # Create a source dataset
    # [START bigquery_avt_create_source_dataset]
    from google.cloud import bigquery

    client = bigquery.Client()
    source_dataset_id = "github_source_data"

    # [END bigquery_authorized_view_tutorial]
    # [END bigquery_avt_create_source_dataset]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    source_dataset_id = override_values.get("source_dataset_id",
                                            source_dataset_id)
    # [START bigquery_authorized_view_tutorial]
    # [START bigquery_avt_create_source_dataset]

    source_dataset = bigquery.Dataset(client.dataset(source_dataset_id))
    # Specify the geographic location where the dataset should reside.
    source_dataset.location = "US"
    source_dataset = client.create_dataset(source_dataset)  # API request
    # [END bigquery_avt_create_source_dataset]

    # Populate a source table
    # [START bigquery_avt_create_source_table]
    source_table_id = "github_contributors"
    job_config = bigquery.QueryJobConfig()
    job_config.destination = source_dataset.table(source_table_id)
    sql = """
        SELECT commit, author, committer, repo_name
        FROM `bigquery-public-data.github_repos.commits`
        LIMIT 1000
    """
    query_job = client.query(
        sql,
        # Location must match that of the dataset(s) referenced in the query
        # and of the destination table.
        location="US",
        job_config=job_config,
    )  # API request - starts the query

    query_job.result()  # Waits for the query to finish
    # [END bigquery_avt_create_source_table]

    # Create a separate dataset to store your view
    # [START bigquery_avt_create_shared_dataset]
    shared_dataset_id = "shared_views"

    # [END bigquery_authorized_view_tutorial]
    # [END bigquery_avt_create_shared_dataset]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    shared_dataset_id = override_values.get("shared_dataset_id",
                                            shared_dataset_id)
    # [START bigquery_authorized_view_tutorial]
    # [START bigquery_avt_create_shared_dataset]

    shared_dataset = bigquery.Dataset(client.dataset(shared_dataset_id))
    shared_dataset.location = "US"
    shared_dataset = client.create_dataset(shared_dataset)  # API request
    # [END bigquery_avt_create_shared_dataset]

    # Create the view in the new dataset
    # [START bigquery_avt_create_view]
    shared_view_id = "github_analyst_view"
    view = bigquery.Table(shared_dataset.table(shared_view_id))
    sql_template = """
        SELECT
            commit, author.name as author,
            committer.name as committer, repo_name
        FROM
            `{}.{}.{}`
    """
    view.view_query = sql_template.format(client.project, source_dataset_id,
                                          source_table_id)
    view = client.create_table(view)  # API request
    # [END bigquery_avt_create_view]

    # Assign access controls to the dataset containing the view
    # [START bigquery_avt_shared_dataset_access]
    # analyst_group_email = '*****@*****.**'
    access_entries = shared_dataset.access_entries
    access_entries.append(
        bigquery.AccessEntry("READER", "groupByEmail", analyst_group_email))
    shared_dataset.access_entries = access_entries
    shared_dataset = client.update_dataset(shared_dataset,
                                           ["access_entries"])  # API request
    # [END bigquery_avt_shared_dataset_access]

    # Authorize the view to access the source dataset
    # [START bigquery_avt_source_dataset_access]
    access_entries = source_dataset.access_entries
    access_entries.append(
        bigquery.AccessEntry(None, "view", view.reference.to_api_repr()))
    source_dataset.access_entries = access_entries
    source_dataset = client.update_dataset(source_dataset,
                                           ["access_entries"])  # API request
def load_new_snippet_data(dataset_id, table_name, next_load_date,
                          end_load_date):
    '''
    Queries different snippet-related GA properties and loads results to a permanent table in BigQuery
    :param dataset_id: Name of dataset to be loaded into
    :param table_name: Name of table to be loaded into
    :param next_load_date: Earliest date to be loaded into table_name
    :param end_load_date: Latest date to be loaded into table_name
    :return:
    '''
    while next_load_date < end_load_date:
        # Set dates required for loading new data
        next_load_date = datetime.strftime(next_load_date, '%Y%m%d')
        logging.info(
            f'{job_name}: Starting load for next load date: {next_load_date}')
        client = bigquery.Client(project='ga-mozilla-org-prod-001')
        load_dataset_id = dataset_id
        load_table_name = table_name
        load_table_suffix = next_load_date
        load_table_id = f'{load_table_name.lower()}_{load_table_suffix}'

        # Set Sample Size Multiplier
        sample_rate_change_date = datetime.strptime(
            '20171031', '%Y%m%d')  # date sampling changed from 1% to 0.1%
        if datetime.strptime(next_load_date,
                             '%Y%m%d') < sample_rate_change_date:
            sample_multiplier = 100
        else:
            sample_multiplier = 1000

        # Configure load job
        dataset_ref = client.dataset(load_dataset_id)
        table_ref = dataset_ref.table(load_table_id)
        load_job_config = bigquery.QueryJobConfig()  # load job call
        load_job_config.schema = [
            bigquery.SchemaField('date', 'DATE'),
            bigquery.SchemaField('snippetID', 'STRING'),
            bigquery.SchemaField('country', 'STRING'),
            bigquery.SchemaField('site', 'STRING'),
            bigquery.SchemaField('impression', 'INTEGER'),
            bigquery.SchemaField('snippetBlocked', 'INTEGER'),
            bigquery.SchemaField('clicks', 'INTEGER'),
            bigquery.SchemaField('otherSnippetInteractions', 'INTEGER'),
            bigquery.SchemaField('sessions', 'INTEGER'),
            bigquery.SchemaField('addonInstallsTotal', 'INTEGER'),
            bigquery.SchemaField('addonInstallsGoalComp', 'INTEGER'),
            bigquery.SchemaField('themeInstallsTotal', 'INTEGER'),
            bigquery.SchemaField('themeInstallsGoalComp', 'INTEGER'),
            bigquery.SchemaField('donations', 'INTEGER'),
            bigquery.SchemaField('name', 'STRING'),
            bigquery.SchemaField('campaign', 'STRING'),
            bigquery.SchemaField('category', 'STRING'),
            bigquery.SchemaField('url', 'STRING'),
            bigquery.SchemaField('body', 'STRING')
        ]  # Define schema
        load_job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field='date',
        )
        load_job_config.write_disposition = 'WRITE_TRUNCATE'  # Options are WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY
        load_job_config.destination = table_ref
        sql = f"""
            WITH impressionData AS(
            SELECT
                visitData.date,
                visitData.snippetID,
                visitData.country,
                visitData.eventCategory,
                -- Get statistics for top 3 events. All other = other
                CASE WHEN eventCategory = 'impression' THEN COUNT(DISTINCT(fullVisitorId)) ELSE 0 END AS impression,
                CASE WHEN eventCategory = 'snippet-blocked' THEN COUNT(DISTINCT(fullVisitorId)) ELSE 0 END AS snippetBlocked,
                CASE WHEN eventCategory = 'click' OR eventCategory = 'button-click' THEN COUNT(DISTINCT(fullVisitorId)) ELSE 0 END AS clicks,
                CASE WHEN eventCategory NOT IN('impression','snippet-blocked', 'click','button-click') THEN COUNT(DISTINCT(fullVisitorId)) ELSE 0 END AS other
            FROM (
                SELECT
                date,
                geoNetwork.country,
                fullVisitorId,
                eventInfo.eventAction AS snippetID,
                eventInfo.eventCategory
                FROM
                `ga-mozilla-org-prod-001.125230768.ga_sessions_*`,
                UNNEST (hits) AS hits
                WHERE
                _TABLE_SUFFIX = '{load_table_suffix}'
                GROUP BY 1,2,3,4,5) AS visitData
            GROUP BY
                1,2,3,4
            ORDER BY 4 DESC),
    
            -- Pull data from addons.mozilla.org
    
            addonsData AS(SELECT
                date AS date,
                trafficSource.keyword AS snippetID,
                geoNetwork.country AS country,
                SUM(totals.visits) AS sessions,
                SUM((SELECT SUM(DISTINCT IF (REGEXP_CONTAINS(hits.eventInfo.eventCategory, '^AMO (?:Addon|Theme|Addon / Theme) Installs$') AND hits.eventInfo.eventAction = 'addon',1,0)) FROM UNNEST(hits) hits)) AS sessionsInstallingAddons,
                SUM((SELECT SUM(IF (REGEXP_CONTAINS(hits.eventInfo.eventCategory, '^AMO (?:Addon|Theme|Addon / Theme) Installs$') AND hits.eventInfo.eventAction = 'addon',1,0)) FROM UNNEST(hits) hits)) AS totalAddonsInstalled,
                SUM((SELECT SUM(DISTINCT IF (REGEXP_CONTAINS(hits.eventInfo.eventCategory, '^AMO (?:Addon|Theme|Addon / Theme) Installs$') AND hits.eventInfo.eventAction = 'theme',1,0)) FROM UNNEST(hits) hits)) AS sessionsInstallingThemes,
                SUM((SELECT SUM(IF (REGEXP_CONTAINS(hits.eventInfo.eventCategory, '^AMO (?:Addon|Theme|Addon / Theme) Installs$') AND hits.eventInfo.eventAction = 'theme',1,0)) FROM UNNEST(hits) hits)) AS totalThemesInstalled
            FROM `ga-mozilla-org-prod-001.67693596.ga_sessions_*`
            WHERE
            _TABLE_SUFFIX = '{load_table_suffix}'
            AND trafficSource.medium = 'snippet'
            GROUP BY 1,2,3
            ORDER BY 2 ASC, 4 DESC),
    
            -- Pull data from mozilla.org
            mozorgData AS(
            SELECT
            date as date,
            trafficSource.keyword as snippetID,
            geoNetwork.country as country,
            SUM(totals.visits) AS sessions
            FROM
            `ga-mozilla-org-prod-001.65789850.ga_sessions_*`
            WHERE
            _TABLE_SUFFIX = '{load_table_suffix}'
            AND trafficSource.medium = 'snippet'
            GROUP By 1,2,3
            ORDER BY 4 DESC
            ),
    
            -- Pull data from blog.mozilla.org
            blogData AS(
            SELECT
              date as date,
              trafficSource.keyword as snippetID,
              geoNetwork.country as country,
              SUM(totals.visits) AS sessions
            FROM
              `ga-mozilla-org-prod-001.66602784.ga_sessions_*`
            WHERE
              _TABLE_SUFFIX = '{load_table_suffix}'
              AND trafficSource.medium = 'snippet'
            GROUP By 1,2,3
            ORDER BY 4 DESC
            ),
    
            -- Pull data from testpilot.firefox.com
            testPilotData AS(
            SELECT
              date as date,
              trafficSource.keyword as snippetID,
              geoNetwork.country as country,
              SUM(totals.visits) AS sessions
            FROM
              `ga-mozilla-org-prod-001.106368739.ga_sessions_*`
            WHERE
              _TABLE_SUFFIX = '{load_table_suffix}'
              AND trafficSource.medium = 'snippet'
            GROUP By 1,2,3
            ORDER BY 4 DESC
            ),
    
            -- Pull data from developer.mozilla.org
            developerData AS(
            SELECT
              date as date,
              trafficSource.keyword as snippetID,
              geoNetwork.country as country,
              SUM(totals.visits) AS sessions
            FROM
              `ga-mozilla-org-prod-001.66726481.ga_sessions_*`
            WHERE
              _TABLE_SUFFIX = '{load_table_suffix}'
              AND trafficSource.medium = 'snippet'
            GROUP By 1,2,3
            ORDER BY 4 DESC
            ),
    
            -- Pull data from support.mozilla.org
            sumoData AS(
            SELECT
              date as date,
              trafficSource.keyword as snippetID,
              geoNetwork.country as country,
              SUM(totals.visits) AS sessions
            FROM
              `ga-mozilla-org-prod-001.65912487.ga_sessions_*`
            WHERE
              _TABLE_SUFFIX = '{load_table_suffix}'
              AND trafficSource.medium = 'snippet'
            GROUP By 1,2,3
            ORDER BY 4 DESC
            ),
    
            -- Pull data from hacks.mozilla.org
            hacksData AS(
            SELECT
              date as date,
              trafficSource.keyword as snippetID,
              geoNetwork.country as country,
              SUM(totals.visits) AS sessions
            FROM
              `ga-mozilla-org-prod-001.65887927.ga_sessions_*`
            WHERE
              _TABLE_SUFFIX = '{load_table_suffix}'
              AND trafficSource.medium = 'snippet'
            GROUP By 1,2,3
            ORDER BY 4 DESC
            ),
    
            -- Pull data from donate.mozilla.org
            donateData AS(
            SELECT
                date AS date,
                trafficSource.keyword AS snippetID,
                geoNetwork.country AS country,
                SUM(totals.visits) AS sessions,
                SUM((SELECT SUM(DISTINCT IF(REGEXP_CONTAINS(page.pagePath, '/thank-you/'),1,0)) FROM UNNEST(hits) )) AS donations
              FROM
                `ga-mozilla-org-prod-001.105783219.ga_sessions_*`
              WHERE
              _TABLE_SUFFIX = '{load_table_suffix}'
              AND trafficSource.medium = 'snippet'
              GROUP BY 1,2,3
              ORDER BY 2 ASC,4 DESC
            ),
            
            aggregates as (
    
            -- Aggregate by date, snippetID, country and site
            SELECT
              PARSE_DATE('%Y%m%d', impressions.date) as date,
              impressions.snippetID,
              impressions.country,
              'snippets tracking' as site,
              SUM(impressions.impression)*{sample_multiplier} AS impression,
              SUM(impressions.snippetBlocked)*{sample_multiplier} AS snippetBlocked,
              SUM(impressions.clicks)*{sample_multiplier} AS clicks,
              SUM(impressions.other)*{sample_multiplier} as otherSnippetInteractions,
              NULL as sessions,
              NULL as addonInstallsTotal,
              NULL as addonInstallsGoalComp,
              NULL as themeInstallsTotal,
              NULL as themeInstallsGoalComp,
              NULL as donations
            FROM impressionData as impressions
            GROUP By 1,2,3,4
    
            -- Join addons data
            UNION ALL
            SELECT
              PARSE_DATE('%Y%m%d', addonsData.date) as date,
              addonsData.snippetID,
              addonsData.country,
              'addons.mozilla.org' as site,
              NULL as impression,
              NULL as snippetBlocked,
              NULL as clicks,
              NULL as otherSnippetInteractions,
              SUM(addonsData.sessions) as sessions,
              SUM(addonsData.totalAddonsInstalled) as addonInstallsTotal,
              SUM(addonsData.sessionsInstallingAddons) as addonInstallsGoalComp,
              SUM(addonsData.totalThemesInstalled) as themeInstallsTotal,
              SUM(addonsData.sessionsInstallingThemes) as themeInstallsGoalComp,
              NULL as donations
            FROM addonsData
            GROUP BY 1,2,3,4
    
            -- Join mozilla.org data
            UNION ALL
            SELECT
              PARSE_DATE('%Y%m%d', mozorgData.date) as date,
              mozorgData.snippetID,
              mozorgData.country,
              'mozilla.org' as site,
              NULL as impression,
              NULL as snippetBlocked,
              NULL as clicks,
              NULL as otherSnippetInteractions,
              SUM(mozorgData.sessions) as sessions,
              NULL as addonInstallsTotal,
              NULL as addonInstallsGoalComp,
              NULL as themeInstallsTotal,
              NULL as themeInstallsGoalComp,
              NULL as donations
            FROM mozorgData
            GROUP BY 1,2,3,4
    
            -- Join blog.mozilla.org data
            UNION ALL
            SELECT
              PARSE_DATE('%Y%m%d', blogData.date) as date,
              blogData.snippetID,
              blogData.country,
              'blog.mozilla.org' as site,
              NULL as impression,
              NULL as snippetBlocked,
              NULL as clicks,
              NULL as otherSnippetInteractions,
              SUM(blogData.sessions) as sessions,
              NULL as addonInstallsTotal,
              NULL as addonInstallsGoalComp,
              NULL as themeInstallsTotal,
              NULL as themeInstallsGoalComp,
              NULL as donations
            FROM blogData
            GROUP BY 1,2,3,4
    
            -- Join testpilot.firefox.com data
            UNION ALL
            SELECT
              PARSE_DATE('%Y%m%d', testPilotData.date) as date,
              testPilotData.snippetID,
              testPilotData.country,
              'testpilot.firefox.com' as site,
              NULL as impression,
              NULL as snippetBlocked,
              NULL as clicks,
              NULL as otherSnippetInteractions,
              SUM(testPilotData.sessions) as sessions,
              NULL as addonInstallsTotal,
              NULL as addonInstallsGoalComp,
              NULL as themeInstallsTotal,
              NULL as themeInstallsGoalComp,
              NULL as donations
            FROM testPilotData
            GROUP BY 1,2,3,4
    
            -- Join developer.mozilla.org data
            UNION ALL
            SELECT
              PARSE_DATE('%Y%m%d', developerData.date) as date,
              developerData.snippetID,
              developerData.country,
              'developer.mozilla.org' as site,
              NULL as impression,
              NULL as snippetBlocked,
              NULL as clicks,
              NULL as otherSnippetInteractions,
              SUM(developerData.sessions) as sessions,
              NULL as addonInstallsTotal,
              NULL as addonInstallsGoalComp,
              NULL as themeInstallsTotal,
              NULL as themeInstallsGoalComp,
              NULL as donations
            FROM developerData
            GROUP BY 1,2,3,4
    
            -- Join support.mozilla.org data
            UNION ALL
            SELECT
              PARSE_DATE('%Y%m%d', sumoData.date) as date,
              sumoData.snippetID,
              sumoData.country,
              'support.mozilla.org' as site,
              NULL as impression,
              NULL as snippetBlocked,
              NULL as clicks,
              NULL as otherSnippetInteractions,
              SUM(sumoData.sessions) as sessions,
              NULL as addonInstallsTotal,
              NULL as addonInstallsGoalComp,
              NULL as themeInstallsTotal,
              NULL as themeInstallsGoalComp,
              NULL as donations
            FROM sumoData
            GROUP BY 1,2,3,4
    
            -- Join hacks.mozilla.org data
            UNION ALL
            SELECT
              PARSE_DATE('%Y%m%d', hacksData.date) as date,
              hacksData.snippetID,
              hacksData.country,
              'hacks.mozilla.org' as site,
              NULL as impression,
              NULL as snippetBlocked,
              NULL as clicks,
              NULL as otherSnippetInteractions,
              SUM(hacksData.sessions) as sessions,
              NULL as addonInstallsTotal,
              NULL as addonInstallsGoalComp,
              NULL as themeInstallsTotal,
              NULL as themeInstallsGoalComp,
              NULL as donations
            FROM hacksData
            GROUP BY 1,2,3,4
    
            -- Join donate.mozilla.org data
            UNION ALL
            SELECT
              PARSE_DATE('%Y%m%d', donateData.date) as date,
              donateData.snippetID,
              donateData.country,
              'donate.mozilla.org' as site,
              NULL as impression,
              NULL as snippetBlocked,
              NULL as clicks,
              NULL as otherSnippetInteractions,
              SUM(donateData.sessions) as sessions,
              NULL as addonInstallsTotal,
              NULL as addonInstallsGoalComp,
              NULL as themeInstallsTotal,
              NULL as themeInstallsGoalComp,
              SUM(donateData.donations) as donations
            FROM donateData
            GROUP BY 1,2,3,4
            
            -- Join telemetry tracking data
            UNION ALL
            SELECT 
              sendDate, 
              messageID,
              countryCode,
              'telemetry tracking' as site,
              SUM(impressions) as impression,
              SUM(blocks) as snippetBlocked,
              SUM(clicks) as clicks,
              NULL as other,
              NULL as sessions,
              NULL as addonInstallsTotal,
              NULL as addonInstallsGoalComp,
              NULL as themeInstallsTotal,
              NULL as themeInstallsGoalComp,
              NULL as donations
            FROM `ga-mozilla-org-prod-001.snippets.snippets_telemetry_tracking_*`
            WHERE
                _TABLE_SUFFIX = '{load_table_suffix}'
            GROUP BY 1,2,3,4),
            
            metaData as (
            SELECT 
              *
            FROM `ga-mozilla-org-prod-001.snippets.snippets_metadata`)

            SELECT
              aggregates.*,
              metaData.name,
              metaData.campaign,
              metaData.category,
              metaData.url,
              metaData.body
            FROM
              aggregates
            LEFT JOIN
              metaData
            ON
              aggregates.snippetID = metaData.ID
            """

        # Run Load Job
        query_job = client.query(
            sql,
            # Location must match that of the dataset(s) referenced in the query
            # and of the destination table.
            location='US',
            job_config=load_job_config)  # API request - starts the query

        query_job.result()  # Waits for the query to finish
        logging.info(
            f'{job_name}: Query results loaded to table {table_ref.path}')

        # Set next_load_date
        next_load_date = datetime.strptime(next_load_date,
                                           '%Y%m%d') + timedelta(1)
    return
Example #7
def main(project, source_dataset, destination_dataset, create_table, backfill, dryrun):
    """Generate queries and optionally create the tables in BigQuery."""
    client = bigquery.Client(project=project)

    exported_tables = [
        table.table_id
        for table in client.list_tables(source_dataset)
        if table.table_type == "TABLE"
    ]

    tables_by_dimension = defaultdict(list)
    opt_in_metrics = set()

    # group table names by the dimension it is grouped by
    for table_name in exported_tables:
        if table_name.endswith("_total"):
            dimension = None
        else:
            metric, dimension = table_name.split("_by_")
            if dimension.startswith("opt_in"):
                opt_in_metrics.add(metric)
                dimension = dimension.replace("opt_in_", "")

        tables_by_dimension[dimension].append(table_name)

    for dimension, table_names in tables_by_dimension.items():
        qualified_table_names = [
            f"`{project}.{source_dataset}.{table_name}`" for table_name in table_names
        ]

        if dimension is not None:
            fields = f"date, app_name, {dimension}"
            table_name = f"metrics_by_{dimension}"
            metrics = [table_name.split("_by_")[0] for table_name in table_names]
        else:
            fields = "date, app_name"
            table_name = "metrics_total"
            metrics = [table_name.split("_total")[0] for table_name in table_names]

        join_clauses = [
            JOIN_TEMPLATE.format(table=table_name, fields=fields)
            for table_name in qualified_table_names[1:]
        ]

        # add _opt_in to opt-in metrics
        fields_to_add_opt_in = [
            metric for metric in metrics if metric in opt_in_metrics
        ]
        excepted_fields = ",".join(fields_to_add_opt_in)
        additional_fields = [
            f"{name} AS {name}_opt_in"
            for name in fields_to_add_opt_in
            if name != "rate"
        ]

        # rename the rate column to opt_in_rate
        if "rate" in metrics:
            additional_fields.append("rate AS opt_in_rate")

        query_text = QUERY_TEMPLATE.format(
            excepted_fields=excepted_fields,
            additional_fields=", ".join(additional_fields),
            first_table=qualified_table_names[0],
            joined_tables="\n".join(join_clauses),
            filter="date=@date",
        )
        query_path = os.path.join(SQL_DIR, destination_dataset, table_name, "query.sql")

        if not os.path.exists(os.path.dirname(query_path)):
            os.makedirs(os.path.dirname(query_path))

        with open(query_path, "w") as f:
            print(f"Writing {query_path}")
            f.write(reformat(query_text))
            f.write("\n")

        if create_table:
            query_text = QUERY_TEMPLATE.format(
                excepted_fields=excepted_fields,
                additional_fields=", ".join(additional_fields),
                first_table=qualified_table_names[0],
                joined_tables="\n".join(join_clauses),
                filter="TRUE" if backfill else "FALSE",
            )
            schema_update_options = (
                [] if backfill else [bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION]
            )
            job_config = bigquery.QueryJobConfig(
                use_legacy_sql=False,
                dry_run=dryrun,
                destination=f"{project}.{destination_dataset}.{table_name}",
                schema_update_options=schema_update_options,
                time_partitioning=bigquery.TimePartitioning(field="date"),
                create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED,
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
                if backfill
                else bigquery.WriteDisposition.WRITE_APPEND,
            )
            print(f"Creating table {table_name}")
            query_job = client.query(query_text, job_config)
            if not dryrun:
                query_job.result()
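
Both QueryJobConfig styles that appear in these examples build the same configuration; a brief equivalence sketch with illustrative field values:

from google.cloud import bigquery

# Attribute style, as in most of the examples above.
config_a = bigquery.QueryJobConfig()
config_a.use_legacy_sql = False
config_a.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

# Constructor-keyword style, as in the example above.
config_b = bigquery.QueryJobConfig(
    use_legacy_sql=False,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)
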
Example #8
    def update_with_bq_access(self):

        # cron status
        status = ""

        # delete all records in resource and resource_access table
        status += deleteAllRecordInTable("resource")
        status += deleteAllRecordInTable("resource_access")

        # return string with concatenated SQL insert result
        return_string = ""

        # Instantiates a client
        bigquery_client = bigquery.Client()

        # BQ Total Bytes Billed to report to status
        total_bytes_billed = 0

        # loop through multiple course ids, 20 at a time
        # (This is set by the CRON_BQ_IN_LIMIT from settings)
        for data_warehouse_course_ids in split_list(
                Course.objects.get_supported_courses(),
                settings.CRON_BQ_IN_LIMIT):
            # query to retrieve all file access events for one course
            # There is no catch if this query fails; event_store.events needs to exist

            final_bq_query = []
            for k, query_obj in settings.RESOURCE_ACCESS_CONFIG.items():
                final_bq_query.append(query_obj['query'])
            final_bq_query = "  UNION ALL   ".join(final_bq_query)

            data_warehouse_course_ids_short = [
                db_util.incremented_id_to_canvas_id(id)
                for id in data_warehouse_course_ids
            ]

            logger.debug(final_bq_query)
            logger.debug(data_warehouse_course_ids)
            query_params = [
                bigquery.ArrayQueryParameter('course_ids', 'STRING',
                                             data_warehouse_course_ids),
                bigquery.ArrayQueryParameter('course_ids_short', 'STRING',
                                             data_warehouse_course_ids_short),
                bigquery.ScalarQueryParameter(
                    'canvas_data_id_increment', 'INT64',
                    settings.CANVAS_DATA_ID_INCREMENT)
            ]
            job_config = bigquery.QueryJobConfig()
            job_config.query_parameters = query_params

            # Location must match that of the dataset(s) referenced in the query.
            bq_query = bigquery_client.query(final_bq_query,
                                             location='US',
                                             job_config=job_config)
            #bq_query.result()
            resource_access_df = bq_query.to_dataframe()
            total_bytes_billed += bq_query.total_bytes_billed

            logger.debug("df row number=" + str(resource_access_df.shape[0]))
            # drop duplicates
            resource_access_df.drop_duplicates(
                ["resource_id", "user_id", "access_time"],
                keep='first',
                inplace=True)

            logger.debug("after drop duplicates, df row number=" +
                         str(resource_access_df.shape[0]))

            logger.debug(resource_access_df)

            # Because we're pulling all the data down in one query, we need to manipulate it a little bit
            # Make a copy of the access dataframe
            resource_df = resource_access_df.copy(deep=True)
            # Drop out the columns user and access time from resource data frame
            resource_df.drop(["user_id", "access_time"], axis=1, inplace=True)
            # Drop out the duplicates
            resource_df.drop_duplicates(["resource_id", "course_id"],
                                        inplace=True)
            # Rename the column resource_id to id
            resource_df.rename(columns={"resource_id": "id"}, inplace=True)

            # Drop out the columns resource_type, course_id, name from the resource_access
            resource_access_df.drop(["resource_type", "name", "course_id"],
                                    axis=1,
                                    inplace=True)

            # Drop the columns where there is a Na value
            resource_access_df_drop_na = resource_access_df.dropna()

            logger.info(
                f"{len(resource_access_df) - len(resource_access_df_drop_na)} / {len(resource_access_df)} rows were dropped because of NA"
            )

            # First update the resource table
            # write to MySQL
            try:
                resource_df.to_sql(con=engine,
                                   name='resource',
                                   if_exists='append',
                                   index=False)
            except Exception as e:
                logger.exception("Error running to_sql on table resource")
                raise

            try:
                resource_access_df_drop_na.to_sql(con=engine,
                                                  name='resource_access',
                                                  if_exists='append',
                                                  index=False)
            except Exception as e:
                logger.exception(
                    "Error running to_sql on table resource_access")
                raise
            return_string += str(resource_access_df_drop_na.shape[0]
                                 ) + " rows for courses " + ",".join(
                                     map(str,
                                         data_warehouse_course_ids)) + "\n"
            logger.info(return_string)

        total_tbytes_billed = total_bytes_billed / 1024 / 1024 / 1024 / 1024
        # $5 per TB as of Feb 2019 https://cloud.google.com/bigquery/pricing
        total_tbytes_price = round(5 * total_tbytes_billed, 2)
        status += (
            f"TBytes billed for BQ: {total_tbytes_billed} = ${total_tbytes_price}\n"
        )
        return status
Example #9
import sys

from google.cloud import bigquery

# NOTE: the original script is truncated; `dataArg` (used below) presumably
# comes from sys.argv[1] in the omitted lines.
lengthArg = sys.argv[2]


query = """
    SELECT
      *
    FROM `@dataset`
    LIMIT @length
"""

query_params = [
    bigquery.ScalarQueryParameter('dataset', 'STRING', str(dataArg)),
    bigquery.ScalarQueryParameter('length', 'INT64', int(lengthArg))
]

config_obj = bigquery.QueryJobConfig()
config_obj.query_parameters = query_params

def run_query():
    # init BQ client
    client = bigquery.Client()

    ## construct query
    query_job = client.query(
        query,
        location='US',
        job_config=config_obj
    )
    # query_job = client.query("""
    #     SELECT
    #       *
Example #10
def get_data_bigquery(spark, chunk_start, chunk_end):
    bq = bigquery.Client()

    filtered_data_sql = f"""
    WITH
    rank_per_client AS (
        SELECT
            *,
            ROW_NUMBER() OVER (PARTITION BY client_id ORDER BY submission_timestamp DESC) AS rn
        FROM
            `moz-fx-data-shared-prod.telemetry_stable.main_v4`
        WHERE
            sample_id = 42
            AND DATE(submission_timestamp)>='{chunk_start.strftime('%Y-%m-%d')}'
            AND DATE(submission_timestamp)<'{chunk_end.strftime('%Y-%m-%d')}' ),
        latest_per_client AS(
        SELECT
            *
        FROM
            rank_per_client
        WHERE
            rn=1 )
    SELECT
        environment.build.architecture AS browser_arch,
        environment.system.os.name AS os_name,
        environment.system.os.version AS os_version,
        environment.system.memory_mb,
        coalesce(environment.system.is_wow64, FALSE) AS is_wow64,
        environment.system.gfx.adapters[OFFSET(0)].vendor_id AS gfx0_vendor_id,
        environment.system.gfx.adapters[OFFSET(0)].device_id AS gfx0_device_id,
        IF(ARRAY_LENGTH(environment.system.gfx.monitors)>0,
            environment.system.gfx.monitors[OFFSET(0)].screen_width, 0) AS screen_width,
        IF(ARRAY_LENGTH(environment.system.gfx.monitors)>0,
            environment.system.gfx.monitors[OFFSET(0)].screen_height, 0) AS screen_height,
        environment.system.cpu.cores AS cpu_cores,
        environment.system.cpu.vendor AS cpu_vendor,
        environment.system.cpu.speed_m_hz AS cpu_speed,
        'Shockwave Flash' IN (SELECT name FROM UNNEST(environment.addons.active_plugins)) AS has_flash
    FROM
        latest_per_client
    WHERE
        environment.system.cpu.speed_m_hz IS NOT NULL
    """

    print("Query is: " + filtered_data_sql)

    TABLE_PROJECT = "moz-fx-data-derived-datasets"
    TABLE_DATASET = "analysis"
    TABLE_NAME = "hardware_report_filtered_data"

    table_ref = bq.dataset(TABLE_DATASET,
                           project=TABLE_PROJECT).table(TABLE_NAME)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = "WRITE_TRUNCATE"

    query_job = bq.query(filtered_data_sql, job_config=job_config)

    # Wait for query execution
    query_job.result()

    filtered_data_df = (spark.read.format("bigquery").option(
        "parallelism",
        200).option("table",
                    f"{TABLE_PROJECT}.{TABLE_DATASET}.{TABLE_NAME}").load())

    # Defined to keep compatibility with AWS implementation,
    # they're 0 here since these describe longitudinal data quality
    broken_ratio = 0
    inactive_ratio = 0
    return (filtered_data_df.rdd, broken_ratio, inactive_ratio)
Example #11
def gen_job_config():
    job_config = bigquery.QueryJobConfig()
    job_config.use_legacy_sql = False
    return job_config
Example #12
    def resize(self):
        """
        This is the execute function of this class. It copies the source table
        into the destination table and then copies the destination table into
        itself until it reaches or exceeds the target_rows.
        """
        # How many rows short of our target are we?
        gap = self.target_rows - self.source_table.num_rows

        while gap > 0:  # Copy until we've reached or exceeded target_rows

            # API requests to get the latest table info.
            source_table = self.client.get_table(self.source_table)
            try:
                dest_table = self.client.get_table(self.dest_table_ref)

            except NotFound:
                dest_table = self.client.create_table(
                    bigquery.Table(self.dest_table_ref))

            # Get the latest size of the dest_table.
            # Note that for the first call these properties are None.
            dest_rows = dest_table.num_rows
            dest_bytes = dest_table.num_bytes
            dest_gb = dest_bytes / float(1024**3)

            # Recalculate the gap.
            if dest_rows:
                gap = self.target_rows - dest_rows
            else:
                gap = self.target_rows

            print(('{} rows in table of size {} GB, with a target of {}, '
                   'leaving a gap of {}'.format(dest_rows, round(dest_gb, 2),
                                                self.target_rows, gap)))

            # Greedily copy the largest of dest_table and source_table into
            # dest_table without going over the target rows. The last query
            # will be a subset of source_table via a limit query.
            if gap < source_table.num_rows:
                # This will be the last copy operation if target_rows is
                # not a power of 2 times the number of rows originally in the
                # source table. It is not a full copy.
                job_config = bigquery.QueryJobConfig()
                # Set the destination table

                job_config.destination = self.dest_table_ref
                job_config.write_disposition = 'WRITE_APPEND'
                job_config.allow_large_results = True

                sql = """
                    SELECT *
                    FROM `{}.{}.{}`
                    LIMIT {}

                """.format(self.project, self.source_table.dataset_id,
                           self.source_table.table_id, gap)

                # API request to BigQuery with query and config defined above.
                query_job = self.client.query(
                    sql,
                    # Location must match that of the dataset(s) referenced in
                    # the query and of the destination table.
                    location=self.location,
                    job_config=job_config)
                # Wait for query_job to finish.
                query_job.result()
            else:
                if source_table.num_rows < dest_table.num_rows < gap:
                    use_as_source_table = self.dest_table_ref
                else:  # source_table.num_rows < gap < dest_table.num_rows
                    use_as_source_table = self.source_table.reference
                copy_config = bigquery.CopyJobConfig()
                copy_config.write_disposition = 'WRITE_APPEND'

                copy_job = self.client.copy_table(use_as_source_table,
                                                  self.dest_table_ref,
                                                  job_config=copy_config)
                # Wait for copy_job to finish.
                copy_job.result()
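
A stripped-down sketch of the append-copy doubling trick this method relies on, using hypothetical table IDs; each pass copies the destination onto itself with WRITE_APPEND, roughly doubling its row count:

from google.cloud import bigquery

client = bigquery.Client()

source = "my-project.my_dataset.seed_table"  # hypothetical table IDs
dest = "my-project.my_dataset.big_table"

copy_config = bigquery.CopyJobConfig()
copy_config.write_disposition = "WRITE_APPEND"

# Seed the destination, then append it onto itself a few times.
client.copy_table(source, dest, job_config=copy_config).result()
for _ in range(3):
    client.copy_table(dest, dest, job_config=copy_config).result()
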
Example #13
def main():
    """Process deletion requests."""
    args = parser.parse_args()
    if args.partition_limit is not None and not args.dry_run:
        parser.print_help()
        print("ERROR: --partition-limit specified without --dry-run")
    if args.start_date is None:
        args.start_date = args.end_date - timedelta(days=14)
    source_condition = (
        f"DATE(submission_timestamp) >= '{args.start_date}' "
        f"AND DATE(submission_timestamp) < '{args.end_date}'"
    )
    client_q = ClientQueue(args.billing_projects, args.parallelism)
    client = client_q.default_client
    states = {}
    if args.state_table:
        state_table_exists = False
        try:
            client.get_table(args.state_table)
            state_table_exists = True
        except NotFound:
            if not args.dry_run:
                client.create_table(
                    bigquery.Table(
                        args.state_table,
                        [
                            bigquery.SchemaField("task_id", "STRING"),
                            bigquery.SchemaField("job_id", "STRING"),
                            bigquery.SchemaField("job_created", "TIMESTAMP"),
                            bigquery.SchemaField("start_date", "DATE"),
                            bigquery.SchemaField("end_date", "DATE"),
                        ],
                    )
                )
                state_table_exists = True
        if state_table_exists:
            states = dict(
                client.query(
                    reformat(
                        f"""
                        SELECT
                          task_id,
                          job_id,
                        FROM
                          `{args.state_table}`
                        WHERE
                          start_date = '{args.start_date}'
                          AND end_date = '{args.end_date}'
                        ORDER BY
                          job_created
                        """
                    )
                ).result()
            )

    if args.environment == "telemetry":
        with ThreadPool(args.parallelism) as pool:
            glean_targets = find_glean_targets(pool, client)
            experiment_analysis_targets = find_experiment_analysis_targets(pool, client)
        targets_with_sources = chain(
            DELETE_TARGETS.items(),
            glean_targets.items(),
            experiment_analysis_targets.items(),
        )
    elif args.environment == "pioneer":
        with ThreadPool(args.parallelism) as pool:
            targets_with_sources = find_pioneer_targets(
                pool, client, study_projects=args.pioneer_study_projects
            ).items()

    tasks = [
        task
        for target, sources in targets_with_sources
        if args.table_filter(target.table)
        for task in delete_from_table(
            client=client,
            target=replace(target, project=args.target_project or target.project),
            sources=[
                replace(source, project=args.source_project or source.project)
                for source in (sources if isinstance(sources, tuple) else (sources,))
            ],
            source_condition=source_condition,
            dry_run=args.dry_run,
            read_only=args.read_only,
            priority=args.priority,
            start_date=args.start_date,
            end_date=args.end_date,
            max_single_dml_bytes=args.max_single_dml_bytes,
            partition_limit=args.partition_limit,
            state_table=args.state_table,
            states=states,
        )
    ]
    if not tasks:
        logging.error("No tables selected")
        parser.exit(1)
    # ORDER BY partition_sort_key DESC, sql_table_id ASC
    # https://docs.python.org/3/howto/sorting.html#sort-stability-and-complex-sorts
    tasks.sort(key=lambda task: sql_table_id(task.table))
    tasks.sort(key=attrgetter("partition_sort_key"), reverse=True)
    with ThreadPool(args.parallelism) as pool:
        if args.task_table and not args.dry_run:
            # record task information
            try:
                client.get_table(args.task_table)
            except NotFound:
                table = bigquery.Table(
                    args.task_table,
                    [
                        bigquery.SchemaField("task_id", "STRING"),
                        bigquery.SchemaField("start_date", "DATE"),
                        bigquery.SchemaField("end_date", "DATE"),
                        bigquery.SchemaField("target", "STRING"),
                        bigquery.SchemaField("target_rows", "INT64"),
                        bigquery.SchemaField("target_bytes", "INT64"),
                        bigquery.SchemaField("source_bytes", "INT64"),
                    ],
                )
                table.time_partitioning = bigquery.TimePartitioning()
                client.create_table(table)
            sources = list(set(source for task in tasks for source in task.sources))
            source_bytes = {
                source: job.total_bytes_processed
                for source, job in zip(
                    sources,
                    pool.starmap(
                        client.query,
                        [
                            (
                                reformat(
                                    f"""
                                    SELECT
                                      {source.field}
                                    FROM
                                      `{sql_table_id(source)}`
                                    WHERE
                                      {source_condition}
                                    """
                                ),
                                bigquery.QueryJobConfig(dry_run=True),
                            )
                            for source in sources
                        ],
                        chunksize=1,
                    ),
                )
            }
            step = 10000  # max 10K rows per insert
            for start in range(0, len(tasks), step):
                end = start + step
                BigQueryInsertError.raise_if_present(
                    errors=client.insert_rows_json(
                        args.task_table,
                        [
                            {
                                "task_id": get_task_id(task.table, task.partition_id),
                                "start_date": args.start_date.isoformat(),
                                "end_date": args.end_date.isoformat(),
                                "target": sql_table_id(task.table),
                                "target_rows": task.table.num_rows,
                                "target_bytes": task.table.num_bytes,
                                "source_bytes": sum(
                                    map(source_bytes.get, task.sources)
                                ),
                            }
                            for task in tasks[start:end]
                        ],
                    )
                )
        results = pool.map(
            client_q.with_client, (task.func for task in tasks), chunksize=1
        )
    jobs_by_table = defaultdict(list)
    for i, job in enumerate(results):
        jobs_by_table[tasks[i].table].append(job)
    bytes_processed = rows_deleted = 0
    for table, jobs in jobs_by_table.items():
        table_bytes_processed = sum(job.total_bytes_processed or 0 for job in jobs)
        bytes_processed += table_bytes_processed
        table_id = sql_table_id(table)
        if args.dry_run:
            logging.info(f"Would scan {table_bytes_processed} bytes from {table_id}")
        else:
            table_rows_deleted = sum(job.num_dml_affected_rows or 0 for job in jobs)
            rows_deleted += table_rows_deleted
            logging.info(
                f"Scanned {table_bytes_processed} bytes and "
                f"deleted {table_rows_deleted} rows from {table_id}"
            )
    if args.dry_run:
        logging.info(f"Would scan {bytes_processed} in total")
    else:
        logging.info(
            f"Scanned {bytes_processed} and deleted {rows_deleted} rows in total"
        )
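
The task-recording block above streams rows with `insert_rows_json` in slices of 10,000 because a single streaming-insert request only accepts a bounded number of rows. A small standalone sketch of that chunking pattern, with a made-up table ID and payload:

# Sketch only: batch streaming inserts; table ID and rows are illustrative.
from google.cloud import bigquery

def insert_in_chunks(client, table_id, rows, step=10000):
    """Stream `rows` into `table_id` at most `step` rows per request."""
    for start in range(0, len(rows), step):
        errors = client.insert_rows_json(table_id, rows[start:start + step])
        if errors:
            # insert_rows_json returns one error mapping per failed row.
            raise RuntimeError('streaming insert failed: {}'.format(errors))

# Hypothetical usage:
# client = bigquery.Client()
# insert_in_chunks(client, 'my-project.my_dataset.task_log',
#                  [{'task_id': 'a'}, {'task_id': 'b'}])
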
Example #14
0
def BQPreprocess(cpu, date_generated, client, query_fit, loadfrom="elastic"):
    bq_client = client
    job_config = bigquery.QueryJobConfig()

    datalist = []
    datalist_hist = []

    logger.info("Starting data fetch iterative...")
    for ndate in date_generated:
        tframe = getBig(ndate.strftime("%Y-%m-%d"), query_fit)
        if tframe is not None:
            if not tframe.empty:
                if loadfrom.strip().lower() == 'datastore':
                    X_split = np.array_split(tframe, 5)
                    logger.info("loading history data from datastore...")
                    logger.info("Len of X_split for batch load: %d", len(X_split))
                    logger.info("Appending history data...")
                    for ix in range(len(X_split)):
                        # ~ loading history
                        """
                            here we either merge this with tframe, or build a separate df
                        """
                        logger.info("processing batch-%d", ix)
                        # https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas'
                        logger.info("creating list history data...")
                        #lhistory = list(X_split[ix]["user_id"].head(1000).map(str) + "_" + X_split[ix]["topic_id"].head(1000).map(str))
                        lhistory = list(X_split[ix]["user_id"].map(str) + "_" + X_split[ix]["topic_id"].map(str))

                        logger.info("call history data...")
                        h_frame = mh.loadDSHistory(lhistory)

                        # me = os.getpid()
                        # kill_proc_tree(me)

                        logger.info("done collecting history data, appending now...")
                        for m in h_frame:
                            if m is not None:
                                if len(m) > 0:
                                    datalist_hist.append(pd.DataFrame(m))
                        del h_frame
                        del lhistory

                    logger.info("Appending training data...")
                    datalist.append(tframe)
                elif loadfrom.strip().lower() == 'elastic':
                    X_split = np.array_split(tframe, 50)
                    logger.info("loading history data from elastic...")
                    logger.info("Len of X_split for batch load: %d", len(X_split))
                    logger.info("Appending history data...")
                    
                    for ix in range(len(X_split)):
                        lhistory = list(X_split[ix]["user_id"].map(str) + "_" + X_split[ix]["topic_id"].map(str))
                        logger.info("call %d history data...", len(lhistory))
                        inside_data = mh.loadESHistory(lhistory, es,
                                                       esindex_name='fitted_hist_index',
                                                       estype_name='fitted_hist_type')
                        
                        if inside_data is not None:
                            # split back the user_id and topic_id
                            inside_data[['user_id','topic_id']] = inside_data.uid_topid.str.split('_', expand=True)
                            inside_data = inside_data[["user_id","topic_id", "pt_posterior_x_Nt", "smoothed_pt_posterior", "p0_cat_ci", "sigma_Nt"]]
                            logger.info("Appending %d data into datalist_hist..", len(inside_data))
                            datalist_hist.append(inside_data)
                            del inside_data
                        else:
                            logger.info("inside_data is None...")
                    
                    logger.info("Appending training data...")
                    datalist.append(tframe)
                else:
                    logger.info("Unknows source is selected !")
                    break
        else:
            logger.info("tframe for date: %s is None", ndate.strftime("%Y-%m-%d"))
    logger.info("len datalist: %d", len(datalist))
    logger.info("All data fetch iterative done!!")

    return datalist, datalist_hist
Example #15
0
    def query(self, request_body, page_size):
        query = request_body['query']
        jobConfig = request_body['jobConfig']
        dryRunOnly = request_body['dryRunOnly']

        # process flags
        processed_flags = {
            support_flag: jobConfig[support_flag]
            for support_flag in SUPPORTED_JOB_CONFIG_FLAGS
            if support_flag in jobConfig
        }

        if 'params' in processed_flags:
            processed_flags['query_parameters'] = _helpers.to_query_parameters(
                processed_flags['params'])

        if 'maximum_bytes_billed' in processed_flags and\
          processed_flags['maximum_bytes_billed'] is None:
            del processed_flags['maximum_bytes_billed']

        if 'use_legacy_sql' in processed_flags and\
          not isinstance(processed_flags['use_legacy_sql'], bool):
            raise ValueError(
                'use_legacy_sql should be a boolean, instead received {}'.format(
                    processed_flags['use_legacy_sql']))
        if 'destination_table' in processed_flags:
            processed_flags['destination'] = processed_flags[
                'destination_table']
            del processed_flags['destination_table']

        # dry run, will throw exception if fail
        dry_run_job_config = bigquery.QueryJobConfig(**processed_flags)
        dry_run_job_config.dry_run = True
        dry_run_job_config.use_query_cache = False

        try:
            with PagedQueryHandler.client_lock:
                if 'project' in jobConfig and jobConfig['project'] is not None:
                    PagedQueryHandler.client.project = jobConfig['project']
                else:
                    PagedQueryHandler.client.project = PagedQueryHandler.orig_project
                dry_run_job = PagedQueryHandler.client.query(
                    query, job_config=dry_run_job_config)
                PagedQueryHandler.client.project = PagedQueryHandler.orig_project
        except Exception as err:
            if hasattr(err, 'errors'):
                raise Exception(err.errors[0]['message'])
            else:
                raise Exception(err)
        total_bytes_processed = dry_run_job.total_bytes_processed

        if dryRunOnly:
            job_id = 'dry_run' if dry_run_job.job_id is None else dry_run_job.job_id
            yield dry_run_job, job_id
            yield {
                'content': json.dumps(None),
                'labels': json.dumps(None),
                'bytesProcessed': json.dumps(total_bytes_processed)
            }
            return

        # actual run
        job_config = bigquery.QueryJobConfig(**processed_flags)

        # need synchronization since all query handler share the same client
        with PagedQueryHandler.client_lock:
            if 'project' in jobConfig and jobConfig['project'] is not None:
                PagedQueryHandler.client.project = jobConfig['project']
            else:
                PagedQueryHandler.client.project = PagedQueryHandler.orig_project
            query_job = PagedQueryHandler.client.query(query,
                                                       job_config=job_config)
            PagedQueryHandler.client.project = PagedQueryHandler.orig_project

        if query_job.error_result is not None:
            raise Exception(query_job.error_result)

        yield query_job, query_job.job_id

        # send contents
        en = query_job.result(page_size)
        schema_fields = format_preview_fields(en.schema)
        duration = (query_job.ended - query_job.started).total_seconds()

        for page in en.pages:
            if page.num_items > USE_PARALLEL_THRESH:
                content = parallel_format_preview_rows(page,
                                                       en.schema,
                                                       pool=self.pool)
            else:
                content = format_preview_rows(page, en.schema)

            response = {
                'content': json.dumps(content),
                'labels': schema_fields,
                'bytesProcessed': total_bytes_processed,
                'project': query_job.project,
                'duration': duration,
            }
            yield response
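
The dry run above doubles as a cost estimate: with `dry_run=True` and the query cache disabled, BigQuery validates the query and reports `total_bytes_processed` without executing it. A minimal sketch of that check on its own:

# Sketch only: estimate how many bytes a query would scan.
from google.cloud import bigquery

def estimate_bytes(client, sql):
    """Return the bytes the query would process, without running it."""
    config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    job = client.query(sql, job_config=config)  # returns immediately for dry runs
    return job.total_bytes_processed

# Hypothetical usage:
# client = bigquery.Client()
# print(estimate_bytes(client, 'SELECT 1'))
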
Example #16
0
    def execute(self, data):

        #setup
        client = self.client
        bigquery = self.bigquery
        datetime = self.datetime
        pytz = self.pytz
        time = self.time
        name = data.get("titleName")
        emails = data.get("emails")
        query = data.get("query")
        table = ""
        #

        # create a dataset first if needed
        dataset_main = self.make_dataset()
        table_id = "{}.{}".format(dataset_main, name)
        #

        # create external table
        if (self.env.get("create_external_table")):
            try:
                # Configure the external data source
                dataset_id = dataset_main
                table_id = "{}.{}".format(dataset_main, query)
                schema = [
                    bigquery.SchemaField("name", "STRING"),
                    bigquery.SchemaField("post_abbr", "STRING"),
                ]
                table = bigquery.Table(table_id, schema=schema)
                external_config = bigquery.ExternalConfig("CSV")
                external_config.source_uris = [
                    "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
                ]
                external_config.options.skip_leading_rows = 1  # optionally skip header row
                table.external_data_configuration = external_config

                # Create a permanent table linked to the GCS file
                table = client.create_table(table)  # API request

                # Example query to find states starting with 'W'
                sql = 'SELECT * FROM `{}` WHERE name LIKE "W%"'.format(
                    table_id)

                query_job = client.query(sql)  # API request

                w_states = list(query_job)  # Waits for query to finish
                return "There are {} states with names starting with W. we pulled the data from us-states.csv in cloud storage".format(
                    len(w_states))
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred, check the output from the backend'
        #

        # create temp external table
        elif (self.env.get("create_temp_external_table")):
            try:
                schema = ["filename", "name"]
                # Configure the external data source and query job.
                external_config = bigquery.ExternalConfig("CSV")
                external_config.source_uris = [
                    "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
                ]
                external_config.schema = [
                    bigquery.SchemaField("name", "STRING"),
                    bigquery.SchemaField("post_abbr", "STRING"),
                ]
                external_config.options.skip_leading_rows = 1
                table_id = "usa_states"
                job_config = bigquery.QueryJobConfig(
                    table_definitions={table_id: external_config})

                # Example query to find states starting with 'W'.
                sql = """
                SELECT _FILE_NAME AS {},{} FROM `{}` WHERE name LIKE "W%"
                
                """.format(schema[0], schema[1], table_id)
                query_job = client.query(
                    sql, job_config=job_config)  # Make an API request.
                query_job.result()
                return json.dumps({
                    "schema": [{
                        "field": x
                    } for x in schema],
                    "data": [
                        # Row values can be accessed by field name or index.
                        {
                            schema[0]: row[schema[0]],
                            schema[1]: row[schema[1]]
                        } for row in query_job
                    ]
                })
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred, check the output from the backend'
        #

        # drive create external table
        elif (self.env.get("drive_create_external_table")):
            try:
                dataset_id = dataset_main

                # Configure the external data source.
                dataset = client.get_dataset(dataset_id)
                table_id = query
                schema = [
                    bigquery.SchemaField("name", "STRING"),
                    bigquery.SchemaField("post_abbr", "STRING"),
                ]
                table = bigquery.Table(dataset.table(table_id), schema=schema)
                external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
                # Use a shareable link or grant viewing access to the email address you
                # used to authenticate with BigQuery (this example Sheet is public).
                sheet_url = (
                    "https://docs.google.com/spreadsheets/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing"
                )
                external_config.source_uris = [sheet_url]
                external_config.options.skip_leading_rows = 1  # Optionally skip header row.
                external_config.options.range = (
                    "us-states!A20:B49"  # Optionally set range of the sheet to query from.
                )
                table.external_data_configuration = external_config

                # Create a permanent table linked to the Sheets file.
                table = client.create_table(table)  # Make an API request.

                # Example query to find states starting with "W".
                sql = 'SELECT * FROM `{}.{}` WHERE name LIKE "W%"'.format(
                    dataset_id, table_id)

                query_job = client.query(sql)  # Make an API request.

                # Wait for the query to complete.
                w_states = list(query_job)
                return "There are {} states with names starting with W in the selected range. this data came from google drive".format(
                    len(w_states))

            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred, check the output from the backend'
        #

        # drive create temp external table
        elif (self.env.get("drive_create_temp_external_table")):
            try:
                schema = ["name", "post_abbr"]
                # Configure the external data source and query job.
                external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
                sheet_url = (
                    "https://docs.google.com/spreadsheets"
                    "/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing"
                )
                external_config.source_uris = [sheet_url]
                external_config.schema = [
                    bigquery.SchemaField("name", "STRING"),
                    bigquery.SchemaField("post_abbr", "STRING"),
                ]
                external_config.options.skip_leading_rows = 1  # Optionally skip header row.
                external_config.options.range = (
                    "us-states!A20:B49"  # Optionally set range of the sheet to query from.
                )
                table_id = "usa_states"
                job_config = bigquery.QueryJobConfig(
                    table_definitions={table_id: external_config})

                # Example query to find states starting with 'W'.
                sql = """
                SELECT * FROM `{}` WHERE name LIKE "W%"
                """.format(table_id)
                query_job = client.query(
                    sql, job_config=job_config)  # Make an API request.
                query_job.result()
                [print(row) for row in query_job]
                return json.dumps({
                    "schema": [{
                        "field": x
                    } for x in schema],
                    "data": [
                        # Row values can be accessed by field name or index.
                        {
                            schema[0]: row[schema[0]],
                            schema[1]: row[schema[1]]
                        } for row in query_job
                    ]
                })
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred, check the output from the backend'
        #

        return "Check the backend env dictionary you did set it so the backend didnt do anything"
Example #17
0
    def submit(self, sql, create, dml=None):
        """
        Submit the sql query to create a de-identified table.

        :param sql:  The sql to send.
        :param create: a flag to identify if this query should create a new
            table or append to an existing table.
        :param dml:  boolean flag identifying if a statement is a dml statement
        """
        dml = False if dml is None else dml
        table_name = self.get_tablename()
        client = bq.Client.from_service_account_json(self.private_key)
        #
        # Let's make sure the out dataset exists
        datasets = list(client.list_datasets())
        found = np.sum(
            [1 for dataset in datasets if dataset.dataset_id == self.odataset])
        if not found:
            dataset = bq.Dataset(client.dataset(self.odataset))
            client.create_dataset(dataset)

        # create the output table
        if create:
            LOGGER.info('creating new table:\t%s', self.tablename)
            bq_utils.create_standard_table(self.tablename,
                                           self.tablename,
                                           drop_existing=True,
                                           dataset_id=self.odataset)
            write_disposition = bq_consts.WRITE_EMPTY
        else:
            write_disposition = bq_consts.WRITE_APPEND
            LOGGER.info('appending results to table:\t%s', self.tablename)

        job = bq.QueryJobConfig()
        job.priority = self.priority
        job.dry_run = True

        dml_job = None
        if not dml:
            job.destination = client.dataset(self.odataset).table(
                self.tablename)
            job.use_query_cache = True
            job.allow_large_results = True
            job.write_disposition = write_disposition
            if self.partition:
                job._properties['timePartitioning'] = {'type': 'DAY'}
                job._properties['clustering'] = {'field': 'person_id'}
        else:
            # create a copy of the job config to use if the dry-run passes
            dml_job = copy(job)

        LOGGER.info(
            'submitting a dry-run for:\t%s\t\tpriority:\t%s\t\tpartition:\t%s',
            self.get_tablename(), self.priority, self.partition)

        logpath = os.path.join(self.logpath, self.idataset)
        try:
            os.makedirs(logpath)
        except OSError:
            # log path already exists and we don't care
            pass

        try:
            response = client.query(sql, location='US', job_config=job)
        except Exception:
            LOGGER.exception(
                'dry run query failed for:\t%s\n'
                '\t\tSQL:\t%s\n'
                '\t\tjob config:\t%s', self.get_tablename(), sql, job)
        else:

            if response.state == 'DONE':
                if dml_job:
                    job = dml_job

                job.dry_run = False

                LOGGER.info('dry-run passed.  submitting query for execution.')

                response = client.query(sql, location='US', job_config=job)
                LOGGER.info(
                    'submitted a %s job for table:\t%s\t\tstatus:\t%s\t\tvalue:\t%s',
                    'bigquery', table_name, 'pending', response.job_id)
                self.wait(client, response.job_id)
Example #18
0
def test_validate_hmac_sha256(sql):
    """Validate hmac_sha256."""
    job_config = bigquery.QueryJobConfig(use_legacy_sql=False)
    job = bigquery.Client().query(sql, job_config=job_config)
    job.result()
def main(submission_date, dst_table, project, tmp_project, dataset):
    """Run query per app_version."""
    bq_client = bigquery.Client(project=project)

    app_versions = [
        row["app_version"] for row in bq_client.query(
            VERSION_QUERY_TEMPLATE.format(date=submission_date,
                                          project=project,
                                          dataset=dataset)).result()
    ]

    print(f"Found versions: {app_versions}")

    if len(app_versions) == 0:
        print("Source table empty", file=sys.stderr)
        sys.exit(1)

    sql_path = SQL_BASE_DIR / dst_table / "query.sql"

    query_text = sql_path.read_text()

    # Write to intermediate table to avoid partial writes to destination table
    if tmp_project is None:
        tmp_project = project
    intermediate_table = f"{tmp_project}.analysis.glam_temp_clustered_query_{dst_table}"
    print(f"Writing results to {intermediate_table}")

    for i, app_version in enumerate(app_versions):
        print(f"Querying for app_version {app_version}")

        query_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter("submission_date", "DATE",
                                              str(submission_date)),
                bigquery.ScalarQueryParameter("app_version", "INT64",
                                              app_version),
            ],
            clustering_fields=["metric", "channel"],
            destination=intermediate_table,
            default_dataset=f"{project}.{dataset}",
            write_disposition=(bigquery.WriteDisposition.WRITE_TRUNCATE
                               if i == 0 else
                               bigquery.WriteDisposition.WRITE_APPEND),
        )

        query_job = bq_client.query(query_text, job_config=query_config)

        # Periodically print so airflow gke operator doesn't think task is dead
        elapsed = 0
        while not query_job.done():
            time.sleep(10)
            elapsed += 10
            if elapsed % 200 == 10:
                print("Waiting on query...")

        print(f"Total elapsed: approximately {elapsed} seconds")

        results = query_job.result()
        print(f"Query job {query_job.job_id} finished")
        print(f"{results.total_rows} rows in {intermediate_table}")

    copy_config = bigquery.CopyJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, )

    print(f"Copying {intermediate_table} to {project}.{dataset}.{dst_table}")
    bq_client.copy_table(
        intermediate_table,
        f"{project}.{dataset}.{dst_table}",
        job_config=copy_config,
    ).result()

    print(f"Deleting {intermediate_table}")
    bq_client.delete_table(intermediate_table)
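
Staging the per-version results in an intermediate table and only then copying them over the destination with `WRITE_TRUNCATE` means a failed query never leaves the published table half-written. A condensed sketch of that final publish step, with illustrative table IDs:

# Sketch only: replace the destination from a staging table, then clean up.
from google.cloud import bigquery

def publish(client, staging_table_id, final_table_id):
    """Overwrite final_table_id with the contents of staging_table_id."""
    copy_config = bigquery.CopyJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)
    client.copy_table(staging_table_id, final_table_id,
                      job_config=copy_config).result()
    client.delete_table(staging_table_id)  # drop the staging table afterwards

# Hypothetical usage:
# client = bigquery.Client()
# publish(client, 'proj.analysis.glam_temp_clustered_query_x', 'proj.dataset.x')
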
def test_ncaa_tutorial(delete_dataset):
    # [START bqml_ncaa_tutorial_create_dataset]
    dataset = bigquery.Dataset(client.dataset('bqml_tutorial'))
    dataset.location = 'US'
    client.create_dataset(dataset)
    # [END bqml_ncaa_tutorial_create_dataset]

    # Create the tables used by the tutorial
    # Note: the queries are saved to a file. This should be updated to use the
    # saved queries once the library supports running saved queries.
    query_filepath_to_table_name = {
        'feature_input_query.sql': 'cume_games',
        'training_data_query.sql': 'wide_games'
    }
    resources_directory = os.path.join(os.path.dirname(__file__), 'resources')
    for query_filepath, table_name in query_filepath_to_table_name.items():
        table_ref = dataset.table(table_name)
        job_config = bigquery.QueryJobConfig()
        job_config.destination = table_ref
        query_filepath = os.path.join(
            resources_directory, query_filepath)
        sql = io.open(query_filepath, 'r', encoding='utf-8').read()
        client.query(sql, job_config=job_config).result()

    # [START bqml_ncaa_tutorial_create_model]
    sql = """
        CREATE OR REPLACE MODEL `bqml_tutorial.ncaa_model`
        OPTIONS (
            model_type='linear_reg',
            data_split_eval_fraction=0.1,
            max_iteration=50 ) AS
        SELECT
            * EXCEPT (
                game_id, season, scheduled_date,
                total_three_points_made,
                total_three_points_att),
            total_three_points_att as label
        FROM
            `bqml_tutorial.wide_games`
        WHERE
            # remove the game to predict
            game_id != 'f1063e80-23c7-486b-9a5e-faa52beb2d83'
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_ncaa_tutorial_create_model]

    # [START bqml_ncaa_tutorial_get_training_statistics]
    sql = """
        SELECT
            *
        FROM
            ML.TRAINING_INFO(MODEL `bqml_tutorial.ncaa_model`)
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_ncaa_tutorial_get_training_statistics]

    # [START bqml_ncaa_tutorial_evaluate_model]
    sql = """
        WITH eval_table AS (
            SELECT
                *,
                total_three_points_att AS label
            FROM
                `bqml_tutorial.wide_games` )
        SELECT
            *
        FROM
            ML.EVALUATE(MODEL `bqml_tutorial.ncaa_model`,
                TABLE eval_table)
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_ncaa_tutorial_evaluate_model]

    # [START bqml_ncaa_tutorial_predict_outcomes]
    sql = """
        WITH game_to_predict AS (
            SELECT
                *
            FROM
                `bqml_tutorial.wide_games`
            WHERE
                game_id='f1063e80-23c7-486b-9a5e-faa52beb2d83' )
        SELECT
            truth.game_id AS game_id,
            total_three_points_att,
            predicted_total_three_points_att
        FROM (
            SELECT
                game_id,
                predicted_label AS predicted_total_three_points_att
            FROM
                ML.PREDICT(MODEL `bqml_tutorial.ncaa_model`,
                table game_to_predict) ) AS predict
        JOIN (
            SELECT
                game_id,
                total_three_points_att AS total_three_points_att
            FROM
                game_to_predict) AS truth
        ON
            predict.game_id = truth.game_id
    """
    df = client.query(sql).to_dataframe()
    print(df)
def update_with_bq_access(request):

    # Instantiates a client
    bigquery_client = bigquery.Client()

    datasets = list(bigquery_client.list_datasets())
    project = bigquery_client.project

    # list all datasets
    if datasets:
        logger.debug('Datasets in project {}:'.format(project))
        for dataset in datasets:  # API request(s)
            logger.debug('\t{}'.format(dataset.dataset_id))

            # choose the right dataset
            if ("learning_datasets" == dataset.dataset_id):
                # list all tables
                dataset_ref = bigquery_client.dataset(dataset.dataset_id)
                tables = list(
                    bigquery_client.list_tables(dataset_ref))  # API request(s)
                for table in tables:
                    if ("enriched_events" == table.table_id):
                        logger.debug('\t{}'.format("found table"))

                        # query to retrieve all file access events for one course
                        query = 'select CAST(SUBSTR(JSON_EXTRACT_SCALAR(event, "$.object.id"), 35) AS STRING) AS FILE_ID, ' \
                                'SUBSTR(JSON_EXTRACT_SCALAR(event, "$.membership.member.id"), 29) AS USER_ID, ' \
                                'datetime(EVENT_TIME) as ACCESS_TIME ' \
                                'FROM learning_datasets.enriched_events ' \
                                'where JSON_EXTRACT_SCALAR(event, "$.edApp.id") = \'http://umich.instructure.com/\' ' \
                                'and event_type = \'NavigationEvent\' ' \
                                'and JSON_EXTRACT_SCALAR(event, "$.object.name") = \'attachment\' ' \
                                'and JSON_EXTRACT_SCALAR(event, "$.action") = \'NavigatedTo\' ' \
                                'and JSON_EXTRACT_SCALAR(event, "$.membership.member.id") is not null ' \
                                'and SUBSTR(JSON_EXTRACT_SCALAR(event, "$.group.id"),31) = @course_id '
                        logger.debug(query)
                        query_params = [
                            bigquery.ScalarQueryParameter(
                                'course_id', 'STRING', UDW_COURSE_ID),
                        ]
                        job_config = bigquery.QueryJobConfig()
                        job_config.query_parameters = query_params

                        # Location must match that of the dataset(s) referenced in the query.
                        df = bigquery_client.query(
                            query,
                            location='US', job_config=job_config).to_dataframe(
                            )  # API request - starts the query

                        logger.debug("df row number=" + str(df.shape[0]))
                        # drop duplicates
                        df.drop_duplicates(
                            ["FILE_ID", "USER_ID", "ACCESS_TIME"],
                            keep='first',
                            inplace=True)

                        logger.debug("after drop duplicates, df row number=" +
                                     str(df.shape[0]))

                        # write to MySQL
                        df.to_sql(con=engine,
                                  name='FILE_ACCESS',
                                  if_exists='append',
                                  index=False)

    else:
        logger.debug(
            '{} project does not contain any datasets.'.format(project))

    return HttpResponse("loaded file access info: inserted " +
                        str(df.shape[0]) + " rows.")
Example #22
0
"""
Creates the analytics.rating BigQuery table.
"""

from google.cloud import bigquery
from google.oauth2 import service_account

key_path = "../../credentials/edgart-experiments-67ca4ddbda73.json"

credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = bigquery.Client(credentials=credentials, project=credentials.project_id, )
table_id = "{}.analytics.rating".format(credentials.project_id)

job_config = bigquery.QueryJobConfig(destination=table_id, write_disposition='WRITE_TRUNCATE',)

sql = """
WITH ml_rating AS (
  SELECT
    m.tconst,
    TRUNC(AVG(rating), 1) as rating,
    COUNT(1) as num_votes
  FROM `edgart-experiments.ml.ratings` r
  JOIN `edgart-experiments.ml.links` l ON l.movieId=r.movieId 
  JOIN `edgart-experiments.analytics.movie` m ON CAST(REPLACE(m.tconst, 'tt', '') AS INT64)=l.imdbId
  GROUP BY m.tconst
)
SELECT
m.tconst,
imdb.averageRating as imdb_rating,
Example #23
0
def get_averages_by_addon_from_bigquery(today, exclude=None):
    """This function is used to compute the 'hotness' score of each add-on (see
    also `update_addon_hotness()` cron task). It returns a dict with top-level
    keys being add-on GUIDs and values being dicts containing average
    values."""
    client = create_client()

    one_week_date = today - timedelta(days=7)
    four_weeks_date = today - timedelta(days=28)

    query = f"""
WITH
  this_week AS (
  SELECT
    addon_id,
    AVG(dau) AS avg_this_week
  FROM
    `{get_amo_stats_dau_view_name()}`
  WHERE
    submission_date >= @one_week_date
  GROUP BY
    addon_id),
  three_weeks_before_this_week AS (
  SELECT
    addon_id,
    AVG(dau) AS avg_three_weeks_before
  FROM
    `{get_amo_stats_dau_view_name()}`
  WHERE
    submission_date BETWEEN @four_weeks_date AND @one_week_date
  GROUP BY
    addon_id)
SELECT
  *
FROM
  this_week
JOIN
  three_weeks_before_this_week
USING
  (addon_id)
"""
    query_parameters = [
        bigquery.ScalarQueryParameter('one_week_date', 'DATE', one_week_date),
        bigquery.ScalarQueryParameter('four_weeks_date', 'DATE', four_weeks_date),
    ]

    if exclude and len(exclude) > 0:
        query = f'{query} WHERE addon_id NOT IN UNNEST(@excluded_addon_ids)'
        query_parameters.append(
            bigquery.ArrayQueryParameter('excluded_addon_ids', 'STRING', exclude)
        )

    rows = client.query(
        query,
        job_config=bigquery.QueryJobConfig(query_parameters=query_parameters),
    ).result()

    return {
        row['addon_id']: {
            'avg_this_week': row['avg_this_week'],
            'avg_three_weeks_before': row['avg_three_weeks_before'],
        }
        for row in rows
        if row['addon_id']
    }
Example #24
0
    def submit(self, sql):
        """
        """
        table_name = self.get_tablename()
        client = bq.Client.from_service_account_json(self.private_key)
        #
        # Let's make sure the out dataset exists
        datasets = list(client.list_datasets())
        found = np.sum([1  for dataset in datasets if dataset.dataset_id == self.odataset])
        if not found:
            dataset = bq.Dataset(client.dataset(self.odataset))
            client.create_dataset(dataset)

        # create the output table
        bq_utils.create_standard_table(self.tablename, self.tablename, drop_existing=True, dataset_id=self.odataset)

        job = bq.QueryJobConfig()
        job.destination = client.dataset(self.odataset).table(self.tablename)
        job.use_query_cache = True
        job.allow_large_results = True
        if self.partition:
            job._properties['timePartitioning'] = {'type': 'DAY'}
            job._properties['clustering'] = {'field': 'person_id'}

        job.priority = self.priority
        job.dry_run = True
        self.log(module='submit-job',
                 subject=self.get_tablename(),
                 action='dry-run',
                 value={'priority': self.priority, 'partition': self.partition})

        logpath = os.path.join(self.logpath, self.idataset)
        try:
            os.makedirs(logpath)
        except OSError:
            # log path already exists and we don't care
            pass

        r = client.query(sql, location='US', job_config=job)
        if r.errors is None and r.state == 'DONE':
            job.dry_run = False

            r = client.query(sql, location='US', job_config=job)
            self.log(module='submit',
                     subject=self.get_tablename(),
                     action='submit-job',
                     table=table_name,
                     status='pending',
                     value=r.job_id,
                     object='bigquery')
            self.wait(client, r.job_id)
#            self.finalize(client)
            #
            # At this point we must try to partition the table
        else:
            self.log(module='submit',
                     subject=self.get_tablename(),
                     action='submit-job',
                     table=table_name,
                     status='error',
                     value=r.errors)
            print(r.errors)
Example #25
0
def promotion_prediction_res(project_id, dataset_id):

    # Load client
    client = bigquery.Client()

    job_config = bigquery.QueryJobConfig()

    promo_update = """
        WITH prelim AS 
        (SELECT 
        CAST(pred.p_cal_inc_sale_qty AS NUMERIC) AS p_cal_inc_sale_qty,
        CAST(pred.prediction_interval AS NUMERIC) AS prediction_interval,
        CAST(pred.prediction_error_perc AS NUMERIC) AS prediction_error_perc,
        pred.sku_root_id,pred.description, pred.area, pred.section, pred.category, pred.subcategory, pred.segment,
        pred.brand_name, pred.brand_price_label, pred.flag_healthy, pred.innovation_flag, pred.tourism_flag,
        pred.local_flag, pred.regional_flag, 
        CAST(pred.no_hipermercados_stores AS INT64) AS no_hipermercados_stores,
        CAST(pred.no_supermercados_stores AS INT64) AS no_supermercados_stores,
        CAST(pred.no_gasolineras_stores AS INT64) AS no_gasolineras_stores,
        CAST(pred.no_comercio_electronico_stores AS INT64) AS no_comercio_electronico_stores,
        CAST(pred.no_otros_negocio_stores AS INT64) AS no_otros_negocio_stores,
        CAST(pred.no_plataformas_stores AS INT64) AS no_plataformas_stores,
        CAST(pred.no_other_stores AS INT64) AS no_other_stores,
        CAST(pred.no_impacted_stores AS INT64) AS no_impacted_stores, 
        CAST(pred.no_impacted_regions AS INT64) AS no_impacted_regions,
        CAST(pred.avg_store_size AS NUMERIC) AS avg_store_size,
        CAST(pred.type AS STRING) AS type,
        pred.customer_profile_type,  pred.marketing_type, 
        CAST(pred.duration_days AS INT64) AS duration_days, 
        pred.includes_weekend, pred.campaign_start_day, 
        pred.campaign_start_month , 
        CAST(pred.campaign_start_quarter AS INT64) AS campaign_start_quarter,
        CAST(pred.campaign_start_week AS INT64) AS campaign_start_week, 
        CAST(pred.leaflet_cover AS INT64) AS leaflet_cover,
        CAST(pred.leaflet_priv_space AS INT64) AS leaflet_priv_space, 
        CAST(pred.in_leaflet_flag AS INT64) AS in_leaflet_flag,
        CAST(pred.in_gondola_flag AS INT64) AS in_gondola_flag,
        CAST(pred.in_both_leaflet_gondola_flag AS INT64) AS in_both_leaflet_gondola_flag,
        CAST(pred.p_qty_bl AS NUMERIC) AS p_qty_bl, 
        pred.promo_mechanic, pred.Promo_mechanic_en , pred.discount_depth, 
        CAST(pred.promoted_in_past AS NUMERIC) as promoted_in_past,
        std_price.margin_per_unit as std_margin_per_unit,
        std_price.std_price_per_unit as std_price_per_unit,
        CAST(pred.p_qty_bl AS NUMERIC)*std_price.std_price_per_unit as p_sale_bl,
        CAST(pred.p_qty_bl AS NUMERIC)*std_price.margin_per_unit as p_margin_bl,
        std_price.cost_per_unit as cost_price,
        (CAST(discount_depth_rank AS NUMERIC)/100) as equivalent_discount,
        (1-(CAST(discount_depth_rank AS NUMERIC)/100))*std_price.std_price_per_unit as effective_discount_price_per_unit,
        CAST(pred.p_cal_inc_sale_qty AS NUMERIC)*(1-(CAST(discount_depth_rank AS NUMERIC)/100))*std_price.std_price_per_unit as p_cal_inc_sale_amt,
        (CAST(pred.p_cal_inc_sale_qty AS NUMERIC)*(1-(CAST(discount_depth_rank AS NUMERIC)/100))*std_price.std_price_per_unit) 
        - (CAST(pred.p_cal_inc_sale_qty AS NUMERIC)*(std_price.cost_per_unit)) as p_cal_inc_margin_amt
        FROM `gum-eroski-dev.prediction_results.prediction_promotion_results` pred
        LEFT JOIN `gum-eroski-dev.ETL.aggregate_std_price_margin` std_price
        on std_price.sku_root_id = pred.sku_root_id
        ) 
        SELECT 
        *, 
        SAFE_DIVIDE(p_cal_inc_sale_qty, p_qty_bl) AS perc_uplift_qty,
        SAFE_DIVIDE(p_cal_inc_sale_amt, p_sale_bl) AS perc_uplift_amt,
        SAFE_DIVIDE(p_cal_inc_margin_amt, p_margin_bl) AS perc_uplift_margin
        FROM prelim pred
        """

    promotion_pred_sql = """
        SELECT avg(p_cal_inc_sale_qty) as avg_p_cal_inc_sale_qty, 
        avg(p_cal_inc_sale_amt) as avg_p_cal_inc_sale_amt,
        avg(p_cal_inc_margin_amt) as avg_p_cal_inc_margin_amt,

        avg(perc_uplift_qty) as avg_perc_uplift_qty,
        avg(perc_uplift_amt) as avg_perc_uplift_amt,
        avg(perc_uplift_margin) as avg_perc_uplift_margin,

        sum(p_cal_inc_sale_qty) as sum_p_cal_inc_sale_qty, 
        sum(p_cal_inc_sale_amt) as sum_p_cal_inc_sale_amt,
        sum(p_cal_inc_margin_amt) as sum_p_cal_inc_margin_amt,

        avg(prediction_interval) as avg_prediction_interval,
        avg(prediction_error_perc) as avg_prediction_error_perc,
        area, section, category, subcategory, brand_name, 
        promo_mechanic, Promo_mechanic_en, discount_depth, 
        count(distinct sku_root_id) as no_skus_in_brand_cat,
        max(promoted_in_past) as promoted_in_past

        FROM `gum-eroski-dev.prediction_results.prediction_promotion_results`

        group by area, section, category, subcategory, brand_name, promo_mechanic, Promo_mechanic_en, discount_depth
        """

    # Create a dictionary to loop over all destination tables and scripts
    tables = {
        'prediction_promotion_results': promo_update,
        'prediction_promotion_results_cat_brand': promotion_pred_sql
    }

    job_config.write_disposition = "WRITE_TRUNCATE"
    for key in tables:

        # Set the destination table
        table_ref = client.dataset(dataset_id).table(key)
        job_config.destination = table_ref

        # Start the query, passing in the extra configuration.
        query_job = client.query(
            tables[key],
            # Location must match that of the dataset(s) referenced in the query
            # and of the destination table.
            location='europe-west3',
            job_config=job_config)  # API request - starts the query

        query_job.result()  # Waits for the query to finish
        logger.info("Completed writing {a} table...".format(a=key))
Example #26
0
    def run(
        self,
        query: str = None,
        query_params: List[tuple] = None,
        project: str = None,
        location: str = "US",
        dry_run_max_bytes: int = None,
        credentials: dict = None,
        dataset_dest: str = None,
        table_dest: str = None,
        to_dataframe: bool = False,
        job_config: dict = None,
    ):
        """
        Run method for this Task.  Invoked by _calling_ this Task within a Flow context, after
        initialization.

        Args:
            - query (str, optional): a string of the query to execute
            - query_params (list[tuple], optional): a list of 3-tuples specifying BigQuery
                query parameters; currently only scalar query parameters are supported. See
                [the Google
                documentation](https://cloud.google.com/bigquery/docs/parameterized-queries#bigquery-query-params-python)
                for more details on how both the query and the query parameters should be
                formatted
            - project (str, optional): the project to initialize the BigQuery Client with; if
                not provided, will default to the one inferred from your credentials
            - location (str, optional): location of the dataset that will be queried; defaults
                to "US"
            - dry_run_max_bytes (int, optional): if provided, the maximum number of bytes the
                query is allowed to process; this will be determined by executing a dry run and
                raising a `ValueError` if the maximum is exceeded
            - credentials (dict, optional): a JSON document containing Google Cloud credentials.
                You should provide these at runtime with an upstream Secret task.  If not
                provided, Prefect will first check `context` for `GCP_CREDENTIALS` and lastly
                will use default Google client logic.
            - dataset_dest (str, optional): the optional name of a destination dataset to write the
                query results to, if you don't want them returned; if provided, `table_dest`
                must also be provided
            - table_dest (str, optional): the optional name of a destination table to write the
                query results to, if you don't want them returned; if provided, `dataset_dest` must also
                be provided
            - to_dataframe (bool, optional): if provided, returns the results of the query as a pandas
                dataframe instead of a list of `bigquery.table.Row` objects. Defaults to False
            - job_config (dict, optional): an optional dictionary of job configuration parameters; note
                that the parameters provided here must be pickleable (e.g., dataset references will be
                rejected)

        Raises:
            - ValueError: if the `query` is `None`
            - ValueError: if only one of `dataset_dest` / `table_dest` is provided
            - ValueError: if the query will exceed `dry_run_max_bytes`

        Returns:
            - list: a fully populated list of Query results, with one item per row
        """
        # check for any argument inconsistencies
        if query is None:
            raise ValueError("No query provided.")
        if sum([dataset_dest is None, table_dest is None]) == 1:
            raise ValueError(
                "Both `dataset_dest` and `table_dest` must be provided if writing to a "
                "destination table.")

        # create client
        client = get_bigquery_client(project=project, credentials=credentials)

        # setup jobconfig
        job_config = bigquery.QueryJobConfig(**(job_config or {}))
        if query_params is not None:
            hydrated_params = [
                bigquery.ScalarQueryParameter(*qp) for qp in query_params
            ]
            job_config.query_parameters = hydrated_params

        # perform dry_run if requested
        if dry_run_max_bytes is not None:
            old_info = dict(dry_run=job_config.dry_run,
                            use_query_cache=job_config.use_query_cache)
            job_config.dry_run = True
            job_config.use_query_cache = False
            self.logger.debug("Performing a dry run...")
            query_job = client.query(query,
                                     location=location,
                                     job_config=job_config)
            if query_job.total_bytes_processed > dry_run_max_bytes:
                msg = (
                    "Query will process {0} bytes which is above the set maximum of {1} "
                    "for this task.").format(query_job.total_bytes_processed,
                                             dry_run_max_bytes)
                raise ValueError(msg)
            job_config.dry_run = old_info["dry_run"]
            job_config.use_query_cache = old_info["use_query_cache"]

        # if writing to a destination table
        if dataset_dest is not None:
            table_ref = client.dataset(dataset_dest).table(table_dest)
            job_config.destination = table_ref

        query_job = client.query(query,
                                 location=location,
                                 job_config=job_config)

        # if returning the results as a dataframe
        if to_dataframe:
            return query_job.result().to_dataframe()
        # else if returning as a list of bigquery.table.Row objects (default)
        else:
            return list(query_job.result())
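
Since the method above is a Prefect task's `run`, a short usage sketch may help. It assumes Prefect 1.x flow syntax and that the enclosing class is named `BigQueryTask`; both are assumptions, as the snippet only shows the `run` method:

# Sketch only: Prefect 1.x usage, assuming the class above is named BigQueryTask.
from prefect import Flow

bigquery_task = BigQueryTask()  # assumed class name, not shown in the snippet

with Flow('bigquery-example') as flow:
    rows = bigquery_task(
        query='SELECT 17 AS answer',
        location='US',
        dry_run_max_bytes=10**9,  # abort if the query would scan more than ~1 GB
        to_dataframe=False,       # return bigquery.table.Row objects
    )

# flow.run() would execute the task with the configuration above.
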
Example #27
0
def load_data(spark, date_from, date_to):
    """Load a set of aggregated metrics for the provided timeframe.

    Returns Spark dataframe containing preaggregated user counts per various dimensions.

    Args:
        date_from: Start date (inclusive)
        date_to: End date (exclusive)
    """
    bq = bigquery.Client()

    query = """
  WITH
    rank_per_client AS (
      SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY client_id ORDER BY submission_timestamp DESC) AS rn
      FROM
        `moz-fx-data-shared-prod.telemetry_stable.main_v4`
      WHERE
        DATE(submission_timestamp) >= @date_from
        AND DATE(submission_timestamp) < @date_to
    ),
    latest_per_client_all AS (
      SELECT
        *
      FROM
        rank_per_client
      WHERE
        rn=1
    ),
    latest_per_client AS (
      SELECT
        environment.build.architecture AS browser_arch,
        COALESCE(environment.system.os.name,
            'Other') AS os_name,
        COALESCE(
            IF (environment.system.os.name IN ('Linux', 'Darwin'),
                CONCAT(REGEXP_EXTRACT(environment.system.os.version, r"^[0-9]+"), '.x'),
                environment.system.os.version),
            'Other') AS os_version,
        environment.system.memory_mb,
        coalesce(environment.system.is_wow64, FALSE) AS is_wow64,
        IF (ARRAY_LENGTH(environment.system.gfx.adapters)>0,
            environment.system.gfx.adapters[OFFSET(0)].vendor_id,
            NULL) AS gfx0_vendor_id,
        IF (ARRAY_LENGTH(environment.system.gfx.adapters)>0,
            environment.system.gfx.adapters[OFFSET(0)].device_id,
            NULL) AS gfx0_device_id,
        IF (ARRAY_LENGTH(environment.system.gfx.monitors)>0,
            environment.system.gfx.monitors[OFFSET(0)].screen_width,
            0) AS screen_width,
        IF (ARRAY_LENGTH(environment.system.gfx.monitors)>0,
            environment.system.gfx.monitors[OFFSET(0)].screen_height,
            0) AS screen_height,
        environment.system.cpu.cores AS cpu_cores,
        environment.system.cpu.vendor AS cpu_vendor,
        environment.system.cpu.speed_m_hz AS cpu_speed,
        'Shockwave Flash' IN (
            SELECT name FROM UNNEST(environment.addons.active_plugins)
            ) AS has_flash
      FROM
        latest_per_client_all
    ),
    transformed AS (
      SELECT
        browser_arch,
        CONCAT(os_name, '-', os_version) AS os,
        COALESCE(SAFE_CAST(ROUND(memory_mb / 1024.0) AS INT64), 0) AS memory_gb,
        is_wow64,
        gfx0_vendor_id,
        gfx0_device_id,
        CONCAT(CAST(screen_width AS STRING), 'x', CAST(screen_height AS STRING)) AS resolution,
        cpu_cores,
        cpu_vendor,
        cpu_speed,
        has_flash
      FROM
        latest_per_client
    ),
    by_dimensions AS (
      SELECT
        *,
        count(*) AS count
      FROM
        transformed
      GROUP BY
        browser_arch,
        os,
        memory_gb,
        is_wow64,
        gfx0_vendor_id,
        gfx0_device_id,
        resolution,
        cpu_cores,
        cpu_vendor,
        cpu_speed,
        has_flash
    )
    SELECT * FROM by_dimensions
  """

    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ScalarQueryParameter("date_from", "DATE", date_from),
        bigquery.ScalarQueryParameter("date_to", "DATE", date_to),
    ])
    hardware_by_dimensions_query_job = bq.query(query, job_config=job_config)
    hardware_by_dimensions_query_job.result()

    hardware_by_dimensions_df = (spark.read.format("bigquery").option(
        "project",
        hardware_by_dimensions_query_job.destination.project).option(
            "dataset",
            hardware_by_dimensions_query_job.destination.dataset_id).option(
                "table",
                hardware_by_dimensions_query_job.destination.table_id).load())

    return hardware_by_dimensions_df
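
A hedged usage sketch for load_data, assuming a Spark session built with the spark-bigquery connector available on the classpath; the application name and dates below are illustrative:

from pyspark.sql import SparkSession

# Assumes the spark-bigquery connector jar is available to this session.
spark = SparkSession.builder.appName("hardware-report").getOrCreate()

# Illustrative one-week window; date_to is exclusive.
hardware_df = load_data(spark, date_from="2021-01-01", date_to="2021-01-08")
hardware_df.show(5)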
Example #28
0
### --------------------------------------------------------------------------
### RIDES PER YEAR
### --------------------------------------------------------------------------

# Query to select the number of rides per year, sorted by year
rides_per_year_query = """
                        SELECT EXTRACT(YEAR FROM trip_start_timestamp) AS year,
                                     COUNT(1) AS num_trips
                        FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                        GROUP BY year
                        ORDER BY year
                        """
        
# Set up the query (cancel the query if it would use too much of 
# your quota, with the limit set to 10 GB)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
rides_per_year_query_job = client.query(rides_per_year_query, job_config=safe_config)

# API request - run the query, and convert the results to a pandas DataFrame
rides_per_year_result = rides_per_year_query_job.to_dataframe()

### --------------------------------------------------------------------------
### RIDES PER MONTH
### --------------------------------------------------------------------------

# Query to select the number of rides per month in 2017
rides_per_month_query = """                       
                        SELECT EXTRACT(MONTH FROM trip_start_timestamp) AS month,
                            COUNT(1) AS num_trips
                        FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                        WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2017
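
Both queries above assume an existing BigQuery client; a minimal setup sketch, assuming application-default credentials, followed by an optional check of the bytes actually billed against the 10 GB cap:

from google.cloud import bigquery

# Uses application-default credentials (e.g. GOOGLE_APPLICATION_CREDENTIALS).
client = bigquery.Client()

# After a job completes, the bytes actually billed can be compared to the cap
# (rides_per_year_query_job is the job created above).
print(rides_per_year_query_job.total_bytes_billed)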
Example #29
0
    def runtest(self):
        """Run."""
        query_name = self.fspath.dirpath().basename
        query = read(
            f"{self.fspath.dirname.replace('tests', 'sql')}/query.sql")
        expect = load(self.fspath.strpath, "expect")

        tables: Dict[str, Table] = {}
        views: Dict[str, str] = {}

        # generate tables for files with a supported table extension
        for resource in next(os.walk(self.fspath))[2]:
            if "." not in resource:
                continue  # tables require an extension
            table_name, extension = resource.rsplit(".", 1)
            if table_name.endswith(".schema") or table_name in (
                    "expect",
                    "query_params",
            ):
                continue  # not a table
            if extension in TABLE_EXTENSIONS or extension in ("yaml", "json"):
                if extension in TABLE_EXTENSIONS:
                    source_format = TABLE_EXTENSIONS[extension]
                    source_path = os.path.join(self.fspath.strpath, resource)
                else:
                    source_format = TABLE_EXTENSIONS["ndjson"]
                    source_path = (self.fspath.strpath, table_name)
                if "." in table_name:
                    # combine project and dataset name with table name
                    original, table_name = (
                        table_name,
                        table_name.replace(".", "_").replace("-", "_"),
                    )
                    query = query.replace(original, table_name)
                tables[table_name] = Table(table_name, source_format,
                                           source_path)
            elif extension == "sql":
                if "." in table_name:
                    # combine project and dataset name with table name
                    original, table_name = (
                        table_name,
                        table_name.replace(".", "_").replace("-", "_"),
                    )
                    query = query.replace(original, table_name)
                views[table_name] = read(self.fspath.strpath, resource)

        # rewrite all udfs as temporary
        query = parse_udf.persistent_udf_as_temp(query)

        dataset_id = "_".join(self.fspath.strpath.split(os.path.sep)[-3:])
        if "CIRCLE_BUILD_NUM" in os.environ:
            dataset_id += f"_{os.environ['CIRCLE_BUILD_NUM']}"

        bq = bigquery.Client()
        with dataset(bq, dataset_id) as default_dataset:
            load_tables(bq, default_dataset, tables.values())
            load_views(bq, default_dataset, views)

            # configure job
            res_table = bigquery.TableReference(default_dataset, query_name)

            job_config = bigquery.QueryJobConfig(
                default_dataset=default_dataset,
                destination=res_table,
                query_parameters=get_query_params(self.fspath.strpath),
                use_legacy_sql=False,
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
            )

            # run query
            job = bq.query(query, job_config=job_config)
            result = list(coerce_result(*job.result()))
            result.sort(key=lambda row: json.dumps(row, sort_keys=True))
            expect.sort(key=lambda row: json.dumps(row, sort_keys=True))

            print_and_test(expect, result)
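
A minimal sketch of the table-name normalization the discovery loop applies, using an illustrative fully qualified name (the sketch is not part of the test runner itself):

# Fully qualified names found in test resources are flattened so the matching
# tables can be created inside a single throwaway dataset.
original = "moz-fx-data-shared-prod.telemetry_derived.example_table_v1"
table_name = original.replace(".", "_").replace("-", "_")

query = "SELECT * FROM `moz-fx-data-shared-prod.telemetry_derived.example_table_v1`"
query = query.replace(original, table_name)
# query now references `moz_fx_data_shared_prod_telemetry_derived_example_table_v1`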
Example #30
0
def update_from_cloud_storage(args):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = args.google_key_path
    client = bigquery.Client()

    bucket_name = "igenie-tweets"
    blob_name = "historical/{}.json".format("tweets-raw")

    GS_URL = 'gs://{}/{}'.format(bucket_name, blob_name)
    external_config = bigquery.ExternalConfig("NEWLINE_DELIMITED_JSON")
    external_config.autodetect = True
    external_config.source_uris = [GS_URL]
    job_config = bigquery.QueryJobConfig()
    job_config.table_definitions = {"temp": external_config}

    file_name = "tweets-enriched.json"

    QUERY = ('SELECT id,'
             'id_str,'
             'constituent,'
             'text,'
             'coordinates,'
             'created_at,'
             'favorited,'
             'place,'
             'lang,'
             'metadata,'
             'retweeted,'
             'entities.hashtags,'
             'entities.symbols,'
             'source,'
             'user.time_zone,'
             'user.location,'
             'user.friends_count,'
             'user.followers_count,'
             'favorite_count,'
             'retweet_count,'
             'geo,'
             'relevance,'
             'search_term '
             'FROM `temp`')

    TIMEOUT = 100  # in seconds

    query_job = client.query(
        QUERY, job_config=job_config)  # API request - starts the query
    assert query_job.state == 'RUNNING'

    # Waits for the query to finish
    iterator = query_job.result(timeout=TIMEOUT)

    with open(file_name, "a") as f:
        for row in iterator:
            # Included attributes
            result = {}
            result["id"] = row.id
            result['id_str'] = row.id_str
            result['text'] = row.text
            result['coordinates'] = row.coordinates
            result['favorited'] = row.favorited
            result['place'] = row.place
            result['lang'] = row.lang
            result['metadata'] = row.metadata
            result['retweeted'] = row.retweeted
            result['entities_hashtags'] = row["entities.hashtags"]
            result['entities_symbols'] = row["entities.symbols"]
            result['source'] = row.source
            result['user_time_zone'] = row["user.time_zone"]
            result['user_location'] = row["user.location"]
            result['user_friends_count'] = row["user.friends_count"]
            result['user_followers_count'] = row["user.followers_count"]
            result['favorite_count'] = row.favorite_count
            result['retweet_count'] = row.retweet_count
            result['geo'] = row.geo
            result['search_term'] = row.search_term

            # Extra attributes
            # constituent_id, constituent_name
            constituent_id, constituent_name = get_constituent_id_name(
                row.constituent)
            result['constituent_id'] = constituent_id
            result['constituent_name'] = constituent_name
            # created at - date
            result['date'] = datetime.strptime(row.created_at,
                                               '%a %b %d %H:%M:%S %z %Y')

            if not row.relevance:
                result["relevance"] = -1
            else:
                result["relevance"] = row.relevance

            # sentiment score
            result["sentiment_score"] = get_nltk_sentiment(row.text)

            update_tags(result)

            f.write(json.dumps(result, cls=MongoEncoder) + '\n')
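
A hedged follow-up sketch for loading the enriched NDJSON file back into BigQuery, assuming a hypothetical destination (the dataset and table names below are illustrative):

from google.cloud import bigquery

client = bigquery.Client()
file_name = "tweets-enriched.json"

# Hypothetical destination; dataset and table names are illustrative.
table_ref = client.dataset("tweets").table("tweets_enriched")

load_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    autodetect=True,
)

with open(file_name, "rb") as source_file:
    load_job = client.load_table_from_file(
        source_file, table_ref, job_config=load_config)

load_job.result()  # wait for the load job to complete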