Example #1
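
The three variants of run_query below all lean on helpers from the surrounding bigquery_etl.glean_usage module (render, write_sql, table_names_from_baseline, referenced_table_exists, and the *_FILENAME/USAGE_TYPES constants). A minimal import block covering the library-level names, assuming the standard google-cloud-bigquery package:

import logging

from google.api_core.exceptions import NotFound
from google.cloud import bigquery
from google.cloud.bigquery import ScalarQueryParameter, WriteDisposition
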
def run_query(client,
              baseline_table,
              date,
              dry_run,
              output_dir=None,
              views_only=False):
    """Process a single table, potentially also writing out the generated queries."""
    tables = table_names_from_baseline(baseline_table)

    last_seen_table = tables["last_seen_table"]
    last_seen_view = tables["last_seen_view"]
    render_kwargs = dict(header="-- Generated via bigquery_etl.glean_usage\n",
                         usage_types=USAGE_TYPES)
    render_kwargs.update(tables)
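    # dry_run=True asks BigQuery to validate the query and estimate its cost
    # without actually executing it.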
    job_kwargs = dict(use_legacy_sql=False, dry_run=dry_run)

    query_sql = render(QUERY_FILENAME, **render_kwargs)
    init_sql = render(QUERY_FILENAME, init=True, **render_kwargs)
    view_sql = render(VIEW_FILENAME, **render_kwargs)
    sql = query_sql

    try:
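        # get_table raises NotFound if the table has not been initialized yet.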
        client.get_table(last_seen_table)
    except NotFound:
        if views_only:
            logging.info("Skipping view for table which doesn't exist:"
                         f" {last_seen_table}")
            return
        elif dry_run:
            logging.info(f"Table does not yet exist: {last_seen_table}")
        else:
            logging.info(f"Creating table: {last_seen_table}")
        sql = init_sql
    else:
        if views_only:
            write_sql(output_dir, last_seen_view, "view.sql", view_sql)
            return
        # Table exists, so we will run the incremental query.
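        # The $YYYYMMDD partition decorator scopes WRITE_TRUNCATE to a single
        # date partition rather than replacing the whole table.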
        job_kwargs.update(
            destination=f"{last_seen_table}${date.strftime('%Y%m%d')}",
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            query_parameters=[
                ScalarQueryParameter("submission_date", "DATE", date)
            ],
        )
        if not dry_run:
            logging.info(f"Running query for: {last_seen_table}")

    if output_dir:
        write_sql(output_dir, last_seen_view, "view.sql", view_sql)
        write_sql(output_dir, last_seen_table, "query.sql", query_sql)
        write_sql(output_dir, last_seen_table, "init.sql", init_sql)

    job_config = bigquery.QueryJobConfig(**job_kwargs)
    job = client.query(sql, job_config)
    if not dry_run:
        job.result()
        logging.info(f"Recreating view {last_seen_view}")
        client.query(view_sql,
                     bigquery.QueryJobConfig(use_legacy_sql=False)).result()
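
A minimal invocation sketch for this first variant, which expects an already-constructed client; the table name and date are illustrative, not taken from the source:

from datetime import date

from google.cloud import bigquery

client = bigquery.Client()
run_query(
    client,
    "my-project.org_mozilla_fenix_stable.baseline_v1",  # hypothetical table
    date(2020, 6, 1),
    dry_run=True,  # validate the generated SQL without running it
)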

Example #2

def run_query(project_id,
              baseline_table,
              date,
              dry_run,
              output_dir=None,
              output_only=False):
    """Process a single table, potentially also writing out the generated queries."""
    tables = table_names_from_baseline(baseline_table)

    last_seen_table = tables["last_seen_table"]
    last_seen_view = tables["last_seen_view"]
    render_kwargs = dict(header="-- Generated via bigquery_etl.glean_usage\n",
                         usage_types=USAGE_TYPES)
    render_kwargs.update(tables)
    job_kwargs = dict(use_legacy_sql=False, dry_run=dry_run)

    query_sql = render(QUERY_FILENAME, **render_kwargs)
    init_sql = render(QUERY_FILENAME, init=True, **render_kwargs)
    view_sql = render(VIEW_FILENAME, **render_kwargs)
    view_metadata = render(VIEW_METADATA_FILENAME,
                           format=False,
                           **render_kwargs)
    sql = query_sql

    if not referenced_table_exists(view_sql):
        if output_only:
            logging.info("Skipping view for table which doesn't exist:"
                         f" {last_seen_table}")
            return
        elif dry_run:
            logging.info(f"Table does not yet exist: {last_seen_table}")
        else:
            logging.info(f"Creating table: {last_seen_table}")
        sql = init_sql
    elif output_only:
        pass
    else:
        # Table exists, so we will run the incremental query.
        job_kwargs.update(
            destination=f"{last_seen_table}${date.strftime('%Y%m%d')}",
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            query_parameters=[
                ScalarQueryParameter("submission_date", "DATE", date)
            ],
        )
        if not dry_run:
            logging.info(f"Running query for: {last_seen_table}")

    if output_dir:
        write_sql(output_dir, last_seen_view, "metadata.yaml", view_metadata)
        write_sql(output_dir, last_seen_view, "view.sql", view_sql)
        write_sql(output_dir, last_seen_table, "query.sql", query_sql)
        write_sql(output_dir, last_seen_table, "init.sql", init_sql)
    if output_only:
        # Return before we initialize the BQ client so that we can generate SQL
        # without having BQ credentials.
        return

    client = bigquery.Client(project_id)
    job_config = bigquery.QueryJobConfig(**job_kwargs)
    job = client.query(sql, job_config)
    if not dry_run:
        job.result()
        logging.info(f"Recreating view {last_seen_view}")
        client.query(view_sql,
                     bigquery.QueryJobConfig(use_legacy_sql=False)).result()
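
This variant swaps the get_table/NotFound probe for referenced_table_exists, defined elsewhere in the module. A plausible sketch of that helper, reusing the imports above and assuming it dry-runs the view SQL so that a missing referenced table surfaces as NotFound (the real implementation may differ):

def referenced_table_exists(view_sql):
    """Dry run the view SQL to check whether the table it references exists."""
    try:
        bigquery.Client().query(
            view_sql,
            bigquery.QueryJobConfig(dry_run=True, use_legacy_sql=False),
        )
    except NotFound:
        return False
    return True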

Example #3

def run_query(
    project_id, baseline_table, date, dry_run, output_dir=None, output_only=False
):
    """Process a single table, potentially also writing out the generated queries."""
    tables = table_names_from_baseline(baseline_table, include_project_id=False)

    table_id = tables["first_seen_table"]
    view_id = tables["first_seen_view"]
    render_kwargs = dict(
        header="-- Generated via bigquery_etl.glean_usage\n",
        project_id=project_id,
        # do not match on org_mozilla_firefoxreality
        fennec_id=any(
            (f"{app_id}_stable" in baseline_table)
            for app_id in [
                "org_mozilla_firefox",
                "org_mozilla_fenix_nightly",
                "org_mozilla_fennec_aurora",
                "org_mozilla_firefox_beta",
                "org_mozilla_fenix",
            ]
        ),
    )
    render_kwargs.update(tables)
    job_kwargs = dict(use_legacy_sql=False, dry_run=dry_run)

    query_sql = render(QUERY_FILENAME, **render_kwargs)
    init_sql = render(INIT_FILENAME, **render_kwargs)
    view_sql = render(VIEW_FILENAME, **render_kwargs)
    view_metadata = render(VIEW_METADATA_FILENAME, format=False, **render_kwargs)
    sql = query_sql

    if not referenced_table_exists(view_sql):
        if output_only:
            logging.info("Skipping view for table which doesn't exist:" f" {table_id}")
            return
        elif dry_run:
            logging.info(f"Table does not yet exist: {table_id}")
        else:
            logging.info(f"Creating table: {table_id}")
        sql = init_sql
    elif output_only:
        pass
    else:
        # Table exists, so just overwrite the entire table with the day's results
        job_kwargs.update(
            destination=f"{project_id}.{table_id}",
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            query_parameters=[ScalarQueryParameter("submission_date", "DATE", date)],
        )
        if not dry_run:
            logging.info(f"Running query for: {table_id}")

    if output_dir:
        write_sql(output_dir, view_id, "metadata.yaml", view_metadata)
        write_sql(output_dir, view_id, "view.sql", view_sql)
        write_sql(output_dir, table_id, "query.sql", query_sql)
        write_sql(output_dir, table_id, "init.sql", init_sql)
    if output_only:
        # Return before we initialize the BQ client so that we can generate SQL
        # without having BQ credentials.
        return

    client = bigquery.Client(project_id)
    job_config = bigquery.QueryJobConfig(**job_kwargs)
    job = client.query(sql, job_config)
    if not dry_run:
        job.result()
        logging.info(f"Recreating view {view_id}")
        client.query(view_sql, bigquery.QueryJobConfig(use_legacy_sql=False)).result()
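
Since this variant takes a project ID instead of a client, SQL can be generated without BigQuery credentials: with output_only=True the function returns before bigquery.Client is ever constructed. An illustrative call (names and paths are hypothetical; imports as in the earlier sketch):

run_query(
    "my-project",  # hypothetical project id
    "my-project.org_mozilla_fenix_stable.baseline_v1",
    date(2020, 6, 1),
    dry_run=False,
    output_dir="sql/",
    output_only=True,  # write out query.sql, init.sql, view.sql, metadata.yaml
)

Note that unlike the earlier variants, the destination here carries no $YYYYMMDD partition decorator, so when the table exists WRITE_TRUNCATE replaces the entire first_seen table with the day's results.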