def main(): """Generate GLAM ETL queries.""" parser = ArgumentParser(description=main.__doc__) parser.add_argument("--prefix") parser.add_argument("--dataset", default="glam_etl") parser.add_argument("--sql-root", default="sql/moz-fx-data-shared-prod/") parser.add_argument("--daily-view-only", action="store_true", default=False) args = parser.parse_args() env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates")) dataset_path = Path(args.sql_root) / args.dataset if not dataset_path.is_dir(): raise NotADirectoryError(f"path to {dataset_path} not found") # curry functions for convenience template = partial(from_template, environment=env, dataset_path=dataset_path, args=args) view = partial(template, QueryType.VIEW) table = partial(template, QueryType.TABLE) init = partial(template, QueryType.INIT) # If this is a logical app id, generate it. Assert that the daily view for # the app exists. This assumes that both scalar and histogram aggregates # exist and will break down in the case where a glean app only contains one # of the scalar or histogram view. for daily_view in [ "view_clients_daily_scalar_aggregates_v1", "view_clients_daily_histogram_aggregates_v1", ]: try: view(f"logical_app_id/{args.prefix}__{daily_view}") except TemplateNotFound: print(f"{args.prefix} is not a logical app id") # generate the view for the app id directly view(daily_view) if not (dataset_path / f"{args.prefix}__{daily_view}").is_dir(): raise ValueError(f"missing {daily_view}") # exit early if we're only generating a daily view if args.daily_view_only: return # Supported fenix/firefox for android products. These are logical ids that # are formed from the union of several app_ids (sometimes across date # boundaries). config_schema = { "type": "object", "additionalProperties": { "type": "object", "properties": { "build_date_udf": { "type": "string", "description": ("The fully qualified path to a UDF that accepts the " "client_info.app_build_id field and returns a datetime."), }, "filter_version": { "type": "boolean", "description": ("Whether the integer extracted from the " "client_info.app_display_version should be used to " "filter incremental aggregates."), }, "num_versions_to_keep": { "type": "integer", "minimum": 1, "description": "The number of versions to keep.", }, }, "required": ["build_date_udf", "filter_version", "num_versions_to_keep"], }, } config = { "org_mozilla_fenix_glam_nightly": { "build_date_udf": "`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime", "filter_version": False, # this value is ignored due to filter version "num_versions_to_keep": 1000, }, "org_mozilla_fenix_glam_beta": { "build_date_udf": "`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime", "filter_version": True, "num_versions_to_keep": 2, }, "org_mozilla_fenix_glam_release": { "build_date_udf": "`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime", "filter_version": True, "num_versions_to_keep": 2, }, } validate(instance=config, schema=config_schema) if not config.get(args.prefix, {}).get("build_date_udf"): raise ValueError(f"build date udf for {args.prefix} was not found") [ table( "latest_versions_v1", **dict(source_table=( f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1" )), ), init( "clients_scalar_aggregates_v1", **models.clients_scalar_aggregates( source_table= (f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1" ), destination_table=( f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"), ), ), table( "clients_scalar_aggregates_v1", 
**models.clients_scalar_aggregates( source_table= (f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1" ), destination_table=( f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"), **config[args.prefix], ), ), init( "clients_histogram_aggregates_v1", **models.clients_histogram_aggregates(parameterize=True), ), table( "clients_histogram_aggregates_v1", **models.clients_histogram_aggregates(parameterize=True, **config[args.prefix]), ), table( "scalar_bucket_counts_v1", **models.scalar_bucket_counts( source_table= f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"), ), table( "histogram_bucket_counts_v1", **models.histogram_bucket_counts( source_table= f"glam_etl.{args.prefix}__clients_histogram_aggregates_v1"), ), table( "probe_counts_v1", query_name_prefix="scalar", **models.probe_counts( source_table=f"glam_etl.{args.prefix}__scalar_bucket_counts_v1", is_scalar=True, ), ), table( "probe_counts_v1", query_name_prefix="histogram", **models.probe_counts( source_table= f"glam_etl.{args.prefix}__histogram_bucket_counts_v1", is_scalar=False, ), ), table( "scalar_percentiles_v1", **models.scalar_percentiles( source_table= f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"), ), table("histogram_percentiles_v1"), view("view_probe_counts_v1"), view("view_user_counts_v1", **models.user_counts()), table("extract_user_counts_v1", **config[args.prefix]), table("extract_probe_counts_v1", **config[args.prefix]), ]
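# Illustrative aside (not part of the generator above): because config_schema
# validates every entry under "additionalProperties" with a "required" list,
# each per-app config must carry build_date_udf, filter_version, and
# num_versions_to_keep. A minimal sketch of how jsonschema.validate rejects an
# incomplete entry ("my_app" and "udf.example" are hypothetical placeholders):
#
#     from jsonschema import ValidationError, validate
#
#     try:
#         validate(
#             instance={"my_app": {"build_date_udf": "udf.example"}},
#             schema=config_schema,
#         )
#     except ValidationError as err:
#         print(err.message)  # e.g. "'filter_version' is a required property"
#
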
def main(): """Generate GLAM ETL queries.""" parser = ArgumentParser(description=main.__doc__) parser.add_argument("--prefix") parser.add_argument("--dataset", default="glam_etl") parser.add_argument("--sql-root", default="sql/") args = parser.parse_args() env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates")) dataset_path = Path(args.sql_root) / args.dataset if not dataset_path.is_dir(): raise NotADirectoryError(f"path to {dataset_path} not found") build_date_udf_mapping = dict( org_mozilla_fenix="`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime", org_mozilla_firefox="`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime", org_mozilla_firefox_beta="`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime", # noqa org_mozilla_fennec_aurora="`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime", # noqa ) if not build_date_udf_mapping.get(args.prefix): raise ValueError(f"build date udf for {args.prefix} was not found") # curry functions for convenience template = partial( from_template, environment=env, dataset_path=dataset_path, args=args ) view = partial(template, QueryType.VIEW) table = partial(template, QueryType.TABLE) init = partial(template, QueryType.INIT) [ table( "latest_versions_v1", **dict(source_table=f"{args.prefix}_stable.baseline_v1"), ), init( "clients_scalar_aggregates_v1", **models.clients_scalar_aggregates( source_table=( f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1" ), destination_table=( f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1" ), ), ), table( "clients_scalar_aggregates_v1", **models.clients_scalar_aggregates( source_table=( f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1" ), destination_table=( f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1" ), ), ), init( "clients_histogram_aggregates_v1", **models.clients_histogram_aggregates(parameterize=True), ), table( "clients_histogram_aggregates_v1", **models.clients_histogram_aggregates(parameterize=True), ), table( "scalar_bucket_counts_v1", **models.scalar_bucket_counts( source_table=f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1" ), ), table( "histogram_bucket_counts_v1", **models.histogram_bucket_counts( source_table=f"glam_etl.{args.prefix}__clients_histogram_aggregates_v1" ), ), table( "probe_counts_v1", query_name_prefix="scalar", **models.probe_counts( source_table=f"glam_etl.{args.prefix}__scalar_bucket_counts_v1", is_scalar=True, ), ), table( "probe_counts_v1", query_name_prefix="histogram", **models.probe_counts( source_table=f"glam_etl.{args.prefix}__histogram_bucket_counts_v1", is_scalar=False, ), ), table( "scalar_percentiles_v1", **models.scalar_percentiles( source_table=f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1" ), ), view("view_clients_daily_scalar_aggregates_v1"), view("view_clients_daily_histogram_aggregates_v1"), table("histogram_percentiles_v1"), view("view_probe_counts_v1"), view("view_user_counts_v1", **models.user_counts()), table( "extract_user_counts_v1", build_date_udf=build_date_udf_mapping[args.prefix] ), table( "extract_probe_counts_v1", build_date_udf=build_date_udf_mapping[args.prefix], ), ]
def main(): """Generate GLAM ETL queries.""" parser = ArgumentParser(description=main.__doc__) parser.add_argument("--prefix") parser.add_argument("--dataset", default="glam_etl") parser.add_argument("--sql-root", default="sql/") parser.add_argument("--daily-view-only", action="store_true", default=False) args = parser.parse_args() env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates")) dataset_path = Path(args.sql_root) / args.dataset if not dataset_path.is_dir(): raise NotADirectoryError(f"path to {dataset_path} not found") # curry functions for convenience template = partial(from_template, environment=env, dataset_path=dataset_path, args=args) view = partial(template, QueryType.VIEW) table = partial(template, QueryType.TABLE) init = partial(template, QueryType.INIT) # If this is a logical app id, generate it. Assert that the daily view for # the app exists. This assumes that both scalar and histogram aggregates # exist and will break down in the case where a glean app only contains one # of the scalar or histogram view. for daily_view in [ "view_clients_daily_scalar_aggregates_v1", "view_clients_daily_histogram_aggregates_v1", ]: try: view(f"logical_app_id/{args.prefix}__{daily_view}") except TemplateNotFound: print(f"{args.prefix} is not a logical app id") # generate the view for the app id directly view(daily_view) if not (dataset_path / f"{args.prefix}__{daily_view}").is_dir(): raise ValueError(f"missing {daily_view}") # exit early if we're only generating a daily view if args.daily_view_only: return # Supported fenix/firefox for android products. These are logical ids that # are formed from the union of several app_ids (sometimes across date # boundaries). fenix_app_ids = [ "org_mozilla_fenix_glam_nightly", "org_mozilla_fenix_glam_beta", "org_mozilla_fenix_glam_release", ] build_date_udf_mapping = dict( **{ app_id: "`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime" for app_id in fenix_app_ids }) if not build_date_udf_mapping.get(args.prefix): raise ValueError(f"build date udf for {args.prefix} was not found") [ table( "latest_versions_v1", **dict(source_table=( f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1" )), ), init( "clients_scalar_aggregates_v1", **models.clients_scalar_aggregates( source_table= (f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1" ), destination_table=( f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"), ), ), table( "clients_scalar_aggregates_v1", **models.clients_scalar_aggregates( source_table= (f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1" ), destination_table=( f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"), ), ), init( "clients_histogram_aggregates_v1", **models.clients_histogram_aggregates(parameterize=True), ), table( "clients_histogram_aggregates_v1", **models.clients_histogram_aggregates(parameterize=True), ), table( "scalar_bucket_counts_v1", **models.scalar_bucket_counts( source_table= f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"), ), table( "histogram_bucket_counts_v1", **models.histogram_bucket_counts( source_table= f"glam_etl.{args.prefix}__clients_histogram_aggregates_v1"), ), table( "probe_counts_v1", query_name_prefix="scalar", **models.probe_counts( source_table=f"glam_etl.{args.prefix}__scalar_bucket_counts_v1", is_scalar=True, ), ), table( "probe_counts_v1", query_name_prefix="histogram", **models.probe_counts( source_table= f"glam_etl.{args.prefix}__histogram_bucket_counts_v1", is_scalar=False, ), ), table( "scalar_percentiles_v1", 
**models.scalar_percentiles( source_table= f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"), ), table("histogram_percentiles_v1"), view("view_probe_counts_v1"), view("view_user_counts_v1", **models.user_counts()), table("extract_user_counts_v1", build_date_udf=build_date_udf_mapping[args.prefix]), table( "extract_probe_counts_v1", build_date_udf=build_date_udf_mapping[args.prefix], ), ]
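
# Example entry point, a minimal sketch rather than part of the generator
# above: it simply dispatches to main(). The CLI flags (--prefix, --dataset,
# --sql-root, --daily-view-only) come from the ArgumentParser defined in
# main(); the module path below is an assumption inferred from the
# PackageLoader("bigquery_etl", "glam/templates") call, and the prefix is one
# of the logical app ids configured above.
#
#     python3 -m bigquery_etl.glam.generate --prefix org_mozilla_fenix_glam_beta
#
if __name__ == "__main__":
    main()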