def bigquery_xcom_query(
    destination_table,
    dataset_id,
    xcom_task_id,
    parameters=(),
    arguments=(),
    project_id=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    date_partition_parameter="submission_date",
    **kwargs
):
    """
    Generate a GKEPodOperator which runs an xcom result as a bigquery query.

    :param str destination_table: [Required] BigQuery destination table
    :param str dataset_id: [Required] BigQuery default dataset id
    :param str xcom_task_id: [Required] task_id which generated the xcom to pull
    :param Tuple[str] parameters: Parameters passed to bq query
    :param Tuple[str] arguments: Additional bq query arguments
    :param Optional[str] project_id: BigQuery default project id
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param Optional[str] date_partition_parameter: Parameter for indicating
        destination partition to generate, if None destination should be
        whole table rather than partition
    :param Dict[str, Any] kwargs: Additional keyword arguments for
        GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    if destination_table is not None and date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}",)
    query = "{{ " + "task_instance.xcom_pull({!r})".format(xcom_task_id) + " }}"
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["bq", "query"]
        + (["--destination_table=" + destination_table] if destination_table else [])
        + ["--dataset_id=" + dataset_id]
        + (["--project_id=" + project_id] if project_id else [])
        + ["--parameter=" + parameter for parameter in parameters]
        + list(arguments)
        + [query],
        **kwargs
    )
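A minimal usage sketch for bigquery_xcom_query. The import path utils.gcp, the DAG, and the upstream generate_sql task are assumptions for illustration: an upstream PythonOperator pushes a SQL string as its XCom return value, and this helper renders it at runtime via task_instance.xcom_pull.

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from utils.gcp import bigquery_xcom_query  # assumed import path

with DAG(
    "example_xcom_query",  # illustrative DAG
    default_args={"start_date": datetime(2020, 1, 1)},
    schedule_interval="0 1 * * *",
) as dag:
    # The callable's return value is pushed as the task's xcom.
    generate_sql = PythonOperator(
        task_id="generate_sql",
        python_callable=lambda: "SELECT 1 AS n",  # illustrative SQL
    )

    # Pulls the SQL from the xcom and runs it, writing to the
    # {{ds_nodash}} partition of telemetry_derived.example_v1.
    run_query = bigquery_xcom_query(
        destination_table="example_v1",
        dataset_id="telemetry_derived",
        xcom_task_id="generate_sql",
    )

    generate_sql >> run_query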
def bigquery_etl_query(
    destination_table,
    dataset_id,
    parameters=(),
    arguments=(),
    project_id=None,
    sql_file_path=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    date_partition_parameter="submission_date",
    multipart=False,
    **kwargs
):
    """
    Generate a GKEPodOperator which runs a bigquery-etl query.

    :param str destination_table: [Required] BigQuery destination table
    :param str dataset_id: [Required] BigQuery default dataset id
    :param Tuple[str] parameters: Parameters passed to bq query
    :param Tuple[str] arguments: Additional bq query arguments
    :param Optional[str] project_id: BigQuery default project id
    :param Optional[str] sql_file_path: Optional override for path to the
        SQL query file to run
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param Optional[str] date_partition_parameter: Parameter for indicating
        destination partition to generate, if None destination should be
        whole table rather than partition
    :param bool multipart: Run the query via script/run_multipart_query
        instead of as a single query
    :param Dict[str, Any] kwargs: Additional keyword arguments for
        GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    sql_file_path = sql_file_path or "sql/{}/{}/query.sql".format(
        dataset_id, destination_table)
    if destination_table is not None and date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}",)
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/run_multipart_query" if multipart else "query"]
        + (["--destination_table=" + destination_table] if destination_table else [])
        + ["--dataset_id=" + dataset_id]
        + (["--project_id=" + project_id] if project_id else [])
        + ["--parameter=" + parameter for parameter in parameters]
        + list(arguments)
        + [sql_file_path],
        **kwargs)
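A usage sketch for this variant of bigquery_etl_query. With the defaults above, the call below would run sql/telemetry_derived/example_v1/query.sql and write to the {{ds_nodash}} date partition, passing submission_date:DATE:{{ds}} as a query parameter. The import path, DAG, and table name are assumptions for illustration.

from datetime import datetime

from airflow import DAG
from utils.gcp import bigquery_etl_query  # assumed import path

with DAG(
    "example_bqetl",  # illustrative DAG
    default_args={"start_date": datetime(2020, 1, 1)},
    schedule_interval="0 2 * * *",
) as dag:
    # task_id and name default to the destination table; the SQL file
    # defaults to sql/telemetry_derived/example_v1/query.sql.
    example_v1 = bigquery_etl_query(
        destination_table="example_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
    )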
def bigquery_etl_copy_deduplicate(
    task_id,
    target_project_id,
    only_tables=None,
    except_tables=None,
    parallelism=4,
    priority="INTERACTIVE",
    hourly=False,
    slices=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    **kwargs
):
    """
    Copy a day's data from live ping tables to stable ping tables,
    deduplicating on document_id.

    :param str task_id: [Required] ID for the task
    :param str target_project_id: [Required] ID of project where target tables live
    :param Tuple[str] only_tables: Only process tables matching the given globs
        of form 'telemetry_live.main_v*'
    :param Tuple[str] except_tables: Process all tables except those matching
        the given globs
    :param int parallelism: Maximum number of queries to execute concurrently
    :param str priority: BigQuery query priority to use, must be BATCH or
        INTERACTIVE
    :param bool hourly: Alias for --slices=24
    :param int slices: Number of time-based slices to deduplicate in, rather
        than for whole days at once
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param Dict[str, Any] kwargs: Additional keyword arguments for
        GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    table_qualifiers = []
    if only_tables:
        table_qualifiers.append('--only')
        table_qualifiers += only_tables
    if except_tables:
        table_qualifiers.append('--except')
        table_qualifiers += except_tables
    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/copy_deduplicate"]
        + ["--project-id=" + target_project_id]
        + ["--date={{ds}}"]
        + ["--parallelism={}".format(parallelism)]
        + ["--priority={}".format(priority)]
        + (["--hourly"] if hourly else [])
        + (["--slices={}".format(slices)] if slices is not None else [])
        + table_qualifiers,
        **kwargs)
def bigquery_etl_copy_deduplicate(
    task_id,
    target_project_id,
    only_tables=None,
    except_tables=None,
    parallelism=4,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    image_pull_policy="Always",
    **kwargs
):
    """
    Copy a day's data from live ping tables to stable ping tables,
    deduplicating on document_id.

    :param str task_id: [Required] ID for the task
    :param str target_project_id: [Required] ID of project where target tables live
    :param Tuple[str] only_tables: Only process tables matching the given globs
        of form 'telemetry_live.main_v*'
    :param Tuple[str] except_tables: Process all tables except those matching
        the given globs
    :param int parallelism: Maximum number of queries to execute concurrently
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param str image_pull_policy: Kubernetes policy for when to pull
        docker_image
    :param Dict[str, Any] kwargs: Additional keyword arguments for
        GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    table_qualifiers = []
    if only_tables:
        table_qualifiers.append('--only')
        table_qualifiers += only_tables
    if except_tables:
        table_qualifiers.append('--except')
        table_qualifiers += except_tables
    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/copy_deduplicate"]
        + ["--project-id=" + target_project_id]
        + ["--date={{ds}}"]
        + ["--parallelism={}".format(parallelism)]
        + table_qualifiers,
        image_pull_policy=image_pull_policy,
        **kwargs
    )
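A usage sketch for bigquery_etl_copy_deduplicate (either variant above). The import path and DAG are assumptions; the glob mirrors the 'telemetry_live.main_v*' form documented in the docstring.

from datetime import datetime

from airflow import DAG
from utils.gcp import bigquery_etl_copy_deduplicate  # assumed import path

with DAG(
    "example_copy_deduplicate",  # illustrative DAG
    default_args={"start_date": datetime(2020, 1, 1)},
    schedule_interval="0 1 * * *",
) as dag:
    # Deduplicate only the live main ping tables into their stable
    # counterparts, with higher parallelism for the large tables.
    copy_deduplicate_main_ping = bigquery_etl_copy_deduplicate(
        task_id="copy_deduplicate_main_ping",
        target_project_id="moz-fx-data-shared-prod",
        only_tables=("telemetry_live.main_v*",),
        parallelism=24,
    )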
def gke_command(
    task_id,
    command,
    docker_image,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    image_pull_policy="Always",
    xcom_push=False,
    env_vars={},
    **kwargs
):
    """
    Run a docker command on GKE.

    :param str task_id: [Required] ID for the task
    :param List[str] command: [Required] Command to run
    :param str docker_image: [Required] docker image to use
    :param str aws_conn_id: Airflow connection id for AWS access
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str image_pull_policy: Kubernetes policy for when to pull
        docker_image
    :param bool xcom_push: Return the output of this command as an xcom
    :param Dict[str, str] env_vars: Additional environment variables to set
        in the pod
    :param Dict[str, Any] kwargs: Additional keyword arguments for
        GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    context_env_vars = {
        key: value
        for key, value in zip(
            ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"),
            AwsHook(aws_conn_id).get_credentials() if aws_conn_id else (),
        )
        if value is not None
    }
    context_env_vars["XCOM_PUSH"] = json.dumps(xcom_push)
    context_env_vars.update(env_vars)
    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=command,
        image_pull_policy=image_pull_policy,
        xcom_push=xcom_push,
        env_vars=context_env_vars,
        **kwargs
    )
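A usage sketch for gke_command; the DAG, module, and command below are illustrative assumptions. AWS credentials from aws_conn_id are exposed to the pod as AWS_* environment variables, and XCOM_PUSH tells the container whether Airflow expects output to collect.

from datetime import datetime

from airflow import DAG
from utils.gcp import gke_command  # assumed import path

with DAG(
    "example_gke_command",  # illustrative DAG
    default_args={"start_date": datetime(2020, 1, 1)},
    schedule_interval="@daily",
) as dag:
    # Run an arbitrary containerized command on the shared GKE cluster.
    export_task = gke_command(
        task_id="example_export",
        command=["python3", "-m", "example.export", "--date={{ ds }}"],  # illustrative
        docker_image="mozilla/bigquery-etl:latest",
    )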
def bigquery_etl_query(
    destination_table,
    parameters=(),
    arguments=(),
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    image_pull_policy="Always",
    date_partition_parameter="submission_date",
    **kwargs
):
    """
    Generate a GKEPodOperator which runs a bigquery-etl query.

    :param str destination_table: [Required] BigQuery destination table
    :param Tuple[str] parameters: Parameters passed to bq query
    :param Tuple[str] arguments: Additional bq query arguments
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param str image_pull_policy: Kubernetes policy for when to pull
        docker_image
    :param Optional[str] date_partition_parameter: Parameter for indicating
        destination partition to generate, if None destination should be
        whole table rather than partition
    :param Dict[str, Any] kwargs: Additional keyword arguments for
        GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    sql_file_path = "sql/{}.sql".format(destination_table)
    if date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}",)
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["query"]
        + ["--destination_table=" + destination_table]
        + ["--parameter=" + parameter for parameter in parameters]
        + list(arguments)
        + [sql_file_path],
        image_pull_policy=image_pull_policy,
        **kwargs
    )
def burnham_bigquery_run(
    task_id,
    project_id,
    burnham_test_run,
    burnham_test_scenarios,
    gcp_conn_id=DEFAULT_GCP_CONN_ID,
    gke_location=DEFAULT_GKE_LOCATION,
    gke_cluster_name=DEFAULT_GKE_CLUSTER_NAME,
    gke_namespace=DEFAULT_GKE_NAMESPACE,
    **kwargs,
):
    """Create a new GKEPodOperator that runs the burnham-bigquery Docker image.

    :param str task_id: [Required] ID for the task
    :param str project_id: [Required] Project ID where target table lives
    :param str burnham_test_run: [Required] UUID for the test run
    :param str burnham_test_scenarios: [Required] Encoded burnham test scenarios
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param Dict[str, Any] kwargs: Additional kwargs for GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image="gcr.io/moz-fx-data-airflow-prod-88e0/burnham-bigquery:latest",
        image_pull_policy="Always",
        arguments=[
            "-vv",
            "--project-id",
            project_id,
            "--run-id",
            burnham_test_run,
            "--scenarios",
            burnham_test_scenarios,
            "--results-table",
            "burnham_derived.test_results_v1",
            "--log-url",
            "{{ task_instance.log_url }}",
            "--start-timestamp",
            "{{ dag_run.start_date.isoformat() }}",
        ],
        **kwargs,
    )
def burnham_run(
    task_id,
    burnham_test_run,
    burnham_test_name,
    burnham_missions,
    burnham_spore_drive=None,
    gcp_conn_id=DEFAULT_GCP_CONN_ID,
    gke_location=DEFAULT_GKE_LOCATION,
    gke_cluster_name=DEFAULT_GKE_CLUSTER_NAME,
    gke_namespace=DEFAULT_GKE_NAMESPACE,
    **kwargs,
):
    """Create a new GKEPodOperator that runs the burnham Docker image.

    :param str task_id: [Required] ID for the task
    :param str burnham_test_run: [Required] UUID for the test run
    :param str burnham_test_name: [Required] Name for the test item
    :param List[str] burnham_missions: [Required] List of mission identifiers
    :param Optional[str] burnham_spore_drive: Interface for the spore-drive
        technology
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param Dict[str, Any] kwargs: Additional kwargs for GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    env_vars = {
        "BURNHAM_PLATFORM_URL": BURNHAM_PLATFORM_URL,
        "BURNHAM_TEST_RUN": burnham_test_run,
        "BURNHAM_TEST_NAME": burnham_test_name,
        "BURNHAM_VERBOSE": "true",
        "GLEAN_LOG_PINGS": "true",
    }
    if burnham_spore_drive is not None:
        env_vars["BURNHAM_SPORE_DRIVE"] = burnham_spore_drive
    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image="gcr.io/moz-fx-data-airflow-prod-88e0/burnham:latest",
        image_pull_policy="Always",
        env_vars=env_vars,
        arguments=burnham_missions,
        **kwargs,
    )
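A sketch pairing burnham_run with burnham_bigquery_run. The DAG, import path, test name, mission identifier, and scenario encoding are illustrative assumptions; a real DAG would generate the run UUID and scenarios per run.

import uuid
from datetime import datetime

from airflow import DAG
from utils.burnham import burnham_bigquery_run, burnham_run  # assumed import path

with DAG(
    "example_burnham",  # illustrative DAG
    default_args={"start_date": datetime(2020, 1, 1)},
    schedule_interval="@daily",
) as dag:
    burnham_test_run = str(uuid.uuid4())  # illustrative run id

    # Launch one burnham client with a spore-drive interface enabled.
    client1 = burnham_run(
        task_id="client1",
        burnham_test_run=burnham_test_run,
        burnham_test_name="test_labeled_counter_metrics",  # illustrative
        burnham_missions=["MISSION A: ONE WARP"],  # illustrative
        burnham_spore_drive="tcp4",
    )

    # Then verify the resulting pings in BigQuery.
    verify = burnham_bigquery_run(
        task_id="verify_data",
        project_id="moz-fx-data-shared-prod",
        burnham_test_run=burnham_test_run,
        burnham_test_scenarios="[]",  # illustrative encoded scenarios
    )

    client1 >> verify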
def simpleprophet_forecast(
    task_id,
    datasource,
    project_id,
    dataset_id,
    table_id,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="gcr.io/moz-fx-data-forecasting/simpleprophet:latest",
    image_pull_policy="Always",
    **kwargs
):
    """Run all simpleprophet models for the given datasource and model date.

    :param str task_id: [Required] ID for the task
    :param str datasource: [Required] One of desktop, mobile, fxa
    :param str project_id: [Required] ID of project where target table lives
    :param str dataset_id: [Required] ID of dataset where target table lives
    :param str table_id: [Required] ID of target table
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param str image_pull_policy: Kubernetes policy for when to pull
        docker_image
    :param Dict[str, Any] kwargs: Additional keyword arguments for
        GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["{{ds}}"]
        + ["--datasource=" + datasource]
        + ["--project-id=" + project_id]
        + ["--dataset-id=" + dataset_id]
        + ["--table-id=" + table_id],
        image_pull_policy=image_pull_policy,
        **kwargs
    )
    objects_prefix,
    "--cluster-by",
    "crash_date",
]

# We remove the current date partition for idempotency.
remove_bq_table_partition = BigQueryTableDeleteOperator(
    task_id="remove_bq_table_partition",
    bigquery_conn_id=bq_gcp_conn_id,
    deletion_dataset_table="{}.{}${{{{ds_nodash}}}}".format(
        bq_dataset, bq_table_name),
    ignore_if_missing=True,
    dag=dag,
)

bq_load = GKEPodOperator(
    task_id="bigquery_load",
    gcp_conn_id=bq_gcp_conn_id,
    project_id=bq_connection.project_id,
    name="load-socorro-crash-parquet-to-bq",
    image=docker_image,
    arguments=gke_args,
    env_vars={
        "GOOGLE_CLOUD_PROJECT": "{{ var.value.gcp_shared_prod_project }}"
    },
    dag=dag,
)

s3_to_gcs >> crash_report_parquet
crash_report_parquet >> remove_bq_table_partition >> bq_load
def container_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    server_id,
    env_vars={},
    arguments=[],
    machine_type="n1-standard-1",
    image="mozilla/prio-processor:latest",
    location="us-west1-b",
    owner_label="amiyaguchi",
    team_label="dataeng",
):
    """Run a command on an ephemeral container running the
    `mozilla/prio-processor:latest` image.

    :param str parent_dag_name: Name of the parent DAG.
    :param str child_dag_name: Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id: Name of the connection string.
    :param str service_account: The address of the service account.
    :param str server_id: The identifier for the Prio processor
    :param Dict[str, str] env_vars: Environment variables for configuring the processor.
    :param List[str] arguments: The command to run after loading the image.
    :param str machine_type: The machine type for running the image.
    :param str image: Dockerhub image
    :param str location: The region of the GKE cluster.
    :param str owner_label: Label for associating the owner
    :param str team_label: Label for associating the team

    :return: DAG
    """
    assert server_id in ["a", "b", "admin"]

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    cluster_name = "gke-prio-{}".format(server_id)

    shared_config = {
        "project_id": connection.project_id,
        "gcp_conn_id": gcp_conn_id,
        "location": location,
    }

    with DAG("{}.{}".format(parent_dag_name, child_dag_name),
             default_args=default_args) as dag:
        create_gke_cluster = GKEClusterCreateOperator(
            task_id="create_gke_cluster",
            body=create_gke_config(
                name=cluster_name,
                service_account=service_account,
                owner_label=owner_label,
                team_label=team_label,
                machine_type=machine_type,
                # DataProc clusters require VPC with auto-created subnets
                subnetwork="default" if server_id == "admin" else "gke-subnet",
                is_dev=environ.get("DEPLOY_ENVIRONMENT") == "dev",
            ),
            dag=dag,
            **shared_config)

        # Running the pod without any time in-between will cause the scope-based
        # authentication in Google Cloud Platform to fail. For example:
        #
        # `ServiceException: 401 Anonymous caller does not have
        # storage.objects.get access to moz-fx-prio-dev-a-private/processed/`
        #
        # Sleeping by a small amount solves this problem. This issue was first
        # noticed intermittently on 2019-09-09.
        sleep = BashOperator(task_id="sleep", bash_command="sleep 60", dag=dag)

        run_prio = GKEPodOperator(
            task_id="processor_{}".format(server_id),
            name="run-prio-project-{}".format(server_id),
            cluster_name=cluster_name,
            namespace="default",
            image=image,
            arguments=arguments,
            env_vars=env_vars,
            dag=dag,
            **shared_config)

        delete_gke_cluster = GKEClusterDeleteOperator(
            task_id="delete_gke_cluster",
            name=cluster_name,
            trigger_rule="all_done",
            dag=dag,
            **shared_config)

        create_gke_cluster >> sleep >> run_prio >> delete_gke_cluster

        return dag
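container_subdag returns a DAG, so it is typically wrapped in a SubDagOperator. A sketch under assumed names: the enclosing dag and default_args, the connection id, the service account, and the command are illustrative.

from airflow.operators.subdag_operator import SubDagOperator

processor_a = SubDagOperator(
    subdag=container_subdag(
        parent_dag_name=dag.dag_id,  # assumes an enclosing DAG named `dag`
        child_dag_name="processor_a",
        default_args=default_args,
        gcp_conn_id="google_cloud_prio_a",  # illustrative connection id
        service_account="prio-a@example-project.iam.gserviceaccount.com",  # illustrative
        server_id="a",
        arguments=["bin/process"],  # illustrative command
        env_vars={"DATA_CONFIG": "/app/config/content.json"},  # illustrative
    ),
    task_id="processor_a",
    dag=dag,
)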
# Cluster autoscaling works on pod resource requests, instead of usage
resources = {
    'request_memory': '13312Mi',
    'request_cpu': None,
    'limit_memory': '20480Mi',
    'limit_cpu': None,
    'limit_gpu': None,
}

probe_scraper = GKEPodOperator(
    task_id="probe_scraper",
    name='probe-scraper',
    # Needed to scale the highmem pool from 0 -> 1
    resources=resources,
    # This python job requires 13 GB of memory, thus the highmem node pool
    node_selectors={"nodepool": "highmem"},
    # Due to the nature of the container run, we set get_logs to False
    # to avoid urllib3.exceptions.ProtocolError: 'Connection broken: IncompleteRead(0 bytes read)' errors
    # where the pod continues to run, but airflow loses its connection and sets the status to Failed
    get_logs=False,
    # Give additional time since we will likely always scale up when running this job
    startup_timeout_seconds=360,
    image=probe_scraper_image,
    arguments=probe_scraper_args,
    email=['*****@*****.**', '*****@*****.**', '*****@*****.**', '*****@*****.**'],
    env_vars={
        "AWS_ACCESS_KEY_ID": aws_access_key,
        "AWS_SECRET_ACCESS_KEY": aws_secret_key,
    },
    dag=dag)

schema_generator = GKEPodOperator(
    email=['*****@*****.**'],
    task_id='mozilla_schema_generator',
    name='schema-generator-1',
baseline_etl_kwargs = dict(
    gcp_conn_id=gcp_conn_id,
    project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
    location="us-central1-a",
    cluster_name="bq-load-gke-1",
    namespace="default",
    image="mozilla/bigquery-etl:latest",
)

baseline_args = [
    "--project-id=moz-fx-data-shared-prod",
    "--date={{ ds }}",
    "--only=*_stable.baseline_v1",
]

baseline_clients_daily = GKEPodOperator(
    task_id='baseline_clients_daily',
    name='baseline-clients-daily',
    arguments=["script/run_glean_baseline_clients_daily"] + baseline_args,
    **baseline_etl_kwargs
)

baseline_clients_last_seen = GKEPodOperator(
    task_id='baseline_clients_last_seen',
    name='baseline-clients-last-seen',
    arguments=["script/run_glean_baseline_clients_last_seen"] + baseline_args,
    depends_on_past=True,
    **baseline_etl_kwargs
)

(copy_deduplicate_all >> baseline_clients_daily >> baseline_clients_last_seen)
    timedelta(minutes=30),
}

with DAG("public_analysis", default_args=default_args, schedule_interval="0 1 * * *") as dag:

    # Built from https://github.com/mozilla/forecasting/tree/master/anomdtct
    anomdtct_image = "gcr.io/moz-fx-data-forecasting/anomdtct:latest"

    anomdtct = GKEPodOperator(
        task_id="anomdtct",
        name="anomdtct",
        image=anomdtct_image,
        email=[
            "*****@*****.**",
            "*****@*****.**",
        ],
        arguments=["{{ds}}"]
        + ["--spreadsheet-id=" + Variable.get('anomdtct_spreadsheet_id')]
        + ["--spreadsheet-key=" + Variable.get('anomdtct_spreadsheet_api_key')],
        dag=dag,
    )

    wait_for_clients_first_seen = ExternalTaskSensor(
        task_id="wait_for_clients_first_seen",
        external_dag_id="main_summary",
        external_task_id="clients_first_seen",
        dag=dag,
    )

    anomdtct.set_upstream([
with DAG("jetstream", default_args=default_args, schedule_interval="0 4 * * *") as dag:

    # Built from repo https://github.com/mozilla/jetstream
    jetstream_image = "gcr.io/moz-fx-data-experiments/jetstream:latest"

    jetstream = GKEPodOperator(
        task_id="jetstream",
        name="jetstream",
        image=jetstream_image,
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        arguments=[
            "run-argo",
            "--date={{ ds }}",
            # the Airflow cluster doesn't have Compute Engine API access so pass in IP
            # and certificate in order for the pod to connect to the Kubernetes cluster
            # running Jetstream
            "--cluster-ip={{ var.value.jetstream_cluster_ip }}",
            "--cluster-cert={{ var.value.jetstream_cluster_cert }}",
        ],
        dag=dag,
    )

    wait_for_clients_daily_export = ExternalTaskSensor(
        task_id="wait_for_clients_daily",
        external_dag_id="bqetl_main_summary",
        external_task_id="telemetry_derived__clients_daily__v6",
        execution_delta=timedelta(hours=2),
"--iso-date={{ ds_nodash }}", "--gcp-project=%s" % TAAR_PROFILE_PROJECT_ID, "--avro-gcs-bucket=%s" % TAAR_ETL_STORAGE_BUCKET, "--bigtable-instance-id=%s" % TAAR_BIGTABLE_INSTANCE_ID, "--sample-rate=1.0", "--subnetwork=%s" % TAAR_DATAFLOW_SUBNETWORK, ] wipe_gcs_bucket = GKEPodOperator( owner="*****@*****.**", email=[ "*****@*****.**", "*****@*****.**", "*****@*****.**" ], task_id="wipe_taar_gcs_bucket", name="wipe_taar_gcs_bucket", image="google/cloud-sdk:242.0.0-alpine", arguments=wipe_gcs_files(), location="us-central1-a", cluster_name="bq-load-gke-1", dag=taar_weekly, ) dump_bq_to_tmp_table = GKEPodOperator( owner="*****@*****.**", email=[ "*****@*****.**", "*****@*****.**", "*****@*****.**" ], task_id="dump_bq_to_tmp_table", name="dump_bq_to_tmp_table", image=TAAR_ETL_CONTAINER_IMAGE,
"email": ["*****@*****.**", "*****@*****.**", "*****@*****.**"], "email_on_failure": True, "email_on_retry": True, "retries": 0, "retry_delay": timedelta(minutes=30), } dag = DAG("taar_daily", default_args=default_args, schedule_interval="0 4 * * *") amodump = GKEPodOperator( task_id="taar_amodump", name="taar-amodump", # This uses a circleci built docker image from github.com/mozilla/taar_gcp_etl image="gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.1", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**", "*****@*****.**"], arguments=["-m", "taar_etl.taar_amodump", "--date", "{{ ds_nodash }}"], env_vars={ "AWS_ACCESS_KEY_ID": taar_aws_access_key, "AWS_SECRET_ACCESS_KEY": taar_aws_secret_key, }, dag=dag, ) amowhitelist = GKEPodOperator( task_id="taar_amowhitelist", name="taar-amowhitelist", # This uses a circleci built docker image from github.com/mozilla/taar_gcp_etl image="gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.1", owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**", "*****@*****.**"], # We are extracting addons from the AMO server's APIs which don't
s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
    task_id='s3_to_gcs',
    s3_bucket='net-mozaws-data-us-west-2-data-analysis',
    gcs_bucket='moz-fx-data-derived-datasets-blpadi',
    description='blpadi copy from s3 to gcs',
    aws_conn_id='aws_data_iam_blpadi',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions=gcstj_object_conditions,
    transfer_options=gcstj_transfer_options,
    timeout=720,
    dag=blp_dag)

load_blpadi_to_bq = GKEPodOperator(task_id='bigquery_load',
                                   name='load-blpadi-to-bq',
                                   image='google/cloud-sdk:242.0.0-alpine',
                                   arguments=bq_args,
                                   dag=blp_dag)

blp_logs.set_downstream(blp_job_sensor)
blp_job_sensor.set_downstream(s3_to_gcs)
s3_to_gcs.set_downstream(load_blpadi_to_bq)

amo_dag = DAG('mango_log_processing_amo',
              default_args=DEFAULT_ARGS,
              dagrun_timeout=timedelta(hours=6),
              schedule_interval='0 3 * * *')

amo_logs = EmrCreateJobFlowOperator(task_id='amo_create_job_flow',
                                    job_flow_overrides={'Steps': AMO_STEPS},
                                    aws_conn_id='aws_data_iam',
        default_args=default_args,
        schedule_interval="0 2 * * *") as dag:

    # Make sure all the data for the given day has arrived before running.
    wait_for_main_ping = ExternalTaskSensor(
        task_id="wait_for_main_ping",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_main_ping",
        execution_delta=timedelta(hours=1),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
        dag=dag,
    )

    # Built from repo https://github.com/mozilla/webrender_intel_win10_nightly
    webrender_ds_283 = GKEPodOperator(
        task_id="webrender_ds_283",
        name="webrender_ds_283",
        image="gcr.io/moz-fx-ds-283/ds_283_prod:latest",
        env_vars=dict(
            BUCKET="gs://moz-fx-ds-283",
            PROJECT_ID="moz-fx-data-shared-prod",
            # source dataset, results are written to the analysis dataset
            DATASET="telemetry",
        ),
        dag=dag,
    )

    wait_for_main_ping >> webrender_ds_283
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20191017',  # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):

    """
    Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job to transfer the AWS S3 parquet data into a GCS Bucket.
    Once that is completed we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name: parent dag name
    :param dict default_args: dag configuration
    :param str dataset_s3_bucket: source S3 Bucket
    :param str dataset_gcs_bucket: destination GCS Bucket
    :param str aws_conn_id: airflow connection id for S3 access
    :param str gcp_conn_id: airflow connection id for GCP access
    :param str dataset: dataset name
    :param str dataset_version: dataset version
    :param str date_submission_col: dataset date submission column
    :param str ds_type: dataset format (ds or ds_nodash)
    :param str gke_location: GKE cluster zone
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use for GKE pod operations  # noqa
    :param str bigquery_dataset: bigquery load destination dataset
    :param str p2b_concurrency: number of processes for parquet2bigquery load
    :param str p2b_table_alias: override p2b table name with alias
    :param bool p2b_resume: allow resume support, defaults to False
    :param bool reprocess: enable dataset reprocessing, defaults to False
    :param str objects_prefix: custom objects_prefix to override defaults
    :param str spark_gs_dataset_location: custom spark dataset load location
        to override defaults
    :param List[str] cluster_by: top level fields to cluster by when creating
        destination table
    :param List[str] drop: top level fields to exclude from destination table
    :param Dict[str, str] rename: top level fields to rename in destination
        table
    :param List[str] replace: top level field replacement expressions

    :return airflow.models.DAG
    """

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)
    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes': _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
    ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]
    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

    bq_table_name = p2b_table_alias or normalize_table_id('_'.join([dataset,
                                                                    dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if dataset_s3_bucket is not None:
            s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
                task_id='s3_to_gcs',
                s3_bucket=dataset_s3_bucket,
                gcs_bucket=gcs_buckets['transfer'],
                description=_objects_prefix,
                aws_conn_id=aws_conn_id,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                object_conditions=gcstj_object_conditions,
                transfer_options=gcstj_transfer_options,
            )
        else:
            s3_to_gcs = DummyOperator(task_id='no_s3_to_gcs')

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bigquery_dataset, bq_table_name),  # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
        )

        s3_to_gcs >> reprocess >> remove_bq_table >> bulk_load

        return dag
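load_to_bigquery likewise returns a DAG for use with SubDagOperator. A sketch with illustrative bucket, connection, and dataset names; the enclosing dag and default_args are assumed to exist.

from airflow.operators.subdag_operator import SubDagOperator

main_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,  # assumes an enclosing DAG named `dag`
        dag_name="main_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",  # illustrative bucket
        aws_conn_id="aws_dev_iam_s3",  # illustrative connection id
        dataset="main_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_derived",
    ),
    task_id="main_summary_bigquery_load",
    dag=dag,
)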
    'crash_date'
]

# We remove the current date partition for idempotency.
remove_bq_table_partition = BigQueryTableDeleteOperator(
    task_id='remove_bq_table_partition',
    bigquery_conn_id=bq_gcp_conn_id,
    deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(
        bq_dataset, bq_table_name),  # noqa
    ignore_if_missing=True,
    dag=dag)

bq_load = GKEPodOperator(
    task_id='bigquery_load',
    gcp_conn_id=bq_gcp_conn_id,
    project_id=bq_connection.project_id,
    name='load-socorro-crash-parquet-to-bq',
    image=docker_image,
    arguments=gke_args,
    dag=dag,
)

register_status(
    bq_load,
    "Socorro Crash Reports Parquet",
    "Convert processed crash reports into parquet for analysis",
)

s3_to_gcs >> crash_report_parquet
crash_report_parquet >> remove_bq_table_partition >> bq_load
"email_on_retry": True, "retries": 2, } with DAG( "bqetl_public_data_json", default_args=default_args, schedule_interval="0 4 * * *" ) as dag: docker_image = "mozilla/bigquery-etl:latest" export_public_data_json_telemetry_derived__ssl_ratios__v1 = GKEPodOperator( task_id="export_public_data_json_telemetry_derived__ssl_ratios__v1", name="export_public_data_json_telemetry_derived__ssl_ratios__v1", arguments=["script/publish_public_data_json"] + [ "--query_file=sql/moz-fx-data-shared-prod/telemetry_derived/ssl_ratios_v1/query.sql" ] + ["--destination_table=ssl_ratios${{ds_nodash}}"] + ["--dataset_id=telemetry_derived"] + ["--project_id=moz-fx-data-shared-prod"] + ["--parameter=submission_date:DATE:{{ds}}"], image=docker_image, dag=dag, ) wait_for_telemetry_derived__ssl_ratios__v1 = ExternalTaskSensor( task_id="wait_for_telemetry_derived__ssl_ratios__v1", external_dag_id="bqetl_ssl_ratios", external_task_id="telemetry_derived__ssl_ratios__v1", execution_delta=datetime.timedelta(seconds=7200), check_existence=True, mode="reschedule", pool="DATA_ENG_EXTERNALTASKSENSOR",
    execution_delta=datetime.timedelta(hours=1),
    dag=dag,
)

fission_monitoring_crash_v1 = bigquery_etl_query(
    task_id="fission_monitoring_crash_v1",
    project_id="moz-fx-data-shared-prod",
    destination_table="fission_monitoring_crash_v1",
    dataset_id="telemetry_derived",
)

# Built from https://github.com/mozilla/fission_monitoring_nightly
fission_aggregation_for_dashboard = GKEPodOperator(
    task_id="fission_aggregation_for_dashboard",
    name="fission_aggregation_for_dashboard",
    image="gcr.io/moz-fx-data-airflow-prod-88e0/fission-monitoring:latest",
    env_vars=dict(
        BQ_BILLING_PROJECT_ID="moz-fx-data-shared-prod",
        BQ_INPUT_MAIN_TABLE="moz-fx-data-shared-prod.telemetry_derived.fission_monitoring_main_v1",
        BQ_INPUT_CRASH_TABLE="moz-fx-data-shared-prod.telemetry_derived.fission_monitoring_crash_v1",
        BQ_OUTPUT_TABLE="moz-fx-data-shared-prod.analysis.fission_monitoring_analyzed_v1",
        GCS_BUCKET="fission-experiment-monitoring-dashboard",
    ),
    image_pull_policy="Always",
    dag=dag,
)

wait_for_copy_deduplicate_main_ping >> fission_monitoring_main_v1
wait_for_copy_deduplicate_crash_ping >> fission_monitoring_crash_v1
[fission_monitoring_main_v1, fission_monitoring_crash_v1] >> fission_aggregation_for_dashboard
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2019, 12, 26),
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG("clean-gke-pods",
          default_args=default_args,
          schedule_interval="@daily",
          doc_md=docs)

docker_image = 'gcr.io/moz-fx-data-airflow-prod-88e0/gke-pod-clean:1.3'
gke_cluster_name = 'bq-load-gke-1'
gke_location = 'us-central1-a'

docker_args = [
    '--project', 'moz-fx-data-derived-datasets',
    '--gke-cluster', gke_cluster_name,
    '--region', gke_location,
    '--retention-days', '2',
]

clean_gke_pods = GKEPodOperator(
    task_id="clean-gke-pods",
    name='clean-gke-pods',
    image=docker_image,
    arguments=docker_args,
    dag=dag)
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=6),
    schedule_interval='@weekly')

create_gke_cluster = GKEClusterCreateOperator(
    task_id='create_gke_cluster',
    project_id=connection.project_id,
    location='us-west1-b',
    gcp_conn_id=gcp_conn_id,
    body=cluster_def,
    dag=gke_dag)

run_prio = GKEPodOperator(
    task_id='run_prio_a',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    location='us-west1-b',
    cluster_name=cluster_name,
    name='run-prio-project-a',
    namespace='default',
    image='mozilla/python-libprio:latest',
    arguments=['scripts/test-cli-integration'],
    dag=gke_dag)

delete_gke_cluster = GKEClusterDeleteOperator(
    task_id='delete_gke_cluster',
    project_id=connection.project_id,
    location='us-west1-b',
    name=cluster_name,
    gcp_conn_id=gcp_conn_id,
    dag=gke_dag)

create_gke_cluster.set_downstream(run_prio)
run_prio.set_downstream(delete_gke_cluster)
insert_args = [
    'bq',
    '--location=US',
    'query',
    '--replace',
    '--destination_table',
    'moz-fx-data-derived-datasets:blpadi.adi_dimensional_by_date${{ ds_nodash }}',
    '--use_legacy_sql=false',
    "select tot_requests_on_date, _year_quarter, bl_date, product, v_prod_major, prod_os, v_prod_os, channel, locale, continent_code, cntry_code, distro_name, distro_version from blpadi.adi_dim_backfill where bl_date = '{{ ds }}'",
]

load_bq_to_tmp_tbl = GKEPodOperator(
    task_id='bq_load_tmp_tbl',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    location='us-central1-a',
    cluster_name='bq-load-gke-1',
    name='bq-load-tmp-tbl',
    namespace='default',
    image='google/cloud-sdk:242.0.0-alpine',
    arguments=load_args,
    dag=blp_dag)

select_insert_into_final_table = GKEPodOperator(
    task_id='bigquery_insert_final_table',
    gcp_conn_id=gcp_conn_id,
    project_id='moz-fx-data-derived-datasets',
    location='us-central1-a',
    cluster_name='bq-load-gke-1',
    name='bq-query-insert-final-tbl',
    namespace='default',
    image='google/cloud-sdk:242.0.0-alpine',
    arguments=insert_args,
}

dag = DAG("taar_amodump", default_args=default_args, schedule_interval="@daily")

amodump = GKEPodOperator(
    task_id="taar_amodump",
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    location="us-central1-a",
    cluster_name=gke_cluster_name,
    name="taar-amodump",
    namespace="default",
    # This uses a circleci built docker image from github.com/mozilla/taar_gcp_etl
    image="gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.1",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    arguments=["-m", "taar_etl.taar_amodump", "--date", "{{ ds_nodash }}"],
    env_vars={
        "AWS_ACCESS_KEY_ID": aws_access_key,
        "AWS_SECRET_ACCESS_KEY": aws_secret_key,
    },
    dag=dag,
)

amowhitelist = GKEPodOperator(
    task_id="taar_amowhitelist",
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    location="us-central1-a",
"retries": 2, "retry_delay": timedelta(minutes=30), } with DAG("pensieve", default_args=default_args, schedule_interval="0 1 * * *") as dag: # Built from repo https://github.com/mozilla/pensieve pensieve_image = "gcr.io/moz-fx-data-experiments/pensieve:latest" pensieve = GKEPodOperator( task_id="pensieve", name="pensieve", image=pensieve_image, email=[ "*****@*****.**", "*****@*****.**", "*****@*****.**", ], arguments=["--date={{ds}}"], dag=dag, ) wait_for_clients_daily_export = ExternalTaskSensor( task_id="wait_for_clients_daily_export", external_dag_id="main_summary", external_task_id="clients_daily_export", dag=dag, ) wait_for_main_summary_export = ExternalTaskSensor( task_id="wait_for_main_summary_export",
    task_id='s3_to_gcs',
    s3_bucket='net-mozaws-data-us-west-2-data-analysis',
    gcs_bucket='moz-fx-data-derived-datasets-blpadi',
    description='blpadi copy from s3 to gcs',
    aws_conn_id='aws_data_iam_blpadi',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions=gcstj_object_conditions,
    transfer_options=gcstj_transfer_options,
    dag=blp_dag)

load_blpadi_to_bq = GKEPodOperator(
    task_id='bigquery_load',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    location='us-central1-a',
    cluster_name='bq-load-gke-1',
    name='load-blpadi-to-bq',
    namespace='default',
    image='google/cloud-sdk:242.0.0-alpine',
    arguments=bq_args,
    dag=blp_dag)

blp_logs.set_downstream(blp_job_sensor)
blp_job_sensor.set_downstream(s3_to_gcs)
s3_to_gcs.set_downstream(load_blpadi_to_bq)

amo_dag = DAG('mango_log_processing_amo',
              default_args=DEFAULT_ARGS,
              dagrun_timeout=timedelta(hours=6),
              schedule_interval='0 3 * * *')

amo_logs = EmrCreateJobFlowOperator(task_id='amo_create_job_flow',
    dataset_id="telemetry_derived",
    dag=dag,
)

user_activity_usage_behavior_export = GKEPodOperator(
    task_id="user_activity_export",
    name="user_activity_export",
    image="gcr.io/moz-fx-data-airflow-prod-88e0/firefox-public-data-report-etl:latest",
    arguments=[
        "-m",
        "public_data_report.cli",
        "user_activity",
        "--bq_table",
        "moz-fx-data-shared-prod.telemetry_derived.public_data_report_user_activity_v1",
        "--s3_bucket",
        "telemetry-public-analysis-2",
        "--s3_path",
        "public-data-report/user_activity",
    ],
    env_vars={
        "AWS_ACCESS_KEY_ID": aws_access_key,
        "AWS_SECRET_ACCESS_KEY": aws_secret_key,
    },
    image_pull_policy="Always",
    dag=dag,
)

annotations_export = GKEPodOperator(
    task_id="annotations_export",
    name="annotations_export",