def get_dataproc_parameters(conn_id="google_cloud_airflow_dataproc"):
    """Gather Dataproc parameters, which differ between production and
    development environments.

    The provided connection should be a Google Cloud connection: either the
    production default ("dataproc-runner-prod") or a service key associated
    with a sandbox account.
    """
    gcp_conn = GoogleCloudBaseHook(conn_id)
    keyfile = json.loads(
        gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"])
    project_id = keyfile["project_id"]

    is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev"
    client_email = (
        keyfile["client_email"] if is_dev else "*****@*****.**")
    artifact_bucket = ("{}-dataproc-artifacts".format(project_id)
                       if is_dev
                       else "moz-fx-data-prod-airflow-dataproc-artifacts")
    storage_bucket = ("{}-dataproc-scratch".format(project_id)
                      if is_dev
                      else "moz-fx-data-prod-dataproc-scratch")
    output_bucket = (artifact_bucket
                     if is_dev
                     else "moz-fx-data-derived-datasets-parquet")
    return DataprocParameters(
        conn_id,
        project_id,
        is_dev,
        client_email,
        artifact_bucket,
        storage_bucket,
        output_bucket,
    )
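A minimal usage sketch for the helper above; the utils.dataproc import path and the DataprocParameters field names are assumptions read off the return statement:

# Hypothetical usage sketch (assumed module path).
from utils.dataproc import get_dataproc_parameters

params = get_dataproc_parameters("google_cloud_airflow_dataproc")

# In development the buckets are per-sandbox-project; in production they
# resolve to the shared moz-fx-data-prod buckets.
print(params.project_id, params.artifact_bucket, params.storage_bucket)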
def bigquery_xcom_query(
    destination_table,
    dataset_id,
    xcom_task_id,
    parameters=(),
    arguments=(),
    project_id=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    image_pull_policy="Always",
    date_partition_parameter="submission_date",
    **kwargs
):
    """
    Generate a GKEPodOperator which runs an xcom result as a bigquery query.

    :param str destination_table: [Required] BigQuery destination table
    :param str dataset_id: [Required] BigQuery default dataset id
    :param str xcom_task_id: [Required] task_id which generated the xcom to pull
    :param Tuple[str] parameters: Parameters passed to bq query
    :param Tuple[str] arguments: Additional bq query arguments
    :param Optional[str] project_id: BigQuery default project id
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param str image_pull_policy: Kubernetes policy for when to pull docker_image
    :param Optional[str] date_partition_parameter: Parameter for indicating destination
        partition to generate, if None destination should be whole table
        rather than partition
    :param Dict[str, Any] kwargs: Additional keyword arguments for GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    if destination_table is not None and date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}",)
    query = "{{ " + "task_instance.xcom_pull({!r})".format(xcom_task_id) + " }}"
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["bq", "query"]
        + (["--destination_table=" + destination_table] if destination_table else [])
        + ["--dataset_id=" + dataset_id]
        + (["--project_id=" + project_id] if project_id else [])
        + ["--parameter=" + parameter for parameter in parameters]
        + list(arguments)
        + [query],
        image_pull_policy=image_pull_policy,
        **kwargs
    )
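A usage sketch, assuming the helper lives in utils.gcp (as in mozilla/telemetry-airflow): an upstream PythonOperator pushes SQL to XCom via its return value, and bigquery_xcom_query turns that SQL into a bq query pod.

# Hypothetical usage sketch (assumed import path and task names).
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from utils.gcp import bigquery_xcom_query  # assumed import path

with DAG("xcom_query_example",
         default_args={"start_date": datetime(2020, 1, 1)},
         schedule_interval="@daily") as dag:
    generate_sql = PythonOperator(
        task_id="generate_sql",
        # the return value lands in XCom under key "return_value"
        python_callable=lambda: "SELECT 1 AS placeholder",
    )
    run_query = bigquery_xcom_query(
        destination_table="example_table_v1",
        dataset_id="telemetry_derived",
        xcom_task_id="generate_sql",
    )
    generate_sql >> run_query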
def bigquery_etl_copy_deduplicate(
    task_id,
    target_project_id,
    only_tables=None,
    except_tables=None,
    parallelism=4,
    priority="INTERACTIVE",
    hourly=False,
    slices=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    image_pull_policy="Always",
    **kwargs
):
    """
    Copy a day's data from live ping tables to stable ping tables,
    deduplicating on document_id.

    :param str task_id: [Required] ID for the task
    :param str target_project_id: [Required] ID of project where target tables live
    :param Tuple[str] only_tables: Only process tables matching the given globs
        of form 'telemetry_live.main_v*'
    :param Tuple[str] except_tables: Process all tables except those matching the
        given globs
    :param int parallelism: Maximum number of queries to execute concurrently
    :param str priority: BigQuery query priority to use, must be BATCH or INTERACTIVE
    :param bool hourly: Alias for --slices=24
    :param int slices: Number of time-based slices to deduplicate in, rather than
        for whole days at once
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param str image_pull_policy: Kubernetes policy for when to pull docker_image
    :param Dict[str, Any] kwargs: Additional keyword arguments for GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))
    table_qualifiers = []
    if only_tables:
        table_qualifiers.append('--only')
        table_qualifiers += only_tables
    if except_tables:
        table_qualifiers.append('--except')
        table_qualifiers += except_tables
    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/copy_deduplicate"]
        + ["--project-id=" + target_project_id]
        + ["--date={{ds}}"]
        + ["--parallelism={}".format(parallelism)]
        + ["--priority={}".format(priority)]
        + (["--hourly"] if hourly else [])
        + (["--slices={}".format(slices)] if slices is not None else [])
        + table_qualifiers,
        image_pull_policy=image_pull_policy,
        **kwargs
    )
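A minimal usage sketch, assuming the same utils.gcp import path; the table glob is illustrative:

# Hypothetical usage sketch (assumed import path, illustrative glob).
from utils.gcp import bigquery_etl_copy_deduplicate

# Deduplicate all live tables for {{ds}} except main pings, which would be
# handled by a separate task.
copy_deduplicate_all = bigquery_etl_copy_deduplicate(
    task_id="copy_deduplicate_all",
    target_project_id="moz-fx-data-shared-prod",
    except_tables=("telemetry_live.main_v4",),
)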
def bigquery_etl_query(
    destination_table,
    dataset_id,
    parameters=(),
    arguments=(),
    project_id=None,
    sql_file_path=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="mozilla/bigquery-etl:latest",
    image_pull_policy="Always",
    date_partition_parameter="submission_date",
    multipart=False,
    **kwargs
):
    """
    Generate a GKEPodOperator which runs a bigquery-etl query.

    :param str destination_table: [Required] BigQuery destination table
    :param str dataset_id: [Required] BigQuery default dataset id
    :param Tuple[str] parameters: Parameters passed to bq query
    :param Tuple[str] arguments: Additional bq query arguments
    :param Optional[str] project_id: BigQuery default project id
    :param Optional[str] sql_file_path: Optional override for path to the
        SQL query file to run
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param str image_pull_policy: Kubernetes policy for when to pull docker_image
    :param Optional[str] date_partition_parameter: Parameter for indicating destination
        partition to generate, if None destination should be whole table
        rather than partition
    :param Dict[str, Any] kwargs: Additional keyword arguments for GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    sql_file_path = sql_file_path or "sql/{}/{}/query.sql".format(
        dataset_id, destination_table)
    if destination_table is not None and date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}",)
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/run_multipart_query" if multipart else "query"]
        + (["--destination_table=" + destination_table] if destination_table else [])
        + ["--dataset_id=" + dataset_id]
        + (["--project_id=" + project_id] if project_id else [])
        + ["--parameter=" + parameter for parameter in parameters]
        + list(arguments)
        + [sql_file_path],
        image_pull_policy=image_pull_policy,
        **kwargs
    )
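A minimal usage sketch, assuming the utils.gcp import path; the table and dataset names are illustrative:

# Hypothetical usage sketch (assumed import path, illustrative names).
from utils.gcp import bigquery_etl_query

# With the default date_partition_parameter this runs
# sql/telemetry_derived/example_v1/query.sql into the
# example_v1${{ds_nodash}} partition, passing submission_date:DATE:{{ds}}.
example_query = bigquery_etl_query(
    destination_table="example_v1",
    dataset_id="telemetry_derived",
)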
def export_to_parquet(
    table,
    destination_table=None,
    static_partitions=[],
    arguments=[],
    use_storage_api=False,
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_workers=2,
    num_preemptible_workers=0,
    gcs_output_bucket="moz-fx-data-derived-datasets-parquet",
    s3_output_bucket="telemetry-parquet",
):
    """
    Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table: [Required] BigQuery table name
    :param Optional[str] destination_table: Output table name, defaults to table,
        will have r'_v[0-9]+$' replaced with r'/v[0-9]+'
    :param List[str] static_partitions: Static partition values of the form
        'key=value' appended to the export prefix
    :param List[str] arguments: Additional pyspark arguments
    :param bool use_storage_api: Whether to read from the BigQuery Storage API
        or an AVRO export
    :param str dag_name: Name of DAG
    :param Optional[str] parent_dag_name: Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str aws_conn_id: Airflow connection id for AWS access
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str dataproc_storage_bucket: Dataproc staging GCS bucket
    :param str dataproc_zone: GCP zone to launch dataproc clusters
    :param int num_workers: Number of Dataproc workers
    :param int num_preemptible_workers: Number of Dataproc preemptible workers
    :param str gcs_output_bucket: GCS bucket for parquet output
    :param str s3_output_bucket: S3 bucket for the distcp copy of the output

    :return: airflow.models.DAG
    """
    # remove the dataset prefix and partition suffix from table
    table_id = table.rsplit(".", 1)[-1]
    unqualified_table, _, partition_id = table_id.partition("$")
    # limit cluster name to 35 characters plus suffix of -export-YYYYMMDD (51 total)
    cluster_name = unqualified_table.replace("_", "-")
    if len(cluster_name) > 35:
        # preserve version when truncating cluster name to 35 characters
        prefix, version = re.match(r"(.*?)(-v[0-9]+)?$", cluster_name).groups("")
        cluster_name = prefix[:35 - len(version)] + version
    cluster_name += "-export-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    if destination_table is None:
        destination_table = unqualified_table
    # separate version using "/" instead of "_"
    export_prefix = re.sub(r"_(v[0-9]+)$", r"/\1", destination_table) + "/"
    if static_partitions:
        export_prefix += "/".join(static_partitions) + "/"
    avro_prefix = "avro/" + export_prefix
    if not static_partitions and partition_id:
        avro_prefix += "partition_id=" + partition_id + "/"
    avro_path = "gs://" + gcs_output_bucket + "/" + avro_prefix + "*.avro"

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            num_workers=num_workers,
            image_version="1.4",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                "gs://dataproc-initialization-actions/python/pip-install.sh",
            ],
            metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"},
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
            ],
            dataproc_pyspark_properties={
                "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
            },
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table]
            + [
                "--" + key + "=" + value
                for key, value in {
                    "avro-path": (not use_storage_api) and avro_path,
                    "destination": "gs://" + gcs_output_bucket,
                    "destination-table": destination_table,
                }.items()
                if value
            ]
            + (["--static-partitions"] if static_partitions else [])
            + static_partitions
            + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        gcs_to_s3 = DataProcHadoopOperatorWithAws(
            task_id="gcs_to_s3",
            main_jar="file:///usr/lib/hadoop-mapreduce/hadoop-distcp.jar",
            arguments=[
                "-update",
                "-delete",
                "gs://{}/{}".format(gcs_output_bucket, export_prefix),
                "s3a://{}/{}".format(s3_output_bucket, export_prefix),
            ],
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            aws_conn_id=aws_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        if not use_storage_api:
            avro_export = BigQueryToCloudStorageOperator(
                task_id="avro_export",
                source_project_dataset_table=table,
                destination_cloud_storage_uris=[avro_path],
                compression=None,
                export_format="AVRO",
                bigquery_conn_id=gcp_conn_id,
            )
            avro_delete = GoogleCloudStorageDeleteOperator(
                task_id="avro_delete",
                bucket_name=gcs_output_bucket,
                prefix=avro_prefix,
                gcp_conn_id=gcp_conn_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
            )
            avro_export >> run_dataproc_pyspark >> avro_delete

        create_dataproc_cluster >> run_dataproc_pyspark >> gcs_to_s3
        gcs_to_s3 >> delete_dataproc_cluster

    return dag
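A usage sketch wiring the export in as a subdag; dag and default_args are assumed to exist in the surrounding DAG file, and the table name is illustrative:

# Hypothetical usage sketch (assumed import path and surrounding DAG).
from airflow.operators.subdag_operator import SubDagOperator
from utils.gcp import export_to_parquet  # assumed import path

example_export = SubDagOperator(
    subdag=export_to_parquet(
        table="moz-fx-data-shared-prod.telemetry_derived.example_v1${{ds_nodash}}",
        static_partitions=["submission_date_s3={{ds_nodash}}"],
        parent_dag_name=dag.dag_id,
        dag_name="example_export",
        default_args=default_args,
        num_preemptible_workers=10,
    ),
    task_id="example_export",
    dag=dag,
)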
def spark_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    main,
    pyfiles,
    arguments,
    dataproc_zone="us-west1-a",
    num_preemptible_workers=10,
):
    """Run the PySpark job for unnesting and range-partitioning Prio pings
    from the ingestion service.

    :param str parent_dag_name: Name of the parent DAG.
    :param str child_dag_name: Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id: Name of the connection string.
    :param str service_account: The address of the service account.
    :param str dataproc_zone: The zone of the Dataproc cluster.
    :param str main: Path or URI of the main PySpark script.
    :param List[str] pyfiles: Supporting Python files for the job.
    :param List[str] arguments: Arguments passed to the PySpark job.
    :param int num_preemptible_workers: The number of preemptible workers.
    :return: DAG
    """
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    shared_config = {
        "cluster_name": "prio-staging",
        "gcp_conn_id": gcp_conn_id,
        "project_id": connection.project_id,
    }

    with DAG(
        "{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
    ) as dag:
        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            num_workers=2,
            image_version="1.4",
            zone=dataproc_zone,
            service_account=service_account,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
            metadata={"PIP_PACKAGES": "click jsonschema gcsfs==0.2.3"},
            init_actions_uris=[
                "gs://dataproc-initialization-actions/python/pip-install.sh"
            ],
            dag=dag,
            **shared_config
        )

        run_dataproc_spark = DataProcPySparkOperator(
            task_id="run_dataproc_spark",
            main=main,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
            ],
            pyfiles=pyfiles,
            arguments=arguments,
            dag=dag,
            **shared_config
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            trigger_rule="all_done",
            dag=dag,
            **shared_config
        )

        create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster

    return dag
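A usage sketch for the subdag; the connection id, service account, and GCS paths are placeholders, and dag/default_args come from the surrounding DAG file:

# Hypothetical usage sketch (placeholder names throughout).
from airflow.operators.subdag_operator import SubDagOperator

prio_staging = SubDagOperator(
    subdag=spark_subdag(
        parent_dag_name=dag.dag_id,
        child_dag_name="staging",
        default_args=default_args,
        gcp_conn_id="google_cloud_prio_admin",  # placeholder connection id
        service_account="prio-runner@example-project.iam.gserviceaccount.com",
        main="gs://example-bucket/processor/staging.py",
        pyfiles=["gs://example-bucket/processor/prio_processor.egg"],
        arguments=["--date", "{{ ds }}"],
    ),
    task_id="staging",
    dag=dag,
)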
    emr_conn_id='emr_data_iam_mango',
    dag=blp_dag)

blp_job_sensor = EmrJobFlowSensor(
    task_id='blp_check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}",
    aws_conn_id='aws_data_iam',
    dag=blp_dag,
    on_retry_callback=lambda context: blp_dag.clear(
        start_date=context['execution_date'],
        end_date=context['execution_date']),
)

gcp_conn_id = "google_cloud_derived_datasets"
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

gcstj_object_conditions = {'includePrefixes': 'blpadi/{{ ds }}'}

gcstj_transfer_options = {'deleteObjectsUniqueInSink': True}

bq_args = [
    'bq',
    '--location=US',
    'load',
    '--source_format=CSV',
    '--skip_leading_rows=0',
    '--replace',
    "--field_delimiter=\001",
    'blpadi.adi_dimensional_by_date${{ ds_nodash }}',
    'gs://moz-fx-data-derived-datasets-blpadi/blpadi/{{ ds }}/*',
3, "retry_delay": timedelta(minutes=30), } dag = DAG( "prerelease_telemetry_aggregates", default_args=default_args, schedule_interval="@daily", ) subdag_args = default_args.copy() subdag_args["retries"] = 0 task_id = "prerelease_telemetry_aggregate_view_dataproc" gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc") keyfile = json.loads( gcp_conn.extras["extra__google_cloud_platform__keyfile_dict"]) project_id = keyfile["project_id"] is_dev = os.environ.get("DEPLOY_ENVIRONMENT") == "dev" client_email = ( keyfile["client_email"] if is_dev else "*****@*****.**") artifact_bucket = ("{}-dataproc-artifacts".format(project_id) if is_dev else "moz-fx-data-prod-airflow-dataproc-artifacts") storage_bucket = ("{}-dataproc-scratch".format(project_id) if is_dev else "moz-fx-data-prod-dataproc-scratch") prerelease_telemetry_aggregate_view_dataproc = SubDagOperator( task_id=task_id,
def __init__(
    self,
    cluster_name=None,
    num_workers=2,
    image_version='1.4',
    zone='us-west1-b',
    idle_delete_ttl='14400',
    auto_delete_ttl='28800',
    master_machine_type='n1-standard-8',
    worker_machine_type='n1-standard-4',
    num_preemptible_workers=0,
    service_account='*****@*****.**',
    init_actions_uris=None,
    additional_metadata=None,
    additional_properties=None,
    optional_components=['ANACONDA'],
    install_component_gateway=True,
    aws_conn_id=None,
    gcp_conn_id='google_cloud_airflow_dataproc',
    artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts',
    storage_bucket='moz-fx-data-prod-dataproc-scratch',
):
    self.cluster_name = cluster_name
    self.num_workers = num_workers
    self.image_version = image_version
    self.zone = zone
    self.idle_delete_ttl = idle_delete_ttl
    self.auto_delete_ttl = auto_delete_ttl
    self.master_machine_type = master_machine_type
    self.worker_machine_type = worker_machine_type
    self.num_preemptible_workers = num_preemptible_workers
    self.service_account = service_account
    # The bucket with a default dataproc init script
    self.artifact_bucket = artifact_bucket
    self.storage_bucket = storage_bucket

    if init_actions_uris is None:
        self.init_actions_uris = [
            'gs://{}/bootstrap/dataproc_init.sh'.format(self.artifact_bucket)
        ]
    else:
        self.init_actions_uris = init_actions_uris

    if additional_metadata is None:
        self.additional_metadata = {}
    else:
        self.additional_metadata = additional_metadata

    if additional_properties is None:
        self.additional_properties = {}
    else:
        self.additional_properties = additional_properties

    self.optional_components = optional_components
    self.install_component_gateway = install_component_gateway
    self.aws_conn_id = aws_conn_id
    self.gcp_conn_id = gcp_conn_id
    self.connection = GoogleCloudBaseHook(gcp_conn_id=self.gcp_conn_id)
def export_to_parquet(
    table,
    arguments=[],
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_preemptible_workers=0,
):
    """
    Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table: [Required] BigQuery table name
    :param List[str] arguments: Additional pyspark arguments
    :param str dag_name: Name of DAG
    :param Optional[str] parent_dag_name: Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str dataproc_storage_bucket: Dataproc staging GCS bucket
    :param str dataproc_zone: GCP zone to launch dataproc clusters
    :param int num_preemptible_workers: Number of Dataproc preemptible workers

    :return: airflow.models.DAG
    """
    # limit cluster name to 42 characters then suffix with -YYYYMMDD
    cluster_name = table.replace("_", "-")
    if len(cluster_name) > 42:
        if cluster_name.rsplit("-v", 1)[-1].isdigit():
            prefix, version = cluster_name.rsplit("-v", 1)
            cluster_name = prefix[:40 - len(version)] + "-v" + version
        else:
            cluster_name = cluster_name[:42]
    cluster_name += "-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    properties = {
        "core:fs.s3a." + key: value
        for key, value in zip(
            ("access.key", "secret.key", "session.token"),
            AwsHook(aws_conn_id).get_credentials(),
        )
        if value is not None
    }

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            properties=properties,
            num_workers=2,
            image_version="1.3",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://mozilla-bigquery-etl/jars/spark-bigquery-0.5.1-beta-SNAPSHOT.jar"
            ],
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table] + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        create_dataproc_cluster >> run_dataproc_pyspark >> delete_dataproc_cluster

    return dag
def container_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    server_id,
    env_vars={},
    arguments=[],
    machine_type="n1-standard-1",
    image="mozilla/prio-processor:v3.0.1",
    location="us-west1-a",
    owner_label="amiyaguchi",
    team_label="dataeng",
    **kwargs,
):
    """Run a command on an ephemeral container running the
    `mozilla/prio-processor:latest` image.

    :param str parent_dag_name: Name of the parent DAG.
    :param str child_dag_name: Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id: Name of the connection string.
    :param str service_account: The address of the service account.
    :param str server_id: The identifier for the Prio processor
    :param Dict[str, str] env_vars: Environment variables for configuring the processor.
    :param List[str] arguments: The command to run after loading the image.
    :param str machine_type: The machine type for running the image.
    :param str image: Dockerhub image
    :param str location: The region of the GKE cluster.
    :param str owner_label: Label for associating the owner
    :param str team_label: Label for associating the team
    :return: DAG
    """
    assert server_id in ["a", "b", "admin"]
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
    cluster_name = f"gke-prio-{server_id}"

    shared_config = {
        "project_id": connection.project_id,
        "gcp_conn_id": gcp_conn_id,
        "location": location,
    }

    with DAG(f"{parent_dag_name}.{child_dag_name}", default_args=default_args) as dag:
        # https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator#kubernetespodoperator_configuration
        # https://medium.com/google-cloud/scale-your-kubernetes-cluster-to-almost-zero-with-gke-autoscaler-9c78051cbf40
        # https://docs.openshift.com/container-platform/3.6/admin_guide/scheduling/pod_affinity.html
        # https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/
        # https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator
        # https://airflow.apache.org/docs/stable/_api/airflow/contrib/operators/kubernetes_pod_operator/index.html
        create_gke_cluster = GKEClusterCreateOperator(
            task_id="create_gke_cluster",
            body=create_gke_config(
                name=cluster_name,
                service_account=service_account,
                owner_label=owner_label,
                team_label=team_label,
                machine_type=machine_type,
                location=location,
                # DataProc clusters require VPC with auto-created subnets
                subnetwork="default" if server_id == "admin" else "gke-subnet",
                is_dev=environ.get("DEPLOY_ENVIRONMENT") == "dev",
            ),
            dag=dag,
            **shared_config,
        )

        # Running the pod without any time in-between will cause the scope-based
        # authentication in Google Cloud Platform to fail. For example:
        #
        #   `ServiceException: 401 Anonymous caller does not have
        #   storage.objects.get access to moz-fx-prio-dev-a-private/processed/`
        #
        # Sleeping by a small amount solves this problem. This issue was first
        # noticed intermittently on 2019-09-09.
        sleep = BashOperator(task_id="sleep", bash_command="sleep 60", dag=dag)

        run_prio = GKEPodOperator(
            task_id=f"processor_{server_id}",
            # pod names must be DNS-1123 compliant, so use "-" instead of "_"
            name=f"processor-{server_id}",
            cluster_name=cluster_name,
            namespace="default",
            image=image,
            arguments=arguments,
            env_vars=env_vars,
            dag=dag,
            # choose the autoscaling node-pool for any jobs
            node_selectors={"node-label": "burstable"},
            labels={"pod-label": "burstable-pod"},
            affinity={
                "podAntiAffinity": {
                    "requiredDuringSchedulingIgnoredDuringExecution": [
                        {
                            "labelSelector": {
                                "matchExpressions": [
                                    {
                                        "key": "pod-label",
                                        "operator": "In",
                                        "values": ["burstable-pod"],
                                    }
                                ]
                            },
                            "topologyKey": "kubernetes.io/hostname",
                        }
                    ]
                }
            },
            # tolerate the tainted node
            tolerations=[
                {
                    "key": "reserved-pool",
                    "operator": "Equal",
                    "value": "true",
                    "effect": "NoSchedule",
                }
            ],
            # A new VM instance may take more than 120 seconds to boot
            startup_timeout_seconds=240,
            # delete the pod after running
            is_delete_operator_pod=True,
            **shared_config,
            **kwargs,
        )

        delete_gke_cluster = GKEClusterDeleteOperator(
            task_id="delete_gke_cluster",
            name=cluster_name,
            trigger_rule="all_done",
            dag=dag,
            **shared_config,
        )

        create_gke_cluster >> sleep >> run_prio >> delete_gke_cluster

    return dag
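A usage sketch; the connection id and service account are placeholders, and dag/default_args come from the surrounding DAG file:

# Hypothetical usage sketch (placeholder names throughout).
from airflow.operators.subdag_operator import SubDagOperator

processor_a = SubDagOperator(
    subdag=container_subdag(
        parent_dag_name=dag.dag_id,
        child_dag_name="processor_a",
        default_args=default_args,
        gcp_conn_id="google_cloud_prio_a",  # placeholder connection id
        service_account="prio-a@example-project.iam.gserviceaccount.com",
        server_id="a",
        arguments=["bin/process"],
        env_vars={"DATA_CONFIG": "/app/config/content.json"},
    ),
    task_id="processor_a",
    dag=dag,
)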
def export(
    leanplum_app_id,
    leanplum_client_key,
    bq_dataset_id,
    task_id,
    bq_project,
    gcs_bucket="moz-fx-data-prod-external-data",
    table_prefix=None,
    gcs_prefix=None,
    project_id=None,
    gcp_conn_id="google_cloud_derived_datasets",
    gke_location="us-central1-a",
    gke_cluster_name="bq-load-gke-1",
    gke_namespace="default",
    docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/leanplum-data-export:latest",
    **kwargs
):
    """
    Export a day of data from Leanplum for a single application,
    and make it available in BigQuery.

    See bug 1588654 for information on which buckets and datasets
    these tables should live in.

    :param str leanplum_app_id: [Required] Leanplum application ID
    :param str leanplum_client_key: [Required] Leanplum client key
    :param str bq_dataset_id: [Required] BigQuery default dataset id
    :param str task_id: [Required] The task ID for this task
    :param str bq_project: [Required] The project to create tables in
    :param str gcs_bucket: GCS Bucket to export data to
    :param Optional[str] table_prefix: Prefix for the created BigQuery tables
    :param Optional[str] gcs_prefix: Prefix for data exported to GCS
    :param Optional[str] project_id: Project the GKE cluster is in
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param Dict[str, Any] kwargs: Additional keyword arguments for GKEPodOperator

    :return: GKEPodOperator
    """
    kwargs["name"] = kwargs.get("name", task_id.replace("_", "-"))

    if project_id is None:
        project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id

    args = [
        "leanplum-data-export", "export-leanplum",
        "--app-id", leanplum_app_id,
        "--client-key", leanplum_client_key,
        "--date", "{{ ds_nodash }}",
        "--bucket", gcs_bucket,
        "--bq-dataset", bq_dataset_id,
        "--project", bq_project,
    ]

    if gcs_prefix is not None:
        args += ["--prefix", gcs_prefix]

    if table_prefix is not None:
        args += ["--table-prefix", table_prefix]

    return GKEPodOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        project_id=project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=args,
        **kwargs
    )
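A usage sketch; pulling the app id and client key from Airflow Variables is an assumption, and the dataset names are illustrative:

# Hypothetical usage sketch (assumed Variable names, illustrative datasets).
from airflow.models import Variable

leanplum_export = export(
    task_id="leanplum_export_example_app",
    leanplum_app_id=Variable.get("leanplum_app_id"),
    leanplum_client_key=Variable.get("leanplum_client_key"),
    bq_dataset_id="example_app_external",
    bq_project="moz-fx-data-shared-prod",
    table_prefix="leanplum",
    dag=dag,
)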
def spark_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    main,
    pyfiles,
    arguments,
    bootstrap_bucket,
    dataproc_region="us-west1",
    num_preemptible_workers=10,
):
    """Run the PySpark job for unnesting and range-partitioning Prio pings
    from the ingestion service.

    :param str parent_dag_name: Name of the parent DAG.
    :param str child_dag_name: Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id: Name of the connection string.
    :param str service_account: The address of the service account.
    :param str main: Path or URI of the main PySpark script.
    :param List[str] pyfiles: Supporting Python files for the job.
    :param List[str] arguments: Arguments passed to the PySpark job.
    :param str bootstrap_bucket: GCS bucket containing the init scripts.
    :param str dataproc_region: The region of the DataProc cluster.
    :param int num_preemptible_workers: The number of preemptible workers.
    :return: DAG
    """
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    shared_config = {
        "cluster_name": "prio-staging-{{ds_nodash}}",
        "gcp_conn_id": gcp_conn_id,
        "project_id": connection.project_id,
        # From an error when not specifying the region:
        # - Dataproc images 2.0 and higher do not support the to-be
        #   deprecated global region. Please use any non-global Dataproc
        #   region instead
        # - Must specify a zone in GCE configuration when using
        #   'regions/global'. To use auto zone placement, specify
        #   regions/<non-global-region> in request path, e.g.
        #   regions/us-central1
        "region": dataproc_region,
    }

    with DAG(
        f"{parent_dag_name}.{child_dag_name}", default_args=default_args
    ) as dag:
        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            image_version="preview-ubuntu18",
            service_account=service_account,
            master_machine_type="n1-standard-4",
            worker_machine_type="n1-standard-4",
            num_workers=2,
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                f"{bootstrap_bucket}/install-python-requirements.sh"
            ],
            idle_delete_ttl=600,
            dag=dag,
            **shared_config,
        )

        run_dataproc_spark = DataProcPySparkOperator(
            task_id="run_dataproc_spark",
            main=main,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
            ],
            pyfiles=pyfiles,
            arguments=arguments,
            dag=dag,
            **shared_config,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            trigger_rule="all_done",
            dag=dag,
            **shared_config,
        )

        create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster

    return dag
AUTOML_DATASET = models.Variable.get('automl_dataset')
AUTOML_MODEL = models.Variable.get('automl_model')
AUTOML_TRAINING_BUDGET = int(models.Variable.get('automl_training_budget'))

# [START dag_build_train_deploy]
default_dag_args = {
    'start_date': datetime.datetime(2050, 1, 1),
    'schedule_interval': None,
    'provide_context': True
}

dag = models.DAG('build_train_deploy', default_args=default_dag_args)
# [END dag_build_train_deploy]

# instantiate Google Cloud base hook to get credentials and create automl clients
gcp_hook = GoogleCloudBaseHook(conn_id='google_cloud_default')
automl_client = AutoMlClient(credentials=gcp_hook._get_credentials())

# Loads the database dump from Cloud Storage to BigQuery
t1 = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id="db_dump_to_bigquery",
    bucket=COMPOSER_BUCKET_NAME,
    source_objects=[DB_DUMP_FILENAME],
    schema_object="schema_source.json",
    source_format="CSV",
    skip_leading_rows=1,
    destination_project_dataset_table="{}.{}.{}".format(
        PROJECT, DATASET, 'data_source'),
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    dag=dag)
def bigquery_etl_query(destination_table,
                       dataset_id,
                       parameters=(),
                       arguments=(),
                       project_id=None,
                       sql_file_path=None,
                       gcp_conn_id="google_cloud_derived_datasets",
                       gke_location="us-central1-a",
                       gke_cluster_name="bq-load-gke-1",
                       gke_namespace="default",
                       docker_image="mozilla/bigquery-etl:latest",
                       date_partition_parameter="submission_date",
                       multipart=False,
                       allow_field_addition_on_date=None,
                       **kwargs):
    """
    Generate a GKEPodOperator which runs a bigquery-etl query.

    :param str destination_table: [Required] BigQuery destination table
    :param str dataset_id: [Required] BigQuery default dataset id
    :param Tuple[str] parameters: Parameters passed to bq query
    :param Tuple[str] arguments: Additional bq query arguments
    :param Optional[str] project_id: BigQuery default project id
    :param Optional[str] sql_file_path: Optional override for path to the
        SQL query file to run
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str gke_location: GKE cluster location
    :param str gke_cluster_name: GKE cluster name
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use
    :param Optional[str] date_partition_parameter: Parameter for indicating destination
        partition to generate, if None destination should be whole table
        rather than partition
    :param Dict[str, Any] kwargs: Additional keyword arguments for GKEPodOperator
    :param Optional[str] allow_field_addition_on_date: Optional {{ds}} value that
        should be run with ALLOW_FIELD_ADDITION

    :return: GKEPodOperator
    """
    kwargs["task_id"] = kwargs.get("task_id", destination_table)
    kwargs["name"] = kwargs.get("name", kwargs["task_id"].replace("_", "-"))
    if not project_id:
        project_id = "moz-fx-data-shared-prod"
    sql_file_path = sql_file_path or "sql/{}/{}/{}/query.sql".format(
        project_id, dataset_id, destination_table)
    if destination_table is not None and date_partition_parameter is not None:
        destination_table = destination_table + "${{ds_nodash}}"
        parameters += (date_partition_parameter + ":DATE:{{ds}}",)
    return GKEPodOperator(
        gcp_conn_id=gcp_conn_id,
        project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id,
        location=gke_location,
        cluster_name=gke_cluster_name,
        namespace=gke_namespace,
        image=docker_image,
        arguments=["script/run_multipart_query" if multipart else "query"]
        + (["--destination_table=" + destination_table] if destination_table else [])
        + ["--dataset_id=" + dataset_id]
        + (["--project_id=" + project_id] if project_id else [])
        + ["--parameter=" + parameter for parameter in parameters]
        + (
            # Date comparisons for field additions need to happen within the parameter.
            # Template substitution occurs only within the operator with `arguments` being
            # one of the options of GKEPodOperator that allows templated arguments.
            # See also: https://github.com/mozilla/telemetry-airflow/pull/1174#discussion_r517505678
            [
                "--schema_update_option="
                + "{{ 'ALLOW_FIELD_ADDITION' if ds == %r else '' }}"
                % allow_field_addition_on_date
            ]
            if allow_field_addition_on_date
            else []
        )
        + list(arguments)
        + [sql_file_path],
        **kwargs)
dataset_id="telemetry_derived", xcom_task_id=experiment_search_query_task_id, owner="*****@*****.**", email=["*****@*****.**", "*****@*****.**"]) (copy_deduplicate_main_ping >> experiment_search_aggregates >> experiment_search_aggregates_live_generate_view >> experiment_search_aggregates_live_deploy_view) # Daily and last seen views on top of every Glean application. gcp_conn_id = "google_cloud_derived_datasets" baseline_etl_kwargs = dict( gcp_conn_id=gcp_conn_id, project_id=GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id, location="us-central1-a", cluster_name="bq-load-gke-1", namespace="default", image="mozilla/bigquery-etl:latest", ) baseline_args = [ "--project-id=moz-fx-data-shared-prod", "--date={{ ds }}", "--only=*_stable.baseline_v1" ] baseline_clients_daily = GKEPodOperator( task_id='baseline_clients_daily', name='baseline-clients-daily', arguments=["script/run_glean_baseline_clients_daily"] + baseline_args, **baseline_etl_kwargs
def container_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    server_id,
    env_vars={},
    arguments=[],
    machine_type="n1-standard-1",
    image="mozilla/prio-processor:latest",
    location="us-west1-b",
    owner_label="amiyaguchi",
    team_label="dataeng",
):
    """Run a command on an ephemeral container running the
    `mozilla/prio-processor:latest` image.

    :param str parent_dag_name: Name of the parent DAG.
    :param str child_dag_name: Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id: Name of the connection string.
    :param str service_account: The address of the service account.
    :param str server_id: The identifier for the Prio processor
    :param Dict[str, str] env_vars: Environment variables for configuring the processor.
    :param List[str] arguments: The command to run after loading the image.
    :param str machine_type: The machine type for running the image.
    :param str image: Dockerhub image
    :param str location: The region of the GKE cluster.
    :param str owner_label: Label for associating the owner
    :param str team_label: Label for associating the team
    :return: DAG
    """
    assert server_id in ["a", "b", "admin"]
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
    cluster_name = "gke-prio-{}".format(server_id)

    shared_config = {
        "project_id": connection.project_id,
        "gcp_conn_id": gcp_conn_id,
        "location": location,
    }

    with DAG(
        "{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
    ) as dag:
        create_gke_cluster = GKEClusterCreateOperator(
            task_id="create_gke_cluster",
            body=create_gke_config(
                name=cluster_name,
                service_account=service_account,
                owner_label=owner_label,
                team_label=team_label,
                machine_type=machine_type,
                # DataProc clusters require VPC with auto-created subnets
                subnetwork="default" if server_id == "admin" else "gke-subnet",
                is_dev=environ.get("DEPLOY_ENVIRONMENT") == "dev",
            ),
            dag=dag,
            **shared_config)

        # Running the pod without any time in-between will cause the scope-based
        # authentication in Google Cloud Platform to fail. For example:
        #
        #   `ServiceException: 401 Anonymous caller does not have
        #   storage.objects.get access to moz-fx-prio-dev-a-private/processed/`
        #
        # Sleeping by a small amount solves this problem. This issue was first
        # noticed intermittently on 2019-09-09.
        sleep = BashOperator(task_id="sleep", bash_command="sleep 60", dag=dag)

        run_prio = GKEPodOperator(
            task_id="processor_{}".format(server_id),
            name="run-prio-project-{}".format(server_id),
            cluster_name=cluster_name,
            namespace="default",
            image=image,
            arguments=arguments,
            env_vars=env_vars,
            dag=dag,
            **shared_config)

        delete_gke_cluster = GKEClusterDeleteOperator(
            task_id="delete_gke_cluster",
            name=cluster_name,
            trigger_rule="all_done",
            dag=dag,
            **shared_config)

        create_gke_cluster >> sleep >> run_prio >> delete_gke_cluster

    return dag
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20191017',  # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):
    """Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job to transfer the AWS S3 parquet data into a GCS Bucket.
    Once that is completed we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name: parent dag name
    :param dict default_args: dag configuration
    :param str dataset_s3_bucket: source S3 Bucket
    :param str dataset_gcs_bucket: destination GCS Bucket
    :param str aws_conn_id: airflow connection id for S3 access
    :param str gcp_conn_id: airflow connection id for GCP access
    :param str dataset: dataset name
    :param str dataset_version: dataset version
    :param str date_submission_col: dataset date submission column
    :param str ds_type: dataset format (ds or ds_nodash)
    :param str gke_location: GKE cluster zone
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use for GKE pod operations  # noqa
    :param str bigquery_dataset: bigquery load destination dataset
    :param str p2b_concurrency: number of processes for parquet2bigquery load
    :param str p2b_table_alias: override p2b table name with alias
    :param bool p2b_resume: allow resume support, defaults to False
    :param bool reprocess: enable dataset reprocessing, defaults to False
    :param str objects_prefix: custom objects_prefix to override defaults
    :param str spark_gs_dataset_location: custom spark dataset load location
        to override defaults
    :param List[str] cluster_by: top level fields to cluster by
        when creating destination table
    :param List[str] drop: top level fields to exclude from destination table
    :param Dict[str, str] rename: top level fields to rename in destination table
    :param List[str] replace: top level field replacement expressions

    :return airflow.models.DAG
    """
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)
    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes': _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
    ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]
    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

    bq_table_name = p2b_table_alias or normalize_table_id(
        '_'.join([dataset, dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if dataset_s3_bucket is not None:
            s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
                task_id='s3_to_gcs',
                s3_bucket=dataset_s3_bucket,
                gcs_bucket=gcs_buckets['transfer'],
                description=_objects_prefix,
                aws_conn_id=aws_conn_id,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                object_conditions=gcstj_object_conditions,
                transfer_options=gcstj_transfer_options,
            )
        else:
            s3_to_gcs = DummyOperator(task_id='no_s3_to_gcs')

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(
                bigquery_dataset, bq_table_name),  # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
        )

        s3_to_gcs >> reprocess >> remove_bq_table >> bulk_load

    return dag
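A usage sketch running the load as a subdag; the dataset name and buckets are illustrative, and dag/default_args come from the surrounding DAG file:

# Hypothetical usage sketch (illustrative dataset and buckets).
from airflow.operators.subdag_operator import SubDagOperator

example_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="example_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="example_dataset",
        dataset_version="v1",
        gke_cluster_name="bq-load-gke-1",
    ),
    task_id="example_bigquery_load",
    dag=dag,
)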
def export_to_amplitude(
        parent_dag_name,
        dag_name,
        default_args,
        project,
        dataset,
        table_or_view,
        s3_prefix,
        gcs_bucket='moz-fx-data-derived-datasets-amplitude-export',
        gcp_conn_id='google_cloud_derived_datasets',
        amplitude_s3_conn='amplitude_s3_conn',
        amplitude_s3_bucket='com-amplitude-vacuum-mozilla-vacuum-wup'):
    """Export a bigquery table or view to Amplitude.

    This uses the BigQueryToCloudStorage operator to export the
    partition to GCS, then pushes that data to S3. It operates
    on a temporary table that is dropped after the job is finished.

    :param str parent_dag_name: Parent dag name
    :param str dag_name: This dag's name (appended to parent_dag_name)
    :param Dict[str, Any] default_args: DAG configuration
    :param str project: BigQuery project containing the table to be exported
    :param str dataset: BigQuery dataset
    :param str table_or_view: Table or view name
    :param str s3_prefix: The prefix for the s3 objects
    :param str gcs_bucket: The bucket the data will be exported to
    :param str gcp_conn_id: GCP connection ID
    :param str amplitude_s3_conn: S3 connection ID
    :param str amplitude_s3_bucket: The bucket to export data to
    """
    environment = environ['DEPLOY_ENVIRONMENT']
    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        # For now, we assume the view is already updated
        # See https://github.com/mozilla/bigquery-etl/issues/218
        exec_date = '{{ ds }}'

        # Check that we have data for this date
        check_sql = (
            'SELECT COUNT(*) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)

        wait_for_data = BigQuerySQLSensorOperator(
            task_id='wait_for_data',
            sql=check_sql,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        # Create the table with yesterday's data
        project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id
        temp_table_name = table_or_view + '_{{ ds_nodash }}'
        fully_qualified_table_name = '{}.{}.{}'.format(
            project_id, dataset, temp_table_name)

        sql = (
            'SELECT * EXCEPT (submission_timestamp) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)

        create_table = BigQueryOperator(
            task_id='create_temporary_table',
            sql=sql,
            destination_dataset_table=fully_qualified_table_name,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        directory = '/'.join((environment, s3_prefix, '{{ ds_nodash }}'))
        extension = '.tsv.gz'

        # Export from bq to gcs
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28  # noqa: E501
        gcs_uri = 'gs://{}/{}/*{}'.format(gcs_bucket, directory, extension)
        table_extract = BigQueryToCloudStorageOperator(
            task_id='bq_to_gcs',
            source_project_dataset_table=fully_qualified_table_name,
            destination_cloud_storage_uris=[gcs_uri],
            bigquery_conn_id=gcp_conn_id,
            compression='GZIP',
            export_format='CSV',
            field_delimiter='\t',
            print_header=True
        )

        # Push the data to S3
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/gcs_to_s3.py#L29  # noqa: E501
        s3_push = GoogleCloudStorageToS3Operator(
            task_id='gcs_to_s3',
            bucket=gcs_bucket,
            prefix=directory,
            delimiter=extension,
            google_cloud_storage_conn_id=gcp_conn_id,
            dest_aws_conn_id=amplitude_s3_conn,
            dest_s3_key='s3://{}/'.format(amplitude_s3_bucket),
            replace=True
        )

        # Drop the temporary table
        table_drop = BigQueryOperator(
            task_id='drop_temp_table',
            sql='DROP TABLE `{}`'.format(fully_qualified_table_name),
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        # Delete the GCS data
        data_delete = GoogleCloudStorageDeleteOperator(
            task_id='delete_gcs_data',
            bucket_name=gcs_bucket,
            prefix=directory,
            gcp_conn_id=gcp_conn_id
        )

        wait_for_data >> create_table >> table_extract >> s3_push
        s3_push >> table_drop
        s3_push >> data_delete

    return dag
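A usage sketch; the project, dataset, and prefix values are illustrative, and dag/default_args come from the surrounding DAG file:

# Hypothetical usage sketch (illustrative names throughout).
from airflow.operators.subdag_operator import SubDagOperator

amplitude_export = SubDagOperator(
    subdag=export_to_amplitude(
        parent_dag_name=dag.dag_id,
        dag_name="example_amplitude_export",
        default_args=default_args,
        project="moz-fx-data-shared-prod",
        dataset="telemetry",
        table_or_view="example_amplitude_events",
        s3_prefix="example",
    ),
    task_id="example_amplitude_export",
    dag=dag,
)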
def reprocess_parquet(parent_dag_name,
                      default_args,
                      reprocess,
                      gcp_conn_id,
                      gcs_buckets,
                      objects_prefix,
                      date_submission_col,
                      dataset,
                      dataset_version,
                      gs_dataset_location=None,
                      dataproc_zone='us-central1-a',
                      dag_name='reprocess_parquet',
                      num_preemptible_workers=10):
    """Reprocess Parquet datasets to conform with the BigQuery Parquet loader.

    This function should be invoked as part of `load_to_bigquery`.

    https://github.com/mozilla-services/spark-parquet-to-bigquery/blob/master/src/main/scala/com/mozilla/dataops/spark/TransformParquet.scala  # noqa

    :param str parent_dag_name: parent dag name
    :param dict default_args: dag configuration
    :param str gcp_conn_id: airflow connection id for GCP access
    :param dict gcs_buckets: source and dest gcs buckets for reprocess
    :param str dataset: dataset name
    :param str dataset_version: dataset version
    :param str objects_prefix: objects location
    :param str date_submission_col: dataset date submission column
    :param str dataproc_zone: GCP zone to launch dataproc clusters
    :param str dag_name: name of dag
    :param int num_preemptible_workers: number of dataproc cluster workers
        to provision
    :param bool reprocess: enable dataset reprocessing, defaults to False
    :param str gs_dataset_location: override source location, defaults to None

    :return airflow.models.DAG
    """
    JAR = [
        'gs://moz-fx-data-derived-datasets-parquet-tmp/jars/spark-parquet-to-bigquery-assembly-1.0.jar'  # noqa
    ]

    if gs_dataset_location:
        _gs_dataset_location = gs_dataset_location
    else:
        _gs_dataset_location = 'gs://{}/{}'.format(gcs_buckets['transfer'],
                                                   objects_prefix)

    cluster_name = '{}-{}'.format(dataset.replace('_', '-'),
                                  dataset_version) + '-{{ ds_nodash }}'

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    spark_args = [
        '--files', _gs_dataset_location,
        '--submission-date-col', date_submission_col,
        '--gcp-project-id', connection.project_id,
        '--gcs-bucket', 'gs://{}'.format(gcs_buckets['load']),
    ]

    _dag_name = '%s.%s' % (parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if reprocess:
            create_dataproc_cluster = DataprocClusterCreateOperator(
                task_id='create_dataproc_cluster',
                cluster_name=cluster_name,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                num_workers=2,
                image_version='1.3',
                storage_bucket=gcs_buckets['transfer'],
                zone=dataproc_zone,
                master_machine_type='n1-standard-8',
                worker_machine_type='n1-standard-8',
                num_preemptible_workers=num_preemptible_workers,
                metadata={
                    'gcs-connector-version': '1.9.6',
                    'bigquery-connector-version': '0.13.6'
                })

            run_dataproc_spark = DataProcSparkOperator(
                task_id='run_dataproc_spark',
                cluster_name=cluster_name,
                dataproc_spark_jars=JAR,
                main_class='com.mozilla.dataops.spark.TransformParquet',
                arguments=spark_args,
                gcp_conn_id=gcp_conn_id)

            delete_dataproc_cluster = DataprocClusterDeleteOperator(
                task_id='delete_dataproc_cluster',
                cluster_name=cluster_name,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

            create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster  # noqa
        else:
            DummyOperator(task_id='no_reprocess')

    return dag
    # this adds the error log url at the end of the msg
    slack_msg = task_msg + """ (<{log_url}|log>)""".format(
        log_url=context.get('task_instance').log_url,
    )
    failed_alert = SlackWebhookOperator(
        task_id='slack_test',
        http_conn_id='slack',
        webhook_token=slack_webhook_token,
        message=slack_msg,
        username='******',
    )
    return failed_alert.execute(context=context)

# to get credentials to access google sheets
wys_api_hook = GoogleCloudBaseHook('vz_api_google')
cred = wys_api_hook._get_credentials()
service = build('sheets', 'v4', credentials=cred, cache_discovery=False)

# to connect to pgadmin bot
wys_postgres = PostgresHook("wys_bot")

connection = BaseHook.get_connection('wys_api_key')
api_key = connection.password

default_args = {
    'owner': 'rdumas',
    'depends_on_past': False,
    'start_date': datetime(2020, 4, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_success': False,
from airflow.hooks.base_hook import BaseHook
from airflow.contrib.operators import mlengine_operator
from airflow.contrib.operators import mlengine_operator_utils
from airflow.contrib.operators import dataflow_operator
from airflow.contrib.operators import gcs_to_bq
# TODO Add when Composer on v2.0 and more Hook
# from airflow.contrib.operators import gcs_list_operator
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.utils import trigger_rule

from google.cloud.automl_v1beta1 import AutoMlClient, PredictionServiceClient

from clv_automl import clv_automl

# instantiate Google Cloud base hook to get credentials and create automl clients
gcp_credentials = GoogleCloudBaseHook(
    conn_id='google_cloud_default')._get_credentials()
automl_client = AutoMlClient(credentials=gcp_credentials)
automl_predict_client = PredictionServiceClient(credentials=gcp_credentials)


def _get_project_id():
    """Get project ID from default GCP connection."""
    extras = BaseHook.get_connection('google_cloud_default').extra_dejson
    key = 'extra__google_cloud_platform__project'
    if key in extras:
        project_id = extras[key]
    else:
        raise ValueError('Must configure project_id in google_cloud_default '
                         'connection from Airflow Console')
    return project_id
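A short sketch of how the helper above can be used when building operator arguments; the bucket path is illustrative:

# Hypothetical usage sketch: resolve the project id once at parse time
# and reuse it across operators (illustrative bucket path).
PROJECT = _get_project_id()

batch_predict_output = 'gs://{}-automl/predictions'.format(PROJECT)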