Example 1
 subdag=moz_dataproc_pyspark_runner(
     parent_dag_name=dag.dag_id,
     dag_name=task_id,
     job_name="firefox-android-beta-adjust-import",
     cluster_name="firefox-android-beta-adjust-import-{{ ds_nodash }}",
     idle_delete_ttl="600",
     num_workers=40,
     worker_machine_type="n1-standard-8",
     init_actions_uris=[
         "gs://dataproc-initialization-actions/python/pip-install.sh"
     ],
     additional_properties={
         "spark:spark.jars":
         "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
     },
     additional_metadata={"PIP_PACKAGES": "click==7.1.2"},
     python_driver_code="gs://{}/jobs/adjust_import.py".format(
         params.artifact_bucket),
     py_args=[
         "--pbkdf2",
         "--salt",
         "org.mozilla.fenix-salt",
         "--project",
         project,
         "--input_table",
         "tmp.adjust_firefox_preview",
         "--output_table",
         "firefox_android_beta_external.adjust_install_time_v1",
         "--bucket",
         params.storage_bucket,
     ],
     gcp_conn_id=params.conn_id,
     service_account=params.client_email,
     artifact_bucket=params.artifact_bucket,
     storage_bucket=params.storage_bucket,
     default_args=subdag_args,
 ),
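Note: most fragments on this page show only the subdag= argument of a SubDagOperator. The sketch below shows the enclosing DAG and operator they assume; the DAG settings, the adjust_import task name, and the import paths are illustrative assumptions rather than code from the source repository (compare Example 7, which shows the full operator).

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator  # Airflow 1.x import path
from utils.dataproc import moz_dataproc_pyspark_runner  # assumed location of the helper

# Hypothetical DAG-level settings; the real values live elsewhere in the DAG file.
default_args = {
    "owner": "example@mozilla.com",
    "retries": 2,
    "retry_delay": timedelta(minutes=30),
}
subdag_args = default_args.copy()

dag = DAG(
    "adjust_import",
    default_args=default_args,
    start_date=datetime(2020, 1, 1),
    schedule_interval="0 2 * * *",
)

task_id = "adjust_import"
adjust_import = SubDagOperator(
    task_id=task_id,
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name=task_id,
        # ... remaining keyword arguments exactly as in Example 1 ...
        default_args=subdag_args,
    ),
)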
Example 2
 subdag=moz_dataproc_pyspark_runner(
     parent_dag_name=dag.dag_id,
     dag_name=task_id,
     job_name="ltv-daily",
     cluster_name="ltv-daily-{{ ds_nodash }}",
     idle_delete_ttl="600",
     num_workers=5,
     worker_machine_type="n1-standard-8",
     optional_components=["ANACONDA"],
     init_actions_uris=[
         "gs://dataproc-initialization-actions/python/pip-install.sh"
     ],
     additional_properties={
         "spark:spark.jars":
         "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
     },
     additional_metadata={"PIP_PACKAGES": "lifetimes==0.11.1"},
     python_driver_code="gs://{}/jobs/ltv_daily.py".format(
         params.artifact_bucket),
     py_args=[
         "--submission-date",
         "{{ ds }}",
         "--prediction-days",
         "364",
         "--project-id",
         project,
         "--source-qualified-table-id",
         "{project}.search.search_rfm".format(project=project),
         "--dataset-id",
         "analysis",
         "--intermediate-table-id",
         "ltv_daily_temporary_search_rfm_day",
         "--model-input-table-id",
         "ltv_daily_model_perf",
         "--model-output-table-id",
         "ltv_daily",
         "--temporary-gcs-bucket",
         params.storage_bucket,
     ],
     gcp_conn_id=params.conn_id,
     service_account=params.client_email,
     artifact_bucket=params.artifact_bucket,
     storage_bucket=params.storage_bucket,
     default_args=subdag_args,
 ),
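Several examples reference a params object for deployment-specific settings. The sketch below simply collects the fields actually used in these examples into a hypothetical container; the source repository builds this object differently, and the example values are placeholders.

from dataclasses import dataclass

# Hypothetical container for the fields referenced as params.* in these examples.
@dataclass
class DataprocParams:
    conn_id: str          # Airflow connection id for GCP
    client_email: str     # service account the cluster runs as
    artifact_bucket: str  # GCS bucket holding the PySpark driver scripts
    storage_bucket: str   # GCS bucket used for temporary/job storage
    output_bucket: str    # GCS bucket for model outputs (see Example 5)
    project_id: str       # GCP project id
    is_dev: bool          # enables dev-only behaviour (see Example 5)

params = DataprocParams(
    conn_id="google_cloud_airflow_dataproc",
    client_email="dataproc-runner@example-project.iam.gserviceaccount.com",
    artifact_bucket="example-artifact-bucket",
    storage_bucket="example-storage-bucket",
    output_bucket="example-output-bucket",
    project_id="example-project",
    is_dev=False,
)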
Example 3
 subdag=moz_dataproc_pyspark_runner(
     parent_dag_name=dag.dag_id,
     dag_name=task_id,
     job_name="prerelease_aggregates",
     cluster_name="prerelease-telemetry-aggregates-{{ ds_nodash }}",
     idle_delete_ttl="600",
     num_workers=10,
     worker_machine_type="n1-standard-8",
     init_actions_uris=[
         "gs://dataproc-initialization-actions/python/pip-install.sh"
     ],
     additional_properties={
         "spark:spark.jars":
         "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
         "spark:spark.jars.packages":
         "org.apache.spark:spark-avro_2.11:2.4.4",
     },
     additional_metadata={
         "PIP_PACKAGES":
         "git+https://github.com/mozilla/python_mozaggregator.git"
     },
     python_driver_code="gs://{}/jobs/mozaggregator_runner.py".format(
         artifact_bucket),
     py_args=[
         "aggregator",
         "--date",
         "{{ ds_nodash }}",
         "--channels",
         "nightly,aurora,beta",
         "--postgres-db",
         "telemetry",
         "--postgres-user",
         "root",
         "--postgres-pass",
         "{{ var.value.mozaggregator_postgres_pass }}",
         "--postgres-host",
         "{{ var.value.mozaggregator_postgres_host }}",
         "--postgres-ro-host",
         "{{ var.value.mozaggregator_postgres_ro_host }}",
         "--num-partitions",
         str(10 * 32),
     ] + ([
         "--source", "bigquery", "--project-id", "moz-fx-data-shared-prod"
     ] if not EXPORT_TO_AVRO else [
         "--source",
         "avro",
         "--avro-prefix",
         "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease/moz-fx-data-shared-prod",
     ]),
     gcp_conn_id=gcp_conn.gcp_conn_id,
     service_account=client_email,
     artifact_bucket=artifact_bucket,
     storage_bucket=storage_bucket,
     default_args=subdag_args,
 ),
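Example 3 switches its source arguments on an EXPORT_TO_AVRO flag that is defined elsewhere in the DAG file. A minimal sketch, assuming the flag is driven by an Airflow Variable (the variable name here is hypothetical):

from airflow.models import Variable

# Hypothetical toggle: read aggregates from Avro dumps instead of BigQuery.
# Defaults to False when the Variable is not set.
EXPORT_TO_AVRO = Variable.get(
    "mozaggregator_export_to_avro", default_var="false"
).lower() == "true"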
Example 4
    task_id="public_data_hardware_report",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="public_data_hardware_report",
        default_args=default_args,
        cluster_name="public-data-hardware-report-{{ ds }}",
        job_name="Firefox_Public_Data_Hardware_Report-{{ ds }}",
        python_driver_code="gs://{}/jobs/moz_dataproc_runner.py".format(params.artifact_bucket),
        init_actions_uris=["gs://dataproc-initialization-actions/python/pip-install.sh"],
        additional_metadata={'PIP_PACKAGES': "git+https://github.com/mozilla/firefox-public-data-report-etl.git"},
        additional_properties={"spark:spark.jars":"gs://spark-lib/bigquery/spark-bigquery-latest.jar",
                               "spark-env:AWS_ACCESS_KEY_ID": aws_access_key,
                               "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key},
        py_args=[
            "public_data_report",
            "hardware_report",
            "--date_from", "{{ ds }}",
            "--bq_table", "moz-fx-data-shared-prod.telemetry_derived.public_data_report_hardware",
            "--temporary_gcs_bucket", params.storage_bucket,
            "--s3_bucket", "telemetry-public-analysis-2",
            "--s3_path", "public-data-report/hardware/",
        ],
        idle_delete_ttl='14400',
        num_workers=2,
        worker_machine_type='n1-standard-4',
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        storage_bucket=params.storage_bucket,
    )
)
Example 5
 subdag=moz_dataproc_pyspark_runner(
     parent_dag_name=dag.dag_id,
     dag_name=task_id,
     job_name="bgbb_pred_dataproc",
     cluster_name="bgbb-pred-{{ ds_nodash }}",
     idle_delete_ttl="600",
     num_workers=10,
     worker_machine_type="n1-standard-8",
     init_actions_uris=[
         "gs://dataproc-initialization-actions/python/pip-install.sh"
     ],
     additional_properties={
         "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
     },
     additional_metadata={
         "PIP_PACKAGES": "git+https://github.com/wcbeard/bgbb_airflow.git"
     },
     python_driver_code="gs://{}/jobs/bgbb_runner.py".format(params.artifact_bucket),
     py_args=[
         "bgbb_pred",
         "--submission-date",
         "{{ ds }}",
         "--model-win",
         "90",
         "--sample-ids",
         "[42]" if params.is_dev else "[]",
         "--source",
         "bigquery",
         "--view-materialization-project",
         params.project_id if params.is_dev else "moz-fx-data-shared-prod",
         "--view-materialization-dataset",
         "analysis",
         "--bucket-protocol",
         "gs",
         "--param-bucket",
         params.output_bucket,
         "--param-prefix",
         "bgbb/params/v1",
         "--pred-bucket",
         params.output_bucket,
         "--pred-prefix",
         "bgbb/active_profiles/v1",
     ],
     gcp_conn_id=params.conn_id,
     service_account=params.client_email,
     artifact_bucket=params.artifact_bucket,
     storage_bucket=params.storage_bucket,
     default_args=subdag_args,
 ),
Example 6
 subdag=moz_dataproc_pyspark_runner(
     parent_dag_name=dag.dag_id,
     dag_name="hardware_report",
     default_args=default_args,
     cluster_name=cluster_name,
     job_name="Firefox_Hardware_Report",
     python_driver_code=
     "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/hardware_report.py",
     init_actions_uris=[
         "gs://dataproc-initialization-actions/python/pip-install.sh"
     ],
     additional_metadata={
         'PIP_PACKAGES':
         "google-cloud-bigquery==1.21.0 python_moztelemetry==0.10.2 boto3==1.9.87 click==6.7 click_datetime==0.2 requests-toolbelt==0.8.0 requests==2.20.1 typing==3.6.4"
     },
     additional_properties={
         "spark:spark.jars":
         "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
         "spark-env:AWS_ACCESS_KEY_ID": aws_access_key,
         "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key
     },
     py_args=[
         "--start_date",
         DS_WEEKLY,
         "--bucket",
         "telemetry-public-analysis-2",
         "--spark-provider",
         "dataproc",
     ],
     idle_delete_ttl='14400',
     num_workers=15,
     worker_machine_type='n1-standard-4',
     gcp_conn_id=gcp_conn_id))
Example 7
# Spark job reads gcs json and writes gcs parquet
crash_report_parquet = SubDagOperator(
    task_id="crash_report_parquet",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="crash_report_parquet",
        default_args=default_args,
        cluster_name=cluster_name,
        job_name="Socorro_Crash_Reports_to_Parquet",
        python_driver_code=
        "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/socorro_import_crash_data.py",
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--source-gcs-path",
            "gs://{}/v1/crash_report".format(gcs_data_bucket),
            "--dest-gcs-path",
            "gs://{}/{}".format(gcs_data_bucket, dataset),
        ],
        idle_delete_ttl="14400",
        num_workers=8,
        worker_machine_type="n1-standard-8",
        aws_conn_id=read_aws_conn_id,
        gcp_conn_id=gcp_conn_id,
    ),
)

bq_gcp_conn_id = "google_cloud_derived_datasets"
bq_connection = GoogleCloudBaseHook(gcp_conn_id=bq_gcp_conn_id)
Example 8
    dag=dag)

taar_dynamo_job = SubDagOperator(
    task_id="taar_dynamo_job",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="taar_dynamo_job",
        default_args=default_args,
        master_machine_type='n1-standard-32',
        worker_machine_type='n1-standard-32',
        cluster_name=taar_dynamo_cluster_name,
        job_name="TAAR_Dynamo",
        python_driver_code=
        "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_dynamo.py",
        num_workers=12,
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--aws_access_key_id",
            taar_aws_access_key,
            "--aws_secret_access_key",
            taar_aws_secret_key,
        ],
        aws_conn_id=taar_aws_conn_id,
        gcp_conn_id=taar_gcpdataproc_conn_id,
        master_disk_type='pd-ssd',
        worker_disk_type='pd-ssd',
    ),
    dag=dag,
)

taar_locale = SubDagOperator(
Example 9

taar_locale = SubDagOperator(
    task_id="taar_locale",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="taar_locale",
        default_args=default_args,
        cluster_name=taar_locale_cluster_name,
        job_name="TAAR_Locale",
        python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_locale.py",
        num_workers=12,
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--aws_access_key_id",
            taar_aws_access_key,
            "--aws_secret_access_key",
            taar_aws_secret_key,
            "--bucket",
            "telemetry-private-analysis-2",
            "--prefix",
            "taar/locale/",
        ],
        aws_conn_id=taar_aws_conn_id,
        gcp_conn_id=taar_gcpdataproc_conn_id,
    ),
    dag=dag,
)

taar_similarity_args = default_args.copy()
Example 10
 subdag=moz_dataproc_pyspark_runner(
     parent_dag_name=dag.dag_id,
     image_version="1.5",
     dag_name="graphics_trends",
     default_args=default_args,
     cluster_name="graphics-trends-{{ ds }}",
     job_name="graphics-trends",
     python_driver_code=
     "https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/graphics/graphics_telemetry_trends.py",
     init_actions_uris=[
         "gs://dataproc-initialization-actions/python/pip-install.sh"
     ],
     additional_metadata={'PIP_PACKAGES': " ".join(PIP_PACKAGES)},
     additional_properties={
         "spark:spark.jars":
         "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
         "spark-env:AWS_ACCESS_KEY_ID": aws_access_key,
         "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key
     },
     py_args=[
         "--s3-bucket",
         S3_BUCKET,
         "--s3-prefix",
         S3_PREFIX,
         "--weekly-fraction",
         "0.003",
     ],
     idle_delete_ttl="14400",
     num_workers=2,
     worker_machine_type="n1-standard-4",
     gcp_conn_id=params.conn_id,
     service_account=params.client_email,
     storage_bucket=params.storage_bucket,
 ))
Example 11
 subdag=moz_dataproc_pyspark_runner(
     parent_dag_name=taar_weekly.dag_id,
     dag_name="taar_ensemble",
     default_args=default_args_weekly,
     cluster_name=taar_ensemble_cluster_name,
     job_name="TAAR_ensemble",
     python_driver_code=
     "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_ensemble.py",
     additional_properties={
         "spark:spark.jars":
         "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
         "spark:spark.hadoop.fs.s3a.access.key": taar_aws_access_key,
         "spark:spark.hadoop.fs.s3a.secret.key": taar_aws_secret_key,
         "spark:spark.jars.packages":
         "org.apache.spark:spark-avro_2.11:2.4.4",
         "spark:spark.python.profile": "true",
     },
     num_workers=35,
     worker_machine_type="n1-standard-8",
     master_machine_type="n1-standard-8",
     init_actions_uris=[
         "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/pip-install.sh"
     ],
     additional_metadata={
         "PIP_PACKAGES":
         "mozilla-taar3==0.4.12 mozilla-srgutil==0.2.1 python-decouple==3.1 click==7.0 boto3==1.7.71 dockerflow==2018.4.0"
     },
     optional_components=["ANACONDA", "JUPYTER"],
     py_args=[
         "--date",
         "{{ ds_nodash }}",
         "--aws_access_key_id",
         taar_aws_access_key,
         "--aws_secret_access_key",
         taar_aws_secret_key,
         "--sample_rate",
         "0.005",
     ],
     aws_conn_id=taar_aws_conn_id,
     gcp_conn_id=taar_gcpdataproc_conn_id,
     master_disk_type="pd-ssd",
     worker_disk_type="pd-ssd",
     master_disk_size=1024,
     worker_disk_size=1024,
     master_num_local_ssds=2,
     worker_num_local_ssds=2,
 ),
Example 12
# Required to write json output back to s3://telemetry-public-analysis-2/app-update/data/out-of-date/
write_aws_conn_id='aws_dev_telemetry_public_analysis_2_rw'
aws_access_key, aws_secret_key, session = AwsHook(write_aws_conn_id).get_credentials()

crash_report_parquet = SubDagOperator(
    task_id="update_orphaning_dashboard_etl",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="update_orphaning_dashboard_etl",
        default_args=default_args,
        cluster_name=cluster_name,
        job_name="update_orphaning_dashboard_etl",
        python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/update_orphaning_dashboard_etl.py",
        init_actions_uris=["gs://dataproc-initialization-actions/python/pip-install.sh"],
        additional_metadata={'PIP_PACKAGES': "google-cloud-bigquery==1.20.0 google-cloud-storage==1.19.1 boto3==1.9.253"},
        additional_properties={"spark:spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.3"},
        py_args=[
            "--run-date", DS_WEEKLY,
            "--gcs-bucket", "moz-fx-data-derived-datasets-analysis",
            "--gcs-prefix", "update-orphaning-airflow",
            "--s3-output-bucket", "telemetry-public-analysis-2",
            "--s3-output-path", "app-update/data/out-of-date/",
            "--aws-access-key-id", aws_access_key,
            "--aws-secret-access-key", aws_secret_key
        ],
        idle_delete_ttl='14400',
        num_workers=20,
        worker_machine_type='n1-standard-8',
        gcp_conn_id=gcp_conn_id)
)
Example 13
 subdag=moz_dataproc_pyspark_runner(
     parent_dag_name=taar_weekly.dag_id,
     dag_name="taar_ensemble",
     default_args=default_args_weekly,
     cluster_name=taar_ensemble_cluster_name,
     job_name="TAAR_ensemble",
     # GCS bucket for testing is located in `cfr-personalization-experiment` project
     # python_driver_code="gs://taar_models/tmp/jobs/taar_ensemble.py",
     python_driver_code=
     "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_ensemble.py",
     additional_properties={
         "spark:spark.jars":
         "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
         "spark:spark.jars.packages":
         "org.apache.spark:spark-avro_2.11:2.4.4",
         "spark:spark.python.profile": "true",
     },
     num_workers=35,
     worker_machine_type="n1-standard-8",
     master_machine_type="n1-standard-8",
     init_actions_uris=[
         "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/pip-install.sh"
     ],
     additional_metadata={
         "PIP_PACKAGES":
         "mozilla-taar3==1.0.7 python-decouple==3.1 click==7.0 "
         "google-cloud-storage==1.19.1"
     },
     optional_components=["ANACONDA", "JUPYTER"],
     py_args=[
         "--date",
         "{{ ds_nodash }}",
         "--gcs_model_bucket",
         TAAR_ETL_MODEL_STORAGE_BUCKET,
         "--sample_rate",
         "0.005",
     ],
     gcp_conn_id=taar_gcpdataproc_conn_id,
     master_disk_type="pd-ssd",
     worker_disk_type="pd-ssd",
     master_disk_size=1024,
     worker_disk_size=1024,
     master_num_local_ssds=2,
     worker_num_local_ssds=2,
 ),
Example 14
 subdag=moz_dataproc_pyspark_runner(
     parent_dag_name=dag.dag_id,
     dag_name=task_id,
     job_name="prerelease_aggregates_cloudsql",
     cluster_name="prerelease-telemetry-aggregates-cloudsql-{{ ds_nodash }}",
     idle_delete_ttl="600",
     zone="us-west2-a",
     subnetwork_uri="default",
     internal_ip_only=True,
     num_workers=10,
     worker_machine_type="n1-standard-8",
     init_actions_uris=[
         "gs://dataproc-initialization-actions/python/pip-install.sh"
     ],
     additional_properties={
         "spark:spark.jars":
         "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
     },
     additional_metadata={
         "PIP_PACKAGES":
         "git+https://github.com/mozilla/python_mozaggregator.git"
     },
     python_driver_code="gs://{}/jobs/mozaggregator_runner.py".format(
         artifact_bucket),
     py_args=[
         "aggregator",
         "--date",
         "{{ ds_nodash }}",
         "--channels",
         "nightly,aurora,beta",
         "--postgres-db",
         "telemetry",
         "--postgres-user",
         "root",
         "--postgres-pass",
         "{{ var.value.mozaggregator_cloudsql_pass }}",
         "--postgres-host",
         "{{ var.value.mozaggregator_cloudsql_host }}",
         "--postgres-ro-host",
         "{{ var.value.mozaggregator_cloudsql_ro_host }}",
         "--num-partitions",
         str(10 * 32),
         "--source",
         "bigquery",
         "--project-id",
         "moz-fx-data-shared-prod",
     ],
     gcp_conn_id=gcp_conn.gcp_conn_id,
     service_account=client_email,
     artifact_bucket=artifact_bucket,
     storage_bucket=storage_bucket,
     default_args=subdag_args,
 ),
Example 15
    dag=dag,
)

taar_lite = SubDagOperator(
    task_id="taar_lite",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name="taar_amodump",
        dag_name="taar_lite",
        default_args=default_args,
        cluster_name=taarlite_cluster_name,
        job_name="TAAR_Lite_GUID_GUID",
        python_driver_code=
        "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_lite_guidguid.py",
        # python_driver_code="gs://temp-hwoo-removemelater/taar_lite_guidguid.py",
        num_workers=8,
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--aws_access_key_id",
            aws_access_key,
            "--aws_secret_access_key",
            aws_secret_key,
        ],
        aws_conn_id=aws_conn_id,
        gcp_conn_id=gcpdataproc_conn_id,
    ),
    dag=dag,
)
# Set a dependency on amodump from amowhitelist
amowhitelist.set_upstream(amodump)
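The set_upstream call above is equivalent to Airflow's bitshift dependency syntax:

# Equivalent: amodump must finish before amowhitelist runs.
amodump >> amowhitelist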
Example 16
 subdag=moz_dataproc_pyspark_runner(
     parent_dag_name=dag.dag_id,
     image_version="1.5",
     dag_name="modules_with_missing_symbols",
     default_args=default_args,
     cluster_name="modules-with-missing-symbols-{{ ds }}",
     job_name="modules-with-missing-symbols",
     python_driver_code=
     "https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/symbolication/modules_with_missing_symbols.py",
     init_actions_uris=[
         "gs://dataproc-initialization-actions/python/pip-install.sh"
     ],
     additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)},
     additional_properties={
         "spark:spark.jars":
         "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
         "spark-env:AWS_ACCESS_KEY_ID": ses_access_key,
         "spark-env:AWS_SECRET_ACCESS_KEY": ses_secret_key,
     },
     py_args=[
         "--run-on-days",
         "0",  # run monday
         "--date",
         "{{ ds }}"
     ],
     idle_delete_ttl="14400",
     num_workers=2,
     worker_machine_type="n1-standard-4",
     gcp_conn_id=params.conn_id,
     service_account=params.client_email,
     storage_bucket=params.storage_bucket,
 ),
Example 17
 subdag=moz_dataproc_pyspark_runner(
     parent_dag_name=dag.dag_id,
     image_version="1.5",
     dag_name="bhr_collection",
     default_args=default_args,
     cluster_name="bhr-collection-{{ ds }}",
     job_name="bhr-collection",
     python_driver_code=
     "https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/bhr_collection/bhr_collection.py",
     init_actions_uris=[
         "gs://dataproc-initialization-actions/python/pip-install.sh"
     ],
     additional_metadata={
         "PIP_PACKAGES": "boto3==1.16.20 click==7.1.2"
     },
     additional_properties={
         "spark:spark.jars":
         "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
         "spark-env:AWS_ACCESS_KEY_ID": aws_access_key,
         "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key
     },
     py_args=[
         "--date",
         "{{ ds }}",
         "--sample-size",
         "0.5",
     ],
     idle_delete_ttl="14400",
     num_workers=6,
     worker_machine_type="n1-highmem-4",
     gcp_conn_id=params.conn_id,
     service_account=params.client_email,
     storage_bucket=params.storage_bucket,
 ))
Example 18
    pool="DATA_ENG_EXTERNALTASKSENSOR",
    email_on_retry=False,
    dag=dag)

taar_locale = SubDagOperator(
    task_id="taar_locale",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="taar_locale",
        default_args=default_args,
        cluster_name=taar_locale_cluster_name,
        job_name="TAAR_Locale",
        python_driver_code=
        "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_locale.py",
        # GCS bucket for testing is located in `cfr-personalization-experiment` project
        # python_driver_code="gs://taar_models/tmp/jobs/taar_locale.py",
        num_workers=12,
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--bucket",
            TAAR_ETL_MODEL_STORAGE_BUCKET,
            "--prefix",
            "taar/locale",
        ],
        gcp_conn_id=taar_gcpdataproc_conn_id),
    dag=dag)

taar_similarity = SubDagOperator(
    task_id="taar_similarity",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,