Example #1
def add_search_rollup(dag, mode, instance_count, upstream=None):
    """Create a search rollup for a particular date date

    This can be called with an optional task passed into `upstream`. The rollup
    job will inherit the default values of the referenced DAG.
    """

    search_rollup = EMRSparkOperator(
        task_id="search_rollup_{}".format(mode),
        job_name="{} search rollup".format(mode).title(),
        owner="*****@*****.**",
        email=[
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
        ],
        execution_timeout=timedelta(hours=4),
        instance_count=instance_count,
        disable_on_dev=True,
        env=mozetl_envvar(
            "search_rollup", {
                "start_date": "{{ ds_nodash }}",
                "mode": mode,
                "bucket": "net-mozaws-prod-us-west-2-pipeline-analysis",
                "prefix": "spenrose/search/to_vertica",
            }),
        uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
        dag=dag)

    if upstream:
        search_rollup.set_upstream(upstream)
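
A minimal usage sketch for the helper above. The DAG name, modes, instance
counts, and the DummyOperator gate are hypothetical illustrations; only
`add_search_rollup` itself comes from the example:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

default_args = {
    "owner": "*****@*****.**",
    "depends_on_past": False,
    "start_date": datetime(2018, 11, 26),
    "retries": 3,
    "retry_delay": timedelta(minutes=30),
}

dag = DAG("search_rollup_example", default_args=default_args,
          schedule_interval="@daily")

# Hypothetical gate task standing in for whatever the rollups should wait on.
wait_for_main_summary = DummyOperator(task_id="wait_for_main_summary", dag=dag)

# One rollup task per mode; the second one is chained after the gate task.
add_search_rollup(dag, mode="daily", instance_count=5)
add_search_rollup(dag, mode="monthly", instance_count=10,
                  upstream=wait_for_main_summary)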
Example #2
def add_search_rollup(dag, mode, instance_count, upstream=None):
    """Create a search rollup for a particular date date"""

    search_rollup = EMRSparkOperator(
        task_id="search_rollup_{}".format(mode),
        job_name="{} search rollup".format(mode).title(),
        execution_timeout=timedelta(hours=4),
        instance_count=instance_count,
        env=mozetl_envvar("search_rollup", {
            "start_date": "{{ ds_nodash }}",
            "mode": mode,
            "bucket": "net-mozaws-prod-us-west-2-pipeline-analysis",
            "prefix": "spenrose/search/to_vertica",
        }),
        uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
        dag=dag
    )

    if upstream:
        search_rollup.set_upstream(upstream)
    job_name="A placeholder for the implicit clients daily dependency",
    dag=dag,
)

bgbb_fit = MozDatabricksSubmitRunOperator(
    task_id="bgbb_fit",
    job_name="Fit parameters for a BGBB model to determine active profiles",
    execution_timeout=timedelta(hours=2),
    instance_count=3,
    env=mozetl_envvar(
        "bgbb_fit",
        {
            "submission-date": "{{ next_ds }}",
            "model-win": "120",
            "start-params": "[0.387, 0.912, 0.102, 1.504]",
            "sample-ids": "[42]",
            "sample-fraction": "1.0",
            "penalizer-coef": "0.01",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "prefix": "bgbb/params/v1",
        },
        dev_options={"model-win": "30"},
        other={
            "MOZETL_GIT_PATH": "https://github.com/wcbeard/bgbb_airflow.git",
            "MOZETL_EXTERNAL_MODULE": "bgbb_airflow",
        },
    ),
    dag=dag,
)
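
# Note on the mozetl_envvar call above (inferred from its usage here, not from
# mozetl's documentation): `dev_options` appears to override selected option
# values when the job runs in the development environment (a 30-day model
# window instead of 120), while `other` injects extra environment variables
# such as MOZETL_GIT_PATH and MOZETL_EXTERNAL_MODULE so the submit harness can
# pull the job code from an external repository.
#
# `clients_daily_v6_dummy` below is the placeholder task for the implicit
# clients_daily dependency, so the BGBB fit starts only after it has run.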

clients_daily_v6_dummy >> bgbb_fit
        replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
    ),
    task_id="main_summary_bigquery_load",
    dag=dag)

engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
                      options={
                          "input_bucket":
                          "{{ task.__class__.private_output_bucket }}",
                          "output_bucket":
                          "net-mozaws-prod-us-west-2-pipeline-analysis"
                      },
                      dev_options={
                          "output_bucket":
                          "{{ task.__class__.private_output_bucket }}"
                      }),
    uri=
    "https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag)

addons = EMRSparkOperator(
    task_id="addons",
    job_name="Addons View",
    execution_timeout=timedelta(hours=4),
    instance_count=3,
    rename={"submission_date_s3": "submission_date"},
    replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
),
                                           task_id="main_events_bigquery_load",
                                           dag=dag)

addon_aggregates = EMRSparkOperator(
    task_id="addon_aggregates",
    job_name="Addon Aggregates View",
    execution_timeout=timedelta(hours=8),
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    instance_count=10,
    env=mozetl_envvar(
        "addon_aggregates", {
            "date": "{{ ds_nodash }}",
            "input-bucket": "{{ task.__class__.private_output_bucket }}",
            "output-bucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)

addon_aggregates_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="addon_aggregates_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="addons/agg",
        dataset_version="v2",
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('taar_amodump', default_args=default_args, schedule_interval='@daily')

amodump = EMRSparkOperator(
    task_id="taar_amodump",
    job_name="Dump AMO JSON blobs with oldest creation date",
    execution_timeout=timedelta(hours=1),
    instance_count=1,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("taar_amodump",
                      {"date": "{{ ds_nodash }}"},
                      {'MOZETL_SUBMISSION_METHOD': 'python'}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag
)

amowhitelist = EMRSparkOperator(
    task_id="taar_amowhitelist",
    job_name="Generate a whitelisted set of addons for TAAR",
    execution_timeout=timedelta(hours=1),
    instance_count=1,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("taar_amowhitelist",
                      {},
from airflow import DAG
from airflow.operators.moz_databricks import MozDatabricksSubmitRunOperator
from datetime import datetime, timedelta
from utils.mozetl import mozetl_envvar

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('tab_spinner_severity', default_args=default_args, schedule_interval='@daily')

update_tab_spinner_severity = MozDatabricksSubmitRunOperator(
    task_id="update_tab_spinner_severity",
    job_name="Tab Spinner Severity Job",
    execution_timeout=timedelta(hours=12),
    instance_count=12,
    env=mozetl_envvar("long_tab_spinners", {}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag
)

Example #8
    email=[
        "*****@*****.**", "*****@*****.**",
        "*****@*****.**"
    ],
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.public_output_bucket }}"
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/hardware_report.sh",
    output_visibility="public",
    dag=dag)

taar_lite_guidranking = EMRSparkOperator(
    task_id="taar_lite_guidranking",
    job_name="TAARlite Addon Ranking",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    execution_timeout=timedelta(hours=2),
    instance_count=4,
    env=mozetl_envvar("taar_lite_guidranking", {"date": "{{ ds_nodash }}"},
                      {'MOZETL_SUBMISSION_METHOD': 'spark'}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag)
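
# All three jobs below are gated on the longitudinal task: each of them is set
# to run only after longitudinal has completed.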

addon_recommender.set_upstream(longitudinal)
game_hw_survey.set_upstream(longitudinal)
taar_lite_guidranking.set_upstream(longitudinal)
Example #9
            "bucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

addon_aggregates = EMRSparkOperator(
    task_id="addon_aggregates",
    job_name="Addon Aggregates View",
    execution_timeout=timedelta(hours=8),
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    instance_count=10,
    env=mozetl_envvar(
        "addon_aggregates", {
            "date": "{{ ds_nodash }}",
            "output-bucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)

txp_mau_dau = EMRSparkOperator(
    task_id="txp_mau_dau",
    job_name="Test Pilot MAU DAU",
    execution_timeout=timedelta(hours=4),
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    instance_count=5,
    env={
        "date": "{{ ds_nodash }}",
Example #10
        dataset_version="v1",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_derived",
    ),
    task_id="sync_flat_view_bigquery_load",
    dag=dag)

sync_bookmark_validation = EMRSparkOperator(
    task_id="sync_bookmark_validation",
    job_name="Sync Bookmark Validation",
    execution_timeout=timedelta(hours=2),
    instance_count=1,
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar(
        "sync_bookmark_validation", {
            "start_date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
        }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)

sync_bookmark_validation_total_per_day_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="sync_bookmark_validation_total_per_day_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        p2b_table_alias="sync_bmk_total_per_day_v1",
        dataset="sync/bmk_total_per_day",
Example #11
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('churn', default_args=default_args, schedule_interval='0 0 * * 3')

churn = EMRSparkOperator(
    task_id="churn",
    job_name="churn 7-day v3",
    execution_timeout=timedelta(hours=4),
    instance_count=10,
    env=mozetl_envvar("churn", {
        "start_date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag)

churn_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="churn_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="churn",
        dataset_version="v3",
        date_submission_col="week_start",
Example #12
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 9, 10),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('landfill', default_args=default_args, schedule_interval='0 1 * * *')

landfill_sampler = MozDatabricksSubmitRunOperator(
    task_id="landfill_sampler",
    job_name="Landfill Sampler",
    execution_timeout=timedelta(hours=2),
    instance_count=3,
    iam_role="arn:aws:iam::144996185633:instance-profile/databricks-ec2-landfill",
    env=mozetl_envvar(
        "landfill_sampler", {
            "submission-date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "prefix": "sanitized-landfill-sample",
        }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)
    instance_count=5,
    env={"date": DS_WEEKLY},
    uri="https://raw.githubusercontent.com/mozilla/distribution-viewer/master/notebooks/aggregate-and-import.py",
    dag=dag)

taar_locale_job = EMRSparkOperator(
    task_id="taar_locale_job",
    job_name="TAAR Locale Model",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    execution_timeout=timedelta(hours=10),
    instance_count=5,
    env=mozetl_envvar(
        "taar_locale", {
            "date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "prefix": "taar/locale/"
        }),
    release_label="emr-5.8.0",
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag)

taar_legacy_job = EMRSparkOperator(
    task_id="taar_legacy_job",
    job_name="TAAR Legacy Model",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    execution_timeout=timedelta(hours=1),
    instance_count=1,
Example #14
        "glue_secret_access_key": "{{ var.value.glue_secret_access_key }}",
        "glue_default_region": "{{ var.value.glue_default_region }}",
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/update_glue.sh",
    dag=dag)

taar_dynamo = EMRSparkOperator(
    task_id="taar_dynamo",
    job_name="TAAR DynamoDB loader",
    execution_timeout=timedelta(hours=14),
    instance_count=6,
    disable_on_dev=True,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("taar_dynamo", {"date": "{{ ds_nodash }}"}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag)

taar_locale_job = SubDagOperator(
    task_id="taar_locale_job",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="taar_locale_job",
        default_args=default_args,
        cluster_name=taar_locale_cluster_name,
        job_name="TAAR_Locale",
        python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_locale.py",
    task_id="game_hw_survey",
    job_name="Firefox Hardware Report",
    execution_timeout=timedelta(hours=5),
    instance_count=15,
    owner="*****@*****.**",
    depends_on_past=True,
    email=["*****@*****.**", "*****@*****.**",
           "*****@*****.**"],
    env={"date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.public_output_bucket }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/hardware_report.sh",
    output_visibility="public",
    dag=dag)


taar_lite_guidranking = EMRSparkOperator(
    task_id="taar_lite_guidranking",
    job_name="TAARlite Addon Ranking",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    execution_timeout=timedelta(hours=2),
    instance_count=4,
    env=mozetl_envvar("taar_lite_guidranking",
                      {"date": "{{ ds_nodash }}"},
                      {'MOZETL_SUBMISSION_METHOD': 'spark'}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag)

game_hw_survey.set_upstream(longitudinal)
taar_lite_guidranking.set_upstream(longitudinal)
Example #16
          schedule_interval="@daily")

mobile_aggregate_view = MozDatabricksSubmitRunOperator(
    task_id="mobile_aggregate_view",
    job_name="Mobile Aggregate View",
    release_label="6.1.x-scala2.11",
    instance_count=5,
    execution_timeout=timedelta(hours=12),
    env=mozetl_envvar(
        "mobile",
        {
            "date": "{{ ds_nodash }}",
            "channels": "nightly",
            "output":
            "s3://{{ task.__class__.private_output_bucket }}/mobile_metrics_aggregates/v2",
            "num-partitions": 5 * 32
        },
        other={
            "MOZETL_GIT_PATH":
            "https://github.com/mozilla/python_mozaggregator.git",
            "MOZETL_EXTERNAL_MODULE": "mozaggregator",
        },
    ),
    dag=dag,
)

register_status(
    mobile_aggregate_view,
    "Mobile Aggregates",
    "Aggregates of metrics sent through the mobile-events pings.",
)
    execution_delta=timedelta(
        days=-7, hours=-1
    ),  # main_summary waits one hour, execution date is beginning of the week
    dag=taar_weekly,
)

taar_ensemble = MozDatabricksSubmitRunOperator(
    task_id="taar_ensemble",
    job_name="TAAR Ensemble Model",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    execution_timeout=timedelta(hours=11),
    instance_count=5,
    instance_type="i3.2xlarge",
    spot_bid_price_percent=100,
    max_instance_count=60,
    enable_autoscale=True,
    pypi_libs=[
        "mozilla-taar3==0.4.5",
        "mozilla-srgutil==0.1.10",
        "python-decouple==3.1",
    ],
    env=mozetl_envvar("taar_ensemble", {"date": "{{ ds_nodash }}"}),
    start_date=datetime(2019, 7, 14),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-databricks.py",
    output_visibility="private",
)

taar_ensemble.set_upstream(wait_for_clients_daily)
    schedule_interval="@daily",
)

prerelease_telemetry_aggregate_view = MozDatabricksSubmitRunOperator(
    task_id="prerelease_telemetry_aggregate_view",
    job_name="Prerelease Telemetry Aggregate View",
    instance_count=10,
    dev_instance_count=10,
    execution_timeout=timedelta(hours=12),
    python_version=2,
    env=mozetl_envvar(
        "aggregator",
        {
            "date": "{{ ds_nodash }}",
            "channels": "nightly,aurora,beta",
            "credentials-bucket": "telemetry-spark-emr-2",
            "credentials-prefix": "aggregator_database_envvars.json",
            "num-partitions": 10 * 32,
        },
        dev_options={
            "credentials-prefix": "aggregator_dev_database_envvars.json"
        },
        other={
            "MOZETL_GIT_PATH":
            "https://github.com/mozilla/python_mozaggregator.git",
            "MOZETL_EXTERNAL_MODULE": "mozaggregator",
        },
    ),
    dag=dag,
)
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="sync_flat_summary",
        dataset_version="v1",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="sync_flat_view_bigquery_load",
    dag=dag)

sync_bookmark_validation = EMRSparkOperator(
    task_id="sync_bookmark_validation",
    job_name="Sync Bookmark Validation",
    execution_timeout=timedelta(hours=2),
    instance_count=1,
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("sync_bookmark_validation", {
        "start_date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)


sync_bookmark_validation.set_upstream(sync_view)
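
# Each of the three BigQuery load subdags below runs after its corresponding
# sync view task.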

sync_view_bigquery_load.set_upstream(sync_view)

sync_events_view_bigquery_load.set_upstream(sync_events_view)

sync_flat_view_bigquery_load.set_upstream(sync_flat_view)
dag = DAG(
    "telemetry_aggregates_parquet",
    default_args=default_args,
    schedule_interval="@daily",
)

telemetry_aggregate_parquet_view = MozDatabricksSubmitRunOperator(
    task_id="telemetry_aggregate_parquet_view",
    job_name="Telemetry Aggregate Parquet View",
    instance_count=5,
    execution_timeout=timedelta(hours=12),
    python_version=2,
    env=mozetl_envvar(
        "parquet",
        {
            "date":
            "{{ ds_nodash }}",
            "channels":
            "nightly",
            "output":
            "s3://{{ task.__class__.private_output_bucket }}/aggregates_poc/v1",
        },
        other={
            "MOZETL_GIT_PATH":
            "https://github.com/mozilla/python_mozaggregator.git",
            "MOZETL_EXTERNAL_MODULE": "mozaggregator",
        },
    ),
    dag=dag,
)
Example #21
main_summary_schema = EmailSchemaChangeOperator(
    task_id="main_summary_schema",
    email=["*****@*****.**", "*****@*****.**"],
    to=["*****@*****.**", "*****@*****.**"],
    key_prefix='schema/main_summary/submission_date_s3=',
    dag=dag)

engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
        options={
            "input_bucket": "{{ task.__class__.private_output_bucket }}",
            "output_bucket": "net-mozaws-prod-us-west-2-pipeline-analysis"
        },
        dev_options={
            "output_bucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag)

addons = EMRSparkOperator(
    task_id="addons",
    job_name="Addons View",
    execution_timeout=timedelta(hours=4),
    instance_count=3,
    env=tbv_envvar("com.mozilla.telemetry.views.AddonsView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        dataset="main_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="main_summary_bigquery_load",
    dag=dag)

engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
        options={
            "input_bucket": "{{ task.__class__.private_output_bucket }}",
            "output_bucket": "net-mozaws-prod-us-west-2-pipeline-analysis"
        },
        dev_options={
            "output_bucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag)

addons = EMRSparkOperator(
    task_id="addons",
    job_name="Addons View",
    execution_timeout=timedelta(hours=4),
    instance_count=3,
    env=tbv_envvar("com.mozilla.telemetry.views.AddonsView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
)

addons_daily = MozDatabricksSubmitRunOperator(
    task_id="addons_daily",
    job_name="Addons Daily",
    execution_timeout=timedelta(hours=4),
    instance_count=10,
    owner="*****@*****.**",
    email=[
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
    ],
    env=mozetl_envvar(
        "addons_report",
        {
            "date": "{{ ds_nodash }}",
            "deploy_environment": "{{ task.__class__.deploy_environment }}",
        },
        other={
            "MOZETL_GIT_PATH": "https://github.com/mozilla/addons_daily.git",
            "MOZETL_EXTERNAL_MODULE": "addons_daily",
        },
    ),
    dag=dag,
)

addons_daily.set_upstream(wait_for_search_clients_daily)
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('taar_amodump', default_args=default_args, schedule_interval='@daily')

amodump = EMRSparkOperator(
    task_id="taar_amodump",
    job_name="Dump AMO JSON blobs with oldest creation date",
    execution_timeout=timedelta(hours=1),
    instance_count=1,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("taar_amodump",
                      {"date": "{{ ds_nodash }}"},
                      {'MOZETL_SUBMISSION_METHOD': 'python'}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag
)

amowhitelist = EMRSparkOperator(
    task_id="taar_amowhitelist",
    job_name="Generate an algorithmically defined set of whitelisted addons for TAAR",
    execution_timeout=timedelta(hours=1),
    instance_count=1,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("taar_amowhitelist",
                      {},
Example #25
    instance_count=5,
    env={"date": DS_WEEKLY},
    uri="https://raw.githubusercontent.com/mozilla/distribution-viewer/master/notebooks/aggregate-and-import.py",
    dag=dag)

taar_locale_job = EMRSparkOperator(
    task_id="taar_locale_job",
    job_name="TAAR Locale Model",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    execution_timeout=timedelta(hours=10),
    instance_count=5,
    env=mozetl_envvar(
        "taar_locale", {
            "date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "prefix": "taar/locale/"
        }),
    release_label="emr-5.8.0",
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag)

taar_legacy_job = EMRSparkOperator(
    task_id="taar_legacy_job",
    job_name="TAAR Legacy Model",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    execution_timeout=timedelta(hours=1),
    instance_count=1,
Example #26
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('churn', default_args=default_args, schedule_interval='0 0 * * 3')

churn = EMRSparkOperator(
    task_id="churn",
    job_name="churn 7-day v3",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    env=mozetl_envvar(
        "churn", {
            "start_date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag)

churn_v2 = EMRSparkOperator(
    task_id="churn_v2",
    job_name="churn 7-day v2",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    env=mozetl_envvar(
        "churn", {
            "start_date": "{{ ds_nodash }}",
Example #27
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator
from utils.mozetl import mozetl_envvar

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('tab_spinner_severity', default_args=default_args, schedule_interval='@daily')

update_tab_spinner_severity = EMRSparkOperator(
    task_id="update_tab_spinner_severity",
    job_name="Tab Spinner Severity Job",
    execution_timeout=timedelta(hours=12),
    instance_count=12,
    env=mozetl_envvar("long_tab_spinners", {}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag
)