def topline_dag(dag, mode, instance_count):
    topline_summary = EMRSparkOperator(
        task_id="topline_summary",
        job_name="Topline Summary View",
        execution_timeout=timedelta(hours=8),
        instance_count=instance_count,
        env={
            "date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "mode": mode
        },
        uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/topline_summary_view.sh",
        dag=dag)

    topline_dashboard = EMRSparkOperator(
        task_id="topline_dashboard",
        job_name="Topline Dashboard",
        execution_timeout=timedelta(hours=2),
        instance_count=1,
        env={"mode": mode},
        uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/topline_dashboard.sh",
        dag=dag)

    topline_dashboard.set_upstream(topline_summary)
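A minimal usage sketch for topline_dag (the DAG name, schedule, owner, and argument values below are illustrative assumptions, not the real callers in telemetry-airflow):

from datetime import datetime, timedelta
from airflow import DAG

default_args = {
    'owner': 'example@example.com',      # hypothetical owner
    'start_date': datetime(2017, 1, 1),  # hypothetical start date
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('topline_release', default_args=default_args, schedule_interval='@weekly')

# "release" and the instance count are illustrative values only.
topline_dag(dag, mode="release", instance_count=10)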
Code Example #2
def add_search_rollup(dag, mode, instance_count, upstream=None):
    """Create a search rollup for a particular date date

    This can be called with an optional task passed into `upstream`. The rollup
    job will inherit the default values of the referenced DAG.
    """

    search_rollup = EMRSparkOperator(
        task_id="search_rollup_{}".format(mode),
        job_name="{} search rollup".format(mode).title(),
        owner="*****@*****.**",
        email=[
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
        ],
        execution_timeout=timedelta(hours=4),
        instance_count=instance_count,
        disable_on_dev=True,
        env=mozetl_envvar(
            "search_rollup", {
                "start_date": "{{ ds_nodash }}",
                "mode": mode,
                "bucket": "net-mozaws-prod-us-west-2-pipeline-analysis",
                "prefix": "spenrose/search/to_vertica",
            }),
        uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
        dag=dag)

    if upstream:
        search_rollup.set_upstream(upstream)
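A hedged usage sketch for the upstream hook; main_summary here is a stand-in for whatever task produces the rollup's input, not the actual dependency:

# Create the rollup and chain it behind an existing task in one call.
add_search_rollup(dag, mode="daily", instance_count=5, upstream=main_summary)

# Or create it unattached and wire the dependency later.
add_search_rollup(dag, mode="monthly", instance_count=5)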
Code Example #3
def add_search_rollup(dag, mode, instance_count, upstream=None):
    """Create a search rollup for a particular date date"""

    search_rollup = EMRSparkOperator(
        task_id="search_rollup_{}".format(mode),
        job_name="{} search rollup".format(mode).title(),
        execution_timeout=timedelta(hours=4),
        instance_count=instance_count,
        env=mozetl_envvar("search_rollup", {
            "start_date": "{{ ds_nodash }}",
            "mode": mode,
            "bucket": "net-mozaws-prod-us-west-2-pipeline-analysis",
            "prefix": "spenrose/search/to_vertica",
        }),
        uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
        dag=dag
    )

    if upstream:
        search_rollup.set_upstream(upstream)
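mozetl_envvar comes from telemetry-airflow's utils and builds the environment consumed by mozetl-submit.sh. A rough sketch of the shape such a helper could take, assuming one MOZETL_-prefixed variable per option (the real implementation differs in details such as dev-option handling):

def mozetl_envvar_sketch(command, options, other=None):
    # Prefix each option so the submit script can reconstruct the CLI,
    # e.g. ("search_rollup", "start_date") -> MOZETL_SEARCH_ROLLUP_START_DATE.
    env = {
        "MOZETL_{}_{}".format(command.upper(), key.upper().replace("-", "_")): value
        for key, value in options.items()
    }
    env["MOZETL_COMMAND"] = command
    env.update(other or {})
    return env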
Code Example #4
File: topline.py  Project: mozilla/telemetry-airflow
def topline_dag(dag, mode, instance_count):
    topline_summary = EMRSparkOperator(
        task_id="topline_summary",
        job_name="Topline Summary View",
        execution_timeout=timedelta(hours=8),
        instance_count=instance_count,
        env={
            "date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "mode": mode
        },
        uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/topline_summary_view.sh",
        dag=dag)

    topline_dashboard = EMRSparkOperator(
        task_id="topline_dashboard",
        job_name="Topline Dashboard",
        execution_timeout=timedelta(hours=2),
        instance_count=1,
        env={"mode": mode},
        uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/topline_dashboard.sh",
        dag=dag)

    topline_dashboard.set_upstream(topline_summary)
Code Example #5
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('sync_log', default_args=default_args, schedule_interval='0 1 * * *')

t0 = EMRSparkOperator(task_id="sync_log",
                      job_name="Sync Log Import",
                      execution_timeout=timedelta(hours=10),
                      release_label="emr-5.0.0",
                      instance_count=10,
                      env={"date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.private_output_bucket }}"},
                      uri="https://raw.githubusercontent.com/mozilla/mozilla-reports/master/etl/sync_log.kp/orig_src/ImportSyncLogs.ipynb",
                      dag=dag)
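# Note: the Jinja values in `env` are standard Airflow macros.  For an
# execution date of 2017-01-01, "{{ ds_nodash }}" renders to "20170101",
# while "{{ task.__class__.private_output_bucket }}" resolves a bucket
# name stored as a class attribute on EMRSparkOperator.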
Code Example #6
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('taar_amodump', default_args=default_args, schedule_interval='@daily')

amodump = EMRSparkOperator(
    task_id="taar_amodump",
    job_name="Dump AMO JSON blobs with oldest creation date",
    execution_timeout=timedelta(hours=1),
    instance_count=1,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("taar_amodump",
                      {"date": "{{ ds_nodash }}"},
                      {'MOZETL_SUBMISSION_METHOD': 'python'}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag
)

amowhitelist = EMRSparkOperator(
    task_id="taar_amowhitelist",
    job_name="Generate an algorithmically defined set of whitelisted addons for TAAR",
    execution_timeout=timedelta(hours=1),
    instance_count=1,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("taar_amowhitelist",
Code Example #7
    "retries": 2,
    "retry_delay": timedelta(minutes=30),
}

dag = DAG("socorro_import",
          default_args=default_args,
          schedule_interval="@daily")

# input: crashstats-telemetry-crashes-prod-us-west-2/v1/crash_report
# output: telemetry-parquet/socorro_crash/v2
crash_report_parquet = EMRSparkOperator(
    task_id="crash_report_parquet",
    job_name="Socorro Crash Reports Parquet",
    execution_timeout=timedelta(hours=4),
    instance_count=10,
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/socorro_import/ImportCrashData.ipynb",
    output_visibility="public",
    dag=dag,
)

register_status(
    crash_report_parquet,
    crash_report_parquet.job_name,
    "Convert processed crash reports into parquet for analysis",
)

crash_report_parquet_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
Code Example #8
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('fx_usage_report', default_args=default_args, schedule_interval='@weekly')

wait_for_main_summary = ExternalTaskSensor(
    task_id='wait_for_main_summary',
    external_dag_id='main_summary',
    external_task_id='main_summary',
    execution_delta=timedelta(days=-7, hours=-1), # main_summary waits one hour, execution date is beginning of the week
    dag=dag)
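# ExternalTaskSensor looks for the run of main_summary whose execution
# date equals this DAG's execution date *minus* execution_delta.  A
# negative delta therefore points forward: here the sensor waits on the
# main_summary run at (week start + 7 days + 1 hour).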

usage_report = EMRSparkOperator(
    task_id="fx_usage_report",
    job_name="Fx Usage Report",
    execution_timeout=timedelta(hours=4),
    instance_count=10,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    env={"date": DS_WEEKLY,
         "bucket": "net-mozaws-prod-us-west-2-data-public",
         "deploy_environment": "{{ task.__class__.deploy_environment }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/fx_usage_report.sh",
    dag=dag)

usage_report.set_upstream(wait_for_main_summary)
Code Example #9
        aws_conn_id="aws_dev_iam_s3",
        dataset="main_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="main_summary_bigquery_load",
    dag=dag)

engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
        options={
            "input_bucket": "{{ task.__class__.private_output_bucket }}",
            "output_bucket": "net-mozaws-prod-us-west-2-pipeline-analysis"
        },
        dev_options={
            "output_bucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag)

addons = EMRSparkOperator(
    task_id="addons",
    job_name="Addons View",
    execution_timeout=timedelta(hours=4),
    instance_count=3,
    env=tbv_envvar("com.mozilla.telemetry.views.AddonsView", {
        "from": "{{ ds_nodash }}",
Code Example #10
    'depends_on_past': False,
    'start_date': datetime(2099, 5, 31),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=10),
}

dag = DAG('example', default_args=default_args, schedule_interval='@daily')

spark = EMRSparkOperator(
    task_id="spark",
    job_name="Spark Example Job",
    instance_count=1,
    execution_timeout=timedelta(hours=4),
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/examples/spark/example_date.ipynb",
    dag=dag)

bash = EMRSparkOperator(
    task_id="bash",
    job_name="Bash Example Job",
    instance_count=1,
    execution_timeout=timedelta(hours=4),
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/examples/spark/example_date.sh",
    dag=dag)
Code Example #11
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('churn', default_args=default_args, schedule_interval='0 0 * * 3')

churn = EMRSparkOperator(
    task_id="churn",
    job_name="churn 7-day v3",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    env=mozetl_envvar(
        "churn", {
            "start_date": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag)

churn_v2 = EMRSparkOperator(
    task_id="churn_v2",
    job_name="churn 7-day v2",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    env=mozetl_envvar(
        "churn", {
            "start_date": "{{ ds_nodash }}",
Code Example #12
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('sync_log', default_args=default_args, schedule_interval='0 1 * * *')

sync_log = EMRSparkOperator(
    task_id="sync_log",
    job_name="Sync Log Import",
    execution_timeout=timedelta(hours=10),
    instance_count=10,
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/ImportSyncLogs.ipynb",
    dag=dag)
Code Example #13
File: churn.py  Project: mozilla/telemetry-airflow
    dag=dag)

churn_v2_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="churn_v2_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="churn",
        dataset_version="v2",
        date_submission_col="week_start",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="churn_v2_bigquery_load",
    dag=dag)

churn_to_csv = EMRSparkOperator(
    task_id="churn_to_csv",
    job_name="Convert Churn v2 to csv",
    execution_timeout=timedelta(hours=4),
    instance_count=1,
    env=mozetl_envvar("churn_to_csv", {"start_date": "{{ ds_nodash }}"}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)

churn_bigquery_load.set_upstream(churn)

churn_to_csv.set_upstream(churn_v2)
churn_v2_bigquery_load.set_upstream(churn_v2)
Code Example #14
from operators.emr_spark_operator import EMRSparkOperator
from utils.constants import DS_WEEKLY

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2017, 5, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('core_client_count',
          default_args=default_args,
          schedule_interval='@weekly')

core_client_count_view = EMRSparkOperator(
    task_id="core_client_count_view",
    job_name="Core Client Count View",
    execution_timeout=timedelta(hours=4),
    instance_count=20,
    env={
        "date": DS_WEEKLY,
        "bucket": "{{ task.__class__.private_output_bucket }}"
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/core_client_count_view.sh",
    dag=dag)
Code Example #15
            "to": DS_WEEKLY
        },
        metastore_location="s3://telemetry-parquet/longitudinal"),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

register_status(longitudinal, "Longitudinal", "A 6-month longitudinal view of client history.")


game_hw_survey = EMRSparkOperator(
    task_id="game_hw_survey",
    job_name="Firefox Hardware Report",
    execution_timeout=timedelta(hours=5),
    instance_count=15,
    owner="*****@*****.**",
    depends_on_past=True,
    email=["*****@*****.**", "*****@*****.**",
           "*****@*****.**"],
    env={"date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.public_output_bucket }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/hardware_report.sh",
    output_visibility="public",
    dag=dag)


taar_lite_guidranking = EMRSparkOperator(
    task_id="taar_lite_guidranking",
    job_name="TAARlite Addon Ranking",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    execution_timeout=timedelta(hours=2),
    instance_count=4,
Code Example #16
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

try:
    dag = DAG('bugzilla_dataset', default_args=default_args, schedule_interval='@daily')

    connection_details = BaseHook.get_connection('bugzilla_db')

    env = {
        "DATABASE_USER": connection_details.login,
        "DATABASE_PASSWORD": connection_details.password,
        "DATABASE_HOST": connection_details.host,
        "DATABASE_PORT": connection_details.port,
        "DATABASE_NAME": connection_details.schema,
    }

    update_bugs = EMRSparkOperator(
        task_id="update_bugs",
        job_name="Bugzilla Dataset Update",
        execution_timeout=timedelta(hours=5),
        instance_count=1,
        env=env,
        uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/bugzilla_dataset.sh",
        dag=dag
    )
except AirflowException:
    pass
Code Example #17
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('mobile_aggregates',
          default_args=default_args,
          schedule_interval='@daily')

mobile_aggregate_view = EMRSparkOperator(
    task_id="mobile_aggregate_view",
    job_name="Mobile Aggregate View",
    instance_count=5,
    execution_timeout=timedelta(hours=12),
    env={
        "date": "{{ ds_nodash }}",
        "channels": "nightly",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    },
    uri=("https://raw.githubusercontent.com/"
         "mozilla/telemetry-airflow/master/jobs/run_mobile_aggregator.sh"),
    dag=dag)

register_status(mobile_aggregate_view, 'Mobile Aggregates',
                'Aggregates of metrics sent through the mobile-events pings.')
Code Example #18
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2016, 9, 20),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('android_events',
          default_args=default_args,
          schedule_interval='@daily')

t0 = EMRSparkOperator(
    task_id="android_events",
    job_name="Update android events",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/android-events/android-events.ipynb",
    output_visibility="public",
    dag=dag)
Code Example #19
    'start_date': datetime(2016, 6, 30),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('longitudinal', default_args=default_args, schedule_interval='@weekly')

t0 = EMRSparkOperator(task_id="longitudinal",
                      job_name="Longitudinal View",
                      execution_timeout=timedelta(hours=10),
                      instance_count=30,
                      env={"date": "{{ ds_nodash }}", "bucket": "{{ task.__class__.airflow_bucket }}"},
                      uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/longitudinal_view.sh",
                      dag=dag)

t1 = EMRSparkOperator(task_id="update_orphaning",
                      job_name="Update Orphaning View",
                      execution_timeout=timedelta(hours=10),
                      instance_count=1,
                      owner="*****@*****.**",
                      email=["*****@*****.**", "*****@*****.**",
                             "*****@*****.**"],
                      env={"date": "{{ ds_nodash }}"},
                      uri="https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/update-orphaning/Update%20orphaning%20analysis%20using%20longitudinal%20dataset.ipynb",
                      dag=dag)

t1.set_upstream(t0)
Code Example #20
    2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('longitudinal',
          default_args=default_args,
          schedule_interval='@weekly')

longitudinal = EMRSparkOperator(
    task_id="longitudinal",
    job_name="Longitudinal View",
    execution_timeout=timedelta(hours=12),
    instance_count=40,
    release_label="emr-5.11.0",
    env={
        "date": DS_WEEKLY,
        "bucket": "{{ task.__class__.private_output_bucket }}"
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/longitudinal_view.sh",
    dag=dag)

addon_recommender = EMRSparkOperator(
    task_id="addon_recommender",
    job_name="Train the Addon Recommender",
    execution_timeout=timedelta(hours=10),
    instance_count=20,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env={
Code Example #21
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('main_summary',
          default_args=default_args,
          schedule_interval='@daily',
          max_active_runs=10)

# Make sure all the data for the given day has arrived before running.
t0 = BashOperator(task_id="delayed_start", bash_command="sleep 1800", dag=dag)
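# (1800 seconds = a 30-minute buffer after midnight for late-arriving data.)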

t1 = EMRSparkOperator(
    task_id="main_summary",
    job_name="Main Summary View",
    execution_timeout=timedelta(hours=10),
    instance_count=10,
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/main_summary_view.sh",
    dag=dag)

# Wait a little while after midnight to start for a given day.
t1.set_upstream(t0)
Code Example #22
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("taar_amodump",
                      {"date": "{{ ds_nodash }}"},
                      {'MOZETL_SUBMISSION_METHOD': 'python'}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag
)

amowhitelist = EMRSparkOperator(
    task_id="taar_amowhitelist",
    job_name="Generate a whitelisted set of addons for TAAR",
    execution_timeout=timedelta(hours=1),
    instance_count=1,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("taar_amowhitelist",
                      {},
                      {'MOZETL_SUBMISSION_METHOD': 'spark'}),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="private",
    dag=dag
)

taar_lite = EMRSparkOperator(
    task_id="taar_lite",
    job_name="Generate GUID coinstallation JSON for TAAR",
    instance_count=5,
    execution_timeout=timedelta(hours=4),
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("taar_lite",
Code Example #23
    'retry_delay': timedelta(minutes=30),
    'bootstrap_args': ['--metrics-provider', 'datadog'],
}

dag = DAG('events_to_amplitude',
          default_args=default_args,
          schedule_interval='0 1 * * *')

focus_events_to_amplitude = EMRSparkOperator(
    task_id="focus_android_events_to_amplitude",
    job_name="Focus Android Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=FOCUS_ANDROID_INSTANCES,
    env={
        "date": "{{ ds_nodash }}",
        "max_requests": FOCUS_ANDROID_INSTANCES * VCPUS_PER_INSTANCE,
        "key_file": key_file("focus_android"),
        "artifact": get_artifact_url(slug, branch="master"),
        "config_filename": "focus_android_events_schemas.json",
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/events_to_amplitude.sh",
    dag=dag)

devtools_prerelease_events_to_amplitude = EMRSparkOperator(
    task_id="devtools_prerelease_events_to_amplitude",
    job_name="DevTools Prerelease Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=DEVTOOLS_INSTANCES,
    email=['*****@*****.**', '*****@*****.**'],
    owner='*****@*****.**',
Code Example #24
File: addons.py  Project: robhudson/telemetry-airflow
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2016, 7, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('addons', default_args=default_args, schedule_interval='@daily')

t0 = EMRSparkOperator(
    task_id="addons",
    job_name="Addons View",
    execution_timeout=timedelta(hours=4),
    release_label="emr-5.0.0",
    instance_count=10,
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/addons_view.sh",
    dag=dag)
Code Example #25
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('first_shutdown_summary',
          default_args=default_args,
          schedule_interval='0 1 * * *')

first_shutdown_summary = EMRSparkOperator(
    task_id="first_shutdown_summary",
    job_name="First Shutdown Summary View",
    execution_timeout=timedelta(hours=1),
    instance_count=1,
    env=tbv_envvar(
        "com.mozilla.telemetry.views.MainSummaryView", {
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "doc-type": "first_shutdown",
            "read-mode": "aligned",
            "input-partition-multiplier": "4"
        }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)
Code Example #26
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 20),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('client_count',
          default_args=default_args,
          schedule_interval='@daily')

client_count_view = EMRSparkOperator(
    task_id="client_count_view",
    job_name="Client Count View",
    execution_timeout=timedelta(hours=10),
    owner="*****@*****.**",
    instance_count=20,
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/client_count_view.sh",
    dag=dag)
Code Example #27
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('mobile_clients', default_args=default_args, schedule_interval='@daily')

mobile_clients = EMRSparkOperator(
    task_id="mobile_clients",
    job_name="Update mobile clients",
    execution_timeout=timedelta(hours=8),
    instance_count=10,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/mobile-clients.ipynb",
    output_visibility="public",
    dag=dag)
Code Example #28
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': True,
    'start_date': datetime(2016, 6, 29),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('telemetry_aggregates',
          default_args=default_args,
          schedule_interval='@daily')

telemetry_aggregate_view = EMRSparkOperator(
    task_id="telemetry_aggregate_view",
    job_name="Telemetry Aggregate View",
    owner="*****@*****.**",
    instance_count=10,
    execution_timeout=timedelta(hours=12),
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_aggregator.py",
    dag=dag)
Code Example #29
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('fx_usage_report', default_args=default_args, schedule_interval='@weekly')

wait_for_main_summary = ExternalTaskSensor(
    task_id='wait_for_main_summary',
    external_dag_id='main_summary',
    external_task_id='main_summary',
    execution_delta=timedelta(days=-7, hours=-1), # main_summary waits one hour, execution date is beginning of the week
    dag=dag)

usage_report = EMRSparkOperator(
    task_id="fx_usage_report",
    job_name="Fx Usage Report",
    execution_timeout=timedelta(hours=4),
    instance_count=10,
    release_label="emr-5.11.0",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    env={"date": DS_WEEKLY,
         "bucket": "{{ task.__class__.public_output_bucket }}",
         "deploy_environment": "{{ task.__class__.deploy_environment }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/fx_usage_report.sh",
    dag=dag)

usage_report.set_upstream(wait_for_main_summary)
Code Example #30
        rename={"submission_date_s3": "submission_date"},
        replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
    ),
    task_id="main_summary_bigquery_load",
    dag=dag)

engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
                      options={
                          "input_bucket":
                          "{{ task.__class__.private_output_bucket }}",
                          "output_bucket":
                          "net-mozaws-prod-us-west-2-pipeline-analysis"
                      },
                      dev_options={
                          "output_bucket":
                          "{{ task.__class__.private_output_bucket }}"
                      }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag)

addons = EMRSparkOperator(
    task_id="addons",
    job_name="Addons View",
    execution_timeout=timedelta(hours=4),
    instance_count=3,
Code Example #31
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 30),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('crash_summary',
          default_args=default_args,
          schedule_interval='@daily')

crash_summary_view = EMRSparkOperator(
    task_id="crash_summary_view",
    job_name="Crash Summary View",
    instance_count=20,
    execution_timeout=timedelta(hours=4),
    env={
        "date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/crash_summary_view.sh",
    dag=dag)
Code Example #32
    2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('longitudinal',
          default_args=default_args,
          schedule_interval='@weekly')

t0 = EMRSparkOperator(
    task_id="longitudinal",
    job_name="Longitudinal View",
    execution_timeout=timedelta(hours=10),
    release_label="emr-5.0.0",
    instance_count=30,
    env={
        "date": DS_WEEKLY,
        "bucket": "{{ task.__class__.private_output_bucket }}"
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/longitudinal_view.sh",
    dag=dag)

t1 = EMRSparkOperator(
    task_id="update_orphaning",
    job_name="Update Orphaning View",
    execution_timeout=timedelta(hours=10),
    instance_count=1,
    owner="*****@*****.**",
    email=[
        "*****@*****.**", "*****@*****.**",
Code Example #33
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

register_status(longitudinal, "Longitudinal",
                "A 6-month longitudinal view of client history.")

addon_recommender = EMRSparkOperator(
    task_id="addon_recommender",
    job_name="Train the Addon Recommender",
    execution_timeout=timedelta(hours=10),
    instance_count=20,
    owner="*****@*****.**",
    email=[
        "*****@*****.**", "*****@*****.**",
        "*****@*****.**"
    ],
    env={
        "date": DS_WEEKLY,
        "privateBucket": "{{ task.__class__.private_output_bucket }}",
        "publicBucket": "{{ task.__class__.public_output_bucket }}"
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/addon_recommender.sh",
    dag=dag)

game_hw_survey = EMRSparkOperator(
    task_id="game_hw_survey",
    job_name="Firefox Hardware Report",
    execution_timeout=timedelta(hours=5),
    instance_count=15,
    owner="*****@*****.**",
Code Example #34
}

# Make sure all the data for the given day has arrived before running.
# Running at 1am should suffice.
dag = DAG('main_summary',
          default_args=default_args,
          schedule_interval='0 1 * * *')

main_summary = EMRSparkOperator(
    task_id="main_summary",
    job_name="Main Summary View",
    execution_timeout=timedelta(hours=14),
    instance_count=40,
    env=tbv_envvar(
        "com.mozilla.telemetry.views.MainSummaryView", {
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "schema-report-location":
            "s3://{{ task.__class__.private_output_bucket }}/schema/main_summary/submission_date_s3={{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

main_summary_schema = EmailSchemaChangeOperator(
    task_id="main_summary_schema",
    email=["*****@*****.**", "*****@*****.**"],
    to=["*****@*****.**"],
    key_prefix='schema/main_summary/submission_date_s3=',
    dag=dag)
Code Example #35
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2017, 3, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag_daily = DAG('probe_scraper',
                default_args=default_args,
                schedule_interval='@daily')

probe_scraper = EMRSparkOperator(
    task_id="probe_scraper",
    job_name="Probe Scraper",
    execution_timeout=timedelta(hours=4),
    instance_count=1,
    owner="*****@*****.**",
    email=[
        '*****@*****.**', '*****@*****.**',
        '*****@*****.**'
    ],
    env={},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/probe_scraper.sh",
    output_visibility="public",
    dag=dag_daily)
Code Example #36
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2016, 9, 20),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('android_addons',
          default_args=default_args,
          schedule_interval='@daily')

android_addons = EMRSparkOperator(
    task_id="android_addons",
    job_name="Update android addons",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    owner="*****@*****.**",
    email=[
        "*****@*****.**", "*****@*****.**",
        "*****@*****.**"
    ],
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla/mozilla-reports/master/etl/android-addons.kp/orig_src/android-addons.ipynb",
    output_visibility="public",
    dag=dag)
Code Example #37
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 20),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('crash_summary',
          default_args=default_args,
          schedule_interval='@daily')

crash_summary_view = EMRSparkOperator(
    task_id="crash_summary_view",
    job_name="Crash Summary View",
    instance_count=20,
    execution_timeout=timedelta(hours=4),
    env=tbv_envvar(
        "com.mozilla.telemetry.views.CrashSummaryView", {
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "outputBucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)
Code Example #38
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="sync_flat_summary",
        dataset_version="v1",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="sync_flat_view_bigquery_load",
    dag=dag)

sync_bookmark_validation = EMRSparkOperator(
    task_id="sync_bookmark_validation",
    job_name="Sync Bookmark Validation",
    execution_timeout=timedelta(hours=2),
    instance_count=1,
    email=["*****@*****.**", "*****@*****.**"],
    env=mozetl_envvar("sync_bookmark_validation", {
        "start_date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
    }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    dag=dag)


sync_bookmark_validation.set_upstream(sync_view)

sync_view_bigquery_load.set_upstream(sync_view)

sync_events_view_bigquery_load.set_upstream(sync_events_view)

sync_flat_view_bigquery_load.set_upstream(sync_flat_view)
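
The set_upstream calls above can equivalently be written with Airflow's bitshift composition; a stylistic sketch, identical in behavior:

# upstream >> downstream is shorthand for downstream.set_upstream(upstream)
sync_view >> sync_bookmark_validation
sync_view >> sync_view_bigquery_load
sync_events_view >> sync_events_view_bigquery_load
sync_flat_view >> sync_flat_view_bigquery_load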