def tbv_envvar(klass, options, dev_options=None, branch=None, tag=None,
               other=None, metastore_location=None, artifact_url=None):
    """Set up environment variables for telemetry-batch-view jobs.

    The command line interface can read options from the environment. All
    environment variables must be prefixed by `TBV_`. For example, a class
    in telemetry-batch-view taking a `--date` option can use `TBV_DATE`
    instead. There is a limitation that spaces cannot be in environment
    variables, so ValueError is thrown if spaces are found outside
    templating brackets.

    :klass string: name of the class in telemetry-batch-view
    :options dict: environment variables to prefix
    :dev_options dict: variables to use when in the development environment
    :branch string: the branch to run the job from, incompatible with tag
    :tag string: the tag to run the job from, incompatible with branch
    :other dict: environment variables to pass through
    :metastore_location string: Location of the data-set metastore
    :artifact_url string: Location of pre-built binaries
    :returns: a dictionary that contains properly prefixed class and options
    """
    # Avoid mutable default arguments: the original `dev_options={}` /
    # `other={}` dicts would be shared across calls and could leak state
    # from one invocation into the next.
    dev_options = {} if dev_options is None else dev_options
    other = {} if other is None else other

    if artifact_url is None:
        # Deferred Jinja template; Airflow resolves it at task run time.
        slug = "{{ task.__class__.telemetry_batch_view_slug }}"
        url = get_artifact_url(slug, branch=branch, tag=tag)
    else:
        url = artifact_url

    # Merge into a copy so the caller's `options` dict is never mutated
    # (the original called options.update(dev_options) in place).
    merged_options = dict(options)
    if EMRSparkOperator.deploy_environment == 'dev':
        merged_options.update(dev_options)

    # CLI flags use dashes; environment variable names cannot.
    prefixed_options = {
        "TBV_{}".format(key.replace("-", "_")): value
        for key, value in merged_options.items()
    }

    if klass is not None:
        prefixed_options["TBV_CLASS"] = klass
    else:
        # A missing class is only valid when the job is explicitly not
        # being submitted.
        assert other.get(
            "DO_SUBMIT",
            "True") == "False", "To submit there must be a class name"

    if metastore_location is not None:
        prefixed_options["METASTORE_LOCATION"] = metastore_location

    prefixed_options["ARTIFACT_URL"] = url
    prefixed_options.update(other)

    # raise ValueError if spaces found in non-templated envvar values
    for item in prefixed_options.values():
        if "{{" not in item and " " in item:
            raise ValueError("env cannot contain spaces: '{}'".format(item))

    return prefixed_options
def tbv_envvar(klass, options, dev_options=None, branch=None, tag=None,
               other=None, metastore_location=None, artifact_url=None):
    """Set up environment variables for telemetry-batch-view jobs.

    The command line interface can read options from the environment. All
    environment variables must be prefixed by `TBV_`. For example, a class
    in telemetry-batch-view taking a `--date` option can use `TBV_DATE`
    instead. There is a limitation that spaces cannot be in environment
    variables, so ValueError is thrown if spaces are found outside
    templating brackets.

    :klass string: name of the class in telemetry-batch-view
    :options dict: environment variables to prefix
    :dev_options dict: variables to use when in the development environment
    :branch string: the branch to run the job from, incompatible with tag
    :tag string: the tag to run the job from, incompatible with branch
    :other dict: environment variables to pass through
    :metastore_location string: Location of the data-set metastore
    :artifact_url string: Location of pre-built binaries
    :returns: a dictionary that contains properly prefixed class and options
    """
    # Avoid mutable default arguments: the original `dev_options={}` /
    # `other={}` dicts would be shared across calls and could leak state
    # from one invocation into the next.
    dev_options = {} if dev_options is None else dev_options
    other = {} if other is None else other

    if artifact_url is None:
        # Deferred Jinja template; Airflow resolves it at task run time.
        slug = "{{ task.__class__.telemetry_batch_view_slug }}"
        url = get_artifact_url(slug, branch=branch, tag=tag)
    else:
        url = artifact_url

    # Merge into a copy so the caller's `options` dict is never mutated
    # (the original called options.update(dev_options) in place).
    merged_options = dict(options)
    if EMRSparkOperator.deploy_environment == 'dev':
        merged_options.update(dev_options)

    # CLI flags use dashes; environment variable names cannot.
    prefixed_options = {
        "TBV_{}".format(key.replace("-", "_")): value
        for key, value in merged_options.items()
    }

    if klass is not None:
        prefixed_options["TBV_CLASS"] = klass
    else:
        # A missing class is only valid when the job is explicitly not
        # being submitted.
        assert other.get("DO_SUBMIT",
                         "True") == "False", "To submit there must be a class name"

    if metastore_location is not None:
        prefixed_options["METASTORE_LOCATION"] = metastore_location

    prefixed_options["ARTIFACT_URL"] = url
    prefixed_options.update(other)

    # raise ValueError if spaces found in non-templated envvar values
    for item in prefixed_options.values():
        if "{{" not in item and " " in item:
            raise ValueError("env cannot contain spaces: '{}'".format(item))

    return prefixed_options
}

# Daily DAG (01:00 UTC) forwarding event data to Amplitude.
dag = DAG('events_to_amplitude', default_args=default_args,
          schedule_interval='0 1 * * *')

# EMR Spark job exporting Focus Android events to Amplitude.
focus_events_to_amplitude = EMRSparkOperator(
    task_id="focus_android_events_to_amplitude",
    job_name="Focus Android Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=FOCUS_ANDROID_INSTANCES,
    env={
        # {{ ds_nodash }} is an Airflow macro: execution date as YYYYMMDD.
        "date": "{{ ds_nodash }}",
        # Bound concurrent requests by the cluster's total vCPU count.
        "max_requests": FOCUS_ANDROID_INSTANCES * VCPUS_PER_INSTANCE,
        "key_file": key_file("focus_android"),
        # Artifact built from the master branch of telemetry-streaming.
        "artifact": get_artifact_url(slug, branch="master"),
        "config_filename": "focus_android_events_schemas.json",
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/events_to_amplitude.sh",
    dag=dag)

# EMR Spark job exporting DevTools prerelease events to Amplitude.
# NOTE(review): this call continues beyond the end of this chunk.
devtools_prerelease_events_to_amplitude = EMRSparkOperator(
    task_id="devtools_prerelease_events_to_amplitude",
    job_name="DevTools Prerelease Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=DEVTOOLS_INSTANCES,
    email=['*****@*****.**', '*****@*****.**'],
    owner='*****@*****.**',
    env={
        "date": "{{ ds_nodash }}",
from operators.emr_spark_operator import EMRSparkOperator
from utils.constants import DS_WEEKLY
from utils.mozetl import mozetl_envvar
from utils.deploy import get_artifact_url

# Cluster sizing used to derive the request-rate cap for Amplitude.
FOCUS_ANDROID_INSTANCES = 10
VCPUS_PER_INSTANCE = 16

# Deferred Jinja template: the deploy environment (e.g. dev/prod) is
# resolved by Airflow at task run time.
environment = "{{ task.__class__.deploy_environment }}"
# S3 location of the Amplitude API key for this environment.
# NOTE(review): key_file is a plain string here, not a callable.
key_file = "s3://telemetry-airflow/config/amplitude/{}/apiKey".format(
    environment)
config_file = "focus_android_events_schemas.json"
slug = "{{ task.__class__.telemetry_streaming_slug }}"
# Pinned telemetry-streaming release used to build the artifact URL.
tag = "v1.0.1"
url = get_artifact_url(slug, tag=tag)

# Defaults applied to every task in the DAG below.
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

# Daily DAG (01:00 UTC) forwarding event data to Amplitude.
dag = DAG('events_to_amplitude', default_args=default_args,
          schedule_interval='0 1 * * *')
task_id="experiments_error_aggregates",
    job_name="Experiments Error Aggregates View",
    execution_timeout=timedelta(hours=5),
    instance_count=20,
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**"],
    # Build TBV_*-prefixed environment variables for telemetry-batch-view.
    env=tbv_envvar(
        "com.mozilla.telemetry.streaming.ExperimentsErrorAggregator",
        options={
            # Airflow macro: execution date as YYYYMMDD.
            "from": "{{ ds_nodash }}",
            "to": "{{ds_nodash }}",
            "outputPath": "s3://{{ task.__class__.private_output_bucket }}",
            "numParquetFiles": "6"
        },
        # In the dev deploy environment, restrict to the nightly channel.
        dev_options={"channel": "nightly"},
        artifact_url=get_artifact_url(
            "{{ task.__class__.telemetry_streaming_slug }}")),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

# EMR Spark job updating the engagement-ratio dataset via mozetl.
# NOTE(review): this call continues beyond the end of this chunk.
engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
                      options={
                          "input_bucket": "{{ task.__class__.private_output_bucket }}",
                          "output_bucket": "net-mozaws-prod-us-west-2-pipeline-analysis"
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator
from utils.constants import DS_WEEKLY
from utils.mozetl import mozetl_envvar
from utils.deploy import get_artifact_url

# Cluster sizing used to derive the request-rate cap for Amplitude.
FOCUS_ANDROID_INSTANCES = 10
VCPUS_PER_INSTANCE = 16

# Deferred Jinja template: the deploy environment (e.g. dev/prod) is
# resolved by Airflow at task run time.
environment = "{{ task.__class__.deploy_environment }}"
# S3 location of the Amplitude API key for this environment.
# NOTE(review): key_file is a plain string here, not a callable.
key_file = "s3://telemetry-airflow/config/amplitude/{}/apiKey".format(environment)
config_file = "focus_android_events_schemas.json"
slug = "{{ task.__class__.telemetry_streaming_slug }}"
url = get_artifact_url(slug)

# Defaults applied to every task in the DAG below.
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

# Daily DAG (01:00 UTC) forwarding event data to Amplitude.
dag = DAG('events_to_amplitude', default_args=default_args,
          schedule_interval='0 1 * * *')
# Daily DAG (01:00 UTC) forwarding event data to Amplitude.
dag = DAG('events_to_amplitude', default_args=default_args,
          schedule_interval='0 1 * * *')

# EMR Spark job exporting Focus Android events to Amplitude.
focus_events_to_amplitude = EMRSparkOperator(
    task_id="focus_android_events_to_amplitude",
    job_name="Focus Android Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=FOCUS_ANDROID_INSTANCES,
    env={
        # Airflow macro: execution date as YYYYMMDD.
        "date": "{{ ds_nodash }}",
        # Bound concurrent requests by the cluster's total vCPU count.
        "max_requests": FOCUS_ANDROID_INSTANCES * VCPUS_PER_INSTANCE,
        "key_file": key_file("focus_android"),
        # This Focus events job is pinned to a tag for now due to breaking
        # changes in telemetry-streaming.
        "artifact": get_artifact_url(slug, tag="v1.0.1"),
        "config_filename": "focus_android_events_schemas.json",
    },
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/events_to_amplitude.sh",
    dag=dag)

# EMR Spark job exporting DevTools events to Amplitude.
# NOTE(review): this call continues beyond the end of this chunk.
devtools_events_to_amplitude = EMRSparkOperator(
    task_id="devtools_events_to_amplitude",
    job_name="DevTools Events to Amplitude",
    execution_timeout=timedelta(hours=8),
    instance_count=DEVTOOLS_INSTANCES,
    env={
        "date": "{{ ds_nodash }}",
        "max_requests": DEVTOOLS_INSTANCES * VCPUS_PER_INSTANCE,
        "key_file": key_file("devtools"),
from airflow import DAG
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator
from utils.deploy import get_artifact_url
from utils.tbv import tbv_envvar

# Deferred Jinja template: the telemetry-streaming artifact slug is
# resolved by Airflow at task run time.
slug = "{{ task.__class__.telemetry_streaming_slug }}"
url = get_artifact_url(slug)

# Defaults applied to every task in the DAG below.
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2018, 11, 26),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

# Daily DAG (01:00 UTC) building the event-ping events dataset.
dag = DAG('event_ping_events', default_args=default_args,
          schedule_interval='0 1 * * *')

# EMR Spark job producing the Event Ping Events dataset via
# telemetry-batch-view.
# NOTE(review): this call continues beyond the end of this chunk.
event_ping_events = EMRSparkOperator(
    task_id="event_ping_events",
    job_name="Event Ping Events Dataset",
    execution_timeout=timedelta(hours=8),
    instance_count=5,
    env=tbv_envvar("com.mozilla.telemetry.streaming.EventPingEvents", {
        # Airflow macro: execution date as YYYYMMDD.
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",