def test_sync_to_db(self, mock_now):
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
    )
    with dag:
        DummyOperator(task_id='task', owner='owner1')
        SubDagOperator(
            task_id='subtask',
            owner='owner2',
            subdag=DAG(
                'dag.subtask',
                start_date=DEFAULT_DATE,
            )
        )
    now = datetime.datetime.utcnow().replace(tzinfo=pendulum.timezone('UTC'))
    mock_now.return_value = now
    session = settings.Session()
    dag.sync_to_db(session=session)

    orm_dag = session.query(DagModel).filter(DagModel.dag_id == 'dag').one()
    self.assertEqual(set(orm_dag.owners.split(', ')), {'owner1', 'owner2'})
    self.assertEqual(orm_dag.last_scheduler_run, now)
    self.assertTrue(orm_dag.is_active)
    self.assertIsNone(orm_dag.default_view)
    self.assertEqual(orm_dag.get_default_view(),
                     conf.get('webserver', 'dag_default_view').lower())
    self.assertEqual(orm_dag.safe_dag_id, 'dag')

    orm_subdag = session.query(DagModel).filter(
        DagModel.dag_id == 'dag.subtask').one()
    self.assertEqual(set(orm_subdag.owners.split(', ')), {'owner1', 'owner2'})
    self.assertEqual(orm_subdag.last_scheduler_run, now)
    self.assertTrue(orm_subdag.is_active)
    self.assertEqual(orm_subdag.safe_dag_id, 'dag__dot__subtask')
    self.assertEqual(orm_subdag.fileloc, orm_dag.fileloc)
    session.close()
def test_subdag_pools_no_possible_conflict(self):
    """
    Subdags and subdag tasks with no pool overlap should not query pools
    """
    dag = DAG('parent', default_args=default_args)
    subdag = DAG('parent.child', default_args=default_args)

    session = airflow.settings.Session()
    pool_1 = airflow.models.Pool(pool='test_pool_1', slots=1)
    pool_10 = airflow.models.Pool(pool='test_pool_10', slots=10)
    session.add(pool_1)
    session.add(pool_10)
    session.commit()

    DummyOperator(task_id='dummy', dag=subdag, pool='test_pool_10')

    mock_session = Mock()
    SubDagOperator(task_id='child', dag=dag, subdag=subdag,
                   pool='test_pool_1', session=mock_session)
    self.assertFalse(mock_session.query.called)

    session.delete(pool_1)
    session.delete(pool_10)
    session.commit()
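# For contrast, a hedged sketch of the conflicting case the docstring alludes to,
# reusing the same fixtures: when the SubDagOperator itself and a task inside the
# subdag share a pool that has only a single slot, SubDagOperator's pool validation
# is expected to reject the construction. The test name and exact assertion below are
# assumptions, not the project's actual companion test; AirflowException is assumed
# to be imported from airflow.exceptions.
def test_subdag_pool_conflict_sketch(self):
    dag = DAG('parent', default_args=default_args)
    subdag = DAG('parent.child', default_args=default_args)

    session = airflow.settings.Session()
    pool_1 = airflow.models.Pool(pool='test_pool_1', slots=1)
    session.add(pool_1)
    session.commit()

    # Subdag task and parent SubDagOperator both claim the single-slot pool.
    DummyOperator(task_id='dummy', dag=subdag, pool='test_pool_1')
    self.assertRaises(
        AirflowException,
        SubDagOperator,
        task_id='child', dag=dag, subdag=subdag, pool='test_pool_1')

    session.delete(pool_1)
    session.commit()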
def test_is_paused_subdag(self, mock_dag_bag):
    subdag_id = 'dag.subdag'
    subdag = DAG(
        subdag_id,
        start_date=DEFAULT_DATE,
    )
    with subdag:
        DummyOperator(
            task_id='dummy_task',
        )

    dag_id = 'dag'
    dag = DAG(
        dag_id,
        start_date=DEFAULT_DATE,
    )

    with dag:
        SubDagOperator(task_id='subdag', subdag=subdag)

    mock_dag_bag.return_value.get_dag.return_value = dag

    session = settings.Session()
    dag.sync_to_db(session=session)

    unpaused_dags = session.query(DagModel).filter(
        DagModel.dag_id.in_([subdag_id, dag_id]),
    ).filter(
        DagModel.is_paused.is_(False)).count()

    self.assertEqual(2, unpaused_dags)

    DagModel.get_dagmodel(dag.dag_id).set_is_paused(is_paused=True)

    paused_dags = session.query(DagModel).filter(
        DagModel.dag_id.in_([subdag_id, dag_id]),
    ).filter(
        DagModel.is_paused.is_(True)).count()

    self.assertEqual(2, paused_dags)
def test_sync_to_db_default_view(self, mock_now):
    dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_view="graph",
    )
    with dag:
        DummyOperator(task_id='task', owner='owner1')
        SubDagOperator(task_id='subtask', owner='owner2',
                       subdag=DAG(
                           'dag.subtask',
                           start_date=DEFAULT_DATE,
                       ))
    now = datetime.datetime.utcnow().replace(
        tzinfo=pendulum.timezone('UTC'))
    mock_now.return_value = now
    session = settings.Session()
    dag.sync_to_db(session=session)

    orm_dag = session.query(DagModel).filter(
        DagModel.dag_id == 'dag').one()
    self.assertIsNotNone(orm_dag.default_view)
    self.assertEqual(orm_dag.get_default_view(), "graph")
def validate_site_design(parent_dag_name, child_dag_name, args):
    '''
    Subdag to delegate design verification to the UCP components
    '''
    dag = DAG(
        '{}.{}'.format(parent_dag_name, child_dag_name),
        default_args=args)

    deckhand_validate_docs = SubDagOperator(
        subdag=deckhand_validate_site_design(dag.dag_id,
                                             DECKHAND_VALIDATE_DOCS_DAG_NAME,
                                             args),
        task_id=DECKHAND_VALIDATE_DOCS_DAG_NAME,
        dag=dag)

    # TODO () use the real operator here
    drydock_validate_docs = PlaceholderOperator(
        task_id='drydock_validate_site_design', dag=dag)

    # TODO () use the real operator here
    armada_validate_docs = PlaceholderOperator(
        task_id='armada_validate_site_design', dag=dag)

    return dag
def generate_dag(self, job, **kwargs):
    """
    Generates an AWS airflow dag from CWL

    Parameters
    ----------
    job: str
        Name of the file (ex. job.yml)
    kwargs: dict
        Keyword arguments to pass to the DAG creation

    Returns
    -------
    DAG
    """
    # Create the unique name of the workflow based on the dir containing the job file
    wf_id = os.path.basename(os.path.dirname(os.path.abspath(job)))
    with open(job) as fp:
        job = yaml.full_load(fp)
    dag_id = "{}_{}".format(self.workflow_name, wf_id)
    self.dag_id = dag_id
    default_args = {
        "depends_on_past": False,
        "start_date": datetime(2018, 2, 23),
        "max_retries": 300,
    }
    try:
        self.dj_hook.init_workflow(id=dag_id, name=self.workflow_name)
    except (DuplicateError, DataJointError):
        log.warning(
            "Workflow database entry for {} already exists, reinserting".format(
                self.workflow_name))
        pass

    if self.cwl["class"] != "Workflow":
        raise TypeError("CWL is not a workflow")
    dag = DAG(dag_id=dag_id,
              default_args=self.default_args,
              schedule_interval=None)

    job_params, deps = self.resolve_args(job)

    if len(self.parameterization) > 1:
        log.info(
            "Parameterization produces {} workflows, totaling {} jobs...".format(
                len(self.parameterization),
                len(self.steps) * len(self.parameterization),
            ))

    # If the parameter is a file, use the path
    param_db_update_dict = {}
    for param in self.cwl["inputs"]:
        if type(job[param]) != dict:
            param_db_update_dict[param] = job[param]
        elif "path" in job[param]:
            param_db_update_dict[param] = job[param]["path"]
        else:
            raise ValueError(
                "Unable to insert parameter {} into job parameter database".format(param))

    try:
        use_subdag = self.cwl["hints"]["saber"]["use_subdag"]
    except KeyError:
        use_subdag = True

    for i, iteration in enumerate(self.parameterization):
        if self.optimization_iteration is None:
            task_id = str(i)
        else:
            task_id = "{}_{}".format(self.optimization_iteration, i)
        if use_subdag:
            subdag = self.create_subdag(
                iteration,
                task_id,
                param_db_update_dict,
                job_params,
                job,
                wf_id,
                deps,
                dag=None,
            )
            SubDagOperator(subdag=subdag, task_id=task_id, dag=dag)
        else:
            dag = self.create_subdag(
                iteration,
                task_id,
                param_db_update_dict,
                job_params,
                job,
                wf_id,
                deps,
                dag=dag,
            )
    return dag
dag = DAG('main',
          default_args=default_args,
          description='main dag',
          schedule_interval=None,
          dagrun_timeout=timedelta(minutes=60))

start = BashOperator(
    task_id='start',
    bash_command='echo start-dag',
    dag=dag,
)

load_tasks = SubDagOperator(
    task_id='load_tasks',
    subdag=load_subdag('main', 'load_tasks', default_args),
    default_args=default_args,
    dag=dag,
)

load_tasks1 = SubDagOperator(
    task_id='load_tasks1',
    subdag=load_subdag('main', 'load_tasks1', default_args),
    default_args=default_args,
    dag=dag,
)

load_tasks2 = SubDagOperator(
    task_id='load_tasks2',
    subdag=load_subdag('main', 'load_tasks2', default_args),
    default_args=default_args,
    dag=dag,
def nested_subdag_cycle():
    import datetime  # pylint: disable=redefined-outer-name,reimported

    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator

    dag_name = 'nested_cycle'
    default_args = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(dag_name, default_args=default_args)

    # cycle:
    #     A -> op_subdag_0
    #          cycle.op_subdag_0:
    #              -> opSubDag_A
    #                 cycle.op_subdag_0.opSubdag_A:
    #                     -> subdag_a.task
    #              -> opSubdag_B
    #                 cycle.op_subdag_0.opSubdag_B:
    #                     -> subdag_b.task
    #     A -> op_subdag_1
    #          cycle.op_subdag_1:
    #              -> opSubdag_C
    #                 cycle.op_subdag_1.opSubdag_C:
    #                     -> subdag_c.task -> subdag_c.task  >Invalid Loop<
    #              -> opSubDag_D
    #                 cycle.op_subdag_1.opSubdag_D:
    #                     -> subdag_d.task

    with dag:
        def subdag_a():
            subdag_a = DAG('nested_cycle.op_subdag_0.opSubdag_A',
                           default_args=default_args)
            DummyOperator(task_id='subdag_a.task', dag=subdag_a)
            return subdag_a

        def subdag_b():
            subdag_b = DAG('nested_cycle.op_subdag_0.opSubdag_B',
                           default_args=default_args)
            DummyOperator(task_id='subdag_b.task', dag=subdag_b)
            return subdag_b

        def subdag_c():
            subdag_c = DAG('nested_cycle.op_subdag_1.opSubdag_C',
                           default_args=default_args)
            op_subdag_c_task = DummyOperator(task_id='subdag_c.task', dag=subdag_c)
            # introduce a loop in opSubdag_C
            op_subdag_c_task.set_downstream(op_subdag_c_task)
            return subdag_c

        def subdag_d():
            subdag_d = DAG('nested_cycle.op_subdag_1.opSubdag_D',
                           default_args=default_args)
            DummyOperator(task_id='subdag_d.task', dag=subdag_d)
            return subdag_d

        def subdag_0():
            subdag_0 = DAG('nested_cycle.op_subdag_0', default_args=default_args)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_a())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_b())
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('nested_cycle.op_subdag_1', default_args=default_args)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_c())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_d())
            return subdag_1

        op_subdag_0 = SubDagOperator(task_id='op_subdag_0', dag=dag,
                                     subdag=subdag_0())
        op_subdag_1 = SubDagOperator(task_id='op_subdag_1', dag=dag,
                                     subdag=subdag_1())

        op_a = DummyOperator(task_id='A')
        op_a.set_downstream(op_subdag_0)
        op_a.set_downstream(op_subdag_1)

    return dag
prerelease_telemetry_aggregate_view_dataproc = SubDagOperator(
    task_id=task_id,
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name=task_id,
        job_name="prerelease_aggregates",
        cluster_name="prerelease-telemetry-aggregates-{{ ds_nodash }}",
        idle_delete_ttl="600",
        num_workers=10,
        worker_machine_type="n1-standard-8",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_properties={
            "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
            "spark:spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
        },
        additional_metadata={
            "PIP_PACKAGES": "git+https://github.com/mozilla/python_mozaggregator.git"
        },
        python_driver_code="gs://{}/jobs/mozaggregator_runner.py".format(
            artifact_bucket),
        py_args=[
            "aggregator",
            "--date",
            "{{ ds_nodash }}",
            "--channels",
            "nightly,aurora,beta",
            "--postgres-db",
            "telemetry",
            "--postgres-user",
            "root",
            "--postgres-pass",
            "{{ var.value.mozaggregator_postgres_pass }}",
            "--postgres-host",
            "{{ var.value.mozaggregator_postgres_host }}",
            "--postgres-ro-host",
            "{{ var.value.mozaggregator_postgres_ro_host }}",
            "--num-partitions",
            str(10 * 32),
        ] + ([
            "--source",
            "bigquery",
            "--project-id",
            "moz-fx-data-shared-prod"
        ] if not EXPORT_TO_AVRO else [
            "--source",
            "avro",
            "--avro-prefix",
            "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease/moz-fx-data-shared-prod",
        ]),
        gcp_conn_id=gcp_conn.gcp_conn_id,
        service_account=client_email,
        artifact_bucket=artifact_bucket,
        storage_bucket=storage_bucket,
        default_args=subdag_args,
    ),
)
"""This generates the image_conversion subdag A subdag basically acts like an array of tasks, at least in this case""" image_conversion = DAG( '%s.%s' % (parent_dag_name, child_dag_name), schedule_interval=schedule_interval, start_date=start_date, ) cell_image_analysis_generate_tasks(NUM_CHANNELS, image_conversion) return image_conversion image_conversion_dag = SubDagOperator( subdag=generate_image_conversion_sub_dag( 'cell_image_analysis_2channels', 'image_conversion', datetime(2019, 1, 1), cell_image_analysis_2channels_dag.schedule_interval), task_id='image_conversion', dag=cell_image_analysis_2channels_dag, ) def prepare_cellprofiler_csv(ds, **kwargs): """Prepare the cellprofiler csv based on the args""" df = get_cell_images_df(**kwargs) kwargs['ti'].xcom_push(key='cell_images_df', value=df) return prepare_cellprofiler_csv_op = PythonOperator( task_id='prepare_cellprofiler_csv', provide_context=True,
bhr_collection = SubDagOperator(
    task_id="bhr_collection",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        image_version="1.5",
        dag_name="bhr_collection",
        default_args=default_args,
        cluster_name="bhr-collection-{{ ds }}",
        job_name="bhr-collection",
        python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/bhr_collection/bhr_collection.py",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_metadata={
            "PIP_PACKAGES": "boto3==1.16.20 click==7.1.2"
        },
        additional_properties={
            "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
            "spark-env:AWS_ACCESS_KEY_ID": aws_access_key,
            "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key
        },
        py_args=[
            "--date",
            "{{ ds }}",
            "--sample-size",
            "0.5",
        ],
        idle_delete_ttl="14400",
        num_workers=6,
        worker_machine_type="n1-highmem-4",
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        storage_bucket=params.storage_bucket,
    ))
    'email': '*****@*****.**',
    'retries': 3,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval='0 1 * * *',
    catchup=False,
    # concurrency=1,
)

brightedge_keyword_rank_data = SubDagOperator(
    task_id='brightedge_keyword_rank_data',
    subdag=dmd_subdag(
        DAG_NAME,
        'brightedge_keyword_rank_data',
        args,
        conf['parameters']['brightedge_keyword_rank_data_paras']),
    default_args=args,
    on_failure_callback=send_email_failure,
    dag=dag)

brightedge_share_of_voice = SubDagOperator(
    task_id='brightedge_share_of_voice',
    subdag=dmd_subdag(DAG_NAME, 'brightedge_share_of_voice', args,
                      conf['parameters']['brightedge_share_of_voice_paras']),
    default_args=args,
    on_failure_callback=send_email_failure,
    dag=dag,
)

doubleclick_search_campaign = SubDagOperator(
    task_id='doubleclick_search_campaign',
def create_subdag_operator(dag_parent, label, team):
    subdag, dependencies = create_subdag(dag_parent, label, team)
    sd_op = SubDagOperator(task_id=label, dag=dag_parent, subdag=subdag)
    return sd_op, dependencies
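# A hedged usage sketch for the helper above. create_subdag() is defined elsewhere in
# the original project; the stub below, the DAG id, the label, and the team value are
# placeholders for illustration only.
from datetime import datetime

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.subdag_operator import SubDagOperator


def create_subdag(dag_parent, label, team):
    # Stand-in: build a child DAG named "<parent>.<label>" holding one placeholder
    # task, and return it with an empty list of dependency labels.
    child = DAG('{}.{}'.format(dag_parent.dag_id, label),
                default_args=dag_parent.default_args,
                schedule_interval=dag_parent.schedule_interval)
    DummyOperator(task_id='{}_task'.format(label), dag=child)
    return child, []


parent_dag = DAG('team_pipelines',
                 default_args={'owner': 'airflow',
                               'start_date': datetime(2020, 1, 1)},
                 schedule_interval='@daily')
ingest_op, ingest_deps = create_subdag_operator(parent_dag, 'ingest', team='data-eng')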
def subdag_1():
    subdag_1 = DAG('nested_cycle.opSubdag_1', default_args=DEFAULT_ARGS)
    SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
    SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
    return subdag_1
def subdag_0():
    subdag_0 = DAG('nested_cycle.opSubdag_0', default_args=DEFAULT_ARGS)
    SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
    SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
    return subdag_0
def nested_subdag_cycle():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime

    DAG_NAME = 'nested_cycle'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(
        DAG_NAME,
        default_args=DEFAULT_ARGS)

    # cycle:
    #     A -> opSubdag_0
    #          cycle.opSubdag_0:
    #              -> opSubDag_A
    #                 cycle.opSubdag_0.opSubdag_A:
    #                     -> subdag_A.task
    #              -> opSubdag_B
    #                 cycle.opSubdag_0.opSubdag_B:
    #                     -> subdag_B.task
    #     A -> opSubdag_1
    #          cycle.opSubdag_1:
    #              -> opSubdag_C
    #                 cycle.opSubdag_1.opSubdag_C:
    #                     -> subdag_C.task -> subdag_C.task  >Invalid Loop<
    #              -> opSubDag_D
    #                 cycle.opSubdag_1.opSubdag_D:
    #                     -> subdag_D.task

    with dag:
        def subdag_A():
            subdag_A = DAG(
                'nested_cycle.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_A.task', dag=subdag_A)
            return subdag_A

        def subdag_B():
            subdag_B = DAG(
                'nested_cycle.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_B.task', dag=subdag_B)
            return subdag_B

        def subdag_C():
            subdag_C = DAG(
                'nested_cycle.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
            opSubdag_C_task = DummyOperator(
                task_id='subdag_C.task', dag=subdag_C)
            # introduce a loop in opSubdag_C
            opSubdag_C_task.set_downstream(opSubdag_C_task)
            return subdag_C

        def subdag_D():
            subdag_D = DAG(
                'nested_cycle.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_D.task', dag=subdag_D)
            return subdag_D

        def subdag_0():
            subdag_0 = DAG('nested_cycle.opSubdag_0', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('nested_cycle.opSubdag_1', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
            return subdag_1

        opSubdag_0 = SubDagOperator(
            task_id='opSubdag_0', dag=dag, subdag=subdag_0())
        opSubdag_1 = SubDagOperator(
            task_id='opSubdag_1', dag=dag, subdag=subdag_1())

        opA = DummyOperator(task_id='A')
        opA.set_downstream(opSubdag_0)
        opA.set_downstream(opSubdag_1)

    return dag
def nested_subdags():
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.operators.subdag_operator import SubDagOperator
    import datetime

    DAG_NAME = 'master'
    DEFAULT_ARGS = {
        'owner': 'owner1',
        'start_date': datetime.datetime(2016, 1, 1)
    }
    dag = DAG(
        DAG_NAME,
        default_args=DEFAULT_ARGS)

    # master:
    #     A -> opSubdag_0
    #          master.opSubdag_0:
    #              -> opSubDag_A
    #                 master.opSubdag_0.opSubdag_A:
    #                     -> subdag_A.task
    #              -> opSubdag_B
    #                 master.opSubdag_0.opSubdag_B:
    #                     -> subdag_B.task
    #     A -> opSubdag_1
    #          master.opSubdag_1:
    #              -> opSubdag_C
    #                 master.opSubdag_1.opSubdag_C:
    #                     -> subdag_C.task
    #              -> opSubDag_D
    #                 master.opSubdag_1.opSubdag_D:
    #                     -> subdag_D.task

    with dag:
        def subdag_A():
            subdag_A = DAG(
                'master.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_A.task', dag=subdag_A)
            return subdag_A

        def subdag_B():
            subdag_B = DAG(
                'master.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_B.task', dag=subdag_B)
            return subdag_B

        def subdag_C():
            subdag_C = DAG(
                'master.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_C.task', dag=subdag_C)
            return subdag_C

        def subdag_D():
            subdag_D = DAG(
                'master.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
            DummyOperator(task_id='subdag_D.task', dag=subdag_D)
            return subdag_D

        def subdag_0():
            subdag_0 = DAG('master.opSubdag_0', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
            SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
            return subdag_0

        def subdag_1():
            subdag_1 = DAG('master.opSubdag_1', default_args=DEFAULT_ARGS)
            SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
            SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
            return subdag_1

        opSubdag_0 = SubDagOperator(
            task_id='opSubdag_0', dag=dag, subdag=subdag_0())
        opSubdag_1 = SubDagOperator(
            task_id='opSubdag_1', dag=dag, subdag=subdag_1())

        opA = DummyOperator(task_id='A')
        opA.set_downstream(opSubdag_0)
        opA.set_downstream(opSubdag_1)

    return dag
}

dag = DAG(dag_id=DAG_NAME,
          default_args=args,
          start_date=days_ago(2),
          schedule_interval="@once",
          tags=['example'])

start = DummyOperator(
    task_id='start',
    dag=dag,
)

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    dag=dag,
)

section_2 = SubDagOperator(
    task_id='section-2',
    subdag=subdag(DAG_NAME, 'section-2', args),
    dag=dag,
)

end = DummyOperator(
fenix_beta_adjust_import = SubDagOperator(
    task_id=task_id,
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name=task_id,
        job_name="firefox-android-beta-adjust-import",
        cluster_name="firefox-android-beta-adjust-import-{{ ds_nodash }}",
        idle_delete_ttl="600",
        num_workers=40,
        worker_machine_type="n1-standard-8",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_properties={
            "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
        },
        additional_metadata={"PIP_PACKAGES": "click==7.1.2"},
        python_driver_code="gs://{}/jobs/adjust_import.py".format(
            params.artifact_bucket),
        py_args=[
            "--pbkdf2",
            "--salt",
            "org.mozilla.fenix-salt",
            "--project",
            project,
            "--input_table",
            "tmp.adjust_firefox_preview",
            "--output_table",
            "firefox_android_beta_external.adjust_install_time_v1",
            "--bucket",
            params.storage_bucket,
        ],
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        artifact_bucket=params.artifact_bucket,
        storage_bucket=params.storage_bucket,
        default_args=subdag_args,
    ),
)
usage_report = SubDagOperator(
    task_id="fx_usage_report",
    dag=dag,
    subdag=moz_dataproc_scriptrunner(
        parent_dag_name=dag.dag_id,
        dag_name='fx_usage_report',
        default_args=default_args,
        cluster_name=cluster_name,
        service_account='*****@*****.**',
        job_name="Fx_Usage_Report",
        uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/fx_usage_report.sh",
        env={
            "date": DS_WEEKLY,
            "bucket": output_bucket,
            "PYTHONPATH": "/usr/lib/spark/python/lib/pyspark.zip",
            "deploy_environment": "prod",
            # These env variables are needed in addition to the s3a configs,
            # since some code uses boto to list bucket objects
            "AWS_ACCESS_KEY_ID": aws_access_key,
            "AWS_SECRET_ACCESS_KEY": aws_secret_key
        },
        gcp_conn_id=gcp_conn_id,
        # This is used to set the s3a configs for read/write to s3 for non-boto calls
        aws_conn_id=aws_conn_id,
        num_workers=9,
        worker_machine_type='n1-standard-16',
        image_version='1.3',
        init_actions_uris=[
            'gs://moz-fx-data-prod-airflow-dataproc-artifacts/bootstrap/fx_usage_init.sh'
        ],
    ))
main_path = 'source/dataset/'
data_lake = dl_helper.DataLake(version='v3')
file_extension = '.txt'

raw_features.init(main_path, data_lake, file_extension)
vector_features.init(main_path, data_lake, file_extension)
xgboost_subdag.init(main_path, data_lake, file_extension)
naive_bayes_subdag.init(main_path, data_lake, file_extension)

# ================#
child_dag_name = 'raw_features_extraction'
raw_extr_subdag = SubDagOperator(
    subdag=feature_extr_sub_dag(parent_dag_name, child_dag_name,
                                default_args, dag.schedule_interval),
    task_id=child_dag_name,
    default_args=default_args,
    dag=dag)

start >> raw_extr_subdag

# ================#
child_dag_name = 'vector_features_extraction'
vector_extr_subdag = SubDagOperator(
    subdag=vector_extr_sub_dag(parent_dag_name, child_dag_name,
                               default_args, dag.schedule_interval),
    task_id=child_dag_name,
    default_args=default_args,
    dag=dag)

start >> vector_extr_subdag
args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 12, 1),
    'retries': 1,
}

dag = DAG(dag_id='fda',
          default_args=args,
          max_active_runs=1,
          schedule_interval='@monthly')

fda_dap_task = SubDagOperator(
    dag=dag,
    subdag=fda_dap(parent_dag_name='fda',
                   child_dag_name='dap',
                   start_date=dag.start_date,
                   schedule_interval=dag.schedule_interval),
    task_id='dap',
)

fda_linker_task = SubDagOperator(
    dag=dag,
    subdag=fda_dap(parent_dag_name='fda',
                   child_dag_name='linker',
                   start_date=dag.start_date,
                   schedule_interval=dag.schedule_interval),
    task_id='linker',
)

remove_unknown_documentcloud_docs_task = DockerOperator(
    task_id='remove_unknown_documentcloud_docs',
start_date = datetime.datetime.utcnow()

dag = DAG(
    "lesson3.exercise3",
    start_date=start_date,
)

trips_task_id = "trips_subdag"
trips_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        "lesson3.exercise3",
        trips_task_id,
        "redshift",
        "aws_credentials",
        "trips",
        sql_statements.CREATE_TRIPS_TABLE_SQL,
        s3_bucket="udac-data-pipelines",
        s3_key="divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=start_date,
    ),
    task_id=trips_task_id,
    dag=dag,
)

stations_task_id = "stations_subdag"
stations_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        "lesson3.exercise3",
        stations_task_id,
        "redshift",
        "aws_credentials",
                  value=kwargs['dag_run'].conf['action'])


action_xcom = PythonOperator(task_id='action_xcom',
                             dag=dag,
                             python_callable=xcom_push)

concurrency_check = ConcurrencyCheckOperator(
    task_id=DAG_CONCURRENCY_CHECK_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

get_design_version = SubDagOperator(
    subdag=get_design_deckhand(PARENT_DAG_NAME,
                               DECKHAND_GET_DESIGN_VERSION,
                               args=default_args),
    task_id=DECKHAND_GET_DESIGN_VERSION,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

validate_site_design = SubDagOperator(
    subdag=validate_site_design(PARENT_DAG_NAME,
                                VALIDATE_SITE_DESIGN_DAG_NAME,
                                args=default_args),
    task_id=VALIDATE_SITE_DESIGN_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

drydock_build = SubDagOperator(
    subdag=deploy_site_drydock(PARENT_DAG_NAME,
                               DRYDOCK_BUILD_DAG_NAME,
t_analysis = BashOperator(task_id='clean_data', bash_command=a1, dag=dag)

t_scrap_data = BashOperator(task_id='scrap_data', bash_command=a1, dag=dag)

t_run_main_PROJECTNAME = BashOperator(task_id='run_main_PROJECTNAME',
                                      bash_command=a1,
                                      dag=dag)

t_clean_data = BashOperator(task_id='clean_data', bash_command=a1, dag=dag)

t_download_data = BashOperator(task_id='download_data', bash_command=a1, dag=dag)

t_to_hive = BashOperator(task_id='to_hive', bash_command=a1, dag=dag)

feature_analysis = SubDagOperator(
    task_id='feature_analysis',
    subdag=subdag(DAG_NAME, 'feature_analysis', default_args),
    dag=dag,
)

feature_slope = SubDagOperator(
    task_id='feature_slope',
    subdag=subdag(DAG_NAME, 'feature_slope', default_args),
    dag=dag,
)

kdj_rsi = SubDagOperator(
    task_id='kdj_rsi_stockstats',
    subdag=subdag(DAG_NAME, 'kdj_rsi_stockstats', default_args),
    dag=dag,
)

download_main = DummyOperator(task_id='run_main_PROJECTNAME')
to_hive = DummyOperator(task_id='to_hive')

# t_mv_daily_report.set_upstream(t_dazongjiaoyi)
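# The SubDagOperator tasks above call a subdag() factory that is defined elsewhere in
# the original file. As an assumption for illustration only, such a factory usually
# follows the standard Airflow example pattern: a child DAG named "<parent>.<child>"
# filled with a few placeholder tasks. The task count and DummyOperator bodies below
# are hypothetical; DAG and DummyOperator imports are assumed, as elsewhere in these
# snippets.
def subdag(parent_dag_name, child_dag_name, args):
    dag_subdag = DAG(
        dag_id='%s.%s' % (parent_dag_name, child_dag_name),
        default_args=args,
        schedule_interval='@daily',
    )
    for i in range(5):
        DummyOperator(
            task_id='%s-task-%s' % (child_dag_name, i + 1),
            default_args=args,
            dag=dag_subdag,
        )
    return dag_subdag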
load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
    sql_query=SqlQueries.songplay_table_insert)

load_user_dimension_table_task_id = 'Load_user_dim_table'
load_user_dimension_table = SubDagOperator(
    subdag=load_dimensional_tables_dag(
        parent_dag_name=dag_name,
        task_id=load_user_dimension_table_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        start_date=datetime(2018, 5, 1),
        table="users",
        sql_query=SqlQueries.user_table_insert,
    ),
    task_id=load_user_dimension_table_task_id,
    dag=dag,
)

load_song_dimension_table_task_id = 'Load_song_dim_table'
load_song_dimension_table = SubDagOperator(
    subdag=load_dimensional_tables_dag(
        parent_dag_name=dag_name,
        task_id=load_song_dimension_table_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        start_date=datetime(2018, 5, 1),
def sub_dag_operator_with_default_executor(subdag, *args, **kwargs):
    return SubDagOperator(subdag=subdag, executor=GetDefaultExecutor(),
                          *args, **kwargs)
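# A hedged usage sketch for the wrapper above: it is called exactly like
# SubDagOperator, with the executor pinned to the default one instead of the
# SequentialExecutor. The DAG_NAME, args, dag, and subdag() factory referenced here
# are assumed to exist in the surrounding module, mirroring the older Airflow
# example DAGs this helper comes from.
section_1 = sub_dag_operator_with_default_executor(
    subdag=subdag(DAG_NAME, 'section-1', args),
    task_id='section-1',
    default_args=args,
    dag=dag,
)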
start_operator = DummyOperator(dag=dag, task_id='start_operator')

# Read table definitions from YAML file
with open('dags/configuration/copy_from_s3_to_redshift.yml', 'r') as file:
    copy_definitions = yaml.safe_load(file)

with dag:
    subdag_id = 'copy_data_to_redshift'
    copy_data_to_redshift = SubDagOperator(
        subdag=get_s3_to_redshift(
            parent_dag_name='udacity-dend-capstone',
            task_id=subdag_id,
            tables_definition=copy_definitions,
            redshift_conn_id='redshift',
            redshift_schema='public',
            s3_conn_id='aws_credentials',
            s3_bucket='udac-dend-capstone-dz',
            load_type='truncate',
            schema_location='Local',
            start_date=start_date),
        task_id=subdag_id,
        dag=dag,
        executor=LocalExecutor())

    copy_data_to_redshift.set_upstream(start_operator)

    process_dim_category = PostgresOperator(dag=dag,
                                            task_id='process_dim_category',
                                            sql='/sql/categories.sql',
                                            postgres_conn_id='redshift')
    process_dim_category.set_upstream(copy_data_to_redshift)

    process_dim_cities = PostgresOperator(dag=dag,
def git_push_callback(context):
    slack_report(context, status='new data')


git_push_task = GitPushOperator(task_id='git_push',
                                dag=dag,
                                pool='etl',
                                dataset=out_dir,
                                on_success_callback=git_push_callback)

# resetting the branch in case anything failed
cleanup_task = GitResetAndGoMasterOperator(task_id='cleanup',
                                           dag=dag,
                                           dataset=out_dir,
                                           trigger_rule="all_done")

# set dependencies
if len(depends_on) > 0:
    dependency_task = SubDagOperator(subdag=sub_dag(),
                                     task_id='dependency_check',
                                     on_failure_callback=None,
                                     dag=dag)
    dependency_task >> checkout_task

# etl
(checkout_task >> git_pull_task >> source_update_task >> recipe_task >>
 datapackage_task >> validate_ddf >> git_commit_task)

# commit
do_nothing = DummyOperator(task_id='do_nothing', dag=dag)
git_commit_task >> branch_task
    dag=dag,
)

taar_lite = SubDagOperator(
    task_id="taar_lite",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name="taar_amodump",
        dag_name="taar_lite",
        default_args=default_args,
        cluster_name=taarlite_cluster_name,
        job_name="TAAR_Lite_GUID_GUID",
        python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_lite_guidguid.py",
        # python_driver_code="gs://temp-hwoo-removemelater/taar_lite_guidguid.py",
        num_workers=8,
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--aws_access_key_id",
            aws_access_key,
            "--aws_secret_access_key",
            aws_secret_key,
        ],
        aws_conn_id=aws_conn_id,
        gcp_conn_id=gcpdataproc_conn_id,
    ),
    dag=dag,
)

# Set a dependency on amodump from amowhitelist
amowhitelist.set_upstream(amodump)