Example 1
    def test_sync_to_db(self, mock_now):
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
        )
        with dag:
            DummyOperator(task_id='task', owner='owner1')
            SubDagOperator(
                task_id='subtask',
                owner='owner2',
                subdag=DAG(
                    'dag.subtask',
                    start_date=DEFAULT_DATE,
                )
            )
        now = datetime.datetime.utcnow().replace(tzinfo=pendulum.timezone('UTC'))
        mock_now.return_value = now
        session = settings.Session()
        dag.sync_to_db(session=session)
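
        # sync_to_db should create DagModel rows for both the parent DAG and its subdag,
        # and both rows should share the same source file (fileloc).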

        orm_dag = session.query(DagModel).filter(DagModel.dag_id == 'dag').one()
        self.assertEqual(set(orm_dag.owners.split(', ')), {'owner1', 'owner2'})
        self.assertEqual(orm_dag.last_scheduler_run, now)
        self.assertTrue(orm_dag.is_active)
        self.assertIsNone(orm_dag.default_view)
        self.assertEqual(orm_dag.get_default_view(),
                         conf.get('webserver', 'dag_default_view').lower())
        self.assertEqual(orm_dag.safe_dag_id, 'dag')

        orm_subdag = session.query(DagModel).filter(
            DagModel.dag_id == 'dag.subtask').one()
        self.assertEqual(set(orm_subdag.owners.split(', ')), {'owner1', 'owner2'})
        self.assertEqual(orm_subdag.last_scheduler_run, now)
        self.assertTrue(orm_subdag.is_active)
        self.assertEqual(orm_subdag.safe_dag_id, 'dag__dot__subtask')
        self.assertEqual(orm_subdag.fileloc, orm_dag.fileloc)
        session.close()
Example 2
    def test_subdag_pools_no_possible_conflict(self):
        """
        Subdags and subdag tasks with no pool overlap should not query pools.
        """
        dag = DAG('parent', default_args=default_args)
        subdag = DAG('parent.child', default_args=default_args)

        session = airflow.settings.Session()
        pool_1 = airflow.models.Pool(pool='test_pool_1', slots=1)
        pool_10 = airflow.models.Pool(pool='test_pool_10', slots=10)
        session.add(pool_1)
        session.add(pool_10)
        session.commit()

        DummyOperator(task_id='dummy', dag=subdag, pool='test_pool_10')

        mock_session = Mock()
        SubDagOperator(task_id='child', dag=dag, subdag=subdag, pool='test_pool_1', session=mock_session)
        self.assertFalse(mock_session.query.called)

        session.delete(pool_1)
        session.delete(pool_10)
        session.commit()
Example 3
    def test_is_paused_subdag(self, mock_dag_bag):
        subdag_id = 'dag.subdag'
        subdag = DAG(
            subdag_id,
            start_date=DEFAULT_DATE,
        )
        with subdag:
            DummyOperator(task_id='dummy_task', )

        dag_id = 'dag'
        dag = DAG(
            dag_id,
            start_date=DEFAULT_DATE,
        )

        with dag:
            SubDagOperator(task_id='subdag', subdag=subdag)

        mock_dag_bag.return_value.get_dag.return_value = dag

        session = settings.Session()
        dag.sync_to_db(session=session)

        unpaused_dags = session.query(DagModel).filter(
            DagModel.dag_id.in_([subdag_id, dag_id]), ).filter(
                DagModel.is_paused.is_(False)).count()

        self.assertEqual(2, unpaused_dags)
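
        # Pausing the parent DAG should cascade to its subdag as well.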

        DagModel.get_dagmodel(dag.dag_id).set_is_paused(is_paused=True)

        paused_dags = session.query(DagModel).filter(
            DagModel.dag_id.in_([subdag_id, dag_id]), ).filter(
                DagModel.is_paused.is_(True)).count()

        self.assertEqual(2, paused_dags)
Example 4
    def test_sync_to_db_default_view(self, mock_now):
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
            default_view="graph",
        )
        with dag:
            DummyOperator(task_id='task', owner='owner1')
            SubDagOperator(task_id='subtask',
                           owner='owner2',
                           subdag=DAG(
                               'dag.subtask',
                               start_date=DEFAULT_DATE,
                           ))
        now = datetime.datetime.utcnow().replace(
            tzinfo=pendulum.timezone('UTC'))
        mock_now.return_value = now
        session = settings.Session()
        dag.sync_to_db(session=session)

        orm_dag = session.query(DagModel).filter(
            DagModel.dag_id == 'dag').one()
        self.assertIsNotNone(orm_dag.default_view)
        self.assertEqual(orm_dag.get_default_view(), "graph")
Example 5
def validate_site_design(parent_dag_name, child_dag_name, args):
    '''
    Subdag to delegate design verification to the UCP components
    '''
    dag = DAG(
        '{}.{}'.format(parent_dag_name, child_dag_name),
        default_args=args)

    deckhand_validate_docs = SubDagOperator(
        subdag=deckhand_validate_site_design(dag.dag_id,
                                             DECKHAND_VALIDATE_DOCS_DAG_NAME,
                                             args),
        task_id=DECKHAND_VALIDATE_DOCS_DAG_NAME,
        dag=dag)

    # TODO () use the real operator here
    drydock_validate_docs = PlaceholderOperator(
        task_id='drydock_validate_site_design', dag=dag)

    # TODO () use the real operator here
    armada_validate_docs = PlaceholderOperator(
        task_id='armada_validate_site_design', dag=dag)

    return dag
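
For context, a subdag factory like validate_site_design above is normally attached to a parent DAG through a SubDagOperator whose task_id equals the child DAG name, so that the subdag's dag_id becomes '<parent>.<child>'. A minimal sketch, assuming a parent_dag object and default_args are already defined elsewhere:

from airflow.operators.subdag_operator import SubDagOperator

# Hypothetical wiring of the factory into a parent DAG.
validate_site_design_step = SubDagOperator(
    subdag=validate_site_design(parent_dag.dag_id,
                                'validate_site_design',
                                args=default_args),
    task_id='validate_site_design',
    dag=parent_dag)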
Example 6
    def generate_dag(self, job, **kwargs):
        """
        Generates an AWS Airflow DAG from a CWL workflow

        Parameters
        ----------
        job: str
            Path to the CWL job file (e.g. job.yml)
        
        kwargs: dict
            Keyword arguments to pass to the DAG creation


        Returns
        -------
        DAG
        """
        # Create the unique name of the workflow based on the dir containing the job file
        wf_id = os.path.basename(os.path.dirname(os.path.abspath(job)))
        with open(job) as fp:
            job = yaml.full_load(fp)

        dag_id = "{}_{}".format(self.workflow_name, wf_id)
        self.dag_id = dag_id
        default_args = {
            "depends_on_past": False,
            "start_date": datetime(2018, 2, 23),
            "max_retries": 300,
        }
        try:
            self.dj_hook.init_workflow(id=dag_id, name=self.workflow_name)
        except (DuplicateError, DataJointError):
            log.warning(
                "Workflow database entry for {} already exists, reinserting".
                format(self.workflow_name))
            pass
        if self.cwl["class"] != "Workflow":
            raise TypeError("CWL is not a workflow")
        dag = DAG(dag_id=dag_id,
                  default_args=self.default_args,
                  schedule_interval=None)
        job_params, deps = self.resolve_args(job)
        if len(self.parameterization) > 1:
            log.info(
                "Parameterization produces {} workflows, totaling {} jobs...".
                format(
                    len(self.parameterization),
                    len(self.steps) * len(self.parameterization),
                ))
        # If the parameter is a file, use the path
        param_db_update_dict = {}
        for param in self.cwl["inputs"]:
            if type(job[param]) != dict:
                param_db_update_dict[param] = job[param]
            elif "path" in job[param]:
                param_db_update_dict[param] = job[param]["path"]
            else:
                raise ValueError(
                    "Unable to insert parameter {} into job parameter database"
                    .format(param))
        try:
            use_subdag = self.cwl["hints"]["saber"]["use_subdag"]
        except KeyError:
            use_subdag = True
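        # Each parameter set either becomes its own subdag attached with a SubDagOperator,
        # or, when the CWL 'use_subdag' hint disables subdags, its tasks go into the main DAG.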
        for i, iteration in enumerate(self.parameterization):
            if self.optimization_iteration is None:
                task_id = str(i)
            else:
                task_id = "{}_{}".format(self.optimization_iteration, i)
            if use_subdag:
                subdag = self.create_subdag(
                    iteration,
                    task_id,
                    param_db_update_dict,
                    job_params,
                    job,
                    wf_id,
                    deps,
                    dag=None,
                )
                SubDagOperator(subdag=subdag, task_id=task_id, dag=dag)
            else:
                dag = self.create_subdag(
                    iteration,
                    task_id,
                    param_db_update_dict,
                    job_params,
                    job,
                    wf_id,
                    deps,
                    dag=dag,
                )
        return dag
Example 7
dag = DAG('main',
          default_args=default_args,
          description='main dag',
          schedule_interval=None,
          dagrun_timeout=timedelta(minutes=60))

start = BashOperator(
    task_id='start',
    bash_command='echo start-dag',
    dag=dag,
)
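
# Each SubDagOperator below wraps a child DAG whose dag_id is 'main.<task_id>',
# built by the load_subdag factory.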

load_tasks = SubDagOperator(
    task_id='load_tasks',
    subdag=load_subdag('main', 'load_tasks', default_args),
    default_args=default_args,
    dag=dag,
)

load_tasks1 = SubDagOperator(
    task_id='load_tasks1',
    subdag=load_subdag('main', 'load_tasks1', default_args),
    default_args=default_args,
    dag=dag,
)

load_tasks2 = SubDagOperator(
    task_id='load_tasks2',
    subdag=load_subdag('main', 'load_tasks2', default_args),
    default_args=default_args,
    dag=dag,
)
Example 8
        def nested_subdag_cycle():
            import datetime  # pylint: disable=redefined-outer-name,reimported

            from airflow.models import DAG
            from airflow.operators.dummy_operator import DummyOperator
            from airflow.operators.subdag_operator import SubDagOperator
            dag_name = 'nested_cycle'
            default_args = {
                'owner': 'owner1',
                'start_date': datetime.datetime(2016, 1, 1)
            }
            dag = DAG(dag_name, default_args=default_args)

            # cycle:
            #     A -> op_subdag_0
            #          cycle.op_subdag_0:
            #              -> opSubdag_A
            #                 cycle.op_subdag_0.opSubdag_A:
            #                     -> subdag_a.task
            #              -> opSubdag_B
            #                 cycle.op_subdag_0.opSubdag_B:
            #                     -> subdag_b.task
            #     A -> op_subdag_1
            #          cycle.op_subdag_1:
            #              -> opSubdag_C
            #                 cycle.op_subdag_1.opSubdag_C:
            #                     -> subdag_c.task -> subdag_c.task  >Invalid Loop<
            #              -> opSubdag_D
            #                 cycle.op_subdag_1.opSubdag_D:
            #                     -> subdag_d.task

            with dag:

                def subdag_a():
                    subdag_a = DAG('nested_cycle.op_subdag_0.opSubdag_A',
                                   default_args=default_args)
                    DummyOperator(task_id='subdag_a.task', dag=subdag_a)
                    return subdag_a

                def subdag_b():
                    subdag_b = DAG('nested_cycle.op_subdag_0.opSubdag_B',
                                   default_args=default_args)
                    DummyOperator(task_id='subdag_b.task', dag=subdag_b)
                    return subdag_b

                def subdag_c():
                    subdag_c = DAG('nested_cycle.op_subdag_1.opSubdag_C',
                                   default_args=default_args)
                    op_subdag_c_task = DummyOperator(task_id='subdag_c.task',
                                                     dag=subdag_c)
                    # introduce a loop in opSubdag_C
                    op_subdag_c_task.set_downstream(op_subdag_c_task)
                    return subdag_c

                def subdag_d():
                    subdag_d = DAG('nested_cycle.op_subdag_1.opSubdag_D',
                                   default_args=default_args)
                    DummyOperator(task_id='subdag_d.task', dag=subdag_d)
                    return subdag_d

                def subdag_0():
                    subdag_0 = DAG('nested_cycle.op_subdag_0',
                                   default_args=default_args)
                    SubDagOperator(task_id='opSubdag_A',
                                   dag=subdag_0,
                                   subdag=subdag_a())
                    SubDagOperator(task_id='opSubdag_B',
                                   dag=subdag_0,
                                   subdag=subdag_b())
                    return subdag_0

                def subdag_1():
                    subdag_1 = DAG('nested_cycle.op_subdag_1',
                                   default_args=default_args)
                    SubDagOperator(task_id='opSubdag_C',
                                   dag=subdag_1,
                                   subdag=subdag_c())
                    SubDagOperator(task_id='opSubdag_D',
                                   dag=subdag_1,
                                   subdag=subdag_d())
                    return subdag_1

                op_subdag_0 = SubDagOperator(task_id='op_subdag_0',
                                             dag=dag,
                                             subdag=subdag_0())
                op_subdag_1 = SubDagOperator(task_id='op_subdag_1',
                                             dag=dag,
                                             subdag=subdag_1())

                op_a = DummyOperator(task_id='A')
                op_a.set_downstream(op_subdag_0)
                op_a.set_downstream(op_subdag_1)

            return dag
Example 9
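# Jinja-templated values such as {{ ds_nodash }} and {{ var.value.* }} below are rendered
# by Airflow when the task runs.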
prerelease_telemetry_aggregate_view_dataproc = SubDagOperator(
    task_id=task_id,
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name=task_id,
        job_name="prerelease_aggregates",
        cluster_name="prerelease-telemetry-aggregates-{{ ds_nodash }}",
        idle_delete_ttl="600",
        num_workers=10,
        worker_machine_type="n1-standard-8",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_properties={
            "spark:spark.jars":
            "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
            "spark:spark.jars.packages":
            "org.apache.spark:spark-avro_2.11:2.4.4",
        },
        additional_metadata={
            "PIP_PACKAGES":
            "git+https://github.com/mozilla/python_mozaggregator.git"
        },
        python_driver_code="gs://{}/jobs/mozaggregator_runner.py".format(
            artifact_bucket),
        py_args=[
            "aggregator",
            "--date",
            "{{ ds_nodash }}",
            "--channels",
            "nightly,aurora,beta",
            "--postgres-db",
            "telemetry",
            "--postgres-user",
            "root",
            "--postgres-pass",
            "{{ var.value.mozaggregator_postgres_pass }}",
            "--postgres-host",
            "{{ var.value.mozaggregator_postgres_host }}",
            "--postgres-ro-host",
            "{{ var.value.mozaggregator_postgres_ro_host }}",
            "--num-partitions",
            str(10 * 32),
        ] + ([
            "--source", "bigquery", "--project-id", "moz-fx-data-shared-prod"
        ] if not EXPORT_TO_AVRO else [
            "--source",
            "avro",
            "--avro-prefix",
            "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease/moz-fx-data-shared-prod",
        ]),
        gcp_conn_id=gcp_conn.gcp_conn_id,
        service_account=client_email,
        artifact_bucket=artifact_bucket,
        storage_bucket=storage_bucket,
        default_args=subdag_args,
    ),
)
Example 10
    """This generates the image_conversion subdag
    A subdag basically acts like an array of tasks, at least in this case"""

    image_conversion = DAG(
        '%s.%s' % (parent_dag_name, child_dag_name),
        schedule_interval=schedule_interval,
        start_date=start_date,
    )
    cell_image_analysis_generate_tasks(NUM_CHANNELS, image_conversion)
    return image_conversion


image_conversion_dag = SubDagOperator(
    subdag=generate_image_conversion_sub_dag(
        'cell_image_analysis_2channels', 'image_conversion',
        datetime(2019, 1, 1),
        cell_image_analysis_2channels_dag.schedule_interval),
    task_id='image_conversion',
    dag=cell_image_analysis_2channels_dag,
)


def prepare_cellprofiler_csv(ds, **kwargs):
    """Prepare the cellprofiler csv based on the args"""
    df = get_cell_images_df(**kwargs)
    kwargs['ti'].xcom_push(key='cell_images_df', value=df)
    return


prepare_cellprofiler_csv_op = PythonOperator(
    task_id='prepare_cellprofiler_csv',
    provide_context=True,
    python_callable=prepare_cellprofiler_csv,
    dag=cell_image_analysis_2channels_dag,
)
Example 11
bhr_collection = SubDagOperator(
    task_id="bhr_collection",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        image_version="1.5",
        dag_name="bhr_collection",
        default_args=default_args,
        cluster_name="bhr-collection-{{ ds }}",
        job_name="bhr-collection",
        python_driver_code=
        "https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/bhr_collection/bhr_collection.py",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_metadata={
            "PIP_PACKAGES": "boto3==1.16.20 click==7.1.2"
        },
        additional_properties={
            "spark:spark.jars":
            "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
            "spark-env:AWS_ACCESS_KEY_ID": aws_access_key,
            "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key
        },
        py_args=[
            "--date",
            "{{ ds }}",
            "--sample-size",
            "0.5",
        ],
        idle_delete_ttl="14400",
        num_workers=6,
        worker_machine_type="n1-highmem-4",
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        storage_bucket=params.storage_bucket,
    ))
Example 12
    'email': '*****@*****.**',
    'retries': 3,
    'retry_delay': timedelta(minutes=1),
}
dag = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval='0 1 * * *',
    catchup=False,
    #concurrency=1,
)

brightedge_keyword_rank_data = SubDagOperator(
    task_id='brightedge_keyword_rank_data',
    subdag=dmd_subdag(
        DAG_NAME, 'brightedge_keyword_rank_data', args,
        conf['parameters']['brightedge_keyword_rank_data_paras']),
    default_args=args,
    on_failure_callback=send_email_failure,
    dag=dag)

brightedge_share_of_voice = SubDagOperator(
    task_id='brightedge_share_of_voice',
    subdag=dmd_subdag(DAG_NAME, 'brightedge_share_of_voice', args,
                      conf['parameters']['brightedge_share_of_voice_paras']),
    default_args=args,
    on_failure_callback=send_email_failure,
    dag=dag,
)

doubleclick_search_campaign = SubDagOperator(
    task_id='doubleclick_search_campaign',
Example 13
def create_subdag_operator(dag_parent, label, team):
    subdag, dependencies = create_subdag(dag_parent, label, team)
    sd_op = SubDagOperator(task_id=label, dag=dag_parent, subdag=subdag)
    return sd_op, dependencies
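
A short usage sketch for the helper above, assuming create_subdag and a dag object exist as in the surrounding file; the labels and teams are hypothetical:

# Hypothetical: build one subdag per (label, team) pair and chain them in order.
previous = None
for label, team in [('ingest', 'data-eng'), ('publish', 'analytics')]:
    sd_op, deps = create_subdag_operator(dag, label, team)
    if previous is not None:
        previous >> sd_op
    previous = sd_op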
Example 14
def subdag_1():
    subdag_1 = DAG('nested_cycle.opSubdag_1', default_args=DEFAULT_ARGS)
    SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
    SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
    return subdag_1
Example 15
def subdag_0():
    subdag_0 = DAG('nested_cycle.opSubdag_0', default_args=DEFAULT_ARGS)
    SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
    SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
    return subdag_0
Example 16
        def nested_subdag_cycle():
            from airflow.models import DAG
            from airflow.operators.dummy_operator import DummyOperator
            from airflow.operators.subdag_operator import SubDagOperator
            import datetime
            DAG_NAME = 'nested_cycle'
            DEFAULT_ARGS = {
                'owner': 'owner1',
                'start_date': datetime.datetime(2016, 1, 1)
            }
            dag = DAG(
                DAG_NAME,
                default_args=DEFAULT_ARGS)

            # cycle:
            #     A -> opSubdag_0
            #          cycle.opSubdag_0:
            #              -> opSubdag_A
            #                 cycle.opSubdag_0.opSubdag_A:
            #                     -> subdag_A.task
            #              -> opSubdag_B
            #                 cycle.opSubdag_0.opSubdag_B:
            #                     -> subdag_B.task
            #     A -> opSubdag_1
            #          cycle.opSubdag_1:
            #              -> opSubdag_C
            #                 cycle.opSubdag_1.opSubdag_C:
            #                     -> subdag_C.task -> subdag_C.task  >Invalid Loop<
            #              -> opSubdag_D
            #                 cycle.opSubdag_1.opSubdag_D:
            #                     -> subdag_D.task

            with dag:
                def subdag_A():
                    subdag_A = DAG(
                        'nested_cycle.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_A.task', dag=subdag_A)
                    return subdag_A

                def subdag_B():
                    subdag_B = DAG(
                        'nested_cycle.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_B.task', dag=subdag_B)
                    return subdag_B

                def subdag_C():
                    subdag_C = DAG(
                        'nested_cycle.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
                    opSubdag_C_task = DummyOperator(
                        task_id='subdag_C.task', dag=subdag_C)
                    # introduce a loop in opSubdag_C
                    opSubdag_C_task.set_downstream(opSubdag_C_task)
                    return subdag_C

                def subdag_D():
                    subdag_D = DAG(
                        'nested_cycle.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_D.task', dag=subdag_D)
                    return subdag_D

                def subdag_0():
                    subdag_0 = DAG('nested_cycle.opSubdag_0', default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
                    SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
                    return subdag_0

                def subdag_1():
                    subdag_1 = DAG('nested_cycle.opSubdag_1', default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
                    SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
                    return subdag_1

                opSubdag_0 = SubDagOperator(
                    task_id='opSubdag_0', dag=dag, subdag=subdag_0())
                opSubdag_1 = SubDagOperator(
                    task_id='opSubdag_1', dag=dag, subdag=subdag_1())

                opA = DummyOperator(task_id='A')
                opA.set_downstream(opSubdag_0)
                opA.set_downstream(opSubdag_1)

            return dag
Example 17
        def nested_subdags():
            from airflow.models import DAG
            from airflow.operators.dummy_operator import DummyOperator
            from airflow.operators.subdag_operator import SubDagOperator
            import datetime
            DAG_NAME = 'master'
            DEFAULT_ARGS = {
                'owner': 'owner1',
                'start_date': datetime.datetime(2016, 1, 1)
            }
            dag = DAG(
                DAG_NAME,
                default_args=DEFAULT_ARGS)

            # master:
            #     A -> opSubdag_0
            #          master.opSubdag_0:
            #              -> opSubdag_A
            #                 master.opSubdag_0.opSubdag_A:
            #                     -> subdag_A.task
            #              -> opSubdag_B
            #                 master.opSubdag_0.opSubdag_B:
            #                     -> subdag_B.task
            #     A -> opSubdag_1
            #          master.opSubdag_1:
            #              -> opSubdag_C
            #                 master.opSubdag_1.opSubdag_C:
            #                     -> subdag_C.task
            #              -> opSubdag_D
            #                 master.opSubdag_1.opSubdag_D:
            #                     -> subdag_D.task

            with dag:
                def subdag_A():
                    subdag_A = DAG(
                        'master.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_A.task', dag=subdag_A)
                    return subdag_A

                def subdag_B():
                    subdag_B = DAG(
                        'master.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_B.task', dag=subdag_B)
                    return subdag_B

                def subdag_C():
                    subdag_C = DAG(
                        'master.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_C.task', dag=subdag_C)
                    return subdag_C

                def subdag_D():
                    subdag_D = DAG(
                        'master.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_D.task', dag=subdag_D)
                    return subdag_D

                def subdag_0():
                    subdag_0 = DAG('master.opSubdag_0', default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A())
                    SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B())
                    return subdag_0

                def subdag_1():
                    subdag_1 = DAG('master.opSubdag_1', default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C())
                    SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D())
                    return subdag_1

                opSubdag_0 = SubDagOperator(
                    task_id='opSubdag_0', dag=dag, subdag=subdag_0())
                opSubdag_1 = SubDagOperator(
                    task_id='opSubdag_1', dag=dag, subdag=subdag_1())

                opA = DummyOperator(task_id='A')
                opA.set_downstream(opSubdag_0)
                opA.set_downstream(opSubdag_1)

            return dag
Example 18
}

dag = DAG(dag_id=DAG_NAME,
          default_args=args,
          start_date=days_ago(2),
          schedule_interval="@once",
          tags=['example'])

start = DummyOperator(
    task_id='start',
    dag=dag,
)

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    dag=dag,
)

section_2 = SubDagOperator(
    task_id='section-2',
    subdag=subdag(DAG_NAME, 'section-2', args),
    dag=dag,
)

end = DummyOperator(
Example 19
fenix_beta_adjust_import = SubDagOperator(
    task_id=task_id,
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name=task_id,
        job_name="firefox-android-beta-adjust-import",
        cluster_name="firefox-android-beta-adjust-import-{{ ds_nodash }}",
        idle_delete_ttl="600",
        num_workers=40,
        worker_machine_type="n1-standard-8",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_properties={
            "spark:spark.jars":
            "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
        },
        additional_metadata={"PIP_PACKAGES": "click==7.1.2"},
        python_driver_code="gs://{}/jobs/adjust_import.py".format(
            params.artifact_bucket),
        py_args=[
            "--pbkdf2",
            "--salt",
            "org.mozilla.fenix-salt",
            "--project",
            project,
            "--input_table",
            "tmp.adjust_firefox_preview",
            "--output_table",
            "firefox_android_beta_external.adjust_install_time_v1",
            "--bucket",
            params.storage_bucket,
        ],
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        artifact_bucket=params.artifact_bucket,
        storage_bucket=params.storage_bucket,
        default_args=subdag_args,
    ),
)
Example 20
usage_report = SubDagOperator(
    task_id="fx_usage_report",
    dag=dag,
    subdag=moz_dataproc_scriptrunner(
        parent_dag_name=dag.dag_id,
        dag_name='fx_usage_report',
        default_args=default_args,
        cluster_name=cluster_name,
        service_account=
        '*****@*****.**',
        job_name="Fx_Usage_Report",
        uri=
        "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/fx_usage_report.sh",
        env={
            "date": DS_WEEKLY,
            "bucket": output_bucket,
            "PYTHONPATH": "/usr/lib/spark/python/lib/pyspark.zip",
            "deploy_environment": "prod",
            # These env variables are needed in addition to the s3a configs, since some code uses boto to list bucket objects
            "AWS_ACCESS_KEY_ID": aws_access_key,
            "AWS_SECRET_ACCESS_KEY": aws_secret_key
        },
        gcp_conn_id=gcp_conn_id,
        # This is used to set the s3a configs for read/write to S3 for non-boto calls
        aws_conn_id=aws_conn_id,
        num_workers=9,
        worker_machine_type='n1-standard-16',
        image_version='1.3',
        init_actions_uris=[
            'gs://moz-fx-data-prod-airflow-dataproc-artifacts/bootstrap/fx_usage_init.sh'
        ],
    ))
Example 21
main_path = 'source/dataset/'
data_lake = dl_helper.DataLake(version='v3')
file_extension = '.txt'

raw_features.init(main_path, data_lake, file_extension)
vector_features.init(main_path, data_lake, file_extension)
xgboost_subdag.init(main_path, data_lake, file_extension)
naive_bayes_subdag.init(main_path, data_lake, file_extension)

#================#

child_dag_name = 'raw_features_extraction'
raw_extr_subdag = SubDagOperator(
    subdag=feature_extr_sub_dag(parent_dag_name, child_dag_name, default_args, dag.schedule_interval),
    task_id=child_dag_name,
    default_args=default_args,
    dag=dag)

start >> raw_extr_subdag

#================#

child_dag_name = 'vector_features_extraction'
vector_extr_subdag = SubDagOperator(
    subdag=vector_extr_sub_dag(parent_dag_name, child_dag_name, default_args, dag.schedule_interval),
    task_id=child_dag_name,
    default_args=default_args,
    dag=dag)

start >> vector_extr_subdag
Example 22
args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 12, 1),
    'retries': 1,
}

dag = DAG(dag_id='fda',
          default_args=args,
          max_active_runs=1,
          schedule_interval='@monthly')
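
# Each subdag's dag_id must be '<parent>.<child>' (here 'fda.dap' and 'fda.linker'), and the
# factory receives the parent's start_date and schedule_interval so the subdag stays in step.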

fda_dap_task = SubDagOperator(
    dag=dag,
    subdag=fda_dap(parent_dag_name='fda',
                   child_dag_name='dap',
                   start_date=dag.start_date,
                   schedule_interval=dag.schedule_interval),
    task_id='dap',
)

fda_linker_task = SubDagOperator(
    dag=dag,
    subdag=fda_dap(parent_dag_name='fda',
                   child_dag_name='linker',
                   start_date=dag.start_date,
                   schedule_interval=dag.schedule_interval),
    task_id='linker',
)

remove_unknown_documentcloud_docs_task = DockerOperator(
    task_id='remove_unknown_documentcloud_docs',
Example 23
start_date = datetime.datetime.utcnow()

dag = DAG(
    "lesson3.exercise3",
    start_date=start_date,
)
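
# The SubDagOperator's task_id must match the child name embedded in the subdag's dag_id,
# e.g. 'lesson3.exercise3.trips_subdag'.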

trips_task_id = "trips_subdag"
trips_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        "lesson3.exercise3",
        trips_task_id,
        "redshift",
        "aws_credentials",
        "trips",
        sql_statements.CREATE_TRIPS_TABLE_SQL,
        s3_bucket="udac-data-pipelines",
        s3_key="divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=start_date,
    ),
    task_id=trips_task_id,
    dag=dag,
)

stations_task_id = "stations_subdag"
stations_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        "lesson3.exercise3",
        stations_task_id,
        "redshift",
        "aws_credentials",
Example 24
                           value=kwargs['dag_run'].conf['action'])


action_xcom = PythonOperator(task_id='action_xcom',
                             dag=dag,
                             python_callable=xcom_push)

concurrency_check = ConcurrencyCheckOperator(
    task_id=DAG_CONCURRENCY_CHECK_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

get_design_version = SubDagOperator(
    subdag=get_design_deckhand(PARENT_DAG_NAME,
                               DECKHAND_GET_DESIGN_VERSION,
                               args=default_args),
    task_id=DECKHAND_GET_DESIGN_VERSION,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

validate_site_design = SubDagOperator(
    subdag=validate_site_design(PARENT_DAG_NAME,
                                VALIDATE_SITE_DESIGN_DAG_NAME,
                                args=default_args),
    task_id=VALIDATE_SITE_DESIGN_DAG_NAME,
    on_failure_callback=failure_handlers.step_failure_handler,
    dag=dag)

drydock_build = SubDagOperator(
    subdag=deploy_site_drydock(PARENT_DAG_NAME,
                               DRYDOCK_BUILD_DAG_NAME,
Example 25
t_analysis = BashOperator(task_id='analysis', bash_command=a1, dag=dag)

t_scrap_data = BashOperator(task_id='scrap_data', bash_command=a1, dag=dag)

t_run_main_PROJECTNAME = BashOperator(task_id='run_main_PROJECTNAME',
                                      bash_command=a1,
                                      dag=dag)
t_clean_data = BashOperator(task_id='clean_data', bash_command=a1, dag=dag)
t_download_data = BashOperator(task_id='download_data',
                               bash_command=a1,
                               dag=dag)
t_to_hive = BashOperator(task_id='to_hive', bash_command=a1, dag=dag)
feature_analysis = SubDagOperator(
    task_id='feature_analysis',
    subdag=subdag(DAG_NAME, 'feature_analysis', default_args),
    dag=dag,
)
feature_slope = SubDagOperator(
    task_id='feature_slope',
    subdag=subdag(DAG_NAME, 'feature_slope', default_args),
    dag=dag,
)
kdj_rsi = SubDagOperator(
    task_id='kdj_rsi_stockstats',
    subdag=subdag(DAG_NAME, 'kdj_rsi_stockstats', default_args),
    dag=dag,
)
download_main = DummyOperator(task_id='run_main_PROJECTNAME')
to_hive = DummyOperator(task_id='to_hive')
#t_mv_daily_report.set_upstream(t_dazongjiaoyi)
Example 26
load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
    sql_query=SqlQueries.songplay_table_insert)

load_user_dimension_table_task_id = 'Load_user_dim_table'
load_user_dimension_table = SubDagOperator(
    subdag=load_dimensional_tables_dag(
        parent_dag_name=dag_name,
        task_id=load_user_dimension_table_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        start_date=datetime(2018, 5, 1),
        table="users",
        sql_query=SqlQueries.user_table_insert,
    ),
    task_id=load_user_dimension_table_task_id,
    dag=dag,
)

load_song_dimension_table_task_id = 'Load_song_dim_table'
load_song_dimension_table = SubDagOperator(
    subdag=load_dimensional_tables_dag(
        parent_dag_name=dag_name,
        task_id=load_song_dimension_table_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        start_date=datetime(2018, 5, 1),
Example 27
def sub_dag_operator_with_default_executor(subdag, *args, **kwargs):
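    # Some Airflow versions run a subdag with a sequential executor by default; passing the
    # configured default executor lets the subdag's tasks run on the same executor as the
    # rest of the installation.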
    return SubDagOperator(subdag=subdag, executor=GetDefaultExecutor(), *args, **kwargs)
Example 28
start_operator = DummyOperator(dag=dag, task_id='start_operator')

# Read table definitions from YAML file
with open('dags/configuration/copy_from_s3_to_redshift.yml', 'r') as file:
    copy_definitions = yaml.safe_load(file)

with dag:
    subdag_id = 'copy_data_to_redshift'
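    # An explicit executor is passed so the subdag's copy tasks run with the LocalExecutor
    # rather than SubDagOperator's default.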
    copy_data_to_redshift = SubDagOperator(subdag=get_s3_to_redshift(
        parent_dag_name='udacity-dend-capstone',
        task_id=subdag_id,
        tables_definition=copy_definitions,
        redshift_conn_id='redshift',
        redshift_schema='public',
        s3_conn_id='aws_credentials',
        s3_bucket='udac-dend-capstone-dz',
        load_type='truncate',
        schema_location='Local',
        start_date=start_date),
                                           task_id=subdag_id,
                                           dag=dag,
                                           executor=LocalExecutor())
    copy_data_to_redshift.set_upstream(start_operator)

process_dim_category = PostgresOperator(dag=dag,
                                        task_id='process_dim_category',
                                        sql='/sql/categories.sql',
                                        postgres_conn_id='redshift')
process_dim_category.set_upstream(copy_data_to_redshift)

process_dim_cities = PostgresOperator(dag=dag,
Example 29
def git_push_callback(context):
    slack_report(context, status='new data')


git_push_task = GitPushOperator(task_id='git_push', dag=dag,
                                pool='etl',
                                dataset=out_dir,
                                on_success_callback=git_push_callback)

# resetting the branch in case anything failed
cleanup_task = GitResetAndGoMasterOperator(task_id='cleanup', dag=dag, dataset=out_dir, trigger_rule="all_done")


# set dependencies
if len(depends_on) > 0:
    dependency_task = SubDagOperator(subdag=sub_dag(), task_id='dependency_check', on_failure_callback=None,
                                     dag=dag)
    dependency_task >> checkout_task

# etl
(checkout_task >>
 git_pull_task >>
 source_update_task >>
 recipe_task >>
 datapackage_task >>
 validate_ddf >>
 git_commit_task
)

# commit
do_nothing = DummyOperator(task_id='do_nothing', dag=dag)
git_commit_task >> branch_task
Example 30
    dag=dag,
)

taar_lite = SubDagOperator(
    task_id="taar_lite",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name="taar_amodump",
        dag_name="taar_lite",
        default_args=default_args,
        cluster_name=taarlite_cluster_name,
        job_name="TAAR_Lite_GUID_GUID",
        python_driver_code=
        "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_lite_guidguid.py",
        # python_driver_code="gs://temp-hwoo-removemelater/taar_lite_guidguid.py",
        num_workers=8,
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--aws_access_key_id",
            aws_access_key,
            "--aws_secret_access_key",
            aws_secret_key,
        ],
        aws_conn_id=aws_conn_id,
        gcp_conn_id=gcpdataproc_conn_id,
    ),
    dag=dag,
)
# Set a dependency on amodump from amowhitelist
amowhitelist.set_upstream(amodump)