예제 #1
0
파일: core.py 프로젝트: pkexcellent/airflow
 def setUp(self):
     """Call ``configuration.test_mode()`` and ``utils.initdb()``, then build
     the test DAG, clear it from DEFAULT_DATE up to now and keep it on
     ``self.dag``.
     """
     configuration.test_mode()
     utils.initdb()
     default_args = dict(owner='airflow', start_date=datetime(2015, 1, 1))
     test_dag = DAG(TEST_DAG_ID, default_args=default_args)
     test_dag.clear(start_date=DEFAULT_DATE, end_date=datetime.now())
     self.dag = test_dag
예제 #2
0
    def test_schedule_dag_no_previous_runs(self):
        """
        Create a dag run for a DAG that has never run and check its fields.
        """
        first_start = datetime(2015, 1, 2, 0, 0)
        dag = DAG(self.TEST_SCHEDULE_WITH_NO_PREVIOUS_RUNS_DAG_ID)
        fake_task = BaseOperator(
            task_id="faketastic",
            owner='Also fake',
            start_date=first_start)
        dag.add_task(fake_task)

        processor = DagFileProcessor(dag_ids=[], log=mock.MagicMock())
        dag_run = processor.create_dag_run(dag)

        # A run must have been created for this dag, with a non-empty run_id.
        self.assertIsNotNone(dag_run)
        self.assertEqual(dag.dag_id, dag_run.dag_id)
        self.assertIsNotNone(dag_run.run_id)
        self.assertNotEqual('', dag_run.run_id)

        # The run's execution date is expected to equal the task start_date.
        mismatch_msg = 'dag_run.execution_date did not match expectation: {0}'.format(
            dag_run.execution_date)
        self.assertEqual(first_start, dag_run.execution_date, msg=mismatch_msg)

        self.assertEqual(State.RUNNING, dag_run.state)
        self.assertFalse(dag_run.external_trigger)
        dag.clear()
예제 #3
0
파일: core.py 프로젝트: kundeng/airflow
 def setUp(self):
     """Run ``configuration.test_mode()`` and ``utils.initdb()``, then store a
     freshly cleared test DAG on ``self.dag``.
     """
     configuration.test_mode()
     utils.initdb()
     fresh_dag = DAG(
         TEST_DAG_ID,
         default_args={'owner': 'airflow', 'start_date': datetime(2015, 1, 1)},
     )
     # Clear everything between DEFAULT_DATE and the current time.
     fresh_dag.clear(start_date=DEFAULT_DATE, end_date=datetime.now())
     self.dag = fresh_dag
class BranchOperatorTest(unittest.TestCase):
    """Tests for BranchPythonOperator on a DAG with two downstream branches."""

    def setUp(self):
        """Build and clear a DAG where ``make_choice`` feeds ``branch_1`` and
        ``branch_2``; the branching callable always returns ``'branch_1'``.
        """
        self.dag = DAG('branch_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_op)
        self.dag.clear()

    def _assert_branch_states(self, tis):
        """Assert the expected state of every task instance in *tis*.

        ``make_choice`` must be SUCCESS, ``branch_1`` must exist with state
        NONE and ``branch_2`` must be SKIPPED. Any other task_id fails the
        test explicitly (a bare ``raise`` here would only produce an unrelated
        ``RuntimeError``).
        """
        expected = {
            'make_choice': State.SUCCESS,
            'branch_1': State.NONE,
            'branch_2': State.SKIPPED,
        }
        for ti in tis:
            if ti.task_id not in expected:
                self.fail("Unexpected task instance: {}".format(ti.task_id))
            self.assertEqual(ti.state, expected[ti.task_id])

    def test_without_dag_run(self):
        """Run the branch operator without creating a DagRun first and check
        the task instance states stored for DEFAULT_DATE.
        """
        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        try:
            tis = session.query(TI).filter(
                TI.dag_id == self.dag.dag_id,
                TI.execution_date == DEFAULT_DATE
            )
            # Iterate while the session is still open; the query is only
            # executed when it is iterated.
            self._assert_branch_states(tis)
        finally:
            session.close()

    def test_with_dag_run(self):
        """Run the branch operator inside an explicitly created DagRun and
        check the states of that run's task instances.
        """
        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        self._assert_branch_states(dr.get_task_instances())
    def test_with_dag_run(self):
        """Run a ShortCircuitOperator inside an explicit DagRun for both
        outcomes of its callable.

        While the callable returns False, ``branch_1`` and ``branch_2`` are
        expected to be SKIPPED. After switching it to True and clearing the
        DAG, they are expected to be in state NONE. ``upstream`` and
        ``make_choice`` must be SUCCESS in both passes.
        """
        value = False
        dag = DAG('shortcircuit_operator_test_with_dag_run',
                  default_args={
                       'owner': 'airflow',
                       'start_date': DEFAULT_DATE
                  },
                  schedule_interval=INTERVAL)
        # The lambda reads ``value`` when it is called, so reassigning the
        # local below changes the operator's decision on the second pass.
        short_op = ShortCircuitOperator(task_id='make_choice',
                                        dag=dag,
                                        python_callable=lambda: value)
        branch_1 = DummyOperator(task_id='branch_1', dag=dag)
        branch_1.set_upstream(short_op)
        branch_2 = DummyOperator(task_id='branch_2', dag=dag)
        branch_2.set_upstream(branch_1)
        upstream = DummyOperator(task_id='upstream', dag=dag)
        upstream.set_downstream(short_op)
        dag.clear()

        def assert_states(task_instances, branch_state):
            """Check the four task instances; fail on any unknown task_id."""
            self.assertEqual(len(task_instances), 4)
            expected = {
                'make_choice': State.SUCCESS,
                'upstream': State.SUCCESS,
                'branch_1': branch_state,
                'branch_2': branch_state,
            }
            for ti in task_instances:
                if ti.task_id not in expected:
                    # An explicit failure instead of a bare ``raise``, which
                    # would raise RuntimeError with no active exception.
                    self.fail("Unexpected task instance: {}".format(ti.task_id))
                self.assertEqual(ti.state, expected[ti.task_id])

        logging.error("Tasks {}".format(dag.tasks))
        dr = dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        assert_states(dr.get_task_instances(), State.SKIPPED)

        value = True
        dag.clear()
        dr.verify_integrity()
        upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        assert_states(dr.get_task_instances(), State.NONE)
    def test_without_dag_run(self):
        """Run a ShortCircuitOperator without creating a DagRun first, for
        both outcomes of its callable.

        Only ``make_choice``, ``branch_1`` and ``branch_2`` may have task
        instances; an ``upstream`` instance (or any other) fails the test.
        The branches are expected to be SKIPPED while the callable returns
        False and in state NONE after it returns True.
        """
        value = False
        dag = DAG('shortcircuit_operator_test_without_dag_run',
                  default_args={
                       'owner': 'airflow',
                       'start_date': DEFAULT_DATE
                  },
                  schedule_interval=INTERVAL)
        # The lambda reads ``value`` when it is called, so reassigning the
        # local below changes the operator's decision on the second pass.
        short_op = ShortCircuitOperator(task_id='make_choice',
                                        dag=dag,
                                        python_callable=lambda: value)
        branch_1 = DummyOperator(task_id='branch_1', dag=dag)
        branch_1.set_upstream(short_op)
        branch_2 = DummyOperator(task_id='branch_2', dag=dag)
        branch_2.set_upstream(branch_1)
        upstream = DummyOperator(task_id='upstream', dag=dag)
        upstream.set_downstream(short_op)
        dag.clear()

        def assert_states(task_instances, branch_state):
            """Check every task instance; fail on any unexpected task_id."""
            expected = {
                'make_choice': State.SUCCESS,
                'branch_1': branch_state,
                'branch_2': branch_state,
            }
            for ti in task_instances:
                if ti.task_id not in expected:
                    # Covers 'upstream' (should not exist) and unknown ids.
                    # A bare ``raise`` here would only raise RuntimeError.
                    self.fail("Unexpected task instance: {}".format(ti.task_id))
                self.assertEqual(ti.state, expected[ti.task_id])

        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        try:
            tis = session.query(TI).filter(
                TI.dag_id == dag.dag_id,
                TI.execution_date == DEFAULT_DATE
            )

            assert_states(tis, State.SKIPPED)

            value = True
            dag.clear()

            short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
            # Iterating the query object again re-runs it.
            assert_states(tis, State.NONE)
        finally:
            # Close the session even when an assertion above fails.
            session.close()
예제 #7
0
def test_operator():
    """Create a DAG named 'test' containing a single
    HttpToGoogleCloudStorageOperator, clear the DAG and return the operator.
    """
    default_args = {'start_date': datetime(2019, 1, 1)}
    dag = DAG(dag_id='test', default_args=default_args)

    with dag:
        operator = HttpToGoogleCloudStorageOperator(
            task_id='test',
            endpoint='/test?ds={{ ds }}/',
            bucket='test-bucket',
            filename='/data/ds={{ ds }}/file',
        )

    dag.clear()

    return operator
예제 #8
0
    def test_schedule_dag_once(self):
        """
        A dag scheduled with '@once' must get a dag run on the first
        create_dag_run call and none on the second.
        """
        dag = DAG(self.TEST_SCHEDULE_ONCE_DAG_ID)
        dag.schedule_interval = '@once'
        fake_task = BaseOperator(
            task_id="faketastic",
            owner='Also fake',
            start_date=datetime(2015, 1, 2, 0, 0))
        dag.add_task(fake_task)

        # Each call uses its own, freshly built DagFileProcessor.
        first_processor = DagFileProcessor(dag_ids=[], log=mock.MagicMock())
        first_run = first_processor.create_dag_run(dag)
        second_processor = DagFileProcessor(dag_ids=[], log=mock.MagicMock())
        second_run = second_processor.create_dag_run(dag)

        self.assertIsNotNone(first_run)
        self.assertIsNone(second_run)
        dag.clear()
예제 #9
0
    def test_schedule_dag_once(self):
        """
        A dag scheduled with '@once' must get a dag run on the first
        create_dag_run call and none on the second.
        """
        dag = DAG(self.TEST_SCHEDULE_ONCE_DAG_ID)
        dag.schedule_interval = '@once'
        fake_task = BaseOperator(
            task_id="faketastic",
            owner='Also fake',
            start_date=datetime(2015, 1, 2, 0, 0))
        dag.add_task(fake_task)

        # Each call uses its own, freshly built SchedulerJob.
        first_scheduler = jobs.SchedulerJob(**self.default_scheduler_args)
        first_run = first_scheduler.create_dag_run(dag)
        second_scheduler = jobs.SchedulerJob(**self.default_scheduler_args)
        second_run = second_scheduler.create_dag_run(dag)

        self.assertIsNotNone(first_run)
        self.assertIsNone(second_run)
        dag.clear()
예제 #10
0
    def test_schedule_dag_relativedelta(self):
        """
        Create two consecutive dag runs for a dag whose schedule_interval is
        a relativedelta and check that the second is one interval later.
        """
        delta = relativedelta(hours=+1)
        first_date = datetime(2015, 1, 2, 0, 0)
        dag = DAG(self.TEST_SCHEDULE_RELATIVEDELTA_DAG_ID,
                  schedule_interval=delta)
        dag.add_task(BaseOperator(
            task_id="faketastic",
            owner='Also fake',
            start_date=first_date))

        def check_run(run, expected_date, mismatch_template):
            """Assert the common expectations for one created dag run."""
            self.assertIsNotNone(run)
            self.assertEqual(dag.dag_id, run.dag_id)
            self.assertIsNotNone(run.run_id)
            self.assertNotEqual('', run.run_id)
            self.assertEqual(
                expected_date,
                run.execution_date,
                msg=mismatch_template.format(run.execution_date)
            )
            self.assertEqual(State.RUNNING, run.state)
            self.assertFalse(run.external_trigger)

        dag_file_processor = DagFileProcessor(dag_ids=[], log=mock.MagicMock())

        # The first run is checked before the second one is created.
        dag_run = dag_file_processor.create_dag_run(dag)
        check_run(
            dag_run,
            datetime(2015, 1, 2, 0, 0),
            'dag_run.execution_date did not match expectation: {0}')

        dag_run2 = dag_file_processor.create_dag_run(dag)
        check_run(
            dag_run2,
            datetime(2015, 1, 2, 0, 0) + delta,
            'dag_run2.execution_date did not match expectation: {0}')

        dag.clear()
예제 #11
0
            f'The two values differ {pulled_value_1} and {value_1}')
    if pulled_value_2 != value_2:
        raise ValueError(
            f'The two values differ {pulled_value_2} and {value_2}')


# Two tasks wrapping the push callables and one task wrapping the puller.
push1 = PythonOperator(task_id='push', python_callable=push, dag=dag)

push2 = PythonOperator(
    task_id='push_by_returning',
    python_callable=push_by_returning,
    dag=dag,
)

pull = PythonOperator(task_id='puller', python_callable=puller, dag=dag)

# Both push tasks are declared upstream of the puller.
pull << [push1, push2]

if __name__ == '__main__':
    # When executed as a script: clear the DAG, then run it.
    from airflow.utils.state import State

    dag.clear(dag_run_state=State.NONE)
    dag.run()
예제 #12
0
def test_already_running_then_skip(env, setup_teardown, airflow_session):
    """Run ScheduleDataflowJobOperator against a mocked 409 launch response
    and check the resulting task instance states: the scheduling task is
    expected to be SUCCESS, the two downstream tasks SKIPPED.
    """
    data_dir = '/workspace/airflow/dags/libs/shared/data'

    def datafile(filename):
        """Return the path of *filename* inside the shared data directory."""
        return os.path.join(data_dir, filename)

    # Save these snippets for later in case we need to mock a success. - Stu M. 4/29/19
    # http = HttpMock(datafile('dataflow.json'), {'status': '200'})
    # requestBuilder = RequestMockBuilder(
    #     {'dataflow.projects.templates.launch': (None, '{"job": ""}')}
    # )
    # with pytest.raises(HttpError) as e:
    #         job = task.execute(ti.get_template_context())
    #         assert e.resp.status == 409

    http = HttpMock(datafile('dataflow.json'), {'status': '200'})

    # The mocked template launch answers with HTTP status 409 and an empty body.
    conflict_response = httplib2.Response({
        'status': '409',
        'reason': 'Server Error'
    })
    request_builder = RequestMockBuilder(
        {'dataflow.projects.templates.launch': (conflict_response, b'')})

    dag = DAG(
        'shortcircuit_operator_test_with_dag_run',
        default_args={'owner': 'airflow', 'start_date': DEFAULT_DATE},
        schedule_interval=INTERVAL,
    )
    job_parameters = {
        'client': 'bluesun',
        'table': 'pyr_bluesun_local.tree_user_types',
        'dest': '{}:lake.tree_user_types'.format(env['project'])
    }
    task = ScheduleDataflowJobOperator(
        project=env['project'],
        template_name='load_vibe_to_lake',
        job_name='schedule-dataflow-test-{}'.format(int(time.time())),
        job_parameters=job_parameters,
        dag=dag,
        task_id='schedule_dataflow_operation',
        http=http,
        requestBuilder=request_builder)

    middle_task = DummyOperator(task_id='middle_task', dag=dag)
    finish_task = DummyOperator(task_id='finish', dag=dag)

    task >> middle_task >> finish_task

    dag.clear()

    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    # Task ids missing from this table are not checked.
    expected_states = {
        'schedule_dataflow_operation': State.SUCCESS,
        'middle_task': State.SKIPPED,
        'finish': State.SKIPPED,
    }
    with airflow_session() as session:
        tis = session.query(TaskInstance).filter(
            TaskInstance.dag_id == dag.dag_id,
            TaskInstance.execution_date == DEFAULT_DATE)
        for ti in tis:
            if ti.task_id in expected_states:
                assert ti.state == expected_states[ti.task_id]
예제 #13
0
import os
from utils import default_args, resolve_file
from airflow import DAG
from airflow_kubernetes_job_operator.kubernetes_job_operator import KubernetesJobOperator


# DAG with no schedule and catchup disabled.
dag = DAG(
    "kub-job-op-custom",
    description="Test base job operator",
    default_args=default_args,
    catchup=False,
    schedule_interval=None,
)

with dag:
    # The job body is read from the file named after this module plus ".yaml".
    KubernetesJobOperator(
        task_id="test_dbl_log",
        body_filepath="{}.yaml".format(__file__),
    )

if __name__ == "__main__":
    # When executed as a script: clear the DAG, then run it.
    dag.clear()
    dag.run()
예제 #14
0
class ShortCircuitOperatorTest(unittest.TestCase):
    """Tests for ShortCircuitOperator on the chain
    ``upstream -> make_choice -> branch_1 -> branch_2``.
    """

    def setUp(self):
        """Build and clear the test DAG; the operator's callable returns
        ``self.value``, which starts out as True.
        """
        self.dag = DAG('shortcircuit_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        # The lambda reads self.value when it is called, so each test can
        # change the decision by assigning self.value before running.
        self.short_op = ShortCircuitOperator(task_id='make_choice',
                                             dag=self.dag,
                                             python_callable=lambda: self.value)

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.short_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_1)
        self.upstream = DummyOperator(task_id='upstream', dag=self.dag)
        self.upstream.set_downstream(self.short_op)
        self.dag.clear()

        self.value = True

    def _assert_states(self, tis, expected):
        """Assert that each task instance in *tis* has the state mapped to
        its task_id in *expected*.

        A task_id missing from *expected* fails the test explicitly (a bare
        ``raise`` here would only produce an unrelated ``RuntimeError``).
        """
        for ti in tis:
            if ti.task_id not in expected:
                self.fail("Unexpected task instance: {}".format(ti.task_id))
            self.assertEqual(ti.state, expected[ti.task_id])

    def test_without_dag_run(self):
        """Run the operator without creating a DagRun first, for both
        outcomes of the callable.

        No ``upstream`` task instance may exist. The branches are expected to
        be SKIPPED for a False result and in state NONE for a True result.
        """
        self.value = False
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        try:
            tis = session.query(TI).filter(
                TI.dag_id == self.dag.dag_id,
                TI.execution_date == DEFAULT_DATE
            )

            self._assert_states(tis, {
                'make_choice': State.SUCCESS,
                'branch_1': State.SKIPPED,
                'branch_2': State.SKIPPED,
            })

            self.value = True
            self.dag.clear()

            self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
            # Iterating the query object again re-runs it.
            self._assert_states(tis, {
                'make_choice': State.SUCCESS,
                'branch_1': State.NONE,
                'branch_2': State.NONE,
            })
        finally:
            # Close the session even when an assertion above fails.
            session.close()

    def test_with_dag_run(self):
        """Run ``upstream`` and the operator inside an explicitly created
        DagRun, for both outcomes of the callable.

        All four task instances must exist. The branches are expected to be
        SKIPPED for a False result and in state NONE for a True result.
        """
        self.value = False
        logging.error("Tasks {}".format(self.dag.tasks))
        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        self._assert_states(tis, {
            'make_choice': State.SUCCESS,
            'upstream': State.SUCCESS,
            'branch_1': State.SKIPPED,
            'branch_2': State.SKIPPED,
        })

        self.value = True
        self.dag.clear()
        dr.verify_integrity()
        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        self._assert_states(tis, {
            'make_choice': State.SUCCESS,
            'upstream': State.SUCCESS,
            'branch_1': State.NONE,
            'branch_2': State.NONE,
        })
예제 #15
0
        'team_col_name': TEAM_COL_NAME,
        'validation_results_path': VALIDATION_RESULTS_PATH,
        'parse_results_path': PARSE_RESULTS_PATH,
        'error_stats_path': ERROR_STATS_PATH,
        'blacklist_conn_id': PROJECT_DB_CONN_ID,
        'temp_table_name': TEMP_TABLE_NAME
    },
    templates_dict={'load_data_query': 'load_data.sql'},
    provide_context=True,
    dag=dag)

# Task that calls send_report with the Telegram settings read from Airflow
# Variables and the path of the error statistics file.
send_report_task = PythonOperator(
    task_id='send_report',
    python_callable=send_report,
    op_kwargs=dict(
        token_id=Variable.get('HW3_TELEGRAM_BOT_TOKEN'),
        chat_id=Variable.get('HW3_TELEGRAM_CHAT_ID'),
        error_stats_path=ERROR_STATS_PATH,
    ),
    provide_context=True,
    dag=dag,
)

# Linear pipeline: each task is downstream of the one before it.
process_urls_task >> parse_urls_task >> calculate_stats_task >> send_report_task

if __name__ == '__main__':
    # When executed as a script: clear the DAG (with reset_dag_runs), then run it.
    dag.clear(reset_dag_runs=True)
    dag.run()
예제 #16
0
              schedule_interval='0 3 * * *')

# Task that creates the EMR job flow, overriding its steps with BLP_STEPS.
blp_logs = EmrCreateJobFlowOperator(
    task_id='blp_create_job_flow',
    dag=blp_dag,
    aws_conn_id='aws_data_iam',
    emr_conn_id='emr_data_iam_mango',
    job_flow_overrides=dict(Steps=BLP_STEPS),
)

# Sensor on the job flow id pulled from the create task's XCom return value.
# On retry, the DAG is cleared for the retried execution date only.
blp_job_sensor = EmrJobFlowSensor(
    task_id='blp_check_job_flow',
    dag=blp_dag,
    aws_conn_id='aws_data_iam',
    job_flow_id=(
        "{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}"
    ),
    on_retry_callback=lambda context: blp_dag.clear(
        start_date=context['execution_date'],
        end_date=context['execution_date'],
    ),
)

gcp_conn_id = "google_cloud_derived_datasets"
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

gcstj_object_conditions = dict(includePrefixes='blpadi/{{ ds }}')

gcstj_transfer_options = dict(deleteObjectsUniqueInSink=True)

bq_args = [
    'bq',
    '--location=US',
    'load',
    '--source_format=CSV',
# Task that creates the EMR job flow, overriding its steps with BLP_STEPS.
blp_logs = EmrCreateJobFlowOperator(
    dag=blp_dag,
    task_id='blp_create_job_flow',
    job_flow_overrides={'Steps': BLP_STEPS},
    emr_conn_id='emr_data_iam_mango',
    aws_conn_id='aws_data_iam',
)

# Sensor on the job flow id pulled from the create task's XCom return value.
# Its retry callback clears the DAG for the retried execution date only.
blp_job_sensor = EmrJobFlowSensor(
    dag=blp_dag,
    task_id='blp_check_job_flow',
    aws_conn_id='aws_data_iam',
    job_flow_id="{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}",
    on_retry_callback=(
        lambda context: blp_dag.clear(
            start_date=context['execution_date'],
            end_date=context['execution_date'],
        )
    ),
)

gcp_conn_id = "google_cloud_derived_datasets"
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

gcstj_object_conditions = {'includePrefixes': 'blpadi/{{ ds }}'}
gcstj_transfer_options = {'deleteObjectsUniqueInSink': True}

bq_args = [