Example #1
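Fans out one templated 'load_vibe_to_lake' Dataflow job per configured table: each ScheduleDataflowJobOperator is paired with a DataflowJobStateSensor and wired between shared start and finish tasks.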
def create_dag(client):
    dag = DAG(DAG_ID,
              default_args=default_args,
              schedule_interval=None)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')

        for table in airflow_vars['dags']['vibe_to_lake']['tables']:
            pusher_task_id = f'schedule_dataflow_{table}'
            schedule_df_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=project,
                template_name='load_vibe_to_lake',
                job_name=f'vibe-to-lake---client---{table}',
                job_parameters={
                    'client': '--client--',
                    'table': f'`{project}.pyr_--client--_{env}.{table}`',
                    'dest': f'{project}:lake.{table}'
                },
                provide_context=True
            )
            monitor_df_job_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                pusher_task_id=pusher_task_id,
                poke_interval=airflow_vars['dags']['vibe_to_lake']['poke_interval'],
                timeout=airflow_vars['dags']['vibe_to_lake']['poke_timeout'],
                dag=dag
            )
            start_task.set_downstream(schedule_df_task)
            schedule_df_task.set_downstream(monitor_df_job_task)
            monitor_df_job_task.set_downstream(finish_task)

        start_task >> finish_task
    return dag
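These DAGs read their table lists and sensor settings from a nested `airflow_vars` mapping (Example #9 fetches the same kind of structure through a `get_airflow_vars()` helper). The sketch below shows a plausible shape for that mapping, inferred only from the lookups in the examples; the values are placeholders, not taken from the source.

# Illustrative shape of the airflow_vars mapping indexed by the DAGs above.
# Only the keys mirror the code; the values here are made-up placeholders.
airflow_vars = {
    'dags': {
        'vibe_to_lake': {
            'tables': ['tree_user_types'],  # placeholder table list
            'poke_interval': 60,            # seconds between sensor pokes
            'poke_timeout': 3600            # seconds before the sensor times out
        }
    }
}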
Example #2
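An hourly DAG that branches on `should_run`, schedules a single 'load_wrench_to_lake' Dataflow job, waits for it with a DataflowJobStateSensor, then moves the processed files.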
def create_dag():
    dag = DAG(DAG_ID,
              default_args=default_args,
              schedule_interval='@hourly',
              catchup=False)
    with dag:
        finish_task = DummyOperator(task_id='finish')
        pusher_task_id = 'schedule_df_wrench_to_lake'
        should_run_task = BranchPythonOperator(task_id='should_run',
                                               python_callable=should_run)
        schedule_df_task = ScheduleDataflowJobOperator(
            task_id=pusher_task_id,
            project=project_id,
            template_name='load_wrench_to_lake',
            job_name='wrench-to-lake',
            job_parameters={},
            provide_context=True)
        monitor_df_job_task = DataflowJobStateSensor(
            task_id='monitor_df_job',
            pusher_task_id=pusher_task_id,
            poke_interval=airflow_vars['dags']['wrench_to_lake']['poke_interval'],
            timeout=airflow_vars['dags']['wrench_to_lake']['poke_timeout'],
            dag=dag)
        move_files_task = PythonOperator(task_id='move_processed_files',
                                         python_callable=move_files)
        should_run_task >> schedule_df_task >> monitor_df_job_task >> move_files_task >> finish_task

    return dag
Example #3
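A test that builds a throwaway DAG, executes the operator directly with a TaskInstance template context, and asserts the returned Dataflow job belongs to the expected project.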
def test_returns_job(env):
    with DAG(dag_id='schedule_dataflow_test',
             start_date=datetime.now(),
             schedule_interval=None) as dag:
        task = ScheduleDataflowJobOperator(
            project=env['project'],
            template_name='load_vibe_to_lake',
            job_name='schedule-dataflow-test-{}'.format(int(time.time())),
            job_parameters={
                'client': 'bluesun',
                'table': 'pyr_bluesun_local.tree_user_types',
                'dest': '{}:lake.tree_user_types'.format(env['project'])
            },
            dag=dag,
            task_id='test_task')

    ti = TaskInstance(task=task, execution_date=datetime.now())
    job = task.execute(ti.get_template_context())
    assert job['projectId'] == env['project']
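Example #4
An hourly GCS-to-lake DAG: for each file prefix in `table_map` it branches on whether matching files exist in the imports bucket, schedules a 'load_cdc_from_gcs_to_lake' Dataflow job, monitors it, and moves the processed files to the processed-files bucket.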
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='0 * * * *',
        catchup=False)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')
        storage = CloudStorage.factory(project_id)
        cdc_imports_bucket = storage.get_bucket(bucket)
        cdc_imports_processed_bucket = storage.get_bucket(processed_bucket)

        for files_startwith, table in table_map.items():
            pusher_task_id = f'schedule_df_gcs_to_lake_{table}'
            continue_if_file_task = BranchPythonOperator(
                task_id=f'continue_if_file_{files_startwith}',
                python_callable=should_continue,
                op_args=[files_startwith, cdc_imports_bucket, table])
            schedule_df_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=project_id,
                template_name='load_cdc_from_gcs_to_lake',
                job_name=f'gcs-to-lake-{table}',
                job_parameters={
                    'files_startwith': files_startwith,
                    'dest': f'{project_id}:lake.{table}'
                },
                provide_context=True)
            monitor_df_job_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                pusher_task_id=pusher_task_id,
                poke_interval=airflow_vars['dags']['cdc_from_gcs_to_lake']['poke_interval'],
                timeout=airflow_vars['dags']['cdc_from_gcs_to_lake']['poke_timeout'],
                dag=dag)
            move_files_task = PythonOperator(
                task_id=f'move_processed_files_{files_startwith}',
                python_callable=storage.move_files,
                op_args=[
                    files_startwith, cdc_imports_bucket,
                    cdc_imports_processed_bucket
                ],
            )
            (start_task >> continue_if_file_task >> schedule_df_task >>
             monitor_df_job_task >> move_files_task >> finish_task)
    return dag
Example #5
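A test helper that constructs the operator with a `pull_parameters` list, each entry referencing an XCom `key` or `task_id` and an optional target `param_name`.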
def _create_df_task(dag):
    return ScheduleDataflowJobOperator(
        project=env['project'],
        template_name='load_vibe_to_lake',
        job_name='schedule-dataflow-test-{}'.format(int(time.time())),
        job_parameters={
            'client': 'bluesun',
            'table': 'pyr_bluesun_local.tree_user_types',
            'dest': '{}:lake.tree_user_types'.format(env['project'])
        },
        pull_parameters=[{
            'key': 'one'
        }, {
            'key': 'two',
            'param_name': 'two-specific-key'
        }, {
            'task_id': 'three',
            'param_name': 'three'
        }],
        dag=dag,
        task_id='test_task')
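Example #6
A Cloud SQL import pipeline: Bash tasks start the SQL instance, recreate the database, and import a dump; for each table in a JSON table list a 'load_sql_to_bq' Dataflow job is scheduled and monitored between the import and cleanup steps, and the instance is stopped at the end.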
def create_dag(client):
    dag = DAG(DAG_ID, default_args=default_args, schedule_interval=None)
    with dag:
        tables = []
        with open(f'{settings.DAGS_FOLDER}/table_lists/table-list.json', 'r') as f:
            file_json_content = f.read()
            tables = json.loads(file_json_content)

        should_run_task = BranchPythonOperator(
            task_id='should_run',
            python_callable=should_run
        )
        start_sql_instance_task = BashOperator(
            task_id='start_sql_instance',
            bash_command=start_sql_cmd
        )
        pre_delete_database_task = BashOperator(
            task_id=f'pre_delete_{database_name}_database',
            bash_command=delete_db_cmd
        )
        create_db_task = BashOperator(
            task_id=f'create_{database_name}_database',
            bash_command=create_db_cmd
        )
        import_db_task = BashOperator(
            task_id=f'import_{database_name}_database',
            bash_command=import_db_cmd
        )
        delete_db_import_file_task = PythonOperator(
            task_id='delete_db_import_file',
            python_callable=delete_db_import_file
        )
        post_delete_database_task = BashOperator(
            task_id=f'post_delete_{database_name}_database',
            bash_command=delete_db_cmd
        )
        stop_sql_instance_task = BashOperator(
            task_id='stop_sql_instance',
            bash_command=stop_sql_cmd
        )
        finish_task = DummyOperator(
            task_id='finish'
        )

        try:
            for t in tables:
                pusher_task_id = f'schedule_dataflow_job_for_{t["table"]}'
                schedule_df_task = ScheduleDataflowJobOperator(
                    task_id=pusher_task_id,
                    project=project_id,
                    template_name='load_sql_to_bq',
                    job_name=f'load---client---{t["table"]}-sql-to-bq',
                    job_parameters={
                        'env': env,
                        'client': '--client--',
                        'bq_table': f'{project_id}:{database_name}.{t["table"]}',
                        'table': t["table"],
                        'key_field': t["keyField"]
                    },
                    provide_context=True
                )
                monitor_df_job_task = DataflowJobStateSensor(
                    task_id=f'monitor_df_job_{t["table"]}',
                    pusher_task_id=pusher_task_id,
                    poke_interval=airflow_vars['dags']['vibe_to_bq_initial_load']['poke_interval'],
                    timeout=airflow_vars['dags']['vibe_to_bq_initial_load']['poke_timeout']
                )
                import_db_task.set_downstream(schedule_df_task)
                schedule_df_task.set_downstream(monitor_df_job_task)
                monitor_df_job_task.set_downstream(delete_db_import_file_task)
        except Exception as e:
            log.error(e)

        (
            should_run_task
            >> start_sql_instance_task
            >> pre_delete_database_task
            >> create_db_task
            >> import_db_task
            >> delete_db_import_file_task
            >> post_delete_database_task
            >> stop_sql_instance_task
            >> finish_task
        )

    return dag
Example #7
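A staggered hourly lake-to-staging DAG: per table it reads a checkpoint, branches on whether there is new data, builds a query, schedules a table-specific Dataflow template with the query pulled from XCom, monitors the job, and commits the checkpoint.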
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='30 * * * *',
        catchup=False)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')

        for table, sources in table_map.items():
            pusher_task_id = f'schedule_dataflow_{table}'
            parsed_table = gcloud.parse_table_name(table)

            get_checkpoint_task = GetCheckpointOperator(
                task_id=f'get_checkpoint_{table}',
                env=env,
                target=table,
                sources=sources)

            continue_if_data_task = BranchPythonOperator(
                task_id=f'continue_if_data_{table}',
                python_callable=should_continue,
                op_args=[table],
                provide_context=True)

            parse_query_task = PythonOperator(task_id=f'parse_query_{table}',
                                              python_callable=parse_query,
                                              op_args=[table],
                                              provide_context=True)

            dataflow_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=gcloud.project(env),
                template_name=f'load_lake_to_staging_{parsed_table}',
                job_name=f'lake-to-staging-{table}',
                job_parameters={'env': env},
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)

            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                poke_interval=airflow_vars['dags']['lake_to_staging']['poke_interval'],
                timeout=airflow_vars['dags']['lake_to_staging']['poke_timeout'],
                dag=dag,
                pusher_task_id=pusher_task_id)

            set_checkpoint_task = SetCheckpointOperator(
                task_id=f'set_checkpoint_{table}', env=env, table=table)

            start_task.set_downstream(get_checkpoint_task)
            get_checkpoint_task.set_downstream(continue_if_data_task)
            continue_if_data_task.set_downstream(parse_query_task)
            parse_query_task.set_downstream(dataflow_task)
            dataflow_task.set_downstream(monitor_dataflow_task)
            monitor_dataflow_task.set_downstream(set_checkpoint_task)
            set_checkpoint_task.set_downstream(finish_task)

        start_task >> finish_task
    return dag
Example #8
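A test that mocks the Dataflow `templates.launch` call to return HTTP 409 (job already running) and asserts that the schedule task still succeeds while the downstream tasks are skipped.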
def test_already_running_then_skip(env, setup_teardown, airflow_session):
    def datafile(filename):
        return os.path.join('/workspace/airflow/dags/libs/shared/data',
                            filename)

    # Save these snippets for later in case we need to mock a success. - Stu M. 4/29/19
    # http = HttpMock(datafile('dataflow.json'), {'status': '200'})
    # requestBuilder = RequestMockBuilder(
    #     {'dataflow.projects.templates.launch': (None, '{"job": ""}')}
    # )
    # with pytest.raises(HttpError) as e:
    #         job = task.execute(ti.get_template_context())
    #         assert e.resp.status == 409

    http = HttpMock(datafile('dataflow.json'), {'status': '200'})
    errorResponse = httplib2.Response({
        'status': '409',
        'reason': 'Server Error'
    })
    requestBuilder = RequestMockBuilder(
        {'dataflow.projects.templates.launch': (errorResponse, b'')})

    dag = DAG('shortcircuit_operator_test_with_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    task = ScheduleDataflowJobOperator(
        project=env['project'],
        template_name='load_vibe_to_lake',
        job_name='schedule-dataflow-test-{}'.format(int(time.time())),
        job_parameters={
            'client': 'bluesun',
            'table': 'pyr_bluesun_local.tree_user_types',
            'dest': '{}:lake.tree_user_types'.format(env['project'])
        },
        dag=dag,
        task_id='schedule_dataflow_operation',
        http=http,
        requestBuilder=requestBuilder)

    middle_task = DummyOperator(task_id='middle_task', dag=dag)

    finish_task = DummyOperator(task_id='finish', dag=dag)

    task >> middle_task >> finish_task

    dag.clear()

    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    with airflow_session() as session:
        tis = session.query(TaskInstance).filter(
            TaskInstance.dag_id == dag.dag_id,
            TaskInstance.execution_date == DEFAULT_DATE)
        for ti in tis:
            if ti.task_id == 'schedule_dataflow_operation':
                assert ti.state == State.SUCCESS
            elif ti.task_id == 'middle_task':
                assert ti.state == State.SKIPPED
            elif ti.task_id == 'finish':
                assert ti.state == State.SKIPPED
Example #9
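An hourly BigQuery-to-wrench export: per table it reads a checkpoint, branches on new data, clears the table's GCS prefix, offloads the parsed query's results to GCS via an 'offload_bq_to_cs' Dataflow job, monitors it, copies the files from GCS to S3, and commits the checkpoint.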
def create_dag():
    dag = DAG(DAG_ID,
              catchup=False,
              default_args=default_args,
              schedule_interval='@hourly')
    with dag:
        start_task = DummyOperator(task_id='start')

        finish_task = DummyOperator(task_id='finish')

        for table in get_airflow_vars()['dags'][DAG_ID]['tables']:
            table = table['name']
            parsed_table = gcloud.parse_table_name(table)

            get_checkpoint_task = GetCheckpointOperator(
                task_id='get_checkpoint_{}'.format(table),
                env=env,
                target=table,
                sources=[table])

            continue_if_data_task = BranchPythonOperator(
                task_id='continue_if_data_{}'.format(table),
                python_callable=continue_if_data,
                op_args=[table],
                trigger_rule='all_done',
                provide_context=True)

            clear_gcs_bucket_by_table_task = PythonOperator(
                task_id='clear_gcs_bucket_{}'.format(table),
                python_callable=clear_gcs_bucket_by_table,
                op_args=[env, table])

            parse_query_task = PythonOperator(task_id=f'parse_query_{table}',
                                              python_callable=parse_query,
                                              op_args=[table],
                                              provide_context=True)

            dataflow_task = ScheduleDataflowJobOperator(
                task_id=f'schedule_dataflow_{table}',
                project=gcloud.project(env),
                template_name='offload_bq_to_cs',
                job_name=f'bq-to-wrench-{parsed_table}',
                job_parameters={
                    'destination':
                    'gs://{}/{}/{}'.format(gcs_bucket, table,
                                           f'bq-to-wrench-{parsed_table}')
                },
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)

            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_dataflow_{table}',
                pusher_task_id=f'schedule_dataflow_{table}',
                poke_interval=get_airflow_vars()['dags'][DAG_ID]['poke_interval'],
                timeout=get_airflow_vars()['dags'][DAG_ID]['poke_timeout'],
                dag=dag)

            gcs_to_wrench_s3_task = PythonOperator(
                task_id='gcs_to_wrench_s3_{}'.format(table),
                python_callable=gcs_to_wrench_s3,
                op_args=[env, table])

            commit_checkpoint_task = SetCheckpointOperator(
                task_id='commit_checkpoint_{}'.format(table),
                env=env,
                table=table)

            (start_task >> get_checkpoint_task >> continue_if_data_task >>
             clear_gcs_bucket_by_table_task >> parse_query_task >>
             dataflow_task >> monitor_dataflow_task >> gcs_to_wrench_s3_task >>
             commit_checkpoint_task >> finish_task)
    return dag
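Every example above repeats the same core pairing: a ScheduleDataflowJobOperator launches a templated Dataflow job and pushes its handle over XCom, and a DataflowJobStateSensor pointed at that task through `pusher_task_id` polls until the job finishes. The sketch below distils that pairing into a standalone DAG; the import paths and the `my-project` id are placeholders, since both operators are project-specific rather than part of stock Airflow, and the constructor arguments are assumed to match the calls shown in the examples.

from datetime import datetime

from airflow import DAG

# Placeholder import paths; adjust to wherever the custom operator and
# sensor live in your project.
from dags.libs.operators import ScheduleDataflowJobOperator
from dags.libs.sensors import DataflowJobStateSensor


def create_minimal_dag():
    dag = DAG('dataflow_schedule_and_monitor',
              start_date=datetime(2021, 1, 1),
              schedule_interval=None)
    with dag:
        schedule_task = ScheduleDataflowJobOperator(
            task_id='schedule_dataflow_job',
            project='my-project',               # placeholder GCP project id
            template_name='load_vibe_to_lake',
            job_name='vibe-to-lake-example',
            job_parameters={},                  # template-specific parameters
            provide_context=True)
        monitor_task = DataflowJobStateSensor(
            task_id='monitor_dataflow_job',
            pusher_task_id='schedule_dataflow_job',  # task that pushed the job handle
            poke_interval=60,
            timeout=3600)
        schedule_task >> monitor_task
    return dag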