Beispiel #1
0
    def test_should_run_jobs(self):
        """Workflow.run should invoke run(runtime=...) on every job in the definition."""
        # given
        jobs = [mock.Mock() for _ in range(100)]
        workflow = Workflow(workflow_id='test_workflow', definition=jobs)

        # when
        workflow.run('2019-01-01')

        # then
        expected_call = mock.call.run(runtime='2019-01-01')
        for job in jobs:
            job.assert_has_calls([expected_call])
Beispiel #2
0
    def test_should_run_single_job(self):
        """run_job should execute only the job whose id matches, leaving the rest untouched."""
        # given
        target_job = mock.Mock()
        target_job.id = 'first job'

        other_jobs = [mock.Mock() for _ in range(100)]
        workflow = Workflow(
            workflow_id='test_workflow',
            definition=[target_job, *other_jobs])

        # when
        workflow.run_job('first job', '2020-01-01')

        # then
        for untouched in other_jobs:
            untouched.assert_not_called()
        # and
        target_job.assert_has_calls([mock.call.run('2020-01-01')])
Beispiel #3
0
    def test_should_have_id_and_schedule_interval(self):
        """A workflow should expose the schedule_interval it was constructed with."""
        # given
        workflow = Workflow(
            workflow_id='test_workflow',
            definition=[],
            schedule_interval='@hourly',
        )

        # expected
        self.assertEqual(workflow.schedule_interval, '@hourly')
Beispiel #4
0
    def test_should_run_jobs(self):
        """Workflow.run should execute every job with a fully populated JobContext."""
        # given
        jobs = [mock.Mock() for _ in range(100)]
        workflow = Workflow(workflow_id='test_workflow', definition=jobs)

        # when
        workflow.run(datetime.datetime(2019, 1, 1))

        # then
        # mock.call comparison is equality-based, so one expected context
        # suffices for all jobs.
        expected_context = JobContext.make(
            runtime=datetime.datetime(2019, 1, 1),
            runtime_str='2019-01-01 00:00:00',
            workflow=workflow,
        )
        for job in jobs:
            job.assert_has_calls([mock.call.execute(expected_context)])
Beispiel #5
0
    def test_should_run_single_classbased_job_oldapi(self):
        """run_job should invoke the old-style run(runtime) API on a class-based job."""

        # given
        class RecordingJob:
            id = 'first job'
            runtime = None

            def run(self, runtime):
                # Guard: the job must not be executed more than once.
                assert self.runtime is None
                self.runtime = runtime

        job = RecordingJob()
        workflow = Workflow(
            workflow_id='test_workflow',
            definition=[job])

        # when
        workflow.run_job('first job', "2020-01-01")

        # then
        self.assertEqual(job.runtime, "2020-01-01")
Beispiel #6
0
    def test_should_run_single_classbased_job(self):
        """run_job should call execute(JobContext) on a bigflow.Job subclass."""
        # given
        class RecordingJob(bigflow.Job):
            id = 'first job'
            context = None

            def execute(self, context: JobContext):
                # Guard: the job must be executed exactly once.
                assert self.context is None
                self.context = context

        job = RecordingJob()
        workflow = Workflow(
            workflow_id='test_workflow',
            definition=[job])

        # when
        workflow.run_job('first job', datetime.datetime(2020, 1, 1))

        # then
        captured: JobContext = job.context
        self.assertIsNotNone(captured)
        self.assertEqual(captured.runtime, datetime.datetime(2020, 1, 1))
        self.assertIs(captured.workflow, workflow)
Beispiel #7
0
    def test_should_run_single_job_with_context(self):
        """run_job should pass a JobContext carrying runtime and workflow to execute()."""
        # given
        target_job = mock.Mock(spec_set=['execute', 'run', 'id'])
        target_job.id = 'first job'
        target_job.execute = mock.Mock()

        other_jobs = [mock.Mock() for _ in range(100)]
        workflow = Workflow(
            workflow_id='test_workflow',
            definition=[target_job, *other_jobs])

        # when
        workflow.run_job('first job', '2020-01-01')

        # then
        for untouched in other_jobs:
            untouched.assert_not_called()

        target_job.execute.assert_called_once()
        (ctx, ) = target_job.execute.call_args[0]

        self.assertIs(ctx.workflow, workflow)
        self.assertEqual(ctx.runtime_str, "2020-01-01")
        self.assertEqual(ctx.runtime, datetime.datetime(2020, 1, 1))
Beispiel #8
0
    def test_should_generate_DAG_file_from_workflow_with_hourly_scheduling(
            self, get_timezone_offset_seconds_mock):
        """Generating a DAG file for an '@hourly' workflow should emit one
        KubernetesPodOperator per job, upstream links matching the job graph,
        per-job retry settings, and a start_date shifted by the stubbed
        timezone offset.

        NOTE(review): get_timezone_offset_seconds_mock is presumably injected
        by a mock.patch decorator outside this block — confirm.
        """
        # given
        workdir = os.path.dirname(__file__)
        # Stub a UTC+2 offset: the expected start_date below is
        # 2020-07-02 08:00 for the '2020-07-02 10:00:00' build runtime.
        get_timezone_offset_seconds_mock.return_value = 2 * 3600
        docker_repository = 'eu.gcr.io/my_docker_repository_project/my-project'

        # given
        # Three jobs with differing retry policies; the graph makes job2 and
        # job3 downstream of job1, and job3 also downstream of job2.
        job1 = Job(id='job1',
                   component=mock.Mock(),
                   retry_count=10,
                   retry_pause_sec=20)
        job2 = Job(id='job2',
                   component=mock.Mock(),
                   retry_count=100,
                   retry_pause_sec=200)
        job3 = Job(id='job3',
                   component=mock.Mock(),
                   retry_count=100,
                   retry_pause_sec=200)
        w_job1 = WorkflowJob(job1, 1)
        w_job2 = WorkflowJob(job2, 2)
        w_job3 = WorkflowJob(job3, 3)
        graph = {w_job1: (w_job2, w_job3), w_job2: (w_job3, )}
        workflow = Workflow(workflow_id='my_workflow',
                            definition=Definition(graph),
                            start_time_factory=hourly_start_time,
                            schedule_interval='@hourly')

        # when
        dag_file_path = generate_dag_file(workdir, docker_repository, workflow,
                                          '2020-07-02 10:00:00', '0.3.0', 'ca')

        # then
        # The file name encodes workflow id, version and build time.
        self.assertEqual(
            dag_file_path,
            workdir + '/.dags/my_workflow__v0_3_0__2020_07_02_10_00_00_dag.py')

        dag_file_content = Path(dag_file_path).read_text()
        expected_dag_content = '''
import datetime
from airflow import DAG
from airflow.contrib.operators import kubernetes_pod_operator

default_args = {
            'owner': 'airflow',
            'depends_on_past': True,
            'start_date': datetime.datetime(2020, 7, 2, 8, 0),
            'email_on_failure': False,
            'email_on_retry': False,
            'execution_timeout': datetime.timedelta(seconds=10800),
}

dag = DAG(
    'my_workflow__v0_3_0__2020_07_02_10_00_00',
    default_args=default_args,
    max_active_runs=1,
    schedule_interval='@hourly'
)


tjob1 = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='job1',
    name='job1',
    cmds=['bf'],
    arguments=['run', '--job', 'my_workflow.job1', '--runtime', '{{ execution_date.strftime("%Y-%m-%d %H:%M:%S") }}', '--project-package', 'ca', '--config', '{{var.value.env}}'],
    namespace='default',
    image='eu.gcr.io/my_docker_repository_project/my-project:0.3.0',
    is_delete_operator_pod=True,
    retries=10,
    retry_delay=datetime.timedelta(seconds=20),
    dag=dag,
    execution_timeout=datetime.timedelta(seconds=10800))


tjob2 = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='job2',
    name='job2',
    cmds=['bf'],
    arguments=['run', '--job', 'my_workflow.job2', '--runtime', '{{ execution_date.strftime("%Y-%m-%d %H:%M:%S") }}', '--project-package', 'ca', '--config', '{{var.value.env}}'],
    namespace='default',
    image='eu.gcr.io/my_docker_repository_project/my-project:0.3.0',
    is_delete_operator_pod=True,
    retries=100,
    retry_delay=datetime.timedelta(seconds=200),
    dag=dag,
    execution_timeout=datetime.timedelta(seconds=10800))

tjob2.set_upstream(tjob1)

tjob3 = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='job3',
    name='job3',
    cmds=['bf'],
    arguments=['run', '--job', 'my_workflow.job3', '--runtime', '{{ execution_date.strftime("%Y-%m-%d %H:%M:%S") }}', '--project-package', 'ca', '--config', '{{var.value.env}}'],
    namespace='default',
    image='eu.gcr.io/my_docker_repository_project/my-project:0.3.0',
    is_delete_operator_pod=True,
    retries=100,
    retry_delay=datetime.timedelta(seconds=200),
    dag=dag,
    execution_timeout=datetime.timedelta(seconds=10800))

tjob3.set_upstream(tjob2)
tjob3.set_upstream(tjob1)
'''

        self.assert_files_are_equal(expected_dag_content, dag_file_content)
from bigflow.workflow import Workflow


class DailyJob:
    """Example job that logs the full-day time window it would process."""

    def __init__(self):
        # Public identifier used by Workflow.run_job / DAG generation.
        self.id = 'daily_job'

    def run(self, runtime):
        # runtime is a date string; the processed window spans the whole day.
        print('I should process data with timestamps from: '
              f'{runtime} 00:00 to {runtime} 23:59')


# A workflow scheduled daily that wraps the single DailyJob.
daily_workflow = Workflow(
    workflow_id='daily_workflow',
    schedule_interval='@daily',
    definition=[DailyJob()])

if __name__ == '__main__':
    # Run locally with an explicit runtime date string.
    daily_workflow.run('2020-01-01')
from bigflow.workflow import Workflow


class SimpleJob:
    """Minimal example job: prints a fixed message when run."""

    def __init__(self):
        # Public identifier used by Workflow.run_job.
        self.id = 'simple_job'

    def run(self, runtime):
        # runtime is unused; plain string literal — the original used an
        # f-string with no placeholders (ruff F541), output is unchanged.
        print('Running a simple job')


# Minimal workflow wiring for the job above; no schedule interval given.
simple_workflow = Workflow(workflow_id='simple_workflow',
                           definition=[SimpleJob()])
Beispiel #11
0
from bigflow.workflow import Workflow
from bigflow.workflow import Definition
from .sequential_workflow import Job

# Four jobs reusing the Job implementation from the sequential example.
job1, job2, job3, job4 = Job('1'), Job('2'), Job('3'), Job('4')

# Diamond-shaped dependency graph: job1 fans out to job2 and job3,
# which both feed job4.
graph_workflow = Workflow(
    workflow_id='graph_workflow',
    definition=Definition({
        job1: [job2, job3],
        job2: [job4],
        job3: [job4],
    }),
)

if __name__ == '__main__':
    # Run the whole graph with the default runtime.
    graph_workflow.run()
Beispiel #12
0
from bigflow.workflow import Workflow
from .sequential_workflow import Job

# A minimal workflow with a single job, reusing Job from the sequential example.
simple_workflow = Workflow(workflow_id='simple_workflow',
                           definition=[Job('1')])

if __name__ == '__main__':
    # Exercise both entry points: run a single job by id and run the whole
    # workflow — first with the default runtime, then with an explicit one.
    simple_workflow.run_job('1')
    simple_workflow.run()
    simple_workflow.run_job('1', '1970-01-01')
    simple_workflow.run('1970-01-01')
Beispiel #13
0
from bigflow.workflow import Workflow


class DailyJob:
    """Example job that logs the full-day window it would process."""

    def __init__(self):
        # Identifier the workflow uses to address this job.
        self.id = 'daily_job'

    def run(self, runtime):
        # runtime arrives as a date string ('@daily' interval, string mode).
        message = f'I should process data with timestamps from: {runtime} 00:00 to {runtime} 23:59'
        print(message)


# '@daily' workflow; runtime_as_datetime=False keeps runtime as a string.
# NOTE(review): flag semantics inferred from its name — confirm in Workflow docs.
daily_workflow = Workflow(workflow_id='daily_workflow',
                          schedule_interval='@daily',
                          runtime_as_datetime=False,
                          definition=[DailyJob()])

if __name__ == '__main__':
    daily_workflow.run('2020-01-01')
Beispiel #14
0
from bigflow.workflow import Workflow
from datetime import datetime
from datetime import timedelta


class HourlyJob:
    """Example job that logs the one-hour window it would process."""

    def __init__(self):
        # Identifier the workflow uses to address this job.
        self.id = 'hourly_job'

    def run(self, runtime):
        # runtime is a 'YYYY-MM-DD HH:MM:SS' string; the processed window
        # covers the full hour starting at runtime.
        window_start = datetime.strptime(runtime, "%Y-%m-%d %H:%M:%S")
        window_end = window_start + timedelta(minutes=59, seconds=59)
        print(f'I should process data with timestamps from: {runtime} to {window_end}')


# '@hourly' workflow with runtime_as_datetime=True.
# NOTE(review): HourlyJob.run parses runtime with strptime, which requires a
# string — a datetime runtime would break it; verify the flag is intended here.
hourly_workflow = Workflow(workflow_id='hourly_workflow',
                           runtime_as_datetime=True,
                           schedule_interval='@hourly',
                           definition=[HourlyJob()])

if __name__ == '__main__':
    hourly_workflow.run('2020-01-01 00:00:00')
from bigflow.workflow import Workflow


class Job(object):
    """Example job identified by *id* that logs each invocation."""

    def __init__(self, id):
        # id is the job's public identifier, used by Workflow.run_job.
        self.id = id

    def run(self, runtime):
        # Announce the execution; runtime is supplied by the workflow.
        message = f'Running job {self.id} at {runtime}'
        print(message)


# Workflow with two jobs given as a plain list.
example_workflow = Workflow(workflow_id='example_workflow',
                            definition=[Job('1'), Job('2')])

if __name__ == '__main__':
    # Run with the default runtime.
    example_workflow.run()
Beispiel #16
0
from bigflow.workflow import Workflow


class HelloWorldJob:
    """Example job that greets the world with the supplied runtime."""

    def __init__(self):
        # Identifier the workflow uses to address this job.
        self.id = 'hello_world'

    def run(self, runtime):
        # Print a greeting that includes the runtime.
        greeting = f'Hello world on {runtime}!'
        print(greeting)


class SayGoodbyeJob:
    """Example job that prints a farewell message."""

    def __init__(self):
        # Identifier the workflow uses to address this job.
        self.id = 'say_goodbye'

    def run(self, runtime):
        # runtime is unused; plain string literal — the original used an
        # f-string with no placeholders (ruff F541), output is unchanged.
        print('Goodbye!')


# A two-step workflow: greet first, then say goodbye.
hello_world_workflow = Workflow(workflow_id='hello_world_workflow',
                                definition=[HelloWorldJob(),
                                            SayGoodbyeJob()])
Beispiel #17
0
from pathlib import Path
from bigflow.resources import get_resource_absolute_path
from bigflow.workflow import Workflow


class PrintResourceJob:
    """Example job that prints the contents of a bundled resource file."""

    def __init__(self):
        # Identifier the workflow uses to address this job.
        self.id = 'print_resource_job'

    def run(self, runtime):
        # Resolve the resource relative to this module's location, then
        # dump its contents to stdout.
        resource_path = get_resource_absolute_path('example_resource.txt',
                                                   Path(__file__))
        with open(resource_path) as resource_file:
            print(resource_file.read())


# Workflow with a single job that prints a packaged resource file.
resources_workflow = Workflow(workflow_id='resources_workflow',
                              definition=[PrintResourceJob()])
Beispiel #18
0
from bigflow.workflow import Workflow
from bigflow.workflow import Definition
from .sequential_workflow import Job

job1, job2, job3, job4 = Job('1'), Job('2'), Job('3'), Job('4')

graph_workflow = Workflow(workflow_id='graph_workflow',
                          definition=Definition({
                              job1: (job2, job3),
                              job2: (job4, ),
                              job3: (job4, )
                          }))

if __name__ == '__main__':
    graph_workflow.run()
Beispiel #19
0
    def test_should_generate_DAG_file_from_workflow_with_daily_scheduling(
            self):
        """Generating a DAG file for a single-job '@daily' workflow should
        emit one KubernetesPodOperator and a start_date one day before the
        supplied runtime."""
        # given
        workdir = os.path.dirname(__file__)
        docker_repository = 'eu.gcr.io/my_docker_repository_project/my-project'

        # given
        # A single job with custom retry settings and no downstream jobs.
        job1 = Job(id='job1',
                   component=mock.Mock(),
                   retry_count=10,
                   retry_pause_sec=20)
        w_job1 = WorkflowJob(job1, 1)
        graph = {w_job1: ()}
        workflow = Workflow(workflow_id='my_daily_workflow',
                            definition=Definition(graph),
                            schedule_interval='@daily')

        # when
        dag_file_path = generate_dag_file(workdir, docker_repository, workflow,
                                          '2020-07-01', '0.3.0', 'ca')

        # then
        # The file name encodes workflow id, version and the (zero-padded)
        # build time derived from the runtime date.
        self.assertEqual(
            dag_file_path, workdir +
            '/.dags/my_daily_workflow__v0_3_0__2020_07_01_00_00_00_dag.py')

        dag_file_content = Path(dag_file_path).read_text()
        expected_dag_content = '''    
from airflow import DAG
from datetime import timedelta
from datetime import datetime
from airflow.contrib.operators import kubernetes_pod_operator

default_args = {
            'owner': 'airflow',
            'depends_on_past': True,
            'start_date': datetime.strptime("2020-07-01", "%Y-%m-%d") - (timedelta(hours=24)),
            'email_on_failure': False,
            'email_on_retry': False,
            'execution_timeout': timedelta(minutes=90)
}

dag = DAG(
    'my_daily_workflow__v0_3_0__2020_07_01_00_00_00',
    default_args=default_args,
    max_active_runs=1,
    schedule_interval='@daily'
)


tjob1 = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='job1',
    name='job1',
    cmds=['bf'],
    arguments=['run', '--job', 'my_daily_workflow.job1', '--runtime', '{{ execution_date.strftime("%Y-%m-%d %H:%M:%S") }}', '--project-package', 'ca', '--config', '{{var.value.env}}'],
    namespace='default',
    image='eu.gcr.io/my_docker_repository_project/my-project:0.3.0',
    is_delete_operator_pod=True,
    retries=10,
    retry_delay= timedelta(seconds=20),
    dag=dag)            

'''
        self.assert_files_are_equal(expected_dag_content, dag_file_content)
from bigflow.workflow import Workflow
from .sequential_workflow import Job

# Same as the basic example, but with runtime_as_datetime=True.
# NOTE(review): flag presumably makes the workflow pass runtime to jobs as a
# datetime instead of a string — inferred from its name, confirm in docs.
simple_workflow = Workflow(workflow_id='simple_workflow',
                           runtime_as_datetime=True,
                           definition=[Job('1')])

if __name__ == '__main__':
    # Exercise both entry points with default and explicit runtimes.
    simple_workflow.run_job('1')
    simple_workflow.run()
    simple_workflow.run_job('1', '1970-01-01')
    simple_workflow.run('1970-01-01')
Beispiel #21
0
from bigflow import Config
from bigflow.workflow import Workflow


# Two named configurations ('dev' and 'prod') sharing the same property key;
# which one is active is decided at resolve time.
# NOTE(review): selection mechanism not visible here — confirm in bigflow.Config docs.
config = Config(name='dev',
                properties={
                        'message_to_print': 'Message to print on DEV'
                }).add_configuration(
                name='prod',
                properties={
                       'message_to_print': 'Message to print on PROD'
                })


class HelloConfigJob:
    """Example job that prints a message resolved from configuration."""

    def __init__(self, message_to_print):
        # Message is injected at construction time (resolved from Config).
        self.message_to_print = message_to_print
        # Identifier the workflow uses to address this job.
        self.id = 'hello_config_job'

    def run(self, runtime):
        # runtime is ignored; the job simply echoes the configured message.
        print(self.message_to_print)


# The resolved message is injected into the job at module import time.
hello_world_workflow = Workflow(
    workflow_id='hello_config_workflow',
    definition=[HelloConfigJob(config.resolve_property('message_to_print'))])
Beispiel #22
0
    def test_should_run_jobs_in_order_accordingly_to_graph_schema(self):
        """build_sequential_order should linearize the job graph so that every
        job appears after all of its upstream dependencies (two graph shapes
        are checked, each drawn in the ASCII diagrams below)."""
        # given
        original_job = mock.Mock()
        job1, job2, job3, job4, job5, job6, job7, job8, job9 = [
            WorkflowJob(original_job, i) for i in range(9)
        ]
        job_graph = OrderedDict([(job1, (job5, job6)), (job2, (job6, )),
                                 (job3, (job6, )), (job4, (job7, )),
                                 (job6, (job8, )), (job7, (job8, )),
                                 (job5, (job9, ))])

        #  job1     job2  job3  job4
        #    |  \    |    /      |
        #    |   \   |   /       |
        #    |    \  |  /        |
        #    |     \ | /         |
        #  job5    job6        job7
        #    |        \         /
        #    |         \       /
        #    |          \     /
        #    |           \   /
        #    |            \ /
        #   job9         job8

        definition = Definition(job_graph)
        workflow = Workflow(workflow_id='test_workflow',
                            definition=definition,
                            schedule_interval='@hourly',
                            runtime_as_datetime=True)

        # expected
        self.assertEqual(
            list(workflow.build_sequential_order()),
            [job1, job5, job9, job2, job3, job6, job4, job7, job8])

        # given
        # NOTE(review): job6 appears twice as a key below; OrderedDict keeps
        # only the last value, so job6 maps to (job9, ) and the job6 -> job8
        # edge from the earlier entry is dropped, even though the diagram
        # shows both edges — confirm this is intended.
        job_graph = OrderedDict([(job1, (job5, job6, job7)), (job2, (job6, )),
                                 (job3, (job6, )), (job4, (job7, )),
                                 (job6, (job8, )), (job7, (job8, )),
                                 (job5, (job9, )), (job6, (job9, ))])

        #  job1     job2  job3  job4
        #    |  \    |    /      |
        #    |   \   |   /       |
        #    |    \  |  /        |
        #    |     \ | /         |
        #  job5    job6        job7
        #    |      / \         /
        #    |     /   \       /
        #    |    /     \     /
        #    |   /       \   /
        #    |  /         \ /
        #   job9         job8

        definition = Definition(job_graph)
        workflow = Workflow(workflow_id='test_workflow',
                            definition=definition,
                            schedule_interval='@hourly',
                            runtime_as_datetime=True)

        # expected
        self.assertEqual(
            workflow.build_sequential_order(),
            [job1, job5, job2, job3, job6, job9, job4, job7, job8])
Beispiel #23
0
from bigflow.workflow import Workflow, hourly_start_time

from datetime import datetime
from datetime import timedelta


class HourlyJob:
    """Example job that logs the one-hour window it would process."""

    def __init__(self):
        # Identifier the workflow uses to address this job.
        self.id = 'hourly_job'

    def run(self, runtime):
        # runtime is a 'YYYY-MM-DD HH:MM:SS' string; the window ends
        # 59 minutes and 59 seconds after it starts.
        start = datetime.strptime(runtime, "%Y-%m-%d %H:%M:%S")
        end = start + timedelta(minutes=59, seconds=59)
        print(f'I should process data with timestamps from: {runtime} to {end}')


# '@hourly' workflow using hourly_start_time as its start-time factory.
# NOTE(review): exact semantics of start_time_factory are not visible here —
# confirm against the bigflow.workflow implementation.
hourly_workflow = Workflow(workflow_id='hourly_workflow',
                           schedule_interval='@hourly',
                           start_time_factory=hourly_start_time,
                           definition=[HourlyJob()])

if __name__ == '__main__':
    hourly_workflow.run('2020-01-01 00:00:00')