def test_should_run_jobs(self):
    # given
    definition = [mock.Mock() for i in range(100)]
    workflow = Workflow(workflow_id='test_workflow', definition=definition)

    # when
    workflow.run('2019-01-01')

    # then
    for step in definition:
        step.assert_has_calls([mock.call.run(runtime='2019-01-01')])
def test_should_run_single_job(self):
    # given
    first_job = mock.Mock()
    setattr(first_job, 'id', 'first job')
    definition = [first_job] + [mock.Mock() for i in range(100)]
    workflow = Workflow(workflow_id='test_workflow', definition=definition)

    # when
    workflow.run_job('first job', '2020-01-01')

    # then
    for step in definition[1:]:
        step.assert_not_called()

    # and
    first_job.assert_has_calls([mock.call.run('2020-01-01')])
def test_should_have_id_and_schedule_interval(self):
    # given
    workflow = Workflow(
        workflow_id='test_workflow',
        definition=[],
        schedule_interval='@hourly')

    # expected
    self.assertEqual(workflow.schedule_interval, '@hourly')
def test_should_run_jobs(self):
    # given
    definition = [mock.Mock() for i in range(100)]
    workflow = Workflow(workflow_id='test_workflow', definition=definition)

    # when
    workflow.run(datetime.datetime(2019, 1, 1))

    # then
    for step in definition:
        step.assert_has_calls([
            mock.call.execute(
                JobContext.make(
                    runtime=datetime.datetime(2019, 1, 1),
                    runtime_str='2019-01-01 00:00:00',
                    workflow=workflow,
                )),
        ])
def test_should_run_single_classbased_job_oldapi(self):
    # given
    class FirstJob:
        id = 'first job'
        runtime = None

        def run(self, runtime):
            assert self.runtime is None
            self.runtime = runtime

    first_job = FirstJob()
    workflow = Workflow(workflow_id='test_workflow', definition=[first_job])

    # when
    workflow.run_job('first job', "2020-01-01")

    # then
    self.assertEqual(first_job.runtime, "2020-01-01")
def test_should_run_single_classbased_job(self):
    # given
    class FirstJob(bigflow.Job):
        id = 'first job'
        context = None

        def execute(self, context: JobContext):
            assert self.context is None
            self.context = context

    first_job = FirstJob()
    workflow = Workflow(workflow_id='test_workflow', definition=[first_job])

    # when
    workflow.run_job('first job', datetime.datetime(2020, 1, 1))

    # then
    context: JobContext = first_job.context
    self.assertIsNotNone(context)
    self.assertEqual(context.runtime, datetime.datetime(2020, 1, 1))
    self.assertIs(context.workflow, workflow)
def test_should_run_single_job_with_context(self):
    # given
    first_job = mock.Mock(spec_set=['execute', 'run', 'id'])
    first_job.id = 'first job'
    first_job.execute = mock.Mock()
    definition = [first_job] + [mock.Mock() for i in range(100)]
    workflow = Workflow(workflow_id='test_workflow', definition=definition)

    # when
    workflow.run_job('first job', '2020-01-01')

    # then
    for step in definition[1:]:
        step.assert_not_called()

    first_job.execute.assert_called_once()
    ((ctx,), _kwargs) = first_job.execute.call_args
    self.assertIs(ctx.workflow, workflow)
    self.assertEqual(ctx.runtime_str, "2020-01-01")
    self.assertEqual(ctx.runtime, datetime.datetime(2020, 1, 1))
def test_should_generate_DAG_file_from_workflow_with_hourly_scheduling(
        self, get_timezone_offset_seconds_mock):
    # note: get_timezone_offset_seconds_mock is injected by a mock.patch
    # decorator applied to this test in the original test class (not shown
    # in this excerpt)
    # given
    workdir = os.path.dirname(__file__)
    get_timezone_offset_seconds_mock.return_value = 2 * 3600
    docker_repository = 'eu.gcr.io/my_docker_repository_project/my-project'

    # given
    job1 = Job(id='job1', component=mock.Mock(), retry_count=10, retry_pause_sec=20)
    job2 = Job(id='job2', component=mock.Mock(), retry_count=100, retry_pause_sec=200)
    job3 = Job(id='job3', component=mock.Mock(), retry_count=100, retry_pause_sec=200)
    w_job1 = WorkflowJob(job1, 1)
    w_job2 = WorkflowJob(job2, 2)
    w_job3 = WorkflowJob(job3, 3)
    graph = {w_job1: (w_job2, w_job3), w_job2: (w_job3,)}
    workflow = Workflow(
        workflow_id='my_workflow',
        definition=Definition(graph),
        start_time_factory=hourly_start_time,
        schedule_interval='@hourly')

    # when
    dag_file_path = generate_dag_file(
        workdir, docker_repository, workflow, '2020-07-02 10:00:00', '0.3.0', 'ca')

    # then
    self.assertEqual(
        dag_file_path,
        workdir + '/.dags/my_workflow__v0_3_0__2020_07_02_10_00_00_dag.py')

    dag_file_content = Path(dag_file_path).read_text()
    expected_dag_content = '''
import datetime
from airflow import DAG
from airflow.contrib.operators import kubernetes_pod_operator

default_args = {
    'owner': 'airflow',
    'depends_on_past': True,
    'start_date': datetime.datetime(2020, 7, 2, 8, 0),
    'email_on_failure': False,
    'email_on_retry': False,
    'execution_timeout': datetime.timedelta(seconds=10800),
}

dag = DAG(
    'my_workflow__v0_3_0__2020_07_02_10_00_00',
    default_args=default_args,
    max_active_runs=1,
    schedule_interval='@hourly'
)

tjob1 = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='job1',
    name='job1',
    cmds=['bf'],
    arguments=['run', '--job', 'my_workflow.job1', '--runtime', '{{ execution_date.strftime("%Y-%m-%d %H:%M:%S") }}', '--project-package', 'ca', '--config', '{{var.value.env}}'],
    namespace='default',
    image='eu.gcr.io/my_docker_repository_project/my-project:0.3.0',
    is_delete_operator_pod=True,
    retries=10,
    retry_delay=datetime.timedelta(seconds=20),
    dag=dag,
    execution_timeout=datetime.timedelta(seconds=10800))

tjob2 = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='job2',
    name='job2',
    cmds=['bf'],
    arguments=['run', '--job', 'my_workflow.job2', '--runtime', '{{ execution_date.strftime("%Y-%m-%d %H:%M:%S") }}', '--project-package', 'ca', '--config', '{{var.value.env}}'],
    namespace='default',
    image='eu.gcr.io/my_docker_repository_project/my-project:0.3.0',
    is_delete_operator_pod=True,
    retries=100,
    retry_delay=datetime.timedelta(seconds=200),
    dag=dag,
    execution_timeout=datetime.timedelta(seconds=10800))

tjob2.set_upstream(tjob1)

tjob3 = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='job3',
    name='job3',
    cmds=['bf'],
    arguments=['run', '--job', 'my_workflow.job3', '--runtime', '{{ execution_date.strftime("%Y-%m-%d %H:%M:%S") }}', '--project-package', 'ca', '--config', '{{var.value.env}}'],
    namespace='default',
    image='eu.gcr.io/my_docker_repository_project/my-project:0.3.0',
    is_delete_operator_pod=True,
    retries=100,
    retry_delay=datetime.timedelta(seconds=200),
    dag=dag,
    execution_timeout=datetime.timedelta(seconds=10800))

tjob3.set_upstream(tjob2)

tjob3.set_upstream(tjob1)
'''
    self.assert_files_are_equal(expected_dag_content, dag_file_content)
from bigflow.workflow import Workflow


class DailyJob:
    def __init__(self):
        self.id = 'daily_job'

    def run(self, runtime):
        print(f'I should process data with timestamps from: {runtime} 00:00 to {runtime} 23:59')


daily_workflow = Workflow(
    workflow_id='daily_workflow',
    schedule_interval='@daily',
    definition=[DailyJob()])

if __name__ == '__main__':
    daily_workflow.run('2020-01-01')
from bigflow.workflow import Workflow


class SimpleJob:
    def __init__(self):
        self.id = 'simple_job'

    def run(self, runtime):
        print('Running a simple job')


simple_workflow = Workflow(workflow_id='simple_workflow', definition=[SimpleJob()])
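Unlike the other examples, this workflow is only defined, never invoked. A minimal usage sketch, reusing the no-argument run() call that appears in the simple_workflow example below (the runtime presumably defaults to the current time when omitted):

if __name__ == '__main__':
    simple_workflow.run()  # no explicit runtime, as in the other simple_workflow example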
from bigflow.workflow import Workflow
from bigflow.workflow import Definition
from .sequential_workflow import Job

job1, job2, job3, job4 = Job('1'), Job('2'), Job('3'), Job('4')

graph_workflow = Workflow(
    workflow_id='graph_workflow',
    definition=Definition({
        job1: [job2, job3],
        job2: [job4],
        job3: [job4],
    }),
)

if __name__ == '__main__':
    graph_workflow.run()
from bigflow.workflow import Workflow
from .sequential_workflow import Job

simple_workflow = Workflow(workflow_id='simple_workflow', definition=[Job('1')])

if __name__ == '__main__':
    simple_workflow.run_job('1')
    simple_workflow.run()
    simple_workflow.run_job('1', '1970-01-01')
    simple_workflow.run('1970-01-01')
from bigflow.workflow import Workflow


class DailyJob:
    def __init__(self):
        self.id = 'daily_job'

    def run(self, runtime):
        print(f'I should process data with timestamps from: {runtime} 00:00 to {runtime} 23:59')


daily_workflow = Workflow(
    workflow_id='daily_workflow',
    schedule_interval='@daily',
    runtime_as_datetime=False,
    definition=[DailyJob()])

if __name__ == '__main__':
    daily_workflow.run('2020-01-01')
from bigflow.workflow import Workflow
from datetime import datetime
from datetime import timedelta


class HourlyJob:
    def __init__(self):
        self.id = 'hourly_job'

    def run(self, runtime):
        print(
            f'I should process data with timestamps from: {runtime} '
            f'to {datetime.strptime(runtime, "%Y-%m-%d %H:%M:%S") + timedelta(minutes=59, seconds=59)}'
        )


hourly_workflow = Workflow(
    workflow_id='hourly_workflow',
    runtime_as_datetime=True,
    schedule_interval='@hourly',
    definition=[HourlyJob()])

if __name__ == '__main__':
    hourly_workflow.run('2020-01-01 00:00:00')
from bigflow.workflow import Workflow


class Job:
    def __init__(self, id):
        self.id = id

    def run(self, runtime):
        print(f'Running job {self.id} at {runtime}')


example_workflow = Workflow(
    workflow_id='example_workflow',
    definition=[Job('1'), Job('2')])

if __name__ == '__main__':
    example_workflow.run()
from bigflow.workflow import Workflow


class HelloWorldJob:
    def __init__(self):
        self.id = 'hello_world'

    def run(self, runtime):
        print(f'Hello world on {runtime}!')


class SayGoodbyeJob:
    def __init__(self):
        self.id = 'say_goodbye'

    def run(self, runtime):
        print('Goodbye!')


hello_world_workflow = Workflow(
    workflow_id='hello_world_workflow',
    definition=[HelloWorldJob(), SayGoodbyeJob()])
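A short usage sketch for this two-job workflow, reusing the run and run_job calls shown in the other examples in this section (the runtime value here is purely illustrative):

if __name__ == '__main__':
    hello_world_workflow.run('2020-01-01')                     # runs both jobs in definition order
    hello_world_workflow.run_job('say_goodbye', '2020-01-01')  # runs a single job by its id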
from pathlib import Path

from bigflow.resources import get_resource_absolute_path
from bigflow.workflow import Workflow


class PrintResourceJob:
    def __init__(self):
        self.id = 'print_resource_job'

    def run(self, runtime):
        with open(get_resource_absolute_path('example_resource.txt', Path(__file__))) as f:
            print(f.read())


resources_workflow = Workflow(
    workflow_id='resources_workflow',
    definition=[PrintResourceJob()])
from bigflow.workflow import Workflow
from bigflow.workflow import Definition
from .sequential_workflow import Job

job1, job2, job3, job4 = Job('1'), Job('2'), Job('3'), Job('4')

graph_workflow = Workflow(
    workflow_id='graph_workflow',
    definition=Definition({
        job1: (job2, job3),
        job2: (job4,),
        job3: (job4,),
    }))

if __name__ == '__main__':
    graph_workflow.run()
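A Definition graph is linearized before execution. One way to inspect the resulting order is build_sequential_order(), the method exercised in the workflow tests in this section; this sketch assumes it is callable on any Workflow and yields WorkflowJob wrappers around the defined jobs:

if __name__ == '__main__':
    for workflow_job in graph_workflow.build_sequential_order():
        print(workflow_job)  # each entry wraps one of job1..job4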
def test_should_generate_DAG_file_from_workflow_with_daily_scheduling(self):
    # given
    workdir = os.path.dirname(__file__)
    docker_repository = 'eu.gcr.io/my_docker_repository_project/my-project'

    # given
    job1 = Job(id='job1', component=mock.Mock(), retry_count=10, retry_pause_sec=20)
    w_job1 = WorkflowJob(job1, 1)
    graph = {w_job1: ()}
    workflow = Workflow(
        workflow_id='my_daily_workflow',
        definition=Definition(graph),
        schedule_interval='@daily')

    # when
    dag_file_path = generate_dag_file(
        workdir, docker_repository, workflow, '2020-07-01', '0.3.0', 'ca')

    # then
    self.assertEqual(
        dag_file_path,
        workdir + '/.dags/my_daily_workflow__v0_3_0__2020_07_01_00_00_00_dag.py')

    dag_file_content = Path(dag_file_path).read_text()
    expected_dag_content = '''
from airflow import DAG
from datetime import timedelta
from datetime import datetime
from airflow.contrib.operators import kubernetes_pod_operator

default_args = {
    'owner': 'airflow',
    'depends_on_past': True,
    'start_date': datetime.strptime("2020-07-01", "%Y-%m-%d") - (timedelta(hours=24)),
    'email_on_failure': False,
    'email_on_retry': False,
    'execution_timeout': timedelta(minutes=90)
}

dag = DAG(
    'my_daily_workflow__v0_3_0__2020_07_01_00_00_00',
    default_args=default_args,
    max_active_runs=1,
    schedule_interval='@daily'
)

tjob1 = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='job1',
    name='job1',
    cmds=['bf'],
    arguments=['run', '--job', 'my_daily_workflow.job1', '--runtime', '{{ execution_date.strftime("%Y-%m-%d %H:%M:%S") }}', '--project-package', 'ca', '--config', '{{var.value.env}}'],
    namespace='default',
    image='eu.gcr.io/my_docker_repository_project/my-project:0.3.0',
    is_delete_operator_pod=True,
    retries=10,
    retry_delay=timedelta(seconds=20),
    dag=dag)
'''
    self.assert_files_are_equal(expected_dag_content, dag_file_content)
from bigflow.workflow import Workflow
from .sequential_workflow import Job

simple_workflow = Workflow(
    workflow_id='simple_workflow',
    runtime_as_datetime=True,
    definition=[Job('1')])

if __name__ == '__main__':
    simple_workflow.run_job('1')
    simple_workflow.run()
    simple_workflow.run_job('1', '1970-01-01')
    simple_workflow.run('1970-01-01')
from bigflow import Config
from bigflow.workflow import Workflow

config = Config(
    name='dev',
    properties={
        'message_to_print': 'Message to print on DEV'
    },
).add_configuration(
    name='prod',
    properties={
        'message_to_print': 'Message to print on PROD'
    },
)


class HelloConfigJob:
    def __init__(self, message_to_print):
        self.id = 'hello_config_job'
        self.message_to_print = message_to_print

    def run(self, runtime):
        print(self.message_to_print)


hello_world_workflow = Workflow(
    workflow_id='hello_config_workflow',
    definition=[HelloConfigJob(config.resolve_property('message_to_print'))])
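For completeness, a minimal local invocation of the config-driven workflow; which message gets printed depends on the environment that resolve_property resolved when the module was imported:

if __name__ == '__main__':
    hello_world_workflow.run('2020-01-01')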
def test_should_run_jobs_in_order_accordingly_to_graph_schema(self):
    # given
    original_job = mock.Mock()
    job1, job2, job3, job4, job5, job6, job7, job8, job9 = [
        WorkflowJob(original_job, i) for i in range(9)
    ]
    job_graph = OrderedDict([
        (job1, (job5, job6)),
        (job2, (job6,)),
        (job3, (job6,)),
        (job4, (job7,)),
        (job6, (job8,)),
        (job7, (job8,)),
        (job5, (job9,)),
    ])
    # job1    job2   job3   job4
    #  |  \     |     /       |
    #  |   \    |    /        |
    # job5    job6          job7
    #  |         \           /
    #  |          \         /
    # job9         job8
    definition = Definition(job_graph)
    workflow = Workflow(
        workflow_id='test_workflow',
        definition=definition,
        schedule_interval='@hourly',
        runtime_as_datetime=True)

    # expected
    self.assertEqual(
        list(workflow.build_sequential_order()),
        [job1, job5, job9, job2, job3, job6, job4, job7, job8])

    # given
    job_graph = OrderedDict([
        (job1, (job5, job6, job7)),
        (job2, (job6,)),
        (job3, (job6,)),
        (job4, (job7,)),
        (job6, (job8,)),
        (job7, (job8,)),
        (job5, (job9,)),
        (job6, (job9,)),
    ])
    # job1    job2   job3   job4
    #  |  \     |     /       |
    #  |   \    |    /        |
    # job5    job6          job7
    #  |     /    \           /
    #  |    /      \         /
    # job9          job8
    definition = Definition(job_graph)
    workflow = Workflow(
        workflow_id='test_workflow',
        definition=definition,
        schedule_interval='@hourly',
        runtime_as_datetime=True)

    # expected
    self.assertEqual(
        workflow.build_sequential_order(),
        [job1, job5, job2, job3, job6, job9, job4, job7, job8])
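The ordering asserted above is one valid linearization of the dependency graph. For illustration only, here is a minimal Kahn's-algorithm topological sort over the same {job: downstream_jobs} shape; this is not bigflow's implementation, and it may break ties differently than build_sequential_order:

from collections import deque


def topological_order(graph):
    """Linearize a {node: (downstream nodes, ...)} dependency graph."""
    nodes = set(graph)
    for targets in graph.values():
        nodes.update(targets)
    # count incoming edges for every node
    indegree = {node: 0 for node in nodes}
    for targets in graph.values():
        for target in targets:
            indegree[target] += 1
    # start from nodes with no upstream dependencies
    ready = deque(node for node in nodes if indegree[node] == 0)
    order = []
    while ready:
        node = ready.popleft()
        order.append(node)
        for target in graph.get(node, ()):
            indegree[target] -= 1
            if indegree[target] == 0:
                ready.append(target)
    return order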
from bigflow.workflow import Workflow, hourly_start_time
from datetime import datetime
from datetime import timedelta


class HourlyJob:
    def __init__(self):
        self.id = 'hourly_job'

    def run(self, runtime):
        print(
            f'I should process data with timestamps from: {runtime} '
            f'to {datetime.strptime(runtime, "%Y-%m-%d %H:%M:%S") + timedelta(minutes=59, seconds=59)}'
        )


hourly_workflow = Workflow(
    workflow_id='hourly_workflow',
    schedule_interval='@hourly',
    start_time_factory=hourly_start_time,
    definition=[HourlyJob()])

if __name__ == '__main__':
    hourly_workflow.run('2020-01-01 00:00:00')