def test_new_run_id(clear_db_airflow_dags, session=None):
    dag = DAG(
        DAG_ID,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )
    run_id = dag.new_run_id()
    assert UUID(run_id).version == 4
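The test above only pins down that DAG.new_run_id() returns a string parseable as a version-4 UUID. A minimal, hypothetical stand-in illustrating that contract (the real method on the DAG class is not shown here, so this is an assumption, not the library's implementation):

from uuid import UUID, uuid4


def new_run_id() -> str:
    # Hypothetical stand-in for DAG.new_run_id(): a random version-4 UUID
    # rendered as a string, which is exactly what the test above asserts.
    return str(uuid4())


assert UUID(new_run_id()).version == 4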
def __init__(self, dag_id, schedule_interval=None, location=None,
             input_urns=None, output_urns=None, start_date=None,
             description=None, marquez_run_id=None, airflow_run_id=None,
             mock_marquez_client=True):
    self.dag_id = dag_id
    self.schedule_interval = schedule_interval or '*/10 * * * *'
    self.location = location or 'test_location'
    self.input_urns = input_urns or []
    self.output_urns = output_urns or []
    self.start_date = start_date or pendulum.datetime(2019, 1, 31, 0, 0, 0)
    self.description = description or 'test description'
    self.marquez_run_id = marquez_run_id or '71d29487-0b54-4ae1-9295'
    self.airflow_run_id = airflow_run_id or 'airflow_run_id_123456'
    self.marquez_dag = DAG(
        self.dag_id,
        schedule_interval=self.schedule_interval,
        default_args={
            'marquez_location': self.location,
            'marquez_input_urns': self.input_urns,
            'marquez_output_urns': self.output_urns,
            'owner': 'na',
            'depends_on_past': False,
            'start_date': self.start_date
        },
        description=self.description)
    if mock_marquez_client:
        self.marquez_dag._marquez_client = \
            make_mock_marquez_client(self.marquez_run_id)
def test_marquez_dag_with_extract_on_complete(
        job_id_mapping,
        mock_get_or_create_openlineage_client,
        clear_db_airflow_dags,
        session=None):
    # --- test setup
    dag_id = 'test_marquez_dag_with_extractor_on_complete'
    dag = DAG(
        dag_id,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )

    dag_run_id = 'test_marquez_dag_with_extractor_run_id'
    run_id = f"{dag_run_id}.{TASK_ID_COMPLETED}"

    # Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = mock_marquez_client

    # Add task that will be marked as completed
    task_will_complete = TestFixtureDummyOperator(
        task_id=TASK_ID_COMPLETED,
        dag=dag
    )
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # Add the dummy extractor to the list for the task above
    _DAG_EXTRACTORS[task_will_complete.__class__] = \
        TestFixtureDummyExtractorOnComplete

    # Create DAG run and mark as running
    dagrun = dag.create_dagrun(
        run_id=dag_run_id,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    mock_marquez_client.emit.assert_has_calls([
        mock.call(RunEvent(
            eventType=RunState.START,
            eventTime=mock.ANY,
            run=Run(run_id, {
                "nominalTime": NominalTimeRunFacet(start_time, end_time)
            }),
            job=Job("default", f"{dag_id}.{TASK_ID_COMPLETED}", {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet(
                    "", completed_task_location)
            }),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))
    ])

    mock_marquez_client.reset_mock()

    # --- Pretend to complete the task
    job_id_mapping.pop.return_value = run_id

    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    dag.handle_callback(dagrun, success=True, session=session)

    mock_marquez_client.emit.assert_has_calls([
        mock.call(RunEvent(
            eventType=RunState.COMPLETE,
            eventTime=mock.ANY,
            run=Run(run_id),
            job=Job("default", f"{dag_id}.{TASK_ID_COMPLETED}"),
            producer=PRODUCER,
            inputs=[OpenLineageDataset(
                namespace='default',
                name='schema.extract_on_complete_input1',
                facets={
                    'dataSource': DataSourceDatasetFacet(
                        name='dummy_source_name',
                        uri='http://dummy/source/url'
                    ),
                    'schema': SchemaDatasetFacet(
                        fields=[
                            SchemaField(name='field1', type='text',
                                        description=''),
                            SchemaField(name='field2', type='text',
                                        description='')
                        ]
                    )
                })
            ],
            outputs=[OpenLineageDataset(
                namespace='default',
                name='extract_on_complete_output1',
                facets={
                    'dataSource': DataSourceDatasetFacet(
                        name='dummy_source_name',
                        uri='http://dummy/source/url'
                    )
                })
            ]
        ))
    ])
from marquez_airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

from datetime import datetime

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2019, 1, 1),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    'marquez_location': 'github://my_dag_location',
    'marquez_input_urns': ["s3://great_data", "s3://not_so_good_data"],
    'marquez_output_urns': ["s3://amazing_data"],
}

dag = DAG('dummy_operator_example',
          schedule_interval='*/10 * * * *',
          default_args=default_args,
          description="My awesome DAG")

run_this_1 = DummyOperator(task_id='run_this_1', dag=dag)
run_this_2 = DummyOperator(task_id='run_this_2', dag=dag)
run_this_2.set_upstream(run_this_1)
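run_this_2.set_upstream(run_this_1) makes run_this_1 run first. Airflow also accepts the equivalent bitshift form; the line below is an interchangeable alternative to (not an addition on top of) the set_upstream call above:

# Equivalent dependency declaration: run_this_1 runs before run_this_2.
# Use instead of, not in addition to, the set_upstream call above.
run_this_1 >> run_this_2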
from marquez_airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(7),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'orders_popular_day_of_week',
    schedule_interval='@once',
    default_args=default_args,
    description='Determines the popular day of week orders are placed.'
)

t1 = PostgresOperator(
    task_id='if_not_exists',
    postgres_conn_id='food_delivery_db',
    sql='''
    CREATE TABLE IF NOT EXISTS popular_orders_day_of_week (
        order_day_of_week VARCHAR(64) NOT NULL,
        order_placed_on TIMESTAMP NOT NULL,
        orders_placed INTEGER NOT NULL
    );''',
    dag=dag
)
from datetime import datetime, timedelta

# Assumed imports (not present in the original snippet); sibling examples in
# this repo use the marquez_airflow DAG wrapper and the standard BashOperator.
from marquez_airflow import DAG
from airflow.operators.bash_operator import BashOperator

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2019, 1, 1),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    'marquez_location': 'github://my_dag_location',
    'marquez_input_urns': ["s3://great_data", "s3://not_so_good_data"],
    'marquez_output_urns': ["s3://amazing_data"],
}

dag = DAG("templated_bash_example",
          default_args=default_args,
          schedule_interval=timedelta(1))

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag)

t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag)

templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ macros.ds_add(ds, 7)}}"
    echo "{{ params.my_param }}"
{% endfor %}
"""
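The comment above mentions t1, t2 and t3, but the task that consumes templated_command is not part of the snippet. A minimal sketch of how it would typically be wired up, following the standard Airflow tutorial pattern; the task_id and the params value are assumptions, not taken from the original:

# Hypothetical t3 (not shown in the snippet above): renders templated_command
# with Jinja, passing params.my_param into the template at runtime.
t3 = BashOperator(
    task_id="templated",
    bash_command=templated_command,
    params={"my_param": "Parameter I passed in"},
    dag=dag,
)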
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

# Assumed imports (not present in the original snippet)
from marquez_airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'email_discounts',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Email discounts to customers that have experienced order delays daily.')

# Wait for delivery_times_7_days DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_delivery_times_7_days',
                        external_dag_id='delivery_times_7_days',
                        mode='reschedule',
                        dag=dag)

t2 = PostgresOperator(task_id='insert',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    SELECT * FROM discounts;
    ''',
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

# Assumed import (not present in the original snippet)
from marquez_airflow import DAG

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'etl_menus',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Loads newly added restaurant menus daily.'
)

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(
    task_id='wait_for_new_food_deliveries',
    external_dag_id='new_food_deliveries',
    mode='reschedule',
    dag=dag
)

# Wait for etl_restaurants DAG to complete
t2 = ExternalTaskSensor(
    task_id='wait_for_etl_restaurants',
from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.dates import days_ago

# Assumed import (not present in the original snippet)
from marquez_airflow import DAG

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'new_food_deliveries',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Add new food delivery data.'
)

t1 = PostgresOperator(
    task_id='if_not_exists',
    postgres_conn_id='food_delivery_db',
    sql='''
    CREATE TABLE IF NOT EXISTS cities (
        id SERIAL PRIMARY KEY,
        name VARCHAR(64) NOT NULL,
        state VARCHAR(64) NOT NULL,
        zip_code VARCHAR(64) NOT NULL,
        UNIQUE (name, state, zip_code)
def test_marquez_dag(mock_get_or_create_marquez_client,
                     mock_uuid,
                     clear_db_airflow_dags,
                     session=None):
    dag = DAG(DAG_ID,
              schedule_interval='@daily',
              default_args=DAG_DEFAULT_ARGS,
              description=DAG_DESCRIPTION)

    # (1) Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_marquez_client.return_value = mock_marquez_client

    run_id_completed = "my-test_marquez_dag-uuid-completed"
    run_id_failed = "my-test_marquez_dag-uuid-failed"
    mock_uuid.side_effect = [run_id_completed, run_id_failed]

    # (2) Add task that will be marked as completed
    task_will_complete = DummyOperator(task_id=TASK_ID_COMPLETED, dag=dag)
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # (3) Add task that will be marked as failed
    task_will_fail = DummyOperator(task_id=TASK_ID_FAILED, dag=dag)
    failed_task_location = get_location(task_will_complete.dag.fileloc)

    # (4) Create DAG run and mark as running
    dagrun = dag.create_dagrun(run_id=DAG_RUN_ID,
                               execution_date=DEFAULT_DATE,
                               state=State.RUNNING)

    # Assert namespace meta call
    mock_marquez_client.create_namespace.assert_called_once_with(
        DAG_NAMESPACE, DAG_OWNER)

    # Assert source and dataset meta calls
    mock_marquez_client.create_source.assert_not_called()
    mock_marquez_client.create_dataset.assert_not_called()

    # Assert job meta calls
    create_job_calls = [
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_COMPLETED}",
                  job_type=JobType.BATCH,
                  location=completed_task_location,
                  input_dataset=None,
                  output_dataset=None,
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  namespace_name=DAG_NAMESPACE,
                  run_id=None),
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_FAILED}",
                  job_type=JobType.BATCH,
                  location=failed_task_location,
                  input_dataset=None,
                  output_dataset=None,
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  namespace_name=DAG_NAMESPACE,
                  run_id=None)
    ]
    log.info(
        f"{ [name for name, args, kwargs in mock_marquez_client.mock_calls]}")
    mock_marquez_client.create_job.assert_has_calls(create_job_calls)

    # Assert job run meta calls
    create_job_run_calls = [
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_COMPLETED}",
                  run_id=mock.ANY,
                  run_args=DAG_RUN_ARGS,
                  nominal_start_time=mock.ANY,
                  nominal_end_time=mock.ANY,
                  namespace_name=DAG_NAMESPACE),
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_FAILED}",
                  run_id=mock.ANY,
                  run_args=DAG_RUN_ARGS,
                  nominal_start_time=mock.ANY,
                  nominal_end_time=mock.ANY,
                  namespace_name=DAG_NAMESPACE)
    ]
    mock_marquez_client.create_job_run.assert_has_calls(create_job_run_calls)

    # (5) Start task that will be marked as completed
    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    # (6) Start task that will be marked as failed
    ti1 = TaskInstance(task=task_will_fail, execution_date=DEFAULT_DATE)
    ti1.state = State.FAILED
    session.add(ti1)
    session.commit()

    dag.handle_callback(dagrun, success=True, session=session)

    # Assert start run meta calls
    start_job_run_calls = [
        mock.call(run_id_completed, mock.ANY),
        mock.call(run_id_failed, mock.ANY)
    ]
    mock_marquez_client.mark_job_run_as_started.assert_has_calls(
        start_job_run_calls)

    mock_marquez_client.mark_job_run_as_completed.assert_called_once_with(
        run_id=run_id_completed)

    # When a task run completes, the task outputs are also updated in order
    # to link a job version (=task version) to a dataset version.
    # Using a DummyOperator, no outputs exist, so assert that the create
    # dataset call is not invoked.
    mock_marquez_client.create_dataset.assert_not_called()

    dag.handle_callback(dagrun, success=False, session=session)
    mock_marquez_client.mark_job_run_as_failed.assert_called_once_with(
        run_id=run_id_failed)

    # Assert an attempt to version the outputs of a task is not made when
    # a task fails
    mock_marquez_client.create_dataset.assert_not_called()
def test_marquez_dag_with_extract_on_complete(
        mock_get_or_create_marquez_client,
        mock_uuid,
        clear_db_airflow_dags,
        session=None):
    # --- test setup
    dag_id = 'test_marquez_dag_with_extractor'
    dag = DAG(dag_id,
              schedule_interval='@daily',
              default_args=DAG_DEFAULT_ARGS,
              description=DAG_DESCRIPTION)

    run_id = "my-test-uuid"
    mock_uuid.side_effect = [run_id]

    # Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_marquez_client.return_value = mock_marquez_client

    # Add task that will be marked as completed
    task_will_complete = TestFixtureDummyOperator(task_id=TASK_ID_COMPLETED,
                                                  dag=dag)
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # Add the dummy extractor to the list for the task above
    dag._extractors[task_will_complete.__class__] = \
        TestFixtureDummyExtractorOnComplete

    # Create DAG run and mark as running
    dagrun = dag.create_dagrun(
        run_id='test_marquez_dag_with_extractor_run_id',
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    # Namespace created
    mock_marquez_client.create_namespace.assert_called_once_with(
        DAG_NAMESPACE, DAG_OWNER)

    log.info("Marquez client calls when starting:")
    for call in mock_marquez_client.mock_calls:
        log.info(call)
    assert [name for name, args, kwargs
            in mock_marquez_client.mock_calls] == ['create_namespace']
    mock_marquez_client.reset_mock()

    # --- Pretend to complete the task
    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    dag.handle_callback(dagrun, success=True, session=session)

    # Datasets are updated
    mock_marquez_client.create_source.assert_called_with(
        'dummy_source_name', 'DummySource', 'http://dummy/source/url')

    # Datasets get called twice: once to reenact the _begin_run_flow
    # and then again at _end_run_flow with the run id appended for
    # the output dataset
    mock_marquez_client.create_dataset.assert_has_calls([
        mock.call(dataset_name='schema.extract_on_complete_input1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='schema.extract_on_complete_input1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=mock.ANY,
                  run_id=None),
        mock.call(dataset_name='extract_on_complete_output1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='extract_on_complete_output1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=[],
                  run_id=None),
        mock.call(dataset_name='schema.extract_on_complete_input1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='schema.extract_on_complete_input1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=mock.ANY,
                  run_id=None),
        mock.call(dataset_name='extract_on_complete_output1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='extract_on_complete_output1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=[],
                  run_id='my-test-uuid')
    ])

    # job is updated
    mock_marquez_client.create_job.assert_has_calls([
        mock.call(job_name=f"{dag_id}.{TASK_ID_COMPLETED}",
                  job_type=JobType.BATCH,
                  location=completed_task_location,
                  input_dataset=[{
                      'namespace': 'default',
                      'name': 'schema.extract_on_complete_input1'
                  }],
                  output_dataset=[{
                      'namespace': 'default',
                      'name': 'extract_on_complete_output1'
                  }],
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  namespace_name=DAG_NAMESPACE,
                  run_id=None),
        mock.call(job_name=f"{dag_id}.{TASK_ID_COMPLETED}",
                  job_type=JobType.BATCH,
                  location=completed_task_location,
                  input_dataset=[{
                      'namespace': 'default',
                      'name': 'schema.extract_on_complete_input1'
                  }],
                  output_dataset=[{
                      'namespace': 'default',
                      'name': 'extract_on_complete_output1'
                  }],
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  namespace_name=DAG_NAMESPACE,
                  run_id='my-test-uuid')
    ])
    assert mock_marquez_client.create_job.mock_calls[0].\
        kwargs['context'].get('extract_on_complete') == 'extract_on_complete'

    # run is created
    mock_marquez_client.create_job_run.assert_called_once_with(
        job_name=f"{dag_id}.{TASK_ID_COMPLETED}",
        run_id=run_id,
        run_args=DAG_RUN_ARGS,
        nominal_start_time=mock.ANY,
        nominal_end_time=mock.ANY,
        namespace_name=DAG_NAMESPACE)

    # run is started
    mock_marquez_client.mark_job_run_as_started.assert_called_once_with(
        run_id, mock.ANY)

    # --- Assert that the right marquez calls are done

    # job is updated before completion
    mock_marquez_client.create_job.assert_has_calls([
        mock.call(namespace_name=DAG_NAMESPACE,
                  job_name=f"{dag_id}.{TASK_ID_COMPLETED}",
                  job_type=JobType.BATCH,
                  location=completed_task_location,
                  input_dataset=[{
                      'namespace': 'default',
                      'name': 'schema.extract_on_complete_input1'
                  }],
                  output_dataset=[{
                      'namespace': 'default',
                      'name': 'extract_on_complete_output1'
                  }],
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  run_id=run_id)
    ])
    assert mock_marquez_client.create_job.mock_calls[0].\
        kwargs['context'].get('extract_on_complete') == 'extract_on_complete'

    mock_marquez_client.mark_job_run_as_completed.assert_called_once_with(
        run_id=run_id)

    # When a task run completes, the task outputs are also updated in order
    # to link a job version (=task version) to a dataset version.
    mock_marquez_client.create_dataset.assert_has_calls([
        mock.call(dataset_name='schema.extract_on_complete_input1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='schema.extract_on_complete_input1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=mock.ANY,
                  run_id=None),
        mock.call(dataset_name='extract_on_complete_output1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='extract_on_complete_output1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=[],
                  run_id=run_id)
    ])

    log.info("Marquez client calls when completing:")
    for call in mock_marquez_client.mock_calls:
        log.info(call)
    assert [name for name, args, kwargs in mock_marquez_client.mock_calls] == [
        'create_namespace',
        'create_source',
        'create_dataset',
        'create_source',
        'create_dataset',
        'create_job',
        'create_job_run',
        'create_source',
        'create_dataset',
        'create_source',
        'create_dataset',
        'create_job',
        'mark_job_run_as_started',
        'mark_job_run_as_completed'
    ]
from datetime import datetime

from airflow.operators.dummy_operator import DummyOperator

from marquez_airflow import DAG

DAG_NAME = 'test_dag'

default_args = {
    'depends_on_past': False,
    'start_date': datetime(2019, 2, 1),
}

dag = DAG(DAG_NAME,
          schedule_interval='0 0 * * *',
          catchup=False,
          default_args=default_args,
          description="My awesome DAG")

run_this_1 = DummyOperator(task_id='run_this_1', dag=dag)
run_this_2 = DummyOperator(task_id='run_this_2', dag=dag)
run_this_2.set_upstream(run_this_1)
from marquez_airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_restaurants',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads newly registered restaurants daily.')

t1 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS restaurants (
        id SERIAL PRIMARY KEY,
        created_at TIMESTAMP NOT NULL,
        updated_at TIMESTAMP NOT NULL,
        name VARCHAR(64) NOT NULL,
        email VARCHAR(64) UNIQUE NOT NULL,
        address VARCHAR(64) NOT NULL,
        phone VARCHAR(64) NOT NULL,
        city_id INTEGER REFERENCES cities(id),
'aws_conn_id': "aws_default", 'consumer_key':"mT81o4cCwLhB3v8T19obCLl9m", 'consumer_secret':"khIx52CeapnNU1Ux8NjJig2hNzAux7hQBol2q7ZfMzAHC2fSa9", 'access_token': "712363196-buAMU1eDePCkbjWhtT8BAlry2et9SeA6oY8CWRPN", 'access_token_secret':"rDXek6tRjNO9nrw43fSKxhaQlxzu0rsiYzAiNUSqvLZHi", 'postgres_conn_id': 'postgres_conn_id_bingyu', 'email_on_failure': False, 'email_on_retry': False, 'retries': 0, 'retry_delay': timedelta(minutes=5) } dag = DAG('airflow_individual', description='test for twitter web scraping', schedule_interval='@weekly', # cron scheduling for more detailed time catchup=False, default_args=default_args, max_active_runs=1) # ============================================================================= # 2. Define different functions # ============================================================================= def scrape_data_twitter(**kwargs): # authorization of consumer key and consumer secret auth = tweepy.OAuthHandler(kwargs['consumer_key'], kwargs['consumer_secret']) # set access to user's access key and access secret auth.set_access_token(kwargs['access_token'], kwargs['access_token_secret']) # calling the api
def test_marquez_dag_with_extractor_returning_two_steps(
        job_id_mapping,
        mock_get_or_create_openlineage_client,
        clear_db_airflow_dags,
        session=None):
    # --- test setup
    dag_id = 'test_marquez_dag_with_extractor_returning_two_steps'
    dag = DAG(
        dag_id,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )

    dag_run_id = 'test_marquez_dag_with_extractor_returning_two_steps_run_id'
    run_id = f"{dag_run_id}.{TASK_ID_COMPLETED}"

    # Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = mock_marquez_client

    # Add task that will be marked as completed
    task_will_complete = TestFixtureDummyOperator(
        task_id=TASK_ID_COMPLETED,
        dag=dag
    )
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # Add the dummy extractor to the list for the task above
    _DAG_EXTRACTORS[task_will_complete.__class__] = \
        TestFixtureDummyExtractorWithMultipleSteps

    # --- pretend to run the DAG

    # Create DAG run and mark as running
    dagrun = dag.create_dagrun(
        run_id=dag_run_id,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    # --- Assert that starting the job triggers an OpenLineage START event
    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    mock_marquez_client.emit.assert_called_once_with(
        RunEvent(
            RunState.START,
            mock.ANY,
            Run(run_id, {
                "nominalTime": NominalTimeRunFacet(start_time, end_time)
            }),
            Job("default", f"{dag_id}.{TASK_ID_COMPLETED}", {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet(
                    "", completed_task_location)
            }),
            PRODUCER,
            [OpenLineageDataset(DAG_NAMESPACE, 'extract_input1', {
                "dataSource": DataSourceDatasetFacet(
                    name='dummy_source_name',
                    uri='http://dummy/source/url'
                )
            })],
            []
        )
    )

    mock_marquez_client.reset_mock()

    # --- Pretend to complete the task
    job_id_mapping.pop.return_value = run_id

    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    dag.handle_callback(dagrun, success=True, session=session)

    # --- Assert that the OpenLineage COMPLETE event is emitted
    mock_marquez_client.emit.assert_called_once_with(
        RunEvent(
            RunState.COMPLETE,
            mock.ANY,
            Run(run_id),
            Job("default", f"{dag_id}.{TASK_ID_COMPLETED}"),
            PRODUCER,
            [OpenLineageDataset(DAG_NAMESPACE, 'extract_input1', {
                "dataSource": DataSourceDatasetFacet(
                    name='dummy_source_name',
                    uri='http://dummy/source/url'
                )
            })],
            []
        )
    )
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

# Assumed import (not present in the original snippet)
from marquez_airflow import DAG

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_customers',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads newly registered customers daily.')

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_new_food_deliveries',
                        external_dag_id='new_food_deliveries',
                        mode='reschedule',
                        dag=dag)

t2 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS customers (
        id SERIAL PRIMARY KEY,
        created_at TIMESTAMP NOT NULL,
def test_marquez_dag(job_id_mapping,
                     mock_get_or_create_openlineage_client,
                     clear_db_airflow_dags,
                     session=None):
    dag = DAG(
        DAG_ID,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )

    # (1) Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = mock_marquez_client

    run_id_completed = f"{DAG_RUN_ID}.{TASK_ID_COMPLETED}"
    run_id_failed = f"{DAG_RUN_ID}.{TASK_ID_FAILED}"
    # mock_uuid.side_effect = [run_id_completed, run_id_failed]

    # (2) Add task that will be marked as completed
    task_will_complete = DummyOperator(
        task_id=TASK_ID_COMPLETED,
        dag=dag
    )
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # (3) Add task that will be marked as failed
    task_will_fail = DummyOperator(
        task_id=TASK_ID_FAILED,
        dag=dag
    )
    failed_task_location = get_location(task_will_complete.dag.fileloc)

    # (4) Create DAG run and mark as running
    dagrun = dag.create_dagrun(
        run_id=DAG_RUN_ID,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    # Assert emit calls
    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    emit_calls = [
        mock.call(RunEvent(
            eventType=RunState.START,
            eventTime=mock.ANY,
            run=Run(run_id_completed, {
                "nominalTime": NominalTimeRunFacet(start_time, end_time)
            }),
            job=Job("default", f"{DAG_ID}.{TASK_ID_COMPLETED}", {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet(
                    "", completed_task_location)
            }),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        )),
        mock.call(RunEvent(
            eventType=RunState.START,
            eventTime=mock.ANY,
            run=Run(run_id_failed, {
                "nominalTime": NominalTimeRunFacet(start_time, end_time)
            }),
            job=Job("default", f"{DAG_ID}.{TASK_ID_FAILED}", {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet(
                    "", failed_task_location)
            }),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))
    ]
    log.info(
        f"{ [name for name, args, kwargs in mock_marquez_client.mock_calls]}")
    mock_marquez_client.emit.assert_has_calls(emit_calls)

    # (5) Start task that will be marked as completed
    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    # (6) Start task that will be marked as failed
    ti1 = TaskInstance(task=task_will_fail, execution_date=DEFAULT_DATE)
    ti1.state = State.FAILED
    session.add(ti1)
    session.commit()

    job_id_mapping.pop.side_effect = [run_id_completed, run_id_failed]

    dag.handle_callback(dagrun, success=False, session=session)

    emit_calls += [
        mock.call(RunEvent(
            eventType=RunState.COMPLETE,
            eventTime=mock.ANY,
            run=Run(run_id_completed),
            job=Job("default", f"{DAG_ID}.{TASK_ID_COMPLETED}"),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        )),
        mock.call(RunEvent(
            eventType=RunState.FAIL,
            eventTime=mock.ANY,
            run=Run(run_id_failed),
            job=Job("default", f"{DAG_ID}.{TASK_ID_FAILED}"),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))
    ]
    mock_marquez_client.emit.assert_has_calls(emit_calls)
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

# Assumed import (not present in the original snippet)
from marquez_airflow import DAG

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_delivery_7_days',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads new deliveries for the week.')

# Wait for etl_orders_7_days DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_etl_orders_7_days',
                        external_dag_id='etl_orders_7_days',
                        mode='reschedule',
                        dag=dag)

# Wait for etl_restaurants DAG to complete
t2 = ExternalTaskSensor(task_id='wait_for_etl_restaurants',
                        external_dag_id='etl_restaurants',
                        mode='reschedule',
                        dag=dag)
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

# Assumed import (not present in the original snippet)
from marquez_airflow import DAG

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('orders_popular_day_of_week',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Determines the popular day of week orders are placed.')

# Wait for delivery_times_7_days DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_delivery_times_7_days',
                        external_dag_id='delivery_times_7_days',
                        mode='reschedule',
                        dag=dag)

t2 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS popular_orders_day_of_week (
        order_day_of_week VARCHAR(64) NOT NULL,
        order_placed_on TIMESTAMP NOT NULL,
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

# Assumed import (not present in the original snippet)
from marquez_airflow import DAG

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('delivery_times_7_days',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Determine weekly top delivery times by restaurant.')

# Wait for etl_delivery_7_days DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_etl_delivery_7_days',
                        external_dag_id='etl_delivery_7_days',
                        mode='reschedule',
                        dag=dag)

t2 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS top_delivery_times (
        order_id INTEGER REFERENCES orders(id),
        order_placed_on TIMESTAMP NOT NULL,
from datetime import datetime

from airflow.operators.dummy_operator import DummyOperator

from marquez_airflow import DAG

dag = DAG(dag_id='test_dummy_dag',
          description='Test dummy DAG',
          schedule_interval='*/2 * * * *',
          start_date=datetime(2020, 1, 8),
          catchup=False,
          max_active_runs=1)

dummy_task = DummyOperator(task_id='test_dummy', dag=dag)
DAG_ID = 'email_discounts'
DAG_OWNER = 'datascience'
DAG_DEFAULT_ARGS = {
    'owner': DAG_OWNER,
    'depends_on_past': False,
    'start_date': days_ago(7),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}
DAG_DESCRIPTION = \
    'Email discounts to customers that have experienced order delays daily'

DAG = dag = DAG(
    DAG_ID,
    schedule_interval='@weekly',
    default_args=DAG_DEFAULT_ARGS,
    description=DAG_DESCRIPTION
)

TASK_ID = 'select'
TASK = PostgresOperator(
    task_id=TASK_ID,
    postgres_conn_id=CONN_ID,
    sql=SQL,
    dag=DAG
)


@mock.patch('marquez_airflow.extractors.postgres_extractor.'
            'PostgresExtractor._get_table_schemas')
def test_extract(mock_get_table_schemas):
    mock_get_table_schemas.side_effect = \
        [[DB_TABLE_SCHEMA], NO_DB_TABLE_SCHEMA]
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

# Assumed import (not present in the original snippet)
from marquez_airflow import DAG

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'etl_categories',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Loads newly added menu categories daily.'
)

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(
    task_id='wait_for_new_food_deliveries',
    external_dag_id='new_food_deliveries',
    mode='reschedule',
    dag=dag
)

# Wait for etl_menus DAG to complete
t2 = ExternalTaskSensor(
    task_id='wait_for_etl_menus',
from marquez_airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_order_status',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads order status updates daily.')

t1 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS order_status (
        id SERIAL PRIMARY KEY,
        transitioned_at TIMESTAMP NOT NULL,
        status VARCHAR(64),
        order_id INTEGER REFERENCES orders(id),
        customer_id INTEGER REFERENCES customers(id),
        restaurant_id INTEGER REFERENCES restaurants(id),
        driver_id INTEGER REFERENCES drivers(id)
    );''',
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

# Assumed import (not present in the original snippet)
from marquez_airflow import DAG

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'etl_orders',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Loads newly placed orders daily.'
)

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(
    task_id='wait_for_new_food_deliveries',
    external_dag_id='new_food_deliveries',
    mode='reschedule',
    dag=dag
)

# Wait for etl_menu_items DAG to complete
t2 = ExternalTaskSensor(
    task_id='wait_for_etl_menu_items',
from marquez_airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

from datetime import datetime

DAG_NAME = 'test_dag_v2'

default_args = {
    'marquez_location': 'github://my_dag_location',
    'marquez_input_urns': ["s3://great_data", "s3://not_so_good_data"],
    'marquez_output_urns': ["s3://amazing_data"],
    'owner': 'some dag developer',
    'depends_on_past': False,
    'start_date': datetime(2019, 1, 31),
}

dag = DAG(DAG_NAME,
          schedule_interval='*/10 * * * *',
          default_args=default_args,
          description="My awesome DAG")

run_this_1 = DummyOperator(task_id='run_this_1', dag=dag)
run_this_2 = DummyOperator(task_id='run_this_2', dag=dag)
run_this_2.set_upstream(run_this_1)