Example No. 1
def test_new_run_id(clear_db_airflow_dags, session=None):
    dag = DAG(DAG_ID,
              schedule_interval='@daily',
              default_args=DAG_DEFAULT_ARGS,
              description=DAG_DESCRIPTION)
    run_id = dag.new_run_id()
    assert UUID(run_id).version == 4
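The assertion pins down the contract: new_run_id() returns the string form of a version-4 UUID. A minimal sketch of an implementation that satisfies it (using Python's standard uuid module; not necessarily marquez_airflow's actual code):

from uuid import UUID, uuid4

def new_run_id() -> str:
    # Random (version 4) UUIDs keep run IDs unique across concurrent DAG runs.
    return str(uuid4())

assert UUID(new_run_id()).version == 4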
Example No. 2
import pendulum  # import assumed from usage below

from marquez_airflow import DAG  # import assumed from usage below


class DagFixture:  # hypothetical name; the class statement is cut off in the source
    def __init__(self,
                 dag_id,
                 schedule_interval=None,
                 location=None,
                 input_urns=None,
                 output_urns=None,
                 start_date=None,
                 description=None,
                 marquez_run_id=None,
                 airflow_run_id=None,
                 mock_marquez_client=True):
        self.dag_id = dag_id
        self.schedule_interval = schedule_interval or '*/10 * * * *'
        self.location = location or 'test_location'
        self.input_urns = input_urns or []
        self.output_urns = output_urns or []
        self.start_date = start_date or pendulum.datetime(2019, 1, 31, 0, 0, 0)
        self.description = description or 'test description'

        self.marquez_run_id = marquez_run_id or '71d29487-0b54-4ae1-9295'
        self.airflow_run_id = airflow_run_id or 'airflow_run_id_123456'

        self.marquez_dag = DAG(self.dag_id,
                               schedule_interval=self.schedule_interval,
                               default_args={
                                   'marquez_location': self.location,
                                   'marquez_input_urns': self.input_urns,
                                   'marquez_output_urns': self.output_urns,
                                   'owner': 'na',
                                   'depends_on_past': False,
                                   'start_date': self.start_date
                               },
                               description=self.description)
        if mock_marquez_client:
            self.marquez_dag._marquez_client = \
                make_mock_marquez_client(self.marquez_run_id)
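For context, a hedged sketch of how such a fixture might be driven from a test. DagFixture is the placeholder name introduced above, and mock_marquez_client=False sidesteps the make_mock_marquez_client helper that the snippet elides:

fixture = DagFixture(dag_id='test_dag', mock_marquez_client=False)
assert fixture.schedule_interval == '*/10 * * * *'  # default applied
assert fixture.marquez_dag.dag_id == 'test_dag'
assert fixture.marquez_dag.default_args['owner'] == 'na'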
Example No. 3
def test_marquez_dag_with_extract_on_complete(
        job_id_mapping,
        mock_get_or_create_openlineage_client,
        clear_db_airflow_dags,
        session=None):

    # --- test setup
    dag_id = 'test_marquez_dag_with_extractor_on_complete'
    dag = DAG(
        dag_id,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )

    dag_run_id = 'test_marquez_dag_with_extractor_run_id'
    run_id = f"{dag_run_id}.{TASK_ID_COMPLETED}"
    # Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = mock_marquez_client

    # Add task that will be marked as completed
    task_will_complete = TestFixtureDummyOperator(
        task_id=TASK_ID_COMPLETED,
        dag=dag
    )
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # Add the dummy extractor to the list for the task above
    _DAG_EXTRACTORS[task_will_complete.__class__] = \
        TestFixtureDummyExtractorOnComplete

    # Create DAG run and mark as running
    dagrun = dag.create_dagrun(
        run_id=dag_run_id,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    mock_marquez_client.emit.assert_has_calls([
        mock.call(RunEvent(
            eventType=RunState.START,
            eventTime=mock.ANY,
            run=Run(run_id, {
                "nominalTime": NominalTimeRunFacet(start_time, end_time)
            }),
            job=Job("default",  f"{dag_id}.{TASK_ID_COMPLETED}", {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet("", completed_task_location)
            }),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))
    ])

    mock_marquez_client.reset_mock()

    # --- Pretend to complete the task
    job_id_mapping.pop.return_value = run_id

    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    dag.handle_callback(dagrun, success=True, session=session)

    mock_marquez_client.emit.assert_has_calls([
        mock.call(RunEvent(
            eventType=RunState.COMPLETE,
            eventTime=mock.ANY,
            run=Run(run_id),
            job=Job("default", f"{dag_id}.{TASK_ID_COMPLETED}"),
            producer=PRODUCER,
            inputs=[OpenLineageDataset(
                namespace='default',
                name='schema.extract_on_complete_input1',
                facets={
                    'dataSource': DataSourceDatasetFacet(
                        name='dummy_source_name',
                        uri='http://dummy/source/url'
                    ),
                    'schema': SchemaDatasetFacet(
                        fields=[
                            SchemaField(name='field1', type='text', description=''),
                            SchemaField(name='field2', type='text', description='')
                        ]
                    )
                })
            ],
            outputs=[OpenLineageDataset(
                namespace='default',
                name='extract_on_complete_output1',
                facets={
                    'dataSource': DataSourceDatasetFacet(
                        name='dummy_source_name',
                        uri='http://dummy/source/url'
                    )
                })
            ]
        ))
    ])
Example No. 4
from marquez_airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2019, 1, 1),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    'marquez_location': 'github://my_dag_location',
    'marquez_input_urns': ["s3://great_data", "s3://not_so_good_data"],
    'marquez_output_urns': ["s3://amazing_data"],
}

dag = DAG('dummy_operator_example',
          schedule_interval='*/10 * * * *',
          default_args=default_args,
          description="My awesome DAG")

run_this_1 = DummyOperator(task_id='run_this_1', dag=dag)
run_this_2 = DummyOperator(task_id='run_this_2', dag=dag)
run_this_2.set_upstream(run_this_1)
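A note on the last line: set_upstream is the method form of Airflow's bitshift dependency operators, so the same edge can be declared as:

run_this_1 >> run_this_2  # equivalent to run_this_2.set_upstream(run_this_1)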
Example No. 5
from marquez_airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(7),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'orders_popular_day_of_week',
    schedule_interval='@once',
    default_args=default_args,
    description='Determines the popular day of week orders are placed.'
)

t1 = PostgresOperator(
    task_id='if_not_exists',
    postgres_conn_id='food_delivery_db',
    sql='''
    CREATE TABLE IF NOT EXISTS popular_orders_day_of_week (
      order_day_of_week VARCHAR(64) NOT NULL,
      order_placed_on   TIMESTAMP NOT NULL,
      orders_placed     INTEGER NOT NULL
    );''',
    dag=dag
)
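The snippet stops after the table-creation task. A hypothetical continuation (the task id and SQL are illustrative, not taken from the source) showing how a downstream task would typically populate the table:

t2 = PostgresOperator(
    task_id='insert',  # illustrative
    postgres_conn_id='food_delivery_db',
    sql='''
    INSERT INTO popular_orders_day_of_week (order_day_of_week, order_placed_on, orders_placed)
      SELECT to_char(order_placed_on, 'day'), order_placed_on, COUNT(*)
        FROM orders
       GROUP BY 1, 2;''',
    dag=dag
)

t1 >> t2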
Example No. 6
from datetime import datetime, timedelta

# Imports assumed from usage; the snippet's import block is truncated in the source.
from airflow.operators.bash_operator import BashOperator
from marquez_airflow import DAG

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2019, 1, 1),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    'marquez_location': 'github://my_dag_location',
    'marquez_input_urns': ["s3://great_data", "s3://not_so_good_data"],
    'marquez_output_urns': ["s3://amazing_data"],
}

dag = DAG("templated_bash_example",
          default_args=default_args,
          schedule_interval=timedelta(1))

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag)

t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag)

templated_command = """
    {% for i in range(5) %}
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""
Example No. 7
# Imports assumed from usage; the snippet's import block is truncated in the source.
from marquez_airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'email_discounts',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Email discounts to customers that have experienced order delays daily.')

# Wait for delivery_times_7_days DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_delivery_times_7_days',
                        external_dag_id='delivery_times_7_days',
                        mode='reschedule',
                        dag=dag)

t2 = PostgresOperator(task_id='insert',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    SELECT * FROM discounts;
    ''',
                      dag=dag)  # closing argument restored; snippet truncated in the source
Example No. 8
from marquez_airflow import DAG  # import assumed; snippet truncated in the source
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'etl_menus',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Loads newly added restaurant menus daily.'
)

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(
    task_id='wait_for_new_food_deliveries',
    external_dag_id='new_food_deliveries',
    mode='reschedule',
    dag=dag
)

# Wait for etl_restaurants DAG to complete
t2 = ExternalTaskSensor(
    task_id='wait_for_etl_restaurants',
    external_dag_id='etl_restaurants',  # implied by the comment above; snippet truncated here
    mode='reschedule',
    dag=dag
)
Example No. 9
from marquez_airflow import DAG  # import assumed; snippet truncated in the source
from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'new_food_deliveries',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Add new food delivery data.'
)


t1 = PostgresOperator(
    task_id='if_not_exists',
    postgres_conn_id='food_delivery_db',
    sql='''
    CREATE TABLE IF NOT EXISTS cities (
      id       SERIAL PRIMARY KEY,
      name     VARCHAR(64) NOT NULL,
      state    VARCHAR(64) NOT NULL,
      zip_code VARCHAR(64) NOT NULL,
      UNIQUE (name, state, zip_code)
    );''',
    dag=dag
)
Example No. 10
def test_marquez_dag(mock_get_or_create_marquez_client,
                     mock_uuid,
                     clear_db_airflow_dags,
                     session=None):

    dag = DAG(DAG_ID,
              schedule_interval='@daily',
              default_args=DAG_DEFAULT_ARGS,
              description=DAG_DESCRIPTION)
    # (1) Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_marquez_client.return_value = mock_marquez_client
    run_id_completed = "my-test_marquez_dag-uuid-completed"
    run_id_failed = "my-test_marquez_dag-uuid-failed"
    mock_uuid.side_effect = [run_id_completed, run_id_failed]

    # (2) Add task that will be marked as completed
    task_will_complete = DummyOperator(task_id=TASK_ID_COMPLETED, dag=dag)
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # (3) Add task that will be marked as failed
    task_will_fail = DummyOperator(task_id=TASK_ID_FAILED, dag=dag)
    failed_task_location = get_location(task_will_fail.dag.fileloc)

    # (4) Create DAG run and mark as running
    dagrun = dag.create_dagrun(run_id=DAG_RUN_ID,
                               execution_date=DEFAULT_DATE,
                               state=State.RUNNING)

    # Assert namespace meta call
    mock_marquez_client.create_namespace.assert_called_once_with(
        DAG_NAMESPACE, DAG_OWNER)

    # Assert source and dataset meta calls
    mock_marquez_client.create_source.assert_not_called()
    mock_marquez_client.create_dataset.assert_not_called()

    # Assert job meta calls
    create_job_calls = [
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_COMPLETED}",
                  job_type=JobType.BATCH,
                  location=completed_task_location,
                  input_dataset=None,
                  output_dataset=None,
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  namespace_name=DAG_NAMESPACE,
                  run_id=None),
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_FAILED}",
                  job_type=JobType.BATCH,
                  location=failed_task_location,
                  input_dataset=None,
                  output_dataset=None,
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  namespace_name=DAG_NAMESPACE,
                  run_id=None)
    ]
    log.info(
        f"{[name for name, args, kwargs in mock_marquez_client.mock_calls]}")
    mock_marquez_client.create_job.assert_has_calls(create_job_calls)

    # Assert job run meta calls
    create_job_run_calls = [
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_COMPLETED}",
                  run_id=mock.ANY,
                  run_args=DAG_RUN_ARGS,
                  nominal_start_time=mock.ANY,
                  nominal_end_time=mock.ANY,
                  namespace_name=DAG_NAMESPACE),
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_FAILED}",
                  run_id=mock.ANY,
                  run_args=DAG_RUN_ARGS,
                  nominal_start_time=mock.ANY,
                  nominal_end_time=mock.ANY,
                  namespace_name=DAG_NAMESPACE)
    ]
    mock_marquez_client.create_job_run.assert_has_calls(create_job_run_calls)

    # (5) Start task that will be marked as completed
    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    # (6) Start task that will be marked as failed
    ti1 = TaskInstance(task=task_will_fail, execution_date=DEFAULT_DATE)
    ti1.state = State.FAILED
    session.add(ti1)
    session.commit()

    dag.handle_callback(dagrun, success=True, session=session)

    # Assert start run meta calls
    start_job_run_calls = [
        mock.call(run_id_completed, mock.ANY),
        mock.call(run_id_failed, mock.ANY)
    ]
    mock_marquez_client.mark_job_run_as_started.assert_has_calls(
        start_job_run_calls)

    mock_marquez_client.mark_job_run_as_completed.assert_called_once_with(
        run_id=run_id_completed)

    # When a task run completes, the task outputs are also updated in order
    # to link a job version (=task version) to a dataset version.
    # Using a DummyOperator, no outputs exists, so assert that the create
    # dataset call is not invoked.
    mock_marquez_client.create_dataset.assert_not_called()

    dag.handle_callback(dagrun, success=False, session=session)
    mock_marquez_client.mark_job_run_as_failed.assert_called_once_with(
        run_id=run_id_failed)

    # Assert an attempt to version the outputs of a task is not made when
    # a task fails
    mock_marquez_client.create_dataset.assert_not_called()
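The two run IDs come from mock_uuid.side_effect: when side_effect is an iterable, each call to the mock returns the next element. A minimal sketch of that behavior:

from unittest import mock

mock_uuid = mock.Mock(side_effect=["my-test_marquez_dag-uuid-completed",
                                   "my-test_marquez_dag-uuid-failed"])
assert mock_uuid() == "my-test_marquez_dag-uuid-completed"
assert mock_uuid() == "my-test_marquez_dag-uuid-failed"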
Example No. 11
def test_marquez_dag_with_extract_on_complete(
        mock_get_or_create_marquez_client,
        mock_uuid,
        clear_db_airflow_dags,
        session=None):

    # --- test setup
    dag_id = 'test_marquez_dag_with_extractor'
    dag = DAG(dag_id,
              schedule_interval='@daily',
              default_args=DAG_DEFAULT_ARGS,
              description=DAG_DESCRIPTION)

    run_id = "my-test-uuid"
    mock_uuid.side_effect = [run_id]
    # Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_marquez_client.return_value = mock_marquez_client

    # Add task that will be marked as completed
    task_will_complete = TestFixtureDummyOperator(task_id=TASK_ID_COMPLETED,
                                                  dag=dag)
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # Add the dummy extractor to the list for the task above
    dag._extractors[task_will_complete.__class__] = \
        TestFixtureDummyExtractorOnComplete

    # Create DAG run and mark as running
    dagrun = dag.create_dagrun(run_id='test_marquez_dag_with_extractor_run_id',
                               execution_date=DEFAULT_DATE,
                               state=State.RUNNING)

    # Namespace created
    mock_marquez_client.create_namespace.assert_called_once_with(
        DAG_NAMESPACE, DAG_OWNER)

    log.info("Marquez client calls when starting:")
    for call in mock_marquez_client.mock_calls:
        log.info(call)

    assert [name for name, args, kwargs in mock_marquez_client.mock_calls
            ] == ['create_namespace']
    mock_marquez_client.reset_mock()

    # --- Pretend to complete the task
    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    dag.handle_callback(dagrun, success=True, session=session)

    # Datasets are updated
    mock_marquez_client.create_source.assert_called_with(
        'dummy_source_name', 'DummySource', 'http://dummy/source/url')
    # create_dataset is called twice per dataset: once during _begin_run_flow,
    # and again during _end_run_flow with the run id attached to the
    # output dataset
    mock_marquez_client.create_dataset.assert_has_calls([
        mock.call(dataset_name='schema.extract_on_complete_input1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='schema.extract_on_complete_input1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=mock.ANY,
                  run_id=None),
        mock.call(dataset_name='extract_on_complete_output1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='extract_on_complete_output1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=[],
                  run_id=None),
        mock.call(dataset_name='schema.extract_on_complete_input1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='schema.extract_on_complete_input1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=mock.ANY,
                  run_id=None),
        mock.call(dataset_name='extract_on_complete_output1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='extract_on_complete_output1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=[],
                  run_id='my-test-uuid')
    ])

    # job is updated
    mock_marquez_client.create_job.assert_has_calls([
        mock.call(job_name=f"{dag_id}.{TASK_ID_COMPLETED}",
                  job_type=JobType.BATCH,
                  location=completed_task_location,
                  input_dataset=[{
                      'namespace': 'default',
                      'name': 'schema.extract_on_complete_input1'
                  }],
                  output_dataset=[{
                      'namespace': 'default',
                      'name': 'extract_on_complete_output1'
                  }],
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  namespace_name=DAG_NAMESPACE,
                  run_id=None),
        mock.call(job_name=f"{dag_id}.{TASK_ID_COMPLETED}",
                  job_type=JobType.BATCH,
                  location=completed_task_location,
                  input_dataset=[{
                      'namespace': 'default',
                      'name': 'schema.extract_on_complete_input1'
                  }],
                  output_dataset=[{
                      'namespace': 'default',
                      'name': 'extract_on_complete_output1'
                  }],
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  namespace_name=DAG_NAMESPACE,
                  run_id='my-test-uuid')
    ])
    assert mock_marquez_client.create_job.mock_calls[0].\
        kwargs['context'].get('extract_on_complete') == 'extract_on_complete'

    # run is created
    mock_marquez_client.create_job_run.assert_called_once_with(
        job_name=f"{dag_id}.{TASK_ID_COMPLETED}",
        run_id=run_id,
        run_args=DAG_RUN_ARGS,
        nominal_start_time=mock.ANY,
        nominal_end_time=mock.ANY,
        namespace_name=DAG_NAMESPACE)

    # run is started
    mock_marquez_client.mark_job_run_as_started.assert_called_once_with(
        run_id, mock.ANY)

    # --- Assert that the right marquez calls are done

    # job is updated before completion
    mock_marquez_client.create_job.assert_has_calls([
        mock.call(namespace_name=DAG_NAMESPACE,
                  job_name=f"{dag_id}.{TASK_ID_COMPLETED}",
                  job_type=JobType.BATCH,
                  location=completed_task_location,
                  input_dataset=[{
                      'namespace': 'default',
                      'name': 'schema.extract_on_complete_input1'
                  }],
                  output_dataset=[{
                      'namespace': 'default',
                      'name': 'extract_on_complete_output1'
                  }],
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  run_id=run_id)
    ])

    assert mock_marquez_client.create_job.mock_calls[0].\
        kwargs['context'].get('extract_on_complete') == 'extract_on_complete'

    mock_marquez_client.mark_job_run_as_completed.assert_called_once_with(
        run_id=run_id)

    # When a task run completes, the task outputs are also updated in order
    # to link a job version (=task version) to a dataset version.
    mock_marquez_client.create_dataset.assert_has_calls([
        mock.call(dataset_name='schema.extract_on_complete_input1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='schema.extract_on_complete_input1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=mock.ANY,
                  run_id=None),
        mock.call(dataset_name='extract_on_complete_output1',
                  dataset_type=DatasetType.DB_TABLE,
                  physical_name='extract_on_complete_output1',
                  source_name='dummy_source_name',
                  namespace_name=DAG_NAMESPACE,
                  fields=[],
                  run_id=run_id)
    ])

    log.info("Marquez client calls when completing:")
    for call in mock_marquez_client.mock_calls:
        log.info(call)
    assert [name for name, args, kwargs in mock_marquez_client.mock_calls] == [
        'create_namespace', 'create_source', 'create_dataset', 'create_source',
        'create_dataset', 'create_job', 'create_job_run', 'create_source',
        'create_dataset', 'create_source', 'create_dataset', 'create_job',
        'mark_job_run_as_started', 'mark_job_run_as_completed'
    ]
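The list comprehensions above lean on the fact that each entry in Mock.mock_calls unpacks into a (name, args, kwargs) tuple. A minimal sketch:

from unittest import mock

client = mock.Mock()
client.create_namespace('food_delivery', 'datascience')
name, args, kwargs = client.mock_calls[0]
assert name == 'create_namespace'
assert args == ('food_delivery', 'datascience')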
Example No. 12
from datetime import datetime

from airflow.operators.dummy_operator import DummyOperator
from marquez_airflow import DAG

DAG_NAME = 'test_dag'

default_args = {
    'depends_on_past': False,
    'start_date': datetime(2019, 2, 1),
}

dag = DAG(DAG_NAME,
          schedule_interval='0 0 * * *',
          catchup=False,
          default_args=default_args,
          description="My awesome DAG")

run_this_1 = DummyOperator(task_id='run_this_1', dag=dag)
run_this_2 = DummyOperator(task_id='run_this_2', dag=dag)
run_this_2.set_upstream(run_this_1)
Example No. 13
from marquez_airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_restaurants',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads newly registered restaurants daily.')

t1 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS restaurants (
      id                SERIAL PRIMARY KEY,
      created_at        TIMESTAMP NOT NULL,
      updated_at        TIMESTAMP NOT NULL,
      name              VARCHAR(64) NOT NULL,
      email             VARCHAR(64) UNIQUE NOT NULL,
      address           VARCHAR(64) NOT NULL,
      phone             VARCHAR(64) NOT NULL,
      city_id           INTEGER REFERENCES cities(id)
      -- remaining columns truncated in the source
    );''',
                      dag=dag)
Example No. 14
    'aws_conn_id': "aws_default",
    'consumer_key':"mT81o4cCwLhB3v8T19obCLl9m",
    'consumer_secret':"khIx52CeapnNU1Ux8NjJig2hNzAux7hQBol2q7ZfMzAHC2fSa9",
    'access_token': "712363196-buAMU1eDePCkbjWhtT8BAlry2et9SeA6oY8CWRPN",
    'access_token_secret':"rDXek6tRjNO9nrw43fSKxhaQlxzu0rsiYzAiNUSqvLZHi",   
    'postgres_conn_id': 'postgres_conn_id_bingyu',
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5)
}


dag = DAG('airflow_individual',
          description='test for twitter web scraping',
          schedule_interval='@weekly',  # use a cron expression here for finer-grained scheduling
          catchup=False,
          default_args=default_args,
          max_active_runs=1)

# =============================================================================
# 2. Define different functions
# =============================================================================
def scrape_data_twitter(**kwargs):

    # authorization of consumer key and consumer secret
    auth = tweepy.OAuthHandler(kwargs['consumer_key'], kwargs['consumer_secret'])
    
    # set access to user's access key and access secret 
    auth.set_access_token(kwargs['access_token'], kwargs['access_token_secret'])
    
    # calling the api
    api = tweepy.API(auth)  # completion implied by the comment; snippet truncated here
Example No. 15
def test_marquez_dag_with_extractor_returning_two_steps(
        job_id_mapping,
        mock_get_or_create_openlineage_client,
        clear_db_airflow_dags,
        session=None):

    # --- test setup
    dag_id = 'test_marquez_dag_with_extractor_returning_two_steps'
    dag = DAG(
        dag_id,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )

    dag_run_id = 'test_marquez_dag_with_extractor_returning_two_steps_run_id'
    run_id = f"{dag_run_id}.{TASK_ID_COMPLETED}"

    # Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = mock_marquez_client

    # Add task that will be marked as completed
    task_will_complete = TestFixtureDummyOperator(
        task_id=TASK_ID_COMPLETED,
        dag=dag
    )
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # Add the dummy extractor to the list for the task above
    _DAG_EXTRACTORS[task_will_complete.__class__] = TestFixtureDummyExtractorWithMultipleSteps

    # --- pretend run the DAG

    # Create DAG run and mark as running
    dagrun = dag.create_dagrun(
        run_id=dag_run_id,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    # --- Asserts that the job starting triggers openlineage event

    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    mock_marquez_client.emit.assert_called_once_with(
        RunEvent(
            RunState.START,
            mock.ANY,
            Run(run_id, {"nominalTime": NominalTimeRunFacet(start_time, end_time)}),
            Job("default", f"{dag_id}.{TASK_ID_COMPLETED}", {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet("", completed_task_location)
            }),
            PRODUCER,
            [OpenLineageDataset(DAG_NAMESPACE, 'extract_input1', {
                "dataSource": DataSourceDatasetFacet(
                    name='dummy_source_name',
                    uri='http://dummy/source/url'
                )
            })],
            []
        )
    )

    mock_marquez_client.reset_mock()

    # --- Pretend to complete the task
    job_id_mapping.pop.return_value = run_id

    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    dag.handle_callback(dagrun, success=True, session=session)

    # --- Assert that the openlineage call is done

    mock_marquez_client.emit.assert_called_once_with(
        RunEvent(
            RunState.COMPLETE,
            mock.ANY,
            Run(run_id),
            Job("default", f"{dag_id}.{TASK_ID_COMPLETED}"),
            PRODUCER,
            [OpenLineageDataset(DAG_NAMESPACE, 'extract_input1', {
                "dataSource": DataSourceDatasetFacet(
                    name='dummy_source_name',
                    uri='http://dummy/source/url'
                )
            })],
            []
        )
    )
Example No. 16
from marquez_airflow import DAG  # import assumed; snippet truncated in the source
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_customers',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads newly registered customers daily.')

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_new_food_deliveries',
                        external_dag_id='new_food_deliveries',
                        mode='reschedule',
                        dag=dag)

t2 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS customers (
      id         SERIAL PRIMARY KEY,
      created_at TIMESTAMP NOT NULL
      -- remaining columns truncated in the source
    );''',
                      dag=dag)
Example No. 17
def test_marquez_dag(job_id_mapping, mock_get_or_create_openlineage_client,
                     clear_db_airflow_dags, session=None):

    dag = DAG(
        DAG_ID,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )
    # (1) Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = mock_marquez_client
    run_id_completed = f"{DAG_RUN_ID}.{TASK_ID_COMPLETED}"
    run_id_failed = f"{DAG_RUN_ID}.{TASK_ID_FAILED}"
    # mock_uuid.side_effect = [run_id_completed, run_id_failed]

    # (2) Add task that will be marked as completed
    task_will_complete = DummyOperator(
        task_id=TASK_ID_COMPLETED,
        dag=dag
    )
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # (3) Add task that will be marked as failed
    task_will_fail = DummyOperator(
        task_id=TASK_ID_FAILED,
        dag=dag
    )
    failed_task_location = get_location(task_will_fail.dag.fileloc)

    # (4) Create DAG run and mark as running
    dagrun = dag.create_dagrun(
        run_id=DAG_RUN_ID,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    # Assert emit calls
    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    emit_calls = [
        mock.call(RunEvent(
            eventType=RunState.START,
            eventTime=mock.ANY,
            run=Run(run_id_completed, {"nominalTime": NominalTimeRunFacet(start_time, end_time)}),
            job=Job("default", f"{DAG_ID}.{TASK_ID_COMPLETED}", {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet("", completed_task_location)
            }),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        )),
        mock.call(RunEvent(
            eventType=RunState.START,
            eventTime=mock.ANY,
            run=Run(run_id_failed, {"nominalTime": NominalTimeRunFacet(start_time, end_time)}),
            job=Job("default", f"{DAG_ID}.{TASK_ID_FAILED}", {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet("", failed_task_location)
            }),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))
    ]
    log.info(
        f"{[name for name, args, kwargs in mock_marquez_client.mock_calls]}")
    mock_marquez_client.emit.assert_has_calls(emit_calls)

    # (5) Start task that will be marked as completed
    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    # (6) Start task that will be marked as failed
    ti1 = TaskInstance(task=task_will_fail, execution_date=DEFAULT_DATE)
    ti1.state = State.FAILED
    session.add(ti1)
    session.commit()

    job_id_mapping.pop.side_effect = [run_id_completed, run_id_failed]

    dag.handle_callback(dagrun, success=False, session=session)

    emit_calls += [
        mock.call(RunEvent(
            eventType=RunState.COMPLETE,
            eventTime=mock.ANY,
            run=Run(run_id_completed),
            job=Job("default", f"{DAG_ID}.{TASK_ID_COMPLETED}"),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        )),
        mock.call(RunEvent(
            eventType=RunState.FAIL,
            eventTime=mock.ANY,
            run=Run(run_id_failed),
            job=Job("default", f"{DAG_ID}.{TASK_ID_FAILED}"),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))
    ]
    mock_marquez_client.emit.assert_has_calls(emit_calls)
Example No. 18
from marquez_airflow import DAG  # import assumed; snippet truncated in the source
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_delivery_7_days',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads new deliveries for the week.')

# Wait for etl_orders_7_days DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_etl_orders_7_days',
                        external_dag_id='etl_orders_7_days',
                        mode='reschedule',
                        dag=dag)

# Wait for etl_restaurants DAG to complete
t2 = ExternalTaskSensor(task_id='wait_for_etl_restaurants',
                        external_dag_id='etl_restaurants',
                        mode='reschedule',
                        dag=dag)
Example No. 19
from marquez_airflow import DAG  # import assumed; snippet truncated in the source
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('orders_popular_day_of_week',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Determines the popular day of week orders are placed.')

# Wait for delivery_times_7_days DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_delivery_times_7_days',
                        external_dag_id='delivery_times_7_days',
                        mode='reschedule',
                        dag=dag)

t2 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS popular_orders_day_of_week (
      order_day_of_week VARCHAR(64) NOT NULL,
      order_placed_on   TIMESTAMP NOT NULL,
      orders_placed     INTEGER NOT NULL
    );''',
                      dag=dag)
Example No. 20
from marquez_airflow import DAG  # import assumed; snippet truncated in the source
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('delivery_times_7_days',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Determine weekly top delivery times by restaurant.')

# Wait for etl_delivery_7_days DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_etl_delivery_7_days',
                        external_dag_id='etl_delivery_7_days',
                        mode='reschedule',
                        dag=dag)

t2 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS top_delivery_times (
      order_id            INTEGER REFERENCES orders(id),
      order_placed_on     TIMESTAMP NOT NULL
      -- remaining columns truncated in the source
    );''',
                      dag=dag)
Example No. 21
from datetime import datetime

from airflow.operators.dummy_operator import DummyOperator
from marquez_airflow import DAG

dag = DAG(dag_id='test_dummy_dag',
          description='Test dummy DAG',
          schedule_interval='*/2 * * * *',
          start_date=datetime(2020, 1, 8),
          catchup=False,
          max_active_runs=1)

dummy_task = DummyOperator(task_id='test_dummy', dag=dag)
Example No. 22
# Imports assumed from usage; CONN_ID, SQL, and the schema fixtures referenced
# below are elided in the source.
from unittest import mock

from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.dates import days_ago
from marquez_airflow import DAG

DAG_ID = 'email_discounts'
DAG_OWNER = 'datascience'
DAG_DEFAULT_ARGS = {
    'owner': DAG_OWNER,
    'depends_on_past': False,
    'start_date': days_ago(7),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}
DAG_DESCRIPTION = \
    'Email discounts to customers that have experienced order delays daily'

DAG = dag = DAG(DAG_ID,
                schedule_interval='@weekly',
                default_args=DAG_DEFAULT_ARGS,
                description=DAG_DESCRIPTION)

TASK_ID = 'select'
TASK = PostgresOperator(task_id=TASK_ID,
                        postgres_conn_id=CONN_ID,
                        sql=SQL,
                        dag=DAG)


@mock.patch('marquez_airflow.extractors.postgres_extractor.'
            'PostgresExtractor._get_table_schemas')
def test_extract(mock_get_table_schemas):
    mock_get_table_schemas.side_effect = \
        [[DB_TABLE_SCHEMA], NO_DB_TABLE_SCHEMA]
Example No. 23
from marquez_airflow import DAG  # import assumed; snippet truncated in the source
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'etl_categories',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Loads newly added menu categories daily.'
)

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(
    task_id='wait_for_new_food_deliveries',
    external_dag_id='new_food_deliveries',
    mode='reschedule',
    dag=dag
)

# Wait for etl_menus DAG to complete
t2 = ExternalTaskSensor(
    task_id='wait_for_etl_menus',
    external_dag_id='etl_menus',  # implied by the comment above; snippet truncated here
    mode='reschedule',
    dag=dag
)
Example No. 24
from marquez_airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG('etl_order_status',
          schedule_interval='@hourly',
          catchup=False,
          default_args=default_args,
          description='Loads order status updates daily.')

t1 = PostgresOperator(task_id='if_not_exists',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    CREATE TABLE IF NOT EXISTS order_status (
      id              SERIAL PRIMARY KEY,
      transitioned_at TIMESTAMP NOT NULL,
      status          VARCHAR(64),
      order_id        INTEGER REFERENCES orders(id),
      customer_id     INTEGER REFERENCES customers(id),
      restaurant_id   INTEGER REFERENCES restaurants(id),
      driver_id       INTEGER REFERENCES drivers(id)
    );''',
                      dag=dag)  # closing argument restored; snippet truncated in the source
Example No. 25
from marquez_airflow import DAG  # import assumed; snippet truncated in the source
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'etl_orders',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Loads newly placed orders daily.'
)

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(
    task_id='wait_for_new_food_deliveries',
    external_dag_id='new_food_deliveries',
    mode='reschedule',
    dag=dag
)

# Wait for etl_menu_items DAG to complete
t2 = ExternalTaskSensor(
    task_id='wait_for_etl_menu_items',
    external_dag_id='etl_menu_items',  # implied by the comment above; snippet truncated here
    mode='reschedule',
    dag=dag
)
Example No. 26
from marquez_airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime
DAG_NAME = 'test_dag_v2'

default_args = {
    'marquez_location': 'github://my_dag_location',
    'marquez_input_urns': ["s3://great_data", "s3://not_so_good_data"],
    'marquez_output_urns': ["s3://amazing_data"],
    'owner': 'some dag developer',
    'depends_on_past': False,
    'start_date': datetime(2019, 1, 31),
}

dag = DAG(DAG_NAME,
          schedule_interval='*/10 * * * *',
          default_args=default_args,
          description="My awesome DAG")

run_this_1 = DummyOperator(task_id='run_this_1', dag=dag)
run_this_2 = DummyOperator(task_id='run_this_2', dag=dag)
run_this_2.set_upstream(run_this_1)