Example 1
    def test_different_target_states(self):
        self.mock_emr_client.describe_cluster.side_effect = [
            DESCRIBE_CLUSTER_STARTING_RETURN,  # return False
            DESCRIBE_CLUSTER_BOOTSTRAPPING_RETURN,  # return False
            DESCRIBE_CLUSTER_RUNNING_RETURN,  # return True
            DESCRIBE_CLUSTER_WAITING_RETURN,  # will not be used
            DESCRIBE_CLUSTER_TERMINATED_RETURN,  # will not be used
            DESCRIBE_CLUSTER_TERMINATED_WITH_ERRORS_RETURN,  # will not be used
        ]
        with patch('boto3.session.Session', self.boto3_session_mock):
            operator = EmrJobFlowSensor(
                task_id='test_task',
                poke_interval=0,
                job_flow_id='j-8989898989',
                aws_conn_id='aws_default',
                target_states=['RUNNING', 'WAITING'],
            )

            operator.execute(None)

            # make sure it was called three times
            assert self.mock_emr_client.describe_cluster.call_count == 3

            # make sure it was called with the job_flow_id
            calls = [unittest.mock.call(ClusterId='j-8989898989')]
            self.mock_emr_client.describe_cluster.assert_has_calls(calls)
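The DESCRIBE_CLUSTER_*_RETURN constants above are canned boto3 describe_cluster responses. A minimal sketch of one of them, assuming only the standard response shape (the cluster name and metadata values are illustrative):

DESCRIBE_CLUSTER_RUNNING_RETURN = {
    'Cluster': {
        'Id': 'j-8989898989',
        'Name': 'PiCalc',  # illustrative
        'Status': {'State': 'RUNNING'},
    },
    'ResponseMetadata': {'HTTPStatusCode': 200},
}

The sensor reads Cluster.Status.State from each response and compares it against its target and failed states, so swapping that one string is enough to drive every branch exercised by these tests.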
Example 2
    def test_execute_calls_with_the_job_flow_id_until_it_reaches_a_target_state(
            self):
        self.mock_emr_client.describe_cluster.side_effect = [
            DESCRIBE_CLUSTER_STARTING_RETURN,
            DESCRIBE_CLUSTER_RUNNING_RETURN,
            DESCRIBE_CLUSTER_TERMINATED_RETURN,
        ]
        with patch('boto3.session.Session', self.boto3_session_mock):
            operator = EmrJobFlowSensor(task_id='test_task',
                                        poke_interval=0,
                                        job_flow_id='j-8989898989',
                                        aws_conn_id='aws_default')

            operator.execute(None)

            # make sure it was called three times
            assert self.mock_emr_client.describe_cluster.call_count == 3

            # make sure it was called with the job_flow_id
            calls = [unittest.mock.call(ClusterId='j-8989898989')]
            self.mock_emr_client.describe_cluster.assert_has_calls(calls)
Example 3
    def test_execute_calls_with_the_job_flow_id_until_it_reaches_failed_state_with_exception(
            self):
        self.mock_emr_client.describe_cluster.side_effect = [
            DESCRIBE_CLUSTER_RUNNING_RETURN,
            DESCRIBE_CLUSTER_TERMINATED_WITH_ERRORS_RETURN,
        ]
        with patch('boto3.session.Session', self.boto3_session_mock):
            operator = EmrJobFlowSensor(task_id='test_task',
                                        poke_interval=0,
                                        job_flow_id='j-8989898989',
                                        aws_conn_id='aws_default')

            with pytest.raises(AirflowException):
                operator.execute(None)

            # make sure it was called twice
            assert self.mock_emr_client.describe_cluster.call_count == 2

            # make sure it was called with the job_flow_id
            self.mock_emr_client.describe_cluster.assert_called_with(
                ClusterId='j-8989898989')
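All three tests rely on fixtures created in setUp. A minimal sketch of the mock wiring they assume (the attribute names match the tests; everything else is boilerplate):

import unittest
from unittest.mock import MagicMock, patch

import pytest
from airflow.exceptions import AirflowException


class TestEmrJobFlowSensor(unittest.TestCase):
    def setUp(self):
        # fake EMR client whose describe_cluster the tests program via side_effect
        self.mock_emr_client = MagicMock()
        # fake boto3 Session that hands back that client
        mock_session = MagicMock()
        mock_session.client.return_value = self.mock_emr_client
        self.boto3_session_mock = MagicMock(return_value=mock_session)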
Example 4

with DAG(
        dag_id='emr_job_flow_automatic_steps_dag',
        default_args={
            'owner': 'airflow',
            'depends_on_past': False,
            'email': ['*****@*****.**'],
            'email_on_failure': False,
            'email_on_retry': False,
        },
        dagrun_timeout=timedelta(hours=2),
        start_date=days_ago(2),
        schedule_interval='0 3 * * *',
        tags=['example'],
) as dag:

    # [START howto_operator_emr_automatic_steps_tasks]
    job_flow_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
    )

    job_sensor = EmrJobFlowSensor(
        task_id='check_job_flow',
        job_flow_id=job_flow_creator.output,
        aws_conn_id='aws_default',
    )
    # [END howto_operator_emr_automatic_steps_tasks]

    # Task dependency created via `XComArgs`:
    #   job_flow_creator >> job_sensor
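JOB_FLOW_OVERRIDES is defined earlier in the same example file. A representative version, assuming the stock Airflow example values (a single SPOT master node running a SparkPi step; the step list is named SPARK_TEST_STEPS or SPARK_STEPS depending on the Airflow version):

SPARK_STEPS = [
    {
        'Name': 'calculate_pi',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': ['/usr/lib/spark/bin/run-example', 'SparkPi', '10'],
        },
    }
]

JOB_FLOW_OVERRIDES = {
    'Name': 'PiCalc',
    'ReleaseLabel': 'emr-5.29.0',
    'Instances': {
        'InstanceGroups': [
            {
                'Name': 'Master node',
                'Market': 'SPOT',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm1.medium',
                'InstanceCount': 1,
            }
        ],
        'KeepJobFlowAliveWhenNoSteps': False,
        'TerminationProtected': False,
    },
    'Steps': SPARK_STEPS,
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}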
Example 5

        'KeepJobFlowAliveWhenNoSteps': False,
        'TerminationProtected': False,
    },
    'Steps': SPARK_TEST_STEPS,
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}

with DAG(
        dag_id='emr_job_flow_automatic_steps_dag',
        default_args=DEFAULT_ARGS,
        dagrun_timeout=timedelta(hours=2),
        schedule_interval='0 3 * * *',
        tags=['example'],
) as dag:

    job_flow_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default')

    job_sensor = EmrJobFlowSensor(
        task_id='check_job_flow',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default')

    job_flow_creator >> job_sensor
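For completeness, the imports these example DAGs assume (paths shown are from the amazon provider package; newer providers consolidate them under airflow.providers.amazon.aws.operators.emr and ...sensors.emr, and very old installs used airflow.contrib.* equivalents):

from datetime import timedelta

from airflow import DAG
from airflow.providers.amazon.aws.operators.emr_create_job_flow import EmrCreateJobFlowOperator
from airflow.providers.amazon.aws.sensors.emr_job_flow import EmrJobFlowSensor
from airflow.utils.dates import days_ago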
Example 6
    '''
        STAGING LAYER PRE-PROCESSING
        (Spark consumes the sources and saves them to Parquet)
    '''

    manifold_emr_creator = EmrCreateJobFlowOperator(
        task_id='create_manifold_emr',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_credentials',
        emr_conn_id='emr_credentials',
    )

    manifold_emr_job_sensor = EmrJobFlowSensor(
        task_id='check_emr_completion',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_manifold_emr', key='return_value') }}",
        aws_conn_id='aws_credentials',
    )
    '''

        STAGING TABLE CREATION
        
    '''

    staging_table_creation: List[PostgresOperator] = []
    create_staging_definitions: Dict[str, str] = sql_queries_staging.create_query_destinition

    # create the staging tables if they do not exist, else truncate
    for object_name, query in create_staging_definitions.items():
        staging_task: PostgresOperator = PostgresOperator(
            task_id=f'create_staging_{object_name}',  # task id naming assumed
            postgres_conn_id='postgres_default',      # conn id assumed
            sql=query,
        )
        staging_table_creation.append(staging_task)
Example 7
        start_date=datetime.datetime(2020, 1, 1),
        schedule_interval=None,
        tags=['udacity', 'etl'],
        default_view="graph",
) as dag:
    # generate dag documentation
    dag.doc_md = __doc__

    create_cluster = EmrCreateJobFlowOperator(
        dag=dag,
        task_id="create_cluster",
        job_flow_overrides=job_flow_overrides,
        aws_conn_id=aws_conn_id)
    wait_cluster_completion = EmrJobFlowSensor(
        task_id='wait_cluster_completion',
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        target_states=["RUNNING", "WAITING"],
        dag=dag)
    terminate_cluster = EmrTerminateJobFlowOperator(
        task_id="terminate_cluster",
        trigger_rule="all_done",
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        dag=dag)

    with TaskGroup("run_immigration_mapping") as run_immigration_mapping:
        add_step, wait_step = emr_step_task_group(
            script_name='immigration_mapping',
            cluster_id=cluster_id,
            aws_conn_id=aws_conn_id,
            dag=dag)
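emr_step_task_group is a project-local helper, not an Airflow API. A plausible sketch, assuming the usual EmrAddStepsOperator + EmrStepSensor pairing (the task ids, S3 path, and step configuration are all assumptions):

from airflow.providers.amazon.aws.operators.emr_add_steps import EmrAddStepsOperator
from airflow.providers.amazon.aws.sensors.emr_step import EmrStepSensor


def emr_step_task_group(script_name, cluster_id, aws_conn_id, dag):
    add_step = EmrAddStepsOperator(
        task_id=f'add_{script_name}_step',
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        steps=[{
            'Name': script_name,
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                # hypothetical script location
                'Args': ['spark-submit', f's3://my-bucket/scripts/{script_name}.py'],
            },
        }],
        dag=dag,
    )
    wait_step = EmrStepSensor(
        task_id=f'wait_{script_name}_step',
        job_flow_id=cluster_id,
        # EmrAddStepsOperator pushes the new step ids to XCom; watch the first one
        step_id=f"{{{{ task_instance.xcom_pull(task_ids='add_{script_name}_step')[0] }}}}",
        aws_conn_id=aws_conn_id,
        dag=dag,
    )
    add_step >> wait_step
    return add_step, wait_step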
Example 8
# - NLP Named Entity Recognition
# - Transform to star schema
aws_emr_etl_operator = AWSEMROperator(
    task_id="create_EMR_cluster_and_execute_ETL",
    dag=dag,
    conn_id=AWS_CONN_ID,
    redshift_conn_id=AWS_REDSHIFT_CONN_ID,
    time_zone=local_tz,
    cluster_identifier=f"news-nlp-emr-{datetime.now(local_tz).strftime('%Y-%m-%d-%H-%M')}",
)
# Wait for the ETL process to finish
emr_etl_sensor = EmrJobFlowSensor(
    task_id="sense_emr_etl",
    dag=dag,
    job_flow_id="{{ task_instance.xcom_pull('create_EMR_cluster_and_execute_ETL', key='return_value') }}",
    aws_conn_id=AWS_CONN_ID,
)

# Create a Redshift cluster
create_redshift_cluster = AWSRedshiftOperator(
    task_id="create_redshift_cluster",
    dag=dag,
    conn_id=AWS_CONN_ID,
    redshift_conn_id=AWS_REDSHIFT_CONN_ID,
    time_zone=local_tz,
    cluster_identifier=f"news-nlp-redshift-{datetime.now(local_tz).strftime('%Y-%m-%d-%H-%M')}",
)
# Wait for Redshift cluster to be ready
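AWSEMROperator is custom to this project. The only contract the EmrJobFlowSensor above depends on is that the task's return_value XCom is the EMR JobFlowId. A stripped-down, hypothetical sketch of that contract (the real operator also submits the ETL steps and takes more arguments):

from airflow.models import BaseOperator
from airflow.providers.amazon.aws.hooks.emr import EmrHook


class AWSEMROperator(BaseOperator):
    def __init__(self, conn_id='aws_default', cluster_identifier=None, **kwargs):
        super().__init__(**kwargs)
        self.conn_id = conn_id
        self.cluster_identifier = cluster_identifier

    def execute(self, context):
        hook = EmrHook(aws_conn_id=self.conn_id)
        # placeholder overrides; the real operator would pass its full
        # cluster and steps configuration here
        response = hook.create_job_flow({'Name': self.cluster_identifier})
        # whatever execute() returns becomes the return_value XCom,
        # which the EmrJobFlowSensor pulls as job_flow_id
        return response['JobFlowId']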
Example 9
        'TerminationProtected': False,
    },
    'Steps': SPARK_STEPS,
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}
# [END howto_operator_emr_automatic_steps_config]

with DAG(
        dag_id='emr_job_flow_automatic_steps_dag',
        dagrun_timeout=timedelta(hours=2),
        start_date=datetime(2021, 1, 1),
        schedule_interval='0 3 * * *',
        catchup=False,
        tags=['example'],
) as dag:

    # [START howto_operator_emr_automatic_steps_tasks]
    job_flow_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
    )

    job_sensor = EmrJobFlowSensor(task_id='check_job_flow',
                                  job_flow_id=job_flow_creator.output)
    # [END howto_operator_emr_automatic_steps_tasks]

    # Task dependency created via `XComArgs`:
    #   job_flow_creator >> job_sensor
Example 10

# create EMR cluster

instantiate_emr = CustomEmrCreateJobFlowOperator(
    task_id='Wrangling_on_EMR',
    aws_conn_id='aws_credentials',
    emr_conn_id='emr_default',
    dag=dag,
)

s_job_flow_id = "{{ task_instance.xcom_pull(task_ids='Wrangling_on_EMR', "
s_job_flow_id += "key='return_value') }}"
monitor_emr = EmrJobFlowSensor(
    task_id='Monitor_cluster',
    job_flow_id=s_job_flow_id,
    aws_conn_id='aws_credentials',
    dag=dag,
)


# update Athena

athena_task_id = "Put_quotes_on_athena"
d_sqls = {
    'subkey': 'quotes',
    'createdb': SqlQueries.quotes_create_database,
    'drop': SqlQueries.quotes_drop_table,
    'create': SqlQueries.quotes_create_table,
    'load': SqlQueries.quotes_rapair,
    'load2': SqlQueries.quotes_load
}
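The d_sqls queries are presumably executed downstream against Athena. One hedged way to wire that up with the provider's Athena operator (the database name and output_location are assumptions, and 'subkey' is skipped as metadata; newer providers rename the class to AthenaOperator):

from airflow.providers.amazon.aws.operators.athena import AWSAthenaOperator

previous = monitor_emr
for name, query in d_sqls.items():
    if name == 'subkey':
        continue  # metadata, not a SQL statement
    athena_step = AWSAthenaOperator(
        task_id=f'{athena_task_id}_{name}',
        query=query,
        database='quotes',                         # assumed
        output_location='s3://my-bucket/athena/',  # assumed
        aws_conn_id='aws_credentials',
        dag=dag,
    )
    previous >> athena_step
    previous = athena_step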