Example #1
    def test_execute_uses_the_emr_config_to_create_a_cluster_and_returns_job_id(self):
        with patch("boto3.client", self.boto3_client_mock):

            operator = EmrCreateJobFlowOperator(
                task_id="test_task", aws_conn_id="aws_default", emr_conn_id="emr_default"
            )

            self.assertEqual(operator.execute(None), "j-8989898989")
    def test_execute_uses_the_emr_config_to_create_a_cluster_and_returns_job_id(
            self):
        with patch('boto3.client', self.boto3_client_mock):

            operator = EmrCreateJobFlowOperator(task_id='test_task',
                                                aws_conn_id='aws_default',
                                                emr_conn_id='emr_default')

            self.assertEqual(operator.execute(None), 'j-8989898989')
    def test_execute_uses_the_emr_config_to_create_a_cluster_and_returns_job_id(self):
        with patch('boto3.client', self.boto3_client_mock):

            operator = EmrCreateJobFlowOperator(
                task_id='test_task',
                aws_conn_id='aws_default',
                emr_conn_id='emr_default'
            )

            self.assertEqual(operator.execute(None), 'j-8989898989')
    def setUp(self):
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrCreateJobFlowOperator(
            task_id='test_task',
            aws_conn_id='aws_default',
            emr_conn_id='emr_default',
            job_flow_overrides=self._config,
            region_name='ap-southeast-2',
            dag=DAG('test_dag_id', default_args=args))
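
The mock wiring behind self.boto3_client_mock is not included in this excerpt; a minimal sketch of what it presumably looks like, with the canned response shape inferred from the asserted job flow id:

from unittest.mock import MagicMock

# Canned run_job_flow response; the JobFlowId matches the value asserted above.
RUN_JOB_FLOW_SUCCESS_RETURN = {
    'ResponseMetadata': {'HTTPStatusCode': 200},
    'JobFlowId': 'j-8989898989',
}

# boto3.client(...) is patched so it hands back an EMR client whose run_job_flow
# call returns the canned response (stored as self.* attributes in the test class).
emr_client_mock = MagicMock()
emr_client_mock.run_job_flow.return_value = RUN_JOB_FLOW_SUCCESS_RETURN
boto3_client_mock = MagicMock(return_value=emr_client_mock)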
Example #5
def create_emr_job_flow(**kwargs):
    # Required params to be changed:
    # - aws_account_id
    # - aws_region
    # - ec2_key_pair
    # - ec2_subnet_id

    emr_settings = EmrSettings(
        aws_account_id="????????????",
        aws_region="us-east-1",
        ec2_key_pair="???????????",
        ec2_subnet_id="????????????",
        cluster_name=f"Ifood Data Architect Test | {kwargs['ds']}",
        master_instance_type="m5.4xlarge",
        master_instance_count=1,
        core_instance_type="m5.4xlarge",
        core_instance_count=1,
        core_instance_market="ON_DEMAND",
        task_instance_type="c5.2xlarge",
        task_instance_count=1,
        task_instance_market="SPOT",
        step_concurrency_level=4)

    job_flow_id = EmrCreateJobFlowOperator(
        task_id="create_cluster_emr_job_task",
        aws_conn_id="aws_default",
        region_name=emr_settings.aws_region,
        job_flow_overrides=emr_settings.crete_job_flow_overrides(),
        dag=dag,
    ).execute(kwargs)

    kwargs["ti"].xcom_push(key="job_flow_id", value=job_flow_id)
Example #6
    def _get_test_dag(self):
        with DAG(dag_id='test_dag', default_args=DEFAULT_DAG_ARGS) as dag:
            op1 = SparkSubmitOperator(task_id='op1')
            op2 = EmrAddStepsOperator(task_id='op2', job_flow_id='foo')
            op3 = S3ListOperator(task_id='op3', bucket='foo')
            op4 = EmrCreateJobFlowOperator(task_id='op4')
            op5 = TriggerDagRunOperator(task_id='op5', trigger_dag_id='foo')
            op6 = FileToWasbOperator(task_id='op6',
                                     container_name='foo',
                                     blob_name='foo',
                                     file_path='foo')
            op7 = EmailOperator(task_id='op7',
                                subject='foo',
                                to='foo',
                                html_content='foo')
            op8 = S3CopyObjectOperator(task_id='op8',
                                       dest_bucket_key='foo',
                                       source_bucket_key='foo')
            op9 = BranchPythonOperator(task_id='op9', python_callable=print)
            op10 = PythonOperator(task_id='op10', python_callable=range)

            op1 >> [op2, op3, op4]
            op2 >> [op5, op6]
            op6 >> [op7, op8, op9]
            op3 >> [op7, op8]
            op8 >> [op9, op10]

        return dag
Example #7
def run_emr_job(current_dag,
                cluster_name,
                task_gen_name,
                aws_connection,
                emr_connection,
                script_location,
                library_location,
                region='us-east-1'):
    """
    Creates the EMR cluster, runs the step and terminates the cluster when it is completed
    current_dag: DAG that is created by the user
    cluster_name: Name given to cluster by the user
    task_gen_name: A general name for the task being done. This is used to name different tasks
    aws_connection: Connection to AWS for account credentials
    emr_connection: Name of Airflow connection storing EMR configuration details
    script_location: S3 location of the script to be run
    library_location: S3 location of the library being used to run spark-submit
    region: AWS region where the cluster is being created
    """

    # Name of the new cluster being created
    job_flow_overrides = {'Name': cluster_name}

    # name of task creating the cluster
    create_cluster_task_name = task_gen_name + "_create_cluster"

    # Task that creates the cluster
    cluster_creator = EmrCreateJobFlowOperator(
        task_id=create_cluster_task_name,
        job_flow_overrides=job_flow_overrides,
        aws_conn_id=aws_connection,
        emr_conn_id=emr_connection,
        dag=current_dag)

    # script-runner.jar file location is region specific
    script_runner_jar = 's3://' + region + '.elasticmapreduce/libs/script-runner/script-runner.jar'

    # Step description
    step_definition = [{
        'Name': task_gen_name,
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': script_runner_jar,
            'Args': [script_location, library_location, '']
        }
    }]

    # Task that terminates the cluster
    cluster_remover = EmrTerminateJobFlowOperator(
        task_id=task_gen_name + "_remove_cluster",
        job_flow_id="{{ task_instance.xcom_pull('" + create_cluster_task_name +
        "', key='return_value') }}",
        aws_conn_id=aws_connection,
        dag=current_dag)

    # Add the step and step checker tasks
    add_step_to_emr(cluster_creator, task_gen_name, step_definition,
                    cluster_remover, create_cluster_task_name, aws_connection,
                    current_dag)
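
A usage sketch for the helper above; the DAG arguments, connection ids and S3 locations are placeholders rather than values from the original source:

dag = DAG('emr_helper_example_dag',
          default_args=DEFAULT_ARGS,
          schedule_interval=None)

run_emr_job(current_dag=dag,
            cluster_name='example-cluster',
            task_gen_name='daily_aggregation',
            aws_connection='aws_default',
            emr_connection='emr_default',
            script_location='s3://my-bucket/scripts/aggregate.py',
            library_location='s3://my-bucket/libs/helpers.zip',
            region='us-east-1')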
Example #8
def create_dag():
    with DAG(dag_id='emr_job_flow_manual_steps_dag',
             default_args=DEFAULT_DAG_ARGS,
             dagrun_timeout=timedelta(hours=2),
             max_active_runs=1,
             schedule_interval=None) as dag:

        create_cluster_op = EmrCreateJobFlowOperator(
            task_id='create_cluster',
            job_flow_overrides={'Name': CLUSTER_NAME},
            aws_conn_id=AWS_CONN_ID,
            emr_conn_id=EMR_CONN_ID)

        add_steps_to_cluster_op = TemplatedEmrAddStepsOperator(
            task_id='add_steps',
            job_flow_id=
            "{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID,
            steps=[{
                'Name': 'calculate_pi',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': 's3://psm-poc-dmp-temp/spark-examples.jar',
                    'Args': ['10'],
                    'MainClass': 'org.apache.spark.examples.SparkPi'
                }
            }])

        monitor_cluster_op = EmrJobFlowSensor(
            task_id='monitor_cluster',
            retries=0,
            aws_conn_id=AWS_CONN_ID,
            job_flow_id=
            '{{ task_instance.xcom_pull("create_cluster", key="return_value") }}',
            timeout=1800)

        monitor_step_op = EmrStepSensor(
            task_id='watch_step',
            job_flow_id=
            "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
            step_id=
            "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
            aws_conn_id=AWS_CONN_ID)

        terminate_cluster_op = EmrTerminateJobFlowOperator(
            task_id='remove_cluster',
            job_flow_id=
            "{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID)

        handle_failure_op = PythonOperator(
            task_id='handle_failure',
            python_callable=handle_failure_task,
            trigger_rule=trigger_rule.TriggerRule.ONE_FAILED)

        create_cluster_op >> monitor_cluster_op >> handle_failure_op
        create_cluster_op >> add_steps_to_cluster_op >> monitor_step_op >> terminate_cluster_op

    return dag
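
Airflow only picks up DAG objects that are reachable at module level, so a factory like create_dag above is presumably exported with something along these lines:

dag = create_dag()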
Example #9
def print_hello():
    print('Hello world!')
    try:
        print("eafds")
        create_emr = EmrCreateJobFlowOperator(task_id='create_job_flow', aws_conn_id='aws_default', dag=dag)
        return create_emr
    except AirflowException as ae:
        print(ae)
    def execute(self, context):
        if self.environment not in ["dev", "prod"]:
            logging.error(f"Can't recognise deployment environment '{self.environment}'. \n"
                          "Review the environment variable 'DEPLOYMENT_ENVIRONMENT'")
            raise ValueError(f"self.environment = os.environ['DEPLOYMENT_ENVIRONMENT'] --> {self.environment}")

        # check if development/local environment
        if self.environment == 'dev':
            logging.info("EMR cluster running from development environment")

            # get user aws name
            client = boto3.client('sts')
            username = client.get_caller_identity()['Arn'].split(":", 5)[5].split("/", 1)[1].lower()

            # Create zipped archive of the local airflow repository
            airflow_repo_path = '/home/vagrant/uk_dm_airflow'
            zip_local_path = '/tmp/latest'
            shutil.make_archive(base_name='/tmp/latest',
                                format='zip',
                                root_dir=airflow_repo_path)
            logging.info(f"Zipped file location: {zip_local_path}")

            # Upload zipped airflow repository to user's s3 bucket
            hook = S3_hook.S3Hook(aws_conn_id=self.aws_conn_id)
            hook.load_file(f"{zip_local_path}.zip", f'{username}/spark_local/latest.zip',
                           bucket_name='grp-ds-users',
                           replace=True)

            logging.info(f"Airflow repo uploaded to user bucket. User: '******'")

            # Upload local bootstrap file to user s3 buckets
            bootstrap_path = self.bootstrap_path
            hook.load_file(bootstrap_path, f'{username}/spark_local/bootstrap.sh',
                           bucket_name='grp-ds-users',
                           replace=True)

            self.override_emr_template(username)
            return EmrCreateJobFlowOperator.execute(self, context)

        # Create cluster and return jobflow_id
        # Output the edited EMR template.
        self.job_flow_overrides['BootstrapActions'][0]['ScriptBootstrapAction']['Args'] = [
            f'{self.environment}', self.install_packages_on_emr]
        logging.info(self.job_flow_overrides)
        return EmrCreateJobFlowOperator.execute(self, context)  # Returns the jobflow id
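
The execute method above belongs to a custom subclass of EmrCreateJobFlowOperator that is not shown in this excerpt. A skeleton consistent with the attributes the method relies on might look like the following; the class name, constructor signature and override_emr_template body are assumptions:

class EnvironmentAwareEmrCreateJobFlowOperator(EmrCreateJobFlowOperator):
    """Create an EMR cluster, adjusting the job flow template for dev or prod runs."""

    def __init__(self, bootstrap_path, install_packages_on_emr='', *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Attributes referenced by execute(); DEPLOYMENT_ENVIRONMENT drives the
        # dev/prod branch and the error raised when it holds anything else.
        self.environment = os.environ['DEPLOYMENT_ENVIRONMENT']
        self.bootstrap_path = bootstrap_path
        self.install_packages_on_emr = install_packages_on_emr

    def override_emr_template(self, username):
        # Assumed hook: point the job flow template at the user's dev bucket.
        ...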
Example #11
def get_create_job_flow_operator(
    job_flow_name,
    job_flow_overrides,
    aws_conn_id,
    emr_conn_id,
):
    cc_index = _get_cc_index_template()
    job_flow_overrides["Steps"][0]["HadoopJarStep"]["Args"][-1] = cc_index
    print(job_flow_overrides)
    return EmrCreateJobFlowOperator(
        task_id=_get_job_flow_creator_task_id(job_flow_name),
        job_flow_overrides=job_flow_overrides,
        aws_conn_id=aws_conn_id,
        emr_conn_id=emr_conn_id,
    )
    def setUp(self):
        configuration.load_test_config()
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrCreateJobFlowOperator(
            task_id='test_task',
            aws_conn_id='aws_default',
            emr_conn_id='emr_default',
            job_flow_overrides=self._config,
            dag=DAG('test_dag_id', default_args=args)
        )
Example #13
    def transform(self, subdag: nx.DiGraph,
                  parent_fragment: DAGFragment) -> DAGFragment:
        subdag_roots = [n for n, d in subdag.in_degree() if d == 0]
        first_root = subdag_roots[0].task_id

        task_id_prefix = '' if first_root in ['op2', 'op3'] else '2'

        TestSubDagTransformer1.op1 = SparkSubmitOperator(
            task_id=f"t{task_id_prefix}p1", dag=self.dag)
        TestSubDagTransformer1.op2 = EmrAddStepsOperator(
            task_id=f"t{task_id_prefix}p2", job_flow_id='foo', dag=self.dag)
        TestSubDagTransformer1.op3 = S3ListOperator(
            task_id=f"t{task_id_prefix}p3", bucket='foo', dag=self.dag)
        TestSubDagTransformer1.op4 = EmrCreateJobFlowOperator(
            task_id=f"t{task_id_prefix}p4", dag=self.dag)
        TestSubDagTransformer1.op5 = DummyOperator(
            task_id=f"t{task_id_prefix}p5", dag=self.dag)

        TestSubDagTransformer1.op1 >> [
            TestSubDagTransformer1.op2, TestSubDagTransformer1.op3
        ] >> TestSubDagTransformer1.op4

        return DAGFragment(
            [TestSubDagTransformer1.op1, TestSubDagTransformer1.op5])
Example #14
    file_content = content_object.get()["Body"].read().decode("utf-8")
    return json.loads(file_content)


with DAG(
        dag_id=DAG_ID,
        description="Run multiple Spark jobs with Amazon EMR",
        default_args=DEFAULT_ARGS,
        dagrun_timeout=timedelta(hours=2),
        start_date=days_ago(1),
        schedule_interval=None,
        tags=["emr", "spark", "pyspark"],
) as dag:
    cluster_creator = EmrCreateJobFlowOperator(
        task_id="create_job_flow",
        job_flow_overrides=get_object(
            "job_flow_overrides/job_flow_overrides.json", work_bucket),
    )

    step_adder = EmrAddStepsOperator(
        task_id="add_steps",
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id="aws_default",
        steps=get_object("emr_steps/emr_steps.json", work_bucket),
    )

    step_checker = EmrStepSensor(
        task_id="watch_step",
        job_flow_id=
        "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
class TestEmrCreateJobFlowOperator(unittest.TestCase):
    # When
    _config = {
        'Name': 'test_job_flow',
        'ReleaseLabel': '5.11.0',
        'Steps': [{
            'Name': 'test_step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    '/usr/lib/spark/bin/run-example',
                    '{{ macros.ds_add(ds, -1) }}',
                    '{{ ds }}'
                ]
            }
        }]
    }

    def setUp(self):
        configuration.load_test_config()
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrCreateJobFlowOperator(
            task_id='test_task',
            aws_conn_id='aws_default',
            emr_conn_id='emr_default',
            job_flow_overrides=self._config,
            dag=DAG('test_dag_id', default_args=args)
        )

    def test_init(self):
        self.assertEqual(self.operator.aws_conn_id, 'aws_default')
        self.assertEqual(self.operator.emr_conn_id, 'emr_default')

    def test_render_template(self):
        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        expected_args = {
            'Name': 'test_job_flow',
            'ReleaseLabel': '5.11.0',
            'Steps': [{
                'Name': 'test_step',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': [
                        '/usr/lib/spark/bin/run-example',
                        (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
                        DEFAULT_DATE.strftime("%Y-%m-%d"),
                    ]
                }
            }]
        }

        self.assertDictEqual(self.operator.job_flow_overrides, expected_args)

    def test_execute_returns_job_id(self):
        self.emr_client_mock.run_job_flow.return_value = RUN_JOB_FLOW_SUCCESS_RETURN

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertEqual(self.operator.execute(None), 'j-8989898989')
    'Name': 'calculate_pi',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['/usr/lib/spark/bin/run-example', 'SparkPi', '10']
    }
}]

JOB_FLOW_OVERRIDES = {'Name': 'PiCalc', 'Steps': SPARK_TEST_STEPS}

dag = DAG('emr_job_flow_automatic_steps_dag',
          default_args=DEFAULT_ARGS,
          dagrun_timeout=timedelta(hours=2),
          schedule_interval='0 3 * * *')

job_flow_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag)

job_sensor = EmrJobFlowSensor(
    task_id='check_job_flow',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag)

job_flow_creator.set_downstream(job_sensor)
Example #17

def create_job_flow_file(job_var):
    client = boto3.client('s3')
    client.put_object(Body=job_var, Bucket='vivek-mathew', Key='job-flow.txt')


dag = DAG(dag_id='Emr',
          schedule_interval=schedule,
          default_args=args,
          catchup=False)

# @TODO : Add a task for getting the latest AMI

create_cluster = EmrCreateJobFlowOperator(task_id="create_cluster",
                                          aws_conn_id='aws_default',
                                          emr_conn_id='test_emr',
                                          dag=dag)

create_job_flow_variable = PythonOperator(
    task_id="set_jobflow_var",
    python_callable=set_job_flow_var,
    op_args=[
        "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}"
    ],
    dag=dag)

create_job_flow_file = PythonOperator(
    task_id="create_job_flow_file",
    python_callable=create_job_flow_file,
    op_args=[
        "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}"
                                             'region': PARAMS['REGION'],
                                             'aws_access_key': PARAMS['aws_access_key'],
                                             'aws_secret': PARAMS['aws_secret'],
                                             'bucket': PARAMS['RAW_DATA_BUCKET'],
                                             'file_path': PARAMS['PYTHON_APPS']
                                         },
                                         dag=dag)

cluster_creator = EmrCreateJobFlowOperator(task_id='create_immigration_job',
                                           job_flow_overrides=JOB_FLOW,
                                           aws_conn_id='aws_default',
                                           emr_conn_id='emr_default',
                                           region_name=PARAMS['REGION'],
                                           dag=dag)

add_transform_step_task = EmrAddStepsOperatorV2(
    task_id='add_transform_step',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_immigration_job', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=TRANSFORM_IMMIGRATION_SAS_DATA,
    region_name=PARAMS['REGION'],
    dag=dag)

watch_immigration_transform_task = EmrStepSensor(
    task_id='watch_immigration_transform',
    job_flow_id=
Example #19
from airflow.utils.dates import days_ago

from emr_job_flow_with_sensor import EmrJobFlowWithSensor
from emr_step_with_sensor import EmrStepWithSensor

# the job flow step configuration as described here:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/emr.html#EMR.Client.run_job_flow
step_conf = {}
job_conf = {}

dag = DAG(
    dag_id='spark_job',
    default_args={
        'owner': 'airflow',
        'start_date': days_ago(1)
    }
)

job = EmrJobFlowWithSensor(
    task_id='job_and_retry',
    job_flow=EmrCreateJobFlowOperator(
        task_id='job',
        job_flow_overrides=job_conf
    ),
    sensor=EmrJobFlowSensor(
        task_id='sensor',
        job_flow_id=''
    ),
    dag=dag
)
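
The snippet above leaves step_conf and job_conf empty. For reference, a minimal shape consistent with the boto3 run_job_flow parameters linked in the comment might be (all values are placeholders):

step_conf = {
    'Name': 'spark_step',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['spark-submit', '--deploy-mode', 'cluster',
                 's3://my-bucket/jobs/job.py'],
    },
}

job_conf = {
    'Name': 'spark_job_cluster',
    'ReleaseLabel': 'emr-5.29.0',
    'Instances': {
        'MasterInstanceType': 'm5.xlarge',
        'SlaveInstanceType': 'm5.xlarge',
        'InstanceCount': 3,
        'KeepJobFlowAliveWhenNoSteps': True,
    },
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}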
Example #20
    task_id='transfer_brazil_data_file',
    python_callable=transfer_brazil_data_file,
    dag=dag)

# Verify whether the US data file exists
transfer_usa_data_file_task = PythonOperator(
    task_id='transfer_usa_data_file',
    python_callable=transfer_usa_data_file,
    op_kwargs={
        'bucket': 'covid19-lake',
        'prefix': 'enigma-aggregation/json/us_states'
    },
    dag=dag)

# Create an EMR JobFlow
spin_up_emr_cluster_task = EmrCreateJobFlowOperator(
    task_id='spin_up_emr_cluster', job_flow_overrides=emr_settings, dag=dag)

# Add steps to an existing EMR JobFlow
add_pipeline_to_emr_cluster_task = EmrAddStepsOperator(
    task_id='add_pipeline_to_emr_cluster',
    job_flow_id="{{task_instance.xcom_pull('spin_up_emr_cluster', " \
               +"  key='return_value')}}",
    steps=covid19_pipeline,
    dag=dag
)

# Wait step to be completed
watch_pipeline_step_task = EmrStepSensor(
    task_id='watch_pipeline_step',
    job_flow_id="{{task_instance.xcom_pull(" \
                "      'spin_up_emr_cluster'," \
Example #21
    logging.info('checking that data exists in s3')
    source_s3 = S3Hook(aws_conn_id='aws_default')
    keys = source_s3.list_keys(
        bucket_name='dendsparktut',  #TODO
        prefix='raw_data/')  #TODO
    logging.info('keys {}'.format(keys))


check_data_exists_task = PythonOperator(task_id='check_data_exists',
                                        python_callable=check_data_exists,
                                        provide_context=False,
                                        dag=dag)

create_job_flow_task = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    job_flow_overrides=default_emr_settings,
    dag=dag)

copy_python_script = EmrAddStepsOperator(
    task_id='copy_script',
    # XComs let tasks exchange messages
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=copy_script_step,
    dag=dag)

watch_prev_step_task1 = EmrStepSensor(
    task_id='watch_prev_step1',
    job_flow_id=
Example #22
# Load pyspark script file into S3
script_to_s3 = LoadFileIntoS3Operator(
    dag=dag,
    task_id="script_to_s3",
    airflow_folder=config['S3']['airflow_folder'],
    filename=config['S3']['local_script'],
    s3_key=config['S3']['s3_script'],
    bucket_name=config['S3']['BUCKET_NAME'],
    aws_credentials_id="aws_credentials")

# Create an EMR cluster
create_emr_cluster = EmrCreateJobFlowOperator(
    task_id="create_emr_cluster",
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id="aws_credentials",
    emr_conn_id="emr_default",
    dag=dag,
)

# Add your steps to the EMR cluster
step_adder = EmrAddStepsOperator(
    task_id="add_steps",
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_credentials",
    steps=SPARK_STEPS,
    params={
        "BUCKET_NAME": config['S3']['BUCKET_NAME'],
        "s3_script": config['S3']['s3_script'],
        "s3_clean": config['S3']['s3_clean'],
Example #23
                                  key="",
                                  load_sas=False,
                                  provide_context=True)

    # read local script files and transfer to S3
    load_script = StageToS3Operator(task_id="load_script_to_S3",
                                    mode="scripts",
                                    filename=local_scripts,
                                    bucket_name=bucket_name,
                                    prefix="scripts",
                                    key="")

    # Create an EMR cluster
    create_emr_cluster = EmrCreateJobFlowOperator(
        task_id="create_emr_cluster",
        job_flow_overrides=get_job_flow_overrides(job_flow_overrides),
        aws_conn_id="aws_default",
        emr_conn_id="emr_default")

    # Add steps to the EMR cluster
    # Step 1 = ETL Pipeline
    # Step 2 = Data Quality Test
    step_adder = EmrAddStepsOperator(
        task_id="add_steps",
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id="aws_default",
        steps=SPARK_STEPS,
        params={
            "bucket_name": bucket_name,
            "s3_etl_script": s3_etl_script,
Example #24
            },  #todo is it required?
            {
                "Name": "Hadoop"
            },
        ],
        "VisibleToAllUsers":
        True,  # todo not found
        "JobFlowRole":
        "EMR_EC2_DefaultRole",  # todo called InstanceProfile?
        "ServiceRole":
        "EMR_DefaultRole",
    }

    create_job_flow_task = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        aws_conn_id='aws_default',
        job_flow_overrides=default_emr_settings,
        dag=dag,
        region_name="us-east-1")

    extract_step_task = EmrAddStepsOperator(
        task_id='extract_step',  # 'add_step',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=[{
            "Name": "Step1 Preprocess",
            "ActionOnFailure": "CONTINUE",
            "HadoopJarStep": {
                "Jar":
                "command-runner.jar",  # todo https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html
                "Args": [
Example #25
    'email_on_retry': True
}

with DAG(
    dag_id='flight_delays_emr',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=1),
    schedule_interval='@once',
) as dag:

    start_operator = DummyOperator(task_id='begin_execution', dag=dag)
    end_operator = DummyOperator(task_id='stop_execution', dag=dag)

    with open('emr_job_flow.json', 'r') as fp:
        job_flow = json.load(fp)
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=job_flow,
        aws_conn_id='aws_credentials',
        emr_conn_id='emr_default'
    )

    job_sensor = EmrJobFlowSensor(
        task_id='check_job_flow',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_credentials'
    )

    # define the DAG structure, in terms of the created operators
    start_operator >> cluster_creator >> job_sensor >> end_operator
def handle_failure_task():
    raise AirflowException('Marking DAG as failed due to an upstream failure!')


with DAG(
    dag_id='example_emr_job_flow_dag',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    max_active_runs=1,
    schedule_interval=None,
    params=get_config('emr')
) as dag:

    create_cluster_op = EmrCreateJobFlowOperator(
        task_id='create_cluster',
        job_flow_overrides={'Name': 'PiCalc'},
        aws_conn_id=get_config('emr')['aws_conn_id'],
        emr_conn_id=get_config('emr')['emr_conn_id']
    )

    add_steps_to_cluster_op = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
        aws_conn_id=get_config('emr')['aws_conn_id'],
        steps=[
            {
                'Name': 'calculate_pi',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': '{{ params.hadoop_jar_path }}',
                    'Args': [
                        '10'
Example #27
check_emr_database = BranchPythonOperator(
    task_id='check_emr_database',
    provide_context=True,
    python_callable=check_emr_database,
    retries=1,
    dag=dag,
)

skip_emr_database_creation = DummyOperator(
    task_id="skip_emr_database_creation",
    trigger_rule=TriggerRule.NONE_FAILED,
    dag=dag,
)

create_emr_database_cluster = EmrCreateJobFlowOperator(
    task_id='create_emr_database_cluster',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    dag=dag)
create_emr_database_step = EmrAddStepsOperator(
    task_id='create_emr_database_step',
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_database_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    on_failure_callback=cleanup_emr_cluster_if_steps_fail,
    steps=CREATE_DATABASE,
)
create_emr_database_sensor = EmrStepSensor(
    task_id='create_emr_database_sensor',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_emr_database_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_database_step', key='return_value')[0] }}",
JOB_FLOW_OVERRIDES = {
    'Name': 'PiCalc',
    'KeepJobFlowAliveWhenNoSteps': True
}

dag = DAG(
    'emr_job_flow_manual_steps_dag',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    schedule_interval='0 3 * * *'
)

cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag
)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_TEST_STEPS,
    dag=dag
)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
class TestEmrCreateJobFlowOperator(unittest.TestCase):
    # When
    _config = {
        'Name': 'test_job_flow',
        'ReleaseLabel': '5.11.0',
        'Steps': [{
            'Name': 'test_step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    '/usr/lib/spark/bin/run-example',
                    '{{ macros.ds_add(ds, -1) }}',
                    '{{ ds }}'
                ]
            }
        }]
    }

    def setUp(self):
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrCreateJobFlowOperator(
            task_id='test_task',
            aws_conn_id='aws_default',
            emr_conn_id='emr_default',
            job_flow_overrides=self._config,
            region_name='ap-southeast-2',
            dag=DAG('test_dag_id', default_args=args))

    def test_init(self):
        self.assertEqual(self.operator.aws_conn_id, 'aws_default')
        self.assertEqual(self.operator.emr_conn_id, 'emr_default')
        self.assertEqual(self.operator.region_name, 'ap-southeast-2')

    def test_render_template(self):
        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        expected_args = {
            'Name': 'test_job_flow',
            'ReleaseLabel': '5.11.0',
            'Steps': [{
                'Name': 'test_step',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': [
                        '/usr/lib/spark/bin/run-example',
                        (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
                        DEFAULT_DATE.strftime("%Y-%m-%d"),
                    ]
                }
            }]
        }

        self.assertDictEqual(self.operator.job_flow_overrides, expected_args)

    def test_execute_returns_job_id(self):
        self.emr_client_mock.run_job_flow.return_value = RUN_JOB_FLOW_SUCCESS_RETURN

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertEqual(self.operator.execute(None), 'j-8989898989')
]

JOB_FLOW_OVERRIDES = {
    'Name': 'PiCalc',
    'Steps': SPARK_TEST_STEPS
}

dag = DAG(
    'emr_job_flow_automatic_steps_dag',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    schedule_interval='0 3 * * *'
)

job_flow_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag
)

job_sensor = EmrJobFlowSensor(
    task_id='check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag
)

job_flow_creator.set_downstream(job_sensor)
Example #31
start_operator = DummyOperator(task_id="Begin_execution",  dag=dag)

# Empty out the analytics bucket - otherwise we aggregate results from successive runs
bucket_name = BUCKET_NAME + "/" + S3_ANALYTICS_BUCKET
empty_bucket = BashOperator(
    task_id="empty_bucket",
    bash_command="aws s3 rm s3://{} --recursive".format(
        bucket_name),
    dag=dag,
)

# Create EMR instance
create_EMR_instance = EmrCreateJobFlowOperator(
    task_id="create_EMR_cluster",
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id="aws_default",
    emr_conn_id="emr_default",
    dag=dag
)

# Add your steps to the EMR cluster
EMR_step_adder = EmrAddStepsOperator(
    task_id="EMR_step_adder",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_EMR_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    steps=SPARK_STEPS,
    params={  # these params are used to provide the parameters for the steps JSON above
        "bucket_name": BUCKET_NAME,
        "s3_data": S3_DATA_BUCKET,
        "s3_script_bucket": S3_SCRIPT_BUCKET,
        "s3_output": S3_ANALYTICS_BUCKET,
Example #32
        SCRAPERS

    '''

    scrapers_dummy = DummyOperator(task_id='scrapers_dummy', dag=dag)
    '''

        STAGING LAYER PRE-PROCESSING
        (Spark consume Sources save to Parquet)

    '''

    manifold_emr_creator = EmrCreateJobFlowOperator(
        task_id='create_manifold_emr',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_credentials',
        emr_conn_id='emr_credentials',
    )

    manifold_emr_job_sensor = EmrJobFlowSensor(
        task_id='check_emr_completion',
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_manifold_emr', key='return_value') }}",
        aws_conn_id='aws_credentials',
    )
    '''

        STAGING TABLE CREATION
        
    '''
Example #33
dag = DAG('EMR_TEST_1',
          default_args=DEFAULT_ARGS,
          catchup=False,
          schedule_interval="0 1 * * *")

with dag:
    file_sensor = S3KeySensor(task_id='file_sensor',
                              poke_interval=600,
                              timeout=1000,
                              soft_fail=False,
                              bucket_name='ds-afarrell',
                              bucket_key='manybla.txt')

    create_cluster = EmrCreateJobFlowOperator(
        task_id='create_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_benchmarks_connection')

    run_some_pyspark = EmrAddStepsOperator(
        task_id='run_some_pyspark',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=EMR_STEP_1)

    output_file_sensor = S3KeySensor(
        task_id='output_file_sensor',
        poke_interval=600,
        timeout=1000,
        soft_fail=False,
Example #34
    delimiter='',
    aws_conn_id='aws_default')

processed_tweet_data_quality = S3DataQualityOperator(
    task_id='Processed_tweet_data_quality_check',
    dag=dag,
    bucket=bucket_etl,
    prefix='{}/{}'.format(tweet_stat_key, exec_date_partitioned),
    delimiter='',
    aws_conn_id='aws_default')

# Create EMR job flow and monitor it
job_flow_creator = EmrCreateJobFlowOperator(
    task_id='Create_emr_job_flow',
    dag=dag,
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
)

job_sensor = EmrJobFlowSensor(
    task_id='Check_emr_job_flow',
    dag=dag,
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='Create_emr_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
)

# Define task orders
raw_tweet_data_quality >> job_flow_creator
job_flow_creator >> job_sensor
            'spark-submit', '--deploy-mode', 'cluster',
            's3://<s3-bucket>/jobs/movies-analytics.py', '-i',
            's3://<s3-bucket>/data', '-o', 's3://<s3-bucket>/results'
        ]
    }
}]

JOB_FLOW_OVERRIDES = {"Name": "MoviesAnalytics"}

with DAG(dag_id='emr_job_movies_dag',
         default_args=DEFAULT_ARGS,
         dagrun_timeout=timedelta(hours=2),
         schedule_interval=None) as dag:
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_emr_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default')

    step_adder = EmrAddStepsOperator(
        task_id='movie_analytics_job',
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_TEST_STEPS)

    step_checker = EmrStepSensor(
        task_id='wait_for_analytics_completion',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
        step_id=
Example #36
            "Key": "Owner",
            "Value": "Data Analytics Team"
        },
    ],
}

with DAG(
        dag_id=DAG_ID,
        description="Run built-in Spark app on Amazon EMR",
        default_args=DEFAULT_ARGS,
        dagrun_timeout=timedelta(hours=2),
        start_date=days_ago(1),
        schedule_interval=None,
        tags=["emr", "spark"],
) as dag:
    cluster_creator = EmrCreateJobFlowOperator(
        task_id="create_job_flow", job_flow_overrides=JOB_FLOW_OVERRIDES)

    step_adder = EmrAddStepsOperator(
        task_id="add_steps",
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id="aws_default",
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id="watch_step",
        job_flow_id=
        "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id=
        "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",