Esempio n. 1
0
def test_emr_log_location_for_cluster(emr_cluster_config, mock_s3_bucket):
    """Check the log location reported for a cluster, with and without a ``LogUri``."""
    pipeline_context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)

    # Cluster started from the unmodified fixture config.
    logged_cluster_id = runner.run_job_flow(pipeline_context.log, emr_cluster_config)
    location = runner.log_location_for_cluster(logged_cluster_id)
    assert location == (mock_s3_bucket.name, "elasticmapreduce/")

    # Should raise when the log URI is missing; work on a copy so the
    # fixture's own dict is left untouched.
    config_without_log_uri = copy.deepcopy(emr_cluster_config)
    del config_without_log_uri["LogUri"]
    unlogged_cluster_id = runner.run_job_flow(pipeline_context.log, config_without_log_uri)

    with pytest.raises(EmrError) as raised:
        runner.log_location_for_cluster(unlogged_cluster_id)

    message = str(raised.value)
    assert "Log URI not specified, cannot retrieve step execution logs" in message
Esempio n. 2
0
def test_emr_retrieve_logs(emr_cluster_config, mock_s3_bucket):
    """Check that step logs uploaded after a short delay are retrieved and decompressed."""
    step_id = "s-123456123456"

    pipeline_context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    cluster_id = runner.run_job_flow(pipeline_context.log, emr_cluster_config)
    location = runner.log_location_for_cluster(cluster_id)
    assert location == (mock_s3_bucket.name, "elasticmapreduce/")

    def upload_logs_after_delay():
        # Delay the upload so the logs appear only after retrieval has started.
        time.sleep(0.5)

        buffer = io.BytesIO()
        with gzip.GzipFile(fileobj=buffer, mode="w") as gz:
            gz.write("some log".encode())

        prefix = "elasticmapreduce/{cluster_id}/steps/{step_id}".format(
            cluster_id=cluster_id, step_id=step_id
        )

        # The same gzipped payload is stored under both log names.
        for log_name in ["stdout.gz", "stderr.gz"]:
            key = prefix + "/" + log_name
            mock_s3_bucket.Object(key).put(Body=buffer.getvalue())  # pylint: disable=no-member

    uploader = threading.Thread(target=upload_logs_after_delay)
    uploader.daemon = True
    uploader.start()

    logs = runner.retrieve_logs_for_step_id(pipeline_context.log, cluster_id, step_id)
    stdout_log, stderr_log = logs
    assert stdout_log == "some log"
    assert stderr_log == "some log"
Esempio n. 3
0
def test_emr_describe_cluster(emr_cluster_config):
    """Check the name and state returned by ``describe_cluster`` for a new cluster."""
    pipeline_context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    new_cluster_id = runner.run_job_flow(pipeline_context.log, emr_cluster_config)

    description = runner.describe_cluster(new_cluster_id)
    details = description['Cluster']
    assert details['Name'] == 'test-emr'

    reported_state = EmrClusterState(details['Status']['State'])
    assert reported_state == EmrClusterState.Waiting
Esempio n. 4
0
def test_emr_describe_cluster(emr_cluster_config):
    """Check the name and state returned by ``describe_cluster`` for a new cluster."""
    pipeline_context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    new_cluster_id = runner.run_job_flow(pipeline_context.log, emr_cluster_config)

    description = runner.describe_cluster(new_cluster_id)
    details = description["Cluster"]
    assert details["Name"] == "test-emr"

    reported_state = EmrClusterState(details["Status"]["State"])
    assert reported_state == EmrClusterState.Waiting
Esempio n. 5
0
def test_emr_retrieve_logs(emr_cluster_config):
    """Check that step logs uploaded after a short delay are retrieved and decompressed."""
    step_id = 's-123456123456'

    pipeline_context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    cluster_id = runner.run_job_flow(pipeline_context.log, emr_cluster_config)
    location = runner.log_location_for_cluster(cluster_id)
    assert location == ('emr-cluster-logs', 'elasticmapreduce/')

    # The bucket is created only after the location check above.
    s3 = boto3.resource('s3', region_name=REGION)
    s3.create_bucket(Bucket='emr-cluster-logs')  # pylint: disable=no-member

    def upload_logs_after_delay():
        # Delay the upload so the logs appear only after retrieval has started.
        time.sleep(0.5)

        buffer = io.BytesIO()
        with gzip.GzipFile(fileobj=buffer, mode='w') as gz:
            gz.write('some log'.encode())

        prefix = 'elasticmapreduce/{cluster_id}/steps/{step_id}'.format(
            cluster_id=cluster_id, step_id=step_id
        )

        # The same gzipped payload is stored under both log names.
        for log_name in ['stdout.gz', 'stderr.gz']:
            key = prefix + '/' + log_name
            s3.Object('emr-cluster-logs', key).put(  # pylint: disable=no-member
                Body=buffer.getvalue()
            )

    uploader = threading.Thread(target=upload_logs_after_delay)
    uploader.daemon = True
    uploader.start()

    logs = runner.retrieve_logs_for_step_id(pipeline_context.log, cluster_id, step_id)
    stdout_log, stderr_log = logs
    assert stdout_log == 'some log'
    assert stderr_log == 'some log'
Esempio n. 6
0
def test_emr_log_location_for_cluster(emr_cluster_config):
    """Check the log location reported for a cluster, with and without a ``LogUri``."""
    pipeline_context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)

    # Cluster started from the unmodified fixture config.
    logged_cluster_id = runner.run_job_flow(pipeline_context, emr_cluster_config)
    location = runner.log_location_for_cluster(logged_cluster_id)
    assert location == ('emr-cluster-logs', 'elasticmapreduce/')

    # Should raise when the log URI is missing; work on a copy so the
    # fixture's own dict is left untouched.
    config_without_log_uri = copy.deepcopy(emr_cluster_config)
    del config_without_log_uri['LogUri']
    unlogged_cluster_id = runner.run_job_flow(pipeline_context, config_without_log_uri)

    with pytest.raises(EmrError) as raised:
        runner.log_location_for_cluster(unlogged_cluster_id)

    message = str(raised.value)
    assert 'Log URI not specified, cannot retrieve step execution logs' in message
Esempio n. 7
0
def test_emr_id_from_name(emr_cluster_config):
    """Check cluster-id lookup by name, for an existing and a missing cluster."""
    pipeline_context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    launched_id = runner.run_job_flow(pipeline_context.log, emr_cluster_config)

    # Looking up the cluster by name returns the id from run_job_flow.
    found_id = runner.cluster_id_from_name("test-emr")
    assert found_id == launched_id

    # An unknown name is reported as an EmrError.
    with pytest.raises(EmrError) as raised:
        runner.cluster_id_from_name("cluster-doesnt-exist")

    message = str(raised.value)
    assert "cluster cluster-doesnt-exist not found in region us-west-1" in message
Esempio n. 8
0
def test_is_emr_step_complete(emr_cluster_config):
    """Check ``is_emr_step_complete`` against a scripted sequence of step states.

    ``describe_step`` is patched to return PENDING, RUNNING, COMPLETED and
    FAILED responses in that order; the first two must report "not complete",
    the third "complete", and the fourth must raise ``EmrError``.
    """
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)

    cluster_id = emr.run_job_flow(context.log, emr_cluster_config)

    step_name = "test_step"
    step_cmd = ["ls", "/"]
    step_ids = emr.add_job_flow_steps(
        context.log, cluster_id,
        [emr.construct_step_dict_for_command(step_name, step_cmd)])

    def get_step_dict(step_id, step_state):
        # Build a describe_step-style response for the given step and state.
        return {
            "Step": {
                "Id": step_id,
                "Name": step_name,
                "Config": {
                    "Jar": "command-runner.jar",
                    "Properties": {},
                    "Args": step_cmd
                },
                "ActionOnFailure": "CONTINUE",
                "Status": {
                    "State": step_state,
                    "StateChangeReason": {
                        "Message": "everything is hosed"
                    },
                    "Timeline": {
                        "StartDateTime": _boto3_now()
                    },
                },
            },
        }

    emr_step_id = step_ids[0]
    # One canned response per is_emr_step_complete call below, in order.
    describe_step_returns = [
        get_step_dict(emr_step_id, "PENDING"),
        get_step_dict(emr_step_id, "RUNNING"),
        get_step_dict(emr_step_id, "COMPLETED"),
        get_step_dict(emr_step_id, "FAILED"),
    ]
    with mock.patch.object(EmrJobRunner,
                           "describe_step",
                           side_effect=describe_step_returns):
        assert not emr.is_emr_step_complete(context.log, cluster_id,
                                            emr_step_id)
        assert not emr.is_emr_step_complete(context.log, cluster_id,
                                            emr_step_id)
        assert emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)

        with pytest.raises(EmrError) as exc_info:
            emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)

        # Must sit outside the pytest.raises block: statements after the
        # raising call inside the block are never executed.
        assert "step failed" in str(exc_info.value)
Esempio n. 9
0
def test_is_emr_step_complete(emr_cluster_config):
    """Check ``is_emr_step_complete`` against a scripted sequence of step states.

    ``describe_step`` is patched to return PENDING, RUNNING, COMPLETED and
    FAILED responses in that order; the first two must report "not complete",
    the third "complete", and the fourth must raise ``EmrError``.
    """
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)

    cluster_id = emr.run_job_flow(context.log, emr_cluster_config)

    step_name = 'test_step'
    step_cmd = ['ls', '/']
    step_ids = emr.add_job_flow_steps(
        context.log, cluster_id,
        [emr.construct_step_dict_for_command(step_name, step_cmd)])

    def get_step_dict(step_id, step_state):
        # Build a describe_step-style response for the given step and state.
        return {
            'Step': {
                'Id': step_id,
                'Name': step_name,
                'Config': {
                    'Jar': 'command-runner.jar',
                    'Properties': {},
                    'Args': step_cmd
                },
                'ActionOnFailure': 'CONTINUE',
                'Status': {
                    'State': step_state,
                    'StateChangeReason': {
                        'Message': 'everything is hosed'
                    },
                    'Timeline': {
                        'StartDateTime': _boto3_now()
                    },
                },
            },
        }

    emr_step_id = step_ids[0]
    # One canned response per is_emr_step_complete call below, in order.
    describe_step_returns = [
        get_step_dict(emr_step_id, 'PENDING'),
        get_step_dict(emr_step_id, 'RUNNING'),
        get_step_dict(emr_step_id, 'COMPLETED'),
        get_step_dict(emr_step_id, 'FAILED'),
    ]
    with mock.patch.object(EmrJobRunner,
                           'describe_step',
                           side_effect=describe_step_returns):
        assert not emr.is_emr_step_complete(context.log, cluster_id,
                                            emr_step_id)
        assert not emr.is_emr_step_complete(context.log, cluster_id,
                                            emr_step_id)
        assert emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)

        with pytest.raises(EmrError) as exc_info:
            emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)

        # Must sit outside the pytest.raises block: statements after the
        # raising call inside the block are never executed.
        assert 'step failed' in str(exc_info.value)
Esempio n. 10
0
def test_emr_add_tags_and_describe_cluster(emr_cluster_config):
    """Check that tags added to a cluster show up in its description."""
    pipeline_context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    new_cluster_id = runner.run_job_flow(pipeline_context.log, emr_cluster_config)

    runner.add_tags(pipeline_context.log, {"foobar": "v1", "baz": "123"}, new_cluster_id)

    description = runner.describe_cluster(new_cluster_id)
    reported_tags = description["Cluster"]["Tags"]

    # Each added tag is expected as a Key/Value pair in the description.
    for expected_tag in (
        {"Key": "baz", "Value": "123"},
        {"Key": "foobar", "Value": "v1"},
    ):
        assert expected_tag in reported_tags
Esempio n. 11
0
def test_emr_add_tags_and_describe_cluster(emr_cluster_config):
    """Check that tags added to a cluster show up in its description."""
    pipeline_context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    new_cluster_id = runner.run_job_flow(pipeline_context.log, emr_cluster_config)

    runner.add_tags(pipeline_context.log, {'foobar': 'v1', 'baz': '123'}, new_cluster_id)

    description = runner.describe_cluster(new_cluster_id)
    reported_tags = description['Cluster']['Tags']

    # Each added tag is expected as a Key/Value pair in the description.
    for expected_tag in (
        {'Key': 'baz', 'Value': '123'},
        {'Key': 'foobar', 'Value': 'v1'},
    ):
        assert expected_tag in reported_tags
Esempio n. 12
0
def test_pyspark_emr(mock_wait):
    """Run ``example_pipe`` in ``prod`` mode against an EMR cluster created in-test.

    Args:
        mock_wait: Mock injected by a patch decorator that is not visible in
            this snippet; the test only checks that it was invoked.
    """
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {
                'AvailabilityZone': 'us-west-1a'
            },
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context, run_job_flow_args)

    result = execute_pipeline(
        example_pipe,
        environment_dict={
            'solids': {
                'blah': {
                    'config': {
                        'foo': 'a string',
                        'bar': 123
                    }
                }
            },
            'resources': {
                'pyspark': {
                    'config': {
                        'pipeline_file': __file__,
                        'pipeline_fn_name': 'example_pipe',
                        'cluster_id': cluster_id,
                        'staging_bucket': 'dagster-scratch-80542c2',
                        'region_name': 'us-west-1',
                    }
                }
            },
        },
        run_config=RunConfig(mode='prod'),
    )
    assert result.success
    # `called_once` is not part of the Mock API: on older Pythons it yields a
    # truthy child Mock (so the assertion could never fail) and on 3.12+ it
    # raises AttributeError. `called` is the real "was invoked" flag.
    assert mock_wait.called
Esempio n. 13
0
def test_emr_wait_for_step(emr_cluster_config):
    """Check ``wait_for_emr_steps_to_complete`` for a completing and a failing step.

    ``describe_step`` is patched with a stub that reports PENDING on the first
    call, RUNNING on the second, and a configurable final state afterwards.
    """
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)

    cluster_id = emr.run_job_flow(context.log, emr_cluster_config)

    step_name = 'test_step'
    step_cmd = ['ls', '/']
    step_ids = emr.add_job_flow_steps(
        context.log, cluster_id, [emr.construct_step_dict_for_command(step_name, step_cmd)]
    )

    def get_step_dict(step_id, step_state):
        # Build a describe_step-style response for the given step and state.
        return {
            'Step': {
                'Id': step_id,
                'Name': step_name,
                'Config': {'Jar': 'command-runner.jar', 'Properties': {}, 'Args': step_cmd},
                'ActionOnFailure': 'CONTINUE',
                'Status': {
                    'State': step_state,
                    'StateChangeReason': {'Message': 'everything is hosed'},
                    'Timeline': {'StartDateTime': _boto3_now()},
                },
            },
        }

    # Mutable state shared with the stub so each scenario can reset the call
    # counter and choose the state reported from the third call onwards.
    calls = {'num_calls': 0, 'final_state': 'COMPLETED'}

    def new_describe_step(_, cluster_id, step_id):  # pylint: disable=unused-argument
        calls['num_calls'] += 1

        if calls['num_calls'] == 1:
            return get_step_dict(step_id, 'PENDING')
        if calls['num_calls'] == 2:
            return get_step_dict(step_id, 'RUNNING')
        return get_step_dict(step_id, calls['final_state'])

    # Scenario 1: the step eventually completes, so waiting returns normally.
    with mock.patch.object(EmrJobRunner, 'describe_step', new=new_describe_step):
        emr.wait_for_emr_steps_to_complete(context.log, cluster_id, step_ids)

    # Scenario 2: the step eventually fails, so waiting raises EmrError.
    calls['num_calls'] = 0
    calls['final_state'] = 'FAILED'
    with pytest.raises(EmrError) as exc_info:
        with mock.patch.object(EmrJobRunner, 'describe_step', new=new_describe_step):
            emr.wait_for_emr_steps_to_complete(context.log, cluster_id, step_ids)
    assert 'step failed' in str(exc_info.value)
Esempio n. 14
0
def test_pyspark_emr(mock_is_emr_step_complete, mock_read_events,
                     mock_s3_bucket):
    """Run the do-nothing pipeline in ``prod`` mode using an EMR cluster created in-test."""
    # Events from a local run of the same pipeline are fed back through the
    # mocked event reader.
    local_result = execute_pipeline(
        reconstructable(define_do_nothing_pipe), mode="local")
    mock_read_events.return_value = local_result.events_by_step_key["do_nothing_solid"]

    instances = {
        "InstanceCount": 1,
        "KeepJobFlowAliveWhenNoSteps": True,
        "MasterInstanceType": "c3.medium",
        "Placement": {
            "AvailabilityZone": "us-west-1a"
        },
        "SlaveInstanceType": "c3.xlarge",
    }
    run_job_flow_args = dict(
        Instances=instances,
        JobFlowRole="EMR_EC2_DefaultRole",
        LogUri="s3://{bucket}/log".format(bucket=mock_s3_bucket.name),
        Name="cluster",
        ServiceRole="EMR_DefaultRole",
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    runner = EmrJobRunner(region="us-west-1")
    pipeline_context = create_test_pipeline_execution_context()
    cluster_id = runner.run_job_flow(pipeline_context.log, run_job_flow_args)

    # Overlay the test-specific cluster and bucket on the shared base config.
    launcher_config = deep_merge_dicts(
        BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG,
        {
            "cluster_id": cluster_id,
            "staging_bucket": mock_s3_bucket.name
        },
    )
    prod_run_config = {
        "resources": {
            "pyspark_step_launcher": {
                "config": launcher_config,
            }
        },
    }

    result = execute_pipeline(
        pipeline=reconstructable(define_do_nothing_pipe),
        mode="prod",
        run_config=prod_run_config,
    )
    assert result.success
    assert mock_is_emr_step_complete.called
Esempio n. 15
0
def test_pyspark_emr(mock_wait, mock_get_step_events):
    """Run the do-nothing pipeline in ``prod`` mode using an EMR cluster created in-test.

    Args:
        mock_wait: Mock injected by a patch decorator that is not visible in
            this snippet; the test only checks that it was invoked.
        mock_get_step_events: Likewise injected from outside this snippet and
            only checked for having been invoked.
    """
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {
                'AvailabilityZone': 'us-west-1a'
            },
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context.log, run_job_flow_args)

    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_do_nothing_pipe).build_pipeline_definition()
    result = execute_pipeline(
        pipeline=pipeline_def,
        mode='prod',
        environment_dict={
            'resources': {
                'pyspark_step_launcher': {
                    'config':
                    deep_merge_dicts(BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG,
                                     {'cluster_id': cluster_id}),
                }
            },
        },
    )
    assert result.success
    # `called_once` is not part of the Mock API: on older Pythons it yields a
    # truthy child Mock (so the assertions could never fail) and on 3.12+ it
    # raises AttributeError. `called` is the real "was invoked" flag.
    assert mock_wait.called
    assert mock_get_step_events.called
Esempio n. 16
0
def test_emr_create_cluster(emr_cluster_config):
    """Check that ``run_job_flow`` returns an id starting with ``j-``."""
    pipeline_context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    new_cluster_id = runner.run_job_flow(pipeline_context, emr_cluster_config)

    has_job_flow_prefix = new_cluster_id.startswith('j-')
    assert has_job_flow_prefix