Beispiel #1
0
        def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
                 args=self._DEFAULT_STEP_ARGS,
                 state='COMPLETED',
                 create_hours_ago=None,
                 start_hours_ago=None,
                 end_hours_ago=None,
                 name='Streaming Step',
                 action_on_failure='TERMINATE_CLUSTER',
                 **kwargs):
            # Build a mock step made of a "config" part (jar, args, failure
            # action) and a "status" part (state plus timeline).
            # NOTE: *name* and any extra **kwargs are accepted but not used.
            step_config = MockEmrObject(
                actiononfailure=action_on_failure,
                args=[MockEmrObject(value=arg) for arg in args],
                jar=jar,
            )

            # each *_hours_ago offset is passed through ago() to get the
            # corresponding timeline field
            step_timeline = MockEmrObject(
                creationdatetime=ago(hours=create_hours_ago),
                enddatetime=ago(hours=end_hours_ago),
                startdatetime=ago(hours=start_hours_ago),
            )
            step_status = MockEmrObject(state=state, timeline=step_timeline)

            return MockEmrObject(config=step_config, status=step_status)
Beispiel #2
0
 def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
          args=[
              '-mapper', 'my_job.py --mapper', '-reducer',
              'my_job.py --reducer'
          ],
          state='COMPLETE',
          create_hours_ago=None,
          start_hours_ago=None,
          end_hours_ago=None,
          name='Streaming Step',
          action_on_failure='TERMINATE_CLUSTER',
          **kwargs):
     # Build a flat mock step object; extra **kwargs become attributes.
     # Each "hours ago" offset is converted to an ISO 8601 timestamp
     # relative to self.now. Only truthy offsets are converted, so None
     # (and 0) leave the corresponding field unset.
     offsets = (
         ('creationdatetime', create_hours_ago),
         ('startdatetime', start_hours_ago),
         ('enddatetime', end_hours_ago),
     )
     for field, hours_ago in offsets:
         if hours_ago:
             moment = self.now - timedelta(hours=hours_ago)
             kwargs[field] = to_iso8601(moment)

     # wrap each raw argument so it can be read back via .value
     kwargs['args'] = [MockEmrObject(value=raw) for raw in args]

     return MockEmrObject(
         jar=jar,
         state=state,
         name=name,
         action_on_failure=action_on_failure,
         **kwargs)
Beispiel #3
0
    def test_now_is_automatically_set(self):
        # cluster created "right now"; _est_time_to_hour() is called
        # without an explicit *now*
        created = to_iso8601(datetime.utcnow())
        timeline = MockEmrObject(creationdatetime=created)
        cs = MockEmrObject(status=MockEmrObject(timeline=timeline))

        remaining = _est_time_to_hour(cs)

        # result should be in the range (59 minutes, 60 minutes]
        self.assertLessEqual(remaining, timedelta(minutes=60))
        self.assertGreater(remaining, timedelta(minutes=59))
Beispiel #4
0
    def test_first_arg_doesnt_start_with_pool(self):
        # one bootstrap action with two args, the first of which is not
        # of the form 'pool-...'
        action_args = [
            MockEmrObject(value='cowsay'),
            MockEmrObject(value='mrjob'),
        ]
        action = MockEmrObject(args=action_args)
        jf = MockEmrObject(bootstrapactions=[action])

        result = pool_hash_and_name(jf)

        self.assertEqual(result, (None, None))
Beispiel #5
0
    def test_clock_skew(self):
        # make sure something reasonable happens if now is before
        # the start time (here: one second before creation)
        created = datetime(2010, 6, 6, 4, 26)
        timeline = MockEmrObject(creationdatetime=to_iso8601(created))
        cs = MockEmrObject(status=MockEmrObject(timeline=timeline))

        remaining = _est_time_to_hour(cs, now=datetime(2010, 6, 6, 4, 25, 59))

        self.assertEqual(remaining, timedelta(seconds=1))
Beispiel #6
0
    def test_first_arg_doesnt_start_with_pool(self):
        # 'master' action with two args, the first of which is not of
        # the form 'pool-...'
        master_args = [
            MockEmrObject(value='cowsay'),
            MockEmrObject(value='mrjob'),
        ]
        actions = [MockEmrObject(args=master_args, name='master')]

        result = _pool_hash_and_name(actions)

        self.assertEqual(result, (None, None))
Beispiel #7
0
    def test_too_few_args(self):
        # 'master' action that only carries the 'pool-<hash>' argument
        only_arg = MockEmrObject(
            value='pool-0123456789abcdef0123456789abcdef')
        master_action = MockEmrObject(args=[only_arg], name='master')
        actions = [master_action]

        result = _pool_hash_and_name(actions)

        self.assertEqual(result, (None, None))
Beispiel #8
0
    def test_too_few_args(self):
        # 'master' bootstrap action that only carries the 'pool-<hash>'
        # argument
        only_arg = MockEmrObject(
            value='pool-0123456789abcdef0123456789abcdef')
        master_action = MockEmrObject(args=[only_arg], name='master')
        jf = MockEmrObject(bootstrapactions=[master_action])

        result = pool_hash_and_name(jf)

        self.assertEqual(result, (None, None))
Beispiel #9
0
    def test_pooled_job_flow(self):
        # single bootstrap action (no name given) whose args are
        # 'pool-<hash>' followed by the pool name
        hash_arg = MockEmrObject(
            value='pool-0123456789abcdef0123456789abcdef')
        name_arg = MockEmrObject(value='reflecting')
        action = MockEmrObject(args=[hash_arg, name_arg])
        jf = MockEmrObject(bootstrapactions=[action])

        expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
        self.assertEqual(pool_hash_and_name(jf), expected)
Beispiel #10
0
    def test_bootstrap_action_isnt_named_master(self):
        # args look like pool args, but the action is named 'apprentice'
        # rather than 'master'
        hash_arg = MockEmrObject(
            value='pool-0123456789abcdef0123456789abcdef')
        name_arg = MockEmrObject(value='reflecting')
        action = MockEmrObject(args=[hash_arg, name_arg], name='apprentice')
        jf = MockEmrObject(bootstrapactions=[action])

        result = pool_hash_and_name(jf)

        self.assertEqual(result, (None, None))
Beispiel #11
0
    def test_too_many_args(self):
        # one bootstrap action carrying three args
        action_args = [
            MockEmrObject(value='cowsay'),
            MockEmrObject(value='-b'),
            MockEmrObject(value='mrjob'),
        ]
        action = MockEmrObject(args=action_args)
        jf = MockEmrObject(bootstrapactions=[action])

        result = pool_hash_and_name(jf)

        self.assertEqual(result, (None, None))
Beispiel #12
0
    def test_too_many_args(self):
        # 'master' action carrying three args
        master_args = [
            MockEmrObject(value='cowsay'),
            MockEmrObject(value='-b'),
            MockEmrObject(value='mrjob'),
        ]
        actions = [MockEmrObject(args=master_args, name='master')]

        result = _pool_hash_and_name(actions)

        self.assertEqual(result, (None, None))
Beispiel #13
0
    def test_pooled_cluster(self):
        # 'master' action whose args are 'pool-<hash>' followed by the
        # pool name
        hash_arg = MockEmrObject(
            value='pool-0123456789abcdef0123456789abcdef')
        name_arg = MockEmrObject(value='reflecting')
        actions = [MockEmrObject(args=[hash_arg, name_arg], name='master')]

        expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
        self.assertEqual(_pool_hash_and_name(actions), expected)
Beispiel #14
0
    def test_pooled_job_flow_with_other_bootstrap_actions(self):
        # two unrelated, argument-less actions come before the 'master'
        # action that carries the pool args
        master_action = MockEmrObject(
            args=[
                MockEmrObject(
                    value='pool-0123456789abcdef0123456789abcdef'),
                MockEmrObject(value='reflecting'),
            ],
            name='master')
        actions = [
            MockEmrObject(args=[], name='action 0'),
            MockEmrObject(args=[], name='action 1'),
            master_action,
        ]

        expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
        self.assertEqual(_pool_hash_and_name(actions), expected)
Beispiel #15
0
    def test_not_yet_started(self):
        # cluster created at 04:00 with no other timeline fields
        created = to_iso8601(datetime(2010, 6, 6, 4))
        timeline = MockEmrObject(creationdatetime=created)
        cs = MockEmrObject(status=MockEmrObject(timeline=timeline))

        # (value of *now*, expected time remaining), checked in order
        cases = [
            (datetime(2010, 6, 6, 4, 35), timedelta(minutes=25)),
            (datetime(2010, 6, 6, 5, 20), timedelta(minutes=40)),
            (datetime(2010, 6, 6, 4), timedelta(minutes=60)),
        ]

        for now, expected in cases:
            self.assertEqual(_est_time_to_hour(cs, now=now), expected)
    def test_job_flow_with_no_fields(self):
        # this shouldn't happen in practice; just a robustness check
        job_flow = MockEmrObject()

        summary = job_flow_to_full_summary(job_flow)

        # every descriptive/timestamp field is None ...
        expected = dict.fromkeys([
            'created', 'end', 'id', 'label', 'name',
            'owner', 'pool', 'ready', 'start', 'state',
        ])
        # ... and every numeric/usage field is zero or empty
        expected.update({
            'nih': 0.0,
            'nih_bbnu': 0.0,
            'nih_billed': 0.0,
            'nih_used': 0.0,
            'num_steps': 0,
            'ran': timedelta(0),
            'usage': [],
        })

        self.assertEqual(summary, expected)
    def test_job_flow_that_was_terminated_before_starting(self):
        # job flow with creation and end times but no start time
        job_flow_fields = dict(
            creationdatetime='2010-06-05T23:59:00Z',
            enddatetime='2010-06-06T00:01:00Z',
            jobflowid='j-ISFORJOURNEY',
            name='mr_exciting.woo.20100605.235850.000000',
            normalizedinstancehours='0',
            state='TERMINATED',
        )
        job_flow = MockEmrObject(**job_flow_fields)

        summary = job_flow_to_full_summary(
            job_flow, now=datetime(2010, 6, 6, 0, 30))

        expected = {
            # identity
            'id': 'j-ISFORJOURNEY',
            'name': 'mr_exciting.woo.20100605.235850.000000',
            'label': 'mr_exciting',
            'owner': 'woo',
            'pool': None,
            'state': 'TERMINATED',
            # timestamps
            'created': datetime(2010, 6, 5, 23, 59),
            'ready': None,
            'start': None,
            'end': datetime(2010, 6, 6, 0, 1),
            'ran': timedelta(0),
            # usage
            'nih': 0.0,
            'nih_bbnu': 0.0,
            'nih_billed': 0.0,
            'nih_used': 0.0,
            'num_steps': 0,
            'usage': [],
        }
        self.assertEqual(summary, expected)
    def test_job_flow_that_hasnt_yet_started(self):
        # job flow in STARTING state with only a creation time
        job_flow_fields = dict(
            creationdatetime='2010-06-05T23:59:00Z',
            jobflowid='j-ISFORJUMP',
            name='mr_exciting.woo.20100605.235850.000000',
            normalizedinstancehours='10',
            state='STARTING',
        )
        job_flow = MockEmrObject(**job_flow_fields)

        summary = job_flow_to_full_summary(
            job_flow, now=datetime(2010, 6, 6, 0, 30))

        expected = {
            # identity
            'id': 'j-ISFORJUMP',
            'name': 'mr_exciting.woo.20100605.235850.000000',
            'label': 'mr_exciting',
            'owner': 'woo',
            'pool': None,
            'state': 'STARTING',
            # timestamps
            'created': datetime(2010, 6, 5, 23, 59),
            'ready': None,
            'start': None,
            'end': None,
            'ran': timedelta(0),
            # usage
            'nih': 10.0,
            'nih_bbnu': 0.0,
            'nih_billed': 0.0,
            'nih_used': 0.0,
            'num_steps': 0,
            'usage': [],
        }
        self.assertEqual(summary, expected)
Beispiel #19
0
    def test_now_is_automatically_set(self):
        # in both cases est_time_to_hour() is called without *now* and
        # the result should be in the range (59 minutes, 60 minutes]

        # job flow created just now, with no start time
        jf = MockEmrObject(creationdatetime=to_iso8601(datetime.utcnow()))

        remaining = est_time_to_hour(jf)

        self.assertLessEqual(remaining, timedelta(minutes=60))
        self.assertGreater(remaining, timedelta(minutes=59))

        # job flow created a minute ago and started just now
        created = to_iso8601(datetime.utcnow() - timedelta(minutes=1))
        started = to_iso8601(datetime.utcnow())
        jf2 = MockEmrObject(creationdatetime=created, startdatetime=started)

        remaining = est_time_to_hour(jf2)

        self.assertLessEqual(remaining, timedelta(minutes=60))
        self.assertGreater(remaining, timedelta(minutes=59))
Beispiel #20
0
    def test_tags(self):
        # run create_cluster_main() with two --tag switches and check
        # the tags recorded on the resulting mock cluster
        self.add_mock_s3_data({'walrus': {}})

        argv = [
            '--quiet', '--no-conf',
            '--cloud-fs-sync-secs', '0',
            '--cloud-tmp-dir', 's3://walrus/tmp',
            '--tag', 'tag_one=foo',
            '--tag', 'tag_two=bar',
        ]
        self.monkey_patch_argv(*argv)
        self.monkey_patch_stdout()

        create_cluster_main()

        # exactly one cluster should have been created
        cluster_ids = list(self.mock_emr_clusters.keys())
        self.assertEqual(cluster_ids, ['j-MOCKCLUSTER0'])

        actual_tags = self.mock_emr_clusters['j-MOCKCLUSTER0'].tags
        expected_tags = [
            MockEmrObject(key='tag_one', value='foo'),
            MockEmrObject(key='tag_two', value='bar'),
        ]
        self.assertEqual(actual_tags, expected_tags)
Beispiel #21
0
    def test_started(self):
        # job flow created at 04:00 and started at 04:26
        created = to_iso8601(datetime(2010, 6, 6, 4))
        started = to_iso8601(datetime(2010, 6, 6, 4, 26))
        jf = MockEmrObject(creationdatetime=created, startdatetime=started)

        # (value of *now*, expected time remaining), checked in order
        cases = [
            (datetime(2010, 6, 6, 4, 35), timedelta(minutes=51)),
            (datetime(2010, 6, 6, 5, 20), timedelta(minutes=6)),
            (datetime(2010, 6, 6, 6, 26), timedelta(minutes=60)),
        ]

        for now, expected in cases:
            self.assertEqual(est_time_to_hour(jf, now=now), expected)
Beispiel #22
0
    def test_emr_tags(self):
        # run create_job_flow_main() with two --emr-tag switches and
        # check the tags recorded on the resulting mock cluster
        self.add_mock_s3_data({'walrus': {}})

        argv = [
            '--quiet', '--no-conf',
            '--s3-sync-wait-time', '0',
            '--s3-scratch-uri', 's3://walrus/tmp',
            '--emr-tag', 'tag_one=foo',
            '--emr-tag', 'tag_two=bar',
        ]
        self.monkey_patch_argv(*argv)
        self.monkey_patch_stdout()

        create_job_flow_main()

        # exactly one cluster should have been created
        cluster_ids = list(self.mock_emr_clusters.keys())
        self.assertEqual(cluster_ids, ['j-MOCKCLUSTER0'])

        actual_tags = self.mock_emr_clusters['j-MOCKCLUSTER0'].tags
        expected_tags = [
            MockEmrObject(key='tag_one', value='foo'),
            MockEmrObject(key='tag_two', value='bar'),
        ]
        self.assertEqual(actual_tags, expected_tags)
    def test_job_flows_to_stats(self):
        NUM_JOB_FLOWS = 30

        # mock jobflows; each one's instance count is its own index, so
        # every jobflow has a different instance count
        job_flows = [
            MockEmrObject(jobflowid='j-%04d' % idx, instancecount=idx)
            for idx in range(NUM_JOB_FLOWS)
        ]

        stats = job_flows_to_stats(job_flows)

        expected_total = sum(range(NUM_JOB_FLOWS))
        self.assertEqual(stats['num_jobflows'], NUM_JOB_FLOWS)
        self.assertEqual(stats['total_instance_count'], expected_total)
Beispiel #24
0
    def test_pooled_job_flow_with_max_hours_idle(self):
        # max hours idle is added AFTER the master bootstrap script,
        # which was a problem when we just look at the last action
        master_action = MockEmrObject(
            args=[
                MockEmrObject(
                    value='pool-0123456789abcdef0123456789abcdef'),
                MockEmrObject(value='reflecting'),
            ],
            name='master')
        idle_timeout_action = MockEmrObject(
            args=[
                MockEmrObject(value='900'),
                MockEmrObject(value='300'),
            ],
            name='idle timeout')
        jf = MockEmrObject(
            bootstrapactions=[master_action, idle_timeout_action])

        expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
        self.assertEqual(pool_hash_and_name(jf), expected)
Beispiel #25
0
    def test_can_get_all_job_flows(self):
        now = datetime.datetime.utcnow()

        # need more job flows than a single describe_jobflows() returns
        NUM_JOB_FLOWS = 2222
        assert_gt(NUM_JOB_FLOWS, DEFAULT_MAX_JOB_FLOWS_RETURNED)

        # job flow number i was created i minutes before *now*
        expected_ids = ['j-%04d' % i for i in range(NUM_JOB_FLOWS)]
        for minutes_old, jfid in enumerate(expected_ids):
            created = now - datetime.timedelta(minutes=minutes_old)
            self.mock_emr_job_flows[jfid] = MockEmrObject(
                creationdatetime=to_iso8601(created),
                jobflowid=jfid)

        emr_conn = EMRJobRunner().make_emr_conn()

        # ordinary describe_jobflows() hits the limit on number of job flows
        some_jfs = emr_conn.describe_jobflows()
        assert_equal(len(some_jfs), DEFAULT_MAX_JOB_FLOWS_RETURNED)

        # describe_all_job_flows() should return every one of them
        all_jfs = describe_all_job_flows(emr_conn)
        assert_equal(len(all_jfs), NUM_JOB_FLOWS)

        returned_ids = sorted(jf.jobflowid for jf in all_jfs)
        assert_equal(returned_ids, expected_ids)
    def create_fake_job_flows(self):
        """Populate ``self.mock_emr_job_flows`` with mock job flows.

        Sets ``self.now`` (current UTC time, truncated to the second) and
        expresses every timestamp relative to it. Once all job flows are
        registered, each one is given a ``jobflowid`` equal to its
        dictionary key and a human-readable ``name`` derived from that key.
        """
        self.now = datetime.utcnow().replace(microsecond=0)

        # empty job
        self.mock_emr_job_flows['j-EMPTY'] = MockEmrObject(
            creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
            state='WAITING',
        )

        # Build a step object easily
        # also make it respond to .args()
        def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
                 args=[
                     '-mapper', 'my_job.py --mapper', '-reducer',
                     'my_job.py --reducer'
                 ],
                 state='COMPLETE',
                 start_time_back=None,
                 end_time_back=None,
                 name='Streaming Step',
                 action_on_failure='TERMINATE_JOB_FLOW',
                 **kwargs):
            # *start_time_back* / *end_time_back* are hours before
            # self.now; falsy values leave the timestamp unset.
            # (the list default for *args* is only read, never mutated)
            if start_time_back:
                kwargs['startdatetime'] = to_iso8601(self.now - timedelta(
                    hours=start_time_back))
            if end_time_back:
                kwargs['enddatetime'] = to_iso8601(self.now - timedelta(
                    hours=end_time_back))
            kwargs['args'] = [MockEmrObject(value=a) for a in args]
            return MockEmrObject(jar=jar,
                                 state=state,
                                 name=name,
                                 action_on_failure=action_on_failure,
                                 **kwargs)

        # currently running job
        self.mock_emr_job_flows['j-CURRENTLY_RUNNING'] = MockEmrObject(
            creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
            startdatetime=to_iso8601(self.now -
                                     timedelta(hours=4, minutes=15)),
            state='RUNNING',
            steps=[step(start_time_back=4, state='RUNNING')],
        )

        # finished job flow
        self.mock_emr_job_flows['j-DONE'] = MockEmrObject(
            creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
            enddatetime=to_iso8601(self.now - timedelta(hours=5)),
            startdatetime=to_iso8601(self.now - timedelta(hours=9)),
            state='COMPLETE',
            steps=[step(start_time_back=8, end_time_back=6)],
        )

        # idle job flow
        self.mock_emr_job_flows['j-DONE_AND_IDLE'] = MockEmrObject(
            creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
            startdatetime=to_iso8601(self.now - timedelta(hours=5)),
            state='WAITING',
            steps=[step(start_time_back=4, end_time_back=2)],
        )

        # hive job flow (looks completed but isn't)
        self.mock_emr_job_flows['j-HIVE'] = MockEmrObject(
            creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
            startdatetime=to_iso8601(self.now - timedelta(hours=5)),
            state='WAITING',
            steps=[
                step(
                    start_time_back=4,
                    end_time_back=4,
                    jar=('s3://us-east-1.elasticmapreduce/libs/script-runner/'
                         'script-runner.jar'),
                    args=[],
                )
            ],
        )

        # custom hadoop streaming jar
        self.mock_emr_job_flows['j-CUSTOM_DONE_AND_IDLE'] = MockEmrObject(
            creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
            startdatetime=to_iso8601(self.now - timedelta(hours=5)),
            state='WAITING',
            steps=[
                step(
                    start_time_back=4,
                    end_time_back=4,
                    jar=('s3://my_bucket/tmp/somejob/files/'
                         'oddjob-0.0.3-SNAPSHOT-standalone.jar'),
                    args=[],
                )
            ],
        )

        mock_conn = MockEmrConnection()

        # hadoop debugging without any other steps
        jobflow_id = mock_conn.run_jobflow(name='j-DEBUG_ONLY',
                                           log_uri='',
                                           enable_debugging=True)
        jf = mock_conn.describe_jobflow(jobflow_id)
        self.mock_emr_job_flows['j-DEBUG_ONLY'] = jf
        jf.state = 'WAITING'
        jf.startdatetime = to_iso8601(self.now - timedelta(hours=2))
        jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=2))

        # hadoop debugging + actual job
        # same jar as hive but with different args
        jobflow_id = mock_conn.run_jobflow(name='j-HADOOP_DEBUGGING',
                                           log_uri='',
                                           enable_debugging=True,
                                           steps=[step()])
        jf = mock_conn.describe_jobflow(jobflow_id)
        self.mock_emr_job_flows['j-HADOOP_DEBUGGING'] = jf
        jf.state = 'WAITING'
        jf.creationdatetime = to_iso8601(self.now - timedelta(hours=6))
        jf.startdatetime = to_iso8601(self.now - timedelta(hours=5))
        # Need to reset times manually because mockboto resets them
        jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=5))
        jf.steps[1].startdatetime = to_iso8601(self.now - timedelta(hours=4))
        jf.steps[1].enddatetime = to_iso8601(self.now - timedelta(hours=2))

        # skip cancelled steps
        self.mock_emr_job_flows['j-IDLE_AND_FAILED'] = MockEmrObject(
            state='WAITING',
            creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
            startdatetime=to_iso8601(self.now - timedelta(hours=5)),
            steps=[
                step(start_time_back=4, end_time_back=3, state='FAILED'),
                step(state='CANCELLED', )
            ],
        )

        # pooled job flow reaching end of full hour
        self.mock_emr_job_flows['j-POOLED'] = MockEmrObject(
            bootstrapactions=[
                MockEmrObject(args=[]),
                MockEmrObject(args=[
                    MockEmrObject(
                        value='pool-0123456789abcdef0123456789abcdef'),
                    MockEmrObject(value='reflecting'),
                ]),
            ],
            creationdatetime=to_iso8601(self.now - timedelta(hours=1)),
            startdatetime=to_iso8601(self.now - timedelta(minutes=55)),
            state='WAITING',
            steps=[],
        )

        # add job flow IDs and fake names to the mock job flows
        # (.items() rather than the Python-2-only .iteritems(), so this
        # also runs on Python 3; only attributes of the values are
        # changed, never the dict itself)
        for jfid, jf in self.mock_emr_job_flows.items():
            jf.jobflowid = jfid
            jf.name = jfid[2:].replace('_', ' ').title() + ' Job Flow'
from datetime import timedelta

from mrjob.py2 import StringIO
from mrjob.tools.emr.report_long_jobs import _find_long_running_jobs
from mrjob.tools.emr.report_long_jobs import main

from tests.mockboto import MockEmrObject
from tests.mockboto import MockBotoTestCase

CLUSTERS = [
    MockEmrObject(
        id='j-STARTING',
        name='mr_grieving',
        status=MockEmrObject(
            state='STARTING',
            timeline=MockEmrObject(
                creationdatetime='2010-06-06T00:05:00Z',
            ),
        ),
        _steps=[],
    ),
    MockEmrObject(
        id='j-BOOTSTRAPPING',
        name='mr_grieving',
        status=MockEmrObject(
            state='BOOTSTRAPPING',
            timeline=MockEmrObject(
                creationdatetime='2010-06-06T00:05:00Z',
            ),
        ),
        _steps=[],
    def create_fake_clusters(self):
        """Register a collection of mock EMR clusters (plus a few mock S3
        lock files) in various states.

        Sets ``self.now`` (current UTC time, truncated to the second);
        every timestamp below is expressed relative to it via ``ago()``.
        """
        self.now = datetime.utcnow().replace(microsecond=0)
        self.add_mock_s3_data({'my_bucket': {}})

        # create a timestamp the given number of *hours*, *minutes*, etc.
        # in the past. If any *kwargs* are None, return None.
        def ago(**kwargs):
            if any(v is None for v in kwargs.values()):
                return None
            return to_iso8601(self.now - timedelta(**kwargs))

        # Build a step object easily
        # also make it respond to .args()
        # NOTE: *name* and any extra **kwargs are accepted but not used
        def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
                 args=self._DEFAULT_STEP_ARGS,
                 state='COMPLETED',
                 create_hours_ago=None,
                 start_hours_ago=None,
                 end_hours_ago=None,
                 name='Streaming Step',
                 action_on_failure='TERMINATE_CLUSTER',
                 **kwargs):

            return MockEmrObject(
                config=MockEmrObject(
                    actiononfailure=action_on_failure,
                    args=[MockEmrObject(value=a) for a in args],
                    jar=jar,
                ),
                status=MockEmrObject(
                    state=state,
                    timeline=MockEmrObject(
                        creationdatetime=ago(hours=create_hours_ago),
                        enddatetime=ago(hours=end_hours_ago),
                        startdatetime=ago(hours=start_hours_ago),
                    ),
                ))

        # empty job
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-EMPTY',
                status=MockEmrObject(
                    state='STARTING',
                    timeline=MockEmrObject(creationdatetime=ago(hours=10)),
                ),
            ))

        # job that's bootstrapping
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-BOOTSTRAPPING',
                status=MockEmrObject(
                    state='BOOTSTRAPPING',
                    timeline=MockEmrObject(creationdatetime=ago(hours=10), ),
                ),
                _steps=[step(create_hours_ago=10, state='PENDING')],
            ))

        # currently running job
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-CURRENTLY_RUNNING',
                status=MockEmrObject(
                    state='RUNNING',
                    timeline=MockEmrObject(creationdatetime=ago(hours=4,
                                                                minutes=15),
                                           readydatetime=ago(hours=4,
                                                             minutes=10))),
                _steps=[step(start_hours_ago=4, state='RUNNING')],
            ), )

        # finished cluster
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-DONE',
                status=MockEmrObject(
                    state='TERMINATED',
                    timeline=MockEmrObject(
                        creationdatetime=ago(hours=10),
                        readydatetime=ago(hours=8),
                        enddatetime=ago(hours=5),
                    ),
                ),
                _steps=[step(start_hours_ago=8, end_hours_ago=6)],
            ))

        # idle cluster
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-DONE_AND_IDLE',
                status=MockEmrObject(
                    state='WAITING',
                    timeline=MockEmrObject(
                        creationdatetime=ago(hours=6),
                        readydatetime=ago(hours=5, minutes=5),
                    ),
                ),
                _steps=[step(start_hours_ago=4, end_hours_ago=2)],
            ))

        # idle cluster with 4.x step format. should still be
        # recognizable as a streaming step
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-DONE_AND_IDLE_4_X',
                status=MockEmrObject(
                    state='WAITING',
                    timeline=MockEmrObject(
                        creationdatetime=ago(hours=6),
                        readydatetime=ago(hours=5, minutes=5),
                    ),
                ),
                _steps=[
                    step(start_hours_ago=4,
                         end_hours_ago=2,
                         jar='command-runner.jar',
                         args=['hadoop-streaming'] + self._DEFAULT_STEP_ARGS)
                ],
            ))

        # idle cluster with an active lock
        # (lock file's modification time is self.now)
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-IDLE_AND_LOCKED',
                status=MockEmrObject(
                    state='WAITING',
                    timeline=MockEmrObject(
                        creationdatetime=ago(hours=6),
                        readydatetime=ago(hours=5, minutes=5),
                    ),
                ),
                _steps=[step(start_hours_ago=4, end_hours_ago=2)],
            ))
        self.add_mock_s3_data(
            {
                'my_bucket': {
                    'locks/j-IDLE_AND_LOCKED/2': b'not_you',
                },
            },
            time_modified=self.now)

        # idle cluster with an expired lock
        # (lock file was last modified five minutes before self.now)
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-IDLE_AND_EXPIRED',
                status=MockEmrObject(
                    state='WAITING',
                    timeline=MockEmrObject(
                        creationdatetime=ago(hours=6),
                        readydatetime=ago(hours=5, minutes=5),
                    ),
                ),
                _steps=[step(start_hours_ago=4, end_hours_ago=2)],
            ))
        self.add_mock_s3_data(
            {
                'my_bucket': {
                    'locks/j-IDLE_AND_EXPIRED/2': b'not_you',
                },
            },
            time_modified=(self.now - timedelta(minutes=5)))

        # cluster in WAITING state whose only step has a start time but
        # no end time (no lock file is created for this one)
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-IDLE_BUT_INCOMPLETE_STEPS',
                status=MockEmrObject(
                    state='WAITING',
                    timeline=MockEmrObject(
                        creationdatetime=ago(hours=6),
                        readydatetime=ago(hours=5, minutes=5),
                    ),
                ),
                _steps=[step(start_hours_ago=4, end_hours_ago=None)],
            ))

        # hive cluster (looks completed but isn't)
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-HIVE',
                status=MockEmrObject(
                    state='WAITING',
                    timeline=MockEmrObject(
                        creationdatetime=ago(hours=6),
                        readydatetime=ago(hours=5, minutes=5),
                    ),
                ),
                _steps=[
                    step(
                        start_hours_ago=4,
                        end_hours_ago=4,
                        jar=
                        ('s3://us-east-1.elasticmapreduce/libs/script-runner/'
                         'script-runner.jar'),
                        args=[],
                    )
                ],
            ))

        # custom hadoop streaming jar
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-CUSTOM_DONE_AND_IDLE',
                status=MockEmrObject(
                    state='WAITING',
                    timeline=MockEmrObject(
                        creationdatetime=ago(hours=6),
                        readydatetime=ago(hours=5, minutes=5),
                    ),
                ),
                _steps=[
                    step(
                        start_hours_ago=4,
                        end_hours_ago=4,
                        jar=('s3://my_bucket/tmp/somejob/files/'
                             'oddjob-0.0.3-SNAPSHOT-standalone.jar'),
                        args=[],
                    )
                ],
            ))

        mock_emr_conn = self.connect_emr()

        # hadoop debugging without any other steps
        mock_emr_conn.run_jobflow(
            _id='j-DEBUG_ONLY',
            name='DEBUG_ONLY',
            enable_debugging=True,
            now=self.now - timedelta(hours=3),
            job_flow_role='fake-instance-profile',
            service_role='fake-service-role',
        )
        j_debug_only = self.mock_emr_clusters['j-DEBUG_ONLY']
        j_debug_only.status.state = 'WAITING'
        j_debug_only.status.timeline.readydatetime = ago(hours=2, minutes=55)
        j_debug_only._steps[0].status.state = 'COMPLETED'
        j_debug_only._steps[0].status.timeline.enddatetime = ago(hours=2)

        # hadoop debugging + actual job
        # same jar as hive but with different args
        mock_emr_conn.run_jobflow(
            _id='j-HADOOP_DEBUGGING',
            name='HADOOP_DEBUGGING',
            enable_debugging=True,
            now=self.now - timedelta(hours=6),
            job_flow_role='fake-instance-profile',
            service_role='fake-service-role',
        )
        j_hadoop_debugging = self.mock_emr_clusters['j-HADOOP_DEBUGGING']
        j_hadoop_debugging._steps.append(step())
        j_hadoop_debugging.status.state = 'WAITING'
        j_hadoop_debugging.status.timeline.readydatetime = ago(hours=4,
                                                               minutes=55)

        # Need to reset times manually because mockboto resets them
        j_hadoop_debugging._steps[0].status.state = 'COMPLETED'
        j_hadoop_debugging._steps[0].status.timeline.enddatetime = ago(hours=5)
        j_hadoop_debugging._steps[1].status.timeline.startdatetime = ago(
            hours=4)
        j_hadoop_debugging._steps[1].status.timeline.enddatetime = ago(hours=2)

        # should skip cancelled steps
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-IDLE_AND_FAILED',
                status=MockEmrObject(
                    state='WAITING',
                    timeline=MockEmrObject(
                        creationdatetime=ago(hours=6),
                        readydatetime=ago(hours=5, minutes=5),
                    ),
                ),
                _steps=[
                    step(start_hours_ago=4, end_hours_ago=3, state='FAILED'),
                    step(state='CANCELLED', )
                ],
            ))

        # pooled cluster reaching end of full hour
        self.add_mock_emr_cluster(
            MockEmrObject(
                _bootstrapactions=[
                    MockEmrObject(args=[], name='action 0'),
                    MockEmrObject(args=[
                        MockEmrObject(
                            value='pool-0123456789abcdef0123456789abcdef'),
                        MockEmrObject(value='reflecting'),
                    ],
                                  name='master'),
                ],
                id='j-POOLED',
                status=MockEmrObject(
                    state='WAITING',
                    timeline=MockEmrObject(
                        creationdatetime=ago(minutes=55),
                        readydatetime=ago(minutes=50),
                    ),
                ),
            ))

        # cluster that has had pending jobs but hasn't run them
        self.add_mock_emr_cluster(
            MockEmrObject(
                id='j-PENDING_BUT_IDLE',
                status=MockEmrObject(
                    state='RUNNING',
                    timeline=MockEmrObject(
                        creationdatetime=ago(hours=3),
                        readydatetime=ago(hours=2, minutes=50),
                    ),
                ),
                _steps=[step(create_hours_ago=3, state='PENDING')],
            ))
Beispiel #29
0
    def test_empty_bootstrap_actions(self):
        # job flow whose list of bootstrap actions is present but empty
        no_actions = []
        jf = MockEmrObject(bootstrapactions=no_actions)

        result = pool_hash_and_name(jf)

        self.assertEqual(result, (None, None))
Beispiel #30
0
    def test_empty(self):
        # job flow constructed with no fields at all
        fieldless_jf = MockEmrObject()

        result = pool_hash_and_name(fieldless_jf)

        self.assertEqual(result, (None, None))