def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
         args=None,
         state='COMPLETED',
         create_hours_ago=None,
         start_hours_ago=None,
         end_hours_ago=None,
         name='Streaming Step',
         action_on_failure='TERMINATE_CLUSTER',
         **kwargs):
    """Build a mock EMR step object (2-level cluster-API shape).

    *create_hours_ago*/*start_hours_ago*/*end_hours_ago* set the
    corresponding timeline fields that many hours in the past (``None``
    leaves the field as ``None`` — see ``ago()``).

    NOTE(review): *name* and **kwargs** are accepted but not used in the
    returned object; kept for signature compatibility with callers.
    """
    # Late-bind the default: the original bound self._DEFAULT_STEP_ARGS
    # at definition time, sharing one list object across all calls (and
    # requiring `self` to exist when the def executes).
    if args is None:
        args = self._DEFAULT_STEP_ARGS
    return MockEmrObject(
        config=MockEmrObject(
            actiononfailure=action_on_failure,
            args=[MockEmrObject(value=a) for a in args],
            jar=jar,
        ),
        status=MockEmrObject(
            state=state,
            timeline=MockEmrObject(
                creationdatetime=ago(hours=create_hours_ago),
                enddatetime=ago(hours=end_hours_ago),
                startdatetime=ago(hours=start_hours_ago),
            ),
        ))
def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
         args=None,
         state='COMPLETE',
         create_hours_ago=None,
         start_hours_ago=None,
         end_hours_ago=None,
         name='Streaming Step',
         action_on_failure='TERMINATE_CLUSTER',
         **kwargs):
    """Build a mock EMR step object (flat job-flow-API shape).

    The *_hours_ago* arguments set creation/start/end datetimes that many
    hours before ``self.now``; any extra **kwargs** are passed straight
    through to :class:`MockEmrObject`.
    """
    # Avoid a mutable default argument: the original shared one list
    # object across every call to step().
    if args is None:
        args = ['-mapper', 'my_job.py --mapper',
                '-reducer', 'my_job.py --reducer']
    if create_hours_ago:
        kwargs['creationdatetime'] = to_iso8601(
            self.now - timedelta(hours=create_hours_ago))
    if start_hours_ago:
        kwargs['startdatetime'] = to_iso8601(
            self.now - timedelta(hours=start_hours_ago))
    if end_hours_ago:
        kwargs['enddatetime'] = to_iso8601(
            self.now - timedelta(hours=end_hours_ago))
    kwargs['args'] = [MockEmrObject(value=a) for a in args]
    return MockEmrObject(jar=jar, state=state, name=name,
                         action_on_failure=action_on_failure, **kwargs)
def test_now_is_automatically_set(self):
    """With no *now* argument, the estimate is computed against utcnow()."""
    timeline = MockEmrObject(creationdatetime=to_iso8601(datetime.utcnow()))
    cluster_summary = MockEmrObject(status=MockEmrObject(timeline=timeline))
    estimate = _est_time_to_hour(cluster_summary)
    # a freshly-created cluster should have just under a full hour left
    self.assertLessEqual(estimate, timedelta(minutes=60))
    self.assertGreater(estimate, timedelta(minutes=59))
def test_first_arg_doesnt_start_with_pool(self):
    """A bootstrap action whose first arg lacks the pool- prefix is ignored."""
    action_args = [MockEmrObject(value='cowsay'),
                   MockEmrObject(value='mrjob')]
    jf = MockEmrObject(bootstrapactions=[MockEmrObject(args=action_args)])
    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_clock_skew(self):
    """If *now* falls before the creation time (clock skew), the
    estimate should still be something reasonable."""
    created = datetime(2010, 6, 6, 4, 26)
    cs = MockEmrObject(status=MockEmrObject(timeline=MockEmrObject(
        creationdatetime=to_iso8601(created))))
    one_second_earlier = created - timedelta(seconds=1)
    self.assertEqual(_est_time_to_hour(cs, now=one_second_earlier),
                     timedelta(seconds=1))
def test_first_arg_doesnt_start_with_pool(self):
    """A master action whose first arg lacks the pool- prefix is ignored."""
    master_args = [MockEmrObject(value='cowsay'),
                   MockEmrObject(value='mrjob')]
    actions = [MockEmrObject(args=master_args, name='master')]
    self.assertEqual(_pool_hash_and_name(actions), (None, None))
def test_too_few_args(self):
    """A master action with only the pool hash (no name) doesn't count."""
    pool_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
    actions = [MockEmrObject(args=[pool_arg], name='master')]
    self.assertEqual(_pool_hash_and_name(actions), (None, None))
def test_too_few_args(self):
    """A bootstrap action with only the pool hash (no name) doesn't count."""
    pool_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
    jf = MockEmrObject(bootstrapactions=[
        MockEmrObject(args=[pool_arg], name='master'),
    ])
    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_pooled_job_flow(self):
    """A pool-<hash> arg followed by a pool name is recognized."""
    pool_args = [
        MockEmrObject(value='pool-0123456789abcdef0123456789abcdef'),
        MockEmrObject(value='reflecting'),
    ]
    jf = MockEmrObject(bootstrapactions=[MockEmrObject(args=pool_args)])
    self.assertEqual(pool_hash_and_name(jf),
                     ('0123456789abcdef0123456789abcdef', 'reflecting'))
def test_bootstrap_action_isnt_named_master(self):
    """Pool args on an action not named 'master' are not recognized."""
    pool_args = [
        MockEmrObject(value='pool-0123456789abcdef0123456789abcdef'),
        MockEmrObject(value='reflecting'),
    ]
    jf = MockEmrObject(bootstrapactions=[
        MockEmrObject(args=pool_args, name='apprentice'),
    ])
    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_too_many_args(self):
    """More than two args means it's not a pooling bootstrap action."""
    extra_args = [MockEmrObject(value=v)
                  for v in ('cowsay', '-b', 'mrjob')]
    jf = MockEmrObject(bootstrapactions=[MockEmrObject(args=extra_args)])
    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_too_many_args(self):
    """More than two args means it's not a pooling master action."""
    extra_args = [MockEmrObject(value=v)
                  for v in ('cowsay', '-b', 'mrjob')]
    actions = [MockEmrObject(args=extra_args, name='master')]
    self.assertEqual(_pool_hash_and_name(actions), (None, None))
def test_pooled_cluster(self):
    """A master action with pool-<hash> and a name is recognized."""
    pool_args = [
        MockEmrObject(value='pool-0123456789abcdef0123456789abcdef'),
        MockEmrObject(value='reflecting'),
    ]
    actions = [MockEmrObject(args=pool_args, name='master')]
    self.assertEqual(_pool_hash_and_name(actions),
                     ('0123456789abcdef0123456789abcdef', 'reflecting'))
def test_pooled_job_flow_with_other_bootstrap_actions(self):
    """The 'master' action is found even among unrelated actions."""
    pool_args = [
        MockEmrObject(value='pool-0123456789abcdef0123456789abcdef'),
        MockEmrObject(value='reflecting'),
    ]
    actions = [
        MockEmrObject(args=[], name='action 0'),
        MockEmrObject(args=[], name='action 1'),
        MockEmrObject(args=pool_args, name='master'),
    ]
    self.assertEqual(_pool_hash_and_name(actions),
                     ('0123456789abcdef0123456789abcdef', 'reflecting'))
def test_not_yet_started(self):
    """Estimates for a cluster that has a creation time but no start."""
    cs = MockEmrObject(status=MockEmrObject(timeline=MockEmrObject(
        creationdatetime=to_iso8601(datetime(2010, 6, 6, 4)))))
    # (now, expected time remaining in the current billed hour)
    cases = [
        (datetime(2010, 6, 6, 4, 35), timedelta(minutes=25)),
        (datetime(2010, 6, 6, 5, 20), timedelta(minutes=40)),
        (datetime(2010, 6, 6, 4), timedelta(minutes=60)),
    ]
    for now, expected in cases:
        self.assertEqual(_est_time_to_hour(cs, now=now), expected)
def test_job_flow_with_no_fields(self):
    """Summarizing a completely empty job flow shouldn't blow up.

    This shouldn't happen in practice; it's just a robustness check.
    """
    empty_job_flow = MockEmrObject()
    expected = {
        'created': None,
        'end': None,
        'id': None,
        'label': None,
        'name': None,
        'nih': 0.0,
        'nih_bbnu': 0.0,
        'nih_billed': 0.0,
        'nih_used': 0.0,
        'num_steps': 0,
        'owner': None,
        'pool': None,
        'ran': timedelta(0),
        'ready': None,
        'start': None,
        'state': None,
        'usage': [],
    }
    self.assertEqual(job_flow_to_full_summary(empty_job_flow), expected)
def test_job_flow_that_was_terminated_before_starting(self):
    """A terminated-before-start job flow has an end but no start/ready."""
    job_flow = MockEmrObject(
        creationdatetime='2010-06-05T23:59:00Z',
        enddatetime='2010-06-06T00:01:00Z',
        jobflowid='j-ISFORJOURNEY',
        name='mr_exciting.woo.20100605.235850.000000',
        normalizedinstancehours='0',
        state='TERMINATED',
    )
    expected = {
        'created': datetime(2010, 6, 5, 23, 59),
        'end': datetime(2010, 6, 6, 0, 1),
        'id': 'j-ISFORJOURNEY',
        'label': 'mr_exciting',
        'name': 'mr_exciting.woo.20100605.235850.000000',
        'nih': 0.0,
        'nih_bbnu': 0.0,
        'nih_billed': 0.0,
        'nih_used': 0.0,
        'num_steps': 0,
        'owner': 'woo',
        'pool': None,
        'ran': timedelta(0),
        'ready': None,
        'start': None,
        'state': 'TERMINATED',
        'usage': [],
    }
    summary = job_flow_to_full_summary(
        job_flow, now=datetime(2010, 6, 6, 0, 30))
    self.assertEqual(summary, expected)
def test_job_flow_that_hasnt_yet_started(self):
    """A STARTING job flow has a creation time but nothing else yet."""
    job_flow = MockEmrObject(
        creationdatetime='2010-06-05T23:59:00Z',
        jobflowid='j-ISFORJUMP',
        name='mr_exciting.woo.20100605.235850.000000',
        normalizedinstancehours='10',
        state='STARTING',
    )
    expected = {
        'created': datetime(2010, 6, 5, 23, 59),
        'end': None,
        'id': 'j-ISFORJUMP',
        'label': 'mr_exciting',
        'name': 'mr_exciting.woo.20100605.235850.000000',
        'nih': 10.0,
        'nih_bbnu': 0.0,
        'nih_billed': 0.0,
        'nih_used': 0.0,
        'num_steps': 0,
        'owner': 'woo',
        'pool': None,
        'ran': timedelta(0),
        'ready': None,
        'start': None,
        'state': 'STARTING',
        'usage': [],
    }
    summary = job_flow_to_full_summary(
        job_flow, now=datetime(2010, 6, 6, 0, 30))
    self.assertEqual(summary, expected)
def test_now_is_automatically_set(self):
    """With no *now* argument, est_time_to_hour() uses utcnow().

    Capture utcnow() once per fixture rather than calling it three
    separate times: the original could race across a minute boundary
    between calls, making the 59/60-minute window assertions flaky.
    """
    now = datetime.utcnow()

    # not yet started: estimate runs from creation time
    jf = MockEmrObject(creationdatetime=to_iso8601(now))
    t = est_time_to_hour(jf)
    self.assertLessEqual(t, timedelta(minutes=60))
    self.assertGreater(t, timedelta(minutes=59))

    # started: estimate runs from start time, not creation time
    jf2 = MockEmrObject(
        creationdatetime=to_iso8601(now - timedelta(minutes=1)),
        startdatetime=to_iso8601(now))
    t = est_time_to_hour(jf2)
    self.assertLessEqual(t, timedelta(minutes=60))
    self.assertGreater(t, timedelta(minutes=59))
def test_tags(self):
    """--tag options should be applied to the created cluster."""
    self.add_mock_s3_data({'walrus': {}})
    self.monkey_patch_argv(
        '--quiet', '--no-conf',
        '--cloud-fs-sync-secs', '0',
        '--cloud-tmp-dir', 's3://walrus/tmp',
        '--tag', 'tag_one=foo',
        '--tag', 'tag_two=bar',
    )
    self.monkey_patch_stdout()

    create_cluster_main()

    # exactly one cluster should have been created, with both tags
    self.assertEqual(list(self.mock_emr_clusters.keys()),
                     ['j-MOCKCLUSTER0'])
    cluster = self.mock_emr_clusters['j-MOCKCLUSTER0']
    expected_tags = [
        MockEmrObject(key='tag_one', value='foo'),
        MockEmrObject(key='tag_two', value='bar'),
    ]
    self.assertEqual(cluster.tags, expected_tags)
def test_started(self):
    """Once started, the estimate is measured from the start time."""
    jf = MockEmrObject(
        creationdatetime=to_iso8601(datetime(2010, 6, 6, 4)),
        startdatetime=to_iso8601(datetime(2010, 6, 6, 4, 26)))
    # (now, expected time remaining in the current billed hour)
    cases = [
        (datetime(2010, 6, 6, 4, 35), timedelta(minutes=51)),
        (datetime(2010, 6, 6, 5, 20), timedelta(minutes=6)),
        (datetime(2010, 6, 6, 6, 26), timedelta(minutes=60)),
    ]
    for now, expected in cases:
        self.assertEqual(est_time_to_hour(jf, now=now), expected)
def test_emr_tags(self):
    """--emr-tag options should be applied to the created job flow."""
    self.add_mock_s3_data({'walrus': {}})
    self.monkey_patch_argv(
        '--quiet', '--no-conf',
        '--s3-sync-wait-time', '0',
        '--s3-scratch-uri', 's3://walrus/tmp',
        '--emr-tag', 'tag_one=foo',
        '--emr-tag', 'tag_two=bar',
    )
    self.monkey_patch_stdout()

    create_job_flow_main()

    # exactly one cluster should have been created, with both tags
    self.assertEqual(list(self.mock_emr_clusters.keys()),
                     ['j-MOCKCLUSTER0'])
    cluster = self.mock_emr_clusters['j-MOCKCLUSTER0']
    expected_tags = [
        MockEmrObject(key='tag_one', value='foo'),
        MockEmrObject(key='tag_two', value='bar'),
    ]
    self.assertEqual(cluster.tags, expected_tags)
def test_job_flows_to_stats(self):
    """Stats should aggregate counts across all mock job flows."""
    NUM_JOB_FLOWS = 30
    # each job flow gets a distinct instance count (0..29)
    job_flows = [
        MockEmrObject(jobflowid='j-%04d' % i, instancecount=i)
        for i in range(NUM_JOB_FLOWS)
    ]

    stats = job_flows_to_stats(job_flows)

    self.assertEqual(stats['num_jobflows'], NUM_JOB_FLOWS)
    self.assertEqual(stats['total_instance_count'],
                     sum(range(NUM_JOB_FLOWS)))
def test_pooled_job_flow_with_max_hours_idle(self):
    """The pooling action is found even when it isn't last.

    The max-hours-idle script is added AFTER the master bootstrap
    script, which used to break code that only looked at the last
    bootstrap action.
    """
    pool_args = [
        MockEmrObject(value='pool-0123456789abcdef0123456789abcdef'),
        MockEmrObject(value='reflecting'),
    ]
    idle_args = [MockEmrObject(value='900'), MockEmrObject(value='300')]
    jf = MockEmrObject(bootstrapactions=[
        MockEmrObject(args=pool_args, name='master'),
        MockEmrObject(args=idle_args, name='idle timeout'),
    ])
    self.assertEqual(pool_hash_and_name(jf),
                     ('0123456789abcdef0123456789abcdef', 'reflecting'))
def test_can_get_all_job_flows(self):
    """describe_all_job_flows() pages past the API's result limit."""
    now = datetime.datetime.utcnow()

    NUM_JOB_FLOWS = 2222
    assert_gt(NUM_JOB_FLOWS, DEFAULT_MAX_JOB_FLOWS_RETURNED)

    # stagger creation times so paging has something to sort by
    for i in range(NUM_JOB_FLOWS):
        jfid = 'j-%04d' % i
        self.mock_emr_job_flows[jfid] = MockEmrObject(
            creationdatetime=to_iso8601(
                now - datetime.timedelta(minutes=i)),
            jobflowid=jfid)

    emr_conn = EMRJobRunner().make_emr_conn()

    # ordinary describe_jobflows() hits the limit on number of job flows
    some_jfs = emr_conn.describe_jobflows()
    assert_equal(len(some_jfs), DEFAULT_MAX_JOB_FLOWS_RETURNED)

    all_jfs = describe_all_job_flows(emr_conn)
    assert_equal(len(all_jfs), NUM_JOB_FLOWS)
    assert_equal(sorted(jf.jobflowid for jf in all_jfs),
                 [('j-%04d' % i) for i in range(NUM_JOB_FLOWS)])
def create_fake_job_flows(self):
    """Populate self.mock_emr_job_flows with a variety of fake job
    flows (empty, running, done, idle, hive, debugging, pooled...)
    relative to ``self.now``.
    """
    self.now = datetime.utcnow().replace(microsecond=0)

    # empty job
    self.mock_emr_job_flows['j-EMPTY'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        state='WAITING',
    )

    # Build a step object easily
    # also make it respond to .args()
    def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
             args=None,
             state='COMPLETE',
             start_time_back=None,
             end_time_back=None,
             name='Streaming Step',
             action_on_failure='TERMINATE_JOB_FLOW',
             **kwargs):
        # avoid a mutable default argument for *args*
        if args is None:
            args = ['-mapper', 'my_job.py --mapper',
                    '-reducer', 'my_job.py --reducer']
        if start_time_back:
            kwargs['startdatetime'] = to_iso8601(
                self.now - timedelta(hours=start_time_back))
        if end_time_back:
            kwargs['enddatetime'] = to_iso8601(
                self.now - timedelta(hours=end_time_back))
        kwargs['args'] = [MockEmrObject(value=a) for a in args]
        return MockEmrObject(jar=jar, state=state, name=name,
                             action_on_failure=action_on_failure,
                             **kwargs)

    # currently running job
    self.mock_emr_job_flows['j-CURRENTLY_RUNNING'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(
            self.now - timedelta(hours=4, minutes=15)),
        state='RUNNING',
        steps=[step(start_time_back=4, state='RUNNING')],
    )

    # finished job flow
    self.mock_emr_job_flows['j-DONE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        enddatetime=to_iso8601(self.now - timedelta(hours=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=9)),
        state='COMPLETE',
        steps=[step(start_time_back=8, end_time_back=6)],
    )

    # idle job flow
    self.mock_emr_job_flows['j-DONE_AND_IDLE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[step(start_time_back=4, end_time_back=2)],
    )

    # hive job flow (looks completed but isn't)
    self.mock_emr_job_flows['j-HIVE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[
            step(
                start_time_back=4,
                end_time_back=4,
                jar=('s3://us-east-1.elasticmapreduce/libs/script-runner/'
                     'script-runner.jar'),
                args=[],
            )
        ],
    )

    # custom hadoop streaming jar
    self.mock_emr_job_flows['j-CUSTOM_DONE_AND_IDLE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[
            step(
                start_time_back=4,
                end_time_back=4,
                jar=('s3://my_bucket/tmp/somejob/files/'
                     'oddjob-0.0.3-SNAPSHOT-standalone.jar'),
                args=[],
            )
        ],
    )

    mock_conn = MockEmrConnection()

    # hadoop debugging without any other steps
    jobflow_id = mock_conn.run_jobflow(name='j-DEBUG_ONLY', log_uri='',
                                       enable_debugging=True)
    jf = mock_conn.describe_jobflow(jobflow_id)
    self.mock_emr_job_flows['j-DEBUG_ONLY'] = jf
    jf.state = 'WAITING'
    jf.startdatetime = to_iso8601(self.now - timedelta(hours=2))
    jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=2))

    # hadoop debugging + actual job
    # same jar as hive but with different args
    jobflow_id = mock_conn.run_jobflow(name='j-HADOOP_DEBUGGING',
                                       log_uri='',
                                       enable_debugging=True,
                                       steps=[step()])
    jf = mock_conn.describe_jobflow(jobflow_id)
    self.mock_emr_job_flows['j-HADOOP_DEBUGGING'] = jf
    jf.state = 'WAITING'
    jf.creationdatetime = to_iso8601(self.now - timedelta(hours=6))
    jf.startdatetime = to_iso8601(self.now - timedelta(hours=5))
    # Need to reset times manually because mockboto resets them
    jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=5))
    jf.steps[1].startdatetime = to_iso8601(self.now - timedelta(hours=4))
    jf.steps[1].enddatetime = to_iso8601(self.now - timedelta(hours=2))

    # skip cancelled steps
    self.mock_emr_job_flows['j-IDLE_AND_FAILED'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[
            step(start_time_back=4, end_time_back=3, state='FAILED'),
            step(state='CANCELLED'),
        ],
    )

    # pooled job flow reaching end of full hour
    self.mock_emr_job_flows['j-POOLED'] = MockEmrObject(
        bootstrapactions=[
            MockEmrObject(args=[]),
            MockEmrObject(args=[
                MockEmrObject(
                    value='pool-0123456789abcdef0123456789abcdef'),
                MockEmrObject(value='reflecting'),
            ]),
        ],
        creationdatetime=to_iso8601(self.now - timedelta(hours=1)),
        startdatetime=to_iso8601(self.now - timedelta(minutes=55)),
        state='WAITING',
        steps=[],
    )

    # add job flow IDs and fake names to the mock job flows
    # (items() rather than py2-only iteritems())
    for jfid, jf in self.mock_emr_job_flows.items():
        jf.jobflowid = jfid
        jf.name = jfid[2:].replace('_', ' ').title() + ' Job Flow'
from datetime import timedelta from mrjob.py2 import StringIO from mrjob.tools.emr.report_long_jobs import _find_long_running_jobs from mrjob.tools.emr.report_long_jobs import main from tests.mockboto import MockEmrObject from tests.mockboto import MockBotoTestCase CLUSTERS = [ MockEmrObject( id='j-STARTING', name='mr_grieving', status=MockEmrObject( state='STARTING', timeline=MockEmrObject( creationdatetime='2010-06-06T00:05:00Z', ), ), _steps=[], ), MockEmrObject( id='j-BOOTSTRAPPING', name='mr_grieving', status=MockEmrObject( state='BOOTSTRAPPING', timeline=MockEmrObject( creationdatetime='2010-06-06T00:05:00Z', ), ), _steps=[],
def create_fake_clusters(self):
    """Populate the mock EMR backend with a variety of fake clusters
    (empty, bootstrapping, running, done, idle, locked, hive,
    debugging, pooled...) relative to ``self.now``.
    """
    self.now = datetime.utcnow().replace(microsecond=0)
    self.add_mock_s3_data({'my_bucket': {}})

    # create a timestamp the given number of *hours*, *minutes*, etc.
    # in the past. If any *kwargs* are None, return None.
    def ago(**kwargs):
        if any(v is None for v in kwargs.values()):
            return None
        return to_iso8601(self.now - timedelta(**kwargs))

    # Build a step object easily
    # also make it respond to .args()
    def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
             args=None,
             state='COMPLETED',
             create_hours_ago=None,
             start_hours_ago=None,
             end_hours_ago=None,
             name='Streaming Step',
             action_on_failure='TERMINATE_CLUSTER',
             **kwargs):
        # late-bind the default; the original bound the shared
        # self._DEFAULT_STEP_ARGS list at definition time
        if args is None:
            args = self._DEFAULT_STEP_ARGS
        return MockEmrObject(
            config=MockEmrObject(
                actiononfailure=action_on_failure,
                args=[MockEmrObject(value=a) for a in args],
                jar=jar,
            ),
            status=MockEmrObject(
                state=state,
                timeline=MockEmrObject(
                    creationdatetime=ago(hours=create_hours_ago),
                    enddatetime=ago(hours=end_hours_ago),
                    startdatetime=ago(hours=start_hours_ago),
                ),
            ))

    # empty job
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-EMPTY',
        status=MockEmrObject(
            state='STARTING',
            timeline=MockEmrObject(creationdatetime=ago(hours=10)),
        ),
    ))

    # job that's bootstrapping
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-BOOTSTRAPPING',
        status=MockEmrObject(
            state='BOOTSTRAPPING',
            timeline=MockEmrObject(creationdatetime=ago(hours=10)),
        ),
        _steps=[step(create_hours_ago=10, state='PENDING')],
    ))

    # currently running job
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-CURRENTLY_RUNNING',
        status=MockEmrObject(
            state='RUNNING',
            timeline=MockEmrObject(
                creationdatetime=ago(hours=4, minutes=15),
                readydatetime=ago(hours=4, minutes=10)),
        ),
        _steps=[step(start_hours_ago=4, state='RUNNING')],
    ))

    # finished cluster
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-DONE',
        status=MockEmrObject(
            state='TERMINATED',
            timeline=MockEmrObject(
                creationdatetime=ago(hours=10),
                readydatetime=ago(hours=8),
                enddatetime=ago(hours=5),
            ),
        ),
        _steps=[step(start_hours_ago=8, end_hours_ago=6)],
    ))

    # idle cluster
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-DONE_AND_IDLE',
        status=MockEmrObject(
            state='WAITING',
            timeline=MockEmrObject(
                creationdatetime=ago(hours=6),
                readydatetime=ago(hours=5, minutes=5),
            ),
        ),
        _steps=[step(start_hours_ago=4, end_hours_ago=2)],
    ))

    # idle cluster with 4.x step format. should still be
    # recognizable as a streaming step
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-DONE_AND_IDLE_4_X',
        status=MockEmrObject(
            state='WAITING',
            timeline=MockEmrObject(
                creationdatetime=ago(hours=6),
                readydatetime=ago(hours=5, minutes=5),
            ),
        ),
        _steps=[step(start_hours_ago=4, end_hours_ago=2,
                     jar='command-runner.jar',
                     args=['hadoop-streaming'] +
                          self._DEFAULT_STEP_ARGS)],
    ))

    # idle cluster with an active lock
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-IDLE_AND_LOCKED',
        status=MockEmrObject(
            state='WAITING',
            timeline=MockEmrObject(
                creationdatetime=ago(hours=6),
                readydatetime=ago(hours=5, minutes=5),
            ),
        ),
        _steps=[step(start_hours_ago=4, end_hours_ago=2)],
    ))
    self.add_mock_s3_data(
        {'my_bucket': {'locks/j-IDLE_AND_LOCKED/2': b'not_you'}},
        time_modified=self.now)

    # idle cluster with an expired lock
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-IDLE_AND_EXPIRED',
        status=MockEmrObject(
            state='WAITING',
            timeline=MockEmrObject(
                creationdatetime=ago(hours=6),
                readydatetime=ago(hours=5, minutes=5),
            ),
        ),
        _steps=[step(start_hours_ago=4, end_hours_ago=2)],
    ))
    self.add_mock_s3_data(
        {'my_bucket': {'locks/j-IDLE_AND_EXPIRED/2': b'not_you'}},
        time_modified=(self.now - timedelta(minutes=5)))

    # idle cluster whose step never got an end time
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-IDLE_BUT_INCOMPLETE_STEPS',
        status=MockEmrObject(
            state='WAITING',
            timeline=MockEmrObject(
                creationdatetime=ago(hours=6),
                readydatetime=ago(hours=5, minutes=5),
            ),
        ),
        _steps=[step(start_hours_ago=4, end_hours_ago=None)],
    ))

    # hive cluster (looks completed but isn't)
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-HIVE',
        status=MockEmrObject(
            state='WAITING',
            timeline=MockEmrObject(
                creationdatetime=ago(hours=6),
                readydatetime=ago(hours=5, minutes=5),
            ),
        ),
        _steps=[
            step(
                start_hours_ago=4,
                end_hours_ago=4,
                jar=('s3://us-east-1.elasticmapreduce/libs/script-runner/'
                     'script-runner.jar'),
                args=[],
            )
        ],
    ))

    # custom hadoop streaming jar
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-CUSTOM_DONE_AND_IDLE',
        status=MockEmrObject(
            state='WAITING',
            timeline=MockEmrObject(
                creationdatetime=ago(hours=6),
                readydatetime=ago(hours=5, minutes=5),
            ),
        ),
        _steps=[
            step(
                start_hours_ago=4,
                end_hours_ago=4,
                jar=('s3://my_bucket/tmp/somejob/files/'
                     'oddjob-0.0.3-SNAPSHOT-standalone.jar'),
                args=[],
            )
        ],
    ))

    mock_emr_conn = self.connect_emr()

    # hadoop debugging without any other steps
    mock_emr_conn.run_jobflow(
        _id='j-DEBUG_ONLY',
        name='DEBUG_ONLY',
        enable_debugging=True,
        now=self.now - timedelta(hours=3),
        job_flow_role='fake-instance-profile',
        service_role='fake-service-role',
    )
    j_debug_only = self.mock_emr_clusters['j-DEBUG_ONLY']
    j_debug_only.status.state = 'WAITING'
    j_debug_only.status.timeline.readydatetime = ago(hours=2, minutes=55)
    j_debug_only._steps[0].status.state = 'COMPLETED'
    j_debug_only._steps[0].status.timeline.enddatetime = ago(hours=2)

    # hadoop debugging + actual job
    # same jar as hive but with different args
    mock_emr_conn.run_jobflow(
        _id='j-HADOOP_DEBUGGING',
        name='HADOOP_DEBUGGING',
        enable_debugging=True,
        now=self.now - timedelta(hours=6),
        job_flow_role='fake-instance-profile',
        service_role='fake-service-role',
    )
    j_hadoop_debugging = self.mock_emr_clusters['j-HADOOP_DEBUGGING']
    j_hadoop_debugging._steps.append(step())
    j_hadoop_debugging.status.state = 'WAITING'
    j_hadoop_debugging.status.timeline.readydatetime = ago(
        hours=4, minutes=55)
    # Need to reset times manually because mockboto resets them
    j_hadoop_debugging._steps[0].status.state = 'COMPLETED'
    j_hadoop_debugging._steps[0].status.timeline.enddatetime = ago(hours=5)
    j_hadoop_debugging._steps[1].status.timeline.startdatetime = ago(
        hours=4)
    j_hadoop_debugging._steps[1].status.timeline.enddatetime = ago(hours=2)

    # should skip cancelled steps
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-IDLE_AND_FAILED',
        status=MockEmrObject(
            state='WAITING',
            timeline=MockEmrObject(
                creationdatetime=ago(hours=6),
                readydatetime=ago(hours=5, minutes=5),
            ),
        ),
        _steps=[
            step(start_hours_ago=4, end_hours_ago=3, state='FAILED'),
            step(state='CANCELLED'),
        ],
    ))

    # pooled cluster reaching end of full hour
    self.add_mock_emr_cluster(MockEmrObject(
        _bootstrapactions=[
            MockEmrObject(args=[], name='action 0'),
            MockEmrObject(args=[
                MockEmrObject(
                    value='pool-0123456789abcdef0123456789abcdef'),
                MockEmrObject(value='reflecting'),
            ], name='master'),
        ],
        id='j-POOLED',
        status=MockEmrObject(
            state='WAITING',
            timeline=MockEmrObject(
                creationdatetime=ago(minutes=55),
                readydatetime=ago(minutes=50),
            ),
        ),
    ))

    # cluster that has had pending jobs but hasn't run them
    self.add_mock_emr_cluster(MockEmrObject(
        id='j-PENDING_BUT_IDLE',
        status=MockEmrObject(
            state='RUNNING',
            timeline=MockEmrObject(
                creationdatetime=ago(hours=3),
                readydatetime=ago(hours=2, minutes=50),
            ),
        ),
        _steps=[step(create_hours_ago=3, state='PENDING')],
    ))
def test_empty_bootstrap_actions(self):
    """An empty bootstrapactions list means no pooling info."""
    no_actions = MockEmrObject(bootstrapactions=[])
    self.assertEqual(pool_hash_and_name(no_actions), (None, None))
def test_empty(self):
    """A job flow with no fields at all yields no pooling info."""
    bare_jf = MockEmrObject()
    self.assertEqual(pool_hash_and_name(bare_jf), (None, None))