def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
         args=('-mapper', 'my_job.py --mapper',
               '-reducer', 'my_job.py --reducer'),
         state='COMPLETE',
         create_hours_ago=None,
         start_hours_ago=None,
         end_hours_ago=None,
         name='Streaming Step',
         action_on_failure='TERMINATE_CLUSTER',
         **kwargs):
    """Build a MockEmrObject representing a single job-flow step.

    The ``*_hours_ago`` arguments, when set, become ISO8601 timestamps
    that many hours before ``self.now`` (read from the enclosing scope,
    so this helper is meant to be nested inside a method that sets it).
    Extra keyword arguments pass straight through to MockEmrObject.
    """
    # args default is a tuple rather than a list: the old mutable list
    # default was shared across calls (classic Python pitfall); it is
    # only iterated below, so a tuple is a drop-in replacement.
    if create_hours_ago:
        kwargs['creationdatetime'] = to_iso8601(
            self.now - timedelta(hours=create_hours_ago))
    if start_hours_ago:
        kwargs['startdatetime'] = to_iso8601(
            self.now - timedelta(hours=start_hours_ago))
    if end_hours_ago:
        kwargs['enddatetime'] = to_iso8601(
            self.now - timedelta(hours=end_hours_ago))
    # wrap each arg so it has a .value attribute
    kwargs['args'] = [MockEmrObject(value=a) for a in args]
    return MockEmrObject(jar=jar, state=state, name=name,
                         action_on_failure=action_on_failure, **kwargs)
def test_clock_skew(self):
    """A *now* just before the recorded start time should still yield
    a small, sane estimate rather than something nonsensical."""
    start = datetime(2010, 6, 6, 4, 26)
    job_flow = MockEmrObject(
        creationdatetime=to_iso8601(datetime(2010, 6, 6, 4)),
        startdatetime=to_iso8601(start))
    # one second before the recorded start time
    estimate = est_time_to_hour(job_flow, now=start - timedelta(seconds=1))
    self.assertEqual(estimate, timedelta(seconds=1))
def test_clock_skew(self):
    # if our clock runs behind, "now" can precede the recorded start
    # time; the estimate should still come out reasonable
    creation = datetime(2010, 6, 6, 4)
    started = datetime(2010, 6, 6, 4, 26)
    jf = MockEmrObject(creationdatetime=to_iso8601(creation),
                       startdatetime=to_iso8601(started))
    result = est_time_to_hour(jf, now=datetime(2010, 6, 6, 4, 25, 59))
    self.assertEqual(result, timedelta(seconds=1))
def test_started(self):
    """Check the time-remaining estimate at several points after the
    job flow has started."""
    jf = MockEmrObject(
        creationdatetime=to_iso8601(datetime(2010, 6, 6, 4)),
        startdatetime=to_iso8601(datetime(2010, 6, 6, 4, 26)))
    # (now, expected time remaining)
    cases = [
        (datetime(2010, 6, 6, 4, 35), timedelta(minutes=51)),
        (datetime(2010, 6, 6, 5, 20), timedelta(minutes=6)),
        (datetime(2010, 6, 6, 6, 26), timedelta(minutes=60)),
    ]
    for now, expected in cases:
        self.assertEqual(est_time_to_hour(jf, now=now), expected)
def test_now_is_automatically_set(self):
    """Omitting *now* should measure against the current UTC time."""
    # freshly created job flow: just under a full hour should remain
    just_created = MockEmrObject(
        creationdatetime=to_iso8601(datetime.utcnow()))
    estimate = est_time_to_hour(just_created)
    self.assertLessEqual(estimate, timedelta(minutes=60))
    self.assertGreater(estimate, timedelta(minutes=59))

    # start time one minute after creation; the estimate should still
    # be close to a full hour
    just_started = MockEmrObject(
        creationdatetime=to_iso8601(
            datetime.utcnow() - timedelta(minutes=1)),
        startdatetime=to_iso8601(datetime.utcnow()))
    estimate = est_time_to_hour(just_started)
    self.assertLessEqual(estimate, timedelta(minutes=60))
    self.assertGreater(estimate, timedelta(minutes=59))
def test_started(self):
    # time remaining in the hour, sampled at several "now" values
    jf = MockEmrObject(
        creationdatetime=to_iso8601(datetime(2010, 6, 6, 4)),
        startdatetime=to_iso8601(datetime(2010, 6, 6, 4, 26)))
    for now, want in (
            (datetime(2010, 6, 6, 4, 35), timedelta(minutes=51)),
            (datetime(2010, 6, 6, 5, 20), timedelta(minutes=6)),
            (datetime(2010, 6, 6, 6, 26), timedelta(minutes=60))):
        self.assertEqual(est_time_to_hour(jf, now=now), want)
def test_started(self):
    """Check the estimate for a cluster with creation and ready
    timestamps at several points in time."""
    timeline = MockEmrObject(
        creationdatetime=to_iso8601(datetime(2010, 6, 6, 4, 26)),
        readydatetime=to_iso8601(datetime(2010, 6, 6, 4, 30)))
    cs = MockEmrObject(status=MockEmrObject(timeline=timeline))
    # (now, expected time remaining)
    expectations = [
        (datetime(2010, 6, 6, 4, 35), timedelta(minutes=51)),
        (datetime(2010, 6, 6, 5, 20), timedelta(minutes=6)),
        (datetime(2010, 6, 6, 6, 26), timedelta(minutes=60)),
    ]
    for now, expected in expectations:
        self.assertEqual(_est_time_to_hour(cs, now=now), expected)
def test_now_is_automatically_set(self):
    # omitting now= should measure against the current UTC time
    cs = MockEmrObject(status=MockEmrObject(timeline=MockEmrObject(
        creationdatetime=to_iso8601(datetime.utcnow()))))
    remaining = _est_time_to_hour(cs)
    # freshly created, so just under a full hour should remain
    self.assertGreater(remaining, timedelta(minutes=59))
    self.assertLessEqual(remaining, timedelta(minutes=60))
def test_now_is_automatically_set(self):
    # no now= argument: measure against the current UTC time
    jf = MockEmrObject(creationdatetime=to_iso8601(datetime.utcnow()))
    t = est_time_to_hour(jf)
    self.assertGreater(t, timedelta(minutes=59))
    self.assertLessEqual(t, timedelta(minutes=60))

    # same check with a startdatetime present as well
    one_minute_ago = datetime.utcnow() - timedelta(minutes=1)
    jf2 = MockEmrObject(creationdatetime=to_iso8601(one_minute_ago),
                        startdatetime=to_iso8601(datetime.utcnow()))
    t = est_time_to_hour(jf2)
    self.assertGreater(t, timedelta(minutes=59))
    self.assertLessEqual(t, timedelta(minutes=60))
def test_started(self):
    # sample the time-remaining estimate at several "now" values
    status = MockEmrObject(
        timeline=MockEmrObject(
            creationdatetime=to_iso8601(datetime(2010, 6, 6, 4, 26)),
            readydatetime=to_iso8601(datetime(2010, 6, 6, 4, 30))))
    cs = MockEmrObject(status=status)
    checks = (
        (datetime(2010, 6, 6, 4, 35), timedelta(minutes=51)),
        (datetime(2010, 6, 6, 5, 20), timedelta(minutes=6)),
        (datetime(2010, 6, 6, 6, 26), timedelta(minutes=60)),
    )
    for now, want in checks:
        self.assertEqual(_est_time_to_hour(cs, now=now), want)
def test_now_is_automatically_set(self):
    """Leaving out *now* measures against the current UTC time."""
    timeline = MockEmrObject(
        creationdatetime=to_iso8601(datetime.utcnow()))
    cs = MockEmrObject(status=MockEmrObject(timeline=timeline))
    t = _est_time_to_hour(cs)
    # just created, so just under a full hour should remain
    self.assertLessEqual(t, timedelta(minutes=60))
    self.assertGreater(t, timedelta(minutes=59))
def step(
    jar="/home/hadoop/contrib/streaming/hadoop-streaming.jar",
    args=("-mapper", "my_job.py --mapper", "-reducer", "my_job.py --reducer"),
    state="COMPLETE",
    create_hours_ago=None,
    start_hours_ago=None,
    end_hours_ago=None,
    name="Streaming Step",
    action_on_failure="TERMINATE_JOB_FLOW",
    **kwargs
):
    """Build a MockEmrObject representing a single job-flow step.

    The ``*_hours_ago`` arguments, when set, become ISO8601 timestamps
    that many hours before ``self.now`` (read from the enclosing scope,
    so this helper is meant to be nested inside a method that sets it).
    Extra keyword arguments pass straight through to MockEmrObject.
    """
    # args default is a tuple rather than a list: the old mutable list
    # default was shared across calls (classic Python pitfall); it is
    # only iterated below, so a tuple is a drop-in replacement.
    if create_hours_ago:
        kwargs["creationdatetime"] = to_iso8601(
            self.now - timedelta(hours=create_hours_ago))
    if start_hours_ago:
        kwargs["startdatetime"] = to_iso8601(
            self.now - timedelta(hours=start_hours_ago))
    if end_hours_ago:
        kwargs["enddatetime"] = to_iso8601(
            self.now - timedelta(hours=end_hours_ago))
    # wrap each arg so it has a .value attribute
    kwargs["args"] = [MockEmrObject(value=a) for a in args]
    return MockEmrObject(jar=jar, state=state, name=name,
                         action_on_failure=action_on_failure, **kwargs)
def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
         args=('-mapper', 'my_job.py --mapper',
               '-reducer', 'my_job.py --reducer'),
         state='COMPLETE',
         start_time_back=None,
         end_time_back=None,
         name='Streaming Step',
         action_on_failure='TERMINATE_JOB_FLOW',
         **kwargs):
    """Build a MockEmrObject representing a single job-flow step.

    ``start_time_back``/``end_time_back``, when set, become ISO8601
    timestamps that many hours before ``self.now`` (read from the
    enclosing scope, so this helper is meant to be nested inside a
    method that sets it).  Extra keyword arguments pass straight
    through to MockEmrObject.
    """
    # args default is a tuple rather than a list: the old mutable list
    # default was shared across calls (classic Python pitfall); it is
    # only iterated below, so a tuple is a drop-in replacement.
    if start_time_back:
        kwargs['startdatetime'] = to_iso8601(
            self.now - timedelta(hours=start_time_back))
    if end_time_back:
        kwargs['enddatetime'] = to_iso8601(
            self.now - timedelta(hours=end_time_back))
    # wrap each arg so it has a .value attribute
    kwargs['args'] = [MockEmrObject(value=a) for a in args]
    return MockEmrObject(
        jar=jar, state=state, name=name,
        action_on_failure=action_on_failure, **kwargs)
def test_can_get_all_job_flows(self):
    """describe_all_job_flows() should return every job flow even when
    there are more than describe_jobflows() will hand back at once."""
    NUM_JOB_FLOWS = 2222
    assert_gt(NUM_JOB_FLOWS, DEFAULT_MAX_JOB_FLOWS_RETURNED)

    now = datetime.datetime.utcnow()
    for i in range(NUM_JOB_FLOWS):
        jfid = 'j-%04d' % i
        created = now - datetime.timedelta(minutes=i)
        self.mock_emr_job_flows[jfid] = MockEmrObject(
            creationdatetime=to_iso8601(created), jobflowid=jfid)

    emr_conn = EMRJobRunner().make_emr_conn()

    # ordinary describe_jobflows() hits the limit on number of job flows
    some_jfs = emr_conn.describe_jobflows()
    assert_equal(len(some_jfs), DEFAULT_MAX_JOB_FLOWS_RETURNED)

    # but describe_all_job_flows() gets every one of them
    all_jfs = describe_all_job_flows(emr_conn)
    assert_equal(len(all_jfs), NUM_JOB_FLOWS)
    expected_ids = [('j-%04d' % i) for i in range(NUM_JOB_FLOWS)]
    assert_equal(sorted(jf.jobflowid for jf in all_jfs), expected_ids)
def create_fake_job_flows(self):
    """Populate self.mock_emr_job_flows with mock job flows in a
    variety of states (empty, running, done, idle, hive/debugging,
    failed), keyed by fake job flow ID.  All timestamps are relative
    to self.now.
    """
    self.now = datetime.utcnow().replace(microsecond=0)

    # empty job
    self.mock_emr_job_flows['j-EMPTY'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        steps=[],
    )

    # Build a step object easily
    # also make it respond to .args()
    def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
             args=['-mapper', 'my_job.py --mapper',
                   '-reducer', 'my_job.py --reducer'],
             state='COMPLETE',
             start_time_back=None,
             end_time_back=None,
             name='Streaming Step',
             action_on_failure='TERMINATE_JOB_FLOW',
             **kwargs):
        # *_time_back arguments are hours before self.now
        if start_time_back:
            kwargs['startdatetime'] = to_iso8601(
                self.now - timedelta(hours=start_time_back))
        if end_time_back:
            kwargs['enddatetime'] = to_iso8601(
                self.now - timedelta(hours=end_time_back))
        # wrap each arg so it has a .value attribute
        kwargs['args'] = [MockEmrObject(value=a) for a in args]
        return MockEmrObject(
            jar=jar, state=state, name=name,
            action_on_failure=action_on_failure, **kwargs)

    # currently running job
    self.mock_emr_job_flows['j-CURRENTLY_RUNNING'] = MockEmrObject(
        state='RUNNING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[step(start_time_back=4, state='RUNNING')],
    )

    # finished job flow
    self.mock_emr_job_flows['j-DONE'] = MockEmrObject(
        state='COMPLETE',
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        startdatetime=to_iso8601(self.now - timedelta(hours=9)),
        enddatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[step(start_time_back=8, end_time_back=6)],
    )

    # idle job flow
    self.mock_emr_job_flows['j-DONE_AND_IDLE'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[step(start_time_back=4, end_time_back=2)],
    )

    # hive job flow (looks completed but isn't)
    self.mock_emr_job_flows['j-HIVE'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[step(
            start_time_back=4,
            end_time_back=4,
            jar='s3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
            args=[],
        )],
    )

    # custom hadoop streaming jar
    self.mock_emr_job_flows['j-CUSTOM_DONE_AND_IDLE'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[step(
            start_time_back=4,
            end_time_back=4,
            jar='s3://my_bucket/tmp/somejob/files/oddjob-0.0.3-SNAPSHOT-standalone.jar',
            args=[],
        )],
    )

    mock_conn = MockEmrConnection()

    # hadoop debugging without any other steps
    jobflow_id = mock_conn.run_jobflow(name='j-DEBUG_ONLY', log_uri='',
                                       enable_debugging=True)
    jf = mock_conn.describe_jobflow(jobflow_id)
    self.mock_emr_job_flows['j-DEBUG_ONLY'] = jf
    jf.state = 'WAITING'
    jf.startdatetime = to_iso8601(self.now - timedelta(hours=2))
    jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=2))

    # hadoop debugging + actual job
    # same jar as hive but with different args
    jobflow_id = mock_conn.run_jobflow(name='j-HADOOP_DEBUGGING',
                                       log_uri='', enable_debugging=True,
                                       steps=[step()])
    jf = mock_conn.describe_jobflow(jobflow_id)
    self.mock_emr_job_flows['j-HADOOP_DEBUGGING'] = jf
    jf.state = 'WAITING'
    jf.creationdatetime = to_iso8601(self.now - timedelta(hours=6))
    jf.startdatetime = to_iso8601(self.now - timedelta(hours=5))
    # Need to reset times manually because mockboto resets them
    jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=5))
    jf.steps[1].startdatetime = to_iso8601(self.now - timedelta(hours=4))
    jf.steps[1].enddatetime = to_iso8601(self.now - timedelta(hours=2))

    # skip cancelled steps
    self.mock_emr_job_flows['j-IDLE_AND_FAILED'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[
            step(start_time_back=4, end_time_back=3, state='FAILED'),
            step(
                state='CANCELLED',
            )
        ],
    )

    # add job flow IDs and fake names to the mock job flows
    for jfid, jf in self.mock_emr_job_flows.iteritems():
        jf.jobflowid = jfid
        jf.name = jfid[2:].replace('_', ' ').title() + ' Job Flow'
def ago(**kwargs):
    """Return an ISO8601 timestamp *kwargs* (timedelta units) before
    self.now, or None if any of the given values is None."""
    for value in kwargs.values():
        if value is None:
            return None
    return to_iso8601(self.now - timedelta(**kwargs))
def create_fake_job_flows(self):
    """Populate self.mock_emr_job_flows with mock job flows in a wide
    range of states, plus mock S3 lock files for the locking cases.
    All timestamps are relative to self.now.
    """
    self.now = datetime.utcnow().replace(microsecond=0)
    self.add_mock_s3_data({'my_bucket': {}})

    # Build a step object easily
    # also make it respond to .args()
    def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
             args=['-mapper', 'my_job.py --mapper',
                   '-reducer', 'my_job.py --reducer'],
             state='COMPLETE',
             create_hours_ago=None,
             start_hours_ago=None,
             end_hours_ago=None,
             name='Streaming Step',
             action_on_failure='TERMINATE_CLUSTER',
             **kwargs):
        # *_hours_ago arguments are hours before self.now
        if create_hours_ago:
            kwargs['creationdatetime'] = to_iso8601(
                self.now - timedelta(hours=create_hours_ago))
        if start_hours_ago:
            kwargs['startdatetime'] = to_iso8601(
                self.now - timedelta(hours=start_hours_ago))
        if end_hours_ago:
            kwargs['enddatetime'] = to_iso8601(
                self.now - timedelta(hours=end_hours_ago))
        # wrap each arg so it has a .value attribute
        kwargs['args'] = [MockEmrObject(value=a) for a in args]
        return MockEmrObject(
            jar=jar, state=state, name=name,
            action_on_failure=action_on_failure, **kwargs)

    # empty job
    self.mock_emr_job_flows['j-EMPTY'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        state='STARTING',
        steps=[],
    )

    # job that's bootstrapping
    self.mock_emr_job_flows['j-BOOTSTRAPPING'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        startdatetime=to_iso8601(
            self.now - timedelta(hours=9, minutes=55)),
        state='BOOTSTRAPPING',
        steps=[step(create_hours_ago=10, state='PENDING')],
    )

    # currently running job
    self.mock_emr_job_flows['j-CURRENTLY_RUNNING'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(
            self.now - timedelta(hours=4, minutes=10)),
        startdatetime=to_iso8601(
            self.now - timedelta(hours=4, minutes=15)),
        state='RUNNING',
        steps=[step(start_hours_ago=4, state='RUNNING')],
    )

    # finished job flow
    self.mock_emr_job_flows['j-DONE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        enddatetime=to_iso8601(self.now - timedelta(hours=5)),
        readydatetime=to_iso8601(self.now - timedelta(hours=8)),
        startdatetime=to_iso8601(self.now - timedelta(hours=9)),
        state='COMPLETE',
        steps=[step(start_hours_ago=8, end_hours_ago=6)],
    )

    # idle job flow
    self.mock_emr_job_flows['j-DONE_AND_IDLE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(
            self.now - timedelta(hours=5, minutes=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[step(start_hours_ago=4, end_hours_ago=2)],
    )

    # idle job flow with an active lock
    self.mock_emr_job_flows['j-IDLE_AND_LOCKED'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(
            self.now - timedelta(hours=5, minutes=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[step(start_hours_ago=4, end_hours_ago=2)],
    )
    # lock file modified just now (held by someone else)
    self.add_mock_s3_data({
        'my_bucket': {
            'locks/j-IDLE_AND_LOCKED/2': b'not_you',
        },
    }, time_modified=datetime.utcnow())

    # idle job flow with an expired lock
    self.mock_emr_job_flows['j-IDLE_AND_EXPIRED'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(
            self.now - timedelta(hours=5, minutes=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[step(start_hours_ago=4, end_hours_ago=2)],
    )
    # lock file last modified 5 minutes ago
    self.add_mock_s3_data({
        'my_bucket': {
            'locks/j-IDLE_AND_EXPIRED/2': b'not_you',
        },
    }, time_modified=datetime.utcnow() - timedelta(minutes=5))

    # idle job flow whose last step has no end time
    self.mock_emr_job_flows['j-IDLE_BUT_INCOMPLETE_STEPS'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(
            self.now - timedelta(hours=5, minutes=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[step(start_hours_ago=4, end_hours_ago=None)],
    )

    # hive job flow (looks completed but isn't)
    self.mock_emr_job_flows['j-HIVE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(
            self.now - timedelta(hours=5, minutes=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[step(
            start_hours_ago=4,
            end_hours_ago=4,
            jar=('s3://us-east-1.elasticmapreduce/libs/script-runner/'
                 'script-runner.jar'),
            args=[],
        )],
    )

    # custom hadoop streaming jar
    self.mock_emr_job_flows['j-CUSTOM_DONE_AND_IDLE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(
            self.now - timedelta(hours=5, minutes=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[step(
            start_hours_ago=4,
            end_hours_ago=4,
            jar=('s3://my_bucket/tmp/somejob/files/'
                 'oddjob-0.0.3-SNAPSHOT-standalone.jar'),
            args=[],
        )],
    )

    mock_conn = MockEmrConnection()

    # hadoop debugging without any other steps
    jobflow_id = mock_conn.run_jobflow(
        name='j-DEBUG_ONLY', log_uri='', enable_debugging=True,
        now=self.now - timedelta(hours=3, minutes=5))
    jf = mock_conn.describe_jobflow(jobflow_id)
    self.mock_emr_job_flows['j-DEBUG_ONLY'] = jf
    jf.state = 'WAITING'
    jf.startdatetime = to_iso8601(
        self.now - timedelta(hours=3))
    jf.readydatetime = to_iso8601(
        self.now - timedelta(hours=2, minutes=55))
    jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=2))

    # hadoop debugging + actual job
    # same jar as hive but with different args
    jobflow_id = mock_conn.run_jobflow(
        name='j-HADOOP_DEBUGGING', log_uri='', enable_debugging=True,
        now=self.now - timedelta(hours=6))
    jf = mock_conn.describe_jobflow(jobflow_id)
    self.mock_emr_job_flows['j-HADOOP_DEBUGGING'] = jf
    jf.steps.append(step())
    jf.state = 'WAITING'
    jf.startdatetime = to_iso8601(self.now - timedelta(hours=5))
    jf.readydatetime = to_iso8601(
        self.now - timedelta(hours=4, minutes=55))
    # Need to reset times manually because mockboto resets them
    jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=5))
    jf.steps[1].startdatetime = to_iso8601(self.now - timedelta(hours=4))
    jf.steps[1].enddatetime = to_iso8601(self.now - timedelta(hours=2))

    # should skip cancelled steps
    self.mock_emr_job_flows['j-IDLE_AND_FAILED'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(
            self.now - timedelta(hours=5, minutes=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[
            step(start_hours_ago=4, end_hours_ago=3, state='FAILED'),
            step(
                state='CANCELLED',
            )
        ],
    )

    # pooled job flow reaching end of full hour
    self.mock_emr_job_flows['j-POOLED'] = MockEmrObject(
        bootstrapactions=[
            MockEmrObject(args=[], name='action 0'),
            MockEmrObject(args=[
                MockEmrObject(
                    value='pool-0123456789abcdef0123456789abcdef'),
                MockEmrObject(value='reflecting'),
            ], name='master'),
        ],
        creationdatetime=to_iso8601(self.now - timedelta(hours=1)),
        readydatetime=to_iso8601(self.now - timedelta(minutes=50)),
        startdatetime=to_iso8601(self.now - timedelta(minutes=55)),
        state='WAITING',
        steps=[],
    )

    # job flow that has had pending jobs but hasn't run them
    self.mock_emr_job_flows['j-PENDING_BUT_IDLE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=3)),
        readydatetime=to_iso8601(
            self.now - timedelta(hours=2, minutes=50)),
        startdatetime=to_iso8601(
            self.now - timedelta(hours=2, minutes=55)),
        state='RUNNING',
        steps=[step(create_hours_ago=3, state='PENDING')],
    )

    # add job flow IDs and fake names to the mock job flows
    for jfid, jf in self.mock_emr_job_flows.items():
        jf.jobflowid = jfid
        jf.name = jfid[2:].replace('_', ' ').title() + ' Job Flow'
def create_fake_job_flows(self):
    """Populate self.mock_emr_job_flows with mock job flows in many
    different states, keyed by fake job flow ID.  All timestamps are
    relative to self.now.
    """
    self.now = datetime.utcnow().replace(microsecond=0)

    # Build a step object easily
    # also make it respond to .args()
    def step(
        jar="/home/hadoop/contrib/streaming/hadoop-streaming.jar",
        args=["-mapper", "my_job.py --mapper",
              "-reducer", "my_job.py --reducer"],
        state="COMPLETE",
        create_hours_ago=None,
        start_hours_ago=None,
        end_hours_ago=None,
        name="Streaming Step",
        action_on_failure="TERMINATE_JOB_FLOW",
        **kwargs
    ):
        # *_hours_ago arguments are hours before self.now
        if create_hours_ago:
            kwargs["creationdatetime"] = to_iso8601(
                self.now - timedelta(hours=create_hours_ago))
        if start_hours_ago:
            kwargs["startdatetime"] = to_iso8601(
                self.now - timedelta(hours=start_hours_ago))
        if end_hours_ago:
            kwargs["enddatetime"] = to_iso8601(
                self.now - timedelta(hours=end_hours_ago))
        # wrap each arg so it has a .value attribute
        kwargs["args"] = [MockEmrObject(value=a) for a in args]
        return MockEmrObject(jar=jar, state=state, name=name,
                             action_on_failure=action_on_failure, **kwargs)

    # empty job
    self.mock_emr_job_flows["j-EMPTY"] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        state="STARTING"
    )

    # job that's bootstrapping
    self.mock_emr_job_flows["j-BOOTSTRAPPING"] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        startdatetime=to_iso8601(self.now - timedelta(hours=9, minutes=55)),
        state="BOOTSTRAPPING",
        steps=[step(create_hours_ago=10, state="PENDING")],
    )

    # currently running job
    self.mock_emr_job_flows["j-CURRENTLY_RUNNING"] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(self.now - timedelta(hours=4, minutes=10)),
        startdatetime=to_iso8601(self.now - timedelta(hours=4, minutes=15)),
        state="RUNNING",
        steps=[step(start_hours_ago=4, state="RUNNING")],
    )

    # finished job flow
    self.mock_emr_job_flows["j-DONE"] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        enddatetime=to_iso8601(self.now - timedelta(hours=5)),
        readydatetime=to_iso8601(self.now - timedelta(hours=8)),
        startdatetime=to_iso8601(self.now - timedelta(hours=9)),
        state="COMPLETE",
        steps=[step(start_hours_ago=8, end_hours_ago=6)],
    )

    # idle job flow
    self.mock_emr_job_flows["j-DONE_AND_IDLE"] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(self.now - timedelta(hours=5, minutes=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state="WAITING",
        steps=[step(start_hours_ago=4, end_hours_ago=2)],
    )

    # hive job flow (looks completed but isn't)
    self.mock_emr_job_flows["j-HIVE"] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(self.now - timedelta(hours=5, minutes=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state="WAITING",
        steps=[
            step(
                start_hours_ago=4,
                end_hours_ago=4,
                jar=("s3://us-east-1.elasticmapreduce/libs/script-runner/"
                     "script-runner.jar"),
                args=[],
            )
        ],
    )

    # custom hadoop streaming jar
    self.mock_emr_job_flows["j-CUSTOM_DONE_AND_IDLE"] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(self.now - timedelta(hours=5, minutes=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state="WAITING",
        steps=[
            step(
                start_hours_ago=4,
                end_hours_ago=4,
                jar=("s3://my_bucket/tmp/somejob/files/"
                     "oddjob-0.0.3-SNAPSHOT-standalone.jar"),
                args=[],
            )
        ],
    )

    mock_conn = MockEmrConnection()

    # hadoop debugging without any other steps
    jobflow_id = mock_conn.run_jobflow(
        name="j-DEBUG_ONLY", log_uri="", enable_debugging=True,
        now=self.now - timedelta(hours=3, minutes=5)
    )
    jf = mock_conn.describe_jobflow(jobflow_id)
    self.mock_emr_job_flows["j-DEBUG_ONLY"] = jf
    jf.state = "WAITING"
    jf.startdatetime = to_iso8601(self.now - timedelta(hours=3))
    jf.readydatetime = to_iso8601(self.now - timedelta(hours=2, minutes=55))
    jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=2))

    # hadoop debugging + actual job
    # same jar as hive but with different args
    jobflow_id = mock_conn.run_jobflow(
        name="j-HADOOP_DEBUGGING",
        log_uri="",
        enable_debugging=True,
        steps=[step()],
        now=self.now - timedelta(hours=6),
    )
    jf = mock_conn.describe_jobflow(jobflow_id)
    self.mock_emr_job_flows["j-HADOOP_DEBUGGING"] = jf
    jf.state = "WAITING"
    jf.startdatetime = to_iso8601(self.now - timedelta(hours=5))
    jf.readydatetime = to_iso8601(self.now - timedelta(hours=4, minutes=55))
    # Need to reset times manually because mockboto resets them
    jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=5))
    jf.steps[1].startdatetime = to_iso8601(self.now - timedelta(hours=4))
    jf.steps[1].enddatetime = to_iso8601(self.now - timedelta(hours=2))

    # should skip cancelled steps
    self.mock_emr_job_flows["j-IDLE_AND_FAILED"] = MockEmrObject(
        state="WAITING",
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        readydatetime=to_iso8601(self.now - timedelta(hours=5, minutes=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[step(start_hours_ago=4, end_hours_ago=3, state="FAILED"),
               step(state="CANCELLED")],
    )

    # pooled job flow reaching end of full hour
    self.mock_emr_job_flows["j-POOLED"] = MockEmrObject(
        bootstrapactions=[
            MockEmrObject(args=[]),
            MockEmrObject(
                args=[
                    MockEmrObject(
                        value="pool-0123456789abcdef0123456789abcdef"),
                    MockEmrObject(value="reflecting"),
                ]
            ),
        ],
        creationdatetime=to_iso8601(self.now - timedelta(hours=1)),
        readydatetime=to_iso8601(self.now - timedelta(minutes=50)),
        startdatetime=to_iso8601(self.now - timedelta(minutes=55)),
        state="WAITING",
        steps=[],
    )

    # job flow that has had pending jobs but hasn't run them
    self.mock_emr_job_flows["j-PENDING_BUT_IDLE"] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=3)),
        readydatetime=to_iso8601(self.now - timedelta(hours=2, minutes=50)),
        startdatetime=to_iso8601(self.now - timedelta(hours=2, minutes=55)),
        state="RUNNING",
        steps=[step(create_hours_ago=3, state="PENDING")],
    )

    # add job flow IDs and fake names to the mock job flows
    for jfid, jf in self.mock_emr_job_flows.iteritems():
        jf.jobflowid = jfid
        jf.name = jfid[2:].replace("_", " ").title() + " Job Flow"
def create_fake_job_flows(self):
    """Populate self.mock_emr_job_flows with hand-built mock job flows
    (each step is constructed inline as a MockEmrObject), keyed by fake
    job flow ID.  All timestamps are relative to self.now.
    """
    self.now = datetime.utcnow().replace(microsecond=0)

    # empty job
    self.mock_emr_job_flows['j-EMPTY'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        steps=[],
    )

    # currently running job
    self.mock_emr_job_flows['j-CURRENTLY_RUNNING'] = MockEmrObject(
        state='RUNNING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[
            MockEmrObject(
                startdatetime=to_iso8601(self.now - timedelta(hours=4)),
                jar='/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar',
                state='RUNNING',
            )
        ],
    )

    # finished job flow
    self.mock_emr_job_flows['j-DONE'] = MockEmrObject(
        state='COMPLETE',
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        startdatetime=to_iso8601(self.now - timedelta(hours=9)),
        enddatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[
            MockEmrObject(
                startdatetime=to_iso8601(self.now - timedelta(hours=8)),
                enddatetime=to_iso8601(self.now - timedelta(hours=6)),
                jar='/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar',
                state='COMPLETE',
            )
        ],
    )

    # idle job flow
    self.mock_emr_job_flows['j-DONE_AND_IDLE'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[
            MockEmrObject(
                startdatetime=to_iso8601(self.now - timedelta(hours=4)),
                enddatetime=to_iso8601(self.now - timedelta(hours=2)),
                jar='/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar',
                state='COMPLETE',
            )
        ],
    )

    # hive job flow (looks completed but isn't)
    self.mock_emr_job_flows['j-HIVE'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[
            MockEmrObject(
                startdatetime=to_iso8601(self.now - timedelta(hours=4)),
                enddatetime=to_iso8601(self.now - timedelta(hours=4)),
                jar='s3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
                state='COMPLETE',
            )
        ],
    )

    # hadoop debugging + actual job
    # hadoop debugging looks the same to us as Hive (they use the same
    # jar). The difference is that there's also a streaming step.
    self.mock_emr_job_flows['j-HADOOP_DEBUGGING'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[
            MockEmrObject(
                startdatetime=to_iso8601(self.now - timedelta(hours=5)),
                enddatetime=to_iso8601(self.now - timedelta(hours=5)),
                jar='s3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
                state='COMPLETE',
            ),
            MockEmrObject(
                startdatetime=to_iso8601(self.now - timedelta(hours=4)),
                enddatetime=to_iso8601(self.now - timedelta(hours=2)),
                jar='/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar',
                state='COMPLETE',
            )
        ],
    )

    # skip cancelled steps
    self.mock_emr_job_flows['j-IDLE_AND_FAILED'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[
            MockEmrObject(
                startdatetime=to_iso8601(self.now - timedelta(hours=4)),
                enddatetime=to_iso8601(self.now - timedelta(hours=3)),
                jar='/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar',
                state='FAILED',
            ),
            MockEmrObject(
                jar='/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar',
                state='CANCELLED',
            )
        ],
    )

    # add job flow IDs and fake names to the mock job flows
    for jfid, jf in self.mock_emr_job_flows.iteritems():
        jf.jobflowid = jfid
        jf.name = jfid[2:].replace('_', ' ').title() + ' Job Flow'
def create_fake_job_flows(self):
    """Populate self.mock_emr_job_flows with mock job flows in various
    states (including a pooled job flow), keyed by fake job flow ID.
    All timestamps are relative to self.now.
    """
    self.now = datetime.utcnow().replace(microsecond=0)

    # empty job
    self.mock_emr_job_flows['j-EMPTY'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        state='WAITING',
    )

    # Build a step object easily
    # also make it respond to .args()
    def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
             args=[
                 '-mapper', 'my_job.py --mapper',
                 '-reducer', 'my_job.py --reducer'
             ],
             state='COMPLETE',
             start_time_back=None,
             end_time_back=None,
             name='Streaming Step',
             action_on_failure='TERMINATE_JOB_FLOW',
             **kwargs):
        # *_time_back arguments are hours before self.now
        if start_time_back:
            kwargs['startdatetime'] = to_iso8601(self.now - timedelta(
                hours=start_time_back))
        if end_time_back:
            kwargs['enddatetime'] = to_iso8601(self.now - timedelta(
                hours=end_time_back))
        # wrap each arg so it has a .value attribute
        kwargs['args'] = [MockEmrObject(value=a) for a in args]
        return MockEmrObject(jar=jar, state=state, name=name,
                             action_on_failure=action_on_failure, **kwargs)

    # currently running job
    self.mock_emr_job_flows['j-CURRENTLY_RUNNING'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(
            self.now - timedelta(hours=4, minutes=15)),
        state='RUNNING',
        steps=[step(start_time_back=4, state='RUNNING')],
    )

    # finished job flow
    self.mock_emr_job_flows['j-DONE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=10)),
        enddatetime=to_iso8601(self.now - timedelta(hours=5)),
        startdatetime=to_iso8601(self.now - timedelta(hours=9)),
        state='COMPLETE',
        steps=[step(start_time_back=8, end_time_back=6)],
    )

    # idle job flow
    self.mock_emr_job_flows['j-DONE_AND_IDLE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[step(start_time_back=4, end_time_back=2)],
    )

    # hive job flow (looks completed but isn't)
    self.mock_emr_job_flows['j-HIVE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[
            step(
                start_time_back=4,
                end_time_back=4,
                jar=('s3://us-east-1.elasticmapreduce/libs/script-runner/'
                     'script-runner.jar'),
                args=[],
            )
        ],
    )

    # custom hadoop streaming jar
    self.mock_emr_job_flows['j-CUSTOM_DONE_AND_IDLE'] = MockEmrObject(
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        state='WAITING',
        steps=[
            step(
                start_time_back=4,
                end_time_back=4,
                jar=('s3://my_bucket/tmp/somejob/files/'
                     'oddjob-0.0.3-SNAPSHOT-standalone.jar'),
                args=[],
            )
        ],
    )

    mock_conn = MockEmrConnection()

    # hadoop debugging without any other steps
    jobflow_id = mock_conn.run_jobflow(name='j-DEBUG_ONLY', log_uri='',
                                       enable_debugging=True)
    jf = mock_conn.describe_jobflow(jobflow_id)
    self.mock_emr_job_flows['j-DEBUG_ONLY'] = jf
    jf.state = 'WAITING'
    jf.startdatetime = to_iso8601(self.now - timedelta(hours=2))
    jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=2))

    # hadoop debugging + actual job
    # same jar as hive but with different args
    jobflow_id = mock_conn.run_jobflow(name='j-HADOOP_DEBUGGING',
                                       log_uri='', enable_debugging=True,
                                       steps=[step()])
    jf = mock_conn.describe_jobflow(jobflow_id)
    self.mock_emr_job_flows['j-HADOOP_DEBUGGING'] = jf
    jf.state = 'WAITING'
    jf.creationdatetime = to_iso8601(self.now - timedelta(hours=6))
    jf.startdatetime = to_iso8601(self.now - timedelta(hours=5))
    # Need to reset times manually because mockboto resets them
    jf.steps[0].enddatetime = to_iso8601(self.now - timedelta(hours=5))
    jf.steps[1].startdatetime = to_iso8601(self.now - timedelta(hours=4))
    jf.steps[1].enddatetime = to_iso8601(self.now - timedelta(hours=2))

    # skip cancelled steps
    self.mock_emr_job_flows['j-IDLE_AND_FAILED'] = MockEmrObject(
        state='WAITING',
        creationdatetime=to_iso8601(self.now - timedelta(hours=6)),
        startdatetime=to_iso8601(self.now - timedelta(hours=5)),
        steps=[
            step(start_time_back=4, end_time_back=3, state='FAILED'),
            step(state='CANCELLED', )
        ],
    )

    # pooled job flow reaching end of full hour
    self.mock_emr_job_flows['j-POOLED'] = MockEmrObject(
        bootstrapactions=[
            MockEmrObject(args=[]),
            MockEmrObject(args=[
                MockEmrObject(
                    value='pool-0123456789abcdef0123456789abcdef'),
                MockEmrObject(value='reflecting'),
            ]),
        ],
        creationdatetime=to_iso8601(self.now - timedelta(hours=1)),
        startdatetime=to_iso8601(self.now - timedelta(minutes=55)),
        state='WAITING',
        steps=[],
    )

    # add job flow IDs and fake names to the mock job flows
    for jfid, jf in self.mock_emr_job_flows.iteritems():
        jf.jobflowid = jfid
        jf.name = jfid[2:].replace('_', ' ').title() + ' Job Flow'
def create_fake_job_flows(self):
    """Populate self.mock_emr_job_flows with canned job flows covering
    the situations the code under test must tell apart: an empty flow, a
    running flow, a finished flow, an idle flow, a Hive-only flow (looks
    complete but the flow itself is WAITING), a debugging-step + streaming
    flow, and a flow with a FAILED step followed by a CANCELLED one.

    Side effects: sets ``self.now`` (UTC, microseconds zeroed) so every
    fake timestamp is expressed relative to one reference instant, and
    stamps each mock flow with a ``jobflowid`` and a readable ``name``.
    """
    self.now = datetime.utcnow().replace(microsecond=0)

    # the two jar paths the fixtures distinguish between
    streaming_jar = '/home/hadoop/contrib/streaming/hadoop-0.20-streaming.jar'
    script_runner_jar = (
        's3://us-east-1.elasticmapreduce/libs/script-runner/'
        'script-runner.jar')

    def iso_hours_ago(hours):
        # ISO8601 timestamp for *hours* before the reference instant
        return to_iso8601(self.now - timedelta(hours=hours))

    flows = self.mock_emr_job_flows

    # empty job
    flows['j-EMPTY'] = MockEmrObject(
        state='WAITING',
        creationdatetime=iso_hours_ago(10),
        steps=[],
    )

    # currently running job
    flows['j-CURRENTLY_RUNNING'] = MockEmrObject(
        state='RUNNING',
        creationdatetime=iso_hours_ago(6),
        startdatetime=iso_hours_ago(5),
        steps=[MockEmrObject(
            startdatetime=iso_hours_ago(4),
            jar=streaming_jar,
            state='RUNNING',
        )],
    )

    # finished job flow
    flows['j-DONE'] = MockEmrObject(
        state='COMPLETE',
        creationdatetime=iso_hours_ago(10),
        startdatetime=iso_hours_ago(9),
        enddatetime=iso_hours_ago(5),
        steps=[MockEmrObject(
            startdatetime=iso_hours_ago(8),
            enddatetime=iso_hours_ago(6),
            jar=streaming_jar,
            state='COMPLETE',
        )],
    )

    # idle job flow
    flows['j-DONE_AND_IDLE'] = MockEmrObject(
        state='WAITING',
        creationdatetime=iso_hours_ago(6),
        startdatetime=iso_hours_ago(5),
        steps=[MockEmrObject(
            startdatetime=iso_hours_ago(4),
            enddatetime=iso_hours_ago(2),
            jar=streaming_jar,
            state='COMPLETE',
        )],
    )

    # hive job flow (looks completed but isn't)
    flows['j-HIVE'] = MockEmrObject(
        state='WAITING',
        creationdatetime=iso_hours_ago(6),
        startdatetime=iso_hours_ago(5),
        steps=[MockEmrObject(
            startdatetime=iso_hours_ago(4),
            enddatetime=iso_hours_ago(4),
            jar=script_runner_jar,
            state='COMPLETE',
        )],
    )

    # hadoop debugging + actual job
    # hadoop debugging looks the same to us as Hive (they use the same
    # jar). The difference is that there's also a streaming step.
    flows['j-HADOOP_DEBUGGING'] = MockEmrObject(
        state='WAITING',
        creationdatetime=iso_hours_ago(6),
        startdatetime=iso_hours_ago(5),
        steps=[
            MockEmrObject(
                startdatetime=iso_hours_ago(5),
                enddatetime=iso_hours_ago(5),
                jar=script_runner_jar,
                state='COMPLETE',
            ),
            MockEmrObject(
                startdatetime=iso_hours_ago(4),
                enddatetime=iso_hours_ago(2),
                jar=streaming_jar,
                state='COMPLETE',
            ),
        ],
    )

    # skip cancelled steps
    flows['j-IDLE_AND_FAILED'] = MockEmrObject(
        state='WAITING',
        creationdatetime=iso_hours_ago(6),
        startdatetime=iso_hours_ago(5),
        steps=[
            MockEmrObject(
                startdatetime=iso_hours_ago(4),
                enddatetime=iso_hours_ago(3),
                jar=streaming_jar,
                state='FAILED',
            ),
            MockEmrObject(
                jar=streaming_jar,
                state='CANCELLED',
            ),
        ],
    )

    # add job flow IDs and fake names to the mock job flows
    for flow_id, flow in self.mock_emr_job_flows.iteritems():
        flow.jobflowid = flow_id
        flow.name = flow_id[2:].replace('_', ' ').title() + ' Job Flow'