def test_create_scratch_uri(self):
    # "walrus" bucket will be ignored; it doesn't start with "mrjob-"
    self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

    runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

    # bucket name should be mrjob- plus 16 random hex digits
    s3_scratch_uri = runner._opts['s3_scratch_uri']
    assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
    assert_equal(s3_scratch_uri[27:], '/tmp/')

    # bucket shouldn't actually exist yet
    scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
    assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

    # need to do something to ensure that the bucket actually gets
    # created. let's launch a (mock) job flow
    jfid = runner.make_persistent_job_flow()
    assert_in(scratch_bucket, self.mock_s3_fs.keys())
    runner.make_emr_conn().terminate_jobflow(jfid)

    # once our scratch bucket is created, we should re-use it
    runner2 = EMRJobRunner(conf_path=False)
    assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)
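
# A minimal sketch (not EMRJobRunner's actual implementation) of how a
# scratch URI with the shape asserted above could be generated: the
# 's3://mrjob-' prefix is 11 characters, 16 random hex digits bring the
# bucket portion to 27 characters, and '/tmp/' follows.
# make_example_scratch_uri is a hypothetical helper, named here only
# for illustration.
import random

def make_example_scratch_uri():
    # 2 ** 64 - 1 is the largest value that fits in 16 hex digits
    return 's3://mrjob-%016x/tmp/' % random.randint(0, 2 ** 64 - 1)
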
def s3_cleanup(glob_path, time_old, dry_run=False, conf_path=None):
    """Delete all files older than *time_old* matching *glob_path*.

    If *dry_run* is ``True``, just log the files that would be deleted
    without actually deleting them.
    """
    runner = EMRJobRunner(conf_path=conf_path)
    s3_conn = runner.make_s3_conn()

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path in runner.ls(glob_path):
        bucket_name, key_name = parse_s3_uri(path)
        bucket = s3_conn.get_bucket(bucket_name)

        for key in bucket.list(key_name):
            last_modified = iso8601_to_datetime(key.last_modified)
            age = datetime.utcnow() - last_modified

            if age > time_old:
                # delete it
                log.info('Deleting %s; is %s old' % (key.name, age))
                if not dry_run:
                    key.delete()
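
# Hypothetical usage of s3_cleanup: since the age check above compares
# a datetime.timedelta (datetime.utcnow() - last_modified) against
# time_old, time_old should itself be a timedelta. The glob path below
# is made up for illustration; dry_run=True only logs what would be
# deleted.
from datetime import timedelta

s3_cleanup('s3://mrjob-*/tmp/*', timedelta(days=30), dry_run=True)
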
def simulate_progress(self, jobflow_id, now=None):
    """Simulate progress on the given job flow. This is automatically
    run when we call describe_jobflow().

    :type jobflow_id: str
    :param jobflow_id: fake job flow ID
    :type now: :py:class:`datetime.datetime`
    :param now: alternate time to use as the current time (should be UTC)
    """
    if now is None:
        now = datetime.datetime.utcnow()

    if self.simulation_steps_left <= 0:
        raise AssertionError(
            'Simulated progress too many times; bailing out')
    self.simulation_steps_left -= 1

    job_flow = self.mock_emr_job_flows[jobflow_id]

    # if job is STARTING, move it along to WAITING
    if job_flow.state == 'STARTING':
        job_flow.state = 'WAITING'
        job_flow.startdatetime = to_iso8601(now)

    # if job is done, don't advance it
    if job_flow.state in ('COMPLETED', 'TERMINATED', 'FAILED'):
        return

    # if SHUTTING_DOWN, finish shutting down
    if job_flow.state == 'SHUTTING_DOWN':
        if job_flow.reason == 'Shut down as step failed':
            job_flow.state = 'FAILED'
        else:
            job_flow.state = 'TERMINATED'
        job_flow.enddatetime = to_iso8601(now)
        return

    # if a step is currently running, advance it
    for step_num, step in enumerate(job_flow.steps):
        # skip steps that are already done
        if step.state in ('COMPLETED', 'FAILED', 'CANCELLED'):
            continue

        # the debugging setup step always succeeds immediately
        if step.name in ('Setup Hadoop Debugging',):
            step.state = 'COMPLETED'
            continue

        # found currently running step! going to handle it, then exit
        if step.state == 'PENDING':
            step.state = 'RUNNING'
            step.startdatetime = to_iso8601(now)
            return

        assert step.state == 'RUNNING'
        step.enddatetime = to_iso8601(now)

        # check if we're supposed to have an error
        if (jobflow_id, step_num) in self.mock_emr_failures:
            step.state = 'FAILED'
            reason = self.mock_emr_failures[(jobflow_id, step_num)]
            if reason:
                job_flow.reason = reason
            if step.actiononfailure == 'TERMINATE_JOB_FLOW':
                job_flow.state = 'SHUTTING_DOWN'
                if not reason:
                    job_flow.reason = 'Shut down as step failed'
            return

        step.state = 'COMPLETED'

        # create fake output if we're supposed to write to S3
        output_uri = self._get_step_output_uri(step)
        if output_uri and S3_URI_RE.match(output_uri):
            mock_output = self.mock_emr_output.get(
                (jobflow_id, step_num)) or ['']

            bucket_name, key_name = parse_s3_uri(output_uri)

            # write output to S3
            for i, bytes in enumerate(mock_output):
                add_mock_s3_data(self.mock_s3_fs, {
                    bucket_name: {key_name + 'part-%05d' % i: bytes}})
        elif (jobflow_id, step_num) in self.mock_emr_output:
            raise AssertionError(
                "can't use output for job flow ID %s, step %d "
                "(it doesn't output to S3)" % (jobflow_id, step_num))

        # done!
        return

    # no pending steps. shut down job if appropriate
    if job_flow.keepjobflowalivewhennosteps:
        job_flow.state = 'WAITING'
        job_flow.reason = 'Waiting for steps to run'
    else:
        job_flow.state = 'COMPLETED'
        job_flow.reason = 'Steps Completed'
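
# A minimal sketch of the to_iso8601/iso8601_to_datetime helpers the
# code above depends on, assuming boto's '%Y-%m-%dT%H:%M:%SZ' timestamp
# format; the real helpers may differ.
import datetime

ISO8601_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

def to_iso8601(when):
    # serialize a datetime the way boto renders job flow timestamps
    return when.strftime(ISO8601_FORMAT)

def iso8601_to_datetime(s):
    # inverse of to_iso8601
    return datetime.datetime.strptime(s, ISO8601_FORMAT)
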