def test_uri_parsing(self):
    self.assertEqual(is_uri('notauri!'), False)
    self.assertEqual(is_uri('they://did/the/monster/mash'), True)
    self.assertEqual(is_s3_uri('s3://a/uri'), True)
    self.assertEqual(is_s3_uri('s3n://a/uri'), True)
    self.assertEqual(is_s3_uri('hdfs://a/uri'), False)
    self.assertEqual(parse_s3_uri('s3://bucket/loc'), ('bucket', 'loc'))

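# The URI helpers exercised above aren't shown in this section. Below is a
# minimal, illustrative sketch consistent with these assertions; the regexes
# and function bodies are assumptions, not the library's actual code.

import re

_URI_RE = re.compile(r'^[a-z][a-z0-9+\-.]*://', re.I)
_S3_URI_RE = re.compile(r'^s3n?://([^/]+)/(.*)$', re.I)


def is_uri(path):
    """True if *path* looks like a URI, e.g. 'they://did/the/monster/mash'."""
    return bool(_URI_RE.match(path))


def is_s3_uri(uri):
    """True for s3:// and s3n:// URIs; False for anything else (e.g. hdfs://)."""
    return bool(_S3_URI_RE.match(uri))


def parse_s3_uri(uri):
    """Split an S3 URI into (bucket, key): 's3://bucket/loc' -> ('bucket', 'loc')."""
    match = _S3_URI_RE.match(uri)
    if not match:
        raise ValueError('not an S3 URI: %r' % uri)
    return match.group(1), match.group(2)
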
def _s3_ls(self, uri):
    """Helper for ls(); doesn't bother with globbing or directories"""
    bucket_name, key_name = parse_s3_uri(uri)
    bucket = self.get_bucket(bucket_name)
    for key in bucket.list(key_name):
        yield s3_key_to_uri(key)

def _get_s3_key(self, uri):
    """Get the boto3 s3.Object matching the given S3 uri, or return None
    if that key doesn't exist.

    uri is an S3 URI: ``s3://foo/bar``
    """
    bucket_name, key_name = parse_s3_uri(uri)
    return self.get_bucket(bucket_name).Object(key_name)

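# Note: boto3's Bucket.Object() builds a lazy resource and doesn't touch S3,
# so _get_s3_key() itself can't tell whether the key exists; callers have to
# probe. A hedged sketch of one way to do that (the helper name and 404
# handling below are illustrative, not necessarily how this class does it):

import botocore.exceptions


def key_exists(s3_object):
    """Return True if the given boto3 s3.Object actually exists in S3."""
    try:
        s3_object.load()  # issues a HEAD request
        return True
    except botocore.exceptions.ClientError as ex:
        if ex.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
            return False
        raise
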
def _s3_ls(self, uri):
    """Helper for ls(); doesn't bother with globbing or directories"""
    s3_conn = self.make_s3_conn()
    bucket_name, key_name = parse_s3_uri(uri)
    bucket = s3_conn.get_bucket(bucket_name, validate=VALIDATE_BUCKET)
    for key in bucket.list(key_name):
        yield s3_key_to_uri(key)

def make_s3_key(self, uri):
    """Create the given S3 key, and return the corresponding
    boto Key object.

    uri is an S3 URI: ``s3://foo/bar``
    """
    bucket_name, key_name = parse_s3_uri(uri)
    return self.get_bucket(bucket_name).new_key(key_name)

def get_s3_keys(self, uri):
    """Get a stream of boto Key objects for each key inside
    the given dir on S3.

    uri is an S3 URI: ``s3://foo/bar``
    """
    bucket_name, key_prefix = parse_s3_uri(uri)
    bucket = self.get_bucket(bucket_name)
    for key in bucket.list(key_prefix):
        yield key

def ls(self, path_glob):
    """Recursively list files on S3.

    This doesn't list "directories" unless there's actually a
    corresponding key ending with a '/' (which is weird and confusing;
    don't make S3 keys ending in '/')

    To list a directory, path_glob must end with a trailing
    slash (foo and foo/ are different on S3)
    """
    log.debug("ls %s", path_glob)

    # clean up the base uri to ensure we have an equal uri to boto (s3://)
    # just in case we get passed s3n://
    scheme = urlparse(path_glob).scheme

    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    # Check if we're only going to get results by using a / on the end
    uris = self._s3_ls(base_uri)
    try:
        first = uris.next()
        uris = chain([first], uris)
    except (boto.exception.S3ResponseError, StopIteration):
        try:
            uris = self._s3_ls(base_uri.rstrip("/") + "/")
        except (boto.exception.S3ResponseError, StopIteration):
            return

    prev_uri = None
    for uri in uris:
        uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri))

        # enforce globbing
        if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
            continue

        # If there are keys /data and /data/my_file then we consider there
        # to be a file /data, overriding there being a directory called
        # /data containing a file my_file. We discard /data/my_file.
        if prev_uri is not None and uri.startswith(prev_uri):
            continue

        yield uri
        prev_uri = uri.rstrip("/") + "/"

def test_cleanup(self):
    runner = EMRJobRunner(conf_paths=[], s3_sync_wait_time=0.01)

    # add some mock data and change last_modified
    remote_input_path = 's3://walrus/data/'
    self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n',
                                      'data/bar': 'bar\n',
                                      'data/qux': 'qux\n'}})

    s3_conn = runner.make_s3_conn()
    bucket_name, key_name = parse_s3_uri(remote_input_path)
    bucket = s3_conn.get_bucket(bucket_name)
    key_foo = bucket.get_key('data/foo')
    key_bar = bucket.get_key('data/bar')
    key_qux = bucket.get_key('data/qux')
    key_bar.last_modified = datetime.now() - timedelta(days=45)
    key_qux.last_modified = datetime.now() - timedelta(hours=50)

    # make sure keys are there
    assert isinstance(key_foo, MockKey)
    assert isinstance(key_bar, MockKey)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(days=30), dry_run=True,
               conf_paths=[])

    # dry-run shouldn't delete anything
    assert isinstance(key_foo, MockKey)
    assert isinstance(key_bar, MockKey)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(days=30), conf_paths=[])

    key_foo = bucket.get_key('data/foo')
    key_bar = bucket.get_key('data/bar')
    key_qux = bucket.get_key('data/qux')

    # make sure key_bar is deleted
    assert isinstance(key_foo, MockKey)
    self.assertEqual(key_bar, None)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(hours=48), conf_paths=[])

    key_foo = bucket.get_key('data/foo')
    key_bar = bucket.get_key('data/bar')
    key_qux = bucket.get_key('data/qux')

    # make sure key_qux is deleted
    assert isinstance(key_foo, MockKey)
    self.assertEqual(key_bar, None)
    self.assertEqual(key_qux, None)

def get_s3_key(self, uri, s3_conn=None):
    """Get the boto Key object matching the given S3 uri, or
    return None if that key doesn't exist.

    uri is an S3 URI: ``s3://foo/bar``

    You may optionally pass in an existing s3 connection through
    ``s3_conn``.
    """
    if not s3_conn:
        s3_conn = self.make_s3_conn()
    bucket_name, key_name = parse_s3_uri(uri)

    return s3_conn.get_bucket(bucket_name).get_key(key_name)

def make_s3_key(self, uri, s3_conn=None):
    """Create the given S3 key, and return the corresponding
    boto Key object.

    uri is an S3 URI: ``s3://foo/bar``

    You may optionally pass in an existing S3 connection through
    ``s3_conn``.
    """
    if not s3_conn:
        s3_conn = self.make_s3_conn()
    bucket_name, key_name = parse_s3_uri(uri)

    return s3_conn.get_bucket(bucket_name).new_key(key_name)

def get_s3_keys(self, uri, s3_conn=None):
    """Get a stream of boto Key objects for each key inside
    the given dir on S3.

    uri is an S3 URI: ``s3://foo/bar``

    You may optionally pass in an existing S3 connection through s3_conn
    """
    if not s3_conn:
        s3_conn = self.make_s3_conn()
    bucket_name, key_prefix = parse_s3_uri(uri)
    bucket = s3_conn.get_bucket(bucket_name)
    for key in bucket.list(key_prefix):
        yield key

def make_s3_key(self, uri, s3_conn=None):
    """Create the given S3 key, and return the corresponding
    boto Key object.

    uri is an S3 URI: ``s3://foo/bar``

    You may optionally pass in an existing S3 connection through
    ``s3_conn``.
    """
    if not s3_conn:
        s3_conn = self.make_s3_conn()
    bucket_name, key_name = parse_s3_uri(uri)

    return s3_conn.get_bucket(
        bucket_name, validate=VALIDATE_BUCKET).new_key(key_name)

def get_s3_keys(self, uri, s3_conn=None):
    """Get a stream of boto Key objects for each key inside
    the given dir on S3.

    uri is an S3 URI: ``s3://foo/bar``

    You may optionally pass in an existing S3 connection through s3_conn
    """
    if not s3_conn:
        s3_conn = self.make_s3_conn()
    bucket_name, key_prefix = parse_s3_uri(uri)
    bucket = _get_bucket(s3_conn, bucket_name)
    for key in bucket.list(key_prefix):
        yield key

def mkdir(self, dest):
    """Make a directory. This doesn't actually create directories on S3
    (because there is no such thing), but it will create the corresponding
    bucket if it doesn't exist.
    """
    bucket_name, key_name = parse_s3_uri(dest)
    client = self.make_s3_client()

    try:
        client.head_bucket(Bucket=bucket_name)
    except botocore.exceptions.ClientError as ex:
        if _client_error_status(ex) != 404:
            raise

        self.create_bucket(bucket_name)

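# _client_error_status() isn't defined in this section. A plausible, minimal
# sketch (assumption: it just pulls the HTTP status code out of the botocore
# error response):

def _client_error_status(ex):
    """Best-effort HTTP status code from a botocore ClientError."""
    resp = getattr(ex, 'response', None) or {}
    return resp.get('ResponseMetadata', {}).get('HTTPStatusCode')
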
def ls(self, path_glob):
    """Recursively list files on S3.

    *path_glob* can include ``?`` to match single characters or
    ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
    ``/``.

    .. versionchanged:: 0.5.0

        You no longer need a trailing slash to list "directories" on S3;
        both ``ls('s3://b/dir')`` and ``ls('s3://b/dir/')`` will list all
        keys starting with ``dir/``.
    """
    # clean up the base uri to ensure we have an equal uri to boto (s3://)
    # just in case we get passed s3n://
    scheme = urlparse(path_glob).scheme

    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    bucket_name, base_name = parse_s3_uri(base_uri)

    # allow subdirectories of the path/glob
    if path_glob and not path_glob.endswith('/'):
        dir_glob = path_glob + '/*'
    else:
        dir_glob = path_glob + '*'

    bucket = self.get_bucket(bucket_name)
    for key in bucket.list(base_name):
        uri = "%s://%s/%s" % (scheme, bucket_name, key.name)

        # enforce globbing
        if not (fnmatch.fnmatchcase(uri, path_glob) or
                fnmatch.fnmatchcase(uri, dir_glob)):
            continue

        yield uri

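# Self-contained illustration of the dir_glob trick described in the
# versionchanged note above: matching against both path_glob and
# path_glob + '/*' is what lets ls() list "directories" without a trailing
# slash (bucket and key names here are made up).

import fnmatch

path_glob = 's3://b/dir'       # no trailing slash
dir_glob = path_glob + '/*'    # also match keys "inside" the directory

for uri in ['s3://b/dir', 's3://b/dir/part-00000', 's3://b/dirty']:
    matches = (fnmatch.fnmatchcase(uri, path_glob) or
               fnmatch.fnmatchcase(uri, dir_glob))
    print(uri, matches)
# s3://b/dir True
# s3://b/dir/part-00000 True
# s3://b/dirty False
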
def get_s3_key(self, uri):
    """Get the boto Key object matching the given S3 uri, or
    return None if that key doesn't exist.

    uri is an S3 URI: ``s3://foo/bar``
    """
    bucket_name, key_name = parse_s3_uri(uri)

    try:
        bucket = self.get_bucket(bucket_name)
    except boto.exception.S3ResponseError as e:
        if e.status != 404:
            raise e
        key = None
    else:
        key = bucket.get_key(key_name)

    return key

def get_s3_key(self, uri, s3_conn=None):
    """Get the boto Key object matching the given S3 uri, or
    return None if that key doesn't exist.

    uri is an S3 URI: ``s3://foo/bar``

    You may optionally pass in an existing s3 connection through
    ``s3_conn``.
    """
    if not s3_conn:
        s3_conn = self.make_s3_conn()
    bucket_name, key_name = parse_s3_uri(uri)

    try:
        bucket = s3_conn.get_bucket(bucket_name)
    except boto.exception.S3ResponseError as e:
        if e.status != 404:
            raise e
        key = None
    else:
        key = bucket.get_key(key_name)

    return key

def _ls(self, path_glob):
    """Helper method for :py:meth:`ls`; yields tuples of
    ``(uri, key)`` where *key* is the corresponding boto3 s3.ObjectSummary.
    """
    # clean up the base uri to ensure we pass boto3 an s3:// URI
    # (not s3n://)
    scheme = urlparse(path_glob).scheme

    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    bucket_name, base_name = parse_s3_uri(base_uri)

    # allow subdirectories of the path/glob
    if path_glob and not path_glob.endswith('/'):
        dir_glob = path_glob + '/*'
    else:
        dir_glob = path_glob + '*'

    try:
        bucket = self.get_bucket(bucket_name)
    except botocore.exceptions.ClientError as ex:
        if _client_error_status(ex) == 404:  # treat nonexistent as empty
            return
        raise

    for key in bucket.objects.filter(Prefix=base_name):
        uri = "%s://%s/%s" % (scheme, bucket_name, key.key)

        # enforce globbing
        if not (fnmatch.fnmatchcase(uri, path_glob) or
                fnmatch.fnmatchcase(uri, dir_glob)):
            continue

        yield uri, key

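# GLOB_RE isn't shown in this section. The assumption, based on how group(1)
# is used above, is that it captures everything before the first glob
# character so the S3 listing can start from the longest literal prefix. An
# illustrative pattern along those lines:

import re

GLOB_RE = re.compile(r'^(.*?)([\[\*\?].*)$')

print(GLOB_RE.match('s3://walrus/data/*.gz').group(1))  # s3://walrus/data/
print(GLOB_RE.match('s3://walrus/data/foo'))            # None (no wildcard)
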
def get_s3_folder_keys(self, uri, s3_conn=None):
    """.. deprecated:: 0.4.0

    Background: EMR used to fake directories on S3 by creating special
    ``*_$folder$`` keys in S3. That is no longer true, so this method is
    deprecated.

    For example if your job outputs ``s3://walrus/tmp/output/part-00000``,
    EMR will also create these keys:

    - ``s3://walrus/tmp_$folder$``
    - ``s3://walrus/tmp/output_$folder$``

    If you want to grant another Amazon user access to your files so they
    can use them in S3, you must grant read access on the actual keys,
    plus any ``*_$folder$`` keys that "contain" your keys; otherwise
    EMR will error out with a permissions error.

    This gets all the ``*_$folder$`` keys associated with the given URI,
    as boto Key objects.

    This does not support globbing.

    You may optionally pass in an existing S3 connection through
    ``s3_conn``.
    """
    log.warning(
        'get_s3_folder_keys() is deprecated and will be removed in v0.5.0')

    if not s3_conn:
        s3_conn = self.make_s3_conn()

    bucket_name, key_name = parse_s3_uri(uri)
    bucket = _get_bucket(s3_conn, bucket_name)

    dirs = key_name.split('/')
    for i in range(len(dirs)):
        folder_name = '/'.join(dirs[:i]) + '_$folder$'
        key = bucket.get_key(folder_name)
        if key:
            yield key

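# For a concrete picture of the *_$folder$ convention described in the
# docstring above, here is a tiny standalone version of the key-name loop
# (pure string manipulation, no S3 access):

def folder_key_names(key_name):
    """Yield the *_$folder$ key names that 'contain' key_name."""
    dirs = key_name.split('/')
    for i in range(len(dirs)):
        yield '/'.join(dirs[:i]) + '_$folder$'

print(list(folder_key_names('tmp/output/part-00000')))
# ['_$folder$', 'tmp_$folder$', 'tmp/output_$folder$']
# (the bare '_$folder$' from i=0 never exists in S3, so get_key() just
# returns None for it and it gets skipped)
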
def ls(self, path_glob):
    """Recursively list files on S3.

    This doesn't list "directories" unless there's actually a
    corresponding key ending with a '/' (which is weird and confusing;
    don't make S3 keys ending in '/')

    To list a directory, path_glob must end with a trailing
    slash (foo and foo/ are different on S3)
    """
    # clean up the base uri to ensure we have an equal uri to boto (s3://)
    # just in case we get passed s3n://
    scheme = urlparse(path_glob).scheme

    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # if it's a "file" (doesn't end with /), just check if it exists
    if not glob_match and not path_glob.endswith('/'):
        uri = path_glob
        if self.get_s3_key(uri):
            yield uri
        return

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    for uri in self._s3_ls(base_uri):
        uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri))

        # enforce globbing
        if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
            continue

        yield uri

def s3_cleanup(glob_path, time_old, dry_run=False, conf_path=None):
    """Delete all files older than *time_old* in *glob_path*.

    If *dry_run* is ``True``, then just log the files that need to be
    deleted without actually deleting them
    """
    runner = EMRJobRunner(conf_path=conf_path)
    s3_conn = runner.make_s3_conn()

    log.info("Deleting all files in %s that are older than %s" %
             (glob_path, time_old))

    for path in runner.ls(glob_path):
        bucket_name, key_name = parse_s3_uri(path)
        bucket = s3_conn.get_bucket(bucket_name)

        for key in bucket.list(key_name):
            last_modified = iso8601_to_datetime(key.last_modified)
            age = datetime.utcnow() - last_modified
            if age > time_old:
                # Delete it
                log.info("Deleting %s; is %s old" % (key.name, age))
                if not dry_run:
                    key.delete()

def s3_cleanup(glob_path, time_old, dry_run=False, conf_paths=None):
    """Delete all files older than *time_old* in *glob_path*.

    If *dry_run* is ``True``, then just log the files that need to be
    deleted without actually deleting them
    """
    runner = EMRJobRunner(conf_paths=conf_paths)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path in runner.ls(glob_path):
        bucket_name, key_name = parse_s3_uri(path)
        bucket = runner.fs.get_bucket(bucket_name)

        for key in bucket.list(key_name):
            last_modified = iso8601_to_datetime(key.last_modified)
            age = datetime.utcnow() - last_modified
            if age > time_old:
                # Delete it
                log.info('Deleting %s; is %s old' % (key.name, age))
                if not dry_run:
                    key.delete()

def get_s3_folder_keys(self, uri, s3_conn=None):
    """.. deprecated:: 0.4.0

    Background: EMR used to fake directories on S3 by creating special
    ``*_$folder$`` keys in S3. That is no longer true, so this method is
    deprecated.

    For example if your job outputs ``s3://walrus/tmp/output/part-00000``,
    EMR will also create these keys:

    - ``s3://walrus/tmp_$folder$``
    - ``s3://walrus/tmp/output_$folder$``

    If you want to grant another Amazon user access to your files so they
    can use them in S3, you must grant read access on the actual keys,
    plus any ``*_$folder$`` keys that "contain" your keys; otherwise
    EMR will error out with a permissions error.

    This gets all the ``*_$folder$`` keys associated with the given URI,
    as boto Key objects.

    This does not support globbing.

    You may optionally pass in an existing S3 connection through
    ``s3_conn``.
    """
    if not s3_conn:
        s3_conn = self.make_s3_conn()

    bucket_name, key_name = parse_s3_uri(uri)
    bucket = s3_conn.get_bucket(bucket_name)

    dirs = key_name.split('/')
    for i in range(len(dirs)):
        folder_name = '/'.join(dirs[:i]) + '_$folder$'
        key = bucket.get_key(folder_name)
        if key:
            yield key

def _s3_cleanup(glob_path, time_old, dry_run=False, **runner_kwargs):
    """Delete all files older than *time_old* in *glob_path*.

    If *dry_run* is true, then just log the files that need to be
    deleted without actually deleting them
    """
    runner = EMRJobRunner(**runner_kwargs)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path in runner.fs.ls(glob_path):
        bucket_name, key_name = parse_s3_uri(path)
        bucket = runner.fs.get_bucket(bucket_name)

        for key in bucket.list(key_name):
            last_modified = iso8601_to_datetime(key.last_modified)
            age = datetime.utcnow() - last_modified
            if age > time_old:
                # Delete it
                log.info('Deleting %s; is %s old' % (key.name, age))
                if not dry_run:
                    key.delete()

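# The age check above hinges on parsing the key's ISO8601 last_modified
# timestamp. A minimal sketch of that comparison; iso8601_to_datetime() here
# is illustrative and assumes boto's millisecond timestamp format:

from datetime import datetime, timedelta


def iso8601_to_datetime(s):
    """Parse a timestamp such as '2012-03-01T04:05:06.000Z'."""
    return datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ')

age = datetime.utcnow() - iso8601_to_datetime('2012-03-01T04:05:06.000Z')
print(age > timedelta(days=30))  # True for anything more than 30 days old
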
def get_s3_folder_keys(self, uri, s3_conn=None):
    """Background: S3 is even less of a filesystem than HDFS in that it
    doesn't have directories. EMR fakes directories by creating special
    ``*_$folder$`` keys in S3.

    For example if your job outputs ``s3://walrus/tmp/output/part-00000``,
    EMR will also create these keys:

    - ``s3://walrus/tmp_$folder$``
    - ``s3://walrus/tmp/output_$folder$``

    If you want to grant another Amazon user access to your files so they
    can use them in S3, you must grant read access on the actual keys,
    plus any ``*_$folder$`` keys that "contain" your keys; otherwise
    EMR will error out with a permissions error.

    This gets all the ``*_$folder$`` keys associated with the given URI,
    as boto Key objects.

    This does not support globbing.

    You may optionally pass in an existing S3 connection through
    ``s3_conn``.
    """
    if not s3_conn:
        s3_conn = self.make_s3_conn()

    bucket_name, key_name = parse_s3_uri(uri)
    bucket = s3_conn.get_bucket(bucket_name)

    dirs = key_name.split('/')
    for i in range(len(dirs)):
        folder_name = '/'.join(dirs[:i]) + '_$folder$'
        key = bucket.get_key(folder_name)
        if key:
            yield key

def simulate_progress(self, jobflow_id, now=None):
    """Simulate progress on the given job flow. This is automatically
    run when we call describe_jobflow().

    :type jobflow_id: str
    :param jobflow_id: fake job flow ID
    :type now: py:class:`datetime.datetime`
    :param now: alternate time to use as the current time (should be UTC)
    """
    if now is None:
        now = datetime.utcnow()

    if self.simulation_iterator:
        try:
            self.simulation_iterator.next()
        except StopIteration:
            raise AssertionError(
                'Simulated progress too many times; bailing out')

    job_flow = self.mock_emr_job_flows[jobflow_id]

    # if job is STARTING, move it along to WAITING
    if job_flow.state == 'STARTING':
        job_flow.state = 'WAITING'
        job_flow.startdatetime = to_iso8601(now)
        # instances are now provisioned and running
        for ig in job_flow.instancegroups:
            ig.instancerunningcount = ig.instancerequestcount

    # if job is done, don't advance it
    if job_flow.state in ('COMPLETED', 'TERMINATED', 'FAILED'):
        return

    # if SHUTTING_DOWN, finish shutting down
    if job_flow.state == 'SHUTTING_DOWN':
        if job_flow.reason == 'Shut down as step failed':
            job_flow.state = 'FAILED'
        else:
            job_flow.state = 'TERMINATED'
        job_flow.enddatetime = to_iso8601(now)
        return

    # if a step is currently running, advance it
    steps = getattr(job_flow, 'steps', None) or []
    for step_num, step in enumerate(steps):
        # skip steps that are already done
        if step.state in ('COMPLETED', 'FAILED', 'CANCELLED'):
            continue

        if step.name in ('Setup Hadoop Debugging', ):
            step.state = 'COMPLETED'
            continue

        # allow steps to get stuck
        if getattr(step, 'mock_no_progress', None):
            return

        # found currently running step! going to handle it, then exit
        if step.state == 'PENDING':
            step.state = 'RUNNING'
            step.startdatetime = to_iso8601(now)
            return

        assert step.state == 'RUNNING'
        step.enddatetime = to_iso8601(now)

        # check if we're supposed to have an error
        if (jobflow_id, step_num) in self.mock_emr_failures:
            step.state = 'FAILED'
            reason = self.mock_emr_failures[(jobflow_id, step_num)]
            if reason:
                job_flow.reason = reason
            if step.actiononfailure == 'TERMINATE_JOB_FLOW':
                job_flow.state = 'SHUTTING_DOWN'
                if not reason:
                    job_flow.reason = 'Shut down as step failed'
            return

        step.state = 'COMPLETED'

        # create fake output if we're supposed to write to S3
        output_uri = self._get_step_output_uri(step)
        if output_uri and is_s3_uri(output_uri):
            mock_output = self.mock_emr_output.get(
                (jobflow_id, step_num)) or ['']
            bucket_name, key_name = parse_s3_uri(output_uri)

            # write output to S3
            for i, bytes in enumerate(mock_output):
                add_mock_s3_data(self.mock_s3_fs, {
                    bucket_name: {key_name + 'part-%05d' % i: bytes}})
        elif (jobflow_id, step_num) in self.mock_emr_output:
            raise AssertionError(
                "can't use output for job flow ID %s, step %d "
                "(it doesn't output to S3)" % (jobflow_id, step_num))

        # done!
        return

    # no pending steps. shut down job if appropriate
    if job_flow.keepjobflowalivewhennosteps == 'true':
        job_flow.state = 'WAITING'
        job_flow.reason = 'Waiting for steps to run'
    else:
        job_flow.state = 'COMPLETED'
        job_flow.reason = 'Steps Completed'

def simulate_progress(self, jobflow_id, now=None):
    """Simulate progress on the given job flow. This is automatically
    run when we call describe_jobflow().

    :type jobflow_id: str
    :param jobflow_id: fake job flow ID
    :type now: py:class:`datetime.datetime`
    :param now: alternate time to use as the current time (should be UTC)
    """
    if now is None:
        now = datetime.datetime.utcnow()

    if self.simulation_steps_left <= 0:
        raise AssertionError(
            "Simulated progress too many times; bailing out")
    self.simulation_steps_left -= 1

    job_flow = self.mock_emr_job_flows[jobflow_id]

    # if job is STARTING, move it along to WAITING
    if job_flow.state == "STARTING":
        job_flow.state = "WAITING"
        job_flow.startdatetime = to_iso8601(now)

    # if job is done, don't advance it
    if job_flow.state in ("COMPLETED", "TERMINATED", "FAILED"):
        return

    # if SHUTTING_DOWN, finish shutting down
    if job_flow.state == "SHUTTING_DOWN":
        if job_flow.reason == "Shut down as step failed":
            job_flow.state = "FAILED"
        else:
            job_flow.state = "TERMINATED"
        job_flow.enddatetime = to_iso8601(now)
        return

    # if a step is currently running, advance it
    for step_num, step in enumerate(job_flow.steps):
        # skip steps that are already done
        if step.state in ("COMPLETED", "FAILED", "CANCELLED"):
            continue

        if step.name in ("Setup Hadoop Debugging",):
            step.state = "COMPLETED"
            continue

        # found currently running step! going to handle it, then exit
        if step.state == "PENDING":
            step.state = "RUNNING"
            step.startdatetime = to_iso8601(now)
            return

        assert step.state == "RUNNING"
        step.enddatetime = to_iso8601(now)

        # check if we're supposed to have an error
        if (jobflow_id, step_num) in self.mock_emr_failures:
            step.state = "FAILED"
            reason = self.mock_emr_failures[(jobflow_id, step_num)]
            if reason:
                job_flow.reason = reason
            if step.actiononfailure == "TERMINATE_JOB_FLOW":
                job_flow.state = "SHUTTING_DOWN"
                if not reason:
                    job_flow.reason = "Shut down as step failed"
            return

        step.state = "COMPLETED"

        # create fake output if we're supposed to write to S3
        output_uri = self._get_step_output_uri(step)
        if output_uri and is_s3_uri(output_uri):
            mock_output = self.mock_emr_output.get(
                (jobflow_id, step_num)) or [""]
            bucket_name, key_name = parse_s3_uri(output_uri)

            # write output to S3
            for i, bytes in enumerate(mock_output):
                add_mock_s3_data(
                    self.mock_s3_fs,
                    {bucket_name: {key_name + "part-%05d" % i: bytes}})
        elif (jobflow_id, step_num) in self.mock_emr_output:
            raise AssertionError(
                "can't use output for job flow ID %s, step %d "
                "(it doesn't output to S3)" % (jobflow_id, step_num))

        # done!
        return

    # no pending steps. shut down job if appropriate
    if job_flow.keepjobflowalivewhennosteps == "true":
        job_flow.state = "WAITING"
        job_flow.reason = "Waiting for steps to run"
    else:
        job_flow.state = "COMPLETED"
        job_flow.reason = "Steps Completed"

def test_parse_s3_uri(self):
    self.assertEqual(parse_s3_uri('s3://bucket/loc'), ('bucket', 'loc'))

def _simulate_progress(self, cluster_id, now=None):
    """Simulate progress on the given cluster. This is automatically
    run when we call :py:meth:`describe_step`, and, when the cluster is
    ``TERMINATING``, :py:meth:`describe_cluster`.

    :type cluster_id: str
    :param cluster_id: fake cluster ID
    :type now: py:class:`datetime.datetime`
    :param now: alternate time to use as the current time (should be UTC)
    """
    # TODO: this doesn't actually update steps to CANCELLED when
    # cluster is shut down
    if now is None:
        now = _boto3_now()

    cluster = self.mock_emr_clusters[cluster_id]

    # allow clusters to get stuck
    if cluster.get('_DelayProgressSimulation', 0) > 0:
        cluster['_DelayProgressSimulation'] -= 1
        return

    # this code is pretty loose about updating StateChangeReason
    # (for the cluster, instance groups, and steps). Add this as needed.

    # if job is STARTING, move it along to BOOTSTRAPPING
    if cluster['Status']['State'] == 'STARTING':
        cluster['Status']['State'] = 'BOOTSTRAPPING'
        # master now has a hostname
        cluster['MasterPublicDnsName'] = 'master.%s.mock' % cluster['Id']
        # instances are now provisioned
        for ig in cluster['_InstanceGroups']:
            ig['RunningInstanceCount'] = ig['RequestedInstanceCount']
            ig['Status']['State'] = 'BOOTSTRAPPING'
        return

    # if job is TERMINATING, move along to terminated
    if cluster['Status']['State'] == 'TERMINATING':
        code = cluster['Status']['StateChangeReason'].get('Code')
        if code and code.endswith('_FAILURE'):
            cluster['Status']['State'] = 'TERMINATED_WITH_ERRORS'
        else:
            cluster['Status']['State'] = 'TERMINATED'
        return

    # if job is done, nothing to do
    if cluster['Status']['State'] in ('TERMINATED',
                                      'TERMINATED_WITH_ERRORS'):
        return

    # if job is BOOTSTRAPPING, move it along to RUNNING and continue
    if cluster['Status']['State'] == 'BOOTSTRAPPING':
        cluster['Status']['State'] = 'RUNNING'
        for ig in cluster['_InstanceGroups']:
            ig['Status']['State'] = 'RUNNING'

    # at this point, should be RUNNING or WAITING
    assert cluster['Status']['State'] in ('RUNNING', 'WAITING')

    # simulate self-termination
    if cluster_id in self.mock_emr_self_termination:
        cluster['Status']['State'] = 'TERMINATING'
        cluster['Status']['StateChangeReason'] = dict(
            Code='INSTANCE_FAILURE',
            Message='The master node was terminated. ',  # sic
        )
        for step in cluster['_Steps']:
            if step['Status']['State'] in ('PENDING', 'RUNNING'):
                step['Status']['State'] = 'CANCELLED'  # not INTERRUPTED
        return

    # try to find the next step, and advance it
    for step_num, step in enumerate(cluster['_Steps']):
        # skip steps that are already done
        if step['Status']['State'] in ('COMPLETED', 'FAILED',
                                       'CANCELLED', 'INTERRUPTED'):
            continue

        # found currently running step! handle it, then exit

        # start PENDING step
        if step['Status']['State'] == 'PENDING':
            step['Status']['State'] = 'RUNNING'
            step['Status']['Timeline']['StartDateTime'] = now
            return

        assert step['Status']['State'] == 'RUNNING'

        # check if we're supposed to have an error
        if (cluster_id, step_num) in self.mock_emr_failures:
            step['Status']['State'] = 'FAILED'

            if step['ActionOnFailure'] in ('TERMINATE_CLUSTER',
                                           'TERMINATE_JOB_FLOW'):
                cluster['Status']['State'] = 'TERMINATING'
                cluster['Status']['StateChangeReason']['Code'] = (
                    'STEP_FAILURE')
                cluster['Status']['StateChangeReason']['Message'] = (
                    'Shut down as step failed')

                for step in cluster['_Steps']:
                    if step['Status']['State'] in ('PENDING', 'RUNNING'):
                        step['Status']['State'] = 'CANCELLED'

            return

        # complete step
        step['Status']['State'] = 'COMPLETED'
        step['Status']['Timeline']['EndDateTime'] = now

        # create fake output if we're supposed to write to S3
        output_uri = self._get_step_output_uri(step['Config']['Args'])
        if output_uri and is_s3_uri(output_uri):
            mock_output = self.mock_emr_output.get(
                (cluster_id, step_num)) or [b'']

            bucket_name, key_name = parse_s3_uri(output_uri)

            # write output to S3
            for i, part in enumerate(mock_output):
                add_mock_s3_data(
                    self.mock_s3_fs,
                    {bucket_name: {key_name + 'part-%05d' % i: part}})
        elif (cluster_id, step_num) in self.mock_emr_output:
            raise AssertionError(
                "can't use output for cluster ID %s, step %d "
                "(it doesn't output to S3)" % (cluster_id, step_num))

        # done!

        # if this is the last step, continue to autotermination code, below
        if step_num < len(cluster['_Steps']) - 1:
            return

    # no pending steps. should we wait, or shut down?
    if cluster['AutoTerminate']:
        cluster['Status']['State'] = 'TERMINATING'
        cluster['Status']['StateChangeReason']['Code'] = (
            'ALL_STEPS_COMPLETED')
        cluster['Status']['StateChangeReason']['Message'] = (
            'Steps Completed')
    else:
        # just wait
        cluster['Status']['State'] = 'WAITING'
        cluster['Status']['StateChangeReason'] = {}

    return

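# Illustration of how the mock lays out step output above: part files are
# named by appending 'part-%05d' directly to the key prefix parsed from the
# output URI, so a trailing slash on the URI is what separates the part
# files from the prefix (all values below are made up).

output_uri = 's3://walrus/tmp/output/'
mock_output = [b'line1\nline2\n', b'line3\n']

bucket_name, key_name = 'walrus', 'tmp/output/'
for i, part in enumerate(mock_output):
    print('s3://%s/%s' % (bucket_name, key_name + 'part-%05d' % i))
# s3://walrus/tmp/output/part-00000
# s3://walrus/tmp/output/part-00001
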