def main(args=None):
    """Print a report on long-running EMR jobs.

    Parses the command line, lists clusters in the ``STARTING``,
    ``BOOTSTRAPPING`` or ``RUNNING`` state, hands them to
    ``_find_long_running_jobs()`` and prints whatever it returns with
    ``_print_report()``.

    :param args: argument list passed to the option parser's
                 ``parse_args()``. Positional arguments are rejected.
    """
    # take the timestamp before doing any other work
    now = _boto3_now()

    parser = _make_option_parser()
    options, positional = parser.parse_args(args)
    if positional:
        parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)
    log.info('getting information about running jobs')

    # threshold handed to _find_long_running_jobs()
    min_time = timedelta(hours=options.min_hours)

    runner = EMRJobRunner(**_runner_kwargs(options))
    emr_client = runner.make_emr_client()

    states_of_interest = ['STARTING', 'BOOTSTRAPPING', 'RUNNING']
    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=states_of_interest)

    _print_report(
        _find_long_running_jobs(
            emr_client, cluster_summaries, min_time, now=now))
def _maybe_terminate_clusters(dry_run=False,
                              max_mins_idle=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Go through WAITING and RUNNING clusters and terminate the idle ones
    that pass every filter, then log how many clusters were in each state.

    :param dry_run: passed through to ``_terminate_and_notify()``
    :param max_mins_idle: only clusters idle for more than this many minutes
                          are terminated; ``None`` means
                          ``_DEFAULT_MAX_MINS_IDLE``
    :param now: current time; defaults to ``_boto3_now()``
    :param pool_name: if set, skip clusters whose pool name differs
    :param pooled_only: skip clusters that are not in a pool
    :param unpooled_only: skip clusters that are in a pool
    :param max_mins_locked: passed through to ``_terminate_and_notify()``
    :param quiet: passed through to ``_terminate_and_notify()``
    :param kwargs: keyword arguments for the ``EMRJobRunner`` constructor

    Clusters whose ``TerminationProtected`` flag is set are never terminated.
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    idle_limit_mins = (
        _DEFAULT_MAX_MINS_IDLE if max_mins_idle is None else max_mins_idle)

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # number of clusters seen in each state
    tally = dict.fromkeys(
        ('starting', 'bootstrapping', 'running', 'pending', 'idle', 'done'),
        0)

    # states that rule a cluster out before we look at its steps,
    # checked in this order
    early_checks = (
        ('done', _is_cluster_done),
        ('starting', _is_cluster_starting),
        ('bootstrapping', _is_cluster_bootstrapping),
    )

    # include RUNNING to catch clusters with PENDING jobs that
    # never ran (see #365).
    summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['WAITING', 'RUNNING'])

    for summary in summaries:
        cluster_id = summary['Id']

        early_state = None
        for state, check in early_checks:
            if check(summary):
                early_state = state
                break

        if early_state is not None:
            tally[early_state] += 1
            continue

        # need steps to learn more about cluster
        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))
        steps.reverse()

        if any(_is_step_running(step) for step in steps):
            tally['running'] += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        _unused_hash, pool = _pool_hash_and_name(cluster)

        status = 'pending' if is_pending else 'idle'
        tally[status] += 1

        idle_for = strip_microseconds(time_idle)
        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool
        cluster_name = summary['Name']
        protection = (
            'protected' if cluster['TerminationProtected'] else 'unprotected')

        log.debug('cluster %s %s for %s, %s (%s) - %s' % (
            cluster_id, status, idle_for, pool_desc, cluster_name,
            protection))

        # filter out clusters that don't meet our criteria
        not_idle_long_enough = (
            idle_limit_mins is not None and
            time_idle <= timedelta(minutes=idle_limit_mins))
        if not_idle_long_enough:
            continue

        excluded_by_pool = (
            (pooled_only and pool is None) or
            (unpooled_only and pool is not None) or
            (pool_name is not None and pool != pool_name))
        if excluded_by_pool or cluster['TerminationProtected']:
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            tally['starting'], tally['bootstrapping'], tally['running'],
            tally['pending'], tally['idle'], tally['done']))
def setUp(self):
    """Run the parent fixture, then attach a runner bound to job flow
    ``j-MOCKCLUSTER0``."""
    super(LogFetchingTestCase, self).setUp()

    runner_opts = dict(
        conf_paths=[],
        s3_sync_wait_time=0,
        emr_job_flow_id='j-MOCKCLUSTER0',
    )
    self.runner = EMRJobRunner(**runner_opts)
def main(args=None):
    """Entry point for the create_job_flow tool.

    Builds an ``EMRJobRunner`` from the options in *args* (arguments come
    from ``sys.argv``), makes a persistent job flow, and prints its ID to
    ``sys.stdout``.
    """
    kwargs = runner_kwargs(args)
    job_flow_id = EMRJobRunner(**kwargs).make_persistent_job_flow()
    print(job_flow_id)
def inspect_and_maybe_terminate_job_flows(
        conf_paths=None,
        dry_run=False,
        max_hours_idle=None,
        mins_to_end_of_hour=None,
        now=None,
        pool_name=None,
        pooled_only=False,
        unpooled_only=False,
        max_mins_locked=None,
        quiet=False,
        **kwargs):
    """Classify every job flow, collect the idle/pending ones that pass all
    the filters, and hand them to ``terminate_and_notify()``.

    :param conf_paths: passed to the ``EMRJobRunner`` constructor
    :param dry_run: passed through to ``terminate_and_notify()``
    :param max_hours_idle: if set, skip job flows that have been idle for
                           this many hours or less. When both this and
                           *mins_to_end_of_hour* are ``None``, it defaults to
                           ``DEFAULT_MAX_HOURS_IDLE``.
    :param mins_to_end_of_hour: if set, skip job flows with pending steps and
                                job flows whose estimated time to the end of
                                the hour is this many minutes or more
    :param now: current time; defaults to ``datetime.utcnow()``
    :param pool_name: if set, skip job flows whose pool name differs
    :param pooled_only: skip job flows that are not in a pool
    :param unpooled_only: skip job flows that are in a pool
    :param max_mins_locked: passed through to ``terminate_and_notify()``
    :param quiet: passed through to ``terminate_and_notify()``
    :param kwargs: extra keyword arguments for ``EMRJobRunner``
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_paths=conf_paths, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, whether it has pending steps,
    # idle time, estimated time to end of hour); both times are timedeltas
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 'pending' if pending else 'idle',
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                (pending or
                 time_to_end_of_hour >= timedelta(
                     minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append(
                (jf, pending, time_idle, time_to_end_of_hour))

    # the arguments must be in the same order as the labels in the message:
    # bootstrapping comes before running
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' % (
            num_bootstrapping, num_running, num_pending, num_idle,
            num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
def test_ap_southeast_1(self):
    """With region ap-southeast-1, S3 gets a regional endpoint and making
    an EMR connection raises."""
    runner = EMRJobRunner(conf_path=False, aws_region='ap-southeast-1')

    s3_endpoint = runner.make_s3_conn().endpoint
    assert_equal(s3_endpoint, 's3-ap-southeast-1.amazonaws.com')

    assert_raises(Exception, runner.make_emr_conn)
def make_runner(self):
    """Create ``self.runner`` and point its job log URI at
    ``BUCKET_URI + LOG_DIR``."""
    runner = EMRJobRunner(
        conf_path=False,
        s3_scratch_uri='s3://walrus/tmp',
        s3_sync_wait_time=0,
    )
    self.runner = runner
    runner._s3_job_log_uri = BUCKET_URI + LOG_DIR
def main(args=None):
    """Entry point for the create_cluster tool.

    Builds an ``EMRJobRunner`` from the options in *args* (arguments come
    from ``sys.argv``), makes a persistent cluster, and prints its ID to
    ``sys.stdout``.
    """
    kwargs = _runner_kwargs(args)
    new_cluster_id = EMRJobRunner(**kwargs).make_persistent_cluster()
    print(new_cluster_id)
def main():
    """Command-line tool to list, display, and parse the Hadoop logs of an
    EMR job flow.

    Takes the job flow ID as its first positional argument. The actions
    (``--list``, ``--list-all``, ``--cat``, ``--cat-all``, ``--counters``,
    ``--find-failure``) are independent; every one that is requested runs,
    in that order.

    Exits with a usage error (via ``OptionParser.error()``) when no job flow
    ID is given.
    """
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')

    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option('-f', '--find-failure', dest='find_failure',
                             action='store_true', default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l', '--list', dest='list_relevant',
                             action="store_true", default=False,
                             help='List log files MRJob finds relevant')

    option_parser.add_option('-L', '--list-all', dest='list_all',
                             action="store_true", default=False,
                             help='List all log files')

    option_parser.add_option('-a', '--cat', dest='cat_relevant',
                             action="store_true", default=False,
                             help='Cat log files MRJob finds relevant')

    option_parser.add_option('-A', '--cat-all', dest='cat_all',
                             action="store_true", default=False,
                             help='Cat all log files to JOB_FLOW_ID/')

    option_parser.add_option('-s', '--step-num', dest='step_num',
                             action='store', type='int', default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters', dest='get_counters',
                             action='store_true', default=False,
                             help='Show counters from the job flow')

    # copy these options over from MRJob so this tool accepts them too
    assignments = {
        option_parser: ('conf_path', 'quiet', 'verbose',
                        'ec2_key_pair_file')
    }

    mr_job = MRJob()
    job_option_groups = (mr_job.option_parser, mr_job.mux_opt_group,
                         mr_job.proto_opt_group, mr_job.runner_opt_group,
                         mr_job.hadoop_emr_opt_group, mr_job.emr_opt_group,
                         mr_job.hadoop_opts_opt_group)
    scrape_options_into_new_groups(job_option_groups, assignments)

    options, args = option_parser.parse_args()

    # args[0] is used as the job flow ID below; fail with a usage message
    # rather than an IndexError when it is missing
    if not args:
        option_parser.error('Must specify an EMR job flow ID')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    if options.step_num:
        step_nums = [options.step_num]
    else:
        step_nums = None

    # everything left in the options after removing this tool's own switches
    # is passed to the runner
    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('quiet', 'verbose', 'list_relevant', 'list_all',
                       'cat_relevant', 'cat_all', 'get_counters', 'step_num',
                       'find_failure'):
        del runner_kwargs[unused_arg]

    with EMRJobRunner(emr_job_flow_id=args[0], **runner_kwargs) as runner:
        if options.list_relevant:
            list_relevant(runner, step_nums)

        if options.list_all:
            list_all(runner)

        if options.cat_relevant:
            cat_relevant(runner, step_nums)

        if options.cat_all:
            cat_all(runner)

        if options.get_counters:
            desc = runner._describe_jobflow()
            runner._set_s3_job_log_uri(desc)
            # step numbers are 1-based
            runner._fetch_counters(
                xrange(1, len(desc.steps) + 1), skip_s3_wait=True)
            runner.print_counters()

        if options.find_failure:
            find_failure(runner, options.step_num)
def test_pick_scratch_uri(self):
    """Given buckets ``mrjob-walrus`` and ``zebra``, the runner's scratch
    URI ends up in ``mrjob-walrus``."""
    mock_buckets = {'mrjob-walrus': {}, 'zebra': {}}
    self.add_mock_s3_data(mock_buckets)

    runner = EMRJobRunner(conf_path=False)

    scratch_uri = runner._opts['s3_scratch_uri']
    assert_equal(scratch_uri, 's3://mrjob-walrus/tmp/')
def test_explicit_endpoints(self):
    """Explicitly configured endpoints are used even when a region is set."""
    runner_opts = dict(
        conf_path=False,
        aws_region='EU',
        s3_endpoint='s3-proxy',
        emr_endpoint='emr-proxy',
    )
    runner = EMRJobRunner(**runner_opts)

    emr_endpoint = runner.make_emr_conn().endpoint
    assert_equal(emr_endpoint, 'emr-proxy')

    s3_endpoint = runner.make_s3_conn().endpoint
    assert_equal(s3_endpoint, 's3-proxy')
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Go through every cluster and terminate the idle ones that pass every
    filter, then log how many clusters were in each state.

    :param dry_run: passed through to ``_terminate_and_notify()``
    :param max_hours_idle: if set, skip clusters idle for this many hours or
                           less. When both this and *mins_to_end_of_hour*
                           are ``None``, it defaults to
                           ``_DEFAULT_MAX_HOURS_IDLE``.
    :param mins_to_end_of_hour: if set, skip clusters with pending steps and
                                clusters whose estimated time to the end of
                                the hour is this many minutes or more
    :param now: current time; defaults to ``_boto3_now()``
    :param pool_name: if set, skip clusters whose pool name differs
    :param pooled_only: skip clusters that are not in a pool
    :param unpooled_only: skip clusters that are in a pool
    :param max_mins_locked: passed through to ``_terminate_and_notify()``
    :param quiet: passed through to ``_terminate_and_notify()``
    :param kwargs: keyword arguments for the ``EMRJobRunner`` constructor
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # number of clusters seen in each state
    tally = dict.fromkeys(
        ('starting', 'bootstrapping', 'running', 'pending', 'idle', 'done'),
        0)

    # states that rule a cluster out before we look at its steps,
    # checked in this order
    early_checks = (
        ('done', _is_cluster_done),
        ('starting', _is_cluster_starting),
        ('bootstrapping', _is_cluster_bootstrapping),
    )

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    summaries = _boto3_paginate('Clusters', emr_client, 'list_clusters')

    for summary in summaries:
        cluster_id = summary['Id']

        early_state = None
        for state, check in early_checks:
            if check(summary):
                early_state = state
                break

        if early_state is not None:
            tally[early_state] += 1
            continue

        # need steps to learn more about cluster
        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))
        steps.reverse()

        if any(_is_step_running(step) for step in steps):
            tally['running'] += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(summary, steps)
        time_to_end_of_hour = _est_time_to_hour(summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        _unused_hash, pool = _pool_hash_and_name(cluster)

        status = 'pending' if is_pending else 'idle'
        tally[status] += 1

        idle_for = strip_microseconds(time_idle)
        left_in_hour = strip_microseconds(time_to_end_of_hour)
        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool

        log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' % (
            cluster_id, status, idle_for, left_in_hour, pool_desc,
            summary['Name']))

        # filter out clusters that don't meet our criteria
        not_idle_long_enough = (
            max_hours_idle is not None and
            time_idle <= timedelta(hours=max_hours_idle))
        if not_idle_long_enough:
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if mins_to_end_of_hour is not None:
            if is_pending:
                continue
            if time_to_end_of_hour >= timedelta(minutes=mins_to_end_of_hour):
                continue

        excluded_by_pool = (
            (pooled_only and pool is None) or
            (unpooled_only and pool is not None) or
            (pool_name is not None and pool != pool_name))
        if excluded_by_pool:
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            time_to_end_of_hour=time_to_end_of_hour,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            tally['starting'], tally['bootstrapping'], tally['running'],
            tally['pending'], tally['idle'], tally['done']))
def main():
    """Command-line tool to inspect job flow pools.

    Depending on the switches given, it lists the pools (``--all``), prints
    the last of the runner's usable job flows (``--find``), and/or calls
    ``terminate()`` with the ``--terminate`` value. Each requested action
    runs, in that order.
    """
    option_parser = OptionParser(
        usage='%prog [options]',
        description=(
            'Inspect available job flow pools or identify job flows suitable'
            ' for running a job with the specified options.'))

    import boto.emr.connection
    boto.emr.connection.JobFlow.Fields.add('HadoopVersion')

    def make_option_group(halp):
        # create a titled group and register it with the parser
        group = OptionGroup(option_parser, halp)
        option_parser.add_option_group(group)
        return group

    ec2_opt_group = make_option_group('EC2 instance configuration')
    hadoop_opt_group = make_option_group('Hadoop configuration')
    job_opt_group = make_option_group('Job flow configuration')

    # which MRJob options (by dest) to copy into which group
    assignments = {
        option_parser: (
            'conf_path',
            'emr_job_flow_pool_name',
            'quiet',
            'verbose',
        ),
        ec2_opt_group: (
            'aws_availability_zone',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_key_pair_file',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'emr_endpoint',
            'num_ec2_instances',
        ),
        hadoop_opt_group: (
            'hadoop_version',
            'label',
            'owner',
        ),
        job_opt_group: (
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_mrjob',
            'bootstrap_python_packages',
        ),
    }

    option_parser.add_option(
        '-a', '--all', action='store_true', default=False, dest='list_all',
        help=('List all available job flows without'
              ' filtering by configuration'))

    option_parser.add_option(
        '-f', '--find', action='store_true', default=False, dest='find',
        help=('Find a job flow matching the pool name,'
              ' bootstrap configuration, and instance'
              ' number/type as specified on the command'
              ' line and in the configuration files'))

    option_parser.add_option(
        '-t', '--terminate', action='store', default=None, dest='terminate',
        metavar='JOB_FLOW_ID',
        help=('Terminate all job flows in the given pool'
              ' (defaults to pool "default")'))

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)

    options, args = option_parser.parse_args()

    log_to_stream(name='mrjob', debug=options.verbose)

    # whatever is left after dropping this tool's own switches goes to
    # the runner
    runner_kwargs = options.__dict__.copy()
    tool_only_opts = ('quiet', 'verbose', 'list_all', 'find', 'terminate')
    for opt_name in tool_only_opts:
        runner_kwargs.pop(opt_name)

    runner = EMRJobRunner(**runner_kwargs)

    if options.list_all:
        pprint_pools(runner)

    if options.find:
        candidates = runner.usable_job_flows()
        if not candidates:
            print('No idle job flows match criteria')
        else:
            print('You should use this one:')
            pprint_job_flow(candidates[-1])

    if options.terminate:
        terminate(runner, options.terminate)
def inspect_and_maybe_terminate_job_flows(
        conf_path=None,
        dry_run=False,
        max_hours_idle=None,
        mins_to_end_of_hour=None,
        now=None,
        pool_name=None,
        pooled_only=False,
        unpooled_only=False):
    """Classify every job flow, collect the idle ones that pass all the
    filters, and hand them to ``terminate_and_notify()``.

    :param conf_path: passed to the ``EMRJobRunner`` constructor
    :param dry_run: passed through to ``terminate_and_notify()``
    :param max_hours_idle: if set, skip job flows idle for this many hours
                           or less. When both this and *mins_to_end_of_hour*
                           are ``None``, it defaults to
                           ``DEFAULT_MAX_HOURS_IDLE``.
    :param mins_to_end_of_hour: if set, skip job flows whose time to the end
                                of the hour is this many minutes or more
    :param now: current time; defaults to ``datetime.utcnow()``
    :param pool_name: if set, skip job flows whose pool name differs
    :param pooled_only: skip job flows that are not in a pool
    :param unpooled_only: skip job flows that are in a pool
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    # number of job flows seen in each state
    tally = dict.fromkeys(('running', 'idle', 'non_streaming', 'done'), 0)

    # a list of tuples of job flow id, name, idle time and time to end of
    # hour (both as timedeltas)
    to_terminate = []

    for jf in job_flows:
        # check if job flow is done
        if is_job_flow_done(jf):
            tally['done'] += 1
            continue

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        if is_job_flow_non_streaming(jf):
            tally['non_streaming'] += 1
            continue

        if is_job_flow_running(jf):
            tally['running'] += 1
            continue

        tally['idle'] += 1

        time_idle = time_job_flow_idle(jf, now=now)
        time_to_end_of_hour = time_to_end_of_hour_for_job_flow(jf, now=now)
        pool = job_flow_pool_name(jf)

        idle_for = strip_microseconds(time_idle)
        left_in_hour = strip_microseconds(time_to_end_of_hour)
        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool

        log.debug(
            'Job flow %-15s idle for %s, %s to end of hour, %s (%s)' % (
                jf.jobflowid, idle_for, left_in_hour, pool_desc, jf.name))

        # filter out job flows that don't meet our criteria
        excluded = (
            (max_hours_idle is not None and
             time_idle <= timedelta(hours=max_hours_idle)) or
            (mins_to_end_of_hour is not None and
             time_to_end_of_hour >= timedelta(
                 minutes=mins_to_end_of_hour)) or
            (pooled_only and pool is None) or
            (unpooled_only and pool is not None) or
            (pool_name is not None and pool != pool_name))
        if excluded:
            continue

        to_terminate.append(
            (jf.jobflowid, jf.name, time_idle, time_to_end_of_hour))

    log.info(
        'Job flow statuses: %d running, %d idle, %d active non-streaming,'
        ' %d done' % (
            tally['running'], tally['idle'], tally['non_streaming'],
            tally['done']))

    terminate_and_notify(emr_conn, to_terminate, dry_run=dry_run)
def test_no_region(self):
    """With no region configured, the global EMR and S3 endpoints are used
    and the runner's region is the empty string."""
    runner = EMRJobRunner(conf_path=False)

    emr_endpoint = runner.make_emr_conn().endpoint
    assert_equal(emr_endpoint, 'elasticmapreduce.amazonaws.com')

    s3_endpoint = runner.make_s3_conn().endpoint
    assert_equal(s3_endpoint, 's3.amazonaws.com')

    assert_equal(runner._aws_region, '')
def print_report(options):
    """Print a usage report covering the job flows returned by
    ``describe_all_job_flows()``.

    The report has overall totals, per-day totals, rankings by job and by
    user, and a detail line per job flow.

    :param options: parsed options; ``conf_path`` and ``max_days_ago`` are
                    read from it.
    """
    emr_conn = EMRJobRunner(conf_path=options.conf_path).make_emr_conn()

    log.info('getting job flow history...')
    # microseconds just make our report messy
    now = datetime.datetime.utcnow().replace(microsecond=0)

    # if --max-days-ago is set, only look at recent jobs
    if options.max_days_ago is None:
        created_after = None
    else:
        created_after = now - datetime.timedelta(days=options.max_days_ago)

    job_flow_infos = []
    for jf in describe_all_job_flows(emr_conn, created_after=created_after):
        jf_id = jf.jobflowid
        jf_name = jf.name
        created = to_datetime(jf.creationdatetime)

        started = to_datetime(getattr(jf, 'startdatetime', None))
        if started:
            # job flows with no end time are counted as running until now
            ended = to_datetime(getattr(jf, 'enddatetime', None)) or now
            ran = ended - started
        else:
            ran = datetime.timedelta(0)

        state = jf.state
        num_steps = len(jf.steps or [])

        # this looks to be an integer, but let's protect against
        # future changes
        hours = float(jf.normalizedinstancehours)

        # estimate hours billed but not used
        hours_bbnu = hours * estimate_proportion_billed_but_not_used(jf)

        # split out mr job name and user
        # jobs flows created by MRJob have names like:
        # mr_word_freq_count.dave.20101103.121249.638552
        match = JOB_NAME_RE.match(jf.name)
        if match:
            mr_job_name, user = match.group(1), match.group(2)
        else:
            # not run by mrjob
            mr_job_name, user = None, None

        job_flow_infos.append({
            'id': jf_id,
            'name': jf_name,
            'created': created,
            'ran': ran,
            'state': state,
            'num_steps': num_steps,
            'hours': hours,
            'hours_bbnu': hours_bbnu,
            'mr_job_name': mr_job_name,
            'user': user,
        })

    if not job_flow_infos:
        print('No job flows created in the past two months!')
        return

    def blank_line():
        print('')

    def sum_by(group_key, value_key):
        # total up *value_key*, grouped by *group_key*
        totals = defaultdict(float)
        for info in job_flow_infos:
            totals[info[group_key]] += info[value_key]
        return totals

    def ranked(totals):
        # biggest total first, ties broken by name
        return sorted(totals.iteritems(),
                      key=lambda pair: (-pair[1], pair[0]))

    def fmt(mr_job_name_or_user):
        return mr_job_name_or_user or '(not started by mrjob)'

    earliest = min(info['created'] for info in job_flow_infos)
    latest = max(info['created'] for info in job_flow_infos)

    print('Total # of Job Flows: %d' % len(job_flow_infos))
    blank_line()

    print('* All times are in UTC.')
    blank_line()

    print('Min create time: %s' % earliest)
    print('Max create time: %s' % latest)
    print(' Current time: %s' % now)
    blank_line()

    print('* All usage is measured in Normalized Instance Hours, which are')
    print(' roughly equivalent to running an m1.small instance for an hour.')
    blank_line()

    # total compute-unit hours used
    total_hours = sum(info['hours'] for info in job_flow_infos)
    print('Total Usage: %d' % total_hours)
    blank_line()

    print('* Time billed but not used is estimated, and may not match')
    print(" Amazon's billing system exactly.")
    blank_line()

    total_hours_bbnu = sum(info['hours_bbnu'] for info in job_flow_infos)
    print('Total time billed but not used (waste): %.2f' % total_hours_bbnu)
    blank_line()

    # per-day totals, keyed by creation date
    date_to_hours = defaultdict(float)
    date_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        day = info['created'].date()
        date_to_hours[day] += info['hours']
        date_to_hours_bbnu[day] += info['hours_bbnu']

    print('Daily statistics:')
    blank_line()
    print(' date usage waste')
    # newest day first; days with no job flows show up as zeroes
    first_day = earliest.date()
    d = latest.date()
    while d >= first_day:
        print(' %10s %6d %9.2f' % (
            d, date_to_hours[d], date_to_hours_bbnu[d]))
        d -= datetime.timedelta(days=1)
    blank_line()

    print('* Job flows are considered to belong to the user and job that')
    print(' started them (even if other jobs use the job flow).')
    blank_line()

    # Top jobs
    print('Top jobs, by total usage:')
    for mr_job_name, hours in ranked(sum_by('mr_job_name', 'hours')):
        print(' %6d %s' % (hours, fmt(mr_job_name)))
    blank_line()

    print('Top jobs, by time billed but not used:')
    for mr_job_name, hours_bbnu in ranked(
            sum_by('mr_job_name', 'hours_bbnu')):
        print(' %9.2f %s' % (hours_bbnu, fmt(mr_job_name)))
    blank_line()

    # Top users
    print('Top users, by total usage:')
    for user, hours in ranked(sum_by('user', 'hours')):
        print(' %6d %s' % (hours, fmt(user)))
    blank_line()

    print('Top users, by time billed but not used:')
    for user, hours_bbnu in ranked(sum_by('user', 'hours_bbnu')):
        print(' %9.2f %s' % (hours_bbnu, fmt(user)))
    blank_line()

    # Top job flows
    print('All job flows, by total usage:')
    by_usage = sorted(job_flow_infos,
                      key=lambda i: (-i['hours'], i['name']))
    for info in by_usage:
        print(' %6d %-15s %s' % (info['hours'], info['id'], info['name']))
    blank_line()

    print('All job flows, by time billed but not used:')
    by_waste = sorted(job_flow_infos,
                      key=lambda i: (-i['hours_bbnu'], i['name']))
    for info in by_waste:
        print(' %9.2f %-15s %s' % (
            info['hours_bbnu'], info['id'], info['name']))
    blank_line()

    print('Details for all job flows:')
    blank_line()
    print(' id state created steps time ran usage waste user name')

    # most recently created first
    newest_first = sorted(job_flow_infos, key=lambda i: i['created'],
                          reverse=True)
    for info in newest_first:
        print(' %-15s %-13s %19s %3d %17s %6d %9.2f %8s %s' % (
            info['id'], info['state'], info['created'], info['num_steps'],
            info['ran'], info['hours'], info['hours_bbnu'],
            (info['user'] or ''), fmt(info['mr_job_name'])))
def test_us_west_1(self):
    """With region us-west-1, both EMR and S3 use regional endpoints."""
    runner = EMRJobRunner(conf_path=False, aws_region='us-west-1')

    emr_endpoint = runner.make_emr_conn().endpoint
    assert_equal(emr_endpoint, 'us-west-1.elasticmapreduce.amazonaws.com')

    s3_endpoint = runner.make_s3_conn().endpoint
    assert_equal(s3_endpoint, 's3-us-west-1.amazonaws.com')
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Go through every cluster and terminate the idle ones that pass every
    filter, then log how many clusters were in each state.

    :param dry_run: passed through to ``_terminate_and_notify()``
    :param max_hours_idle: if set, skip clusters idle for this many hours or
                           less. When both this and *mins_to_end_of_hour*
                           are ``None``, it defaults to
                           ``_DEFAULT_MAX_HOURS_IDLE``.
    :param mins_to_end_of_hour: if set, skip clusters with pending steps and
                                clusters whose estimated time to the end of
                                the hour is this many minutes or more
    :param now: current time; defaults to ``datetime.utcnow()``
    :param pool_name: if set, skip clusters whose pool name differs
    :param pooled_only: skip clusters that are not in a pool
    :param unpooled_only: skip clusters that are in a pool
    :param max_mins_locked: passed through to ``_terminate_and_notify()``
    :param quiet: passed through to ``_terminate_and_notify()``
    :param kwargs: keyword arguments for the ``EMRJobRunner`` constructor
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    # number of clusters seen in each state
    tally = dict.fromkeys(
        ('starting', 'bootstrapping', 'running', 'pending', 'idle',
         'non_streaming', 'done'),
        0)

    # states that rule a cluster out before we look at its steps,
    # checked in this order
    early_checks = (
        ('done', _is_cluster_done),
        ('starting', _is_cluster_starting),
        ('bootstrapping', _is_cluster_bootstrapping),
    )

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for summary in _yield_all_clusters(emr_conn):
        cluster_id = summary.id

        early_state = None
        for state, check in early_checks:
            if check(summary):
                early_state = state
                break

        if early_state is not None:
            tally[early_state] += 1
            continue

        # need steps to learn more about cluster
        steps = _list_all_steps(emr_conn, cluster_id)

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        if _is_cluster_non_streaming(steps):
            tally['non_streaming'] += 1
            continue

        if any(_is_step_running(step) for step in steps):
            tally['running'] += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(summary, steps)
        time_to_end_of_hour = _est_time_to_hour(summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        # the pool name is derived from the cluster's bootstrap actions
        bootstrap_actions = list(
            _yield_all_bootstrap_actions(emr_conn, cluster_id))
        _unused_hash, pool = _pool_hash_and_name(bootstrap_actions)

        status = 'pending' if is_pending else 'idle'
        tally[status] += 1

        idle_for = strip_microseconds(time_idle)
        left_in_hour = strip_microseconds(time_to_end_of_hour)
        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool

        log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' % (
            cluster_id, status, idle_for, left_in_hour, pool_desc,
            summary.name))

        # filter out clusters that don't meet our criteria
        not_idle_long_enough = (
            max_hours_idle is not None and
            time_idle <= timedelta(hours=max_hours_idle))
        if not_idle_long_enough:
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if mins_to_end_of_hour is not None:
            if is_pending:
                continue
            if time_to_end_of_hour >= timedelta(minutes=mins_to_end_of_hour):
                continue

        excluded_by_pool = (
            (pooled_only and pool is None) or
            (unpooled_only and pool is not None) or
            (pool_name is not None and pool != pool_name))
        if excluded_by_pool:
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=summary.name,
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            time_to_end_of_hour=time_to_end_of_hour,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d active non-streaming, %d done' % (
            tally['starting'], tally['bootstrapping'], tally['running'],
            tally['pending'], tally['idle'], tally['non_streaming'],
            tally['done']))
def test_cleanup(self):
    """``_s3_cleanup()`` deletes only keys under the given prefix that are
    older than the cutoff, and deletes nothing on a dry run."""
    runner = EMRJobRunner(conf_paths=[], cloud_fs_sync_secs=0.01)

    foo = 's3://walrus/data/foo'
    bar = 's3://walrus/data/bar'
    qux = 's3://walrus/data/qux'
    baz = 's3://walrus/other/baz'

    def assert_bucket_holds(expected_uris):
        # compare against a sorted listing of the whole bucket
        self.assertEqual(sorted(runner.fs.ls('s3://walrus/')), expected_uris)

    # add some mock data

    # foo is current
    self.add_mock_s3_data({'walrus': {'data/foo': b'foo\n'}})

    # bar and baz are very old (but baz isn't in data/)
    very_old = {'walrus': {'data/bar': b'bar\n', 'other/baz': b'baz\n'}}
    self.add_mock_s3_data(very_old, age=timedelta(days=45))

    # qux is a little more than two days old
    two_days_old = {'walrus': {'data/qux': b'qux\n'}}
    self.add_mock_s3_data(two_days_old, age=timedelta(hours=50))

    assert_bucket_holds([bar, foo, qux, baz])

    # try a dry run, which shouldn't delete anything
    _s3_cleanup('s3://walrus/data/', timedelta(days=30), dry_run=True,
                conf_paths=[])
    assert_bucket_holds([bar, foo, qux, baz])

    # now do it for real. should hit bar (baz isn't in data/)
    _s3_cleanup('s3://walrus/data', timedelta(days=30), conf_paths=[])
    assert_bucket_holds([foo, qux, baz])

    # now try to delete qux too
    _s3_cleanup('s3://walrus/data', timedelta(hours=48), conf_paths=[])
    assert_bucket_holds([foo, baz])