def collect_active_job_flows(conf_paths):
    """Fetch the job flows that are currently in an active state on EMR.

    "Active" here means one of the states ``STARTING``, ``BOOTSTRAPPING``,
    ``WAITING`` or ``RUNNING``.

    :param conf_paths: passed unchanged to :py:class:`EMRJobRunner` as its
                       ``conf_paths`` argument (selects which
                       :py:mod:`mrjob.conf` files are read)

    Returns whatever ``describe_all_job_flows()`` returns for those states.
    """
    runner = EMRJobRunner(conf_paths=conf_paths)
    connection = runner.make_emr_conn()

    return describe_all_job_flows(
        connection,
        states=['STARTING', 'BOOTSTRAPPING', 'WAITING', 'RUNNING'])
def inspect_and_maybe_terminate_job_flows(
        conf_path, max_hours_idle, now, dry_run):
    """Tally job flows by status and hand long-idle ones off for termination.

    :param conf_path: passed to :py:class:`EMRJobRunner` as ``conf_path``
    :param max_hours_idle: idle job flows are selected only when they have
                           been idle for strictly more than this many hours
    :param now: passed to ``time_job_flow_idle()`` as the reference time
    :param dry_run: passed through to ``terminate_and_notify()``
    """
    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')

    done_count = 0
    non_streaming_count = 0
    running_count = 0
    idle_count = 0

    # (job flow id, name, idle time as a timedelta) for each flow to kill
    to_terminate = []

    for jf in describe_all_job_flows(emr_conn):
        if is_job_flow_done(jf):
            done_count += 1
            continue

        # we can't really tell if non-streaming jobs are idle or not, so
        # they are only counted, never terminated (see Issue #60)
        if is_job_flow_non_streaming(jf):
            non_streaming_count += 1
            continue

        if is_job_flow_running(jf):
            running_count += 1
            continue

        # anything left over is treated as idle
        idle_count += 1
        raw_idle = time_job_flow_idle(jf, now=now)

        # rebuild from days and seconds only, dropping fractions of a second
        idle_for = timedelta(raw_idle.days, raw_idle.seconds)

        log.debug('Job flow %s (%s) idle for %s' %
                  (jf.jobflowid, jf.name, idle_for))

        if idle_for > timedelta(hours=max_hours_idle):
            to_terminate.append((jf.jobflowid, jf.name, idle_for))

    status_counts = (
        running_count, idle_count, non_streaming_count, done_count)
    log.info(
        'Job flow statuses: %d running, %d idle, %d active non-streaming,'
        ' %d done' % status_counts)

    terminate_and_notify(emr_conn, to_terminate, dry_run=dry_run)
def main(args):
    """Command-line entry point: report on long-running jobs.

    Parses *args* (no positional arguments are accepted), configures
    logging, fetches job flows in the ``RUNNING`` state and prints a report
    on jobs that have run for at least ``--min-hours``.

    :param args: command-line arguments to hand to the option parser
    """
    parser = make_option_parser()
    opts, leftover = parser.parse_args(args)

    # positional arguments are not supported
    if leftover:
        parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=opts.quiet, verbose=opts.verbose)

    log.info('getting information about running jobs')
    runner = EMRJobRunner(conf_path=opts.conf_path)
    running_job_flows = describe_all_job_flows(
        runner.make_emr_conn(), states=['RUNNING'])

    threshold = timedelta(hours=opts.min_hours)

    print_report(find_long_running_jobs(running_job_flows, threshold))
def get_job_flows(conf_path, max_days_ago=None, now=None):
    """Fetch job flow information from EMR, optionally limited by age.

    :param str conf_path: Alternate path to read :py:mod:`mrjob.conf` from,
                          or ``False`` to ignore all config files.
    :param float max_days_ago: If set, don't fetch job flows created longer
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    reference_time = datetime.utcnow() if now is None else now

    connection = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    # a cutoff is only passed on when max_days_ago was supplied
    if max_days_ago is None:
        cutoff = None
    else:
        cutoff = reference_time - timedelta(days=max_days_ago)

    return describe_all_job_flows(connection, created_after=cutoff)
def get_job_flows(conf_paths, max_days_ago=None, now=None):
    """Fetch job flow information from EMR, optionally limited by age.

    :param conf_paths: passed unchanged to :py:class:`EMRJobRunner` as its
                       ``conf_paths`` argument (selects which
                       :py:mod:`mrjob.conf` files are read).
    :param float max_days_ago: If set, don't fetch job flows created longer
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    reference_time = datetime.utcnow() if now is None else now

    connection = EMRJobRunner(conf_paths=conf_paths).make_emr_conn()

    # a cutoff is only passed on when max_days_ago was supplied
    if max_days_ago is None:
        cutoff = None
    else:
        cutoff = reference_time - timedelta(days=max_days_ago)

    return describe_all_job_flows(connection, created_after=cutoff)
def main(args, now=None):
    """Command-line entry point: report on long-running jobs.

    Parses *args* (no positional arguments are accepted), configures
    logging, fetches job flows in the ``BOOTSTRAPPING`` or ``RUNNING``
    state and prints a report on jobs that have run for at least
    ``--min-hours``.

    :param args: command-line arguments to hand to the option parser
    :param now: reference time passed to ``find_long_running_jobs()``;
                defaults to ``datetime.utcnow()``
    """
    reference_time = datetime.utcnow() if now is None else now

    parser = make_option_parser()
    opts, leftover = parser.parse_args(args)

    # positional arguments are not supported
    if leftover:
        parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=opts.quiet, verbose=opts.verbose)

    log.info('getting information about running jobs')
    runner = EMRJobRunner(conf_paths=opts.conf_paths)
    active_job_flows = describe_all_job_flows(
        runner.make_emr_conn(), states=['BOOTSTRAPPING', 'RUNNING'])

    threshold = timedelta(hours=opts.min_hours)

    print_report(
        find_long_running_jobs(
            active_job_flows, threshold, now=reference_time))
def test_can_get_all_job_flows(self):
    """describe_all_job_flows() returns every job flow, not just one page.

    Registers more mock job flows than DEFAULT_MAX_JOB_FLOWS_RETURNED,
    checks that a plain describe_jobflows() call is capped at that limit,
    and that describe_all_job_flows() returns all of them.
    """
    NUM_JOB_FLOWS = 2222
    now = datetime.datetime.utcnow()

    # the test is only meaningful if we exceed the per-call limit
    assert_gt(NUM_JOB_FLOWS, DEFAULT_MAX_JOB_FLOWS_RETURNED)

    expected_ids = ['j-%04d' % n for n in range(NUM_JOB_FLOWS)]

    # give each mock job flow a distinct creation time, one minute apart
    for minutes_ago, job_flow_id in enumerate(expected_ids):
        created = now - datetime.timedelta(minutes=minutes_ago)
        self.mock_emr_job_flows[job_flow_id] = MockEmrObject(
            creationdatetime=to_iso8601(created),
            jobflowid=job_flow_id)

    emr_conn = EMRJobRunner().make_emr_conn()

    # ordinary describe_jobflows() hits the limit on number of job flows
    capped = emr_conn.describe_jobflows()
    assert_equal(len(capped), DEFAULT_MAX_JOB_FLOWS_RETURNED)

    everything = describe_all_job_flows(emr_conn)
    assert_equal(len(everything), NUM_JOB_FLOWS)

    returned_ids = sorted(jf.jobflowid for jf in everything)
    assert_equal(returned_ids, expected_ids)
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
    max_mins_locked=None,
    quiet=False,
    **kwargs
):
    """Classify all job flows and pass the idle ones that match the given
    criteria to ``terminate_and_notify()``.

    :param conf_path: passed to :py:class:`EMRJobRunner` as ``conf_path``
    :param dry_run: passed through to ``terminate_and_notify()``
    :param max_hours_idle: if set, skip job flows that have been idle for
                           this many hours or less. Defaults to
                           ``DEFAULT_MAX_HOURS_IDLE`` when neither this nor
                           *mins_to_end_of_hour* is given.
    :param mins_to_end_of_hour: if set, skip job flows that have pending
                                steps, or whose estimated time to the end of
                                the hour is this many minutes or more
    :param now: reference time; defaults to ``datetime.utcnow()``
    :param pool_name: if set, skip job flows whose pool name differs
    :param pooled_only: if true, skip job flows that are not in a pool
    :param unpooled_only: if true, skip job flows that are in a pool
    :param max_mins_locked: passed through to ``terminate_and_notify()``
    :param quiet: passed through to ``terminate_and_notify()``
    :param kwargs: extra keyword arguments for :py:class:`EMRJobRunner`
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_path=conf_path, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, has pending steps, idle time,
    # estimated time to end of hour); both times are timedeltas
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 'pending' if pending else 'idle',
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                    (pending or
                     time_to_end_of_hour >= timedelta(
                         minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    # the argument order must match the labels in the message; previously
    # the running and bootstrapping counts were passed in swapped positions
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' % (
            num_bootstrapping, num_running, num_pending, num_idle,
            num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
def print_report(options):
    """Print a usage report about EMR job flows to standard output.

    Connects to EMR using ``options.conf_path``, fetches job flows with
    ``describe_all_job_flows()`` (limited to those created in the last
    ``options.max_days_ago`` days when that option is not ``None``), and
    prints overall totals, per-day statistics, rankings by job name and by
    user, rankings of individual job flows, and one detail line per job
    flow.

    :param options: object providing ``conf_path`` and ``max_days_ago``
    """
    emr_conn = EMRJobRunner(conf_path=options.conf_path).make_emr_conn()

    log.info('getting job flow history...')
    # microseconds just make our report messy
    now = datetime.datetime.utcnow().replace(microsecond=0)

    # if --max-days-ago is set, only look at recent jobs
    if options.max_days_ago is None:
        created_after = None
    else:
        created_after = now - datetime.timedelta(days=options.max_days_ago)

    infos = []
    for jf in describe_all_job_flows(emr_conn, created_after=created_after):
        jf_id = jf.jobflowid
        jf_name = jf.name
        created = to_datetime(jf.creationdatetime)

        # a job flow that never started ran for zero time; one that has not
        # ended yet is measured up to now
        started = to_datetime(getattr(jf, 'startdatetime', None))
        if started:
            ended = to_datetime(getattr(jf, 'enddatetime', None)) or now
            ran = ended - started
        else:
            ran = datetime.timedelta(0)

        state = jf.state
        num_steps = len(jf.steps or [])

        # this looks to be an integer, but let's protect against
        # future changes
        hours = float(jf.normalizedinstancehours)

        # estimate hours billed but not used
        hours_bbnu = hours * estimate_proportion_billed_but_not_used(jf)

        # split out mr job name and user
        # jobs flows created by MRJob have names like:
        # mr_word_freq_count.dave.20101103.121249.638552
        match = JOB_NAME_RE.match(jf.name)
        if match:
            mr_job_name, user = match.group(1), match.group(2)
        else:
            # not run by mrjob
            mr_job_name, user = None, None

        infos.append({
            'id': jf_id,
            'name': jf_name,
            'created': created,
            'ran': ran,
            'state': state,
            'num_steps': num_steps,
            'hours': hours,
            'hours_bbnu': hours_bbnu,
            'mr_job_name': mr_job_name,
            'user': user,
        })

    if not infos:
        print('No job flows created in the past two months!')
        return

    def fmt(mr_job_name_or_user):
        # label for job flows whose name didn't match the mrjob pattern
        return mr_job_name_or_user or '(not started by mrjob)'

    def print_totals_by(group_key, value_key, row_format):
        # sum value_key per distinct group_key, then print one row per
        # group, largest total first (ties broken by group label)
        totals = defaultdict(float)
        for info in infos:
            totals[info[group_key]] += info[value_key]
        ranked = sorted(totals.items(),
                        key=lambda item: (-item[1], item[0]))
        for label, total in ranked:
            print(row_format % (total, fmt(label)))

    earliest = min(info['created'] for info in infos)
    latest = max(info['created'] for info in infos)

    print('Total # of Job Flows: %d' % len(infos))
    print('')

    print('* All times are in UTC.')
    print('')

    print('Min create time: %s' % earliest)
    print('Max create time: %s' % latest)
    print(' Current time: %s' % now)
    print('')

    print('* All usage is measured in Normalized Instance Hours, which are')
    print(' roughly equivalent to running an m1.small instance for an hour.')
    print('')

    # total compute-unit hours used
    total_hours = sum(info['hours'] for info in infos)
    print('Total Usage: %d' % total_hours)
    print('')

    print('* Time billed but not used is estimated, and may not match')
    print(" Amazon's billing system exactly.")
    print('')

    total_hours_bbnu = sum(info['hours_bbnu'] for info in infos)
    print('Total time billed but not used (waste): %.2f' % total_hours_bbnu)
    print('')

    # per-day totals, keyed by the date each job flow was created
    date_to_hours = defaultdict(float)
    date_to_hours_bbnu = defaultdict(float)
    for info in infos:
        day = info['created'].date()
        date_to_hours[day] += info['hours']
        date_to_hours_bbnu[day] += info['hours_bbnu']

    print('Daily statistics:')
    print('')
    print(' date usage waste')
    # walk backwards one day at a time, newest first; days with no job
    # flows show up as zeros
    day = latest.date()
    while day >= earliest.date():
        print(' %10s %6d %9.2f' % (
            day, date_to_hours[day], date_to_hours_bbnu[day]))
        day -= datetime.timedelta(days=1)
    print('')

    print('* Job flows are considered to belong to the user and job that')
    print(' started them (even if other jobs use the job flow).')
    print('')

    # Top jobs
    print('Top jobs, by total usage:')
    print_totals_by('mr_job_name', 'hours', ' %6d %s')
    print('')

    print('Top jobs, by time billed but not used:')
    print_totals_by('mr_job_name', 'hours_bbnu', ' %9.2f %s')
    print('')

    # Top users
    print('Top users, by total usage:')
    print_totals_by('user', 'hours', ' %6d %s')
    print('')

    print('Top users, by time billed but not used:')
    print_totals_by('user', 'hours_bbnu', ' %9.2f %s')
    print('')

    # Top job flows
    print('All job flows, by total usage:')
    by_hours = sorted(infos, key=lambda i: (-i['hours'], i['name']))
    for info in by_hours:
        print(' %6d %-15s %s' % (info['hours'], info['id'], info['name']))
    print('')

    print('All job flows, by time billed but not used:')
    by_hours_bbnu = sorted(
        infos, key=lambda i: (-i['hours_bbnu'], i['name']))
    for info in by_hours_bbnu:
        print(' %9.2f %-15s %s' % (
            info['hours_bbnu'], info['id'], info['name']))
    print('')

    print('Details for all job flows:')
    print('')
    print(' id state created steps time ran usage waste user name')

    # newest job flows first
    newest_first = sorted(infos, key=lambda i: i['created'], reverse=True)
    for info in newest_first:
        print(' %-15s %-13s %19s %3d %17s %6d %9.2f %8s %s' % (
            info['id'], info['state'], info['created'], info['num_steps'],
            info['ran'], info['hours'], info['hours_bbnu'],
            (info['user'] or ''), fmt(info['mr_job_name'])))
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
):
    """Tally job flows by status and hand the idle ones that match the
    given criteria to ``terminate_and_notify()``.

    :param conf_path: passed to :py:class:`EMRJobRunner` as ``conf_path``
    :param dry_run: passed through to ``terminate_and_notify()``
    :param max_hours_idle: if set, skip job flows that have been idle for
                           this many hours or less. Defaults to
                           ``DEFAULT_MAX_HOURS_IDLE`` when neither this nor
                           *mins_to_end_of_hour* is given.
    :param mins_to_end_of_hour: if set, skip job flows whose time to the
                                end of the hour is this many minutes or more
    :param now: reference time; defaults to ``datetime.utcnow()``
    :param pool_name: if set, skip job flows whose pool name differs
    :param pooled_only: if true, skip job flows that are not in a pool
    :param unpooled_only: if true, skip job flows that are in a pool
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')

    done_count = 0
    non_streaming_count = 0
    running_count = 0
    idle_count = 0

    # (job flow id, name, idle time, time to end of hour) per flow to kill;
    # both times are timedeltas
    to_terminate = []

    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    for jf in describe_all_job_flows(emr_conn):
        if is_job_flow_done(jf):
            done_count += 1
            continue

        # we can't really tell if non-streaming jobs are idle or not, so
        # they are only counted, never terminated (see Issue #60)
        if is_job_flow_non_streaming(jf):
            non_streaming_count += 1
            continue

        if is_job_flow_running(jf):
            running_count += 1
            continue

        # anything left over is treated as idle
        idle_count += 1
        idle_for = time_job_flow_idle(jf, now=now)
        to_hour_end = time_to_end_of_hour_for_job_flow(jf, now=now)
        pool = job_flow_pool_name(jf)

        pool_label = 'unpooled' if pool is None else 'in %s pool' % pool
        log.debug(
            'Job flow %-15s idle for %s, %s to end of hour, %s (%s)' %
            (jf.jobflowid,
             strip_microseconds(idle_for),
             strip_microseconds(to_hour_end),
             pool_label,
             jf.name))

        # a job flow is left alone as soon as any one criterion rules it
        # out; `or` short-circuits, so the checks run in this order
        ruled_out = (
            (max_hours_idle is not None and
             idle_for <= timedelta(hours=max_hours_idle)) or
            (mins_to_end_of_hour is not None and
             to_hour_end >= timedelta(minutes=mins_to_end_of_hour)) or
            (pooled_only and pool is None) or
            (unpooled_only and pool is not None) or
            (pool_name is not None and pool != pool_name)
        )
        if not ruled_out:
            to_terminate.append(
                (jf.jobflowid, jf.name, idle_for, to_hour_end))

    status_counts = (
        running_count, idle_count, non_streaming_count, done_count)
    log.info(
        'Job flow statuses: %d running, %d idle, %d active non-streaming,'
        ' %d done' % status_counts)

    terminate_and_notify(emr_conn, to_terminate, dry_run=dry_run)
def print_report(options):
    """Print a usage report about EMR job flows to standard output.

    Connects to EMR using ``options.conf_path``, fetches job flows with
    ``describe_all_job_flows()`` (limited to those created in the last
    ``options.max_days_ago`` days when that option is not ``None``), and
    prints overall totals, per-day statistics, rankings by job name and by
    user, rankings of individual job flows, and one detail line per job
    flow.

    :param options: object providing ``conf_path`` and ``max_days_ago``
    """
    emr_conn = EMRJobRunner(conf_path=options.conf_path).make_emr_conn()

    log.info('getting job flow history...')
    # microseconds just make our report messy
    now = datetime.datetime.utcnow().replace(microsecond=0)

    # if --max-days-ago is set, only look at recent jobs
    if options.max_days_ago is None:
        created_after = None
    else:
        created_after = now - datetime.timedelta(days=options.max_days_ago)

    infos = []
    for jf in describe_all_job_flows(emr_conn, created_after=created_after):
        jf_id = jf.jobflowid
        jf_name = jf.name
        created = to_datetime(jf.creationdatetime)

        # a job flow that never started ran for zero time; one that has not
        # ended yet is measured up to now
        started = to_datetime(getattr(jf, 'startdatetime', None))
        if started:
            ended = to_datetime(getattr(jf, 'enddatetime', None)) or now
            ran = ended - started
        else:
            ran = datetime.timedelta(0)

        state = jf.state
        num_steps = len(jf.steps or [])

        # this looks to be an integer, but let's protect against
        # future changes
        hours = float(jf.normalizedinstancehours)

        # estimate hours billed but not used
        hours_bbnu = hours * estimate_proportion_billed_but_not_used(jf)

        # split out mr job name and user
        # jobs flows created by MRJob have names like:
        # mr_word_freq_count.dave.20101103.121249.638552
        match = JOB_NAME_RE.match(jf.name)
        if match:
            mr_job_name, user = match.group(1), match.group(2)
        else:
            # not run by mrjob
            mr_job_name, user = None, None

        infos.append({
            'id': jf_id,
            'name': jf_name,
            'created': created,
            'ran': ran,
            'state': state,
            'num_steps': num_steps,
            'hours': hours,
            'hours_bbnu': hours_bbnu,
            'mr_job_name': mr_job_name,
            'user': user,
        })

    if not infos:
        print('No job flows created in the past two months!')
        return

    def fmt(mr_job_name_or_user):
        # label for job flows whose name didn't match the mrjob pattern
        return mr_job_name_or_user or '(not started by mrjob)'

    def print_totals_by(group_key, value_key, row_format):
        # sum value_key per distinct group_key, then print one row per
        # group, largest total first (ties broken by group label)
        totals = defaultdict(float)
        for info in infos:
            totals[info[group_key]] += info[value_key]
        ranked = sorted(totals.items(),
                        key=lambda item: (-item[1], item[0]))
        for label, total in ranked:
            print(row_format % (total, fmt(label)))

    earliest = min(info['created'] for info in infos)
    latest = max(info['created'] for info in infos)

    print('Total # of Job Flows: %d' % len(infos))
    print('')

    print('* All times are in UTC.')
    print('')

    print('Min create time: %s' % earliest)
    print('Max create time: %s' % latest)
    print(' Current time: %s' % now)
    print('')

    print('* All usage is measured in Normalized Instance Hours, which are')
    print(' roughly equivalent to running an m1.small instance for an hour.')
    print('')

    # total compute-unit hours used
    total_hours = sum(info['hours'] for info in infos)
    print('Total Usage: %d' % total_hours)
    print('')

    print('* Time billed but not used is estimated, and may not match')
    print(" Amazon's billing system exactly.")
    print('')

    total_hours_bbnu = sum(info['hours_bbnu'] for info in infos)
    print('Total time billed but not used (waste): %.2f' % total_hours_bbnu)
    print('')

    # per-day totals, keyed by the date each job flow was created
    date_to_hours = defaultdict(float)
    date_to_hours_bbnu = defaultdict(float)
    for info in infos:
        day = info['created'].date()
        date_to_hours[day] += info['hours']
        date_to_hours_bbnu[day] += info['hours_bbnu']

    print('Daily statistics:')
    print('')
    print(' date usage waste')
    # walk backwards one day at a time, newest first; days with no job
    # flows show up as zeros
    day = latest.date()
    while day >= earliest.date():
        print(' %10s %6d %9.2f' % (
            day, date_to_hours[day], date_to_hours_bbnu[day]))
        day -= datetime.timedelta(days=1)
    print('')

    print('* Job flows are considered to belong to the user and job that')
    print(' started them (even if other jobs use the job flow).')
    print('')

    # Top jobs
    print('Top jobs, by total usage:')
    print_totals_by('mr_job_name', 'hours', ' %6d %s')
    print('')

    print('Top jobs, by time billed but not used:')
    print_totals_by('mr_job_name', 'hours_bbnu', ' %9.2f %s')
    print('')

    # Top users
    print('Top users, by total usage:')
    print_totals_by('user', 'hours', ' %6d %s')
    print('')

    print('Top users, by time billed but not used:')
    print_totals_by('user', 'hours_bbnu', ' %9.2f %s')
    print('')

    # Top job flows
    print('All job flows, by total usage:')
    by_hours = sorted(infos, key=lambda i: (-i['hours'], i['name']))
    for info in by_hours:
        print(' %6d %-15s %s' % (info['hours'], info['id'], info['name']))
    print('')

    print('All job flows, by time billed but not used:')
    by_hours_bbnu = sorted(
        infos, key=lambda i: (-i['hours_bbnu'], i['name']))
    for info in by_hours_bbnu:
        print(' %9.2f %-15s %s' % (
            info['hours_bbnu'], info['id'], info['name']))
    print('')

    print('Details for all job flows:')
    print('')
    print(' id state created steps time ran usage waste user name')

    # newest job flows first
    newest_first = sorted(infos, key=lambda i: i['created'], reverse=True)
    for info in newest_first:
        print(' %-15s %-13s %19s %3d %17s %6d %9.2f %8s %s' % (
            info['id'], info['state'], info['created'], info['num_steps'],
            info['ran'], info['hours'], info['hours_bbnu'],
            (info['user'] or ''), fmt(info['mr_job_name'])))
def inspect_and_maybe_terminate_job_flows(conf_paths=None,
                                          dry_run=False,
                                          max_hours_idle=None,
                                          mins_to_end_of_hour=None,
                                          now=None,
                                          pool_name=None,
                                          pooled_only=False,
                                          unpooled_only=False,
                                          max_mins_locked=None,
                                          quiet=False,
                                          **kwargs):
    """Classify all job flows and pass the idle ones that match the given
    criteria to ``terminate_and_notify()``.

    :param conf_paths: passed to :py:class:`EMRJobRunner` as ``conf_paths``
    :param dry_run: passed through to ``terminate_and_notify()``
    :param max_hours_idle: if set, skip job flows that have been idle for
                           this many hours or less. Defaults to
                           ``DEFAULT_MAX_HOURS_IDLE`` when neither this nor
                           *mins_to_end_of_hour* is given.
    :param mins_to_end_of_hour: if set, skip job flows that have pending
                                steps, or whose estimated time to the end of
                                the hour is this many minutes or more
    :param now: reference time; defaults to ``datetime.utcnow()``
    :param pool_name: if set, skip job flows whose pool name differs
    :param pooled_only: if true, skip job flows that are not in a pool
    :param unpooled_only: if true, skip job flows that are in a pool
    :param max_mins_locked: passed through to ``terminate_and_notify()``
    :param quiet: passed through to ``terminate_and_notify()``
    :param kwargs: extra keyword arguments for :py:class:`EMRJobRunner`
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_paths=conf_paths, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, has pending steps, idle time,
    # estimated time to end of hour); both times are timedeltas
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug('Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                      (jf.jobflowid,
                       'pending' if pending else 'idle',
                       strip_microseconds(time_idle),
                       strip_microseconds(time_to_end_of_hour),
                       ('unpooled' if pool is None else 'in %s pool' % pool),
                       jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                    (pending or
                     time_to_end_of_hour >= timedelta(
                         minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    # the argument order must match the labels in the message; previously
    # the running and bootstrapping counts were passed in swapped positions
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' %
        (num_bootstrapping, num_running, num_pending, num_idle,
         num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
):
    """Tally job flows by status and hand the idle ones that match the
    given criteria to ``terminate_and_notify()``.

    :param conf_path: passed to :py:class:`EMRJobRunner` as ``conf_path``
    :param dry_run: passed through to ``terminate_and_notify()``
    :param max_hours_idle: if set, skip job flows that have been idle for
                           this many hours or less. Defaults to
                           ``DEFAULT_MAX_HOURS_IDLE`` when neither this nor
                           *mins_to_end_of_hour* is given.
    :param mins_to_end_of_hour: if set, skip job flows whose time to the
                                end of the hour is this many minutes or more
    :param now: reference time; defaults to ``datetime.utcnow()``
    :param pool_name: if set, skip job flows whose pool name differs
    :param pooled_only: if true, skip job flows that are not in a pool
    :param unpooled_only: if true, skip job flows that are in a pool
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')

    done_count = 0
    non_streaming_count = 0
    running_count = 0
    idle_count = 0

    # (job flow id, name, idle time, time to end of hour) per flow to kill;
    # both times are timedeltas
    to_terminate = []

    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    for jf in describe_all_job_flows(emr_conn):
        if is_job_flow_done(jf):
            done_count += 1
            continue

        # we can't really tell if non-streaming jobs are idle or not, so
        # they are only counted, never terminated (see Issue #60)
        if is_job_flow_non_streaming(jf):
            non_streaming_count += 1
            continue

        if is_job_flow_running(jf):
            running_count += 1
            continue

        # anything left over is treated as idle
        idle_count += 1
        idle_for = time_job_flow_idle(jf, now=now)
        to_hour_end = time_to_end_of_hour_for_job_flow(jf, now=now)
        pool = job_flow_pool_name(jf)

        pool_label = 'unpooled' if pool is None else 'in %s pool' % pool
        log.debug(
            'Job flow %-15s idle for %s, %s to end of hour, %s (%s)' %
            (jf.jobflowid,
             strip_microseconds(idle_for),
             strip_microseconds(to_hour_end),
             pool_label,
             jf.name))

        # a job flow is left alone as soon as any one criterion rules it
        # out; `or` short-circuits, so the checks run in this order
        ruled_out = (
            (max_hours_idle is not None and
             idle_for <= timedelta(hours=max_hours_idle)) or
            (mins_to_end_of_hour is not None and
             to_hour_end >= timedelta(minutes=mins_to_end_of_hour)) or
            (pooled_only and pool is None) or
            (unpooled_only and pool is not None) or
            (pool_name is not None and pool != pool_name)
        )
        if not ruled_out:
            to_terminate.append(
                (jf.jobflowid, jf.name, idle_for, to_hour_end))

    status_counts = (
        running_count, idle_count, non_streaming_count, done_count)
    log.info(
        'Job flow statuses: %d running, %d idle, %d active non-streaming,'
        ' %d done' % status_counts)

    terminate_and_notify(emr_conn, to_terminate, dry_run=dry_run)