コード例 #1
0
def main():
    """Report EMR job flow statuses and collect long-idle flows to terminate.

    Parses command-line options (takes no positional arguments), fetches all
    job flows visible to the EMR connection, classifies each as done /
    running / idle, and hands any flow idle longer than
    ``--max-hours-idle`` to terminate_and_notify().
    """
    parser = make_option_parser()
    opts, positional = parser.parse_args()

    if positional:
        parser.error('takes no arguments')

    # set up logging unless the user asked for quiet mode
    if not opts.quiet:
        log_to_stream(name='mrjob', debug=opts.verbose)

    emr_conn = EMRJobRunner().make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 weeks)')
    job_flows = emr_conn.describe_jobflows()

    now = datetime.utcnow()

    num_running = num_idle = num_done = 0
    # tuples of (job flow id, name, idle time as a timedelta)
    to_terminate = []

    for flow in job_flows:
        if hasattr(flow, 'enddatetime'):
            # the job flow has finished
            num_done += 1
        elif flow.steps and not hasattr(flow.steps[-1], 'enddatetime'):
            # the last step hasn't ended yet, so it's still running
            num_running += 1
        else:
            # idle: measure from the last step's end, or from creation
            num_idle += 1
            if flow.steps:
                reference_time = flow.steps[-1].enddatetime
            else:
                reference_time = flow.creationdatetime
            idle_time = now - datetime.strptime(reference_time, ISO8601)

            # don't care about fractions of a second
            idle_time = timedelta(idle_time.days, idle_time.seconds)

            log.debug('Job flow %s (%s) idle for %s' %
                           (flow.jobflowid, flow.name, idle_time))
            if idle_time > timedelta(hours=opts.max_hours_idle):
                to_terminate.append(
                    (flow.jobflowid, flow.name, idle_time))

    log.info('Job flow statuses: %d running, %d idle, %d done' %
                  (num_running, num_idle, num_done))

    terminate_and_notify(emr_conn, to_terminate, opts)
コード例 #2
0
def find_waiting_flow(aws_access_key_id=None, aws_secret_access_key=None, s3_scratch_uri=None,
                      s3_log_uri=None, ec2_key_pair=None, ec2_key_pair_file=None):
    # If the options are specified then ignore the options in ~/.mrjob.conf
    if aws_access_key_id is not None and aws_secret_access_key is not None and \
       s3_scratch_uri is not None and s3_log_uri is not None and ec2_key_pair is not None and \
       ec2_key_pair_file is not None:

        emr_conn = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                                aws_secret_access_key=aws_secret_access_key,
                                s3_scratch_uri=s3_scratch_uri, s3_log_uri=s3_log_uri,
                                ec2_key_pair=ec2_key_pair,
                                ec2_key_pair_file=ec2_key_pair_file).make_emr_conn()
    # If options are not specified then use the options in ~/.mrjob.conf
    else:
        if not os.path.isfile("%s/.mrjob.conf" % expanduser("~")):
            sys.exit("%s/.mrjob.conf no found" % expanduser("~"))

        emr_conn = EMRJobRunner().make_emr_conn()

    job_flows = emr_conn.describe_jobflows()
    d = {'WAITING': 0, 'STARTING': 1, 'RUNNING': 2}
    waiting_flows = []

    for flow in job_flows:
        try:
            if flow.state in d.keys():
                job_id = flow.jobflowid
                ip_address = flow.masterpublicdnsname
                waiting_flows.append([d[flow.state], job_id, ip_address, flow.state])
                if ec2_key_pair_file != '':
                    print 'ssh -i %s hadoop@%s' % (ec2_key_pair_file, ip_address)
                    job_id = flow.jobflowid
        except Exception:
            continue

    waiting_flows = sorted(waiting_flows, key=itemgetter(0))
    # An index was added at the beginning for the sorting. Removing that index in this step
    waiting_flows = [i[1:] for i in waiting_flows]
    # Converting a list of lists to a list of dicts
    waiting_flows_dict = [{'flow_id': i[0], 'node': i[1], 'flow_state':i[2]} for i in waiting_flows]

    # Printing
    index = 0
    for flow_dict in waiting_flows_dict:
        print index, flow_dict['flow_id'], flow_dict['node'], flow_dict['flow_state']
        index += 1
    
    return waiting_flows_dict
コード例 #3
0
ファイル: emr_test.py プロジェクト: boursier/mrjob
    def test_can_get_all_job_flows(self):
        """describe_all_job_flows() should page past the per-call result cap."""
        now = datetime.datetime.utcnow()

        NUM_JOB_FLOWS = 2222
        assert_gt(NUM_JOB_FLOWS, DEFAULT_MAX_JOB_FLOWS_RETURNED)

        # register mock job flows, each created one minute earlier than the last
        for idx in range(NUM_JOB_FLOWS):
            flow_id = 'j-%04d' % idx
            created = to_iso8601(now - datetime.timedelta(minutes=idx))
            self.mock_emr_job_flows[flow_id] = MockEmrObject(
                creationdatetime=created,
                jobflowid=flow_id)

        conn = EMRJobRunner().make_emr_conn()

        # a plain describe_jobflows() call is capped at the default maximum
        capped = conn.describe_jobflows()
        assert_equal(len(capped), DEFAULT_MAX_JOB_FLOWS_RETURNED)

        # the paging helper should return every registered flow
        everything = describe_all_job_flows(conn)
        assert_equal(len(everything), NUM_JOB_FLOWS)
        expected_ids = [('j-%04d' % idx) for idx in range(NUM_JOB_FLOWS)]
        assert_equal(sorted(jf.jobflowid for jf in everything), expected_ids)
コード例 #4
0
    def test_can_get_all_job_flows(self):
        """Verify describe_all_job_flows() sees more flows than one API call returns."""
        now = datetime.datetime.utcnow()

        NUM_JOB_FLOWS = 2222
        assert_gt(NUM_JOB_FLOWS, DEFAULT_MAX_JOB_FLOWS_RETURNED)

        # seed the mock EMR backend with more flows than a single call returns
        for n in range(NUM_JOB_FLOWS):
            fid = 'j-%04d' % n
            self.mock_emr_job_flows[fid] = MockEmrObject(
                jobflowid=fid,
                creationdatetime=to_iso8601(
                    now - datetime.timedelta(minutes=n)))

        emr_conn = EMRJobRunner().make_emr_conn()

        # a single describe_jobflows() call stops at the server-side limit
        assert_equal(len(emr_conn.describe_jobflows()),
                     DEFAULT_MAX_JOB_FLOWS_RETURNED)

        all_flows = describe_all_job_flows(emr_conn)
        assert_equal(len(all_flows), NUM_JOB_FLOWS)
        assert_equal(sorted(flow.jobflowid for flow in all_flows),
                     ['j-%04d' % n for n in range(NUM_JOB_FLOWS)])