Esempio n. 1
0
    def test_too_few_args(self):
        # the 'master' bootstrap action carries a single arg (the pool-hash
        # string only); pool_hash_and_name() is expected to give (None, None)
        pool_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
        master_action = MockEmrObject(args=[pool_arg], name='master')
        jf = MockEmrObject(bootstrapactions=[master_action])

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)
Esempio n. 2
0
    def test_first_arg_doesnt_start_with_pool(self):
        # two args, but the first one does not begin with 'pool-'
        first_arg = MockEmrObject(value='cowsay')
        second_arg = MockEmrObject(value='mrjob')
        action = MockEmrObject(args=[first_arg, second_arg])
        jf = MockEmrObject(bootstrapactions=[action])

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)
 def test_currently_running(self):
     # expectations for the 'j-CURRENTLY_RUNNING' fixture: not done, running,
     # not non-streaming, zero idle time, 45 minutes to the hour, no pool
     flow = self.mock_emr_job_flows['j-CURRENTLY_RUNNING']

     self.assertEqual(is_job_flow_done(flow), False)
     self.assertEqual(is_job_flow_running(flow), True)
     self.assertEqual(is_job_flow_non_streaming(flow), False)

     idle_time = time_job_flow_idle(flow, self.now)
     self.assertEqual(idle_time, timedelta(0))

     to_hour = est_time_to_hour(flow, self.now)
     self.assertEqual(to_hour, timedelta(minutes=45))

     self.assertEqual(pool_hash_and_name(flow), (None, None))
Esempio n. 4
0
    def test_pooled_job_flow(self):
        # a single bootstrap action whose two args are 'pool-<hash>' and the
        # pool name; both pieces should come back from pool_hash_and_name()
        hash_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
        name_arg = MockEmrObject(value='reflecting')
        action = MockEmrObject(args=[hash_arg, name_arg])
        jf = MockEmrObject(bootstrapactions=[action])

        expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
        self.assertEqual(pool_hash_and_name(jf), expected)
    def test_idle_and_failed(self):
        # expectations for the 'j-IDLE_AND_FAILED' fixture: not done, not
        # running, not non-streaming, idle for 3 hours, 1 hour to the hour,
        # no pool
        flow = self.mock_emr_job_flows['j-IDLE_AND_FAILED']

        self.assertEqual(is_job_flow_done(flow), False)
        self.assertEqual(is_job_flow_running(flow), False)
        self.assertEqual(is_job_flow_non_streaming(flow), False)

        idle_time = time_job_flow_idle(flow, self.now)
        self.assertEqual(idle_time, timedelta(hours=3))

        to_hour = est_time_to_hour(flow, self.now)
        self.assertEqual(to_hour, timedelta(hours=1))

        self.assertEqual(pool_hash_and_name(flow), (None, None))
    def test_hadoop_debugging_job_flow(self):
        # expectations for the 'j-HADOOP_DEBUGGING' fixture: not done, not
        # running, not non-streaming, idle for 2 hours, 1 hour to the hour,
        # no pool
        flow = self.mock_emr_job_flows['j-HADOOP_DEBUGGING']

        self.assertEqual(is_job_flow_done(flow), False)
        self.assertEqual(is_job_flow_running(flow), False)
        self.assertEqual(is_job_flow_non_streaming(flow), False)

        idle_time = time_job_flow_idle(flow, self.now)
        self.assertEqual(idle_time, timedelta(hours=2))

        to_hour = est_time_to_hour(flow, self.now)
        self.assertEqual(to_hour, timedelta(hours=1))

        self.assertEqual(pool_hash_and_name(flow), (None, None))
Esempio n. 7
0
    def test_too_many_args(self):
        # a bootstrap action with three args; expected result is (None, None)
        action_args = [
            MockEmrObject(value='cowsay'),
            MockEmrObject(value='-b'),
            MockEmrObject(value='mrjob'),
        ]
        action = MockEmrObject(args=action_args)
        jf = MockEmrObject(bootstrapactions=[action])

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)
Esempio n. 8
0
    def test_first_arg_doesnt_start_with_pool(self):
        # the action is named 'master' and has two args, but the first arg
        # does not begin with 'pool-'
        first_arg = MockEmrObject(value='cowsay')
        second_arg = MockEmrObject(value='mrjob')
        master_action = MockEmrObject(
            args=[first_arg, second_arg], name='master')
        jf = MockEmrObject(bootstrapactions=[master_action])

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)
Esempio n. 9
0
    def test_too_few_args(self):
        # the 'master' action has only the pool-hash arg and no pool name;
        # expected result is (None, None)
        pool_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
        master_action = MockEmrObject(args=[pool_arg], name='master')
        jf = MockEmrObject(bootstrapactions=[master_action])

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)
Esempio n. 10
0
    def test_bootstrap_action_isnt_named_master(self):
        # args look like pool args, but the action is named 'apprentice'
        # rather than 'master'; expected result is (None, None)
        hash_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
        name_arg = MockEmrObject(value='reflecting')
        action = MockEmrObject(args=[hash_arg, name_arg], name='apprentice')
        jf = MockEmrObject(bootstrapactions=[action])

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)
    def test_hive_job_flow(self):
        # expectations for the 'j-HIVE' fixture: not done, not running,
        # non-streaming, idle for 4 hours, 1 hour to the hour, no pool
        flow = self.mock_emr_job_flows['j-HIVE']

        self.assertEqual(is_job_flow_done(flow), False)
        self.assertEqual(is_job_flow_running(flow), False)
        self.assertEqual(is_job_flow_non_streaming(flow), True)

        idle_time = time_job_flow_idle(flow, self.now)
        self.assertEqual(idle_time, timedelta(hours=4))

        to_hour = est_time_to_hour(flow, self.now)
        self.assertEqual(to_hour, timedelta(hours=1))

        self.assertEqual(pool_hash_and_name(flow), (None, None))
Esempio n. 12
0
    def test_too_many_args(self):
        # a 'master' action with three args; expected result is (None, None)
        action_args = [
            MockEmrObject(value='cowsay'),
            MockEmrObject(value='-b'),
            MockEmrObject(value='mrjob'),
        ]
        master_action = MockEmrObject(args=action_args, name='master')
        jf = MockEmrObject(bootstrapactions=[master_action])

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)
Esempio n. 13
0
    def test_bootstrap_action_isnt_named_master(self):
        # args look like pool args, but the action is named 'apprentice'
        # rather than 'master'; expected result is (None, None)
        hash_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
        name_arg = MockEmrObject(value='reflecting')
        action = MockEmrObject(args=[hash_arg, name_arg], name='apprentice')
        jf = MockEmrObject(bootstrapactions=[action])

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)
    def test_pooled(self):
        # expectations for the 'j-POOLED' fixture: not done, not running,
        # not non-streaming, idle for 55 minutes, 5 minutes to the hour,
        # and a pool hash/name pair
        flow = self.mock_emr_job_flows['j-POOLED']

        self.assertEqual(is_job_flow_done(flow), False)
        self.assertEqual(is_job_flow_running(flow), False)
        self.assertEqual(is_job_flow_non_streaming(flow), False)

        idle_time = time_job_flow_idle(flow, self.now)
        self.assertEqual(idle_time, timedelta(minutes=55))

        to_hour = est_time_to_hour(flow, self.now)
        self.assertEqual(to_hour, timedelta(minutes=5))

        expected_pool = ('0123456789abcdef0123456789abcdef', 'reflecting')
        self.assertEqual(pool_hash_and_name(flow), expected_pool)
Esempio n. 15
0
    def test_pooled_job_flow(self):
        # a 'master' action whose two args are 'pool-<hash>' and the pool
        # name; both pieces should come back from pool_hash_and_name()
        hash_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
        name_arg = MockEmrObject(value='reflecting')
        master_action = MockEmrObject(args=[hash_arg, name_arg], name='master')
        jf = MockEmrObject(bootstrapactions=[master_action])

        expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
        self.assertEqual(pool_hash_and_name(jf), expected)
Esempio n. 16
0
    def test_pooled_job_flow_with_other_bootstrap_actions(self):
        # two argument-less actions come before the 'master' action; the
        # pool hash and name should still be found
        action_0 = MockEmrObject(args=[], name='action 0')
        action_1 = MockEmrObject(args=[], name='action 1')

        hash_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
        name_arg = MockEmrObject(value='reflecting')
        master_action = MockEmrObject(args=[hash_arg, name_arg], name='master')

        jf = MockEmrObject(
            bootstrapactions=[action_0, action_1, master_action])

        expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
        self.assertEqual(pool_hash_and_name(jf), expected)
Esempio n. 17
0
    def test_pooled_job_flow_with_max_hours_idle(self):
        # the max-hours-idle action gets added AFTER the master bootstrap
        # script; that used to be a problem when only the last action
        # was inspected
        hash_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
        name_arg = MockEmrObject(value='reflecting')
        master_action = MockEmrObject(args=[hash_arg, name_arg], name='master')

        timeout_args = [
            MockEmrObject(value='900'),
            MockEmrObject(value='300'),
        ]
        timeout_action = MockEmrObject(args=timeout_args, name='idle timeout')

        jf = MockEmrObject(bootstrapactions=[master_action, timeout_action])

        expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
        self.assertEqual(pool_hash_and_name(jf), expected)
Esempio n. 18
0
    def test_pooled_job_flow_with_max_hours_idle(self):
        # the max-hours-idle action gets added AFTER the master bootstrap
        # script; that used to be a problem when only the last action
        # was inspected
        hash_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
        name_arg = MockEmrObject(value='reflecting')
        master_action = MockEmrObject(args=[hash_arg, name_arg], name='master')

        timeout_args = [
            MockEmrObject(value='900'),
            MockEmrObject(value='300'),
        ]
        timeout_action = MockEmrObject(args=timeout_args, name='idle timeout')

        jf = MockEmrObject(bootstrapactions=[master_action, timeout_action])

        expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
        self.assertEqual(pool_hash_and_name(jf), expected)
 def assertJobFlowIs(
     self, jf,
     bootstrapping=False,
     done=False,
     from_end_of_hour=timedelta(hours=1),
     has_pending_steps=False,
     idle_for=timedelta(0),
     pool_hash=None,
     pool_name=None,
     running=False,
     streaming=True,
 ):
     """Assert that every inspected property of *jf* matches expectations.

     Each keyword argument is the expected value for one helper's result;
     the defaults are the values used when a test does not override them.
     Checks run in a fixed order and stop at the first mismatch.
     """
     actual_bootstrapping = is_job_flow_bootstrapping(jf)
     self.assertEqual(bootstrapping, actual_bootstrapping)

     actual_done = is_job_flow_done(jf)
     self.assertEqual(done, actual_done)

     actual_to_hour = est_time_to_hour(jf, self.now)
     self.assertEqual(from_end_of_hour, actual_to_hour)

     actual_pending = job_flow_has_pending_steps(jf)
     self.assertEqual(has_pending_steps, actual_pending)

     # idle time comes from this test case's own helper method
     actual_idle = self.time_job_flow_idle(jf)
     self.assertEqual(idle_for, actual_idle)

     expected_pool = (pool_hash, pool_name)
     self.assertEqual(expected_pool, pool_hash_and_name(jf))

     actual_running = is_job_flow_running(jf)
     self.assertEqual(running, actual_running)

     actual_streaming = is_job_flow_streaming(jf)
     self.assertEqual(streaming, actual_streaming)
Esempio n. 20
0
    def test_too_few_args(self):
        # a bootstrap action with no args at all; expected (None, None)
        argless_action = MockEmrObject(args=[])
        jf = MockEmrObject(bootstrapactions=[argless_action])

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)
Esempio n. 21
0
def inspect_and_maybe_terminate_job_flows(conf_paths=None,
                                          dry_run=False,
                                          max_hours_idle=None,
                                          mins_to_end_of_hour=None,
                                          now=None,
                                          pool_name=None,
                                          pooled_only=False,
                                          unpooled_only=False,
                                          max_mins_locked=None,
                                          quiet=False,
                                          **kwargs):
    """Find idle job flows and hand the matching ones off for termination.

    Every job flow returned by ``describe_all_job_flows()`` is classified
    as done, bootstrapping, non-streaming, running, pending or idle. Only
    pending and idle job flows are candidates; those that pass all of the
    filters below are passed to ``terminate_and_notify()``.

    :param conf_paths: passed through to ``EMRJobRunner``.
    :param dry_run: passed through to ``terminate_and_notify()``.
    :param max_hours_idle: if set, skip job flows that have been idle for
                           this many hours or less. Defaults to
                           ``DEFAULT_MAX_HOURS_IDLE`` when both this and
                           *mins_to_end_of_hour* are ``None``.
    :param mins_to_end_of_hour: if set, skip job flows that have pending
                                steps, or whose estimated time to the end
                                of the hour is this many minutes or more.
    :param now: reference time for idle calculations; defaults to
                ``datetime.utcnow()``.
    :param pool_name: if set, skip job flows whose pool name differs.
    :param pooled_only: if true, skip job flows that are not in a pool.
    :param unpooled_only: if true, skip job flows that are in a pool.
    :param max_mins_locked: passed through to ``terminate_and_notify()``.
    :param quiet: passed through to ``terminate_and_notify()``.
    :param kwargs: any other keyword args are passed to ``EMRJobRunner``.
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_paths=conf_paths, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, whether it has pending steps,
    # idle time, time to end of hour); both times are timedeltas
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug('Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                      (jf.jobflowid, 'pending' if pending else 'idle',
                       strip_microseconds(time_idle),
                       strip_microseconds(time_to_end_of_hour),
                       ('unpooled' if pool is None else 'in %s pool' % pool),
                       jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None
                    and time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None
                    and (pending or time_to_end_of_hour >=
                         timedelta(minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    # the argument order must line up with the labels in the format string
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' %
        (num_bootstrapping, num_running, num_pending, num_idle,
         num_non_streaming, num_done))

    terminate_and_notify(runner,
                         to_terminate,
                         dry_run=dry_run,
                         max_mins_locked=max_mins_locked,
                         quiet=quiet)
Esempio n. 22
0
    def test_empty_bootstrap_actions(self):
        # bootstrapactions is present but empty; expected (None, None)
        no_actions = []
        jf = MockEmrObject(bootstrapactions=no_actions)

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)
Esempio n. 23
0
    def test_empty(self):
        # a job flow object built with no attributes; expected (None, None)
        bare_jf = MockEmrObject()

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(bare_jf), expected)
Esempio n. 24
0
    def test_empty_bootstrap_actions(self):
        # bootstrapactions is present but empty; expected (None, None)
        no_actions = []
        jf = MockEmrObject(bootstrapactions=no_actions)

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)
Esempio n. 25
0
    def test_empty(self):
        # a job flow object built with no attributes; expected (None, None)
        bare_jf = MockEmrObject()

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(bare_jf), expected)
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
    max_mins_locked=None,
    quiet=False,
    **kwargs
):
    """Find idle job flows and hand the matching ones off for termination.

    Every job flow returned by ``describe_all_job_flows()`` is classified
    as done, bootstrapping, non-streaming, running, pending or idle. Only
    pending and idle job flows are candidates; those that pass all of the
    filters below are passed to ``terminate_and_notify()``.

    :param conf_path: passed through to ``EMRJobRunner``.
    :param dry_run: passed through to ``terminate_and_notify()``.
    :param max_hours_idle: if set, skip job flows that have been idle for
                           this many hours or less. Defaults to
                           ``DEFAULT_MAX_HOURS_IDLE`` when both this and
                           *mins_to_end_of_hour* are ``None``.
    :param mins_to_end_of_hour: if set, skip job flows that have pending
                                steps, or whose estimated time to the end
                                of the hour is this many minutes or more.
    :param now: reference time for idle calculations; defaults to
                ``datetime.utcnow()``.
    :param pool_name: if set, skip job flows whose pool name differs.
    :param pooled_only: if true, skip job flows that are not in a pool.
    :param unpooled_only: if true, skip job flows that are in a pool.
    :param max_mins_locked: passed through to ``terminate_and_notify()``.
    :param quiet: passed through to ``terminate_and_notify()``.
    :param kwargs: any other keyword args are passed to ``EMRJobRunner``.
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_path=conf_path, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, whether it has pending steps,
    # idle time, time to end of hour); both times are timedeltas
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 'pending' if pending else 'idle',
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                    (pending or
                     time_to_end_of_hour >= timedelta(
                         minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    # the argument order must line up with the labels in the format string
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' % (
            num_bootstrapping, num_running, num_pending, num_idle,
            num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
Esempio n. 27
0
    def test_too_few_args(self):
        # a bootstrap action with no args at all; expected (None, None)
        argless_action = MockEmrObject(args=[])
        jf = MockEmrObject(bootstrapactions=[argless_action])

        expected = (None, None)
        self.assertEqual(pool_hash_and_name(jf), expected)