Ejemplo n.º 1
0
def yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Yield one fully described cluster per cluster listed on EMR.

    Every yielded object is whatever ``patched_describe_cluster()``
    returns, with two attributes attached to it: ``steps`` (a list of
    the cluster's steps) and ``bootstrapactions`` (a list of its
    bootstrap actions).

    :param float max_days_ago: If set, a cutoff of *now* minus this many
                               days is passed as ``created_after`` when
                               listing clusters.
    :param now: the reference time, as a :py:class:`datetime.datetime`.
                Defaults to :py:meth:`datetime.datetime.utcnow`.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = datetime.utcnow()

    conn = EMRJobRunner(**runner_kwargs).make_emr_conn()

    # no cutoff unless the caller asked for a limited look-back window
    cutoff = (None if max_days_ago is None
              else now - timedelta(days=max_days_ago))

    for summary in _yield_all_clusters(conn, created_after=cutoff):
        cid = summary.id

        # the summary is not enough; fetch the full description and
        # attach the step and bootstrap-action listings to it
        described = patched_describe_cluster(conn, cid)
        described.steps = list(_yield_all_steps(conn, cid))
        described.bootstrapactions = list(
            _yield_all_bootstrap_actions(conn, cid))

        yield described
Ejemplo n.º 2
0
def yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Generate detailed cluster objects for the clusters found on EMR.

    For each cluster summary returned by ``_yield_all_clusters()``, the
    cluster is looked up with ``patched_describe_cluster()``; its steps
    are stored on ``.steps`` and its bootstrap actions on
    ``.bootstrapactions`` (both as lists) before it is yielded.

    :param float max_days_ago: If set, clusters are listed with
                               ``created_after`` set to *now* minus this
                               many days.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = datetime.utcnow()

    connection = EMRJobRunner(**runner_kwargs).make_emr_conn()

    # only restrict the listing when a look-back window was requested
    if max_days_ago is None:
        oldest_allowed = None
    else:
        oldest_allowed = now - timedelta(days=max_days_ago)

    summaries = _yield_all_clusters(
        connection, created_after=oldest_allowed)

    for entry in summaries:
        entry_id = entry.id

        detailed = patched_describe_cluster(connection, entry_id)
        detailed.steps = list(_yield_all_steps(connection, entry_id))
        detailed.bootstrapactions = list(
            _yield_all_bootstrap_actions(connection, entry_id))

        yield detailed
Ejemplo n.º 3
0
def _find_long_running_jobs(emr_conn, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param emr_conn: EMR connection; handed to ``_yield_all_steps()`` to
                     list the steps of each ``RUNNING`` cluster.
    :param cluster_summaries: iterable of cluster summary objects to
                              inspect. Each must provide ``id``, ``name``,
                              ``status.state`` and ``status.timeline``.
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running or
                     pending at least this long
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending for at least *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if there
               is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
              :py:class:`datetime.timedelta`
    """
    if now is None:
        now = datetime.utcnow()

    for cs in cluster_summaries:

        # special case for jobs that are taking a long time to bootstrap
        if cs.status.state in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created_timestamp = cs.status.timeline.creationdatetime
            created = iso8601_to_datetime(created_timestamp)

            time_running = now - created

            if time_running >= min_time:
                yield {
                    'cluster_id': cs.id,
                    'name': cs.name,
                    'state': cs.status.state,
                    'time': time_running,
                }

        # the default case: running clusters
        if cs.status.state != 'RUNNING':
            continue

        steps = list(_yield_all_steps(emr_conn, cs.id))

        running_steps = [
            step for step in steps if step.status.state == 'RUNNING'
        ]
        pending_steps = [
            step for step in steps if step.status.state == 'PENDING'
        ]

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:

                start_timestamp = step.status.timeline.startdatetime
                start = iso8601_to_datetime(start_timestamp)

                time_running = now - start

                if time_running >= min_time:
                    yield {
                        'cluster_id': cs.id,
                        'name': step.name,
                        'state': step.status.state,
                        'time': time_running,
                    }

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            pending_step = pending_steps[0]

            # PENDING job should have run starting when the cluster
            # became ready, or the previous step completed
            start_timestamp = cs.status.timeline.readydatetime

            # NOTE: this loop deliberately uses its own variable. Reusing
            # the name of the pending step here would rebind it to the
            # last step in the list, and we would then report the wrong
            # step's name and state below.
            for other_step in steps:
                if other_step.status.state == 'COMPLETED':
                    start_timestamp = other_step.status.timeline.enddatetime

            start = iso8601_to_datetime(start_timestamp)
            time_pending = now - start

            if time_pending >= min_time:
                yield {
                    'cluster_id': cs.id,
                    'name': pending_step.name,
                    'state': pending_step.status.state,
                    'time': time_pending,
                }
Ejemplo n.º 4
0
def maybe_terminate_clusters(dry_run=False,
                             max_hours_idle=None,
                             mins_to_end_of_hour=None,
                             now=None,
                             pool_name=None,
                             pooled_only=False,
                             unpooled_only=False,
                             max_mins_locked=None,
                             quiet=False,
                             **kwargs):
    """Walk all EMR clusters and terminate the idle ones that qualify.

    Clusters that are done, starting, bootstrapping, non-streaming, or
    that have a running step are only counted. Every other cluster is
    counted as pending or idle, logged at debug level, and handed to
    ``terminate_and_notify()`` unless one of the filters below excludes
    it. The per-state counts are logged at info level at the end.

    :param dry_run: passed through to ``terminate_and_notify()``
    :param max_hours_idle: if set, skip clusters that have been idle for
                           this many hours or less. When both this and
                           *mins_to_end_of_hour* are ``None``, it falls
                           back to ``DEFAULT_MAX_HOURS_IDLE``.
    :param mins_to_end_of_hour: if set, skip clusters with pending steps,
                                and clusters whose estimated time to the
                                end of the hour is at least this many
                                minutes.
    :param now: the reference time, as a :py:class:`datetime.datetime`.
                Defaults to :py:meth:`datetime.datetime.utcnow`.
    :param pool_name: if set, skip clusters whose pool name differs
    :param pooled_only: if true, skip clusters that are not in a pool
    :param unpooled_only: if true, skip clusters that are in a pool
    :param max_mins_locked: passed through to ``terminate_and_notify()``
    :param quiet: passed through to ``terminate_and_notify()``
    :param kwargs: keyword args used to construct ``EMRJobRunner``
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    # how many clusters we saw in each category
    tally = dict.fromkeys(
        ('starting', 'bootstrapping', 'running', 'pending', 'idle',
         'non_streaming', 'done'), 0)

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for summary in _yield_all_clusters(emr_conn):
        cid = summary.id

        # states we can recognize from the summary alone
        if is_cluster_done(summary):
            early_state = 'done'
        elif is_cluster_starting(summary):
            early_state = 'starting'
        elif is_cluster_bootstrapping(summary):
            early_state = 'bootstrapping'
        else:
            early_state = None

        if early_state is not None:
            tally[early_state] += 1
            continue

        # need steps to learn more about cluster
        steps = list(_yield_all_steps(emr_conn, cid))

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        if is_cluster_non_streaming(steps):
            tally['non_streaming'] += 1
            continue

        if any(is_step_running(step) for step in steps):
            tally['running'] += 1
            continue

        # nothing is running, so the cluster is idle (possibly with
        # steps still waiting to start)
        idle_for = now - time_last_active(summary, steps)
        until_hour_end = _est_time_to_hour(summary, now=now)
        has_pending = cluster_has_pending_steps(steps)

        _, pool = _pool_hash_and_name(
            list(_yield_all_bootstrap_actions(emr_conn, cid)))

        state_label = 'pending' if has_pending else 'idle'
        tally[state_label] += 1

        log.debug(
            'cluster %s %s for %s, %s to end of hour, %s (%s)' % (
                cid,
                state_label,
                strip_microseconds(idle_for),
                strip_microseconds(until_hour_end),
                ('unpooled' if pool is None else 'in %s pool' % pool),
                summary.name))

        # filter out clusters that don't meet our criteria
        if max_hours_idle is not None:
            if idle_for <= timedelta(hours=max_hours_idle):
                continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if mins_to_end_of_hour is not None:
            if has_pending:
                continue
            if until_hour_end >= timedelta(minutes=mins_to_end_of_hour):
                continue

        if pool is None and pooled_only:
            continue

        if pool is not None and unpooled_only:
            continue

        if pool_name is not None and pool != pool_name:
            continue

        # terminate idle cluster
        terminate_and_notify(
            runner=runner,
            cluster_id=cid,
            cluster_name=summary.name,
            num_steps=len(steps),
            is_pending=has_pending,
            time_idle=idle_for,
            time_to_end_of_hour=until_hour_end,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d active non-streaming, %d done' % (
            tally['starting'], tally['bootstrapping'], tally['running'],
            tally['pending'], tally['idle'], tally['non_streaming'],
            tally['done']))
Ejemplo n.º 5
0
def maybe_terminate_clusters(dry_run=False,
                             max_hours_idle=None,
                             mins_to_end_of_hour=None,
                             now=None,
                             pool_name=None,
                             pooled_only=False,
                             unpooled_only=False,
                             max_mins_locked=None,
                             quiet=False,
                             **kwargs):
    """Terminate idle EMR clusters that pass every requested filter.

    Each cluster is classified in turn. Clusters that are done, starting,
    bootstrapping, non-streaming, or that have a running step are counted
    and left alone. The rest are counted as pending or idle, described in
    a debug log line, and passed to ``terminate_and_notify()`` unless a
    filter rejects them. A one-line summary of the counts is logged at
    info level once all clusters have been visited.

    :param dry_run: forwarded to ``terminate_and_notify()``
    :param max_hours_idle: if set, clusters idle for this many hours or
                           less are kept. Defaults to
                           ``DEFAULT_MAX_HOURS_IDLE`` when neither this
                           nor *mins_to_end_of_hour* is given.
    :param mins_to_end_of_hour: if set, clusters with pending steps are
                                kept, as are clusters whose estimated
                                time to the end of the hour is at least
                                this many minutes.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param pool_name: if set, clusters in any other pool are kept
    :param pooled_only: if true, clusters without a pool are kept
    :param unpooled_only: if true, clusters with a pool are kept
    :param max_mins_locked: forwarded to ``terminate_and_notify()``
    :param quiet: forwarded to ``terminate_and_notify()``
    :param kwargs: keyword args used to construct ``EMRJobRunner``
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    n_starting = n_bootstrapping = n_done = 0
    n_idle = n_non_streaming = n_pending = n_running = 0

    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cs in _yield_all_clusters(emr_conn):
        cs_id = cs.id

        # job flow already finished?
        if is_cluster_done(cs):
            n_done += 1
            continue

        # job flow still starting up?
        if is_cluster_starting(cs):
            n_starting += 1
            continue

        # job flow still bootstrapping?
        if is_cluster_bootstrapping(cs):
            n_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        step_list = list(_yield_all_steps(emr_conn, cs_id))

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        if is_cluster_non_streaming(step_list):
            n_non_streaming += 1
            continue

        if any(is_step_running(s) for s in step_list):
            n_running += 1
            continue

        # cluster is idle
        idle_time = now - time_last_active(cs, step_list)
        hour_left = _est_time_to_hour(cs, now=now)
        is_pending = cluster_has_pending_steps(step_list)

        actions = list(_yield_all_bootstrap_actions(emr_conn, cs_id))
        _, pool = _pool_hash_and_name(actions)

        if is_pending:
            n_pending += 1
        else:
            n_idle += 1

        debug_args = (
            cs_id,
            'pending' if is_pending else 'idle',
            strip_microseconds(idle_time),
            strip_microseconds(hour_left),
            ('unpooled' if pool is None else 'in %s pool' % pool),
            cs.name,
        )
        log.debug('Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                  debug_args)

        # filter out job flows that don't meet our criteria
        idle_too_briefly = (
            max_hours_idle is not None and
            idle_time <= timedelta(hours=max_hours_idle))
        if idle_too_briefly:
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if mins_to_end_of_hour is not None:
            if is_pending:
                continue
            if hour_left >= timedelta(minutes=mins_to_end_of_hour):
                continue

        if pool is None:
            if pooled_only:
                continue
        elif unpooled_only:
            continue

        if pool_name is not None and pool != pool_name:
            continue

        # terminate idle cluster
        terminate_and_notify(runner=runner,
                             cluster_id=cs_id,
                             cluster_name=cs.name,
                             num_steps=len(step_list),
                             is_pending=is_pending,
                             time_idle=idle_time,
                             time_to_end_of_hour=hour_left,
                             dry_run=dry_run,
                             max_mins_locked=max_mins_locked,
                             quiet=quiet)

    summary_counts = (n_starting, n_bootstrapping, n_running, n_pending,
                      n_idle, n_non_streaming, n_done)
    log.info('Job flow statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d active non-streaming, %d done' %
             summary_counts)
Ejemplo n.º 6
0
def find_long_running_jobs(emr_conn, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param emr_conn: EMR connection; handed to ``_yield_all_steps()`` to
                     list the steps of each ``RUNNING`` cluster.
    :param cluster_summaries: iterable of cluster summary objects to
                              inspect. Each must provide ``id``, ``name``,
                              ``status.state`` and ``status.timeline``.
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running or
                     pending at least this long
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending for at least *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if there
               is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
              :py:class:`datetime.timedelta`
    """
    if now is None:
        now = datetime.utcnow()

    for cs in cluster_summaries:

        # special case for jobs that are taking a long time to bootstrap
        if cs.status.state in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created_timestamp = cs.status.timeline.creationdatetime
            created = iso8601_to_datetime(created_timestamp)

            time_running = now - created

            if time_running >= min_time:
                yield {'cluster_id': cs.id,
                       'name': cs.name,
                       'state': cs.status.state,
                       'time': time_running}

        # the default case: running clusters
        if cs.status.state != 'RUNNING':
            continue

        steps = list(_yield_all_steps(emr_conn, cs.id))

        running_steps = [
            step for step in steps if step.status.state == 'RUNNING']
        pending_steps = [
            step for step in steps if step.status.state == 'PENDING']

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:

                start_timestamp = step.status.timeline.startdatetime
                start = iso8601_to_datetime(start_timestamp)

                time_running = now - start

                if time_running >= min_time:
                    yield {'cluster_id': cs.id,
                           'name': step.name,
                           'state': step.status.state,
                           'time': time_running}

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            pending_step = pending_steps[0]

            # PENDING job should have run starting when the cluster
            # became ready, or the previous step completed
            start_timestamp = cs.status.timeline.readydatetime

            # NOTE: this loop deliberately uses its own variable. Reusing
            # the name of the pending step here would rebind it to the
            # last step in the list, and we would then report the wrong
            # step's name and state below.
            for other_step in steps:
                if other_step.status.state == 'COMPLETED':
                    start_timestamp = other_step.status.timeline.enddatetime

            start = iso8601_to_datetime(start_timestamp)
            time_pending = now - start

            if time_pending >= min_time:
                yield {'cluster_id': cs.id,
                       'name': pending_step.name,
                       'state': pending_step.status.state,
                       'time': time_pending}