Esempio n. 1
0
def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Yield a description of each EMR cluster, one at a time.

    Each yielded item is the ``Cluster`` dict from ``describe_cluster``,
    with two keys filled in: ``Steps`` (everything ``list_steps`` returns,
    in the reverse of the order the API returned it) and
    ``BootstrapActions`` (everything ``list_bootstrap_actions`` returns).

    :param float max_days_ago: If set, only list clusters created within
                               this many days of *now*.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    emr_client = EMRJobRunner(**runner_kwargs).make_emr_client()

    # _delay=_DELAY asks _boto3_paginate() to sleep after each API call
    # (see #1091). Could implement some sort of connection wrapper for this
    # if it becomes more generally useful.
    paginate_kwargs = {'_delay': _DELAY}

    # if --max-days-ago is set, only look at recent clusters
    if max_days_ago is not None:
        paginate_kwargs['CreatedAfter'] = now - timedelta(days=max_days_ago)

    summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters', **paginate_kwargs)

    for summary in summaries:
        cluster_id = summary['Id']

        description = emr_client.describe_cluster(ClusterId=cluster_id)
        cluster = description['Cluster']
        # describe_cluster() isn't paginated, so we sleep by hand
        sleep(_DELAY)

        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id, _delay=_DELAY))
        steps.reverse()
        cluster['Steps'] = steps

        cluster['BootstrapActions'] = list(_boto3_paginate(
            'BootstrapActions', emr_client, 'list_bootstrap_actions',
            ClusterId=cluster_id, _delay=_DELAY))

        yield cluster
Esempio n. 2
0
def get_or_create_mrjob_instance_profile(client):
    """Return the name of an instance profile usable by EMR, creating one
    (together with its role) if no existing profile qualifies."""
    # an existing profile qualifies only if it wraps exactly one role, and
    # _role_matches() accepts that role for the mrjob instance profile role
    # and policy ARN
    existing_profiles = _boto3_paginate(
        'InstanceProfiles', client, 'list_instance_profiles')

    for candidate in existing_profiles:
        attached_roles = candidate['Roles']
        if len(attached_roles) == 1 and _role_matches(
                client, attached_roles[0], _MRJOB_INSTANCE_PROFILE_ROLE,
                _EMR_INSTANCE_PROFILE_POLICY_ARN):
            return candidate['InstanceProfileName']

    # nothing usable: create a new role, then wrap it in an instance
    # profile with the same name
    new_name = _create_mrjob_role_with_attached_policy(
        client, _MRJOB_INSTANCE_PROFILE_ROLE, _EMR_INSTANCE_PROFILE_POLICY_ARN)

    client.create_instance_profile(InstanceProfileName=new_name)
    client.add_role_to_instance_profile(
        InstanceProfileName=new_name, RoleName=new_name)

    log.info('Auto-created instance profile %s' % new_name)

    return new_name
Esempio n. 3
0
def main(args=None):
    """Parse *args*, look up clusters that are starting, bootstrapping, or
    running, and print a report of their long-running jobs.

    :param args: argument list handed to the arg parser's ``parse_args()``
                 (``None`` is passed through as-is)
    """
    # captured before anything else, and used as the reference time when
    # measuring how long jobs have been running
    now = _boto3_now()

    options = _make_arg_parser().parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    runner = EMRJobRunner(**_runner_kwargs(options))
    emr_client = runner.make_emr_client()

    summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    # only filter when the user asked to exclude something
    if options.exclude:
        summaries = _filter_clusters(summaries, emr_client, options.exclude)

    _print_report(
        _find_long_running_jobs(emr_client, summaries, min_time, now=now))
Esempio n. 4
0
def main(args=None):
    """Command-line entry point: report long-running jobs on clusters that
    are in the STARTING, BOOTSTRAPPING, or RUNNING state.

    :param args: arguments to parse; handed unchanged to the arg parser's
                 ``parse_args()``
    """
    # take the timestamp up front; job durations are measured against it
    now = _boto3_now()

    parser = _make_arg_parser()
    options = parser.parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)
    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()

    active_states = ['STARTING', 'BOOTSTRAPPING', 'RUNNING']
    clusters_to_check = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters', ClusterStates=active_states)

    # apply the exclusion filter only if one was requested
    if options.exclude:
        clusters_to_check = _filter_clusters(
            clusters_to_check, emr_client, options.exclude)

    long_running = _find_long_running_jobs(
        emr_client, clusters_to_check, min_time, now=now)

    _print_report(long_running)
Esempio n. 5
0
File: iam.py Progetto: Affirm/mrjob
def get_or_create_mrjob_instance_profile(client):
    """Find an instance profile that EMR can use and return its name. If
    there isn't one, create a role plus a same-named instance profile and
    return that name instead."""
    profiles = _boto3_paginate(
        'InstanceProfiles', client, 'list_instance_profiles')

    for profile in profiles:
        profile_roles = profile['Roles']

        # usable profiles point at exactly one role...
        if len(profile_roles) == 1:
            # ...and that role has to pass _role_matches()
            role_ok = _role_matches(
                client, profile_roles[0], _MRJOB_INSTANCE_PROFILE_ROLE,
                _EMR_INSTANCE_PROFILE_POLICY_ARN)
            if role_ok:
                return profile['InstanceProfileName']

    # no match; make a role and an instance profile that share one name
    created_name = _create_mrjob_role_with_attached_policy(
        client, _MRJOB_INSTANCE_PROFILE_ROLE, _EMR_INSTANCE_PROFILE_POLICY_ARN)

    client.create_instance_profile(InstanceProfileName=created_name)
    client.add_role_to_instance_profile(
        InstanceProfileName=created_name, RoleName=created_name)

    log.info('Auto-created instance profile %s' % created_name)

    return created_name
Esempio n. 6
0
def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Fetch cluster descriptions from EMR, yielding them one by one.

    Every yielded dict is the ``Cluster`` entry of a ``describe_cluster``
    response, extended with ``Steps`` (the ``list_steps`` results in
    reversed order) and ``BootstrapActions`` (the
    ``list_bootstrap_actions`` results).

    :param float max_days_ago: If set, don't fetch clusters created longer
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    emr_client = EMRJobRunner(**runner_kwargs).make_emr_client()

    def _paginate_with_delay(what, api_call, **api_kwargs):
        # every paginated call gets _delay=_DELAY so that we sleep after
        # each API call (see #1091)
        return _boto3_paginate(
            what, emr_client, api_call, _delay=_DELAY, **api_kwargs)

    # if --max-days-ago is set, only look at recent clusters
    cluster_filter = {}
    if max_days_ago is not None:
        cluster_filter['CreatedAfter'] = now - timedelta(days=max_days_ago)

    for summary in _paginate_with_delay(
            'Clusters', 'list_clusters', **cluster_filter):

        cluster_id = summary['Id']

        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        sleep(_DELAY)

        fetched_steps = list(_paginate_with_delay(
            'Steps', 'list_steps', ClusterId=cluster_id))
        cluster['Steps'] = fetched_steps[::-1]

        cluster['BootstrapActions'] = list(_paginate_with_delay(
            'BootstrapActions', 'list_bootstrap_actions',
            ClusterId=cluster_id))

        yield cluster
Esempio n. 7
0
def _get_step(emr_client, cluster_id, step_id=None):
    """Return the first step on the cluster that ``_step_matches()`` accepts.

    Steps are checked in the order ``list_steps`` returns them. If no step
    matches, an error is logged and ``None`` is returned.

    :param emr_client: EMR client handed to ``_boto3_paginate()``
    :param cluster_id: ID of the cluster whose steps are searched
    :param step_id: passed through to ``_step_matches()``; also decides
                    which error is logged when nothing matches
    """
    # no filtering by step ID or status in the API call itself; the step
    # we want is usually one of the first ones returned anyhow
    candidates = _boto3_paginate(
        'Steps', emr_client, 'list_steps', ClusterId=cluster_id)

    for candidate in candidates:
        if _step_matches(candidate, step_id=step_id):
            return candidate

    # only reached when no step matched
    if step_id:
        log.error('step %s not found on cluster %s' %
                  (step_id, cluster_id))
    else:
        log.error('cluster %s has no failed steps' % cluster_id)
Esempio n. 8
0
def _get_step(emr_client, cluster_id, step_id=None):
    """Look through a cluster's steps for one matching *step_id*.

    Returns the first step (in ``list_steps`` order) for which
    ``_step_matches(step, step_id=step_id)`` is true. When there is no such
    step, logs an error and falls through, returning ``None``.
    """
    # walk the steps as the API hands them back rather than filtering by
    # step ID or status; the match is usually found right away
    matching_step = None
    found = False

    for step in _boto3_paginate('Steps', emr_client, 'list_steps',
                                ClusterId=cluster_id):
        if _step_matches(step, step_id=step_id):
            matching_step = step
            found = True
            break

    if found:
        return matching_step

    if step_id:
        log.error('step %s not found on cluster %s' %
                  (step_id, cluster_id))
    else:
        log.error('cluster %s has no failed steps' % cluster_id)
Esempio n. 9
0
File: iam.py Progetto: Affirm/mrjob
def get_or_create_mrjob_service_role(client):
    """Return the name of a service role usable by EMR, creating the role
    first if no existing one qualifies."""
    # an existing role qualifies if _role_matches() accepts it for the
    # mrjob service role and policy ARN
    existing_roles = _boto3_paginate('Roles', client, 'list_roles')

    for candidate in existing_roles:
        is_usable = _role_matches(
            client, candidate, _MRJOB_SERVICE_ROLE,
            _EMR_SERVICE_ROLE_POLICY_ARN)
        if is_usable:
            return candidate['RoleName']

    # nothing usable, so make our own
    new_role_name = _create_mrjob_role_with_attached_policy(
        client, _MRJOB_SERVICE_ROLE, _EMR_SERVICE_ROLE_POLICY_ARN)

    log.info('Auto-created service role %s' % new_role_name)

    return new_role_name
Esempio n. 10
0
def get_or_create_mrjob_service_role(client):
    """Find a service role that EMR can use, or create one if none exists.
    In both cases, the role's name is returned."""

    def _is_usable(role):
        # same check for every listed role: it must match the mrjob
        # service role and the EMR service role policy ARN
        return _role_matches(client, role, _MRJOB_SERVICE_ROLE,
                             _EMR_SERVICE_ROLE_POLICY_ARN)

    for role in _boto3_paginate('Roles', client, 'list_roles'):
        if _is_usable(role):
            return role['RoleName']

    # we ran out of roles without a match; create the role ourselves
    created = _create_mrjob_role_with_attached_policy(
        client, _MRJOB_SERVICE_ROLE, _EMR_SERVICE_ROLE_POLICY_ARN)

    log.info('Auto-created service role %s' % created)

    return created
Esempio n. 11
0
    def test_retry_during_pagination(self):
        """Pagination should survive a transient error on the first call
        (regression test for #2005)."""
        expected_names = ['walrus%02d' % n for n in range(100)]

        # side_effect has to be in place before the error is added
        self.list_buckets.side_effect = [{'Buckets': expected_names}]
        timeout = socket.error(110, 'Connection timed out')
        self.add_transient_error(timeout)

        # Mock pagination distorts this test a little: list_buckets() is
        # not called once per page of names. It is called just twice, once
        # to fail with the transient error and once to return the full
        # list, which mock pagination then splits into "pages". The
        # important part is still covered, namely that a retry can happen
        # at all while paginating.
        paginated_names = list(
            _boto3_paginate('Buckets', self.wrapped_client, 'list_buckets'))

        self.assertEqual(paginated_names, expected_names)
Esempio n. 12
0
def _find_long_running_jobs(emr_client, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param emr_client: EMR client, used to list the steps of clusters in
                       the ``'RUNNING'`` state
    :param cluster_summaries: an iterable of :py:mod:`boto3` cluster summary
                              data structures
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running or
                     pending longer than this
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending longer than *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if there
               is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
              :py:class:`datetime.timedelta`
    """
    if now is None:
        now = _boto3_now()

    for cs in cluster_summaries:

        # special case for jobs that are taking a long time to bootstrap
        if cs['Status']['State'] in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created = cs['Status']['Timeline']['CreationDateTime']

            time_running = now - created

            if time_running >= min_time:
                yield {'cluster_id': cs['Id'],
                       'name': cs['Name'],
                       'state': cs['Status']['State'],
                       'time': time_running}

        # the default case: running clusters
        if cs['Status']['State'] != 'RUNNING':
            continue

        # list_steps results are reversed so that, when scanning for
        # COMPLETED steps below, the last one seen is the last one returned
        # first by the API
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cs['Id']))))

        running_steps = [
            step for step in steps if step['Status']['State'] == 'RUNNING']
        pending_steps = [
            step for step in steps if step['Status']['State'] == 'PENDING']

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:

                start = step['Status']['Timeline']['StartDateTime']

                time_running = now - start

                if time_running >= min_time:
                    yield {'cluster_id': cs['Id'],
                           'name': step['Name'],
                           'state': step['Status']['State'],
                           'time': time_running}

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            # the step we report on. It gets its own name so that the scan
            # over *steps* below can't rebind it (reusing ``step`` as the
            # loop variable made us report the last step in *steps*, whatever
            # its state, instead of the first pending one)
            pending_step = pending_steps[0]

            # PENDING job should have run starting when the cluster
            # became ready, or the previous step completed
            start = cs['Status']['Timeline']['ReadyDateTime']
            for prev_step in steps:
                if prev_step['Status']['State'] == 'COMPLETED':
                    start = prev_step['Status']['Timeline']['EndDateTime']

            time_pending = now - start

            if time_pending >= min_time:
                yield {'cluster_id': cs['Id'],
                       'name': pending_step['Name'],
                       'state': pending_step['Status']['State'],
                       'time': time_pending}
Esempio n. 13
0
def _find_long_running_jobs(emr_client, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param emr_client: EMR client, used to list the steps of clusters in
                       the ``'RUNNING'`` state
    :param cluster_summaries: an iterable of :py:mod:`boto3` cluster summary
                              data structures
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running or
                     pending longer than this
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending longer than *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if there
               is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
              :py:class:`datetime.timedelta`
    """
    if now is None:
        now = _boto3_now()

    for cs in cluster_summaries:

        # special case for jobs that are taking a long time to bootstrap
        if cs['Status']['State'] in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created = cs['Status']['Timeline']['CreationDateTime']

            time_running = now - created

            if time_running >= min_time:
                yield {'cluster_id': cs['Id'],
                       'name': cs['Name'],
                       'state': cs['Status']['State'],
                       'time': time_running}

        # the default case: running clusters
        if cs['Status']['State'] != 'RUNNING':
            continue

        # steps are kept in the reverse of the order list_steps returns them
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cs['Id']))))

        running_steps = [
            step for step in steps if step['Status']['State'] == 'RUNNING']
        pending_steps = [
            step for step in steps if step['Status']['State'] == 'PENDING']

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:

                start = step['Status']['Timeline']['StartDateTime']

                time_running = now - start

                if time_running >= min_time:
                    yield {'cluster_id': cs['Id'],
                           'name': step['Name'],
                           'state': step['Status']['State'],
                           'time': time_running}

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            # this is the step that gets reported. The loop below must use
            # a different variable name: it used to rebind ``step``, so the
            # name and state yielded came from the last element of *steps*
            # rather than from the first pending step
            first_pending = pending_steps[0]

            # PENDING job should have run starting when the cluster
            # became ready, or the previous step completed
            start = cs['Status']['Timeline']['ReadyDateTime']
            for other_step in steps:
                if other_step['Status']['State'] == 'COMPLETED':
                    start = other_step['Status']['Timeline']['EndDateTime']

            time_pending = now - start

            if time_pending >= min_time:
                yield {'cluster_id': cs['Id'],
                       'name': first_pending['Name'],
                       'state': first_pending['Status']['State'],
                       'time': time_pending}
Esempio n. 14
0
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Hand every idle cluster that meets the given criteria to
    ``_terminate_and_notify()``, then log how many clusters were found in
    each state.

    :param dry_run: passed through to ``_terminate_and_notify()``
    :param max_hours_idle: if set, leave alone clusters that have been idle
                           for this many hours or less. Defaults to
                           ``_DEFAULT_MAX_HOURS_IDLE`` when
                           *mins_to_end_of_hour* isn't set either.
    :param mins_to_end_of_hour: if set, leave alone clusters with pending
                                steps, and clusters whose estimated time to
                                the end of the hour is this many minutes or
                                more
    :param now: the current time, as a :py:class:`datetime.datetime`.
                Defaults to ``_boto3_now()``.
    :param pool_name: if set, leave alone clusters whose pool name differs
    :param pooled_only: if true, leave alone clusters not in a pool
    :param unpooled_only: if true, leave alone clusters in a pool
    :param max_mins_locked: passed through to ``_terminate_and_notify()``
    :param quiet: passed through to ``_terminate_and_notify()``
    :param kwargs: keyword args for ``EMRJobRunner``
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # number of clusters seen in each state, for the final log message
    tally = dict.fromkeys(
        ('starting', 'bootstrapping', 'running', 'pending', 'idle', 'done'),
        0)

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    all_summaries = _boto3_paginate('Clusters', emr_client, 'list_clusters')

    for summary in all_summaries:
        cluster_id = summary['Id']

        # clusters that are done, starting, or bootstrapping are only
        # counted
        if _is_cluster_done(summary):
            tally['done'] += 1
            continue

        if _is_cluster_starting(summary):
            tally['starting'] += 1
            continue

        if _is_cluster_bootstrapping(summary):
            tally['bootstrapping'] += 1
            continue

        # need steps to learn more about cluster
        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))
        steps.reverse()

        if any(_is_step_running(step) for step in steps):
            tally['running'] += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(summary, steps)
        time_to_end_of_hour = _est_time_to_hour(summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        bootstrap_actions = list(_boto3_paginate(
            'BootstrapActions', emr_client, 'list_bootstrap_actions',
            ClusterId=cluster_id))

        _pool_hash, pool = _pool_hash_and_name(bootstrap_actions)

        tally['pending' if is_pending else 'idle'] += 1

        idle_desc = strip_microseconds(time_idle)
        end_of_hour_desc = strip_microseconds(time_to_end_of_hour)
        if pool is None:
            pool_desc = 'unpooled'
        else:
            pool_desc = 'in %s pool' % pool

        log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' % (
            cluster_id,
            'pending' if is_pending else 'idle',
            idle_desc,
            end_of_hour_desc,
            pool_desc,
            summary['Name']))

        # filter out clusters that don't meet our criteria
        if max_hours_idle is not None:
            if time_idle <= timedelta(hours=max_hours_idle):
                continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if mins_to_end_of_hour is not None:
            if is_pending:
                continue
            if time_to_end_of_hour >= timedelta(minutes=mins_to_end_of_hour):
                continue

        in_pool = pool is not None

        if pooled_only and not in_pool:
            continue

        if unpooled_only and in_pool:
            continue

        if pool_name is not None and pool != pool_name:
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            time_to_end_of_hour=time_to_end_of_hour,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d done' % (
                 tally['starting'], tally['bootstrapping'], tally['running'],
                 tally['pending'], tally['idle'], tally['done']))
Esempio n. 15
0
def _maybe_terminate_clusters(dry_run=False,
                              max_mins_idle=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Hand every idle cluster that meets the given criteria to
    ``_terminate_and_notify()``, then log how many clusters were found in
    each state.

    :param dry_run: passed through to ``_terminate_and_notify()``
    :param max_mins_idle: leave alone clusters that have been idle for this
                          many minutes or less. Defaults to
                          ``_DEFAULT_MAX_MINS_IDLE``.
    :param now: the current time, as a :py:class:`datetime.datetime`.
                Defaults to ``_boto3_now()``.
    :param pool_name: if set, leave alone clusters whose pool name differs
    :param pooled_only: if true, leave alone clusters not in a pool
    :param unpooled_only: if true, leave alone clusters in a pool
    :param max_mins_locked: passed through to ``_terminate_and_notify()``
    :param quiet: passed through to ``_terminate_and_notify()``
    :param kwargs: keyword args for ``EMRJobRunner``
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # how many clusters we saw in each state; reported at the very end
    seen = {
        'starting': 0,
        'bootstrapping': 0,
        'running': 0,
        'pending': 0,
        'idle': 0,
        'done': 0,
    }

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for summary in _boto3_paginate('Clusters', emr_client, 'list_clusters'):
        cluster_id = summary['Id']

        # done, starting, and bootstrapping clusters are only counted
        if _is_cluster_done(summary):
            seen['done'] += 1
            continue

        if _is_cluster_starting(summary):
            seen['starting'] += 1
            continue

        if _is_cluster_bootstrapping(summary):
            seen['bootstrapping'] += 1
            continue

        # need steps to learn more about cluster
        listed_steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))
        steps = listed_steps[::-1]

        if any(_is_step_running(step) for step in steps):
            seen['running'] += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        description = emr_client.describe_cluster(ClusterId=cluster_id)
        cluster = description['Cluster']

        _pool_hash, pool = _pool_hash_and_name(cluster)

        if is_pending:
            activity = 'pending'
        else:
            activity = 'idle'
        seen[activity] += 1

        idle_desc = strip_microseconds(time_idle)
        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool

        log.debug('cluster %s %s for %s, %s (%s)' % (
            cluster_id, activity, idle_desc, pool_desc, summary['Name']))

        # filter out clusters that don't meet our criteria
        if max_mins_idle is not None:
            if time_idle <= timedelta(minutes=max_mins_idle):
                continue

        if pooled_only and pool is None:
            continue

        if unpooled_only and pool is not None:
            continue

        if pool_name is not None and pool != pool_name:
            continue

        # terminate idle cluster
        _terminate_and_notify(runner=runner,
                              cluster_id=cluster_id,
                              cluster_name=summary['Name'],
                              num_steps=len(steps),
                              is_pending=is_pending,
                              time_idle=time_idle,
                              dry_run=dry_run,
                              max_mins_locked=max_mins_locked,
                              quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d done' % (
                 seen['starting'], seen['bootstrapping'], seen['running'],
                 seen['pending'], seen['idle'], seen['done']))
Esempio n. 16
0
def _maybe_terminate_clusters(dry_run=False,
                              max_mins_idle=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Hand every idle, unprotected cluster that meets the given criteria
    to ``_terminate_and_notify()``, then log how many clusters were found
    in each state.

    Only clusters in the ``WAITING`` or ``RUNNING`` state are listed.

    :param dry_run: passed through to ``_terminate_and_notify()``
    :param max_mins_idle: leave alone clusters that have been idle for this
                          many minutes or less. Defaults to
                          ``_DEFAULT_MAX_MINS_IDLE``.
    :param now: the current time, as a :py:class:`datetime.datetime`.
                Defaults to ``_boto3_now()``.
    :param pool_name: if set, leave alone clusters whose pool name differs
    :param pooled_only: if true, leave alone clusters not in a pool
    :param unpooled_only: if true, leave alone clusters in a pool
    :param max_mins_locked: passed through to ``_terminate_and_notify()``
    :param quiet: passed through to ``_terminate_and_notify()``
    :param kwargs: keyword args for ``EMRJobRunner``
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # per-state cluster counts for the summary logged at the end
    counts = dict.fromkeys(
        ('starting', 'bootstrapping', 'running', 'pending', 'idle', 'done'),
        0)

    # include RUNNING to catch clusters with PENDING jobs that
    # never ran (see #365).
    candidate_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['WAITING', 'RUNNING'])

    for summary in candidate_summaries:
        cluster_id = summary['Id']

        # done, starting, and bootstrapping clusters are only counted
        if _is_cluster_done(summary):
            counts['done'] += 1
            continue

        if _is_cluster_starting(summary):
            counts['starting'] += 1
            continue

        if _is_cluster_bootstrapping(summary):
            counts['bootstrapping'] += 1
            continue

        # need steps to learn more about cluster
        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))
        steps.reverse()

        if any(_is_step_running(step) for step in steps):
            counts['running'] += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

        _pool_hash, pool = _pool_hash_and_name(cluster)

        activity = 'pending' if is_pending else 'idle'
        counts[activity] += 1

        idle_desc = strip_microseconds(time_idle)
        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool
        cluster_name = summary['Name']
        if cluster['TerminationProtected']:
            protection_desc = 'protected'
        else:
            protection_desc = 'unprotected'

        log.debug('cluster %s %s for %s, %s (%s) - %s' % (
            cluster_id, activity, idle_desc, pool_desc, cluster_name,
            protection_desc))

        # filter out clusters that don't meet our criteria
        if max_mins_idle is not None:
            if time_idle <= timedelta(minutes=max_mins_idle):
                continue

        if pooled_only and pool is None:
            continue

        if unpooled_only and pool is not None:
            continue

        if pool_name is not None and pool != pool_name:
            continue

        # never terminate a cluster with termination protection turned on
        if cluster['TerminationProtected']:
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d done' % (
                 counts['starting'], counts['bootstrapping'],
                 counts['running'], counts['pending'], counts['idle'],
                 counts['done']))
Esempio n. 17
0
def _attempt_to_lock_cluster(emr_client,
                             cluster_id,
                             job_key,
                             cluster=None,
                             when_cluster_described=None):
    """Attempt to lock the given pooled cluster using EMR tags.

    You may optionally include *cluster* (a cluster description) and
    *when_cluster_described*, to save an API call to ``DescribeCluster``

    If the cluster's StepConcurrency Level is 1, locking considers the cluster
    available if it's in the WAITING state. this means we should not release
    our lock until our step(s) have started running, which can take several
    seconds.

    Otherwise, steps can run concurrently, so locking
    considers the cluster available if it's in the WAITING or RUNNING state.
    Additionally, it makes a ``ListSteps`` API call to verify that the cluster
    doesn't already have as many active steps as it can run simultaneously.
    Because other jobs looking to join the cluster will also count steps,
    we can release our lock as soon as we add our steps.

    :param emr_client: EMR client used to describe and tag the cluster, and
                       to list its steps
    :param cluster_id: ID of the cluster to lock
    :param job_key: key of the job taking the lock; stored in the lock tag
    :param cluster: optional cluster description, as found under the
                    ``Cluster`` key of a ``describe_cluster`` response
    :param when_cluster_described: optional time (comparable to
                                   ``time.time()``) used as the start of the
                                   locking attempt. Defaults to the current
                                   time.

    :return: ``True`` if we acquired the lock, ``False`` in every other case
    """
    log.debug('Attempting to lock cluster %s for %.1f seconds' %
              (cluster_id, _CLUSTER_LOCK_SECS))

    if cluster is None:
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

    if when_cluster_described is None:
        start = time.time()
    else:
        start = when_cluster_described

    if cluster['StepConcurrencyLevel'] == 1:
        step_accepting_states = ['WAITING']
    else:
        step_accepting_states = ['RUNNING', 'WAITING']

    # make sure the cluster can take steps at all
    state = cluster['Status']['State']

    if state not in step_accepting_states:
        # this could happen if the cluster were TERMINATING, for example
        log.info('  cluster is not accepting steps, state is %s' % state)
        return False

    # check if there is a non-expired lock
    lock = _get_cluster_lock(cluster)

    if lock:
        # expiry stays None if the lock can't be parsed, in which case the
        # lock is ignored (and their_job_key is never read)
        expiry = None
        try:
            their_job_key, expiry = _parse_cluster_lock(lock)
        except ValueError:
            log.info('  ignoring invalid pool lock: %s' % lock)

        if expiry and expiry > start:
            log.info('  locked by %s for %.1f seconds' %
                     (their_job_key, expiry - start))
            return False

    # add our lock
    our_lock = _make_cluster_lock(job_key, start + _CLUSTER_LOCK_SECS)

    log.debug('  adding tag to cluster %s:' % cluster_id)
    log.debug('    %s=%s' % (_POOL_LOCK_KEY, our_lock))
    emr_client.add_tags(ResourceId=cluster_id,
                        Tags=[dict(Key=_POOL_LOCK_KEY, Value=our_lock)])

    if time.time() - start > _ADD_TAG_BEFORE:
        log.info('  took too long to tag cluster with lock')
        return False

    # wait, then check if our lock is still there
    log.info("  waiting %.1f seconds to ensure lock wasn't overwritten" %
             _WAIT_AFTER_ADD_TAG)
    time.sleep(_WAIT_AFTER_ADD_TAG)

    # re-fetch the cluster so we see its current state and tags
    cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

    state = cluster['Status']['State']

    if state not in step_accepting_states:
        # this could happen if the cluster were TERMINATING, for example
        log.info('  cluster is not accepting steps, state is %s' % state)
        return False

    if cluster['StepConcurrencyLevel'] > 1:
        # is cluster already full of steps?
        num_active_steps = len(
            list(
                _boto3_paginate('Steps',
                                emr_client,
                                'list_steps',
                                ClusterId=cluster_id,
                                StepStates=['PENDING', 'RUNNING'])))

        if num_active_steps >= cluster['StepConcurrencyLevel']:
            log.info('  cluster already has %d active steps' %
                     num_active_steps)
            # explicit False, like every other failure path (this used to
            # be a bare return, i.e. None)
            return False

    lock = _get_cluster_lock(cluster)

    if lock is None:
        log.info('  lock was removed')
        return False
    elif lock != our_lock:
        # name the other job in the log message if its lock can be parsed
        their_job_desc = 'other job'
        try:
            their_job_desc, expiry = _parse_cluster_lock(lock)
        except ValueError:
            pass

        log.info('  lock was overwritten by %s' % their_job_desc)
        return False

    # make sure we have enough time to add steps and have them run
    # before the lock expires
    if time.time() > start + _CHECK_TAG_BEFORE:
        log.info('  took too long to check for lock')
        return False

    log.info('  lock acquired')
    return True