Example 1
    def test_now_is_automatically_set(self):
        cs = dict(Status=dict(Timeline=dict(CreationDateTime=_boto3_now())))

        t = _est_time_to_hour(cs)

        self.assertLessEqual(t, timedelta(minutes=60))
        self.assertGreater(t, timedelta(minutes=59))
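
All of these examples rely on _boto3_now() returning a timezone-aware UTC datetime; boto3's own response timestamps carry tzinfo, so a naive datetime could not be subtracted against them. A minimal stand-in sketch, assuming the helper simply wraps the standard library (mrjob's real implementation may differ, e.g. by using dateutil's tzutc()):

from datetime import datetime, timezone

def _boto3_now():
    # Hypothetical stand-in: a UTC datetime with tzinfo set, which is
    # what boto3 timestamps use and what the examples below subtract
    # against.
    return datetime.now(timezone.utc)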
Example 2
def main(cl_args=None):
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet,
                         verbose=options.verbose)

    # max_hours_idle -> max_mins_idle
    max_mins_idle = options.max_mins_idle
    if max_mins_idle is None and options.max_hours_idle is not None:
        log.warning('--max-hours-idle is deprecated and will be removed'
                    ' in v0.7.0. Please use --max-mins-idle instead.')
        max_mins_idle = options.max_hours_idle * 60

    if options.mins_to_end_of_hour is not None:
        log.warning('--mins-to-end-of-hour is deprecated as of v0.6.0'
                    ' and does nothing')

    _maybe_terminate_clusters(
        dry_run=options.dry_run,
        max_mins_idle=max_mins_idle,
        unpooled_only=options.unpooled_only,
        now=_boto3_now(),
        pool_name=options.pool_name,
        pooled_only=options.pooled_only,
        max_mins_locked=options.max_mins_locked,
        quiet=options.quiet,
        **_runner_kwargs(options)
    )
Example 3
def main(args=None):
    now = _boto3_now()

    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()
    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    if not options.exclude:
        filtered_cluster_summaries = cluster_summaries
    else:
        filtered_cluster_summaries = _filter_clusters(
            cluster_summaries, emr_client, options.exclude)

    job_info = _find_long_running_jobs(
        emr_client, filtered_cluster_summaries, min_time, now=now)

    _print_report(job_info)
Example 4
def main(cl_args=None):
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # max_hours_idle -> max_mins_idle
    max_mins_idle = options.max_mins_idle
    if max_mins_idle is None and options.max_hours_idle is not None:
        log.warning('--max-hours-idle is deprecated and will be removed'
                    ' in v0.7.0. Please use --max-mins-idle instead.')
        max_mins_idle = options.max_hours_idle * 60

    if options.mins_to_end_of_hour is not None:
        log.warning('--mins-to-end-of-hour is deprecated as of v0.6.0'
                    ' and does nothing')

    _maybe_terminate_clusters(dry_run=options.dry_run,
                              max_mins_idle=max_mins_idle,
                              unpooled_only=options.unpooled_only,
                              now=_boto3_now(),
                              pool_name=options.pool_name,
                              pooled_only=options.pooled_only,
                              max_mins_locked=options.max_mins_locked,
                              quiet=options.quiet,
                              **_runner_kwargs(options))
Example 5
def main(args=None):
    now = _boto3_now()

    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()
    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    if not options.exclude:
        filtered_cluster_summaries = cluster_summaries
    else:
        filtered_cluster_summaries = _filter_clusters(
            cluster_summaries, emr_client, options.exclude)

    job_info = _find_long_running_jobs(
        emr_client, filtered_cluster_summaries, min_time, now=now)

    _print_report(job_info)
Example 6
    def upload_file(self, path, Config=None):
        if self.bucket_name not in self.meta.client.mock_s3_fs:
            # upload_file() is a higher-order operation, has fancy errors
            raise S3UploadFailedError(
                'Failed to upload %s to %s/%s: %s' % (
                    path, self.bucket_name, self.key,
                    str(_no_such_bucket_error('PutObject'))))

        mock_keys = self._mock_bucket_keys('PutObject')
        with open(path, 'rb') as f:
            mock_keys[self.key] = (f.read(), _boto3_now())
Example 7
    def upload_file(self, path, Config=None):
        if self.bucket_name not in self.meta.client.mock_s3_fs:
            # upload_file() is a higher-order operation, has fancy errors
            raise S3UploadFailedError(
                'Failed to upload %s to %s/%s: %s' %
                (path, self.bucket_name, self.key,
                 str(_no_such_bucket_error('PutObject'))))

        mock_keys = self._mock_bucket_keys('PutObject')
        with open(path, 'rb') as f:
            mock_keys[self.key] = (f.read(), _boto3_now())
Example 8
def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Get relevant cluster information from EMR.

    :param float max_days_ago: If set, don't fetch clusters created more
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    emr_client = EMRJobRunner(**runner_kwargs).make_emr_client()

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - timedelta(days=max_days_ago)

    # use _DELAY to sleep 1 second after each API call (see #1091). Could
    # implement some sort of connection wrapper for this if it becomes more
    # generally useful.
    list_clusters_kwargs = dict(_delay=_DELAY)
    if created_after is not None:
        list_clusters_kwargs['CreatedAfter'] = created_after

    for cluster_summary in _boto3_paginate('Clusters', emr_client,
                                           'list_clusters',
                                           **list_clusters_kwargs):

        cluster_id = cluster_summary['Id']

        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        sleep(_DELAY)

        cluster['Steps'] = list(
            reversed(
                list(
                    _boto3_paginate('Steps',
                                    emr_client,
                                    'list_steps',
                                    ClusterId=cluster_id,
                                    _delay=_DELAY))))

        cluster['BootstrapActions'] = list(
            _boto3_paginate('BootstrapActions',
                            emr_client,
                            'list_bootstrap_actions',
                            ClusterId=cluster_id,
                            _delay=_DELAY))

        yield cluster
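
A hypothetical consumption sketch for the generator above; the import path is an assumption based on where mrjob's EMR audit tooling typically lives, and real use requires AWS credentials:

from mrjob.tools.emr.audit_usage import _yield_clusters  # assumed path

for cluster in _yield_clusters(max_days_ago=30):
    print(cluster['Id'], cluster['Status']['State'], len(cluster['Steps']))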
Example 9
    def put(self, Body):
        if not isinstance(Body, bytes):
            raise NotImplementedError('mock put() only supports bytes')

        mock_keys = self._mock_bucket_keys('PutObject')

        # Body is guaranteed to be bytes by the guard above; the read()
        # branch is unreachable until the mock accepts file-like objects
        if isinstance(Body, bytes):
            data = Body
        elif hasattr(Body, 'read'):
            data = Body.read()

        if not isinstance(data, bytes):
            raise TypeError('Body or Body.read() must be bytes')

        mock_keys[self.key] = (data, _boto3_now())
Example 11
def main(cl_args=None):
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    _maybe_terminate_clusters(dry_run=options.dry_run,
                              max_hours_idle=options.max_hours_idle,
                              mins_to_end_of_hour=options.mins_to_end_of_hour,
                              unpooled_only=options.unpooled_only,
                              now=_boto3_now(),
                              pool_name=options.pool_name,
                              pooled_only=options.pooled_only,
                              max_mins_locked=options.max_mins_locked,
                              quiet=options.quiet,
                              **_runner_kwargs(options))
Example 12
    def create_role(self, AssumeRolePolicyDocument, RoleName):
        # Path not supported
        # mock RoleIds are all the same

        self._check_role_does_not_exist(RoleName, 'CreateRole')

        role = dict(
            Arn=('arn:aws:iam::012345678901:role/%s' % RoleName),
            AssumeRolePolicyDocument=json.loads(AssumeRolePolicyDocument),
            CreateDate=_boto3_now(),
            Path='/',
            RoleId='AROAMOCKMOCKMOCKMOCK',
            RoleName=RoleName,
        )
        self.mock_iam_roles[RoleName] = role

        return dict(Role=role)
Example 13
def main(args=None):
    # parse command-line args
    arg_parser = _make_arg_parser()

    options = arg_parser.parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    now = _boto3_now()

    log.info('getting cluster history...')
    clusters = list(_yield_clusters(
        max_days_ago=options.max_days_ago, now=now, **_runner_kwargs(options)))

    log.info('compiling cluster stats...')
    stats = _clusters_to_stats(clusters, now=now)

    _print_report(stats, now=now)
Example 14
def _s3_cleanup(glob_path, time_old, dry_run=False, **runner_kwargs):
    """Delete all files older than *time_old* in *path*.

    If *dry_run* is true, then just log the files that need to be
    deleted without actually deleting them
    """
    runner = EMRJobRunner(**runner_kwargs)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path, key in runner.fs._ls(glob_path):
        age = _boto3_now() - key.last_modified
        if age > time_old:
            # Delete it
            log.info('Deleting %s; is %s old' % (path, age))
            if not dry_run:
                key.delete()
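
For reference, a hedged invocation sketch based only on the signature above: point the cleanup at a glob of S3 paths with a timedelta cutoff, using dry_run=True to preview deletions (runner kwargs omitted; real use needs AWS credentials, and the bucket name here is made up):

from datetime import timedelta

# preview which files under the glob are older than 30 days
_s3_cleanup('s3://my-bucket/tmp/logs/*', timedelta(days=30), dry_run=True)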
Example 15
def _s3_cleanup(glob_path, time_old, dry_run=False, **runner_kwargs):
    """Delete all files older than *time_old* in *path*.

    If *dry_run* is true, then just log the files that need to be
    deleted without actually deleting them
    """
    runner = EMRJobRunner(**runner_kwargs)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path, key in runner.fs.s3._ls(glob_path):
        age = _boto3_now() - key.last_modified
        if age > time_old:
            # Delete it
            log.info('Deleting %s; is %s old' % (path, age))
            if not dry_run:
                key.delete()
Example 16
def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Get relevant cluster information from EMR.

    :param float max_days_ago: If set, don't fetch clusters created more
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    emr_client = EMRJobRunner(**runner_kwargs).make_emr_client()

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - timedelta(days=max_days_ago)

    # use _DELAY to sleep 1 second after each API call (see #1091). Could
    # implement some sort of connection wrapper for this if it becomes more
    # generally useful.
    list_clusters_kwargs = dict(_delay=_DELAY)
    if created_after is not None:
        list_clusters_kwargs['CreatedAfter'] = created_after

    for cluster_summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters', **list_clusters_kwargs):

        cluster_id = cluster_summary['Id']

        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        sleep(_DELAY)

        cluster['Steps'] = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id, _delay=_DELAY))))

        cluster['BootstrapActions'] = list(_boto3_paginate(
            'BootstrapActions', emr_client, 'list_bootstrap_actions',
            ClusterId=cluster_id, _delay=_DELAY))

        yield cluster
Example 17
def main(args=None):
    # parse command-line args
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    now = _boto3_now()

    log.info('getting cluster history...')
    clusters = list(
        _yield_clusters(max_days_ago=options.max_days_ago,
                        now=now,
                        **_runner_kwargs(options)))

    log.info('compiling cluster stats...')
    stats = _clusters_to_stats(clusters, now=now)

    _print_report(stats, now=now)
Example 18
    def create_instance_profile(self, InstanceProfileName):
        # Path not implemented
        # mock InstanceProfileIds are all the same

        self._check_instance_profile_does_not_exist(InstanceProfileName,
                                                    'CreateInstanceProfile')

        profile = dict(
            Arn=('arn:aws:iam::012345678901:instance-profile/%s' %
                 InstanceProfileName),
            CreateDate=_boto3_now(),
            InstanceProfileId='AIPAMOCKMOCKMOCKMOCK',
            InstanceProfileName=InstanceProfileName,
            Path='/',
            Roles=[],
        )
        self.mock_iam_instance_profiles[InstanceProfileName] = profile

        return dict(InstanceProfile=profile)
Example 19
def _est_time_to_hour(cluster_summary, now=None):
    """How long before job reaches the end of the next full hour since it
    began. This is important for billing purposes.

    If it happens to be exactly a whole number of hours, we return
    one hour, not zero.
    """
    if now is None:
        now = _boto3_now()

    timeline = cluster_summary.get('Status', {}).get('Timeline', {})

    creationdatetime = timeline.get('CreationDateTime')

    if not creationdatetime:
        # do something reasonable if creationdatetime isn't set
        return timedelta(minutes=60)

    run_time = now - creationdatetime
    return timedelta(seconds=((-run_time).seconds % 3600.0 or 3600.0))
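
The (-run_time).seconds % 3600 expression is subtle: negating a timedelta normalizes it to a negative number of days plus a non-negative seconds component, so .seconds effectively counts the time remaining until the next whole hour. A standalone worked example, using only the standard library:

from datetime import timedelta

run_time = timedelta(hours=1, minutes=50)
# -run_time normalizes to timedelta(days=-1, seconds=79800), and
# 79800 % 3600 == 600, i.e. ten minutes left in the second hour
print(timedelta(seconds=((-run_time).seconds % 3600.0 or 3600.0)))
# -> 0:10:00

run_time = timedelta(hours=2)
# exactly on the hour: the modulo yields 0, and `or 3600.0` turns that
# into a full hour, matching the docstring
print(timedelta(seconds=((-run_time).seconds % 3600.0 or 3600.0)))
# -> 1:00:00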
Example 20
def add_mock_s3_data(mock_s3_fs, data, age=None, location=None):
    """Update *mock_s3_fs* with a map from bucket name to key name to data.

    :param age: a timedelta
    :param string location: the bucket's location constraint (a region name)
    """
    age = age or timedelta(0)
    time_modified = _boto3_now() - age

    for bucket_name, key_name_to_bytes in data.items():
        bucket = mock_s3_fs.setdefault(bucket_name,
                                       {'keys': {}, 'location': ''})

        for key_name, key_data in key_name_to_bytes.items():
            if not isinstance(key_data, bytes):
                raise TypeError('mock s3 data must be bytes')
            bucket['keys'][key_name] = (key_data, time_modified)

        if location is not None:
            bucket['location'] = location
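
A small usage sketch for the version above (the bucket and key names are made up): it populates an empty dict in place, storing each key as a (bytes, time_modified) tuple:

from datetime import timedelta

mock_s3_fs = {}
add_mock_s3_data(mock_s3_fs,
                 {'walrus': {'data/foo': b'foo\n'}},
                 age=timedelta(minutes=5),
                 location='us-west-2')
# mock_s3_fs['walrus']['keys']['data/foo'] is now
# (b'foo\n', <UTC datetime five minutes in the past>)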
Example 21
def add_mock_s3_data(mock_s3_fs, data, age=None, location=None):
    """Update *mock_s3_fs* with a map from bucket name to key name to data.

    :param age: a timedelta
    :param string location: the bucket's location constraint (a region name)
    """
    age = age or timedelta(0)
    time_modified = _boto3_now() - age

    for bucket_name, key_name_to_bytes in data.items():
        bucket = mock_s3_fs.setdefault(
            bucket_name,
            dict(creation_date=_boto3_today(), keys={}, location=''))

        for key_name, key_data in key_name_to_bytes.items():
            if not isinstance(key_data, bytes):
                raise TypeError('mock s3 data must be bytes')
            bucket['keys'][key_name] = (key_data, time_modified)

        if location is not None:
            bucket['location'] = location
Example 22
File: s3.py Project: Yelp/mrjob
    def upload_file(self, path, Config=None):
        if self.bucket_name not in self.meta.client.mock_s3_fs:
            # upload_file() is a higher-order operation, has fancy errors
            raise S3UploadFailedError(
                'Failed to upload %s to %s/%s: %s' % (
                    path, self.bucket_name, self.key,
                    str(_no_such_bucket_error('PutObject'))))

        # verify that config doesn't have empty part size (see #2033)
        #
        # config is a boto3.s3.transfer.TransferConfig (we don't mock it),
        # which is actually part of s3transfer. Very old versions of s3transfer
        # (e.g. 0.10.0) disallow initializing TransferConfig with part sizes
        # that are zero or None
        if Config and not (Config.multipart_chunksize and
                           Config.multipart_threshold):
            raise TypeError('part size may not be 0 or None')

        mock_keys = self._mock_bucket_keys('PutObject')
        with open(path, 'rb') as f:
            mock_keys[self.key] = dict(
                body=f.read(), time_modified=_boto3_now())
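
boto3's real TransferConfig (from boto3.s3.transfer) does accept multipart_threshold and multipart_chunksize keyword arguments, so a caller exercising the check above might look like this sketch (the bucket object itself is hypothetical):

from boto3.s3.transfer import TransferConfig

config = TransferConfig(multipart_threshold=8 * 1024 * 1024,
                        multipart_chunksize=8 * 1024 * 1024)
# `bucket` stands in for a mock Bucket wired up as in the class above
bucket.upload_file('/tmp/some-local-file', Config=config)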
Example 23
    def upload_file(self, path, Config=None):
        if self.bucket_name not in self.meta.client.mock_s3_fs:
            # upload_file() is a higher-order operation, has fancy errors
            raise S3UploadFailedError(
                'Failed to upload %s to %s/%s: %s' %
                (path, self.bucket_name, self.key,
                 str(_no_such_bucket_error('PutObject'))))

        # verify that config doesn't have empty part size (see #2033)
        #
        # config is a boto3.s3.transfer.TransferConfig (we don't mock it),
        # which is actually part of s3transfer. Very old versions of s3transfer
        # (e.g. 0.10.0) disallow initializing TransferConfig with part sizes
        # that are zero or None
        if Config and not (Config.multipart_chunksize
                           and Config.multipart_threshold):
            raise TypeError('part size may not be 0 or None')

        mock_keys = self._mock_bucket_keys('PutObject')
        with open(path, 'rb') as f:
            mock_keys[self.key] = dict(body=f.read(),
                                       time_modified=_boto3_now())
Example 24
def add_mock_s3_data(mock_s3_fs,
                     data,
                     age=None,
                     location=None,
                     storage_class=None,
                     restore=None):
    """Update *mock_s3_fs* with a map from bucket name to key name to data.

    :param age: a timedelta
    :param string location: the bucket's location constraint (a region name)
    :param string storage_class: storage class for all data added
    :param restore: x-amz-restore header (see
                    https://docs.aws.amazon.com/AmazonS3/latest/API/\
                    RESTObjectHEAD.html#RESTObjectHEAD-responses)
    """
    age = age or timedelta(0)
    time_modified = _boto3_now() - age

    for bucket_name, key_name_to_bytes in data.items():
        bucket = mock_s3_fs.setdefault(
            bucket_name,
            dict(creation_date=_boto3_today(), keys={}, location=''))

        for key_name, key_data in key_name_to_bytes.items():
            if not isinstance(key_data, bytes):
                raise TypeError('mock s3 data must be bytes')

            mock_key = dict(body=key_data, time_modified=time_modified)

            if storage_class:
                mock_key['storage_class'] = storage_class
            if restore:
                mock_key['restore'] = restore

            bucket['keys'][key_name] = mock_key

        if location is not None:
            bucket['location'] = location
Example 25
File: s3.py Project: Yelp/mrjob
def add_mock_s3_data(mock_s3_fs, data,
                     age=None, location=None,
                     storage_class=None,
                     restore=None):
    """Update *mock_s3_fs* with a map from bucket name to key name to data.

    :param age: a timedelta
    :param string location: the bucket's location constraint (a region name)
    :param string storage_class: storage class for all data added
    :param restore: x-amz-restore header (see
                    https://docs.aws.amazon.com/AmazonS3/latest/API/\
                    RESTObjectHEAD.html#RESTObjectHEAD-responses)
    """
    age = age or timedelta(0)
    time_modified = _boto3_now() - age

    for bucket_name, key_name_to_bytes in data.items():
        bucket = mock_s3_fs.setdefault(
            bucket_name,
            dict(creation_date=_boto3_today(), keys={}, location=''))

        for key_name, key_data in key_name_to_bytes.items():
            if not isinstance(key_data, bytes):
                raise TypeError('mock s3 data must be bytes')

            mock_key = dict(
                body=key_data, time_modified=time_modified)

            if storage_class:
                mock_key['storage_class'] = storage_class
            if restore:
                mock_key['restore'] = restore

            bucket['keys'][key_name] = mock_key

        if location is not None:
            bucket['location'] = location
Example 26
    def add_mock_ec2_image(self, image):
        """Add information about a mock EC2 Image (AMI) to be returned by
        mock :py:meth:`~tests.mock_boto3.ec2.MockEC2Client.describe_images`.

        This will automatically fill `CreationDate`. Other fields you
        might want to fill include:

        * ``Architecture`` (e.g. ``'i386'``, ``'x86_64'``)
        * ``BlockDeviceMappings`` (e.g. ``[{'DeviceName': '/dev/sda1'}]``)
        * ``ImageOwnerAlias`` (e.g. ``'amazon'``, ``'aws-marketplace'``)
        * ``Name`` (e.g. ``amzn-ami-hvm-2017.09.1.20171120-x86_64-s3``)
        * ``RootDeviceType`` (e.g. ``'ebs'``, ``'instance-store'``)
        * ``VirtualizationType`` (e.g. ``'hvm'``, ``'paravirtual'``)
        """
        image = dict(image)

        # TODO: will eventually need to add a mock user ID to support
        # filtering by owner == 'self'

        if not image.get('CreationDate'):
            image['CreationDate'] = _boto3_now().strftime(
                '%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'

        self.mock_ec2_images.append(image)
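
The strftime-and-truncate dance above mimics the millisecond-precision ISO 8601 timestamps EC2 returns for CreationDate: trimming the last three digits of %f converts microseconds to milliseconds. A standalone check:

from datetime import datetime, timezone

dt = datetime(2018, 1, 31, 23, 59, 59, 999999, tzinfo=timezone.utc)
print(dt.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z')
# -> 2018-01-31T23:59:59.999Z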
Example 27
def _cluster_to_basic_summary(cluster, now=None):
    """Extract fields such as creation time, owner, etc. from the cluster.

    :param cluster: a :py:mod:`boto3` cluster data structure
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with the following keys. These will be ``None`` if the
    corresponding field in the cluster is unavailable.

    * *created*: UTC `datetime.datetime` that the cluster was created,
      or ``None``
    * *end*: UTC `datetime.datetime` that the cluster finished, or ``None``
    * *id*: cluster ID, or ``None`` (this should never happen)
    * *label*: The label for the cluster (usually the module name of the
      :py:class:`~mrjob.job.MRJob` script that started it), or
      ``None`` for non-:py:mod:`mrjob` clusters.
    * *name*: cluster name, or ``None`` (this should never happen)
    * *nih*: number of normalized instance hours the cluster *would* use if
      it ran to the end of the next full hour.
    * *num_steps*: Number of steps in the cluster.
    * *owner*: The owner for the cluster (usually the user that started it),
      or ``None`` for non-:py:mod:`mrjob` clusters.
    * *pool*: pool name (e.g. ``'default'``) if the cluster is pooled,
      otherwise ``None``.
    * *ran*: How long the cluster ran, or has been running, as a
      :py:class:`datetime.timedelta`. This will be ``timedelta(0)`` if
      the cluster hasn't started.
    * *ready*: UTC `datetime.datetime` that the cluster finished
      bootstrapping, or ``None``
    * *state*: The cluster's state as a string (e.g. ``'RUNNING'``)
    """
    if now is None:
        now = _boto3_now()

    bcs = {}  # basic cluster summary to fill in

    bcs['id'] = cluster['Id']
    bcs['name'] = cluster['Name']

    Status = cluster['Status']
    Timeline = Status.get('Timeline', {})

    bcs['created'] = Timeline.get('CreationDateTime')
    bcs['ready'] = Timeline.get('ReadyDateTime')
    bcs['end'] = Timeline.get('EndDateTime')

    if bcs['created']:
        bcs['ran'] = (bcs['end'] or now) - bcs['created']
    else:
        bcs['ran'] = timedelta(0)

    bcs['state'] = Status.get('State')

    bcs['num_steps'] = len(cluster['Steps'])

    _, bcs['pool'] = _pool_hash_and_name(cluster)
    if not bcs['pool']:
        _, bcs['pool'] = _legacy_pool_hash_and_name(
            cluster['BootstrapActions'])

    m = _JOB_KEY_RE.match(bcs['name'] or '')
    if m:
        bcs['label'], bcs['owner'] = m.group(1), m.group(2)
    else:
        bcs['label'], bcs['owner'] = None, None

    bcs['nih'] = float(cluster.get('NormalizedInstanceHours', 0))

    return bcs
Example 28
def _find_long_running_jobs(emr_client, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param cluster_summaries: a list of :py:mod:`boto3` cluster summary
                              data structures
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running or
                     pending longer than this
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending longer than *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if there
               is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
              :py:class:`datetime.timedelta`
    """
    if now is None:
        now = _boto3_now()

    for cs in cluster_summaries:

        # special case for jobs that are taking a long time to bootstrap
        if cs['Status']['State'] in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created = cs['Status']['Timeline']['CreationDateTime']

            time_running = now - created

            if time_running >= min_time:
                yield({'cluster_id': cs['Id'],
                       'name': cs['Name'],
                       'state': cs['Status']['State'],
                       'time': time_running})

        # the default case: running clusters
        if cs['Status']['State'] != 'RUNNING':
            continue

        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cs['Id']))))

        running_steps = [
            step for step in steps if step['Status']['State'] == 'RUNNING']
        pending_steps = [
            step for step in steps if step['Status']['State'] == 'PENDING']

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:

                start = step['Status']['Timeline']['StartDateTime']

                time_running = now - start

                if time_running >= min_time:
                    yield({'cluster_id': cs['Id'],
                           'name': step['Name'],
                           'state': step['Status']['State'],
                           'time': time_running})

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            step = pending_steps[0]

            # PENDING job should have run starting when the cluster
            # became ready, or the previous step completed
            start = cs['Status']['Timeline']['ReadyDateTime']
            for step in steps:
                if step['Status']['State'] == 'COMPLETED':
                    start = step['Status']['Timeline']['EndDateTime']

            time_pending = now - start

            if time_pending >= min_time:
                yield({'cluster_id': cs['Id'],
                       'name': step['Name'],
                       'state': step['Status']['State'],
                       'time': time_pending})
Example 29
def _boto3_today():
    now = _boto3_now()
    return datetime(now.year, now.month, now.day, tzinfo=now.tzinfo)
Example 30
    def create_fake_clusters(self):
        self.now = _boto3_now().replace(microsecond=0)
        self.add_mock_s3_data({'my_bucket': {}})

        # create a timestamp the given number of *hours*, *minutes*, etc.
        # in the past
        def ago(**kwargs):
            return self.now - timedelta(**kwargs)

        # Build a step object easily
        # also make it respond to .args()
        def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
                 args=self._DEFAULT_STEP_ARGS,
                 state='COMPLETED',
                 created=None,
                 started=None,
                 ended=None,
                 name='Streaming Step',
                 action_on_failure='TERMINATE_CLUSTER',
                 **kwargs):

            timeline = dict()
            if created:
                timeline['CreationDateTime'] = created
            if started:
                timeline['StartDateTime'] = started
            if ended:
                timeline['EndDateTime'] = ended

            return dict(
                Config=dict(
                    ActionOnFailure=action_on_failure,
                    Args=args,
                    Jar=jar,
                ),
                Status=dict(
                    State=state,
                    Timeline=timeline,
                ),
            )

        # empty job
        self.add_mock_emr_cluster(
            dict(
                Id='j-EMPTY',
                Status=dict(
                    State='STARTING',
                    Timeline=dict(CreationDateTime=ago(hours=10)),
                ),
            ))

        # job that's bootstrapping
        self.add_mock_emr_cluster(
            dict(
                Id='j-BOOTSTRAPPING',
                Status=dict(
                    State='BOOTSTRAPPING',
                    Timeline=dict(CreationDateTime=ago(hours=10)),
                ),
                _Steps=[step(created=ago(hours=10), state='PENDING')],
            ))

        # currently running job
        self.add_mock_emr_cluster(
            dict(Id='j-CURRENTLY_RUNNING',
                 Status=dict(State='RUNNING',
                             Timeline=dict(CreationDateTime=ago(hours=4,
                                                                minutes=15),
                                           ReadyDateTime=ago(hours=4,
                                                             minutes=10))),
                 _Steps=[step(started=ago(hours=4), state='RUNNING')]))

        # finished cluster
        self.add_mock_emr_cluster(
            dict(
                Id='j-DONE',
                Status=dict(
                    State='TERMINATED',
                    Timeline=dict(
                        CreationDateTime=ago(hours=10),
                        ReadyDateTime=ago(hours=8),
                        EndDateTime=ago(hours=5),
                    ),
                ),
                _Steps=[step(started=ago(hours=8), ended=ago(hours=6))],
            ))

        # idle cluster
        self.add_mock_emr_cluster(
            dict(
                Id='j-DONE_AND_IDLE',
                Status=dict(
                    State='WAITING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=6),
                        ReadyDateTime=ago(hours=5, minutes=5),
                    ),
                ),
                _Steps=[step(started=ago(hours=4), ended=ago(hours=2))],
            ))

        # idle cluster with 4.x step format. should still be
        # recognizable as a streaming step
        self.add_mock_emr_cluster(
            dict(
                Id='j-DONE_AND_IDLE_4_X',
                Status=dict(
                    State='WAITING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=6),
                        ReadyDateTime=ago(hours=5, minutes=5),
                    ),
                ),
                _Steps=[
                    step(started=ago(hours=4),
                         ended=ago(hours=2),
                         jar='command-runner.jar',
                         args=['hadoop-streaming'] + self._DEFAULT_STEP_ARGS)
                ],
            ))

        # idle cluster with an active lock
        self.add_mock_emr_cluster(
            dict(
                Id='j-IDLE_AND_LOCKED',
                Status=dict(
                    State='WAITING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=6),
                        ReadyDateTime=ago(hours=5, minutes=5),
                    ),
                ),
                _Steps=[step(started=ago(hours=4), ended=ago(hours=2))],
            ))
        self.add_mock_s3_data({
            'my_bucket': {
                'locks/j-IDLE_AND_LOCKED/2': b'not_you',
            },
        })

        # idle cluster with an expired lock
        self.add_mock_emr_cluster(
            dict(
                Id='j-IDLE_AND_EXPIRED',
                Status=dict(
                    State='WAITING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=6),
                        ReadyDateTime=ago(hours=5, minutes=5),
                    ),
                ),
                _Steps=[step(started=ago(hours=4), ended=ago(hours=2))],
            ))
        self.add_mock_s3_data(
            {
                'my_bucket': {
                    'locks/j-IDLE_AND_EXPIRED/2': b'not_you',
                },
            },
            age=timedelta(minutes=5))

        # idle cluster whose steps never finished
        self.add_mock_emr_cluster(
            dict(
                Id='j-IDLE_BUT_INCOMPLETE_STEPS',
                Status=dict(
                    State='WAITING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=6),
                        ReadyDateTime=ago(hours=5, minutes=5),
                    ),
                ),
                _Steps=[step(started=ago(hours=4), ended=None)],
            ))

        # custom hadoop streaming jar
        self.add_mock_emr_cluster(
            dict(
                Id='j-CUSTOM_DONE_AND_IDLE',
                Status=dict(
                    State='WAITING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=6),
                        ReadyDateTime=ago(hours=5, minutes=5),
                    ),
                ),
                _Steps=[
                    step(
                        started=ago(hours=4),
                        ended=ago(hours=4),
                        jar=('s3://my_bucket/tmp/somejob/files/'
                             'oddjob-0.0.3-SNAPSHOT-standalone.jar'),
                        args=[],
                    )
                ],
            ))

        # hadoop debugging without any other steps
        self.add_mock_emr_cluster(
            dict(
                Id='j-DEBUG_ONLY',
                Status=dict(
                    State='WAITING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=3),
                        ReadyDateTime=ago(hours=2, minutes=55),
                    ),
                ),
                _Steps=[
                    step(jar='command-runner.jar',
                         name='Setup Hadoop Debugging',
                         args=['state-pusher-script'],
                         started=ago(hours=3),
                         ended=ago(hours=2))
                ],
            ))

        # hadoop debugging + actual job
        self.add_mock_emr_cluster(
            dict(
                Id='j-HADOOP_DEBUGGING',
                Status=dict(
                    State='WAITING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=6),
                        ReadyDateTime=ago(hours=5, minutes=55),
                    ),
                ),
                _Steps=[
                    step(jar='command-runner.jar',
                         name='Setup Hadoop Debugging',
                         args=['state-pusher-script'],
                         started=ago(hours=5),
                         ended=ago(hours=4)),
                    step(started=ago(hours=4), ended=ago(hours=2)),
                ],
            ))

        # should skip cancelled steps
        self.add_mock_emr_cluster(
            dict(
                Id='j-IDLE_AND_FAILED',
                Status=dict(
                    State='WAITING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=6),
                        ReadyDateTime=ago(hours=5, minutes=5),
                    ),
                ),
                _Steps=[
                    step(started=ago(hours=4),
                         ended=ago(hours=3),
                         state='FAILED'),
                    step(state='CANCELLED'),
                ],
            ))

        # pooled cluster reaching end of full hour
        self.add_mock_emr_cluster(
            dict(
                _BootstrapActions=[
                    dict(Args=[], Name='action 0'),
                    dict(
                        Args=[
                            'pool-0123456789abcdef0123456789abcdef',
                            'reflecting'
                        ],
                        Name='master',
                    ),
                ],
                Id='j-POOLED',
                Status=dict(
                    State='WAITING',
                    Timeline=dict(
                        CreationDateTime=ago(minutes=55),
                        ReadyDateTime=ago(minutes=50),
                    ),
                ),
            ))

        # cluster that has had pending jobs but hasn't run them
        self.add_mock_emr_cluster(
            dict(
                Id='j-PENDING_BUT_IDLE',
                Status=dict(
                    State='RUNNING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=3),
                        ReadyDateTime=ago(hours=2, minutes=50),
                    ),
                ),
                _Steps=[step(created=ago(hours=3), state='PENDING')],
            ))
Example 31
def _print_report(stats, now=None):
    """Print final report.

    :param stats: a dictionary returned by :py:func:`_clusters_to_stats`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    if now is None:
        now = _boto3_now()

    s = stats

    if not s['clusters']:
        print('No clusters created in the past two months!')
        return

    print('Total  # of Clusters: %d' % len(s['clusters']))
    print()

    print('* All times are in UTC.')
    print()

    print('Min create time: %s' % min(cs['created'] for cs in s['clusters']))
    print('Max create time: %s' % max(cs['created'] for cs in s['clusters']))
    print('   Current time: %s' % now.replace(microsecond=0))
    print()

    print('* All usage is measured in Normalized Instance Hours, which are')
    print('  roughly equivalent to running an m1.medium instance for an hour.')
    print("  Billing is estimated, and may not match Amazon's system exactly.")
    print()

    # total compute-unit hours used
    def with_pct(usage):
        return (usage, _percent(usage, s['nih_billed']))

    print('Total billed:  %9.2f  %5.1f%%' % with_pct(s['nih_billed']))
    print('  Total used:  %9.2f  %5.1f%%' % with_pct(s['nih_used']))
    print('    bootstrap: %9.2f  %5.1f%%' % with_pct(s['bootstrap_nih_used']))
    print('    jobs:      %9.2f  %5.1f%%' % with_pct(s['job_nih_used']))
    print('  Total waste: %9.2f  %5.1f%%' % with_pct(s['nih_bbnu']))
    print('    at end:    %9.2f  %5.1f%%' % with_pct(s['end_nih_bbnu']))
    print('    other:     %9.2f  %5.1f%%' % with_pct(s['other_nih_bbnu']))
    print()

    if s['date_to_nih_billed']:
        print('Daily statistics:')
        print()
        print(' date          billed      used     waste   % waste')
        d = max(s['date_to_nih_billed'])
        while d >= min(s['date_to_nih_billed']):
            print(' %10s %9.2f %9.2f %9.2f     %5.1f' % (
                d,
                s['date_to_nih_billed'].get(d, 0.0),
                s['date_to_nih_used'].get(d, 0.0),
                s['date_to_nih_bbnu'].get(d, 0.0),
                _percent(s['date_to_nih_bbnu'].get(d, 0.0),
                         s['date_to_nih_billed'].get(d, 0.0))))
            d -= timedelta(days=1)
        print()

    if s['hour_to_nih_billed']:
        print('Hourly statistics:')
        print()
        print(' hour              billed      used     waste   % waste')
        h = max(s['hour_to_nih_billed'])
        while h >= min(s['hour_to_nih_billed']):
            print(' %13s  %9.2f %9.2f %9.2f     %5.1f' % (
                h.strftime('%Y-%m-%d %H'),
                s['hour_to_nih_billed'].get(h, 0.0),
                s['hour_to_nih_used'].get(h, 0.0),
                s['hour_to_nih_bbnu'].get(h, 0.0),
                _percent(s['hour_to_nih_bbnu'].get(h, 0.0),
                         s['hour_to_nih_billed'].get(h, 0.0))))
            h -= timedelta(hours=1)
        print()

    print('* clusters are considered to belong to the user and job that')
    print('  started them or last ran on them.')
    print()

    # Top jobs
    print('Top jobs, by total time used:')
    for label, nih_used in sorted(s['label_to_nih_used'].items(),
                                  key=lambda lb_nih: (-lb_nih[1], lb_nih[0])):
        print('  %9.2f %s' % (nih_used, label))
    print()

    print('Top jobs, by time billed but not used:')
    for label, nih_bbnu in sorted(
            s['label_to_nih_bbnu'].items(),
            key=lambda lb_nih1: (-lb_nih1[1], lb_nih1[0])):
        print('  %9.2f %s' % (nih_bbnu, label))
    print()

    # Top users
    print('Top users, by total time used:')
    for owner, nih_used in sorted(s['owner_to_nih_used'].items(),
                                  key=lambda o_nih: (-o_nih[1], o_nih[0])):
        print('  %9.2f %s' % (nih_used, owner))
    print()

    print('Top users, by time billed but not used:')
    for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].items(),
                                  key=lambda o_nih2: (-o_nih2[1], o_nih2[0])):
        print('  %9.2f %s' % (nih_bbnu, owner))
    print()

    # Top job steps
    print('Top job steps, by total time used (step number first):')
    for (label, step_num), nih_used in sorted(
            s['job_step_to_nih_used'].items(),
            key=lambda k_nih: (-k_nih[1], k_nih[0])):

        if label:
            print('  %9.2f %3d %s' % (nih_used, step_num, label))
        else:
            print('  %9.2f     (non-mrjob step)' % (nih_used, ))
    print()

    print('Top job steps, by total time billed but not used (un-pooled only):')
    for (label, step_num), nih_bbnu in sorted(
            s['job_step_to_nih_bbnu_no_pool'].items(),
            key=lambda k_nih3: (-k_nih3[1], k_nih3[0])):

        if label:
            print('  %9.2f %3d %s' % (nih_bbnu, step_num, label))
        else:
            print('  %9.2f     (non-mrjob step)' % (nih_bbnu, ))
    print()

    # Top pools
    print('All pools, by total time billed:')
    for pool, nih_billed in sorted(s['pool_to_nih_billed'].items(),
                                   key=lambda p_nih: (-p_nih[1], p_nih[0])):
        print('  %9.2f %s' % (nih_billed, pool or '(not pooled)'))
    print()

    print('All pools, by total time billed but not used:')
    for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].items(),
                                 key=lambda p_nih4: (-p_nih4[1], p_nih4[0])):
        print('  %9.2f %s' % (nih_bbnu, pool or '(not pooled)'))
    print()

    # Top clusters
    print('All clusters, by total time billed:')
    top_clusters = sorted(s['clusters'],
                          key=lambda cs: (-cs['nih_billed'], cs['name']))
    for cs in top_clusters:
        print('  %9.2f %-15s %s' % (cs['nih_billed'], cs['id'], cs['name']))
    print()

    print('All clusters, by time billed but not used:')
    top_clusters_bbnu = sorted(s['clusters'],
                               key=lambda cs: (-cs['nih_bbnu'], cs['name']))
    for cs in top_clusters_bbnu:
        print('  %9.2f %-15s %s' % (cs['nih_bbnu'], cs['id'], cs['name']))
    print()

    # Details
    print('Details for all clusters:')
    print()
    print(' id              state                  created             steps'
          '        time ran     billed    waste   user   name')

    all_clusters = sorted(s['clusters'],
                          key=lambda cs: cs['created'],
                          reverse=True)

    for cs in all_clusters:
        print(' %-15s %-22s %19s %3d %17s %9.2f %9.2f %8s %s' %
              (cs['id'], cs['state'], cs['created'], cs['num_steps'],
               strip_microseconds(cs['ran']), cs['nih_used'], cs['nih_bbnu'],
               (cs['owner'] or ''), (cs['label'] or ('not started by mrjob'))))
Example 32
File: s3.py Project: Yelp/mrjob
def _boto3_today():
    now = _boto3_now()
    return datetime(now.year, now.month, now.day, tzinfo=now.tzinfo)
Example 33
def _maybe_terminate_clusters(dry_run=False,
                              max_mins_idle=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters'):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

        _, pool = _pool_hash_and_name(cluster)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug(
            'cluster %s %s for %s, %s (%s)' %
            (cluster_id,
             'pending' if is_pending else 'idle',
             strip_microseconds(time_idle),
             ('unpooled' if pool is None else 'in %s pool' % pool),
             cluster_summary['Name']))

        # filter out clusters that don't meet our criteria
        if (max_mins_idle is not None and
                time_idle <= timedelta(minutes=max_mins_idle)):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=cluster_summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            num_starting, num_bootstrapping, num_running,
            num_pending, num_idle, num_done))
Example 34
def _cluster_to_basic_summary(cluster, now=None):
    """Extract fields such as creation time, owner, etc. from the cluster.

    :param cluster: a :py:mod:`boto3` cluster data structure
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with the following keys. These will be ``None`` if the
    corresponding field in the cluster is unavailable.

    * *created*: UTC `datetime.datetime` that the cluster was created,
      or ``None``
    * *end*: UTC `datetime.datetime` that the cluster finished, or ``None``
    * *id*: cluster ID, or ``None`` (this should never happen)
    * *label*: The label for the cluster (usually the module name of the
      :py:class:`~mrjob.job.MRJob` script that started it), or
      ``None`` for non-:py:mod:`mrjob` clusters.
    * *name*: cluster name, or ``None`` (this should never happen)
    * *nih*: number of normalized instance hours used by the cluster.
    * *num_steps*: Number of steps in the cluster.
    * *owner*: The owner for the cluster (usually the user that started it),
      or ``None`` for non-:py:mod:`mrjob` clusters.
    * *pool*: pool name (e.g. ``'default'``) if the cluster is pooled,
      otherwise ``None``.
    * *ran*: How long the cluster ran, or has been running, as a
      :py:class:`datetime.timedelta`. This will be ``timedelta(0)`` if
      the cluster hasn't started.
    * *ready*: UTC `datetime.datetime` that the cluster finished
      bootstrapping, or ``None``
    * *state*: The cluster's state as a string (e.g. ``'RUNNING'``)
    """
    if now is None:
        now = _boto3_now()

    bcs = {}  # basic cluster summary to fill in

    bcs['id'] = cluster['Id']
    bcs['name'] = cluster['Name']

    Status = cluster['Status']
    Timeline = Status.get('Timeline', {})

    bcs['created'] = Timeline.get('CreationDateTime')
    bcs['ready'] = Timeline.get('ReadyDateTime')
    bcs['end'] = Timeline.get('EndDateTime')

    if bcs['created']:
        bcs['ran'] = (bcs['end'] or now) - bcs['created']
    else:
        bcs['ran'] = timedelta(0)

    bcs['state'] = Status.get('State')

    bcs['num_steps'] = len(cluster['Steps'])

    _, bcs['pool'] = _pool_hash_and_name(cluster)
    if not bcs['pool']:
        _, bcs['pool'] = _legacy_pool_hash_and_name(
            cluster['BootstrapActions'])

    m = _JOB_KEY_RE.match(bcs['name'] or '')
    if m:
        bcs['label'], bcs['owner'] = m.group(1), m.group(2)
    else:
        bcs['label'], bcs['owner'] = None, None

    bcs['nih'] = cluster.get('NormalizedInstanceHours', 0)

    return bcs
Example 35
def _print_report(stats, now=None):
    """Print final report.

    :param stats: a dictionary returned by :py:func:`_clusters_to_stats`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    if now is None:
        now = _boto3_now()

    s = stats

    if not s['clusters']:
        print('No clusters created in the past two months!')
        return

    print('Total  # of Clusters: %d' % len(s['clusters']))
    print()

    print('* All times are in UTC.')
    print()

    print('Min create time: %s' % min(cs['created'] for cs in s['clusters']))
    print('Max create time: %s' % max(cs['created'] for cs in s['clusters']))
    print('   Current time: %s' % now.replace(microsecond=0))
    print()

    print('* All usage is measured in Normalized Instance Hours, which are')
    print('  roughly equivalent to running an m1.medium instance for an hour.')
    print("  Billing is estimated, and may not match Amazon's system exactly.")
    print()

    # total compute-unit hours used
    def with_pct(usage):
        return (usage, _percent(usage, s['nih_billed']))

    print('Total billed:  %9.2f  %5.1f%%' % with_pct(s['nih_billed']))
    print('  Total used:  %9.2f  %5.1f%%' % with_pct(s['nih_used']))
    print('    bootstrap: %9.2f  %5.1f%%' % with_pct(s['bootstrap_nih_used']))
    print('    jobs:      %9.2f  %5.1f%%' % with_pct(s['job_nih_used']))
    print('  Total waste: %9.2f  %5.1f%%' % with_pct(s['nih_bbnu']))
    print('    at end:    %9.2f  %5.1f%%' % with_pct(s['end_nih_bbnu']))
    print('    other:     %9.2f  %5.1f%%' % with_pct(s['other_nih_bbnu']))
    print()

    if s['date_to_nih_billed']:
        print('Daily statistics:')
        print()
        print(' date          billed      used     waste   % waste')
        d = max(s['date_to_nih_billed'])
        while d >= min(s['date_to_nih_billed']):
            print(' %10s %9.2f %9.2f %9.2f     %5.1f' % (
                d,
                s['date_to_nih_billed'].get(d, 0.0),
                s['date_to_nih_used'].get(d, 0.0),
                s['date_to_nih_bbnu'].get(d, 0.0),
                _percent(s['date_to_nih_bbnu'].get(d, 0.0),
                         s['date_to_nih_billed'].get(d, 0.0))))
            d -= timedelta(days=1)
        print()

    if s['hour_to_nih_billed']:
        print('Hourly statistics:')
        print()
        print(' hour              billed      used     waste   % waste')
        h = max(s['hour_to_nih_billed'])
        while h >= min(s['hour_to_nih_billed']):
            print(' %13s  %9.2f %9.2f %9.2f     %5.1f' % (
                h.strftime('%Y-%m-%d %H'),
                s['hour_to_nih_billed'].get(h, 0.0),
                s['hour_to_nih_used'].get(h, 0.0),
                s['hour_to_nih_bbnu'].get(h, 0.0),
                _percent(s['hour_to_nih_bbnu'].get(h, 0.0),
                         s['hour_to_nih_billed'].get(h, 0.0))))
            h -= timedelta(hours=1)
        print()

    print('* clusters are considered to belong to the user and job that')
    print('  started them or last ran on them.')
    print()

    # Top jobs
    print('Top jobs, by total time used:')
    for label, nih_used in sorted(s['label_to_nih_used'].items(),
                                  key=lambda lb_nih: (-lb_nih[1], lb_nih[0])):
        print('  %9.2f %s' % (nih_used, label))
    print()

    print('Top jobs, by time billed but not used:')
    for label, nih_bbnu in sorted(
            s['label_to_nih_bbnu'].items(),
            key=lambda lb_nih1: (-lb_nih1[1], lb_nih1[0])):
        print('  %9.2f %s' % (nih_bbnu, label))
    print()

    # Top users
    print('Top users, by total time used:')
    for owner, nih_used in sorted(s['owner_to_nih_used'].items(),
                                  key=lambda o_nih: (-o_nih[1], o_nih[0])):
        print('  %9.2f %s' % (nih_used, owner))
    print()

    print('Top users, by time billed but not used:')
    for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].items(),
                                  key=lambda o_nih2: (-o_nih2[1], o_nih2[0])):
        print('  %9.2f %s' % (nih_bbnu, owner))
    print()

    # Top job steps
    print('Top job steps, by total time used (step number first):')
    for (label, step_num), nih_used in sorted(
            s['job_step_to_nih_used'].items(),
            key=lambda k_nih: (-k_nih[1], k_nih[0])):

        if label:
            print('  %9.2f %3d %s' % (nih_used, step_num, label))
        else:
            print('  %9.2f     (non-mrjob step)' % (nih_used,))
    print()

    print('Top job steps, by total time billed but not used (un-pooled only):')
    for (label, step_num), nih_bbnu in sorted(
            s['job_step_to_nih_bbnu_no_pool'].items(),
            key=lambda k_nih3: (-k_nih3[1], k_nih3[0])):

        if label:
            print('  %9.2f %3d %s' % (nih_bbnu, step_num, label))
        else:
            print('  %9.2f     (non-mrjob step)' % (nih_bbnu,))
    print()

    # Top pools
    print('All pools, by total time billed:')
    for pool, nih_billed in sorted(s['pool_to_nih_billed'].items(),
                                   key=lambda p_nih: (-p_nih[1], p_nih[0])):
        print('  %9.2f %s' % (nih_billed, pool or '(not pooled)'))
    print()

    print('All pools, by total time billed but not used:')
    for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].items(),
                                 key=lambda p_nih4: (-p_nih4[1], p_nih4[0])):
        print('  %9.2f %s' % (nih_bbnu, pool or '(not pooled)'))
    print()

    # Top clusters
    print('All clusters, by total time billed:')
    top_clusters = sorted(s['clusters'],
                          key=lambda cs: (-cs['nih_billed'], cs['name']))
    for cs in top_clusters:
        print('  %9.2f %-15s %s' % (
            cs['nih_billed'], cs['id'], cs['name']))
    print()

    print('All clusters, by time billed but not used:')
    top_clusters_bbnu = sorted(
        s['clusters'], key=lambda cs: (-cs['nih_bbnu'], cs['name']))
    for cs in top_clusters_bbnu:
        print('  %9.2f %-15s %s' % (
            cs['nih_bbnu'], cs['id'], cs['name']))
    print()

    # Details
    print('Details for all clusters:')
    print()
    print(' id              state                  created             steps'
          '        time ran     billed    waste   user   name')

    all_clusters = sorted(s['clusters'], key=lambda cs: cs['created'],
                          reverse=True)

    for cs in all_clusters:
        print(' %-15s %-22s %19s %3d %17s %9.2f %9.2f %8s %s' % (
            cs['id'], cs['state'], cs['created'], cs['num_steps'],
            strip_microseconds(cs['ran']), cs['nih_used'], cs['nih_bbnu'],
            (cs['owner'] or ''), (cs['label'] or ('not started by mrjob'))))
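The report math above leans on a _percent helper that isn't included in this snippet. A minimal sketch of what it presumably does, guarding against a zero denominator (the signature here is an assumption):

def _percent(amount, total, default=0.0):
    # sketch: return *amount* as a percentage of *total*; if *total* is
    # zero (e.g. no NIH billed on a given day), fall back to *default*
    if not total:
        return default
    return 100.0 * amount / total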
Example no. 36
    def _simulate_progress(self, cluster_id, now=None):
        """Simulate progress on the given cluster. This is automatically
        run when we call :py:meth:`describe_step`, and, when the cluster is
        ``TERMINATING``, :py:meth:`describe_cluster`.

        :type cluster_id: str
        :param cluster_id: fake cluster ID
        :type now: :py:class:`datetime.datetime`
        :param now: alternate time to use as the current time (should be UTC)
        """
        # TODO: this doesn't actually update steps to CANCELLED when
        # cluster is shut down
        if now is None:
            now = _boto3_now()

        cluster = self.mock_emr_clusters[cluster_id]

        # allow clusters to get stuck
        if cluster.get('_DelayProgressSimulation', 0) > 0:
            cluster['_DelayProgressSimulation'] -= 1
            return

        # this code is pretty loose about updating StateChangeReason
        # (for the cluster, instance groups, and steps). Add this as needed.

        # if job is STARTING, move it along to BOOTSTRAPPING
        if cluster['Status']['State'] == 'STARTING':
            cluster['Status']['State'] = 'BOOTSTRAPPING'

            # master now has a hostname
            cluster['MasterPublicDnsName'] = 'master.%s.mock' % cluster['Id']

            # instances are now provisioned
            for ig in cluster['_InstanceGroups']:
                ig['RunningInstanceCount'] = ig['RequestedInstanceCount']
                ig['Status']['State'] = 'BOOTSTRAPPING'

            return

        # if job is TERMINATING, move along to terminated
        if cluster['Status']['State'] == 'TERMINATING':
            code = cluster['Status']['StateChangeReason'].get('Code')
            if code and code.endswith('_FAILURE'):
                cluster['Status']['State'] = 'TERMINATED_WITH_ERRORS'
            else:
                cluster['Status']['State'] = 'TERMINATED'

            return

        # if job is done, nothing to do
        if cluster['Status']['State'] in ('TERMINATED',
                                          'TERMINATED_WITH_ERRORS'):
            return

        # if job is BOOTSTRAPPING, move it along to RUNNING and continue
        if cluster['Status']['State'] == 'BOOTSTRAPPING':
            cluster['Status']['State'] = 'RUNNING'
            for ig in cluster['_InstanceGroups']:
                ig['Status']['State'] = 'RUNNING'

        # at this point, should be RUNNING or WAITING
        assert cluster['Status']['State'] in ('RUNNING', 'WAITING')

        # simulate self-termination
        if cluster_id in self.mock_emr_self_termination:
            cluster['Status']['State'] = 'TERMINATING'
            cluster['Status']['StateChangeReason'] = dict(
                Code='INSTANCE_FAILURE',
                Message='The master node was terminated. ',  # sic
            )

            for step in cluster['_Steps']:
                if step['Status']['State'] in ('PENDING', 'RUNNING'):
                    step['Status']['State'] = 'CANCELLED'  # not INTERRUPTED

            return

        # try to find the next step, and advance it

        for step_num, step in enumerate(cluster['_Steps']):
            # skip steps that are already done
            if step['Status']['State'] in ('COMPLETED', 'FAILED', 'CANCELLED',
                                           'INTERRUPTED'):
                continue

            # found currently running step! handle it, then exit

            # start PENDING step
            if step['Status']['State'] == 'PENDING':
                step['Status']['State'] = 'RUNNING'
                step['Status']['Timeline']['StartDateTime'] = now
                return

            assert step['Status']['State'] == 'RUNNING'

            # check if we're supposed to have an error
            if (cluster_id, step_num) in self.mock_emr_failures:
                step['Status']['State'] = 'FAILED'

                if step['ActionOnFailure'] in ('TERMINATE_CLUSTER',
                                               'TERMINATE_JOB_FLOW'):

                    cluster['Status']['State'] = 'TERMINATING'
                    cluster['Status']['StateChangeReason']['Code'] = (
                        'STEP_FAILURE')
                    cluster['Status']['StateChangeReason']['Message'] = (
                        'Shut down as step failed')

                    for step in cluster['_Steps']:
                        if step['Status']['State'] in ('PENDING', 'RUNNING'):
                            step['Status']['State'] = 'CANCELLED'

                return

            # complete step
            step['Status']['State'] = 'COMPLETED'
            step['Status']['Timeline']['EndDateTime'] = now

            # create fake output if we're supposed to write to S3
            output_uri = self._get_step_output_uri(step['Config']['Args'])
            if output_uri and is_s3_uri(output_uri):
                mock_output = self.mock_emr_output.get(
                    (cluster_id, step_num)) or [b'']

                bucket_name, key_name = parse_s3_uri(output_uri)

                # write output to S3
                for i, part in enumerate(mock_output):
                    add_mock_s3_data(
                        self.mock_s3_fs,
                        {bucket_name: {
                            key_name + 'part-%05d' % i: part
                        }})
            elif (cluster_id, step_num) in self.mock_emr_output:
                raise AssertionError(
                    "can't use output for cluster ID %s, step %d "
                    "(it doesn't output to S3)" % (cluster_id, step_num))

            # done!
            # if this is the last step, continue to autotermination code, below
            if step_num < len(cluster['_Steps']) - 1:
                return

        # no pending steps. should we wait, or shut down?
        if cluster['AutoTerminate']:
            cluster['Status']['State'] = 'TERMINATING'
            cluster['Status']['StateChangeReason']['Code'] = (
                'ALL_STEPS_COMPLETED')
            cluster['Status']['StateChangeReason']['Message'] = (
                'Steps Completed')
        else:
            # just wait
            cluster['Status']['State'] = 'WAITING'
            cluster['Status']['StateChangeReason'] = {}

        return
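The mock advances one state transition per call, so a test can step a cluster to completion with a loop. A hypothetical driver (client is an assumed mock-client instance, and the cluster must have AutoTerminate=True or it will park in WAITING forever; in practice describe_step and describe_cluster trigger this advancement implicitly):

cluster_id = 'j-MOCKCLUSTER0'  # illustrative ID

while True:
    state = client.mock_emr_clusters[cluster_id]['Status']['State']
    if state in ('TERMINATED', 'TERMINATED_WITH_ERRORS'):
        break
    client._simulate_progress(cluster_id)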
Example no. 37
def _find_long_running_jobs(emr_client, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param emr_client: a :py:mod:`boto3` EMR client
    :param cluster_summaries: a list of :py:mod:`boto3` cluster summary
                              data structures
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running or
                     pending longer than this
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending longer than *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if there
               is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
              :py:class:`datetime.timedelta`
    """
    if now is None:
        now = _boto3_now()

    for cs in cluster_summaries:

        # special case for jobs that are taking a long time to bootstrap
        if cs['Status']['State'] in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created = cs['Status']['Timeline']['CreationDateTime']

            time_running = now - created

            if time_running >= min_time:
                yield({'cluster_id': cs['Id'],
                       'name': cs['Name'],
                       'state': cs['Status']['State'],
                       'time': time_running})

        # the default case: running clusters
        if cs['Status']['State'] != 'RUNNING':
            continue

        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cs['Id']))))

        running_steps = [
            step for step in steps if step['Status']['State'] == 'RUNNING']
        pending_steps = [
            step for step in steps if step['Status']['State'] == 'PENDING']

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:

                start = step['Status']['Timeline']['StartDateTime']

                time_running = now - start

                if time_running >= min_time:
                    yield({'cluster_id': cs['Id'],
                           'name': step['Name'],
                           'state': step['Status']['State'],
                           'time': time_running})

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            step = pending_steps[0]

            # a PENDING step should have started running when the cluster
            # became ready, or when the previous step completed
            start = cs['Status']['Timeline']['ReadyDateTime']
            for step in steps:
                if step['Status']['State'] == 'COMPLETED':
                    start = step['Status']['Timeline']['EndDateTime']

            time_pending = now - start

            if time_pending >= min_time:
                yield({'cluster_id': cs['Id'],
                       'name': step['Name'],
                       'state': step['Status']['State'],
                       'time': time_pending})
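Since this is a generator, callers just iterate over it. A short usage sketch (emr_client and cluster_summaries are assumed to come from boto3, as in the surrounding code):

from datetime import timedelta

for info in _find_long_running_jobs(emr_client, cluster_summaries,
                                    timedelta(hours=4)):
    print('%s: %s is %s (for %s)' % (
        info['cluster_id'], info['name'], info['state'], info['time']))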
Example no. 38
    def run_job_flow(self, **kwargs):
        # going to pop params from kwargs as we process them, and raise
        # NotImplementedError at the end if any params are left
        now = kwargs.pop('_Now', _boto3_now())

        # our newly created cluster, as described by describe_cluster(), plus:
        #
        # _BootstrapActions: as described by list_bootstrap_actions()
        # _InstanceGroups: as described by list_instance_groups()
        # _Steps: as described by list_steps(), but not reversed
        #
        # TODO: at some point when we implement instance fleets,
        # _InstanceGroups will become optional
        cluster = dict(
            _BootstrapActions=[],
            _InstanceGroups=[],
            _Steps=[],
            Applications=[],
            AutoTerminate=True,
            Configurations=[],
            Ec2InstanceAttributes=dict(
                EmrManagedMasterSecurityGroup='sg-mockmaster',
                EmrManagedSlaveSecurityGroup='sg-mockslave',
                IamInstanceProfile='',
            ),
            Id='j-MOCKCLUSTER%d' % len(self.mock_emr_clusters),
            Name='',
            NormalizedInstanceHours=0,
            ScaleDownBehavior='TERMINATE_AT_TASK_COMPLETION',
            ServiceRole='',
            Status=dict(
                State='STARTING',
                StateChangeReason={},
                Timeline=dict(CreationDateTime=now),
            ),
            Tags=[],
            TerminationProtected=False,
            VisibleToAllUsers=False,
        )

        def _error(message):
            return _ValidationException('RunJobFlow', message)

        # Name (required)
        _validate_param(kwargs, 'Name', string_types)
        cluster['Name'] = kwargs.pop('Name')

        # LogUri
        if 'LogUri' in kwargs:
            _validate_param(kwargs, 'LogUri', string_types)
            cluster['LogUri'] = kwargs.pop('LogUri')

        # JobFlowRole and ServiceRole (required)
        _validate_param(kwargs, 'JobFlowRole', string_types)
        cluster['Ec2InstanceAttributes']['IamInstanceProfile'] = kwargs.pop(
            'JobFlowRole')

        if 'ServiceRole' not in kwargs:  # required by API, not boto3
            raise _error('ServiceRole is required for creating cluster.')
        _validate_param(kwargs, 'ServiceRole', string_types)
        cluster['ServiceRole'] = kwargs.pop('ServiceRole')

        # AmiVersion and ReleaseLabel
        for version_param in ('AmiVersion', 'ReleaseLabel'):
            if version_param in kwargs:
                _validate_param(kwargs, version_param, string_types)

        if 'AmiVersion' in kwargs:
            if 'ReleaseLabel' in kwargs:
                raise _error(
                    'Only one AMI version and release label may be specified.'
                    ' Provided AMI: %s, release label: %s.' %
                    (kwargs['AmiVersion'], kwargs['ReleaseLabel']))

            AmiVersion = kwargs.pop('AmiVersion')

            running_ami_version = AMI_VERSION_ALIASES.get(
                AmiVersion, AmiVersion)

            if version_gte(running_ami_version, '4'):
                raise _error('The supplied ami version is invalid.')
            elif not version_gte(running_ami_version, '2'):
                raise _error(
                    'Job flow role is not compatible with the supplied'
                    ' AMI version')

            cluster['RequestedAmiVersion'] = AmiVersion
            cluster['RunningAmiVersion'] = running_ami_version

        elif 'ReleaseLabel' in kwargs:
            ReleaseLabel = kwargs.pop('ReleaseLabel')

            # note: str.lstrip('emr-') would strip leading 'e'/'m'/'r'/'-'
            # *characters*, not the prefix, so strip the prefix exactly
            if ReleaseLabel.startswith('emr-'):
                running_ami_version = ReleaseLabel[len('emr-'):]
            else:
                running_ami_version = ReleaseLabel

            if not version_gte(running_ami_version, '4'):
                raise _error('The supplied release label is invalid: %s.' %
                             ReleaseLabel)

            cluster['ReleaseLabel'] = ReleaseLabel
        else:
            # note: you can't actually set Hadoop version through boto3
            raise _error('Must specify exactly one of the following:'
                         ' release label, AMI version, or Hadoop version.')

        # Applications
        hadoop_version = map_version(running_ami_version,
                                     AMI_HADOOP_VERSION_UPDATES)

        if version_gte(running_ami_version, '4'):
            application_names = set(a['Name']
                                    for a in kwargs.pop('Applications', []))

            # if Applications is set but doesn't include Hadoop, the
            # cluster description won't either! (Even though Hadoop is
            # in fact installed.)
            if not application_names:
                application_names = set(['Hadoop'])

            for app_name in sorted(application_names):
                if app_name == 'Hadoop':
                    version = hadoop_version
                else:
                    version = DUMMY_APPLICATION_VERSION

                cluster['Applications'].append(
                    dict(Name=app_name, Version=version))
        else:
            if kwargs.get('Applications'):
                raise _error(
                    'Cannot specify applications when AMI version is used.'
                    ' Specify supported products or new supported products'
                    ' instead.')

            # 'hadoop' is lowercase if AmiVersion specified
            cluster['Applications'].append(
                dict(Name='hadoop', Version=hadoop_version))

        # Configurations
        if 'Configurations' in kwargs:
            _validate_param(kwargs, 'Configurations', (list, tuple))

            if kwargs['Configurations'] and not version_gte(
                    running_ami_version, '4'):
                raise _ValidationException(
                    'RunJobFlow',
                    'Cannot specify configurations when AMI version is used.')

            cluster['Configurations'] = _normalized_configurations(
                kwargs.pop('Configurations'))

        # VisibleToAllUsers
        if 'VisibleToAllUsers' in kwargs:
            _validate_param(kwargs, 'VisibleToAllUsers', bool)
            cluster['VisibleToAllUsers'] = kwargs.pop('VisibleToAllUsers')

        # pass BootstrapActions off to helper
        if 'BootstrapActions' in kwargs:
            self._add_bootstrap_actions('RunJobFlow',
                                        kwargs.pop('BootstrapActions'),
                                        cluster)

        # pass Instances (required) off to helper
        _validate_param(kwargs, 'Instances')
        self._add_instances('RunJobFlow',
                            kwargs.pop('Instances'),
                            cluster,
                            now=now)

        # pass Steps off to helper
        if 'Steps' in kwargs:
            self._add_steps('RunJobFlow', kwargs.pop('Steps'), cluster)

        # pass Tags off to helper
        if 'Tags' in kwargs:
            self._add_tags('RunJobFlow', kwargs.pop('Tags'), cluster)

        # save AdditionalInfo
        if 'AdditionalInfo' in kwargs:
            cluster['_AdditionalInfo'] = kwargs.pop('AdditionalInfo')

        # catch extra params
        if kwargs:
            raise NotImplementedError(
                'mock RunJobFlow does not support these parameters: %s' %
                ', '.join(sorted(kwargs)))

        self.mock_emr_clusters[cluster['Id']] = cluster

        return dict(JobFlowId=cluster['Id'])
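A minimal invocation sketch for this mock; all parameter values are illustrative, but they satisfy the required-parameter checks above (Name, JobFlowRole, ServiceRole, a release label, and Instances). client is an assumed mock-client instance:

response = client.run_job_flow(
    Name='mr_word_count',
    JobFlowRole='EMR_EC2_DefaultRole',
    ServiceRole='EMR_DefaultRole',
    ReleaseLabel='emr-5.10.0',
    Instances=dict(
        MasterInstanceType='m4.large',
        SlaveInstanceType='m4.large',
        InstanceCount=3,
    ),
)
cluster_id = response['JobFlowId']  # e.g. 'j-MOCKCLUSTER0'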
Example no. 39
def _maybe_terminate_clusters(dry_run=False,
                              max_mins_idle=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # include RUNNING to catch clusters with PENDING jobs that
    # never ran (see #365).
    for cluster_summary in _boto3_paginate(
            'Clusters',
            emr_client,
            'list_clusters',
            ClusterStates=['WAITING', 'RUNNING']):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(
            reversed(
                list(
                    _boto3_paginate('Steps',
                                    emr_client,
                                    'list_steps',
                                    ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

        _, pool = _pool_hash_and_name(cluster)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug('cluster %s %s for %s, %s (%s) - %s' % (
            cluster_id,
            'pending' if is_pending else 'idle',
            strip_microseconds(time_idle),
            ('unpooled' if pool is None else 'in %s pool' % pool),
            cluster_summary['Name'],
            'protected' if cluster['TerminationProtected'] else 'unprotected',
        ))

        # filter out clusters that don't meet our criteria
        if (max_mins_idle is not None
                and time_idle <= timedelta(minutes=max_mins_idle)):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        if cluster['TerminationProtected']:
            continue

        # terminate idle cluster
        _terminate_and_notify(runner=runner,
                              cluster_id=cluster_id,
                              cluster_name=cluster_summary['Name'],
                              num_steps=len(steps),
                              is_pending=is_pending,
                              time_idle=time_idle,
                              dry_run=dry_run,
                              max_mins_locked=max_mins_locked,
                              quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d done' %
             (num_starting, num_bootstrapping, num_running, num_pending,
              num_idle, num_done))
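_time_last_active isn't shown in this snippet. A plausible sketch, assuming it returns the most recent timestamp at which the cluster did anything (became ready, or created/started/finished a step); the real helper may differ:

def _time_last_active(cluster_summary, steps):
    # sketch: latest of cluster creation/ready time and all step times
    timestamps = [cluster_summary['Status']['Timeline']['CreationDateTime']]

    ready = cluster_summary['Status']['Timeline'].get('ReadyDateTime')
    if ready:
        timestamps.append(ready)

    for step in steps:
        timeline = step['Status'].get('Timeline', {})
        for key in ('CreationDateTime', 'StartDateTime', 'EndDateTime'):
            if timeline.get(key):
                timestamps.append(timeline[key])

    return max(timestamps)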
Example no. 40
def _cluster_to_usage_data(cluster, basic_summary=None, now=None):
    """Break billing/usage information for a cluster down by job.

    :param cluster: a :py:mod:`boto3` cluster data structure
    :param basic_summary: a basic summary of the cluster, returned by
                          :py:func:`_cluster_to_basic_summary`. If this
                          is ``None``, we'll call
                          :py:func:`_cluster_to_basic_summary` ourselves.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a list of dictionaries containing usage information, one for
    bootstrapping, and one for each step that ran or is currently running. If
    the cluster hasn't started yet, return ``[]``.

    Usage dictionaries have the following keys:

    * *end*: when the job finished running, or *now* if it's still running.
    * *end_billing*: the effective end of the job for billing purposes, either
      when the next job starts, the current time if the job
      is still running, or the end of the next full hour
      in the cluster.
    * *nih_billed*: normalized instance hours billed for this job or
      bootstrapping step
    * *nih_used*: normalized instance hours actually used for running
      the job or bootstrapping
    * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`)
    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours billed/used/billed but not used on that date
    * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number
      of normalized instance hours billed/used/billed but not used during
      the hour starting at that time
    * *label*: job's label (usually the module name of the job), or for the
      bootstrapping step, the label of the cluster
    * *owner*: job's owner (usually the user that started it), or for the
      bootstrapping step, the owner of the cluster
    * *start*: when the job or bootstrapping step started, as a
      :py:class:`datetime.datetime`
    """
    bcs = basic_summary or _cluster_to_basic_summary(cluster)

    if now is None:
        now = _boto3_now()

    if not bcs['created']:
        return []

    # Figure out the billing rate per second for the job, given that
    # NormalizedInstanceHours is how much we're charged up until
    # the next full hour.
    full_hours = math.ceil(timedelta.total_seconds(bcs['ran']) / 60.0 / 60.0)
    nih_per_sec = bcs['nih'] / (full_hours * 3600.0)

    # Don't actually count a step as billed for the full hour until
    # the cluster finishes. This means that our total "nih_billed"
    # will be less than normalizedinstancehours in the cluster, but it
    # also keeps stats stable for steps that have already finished.
    if bcs['end']:
        cluster_end_billing = bcs['created'] + timedelta(hours=full_hours)
    else:
        cluster_end_billing = now

    intervals = []

    # make a fake step for cluster startup and bootstrapping, so we don't
    # consider that wasted.
    intervals.append({
        'label': bcs['label'],
        'owner': bcs['owner'],
        'start': bcs['created'],
        'end': bcs['ready'] or bcs['end'] or now,
        'step_num': None,
    })

    for step in cluster['Steps']:
        Status = step['Status']
        Timeline = Status.get('Timeline', {})

        # we've reached the last step that's actually run
        if not Timeline.get('StartDateTime'):
            break

        step_start = Timeline['StartDateTime']

        step_end = Timeline.get('EndDateTime')
        if step_end is None:
            # step started running and was cancelled. credit it for 0 usage
            if bcs['end']:
                step_end = step_start
            # step is still running
            else:
                step_end = now

        m = _STEP_NAME_RE.match(step['Name'])
        if m:
            step_label = m.group(1)
            step_owner = m.group(2)
            step_num = int(m.group(6))
        else:
            step_label, step_owner, step_num = None, None, None

        intervals.append({
            'label': step_label,
            'owner': step_owner,
            'start': step_start,
            'end': step_end,
            'step_num': step_num,
        })

    # fill in end_billing
    for i in range(len(intervals) - 1):
        intervals[i]['end_billing'] = intervals[i + 1]['start']

    intervals[-1]['end_billing'] = cluster_end_billing

    # fill normalized usage information
    for interval in intervals:

        interval['nih_used'] = (
            nih_per_sec *
            timedelta.total_seconds(interval['end'] - interval['start']))

        interval['date_to_nih_used'] = dict(
            (d, nih_per_sec * secs) for d, secs in _subdivide_interval_by_date(
                interval['start'], interval['end']).items())

        interval['hour_to_nih_used'] = dict(
            (d, nih_per_sec * secs) for d, secs in _subdivide_interval_by_hour(
                interval['start'], interval['end']).items())

        interval['nih_billed'] = (
            nih_per_sec * timedelta.total_seconds(interval['end_billing'] -
                                                  interval['start']))

        interval['date_to_nih_billed'] = dict(
            (d, nih_per_sec * secs) for d, secs in _subdivide_interval_by_date(
                interval['start'], interval['end_billing']).items())

        interval['hour_to_nih_billed'] = dict(
            (d, nih_per_sec * secs) for d, secs in _subdivide_interval_by_hour(
                interval['start'], interval['end_billing']).items())

        # time billed but not used
        interval['nih_bbnu'] = interval['nih_billed'] - interval['nih_used']

        interval['date_to_nih_bbnu'] = {}
        for d, nih_billed in interval['date_to_nih_billed'].items():
            nih_bbnu = nih_billed - interval['date_to_nih_used'].get(d, 0.0)
            if nih_bbnu:
                interval['date_to_nih_bbnu'][d] = nih_bbnu

        interval['hour_to_nih_bbnu'] = {}
        for d, nih_billed in interval['hour_to_nih_billed'].items():
            nih_bbnu = nih_billed - interval['hour_to_nih_used'].get(d, 0.0)
            if nih_bbnu:
                interval['hour_to_nih_bbnu'][d] = nih_bbnu

    return intervals
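To make the billing-rate arithmetic concrete, here is a worked example with illustrative numbers: a cluster that ran 90 minutes with 16 NormalizedInstanceHours counts as 2 full hours:

import math
from datetime import timedelta

ran = timedelta(minutes=90)
nih = 16

full_hours = math.ceil(ran.total_seconds() / 60.0 / 60.0)  # 2
nih_per_sec = nih / (full_hours * 3600.0)                  # ~0.00222

# a 30-minute step on this cluster is credited with:
step_nih_used = nih_per_sec * timedelta(minutes=30).total_seconds()  # 4.0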
Example no. 41
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _boto3_paginate('Clusters', emr_client,
                                           'list_clusters'):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(
            reversed(
                list(
                    _boto3_paginate('Steps',
                                    emr_client,
                                    'list_steps',
                                    ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        time_to_end_of_hour = _est_time_to_hour(cluster_summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        bootstrap_actions = list(
            _boto3_paginate('BootstrapActions',
                            emr_client,
                            'list_bootstrap_actions',
                            ClusterId=cluster_id))

        _, pool = _pool_hash_and_name(bootstrap_actions)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' %
                  (cluster_id, 'pending' if is_pending else 'idle',
                   strip_microseconds(time_idle),
                   strip_microseconds(time_to_end_of_hour),
                   ('unpooled' if pool is None else 'in %s pool' % pool),
                   cluster_summary['Name']))

        # filter out clusters that don't meet our criteria
        if (max_hours_idle is not None
                and time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if (mins_to_end_of_hour is not None and
            (is_pending or
             time_to_end_of_hour >= timedelta(minutes=mins_to_end_of_hour))):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(runner=runner,
                              cluster_id=cluster_id,
                              cluster_name=cluster_summary['Name'],
                              num_steps=len(steps),
                              is_pending=is_pending,
                              time_idle=time_idle,
                              time_to_end_of_hour=time_to_end_of_hour,
                              dry_run=dry_run,
                              max_mins_locked=max_mins_locked,
                              quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d done' %
             (num_starting, num_bootstrapping, num_running, num_pending,
              num_idle, num_done))
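_est_time_to_hour is used above but not shown. A hedged sketch, assuming it estimates how long until the cluster reaches its next whole hour of runtime, measured from creation time (the real helper may handle edge cases differently):

import math
from datetime import timedelta

def _est_time_to_hour(cluster_summary, now=None):
    if now is None:
        now = _boto3_now()

    created = cluster_summary['Status']['Timeline']['CreationDateTime']
    run_time = now - created

    # round up to the next whole hour (a brand-new cluster has ~60 minutes)
    hours = math.ceil(run_time.total_seconds() / 3600.0) or 1

    return timedelta(hours=hours) - run_time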
Example no. 42
def _cluster_to_usage_data(cluster, basic_summary=None, now=None):
    r"""Break billing/usage information for a cluster down by job.

    :param cluster: a :py:mod:`boto3` cluster data structure
    :param basic_summary: a basic summary of the cluster, returned by
                          :py:func:`_cluster_to_basic_summary`. If this
                          is ``None``, we'll call
                          :py:func:`_cluster_to_basic_summary` ourselves.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a list of dictionaries containing usage information, one for
    bootstrapping, and one for each step that ran or is currently running. If
    the cluster hasn't started yet, return ``[]``.

    Usage dictionaries have the following keys:

    * *end*: when the job finished running, or *now* if it's still running.
    * *end_billing*: the effective end of the job for billing purposes, either
      when the next job starts, the current time if the job
      is still running, or the end of the next full hour
      in the cluster.
    * *nih_billed*: normalized instance hours billed for this job or
      bootstrapping step
    * *nih_used*: normalized instance hours actually used for running
      the job or bootstrapping
    * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`)
    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours billed/used/billed but not used on that date
    * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number
      of normalized instance hours billed/used/billed but not used during
      the hour starting at that time
    * *label*: job's label (usually the module name of the job), or for the
      bootstrapping step, the label of the cluster
    * *owner*: job's owner (usually the user that started it), or for the
      bootstrapping step, the owner of the cluster
    * *start*: when the job or bootstrapping step started, as a
      :py:class:`datetime.datetime`
    """
    bcs = basic_summary or _cluster_to_basic_summary(cluster)

    if now is None:
        now = _boto3_now()

    if not bcs['created']:
        return []

    # EMR no longer bills by the full hour, but NormalizedInstanceHours
    # still works that way
    full_hours = math.ceil(timedelta.total_seconds(bcs['ran']) / 60.0 / 60.0)
    nih_per_sec = bcs['nih'] / (full_hours * 3600.0)

    # EMR bills by the full second, and at least one minute per cluster
    cluster_end_billing = bcs['created'] + max(
        _round_up_to_next_second(bcs['ran']), timedelta(minutes=1))

    intervals = []

    # make a fake step for cluster startup and bootstrapping, so we don't
    # consider that wasted.
    intervals.append({
        'label': bcs['label'],
        'owner': bcs['owner'],
        'start': bcs['created'],
        'end': bcs['ready'] or bcs['end'] or now,
        'step_num': None,
    })

    for step in cluster['Steps']:
        Status = step['Status']
        Timeline = Status.get('Timeline', {})

        # we've reached the last step that's actually run
        if not Timeline.get('StartDateTime'):
            break

        step_start = Timeline['StartDateTime']

        step_end = Timeline.get('EndDateTime')
        if step_end is None:
            # step started running and was cancelled. credit it for 0 usage
            if bcs['end']:
                step_end = step_start
            # step is still running
            else:
                step_end = now

        m = _STEP_NAME_RE.match(step['Name'])
        if m:
            step_label = m.group(1)
            step_owner = m.group(2)
            step_num = int(m.group(6))
        else:
            step_label, step_owner, step_num = None, None, None

        intervals.append({
            'label': step_label,
            'owner': step_owner,
            'start': step_start,
            'end': step_end,
            'step_num': step_num,
        })

    # fill in end_billing
    for i in range(len(intervals) - 1):
        intervals[i]['end_billing'] = intervals[i + 1]['start']

    intervals[-1]['end_billing'] = cluster_end_billing

    # fill normalized usage information
    for interval in intervals:

        interval['nih_used'] = (
            nih_per_sec *
            timedelta.total_seconds(interval['end'] - interval['start']))

        interval['date_to_nih_used'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_date(interval['start'],
                                           interval['end']).items())

        interval['hour_to_nih_used'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_hour(interval['start'],
                                           interval['end']).items())

        interval['nih_billed'] = (
            nih_per_sec * timedelta.total_seconds(
                interval['end_billing'] - interval['start']))

        interval['date_to_nih_billed'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_date(interval['start'],
                                           interval['end_billing']).items())

        interval['hour_to_nih_billed'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in _subdivide_interval_by_hour(interval['start'],
                                           interval['end_billing']).items())

        # time billed but not used
        interval['nih_bbnu'] = interval['nih_billed'] - interval['nih_used']

        interval['date_to_nih_bbnu'] = {}
        for d, nih_billed in interval['date_to_nih_billed'].items():
            nih_bbnu = nih_billed - interval['date_to_nih_used'].get(d, 0.0)
            if nih_bbnu:
                interval['date_to_nih_bbnu'][d] = nih_bbnu

        interval['hour_to_nih_bbnu'] = {}
        for d, nih_billed in interval['hour_to_nih_billed'].items():
            nih_bbnu = nih_billed - interval['hour_to_nih_used'].get(d, 0.0)
            if nih_bbnu:
                interval['hour_to_nih_bbnu'][d] = nih_bbnu

    return intervals
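_round_up_to_next_second isn't included in this snippet; a minimal sketch of the rounding it presumably performs:

from datetime import timedelta

def _round_up_to_next_second(delta):
    # sketch: round a timedelta up to the next whole second
    if delta.microseconds:
        delta += timedelta(seconds=1, microseconds=-delta.microseconds)
    return delta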
Example no. 43
    def _add_instances(self, operation_name, Instances, cluster, now=None):
        """Handle Instances param from run_job_flow()"""
        if now is None:
            now = _boto3_now()

        _validate_param_type(Instances, dict)

        Instances = dict(Instances)  # going to pop params from Instances

        def _error(message):
            return _ValidationException(operation_name, message)

        # Ec2KeyName
        if 'Ec2KeyName' in Instances:
            _validate_param(Instances, 'Ec2KeyName', string_types)
            cluster['Ec2InstanceAttributes']['Ec2KeyName'] = Instances.pop(
                'Ec2KeyName')

        # Ec2SubnetId
        if 'Ec2SubnetId' in Instances:
            _validate_param(Instances, 'Ec2SubnetId', string_types)
            cluster['Ec2InstanceAttributes']['Ec2SubnetId'] = (
                Instances.pop('Ec2SubnetId'))

        # KeepJobFlowAliveWhenNoSteps
        if 'KeepJobFlowAliveWhenNoSteps' in Instances:
            _validate_param(Instances, 'KeepJobFlowAliveWhenNoSteps', bool)
            cluster['AutoTerminate'] = (
                not Instances.pop('KeepJobFlowAliveWhenNoSteps'))

        # Placement (availability zone)
        if 'Placement' in Instances:
            _validate_param(Instances, 'Placement', dict)
            Placement = Instances.pop('Placement')

            # mock_boto3 doesn't support the 'AvailabilityZones' param
            _validate_param(Placement, 'AvailabilityZone', string_types)
            cluster['Ec2InstanceAttributes']['Ec2AvailabilityZone'] = (
                Placement['AvailabilityZone'])

        if 'InstanceGroups' in Instances:
            if any(x in Instances
                   for x in ('MasterInstanceType', 'SlaveInstanceType',
                             'InstanceCount')):
                raise _error(
                    'Please configure instances using one and only one of the'
                    ' following: instance groups; instance fleets; instance'
                    ' count, master and slave instance type.')

            self._add_instance_groups(operation_name,
                                      Instances.pop('InstanceGroups'), cluster)
        # TODO: will need to support instance fleets at some point
        else:
            # build our own instance groups
            instance_groups = []

            instance_count = Instances.pop('InstanceCount', 0)
            _validate_param_type(instance_count, integer_types)

            # note: boto3 actually lets 'null' fall through to the API here
            _validate_param(Instances, 'MasterInstanceType', string_types)
            instance_groups.append(
                dict(InstanceRole='MASTER',
                     InstanceType=Instances.pop('MasterInstanceType'),
                     InstanceCount=1))

            if 'SlaveInstanceType' in Instances:
                SlaveInstanceType = Instances.pop('SlaveInstanceType')
                _validate_param_type(SlaveInstanceType, string_types)

                # don't create a group with no instances!
                if instance_count > 1:
                    instance_groups.append(
                        dict(InstanceRole='CORE',
                             InstanceType=SlaveInstanceType,
                             InstanceCount=instance_count - 1))

            self._add_instance_groups(operation_name,
                                      instance_groups,
                                      cluster,
                                      now=now)

        if Instances:
            raise NotImplementedError(
                'mock %s does not support these parameters: %s' %
                (operation_name, ', '.join('Instances.%s' % k
                                           for k in sorted(Instances))))
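For reference, a sketch of the two mutually exclusive shapes Instances can take (values are illustrative):

# shorthand form: master/slave instance types plus a total count
Instances = dict(
    MasterInstanceType='m4.large',
    SlaveInstanceType='m4.large',
    InstanceCount=3,  # 1 MASTER + 2 CORE
    KeepJobFlowAliveWhenNoSteps=True,
)

# explicit form: instance groups (can't be combined with the above)
Instances = dict(
    InstanceGroups=[
        dict(InstanceRole='MASTER', InstanceType='m4.large',
             InstanceCount=1),
        dict(InstanceRole='CORE', InstanceType='m4.large',
             InstanceCount=2),
    ],
)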
Example no. 44
    def _add_steps(self, operation_name, Steps, cluster, now=None):
        if now is None:
            now = _boto3_now()

        _validate_param_type(Steps, (list, tuple))

        # only active job flows allowed
        if cluster['Status']['State'].startswith('TERMINAT'):
            raise _ValidationException(
                operation_name,
                'A job flow that is shutting down, terminated, or finished'
                ' may not be modified.')

        # no more than 256 steps allowed
        if cluster.get('RunningAmiVersion') and map_version(
                cluster['RunningAmiVersion'],
                LIFETIME_STEP_LIMIT_AMI_VERSIONS):
            # for very old AMIs, *all* steps count
            if len(cluster['_Steps']) + len(Steps) > STEP_ADD_LIMIT:
                raise _ValidationException(
                    operation_name,
                    'Maximum number of steps for job flow exceeded')
        else:
            # otherwise, only active and pending steps count
            num_active_steps = sum(
                1 for step in cluster['_Steps']
                if step['Status']['State'] in ('PENDING', 'PENDING_CANCELLED',
                                               'RUNNING'))
            if num_active_steps + len(Steps) > STEP_ADD_LIMIT:
                raise _ValidationException(
                    operation_name,
                    "Maximum number of active steps(State = 'Running',"
                    " 'Pending' or 'Cancel_Pending') for cluster exceeded.")

        new_steps = []

        for i, Step in enumerate(Steps):
            Step = dict(Step)

            new_step = dict(
                ActionOnFailure='TERMINATE_CLUSTER',
                Config=dict(
                    Args=[],
                    Jar={},
                    Properties={},
                ),
                Id='s-MOCKSTEP%d' % (len(cluster['_Steps']) + i),
                Name='',
                Status=dict(
                    State='PENDING',
                    StateChangeReason={},
                    Timeline=dict(CreationDateTime=now),
                ),
            )

            # Name (required)
            _validate_param(Step, 'Name', string_types)
            new_step['Name'] = Step.pop('Name')

            # ActionOnFailure
            if 'ActionOnFailure' in Step:
                _validate_param_enum(Step['ActionOnFailure'], [
                    'CANCEL_AND_WAIT', 'CONTINUE', 'TERMINATE_JOB_FLOW',
                    'TERMINATE_CLUSTER'
                ])

                new_step['ActionOnFailure'] = Step.pop('ActionOnFailure')

            # HadoopJarStep (required)
            _validate_param(Step, 'HadoopJarStep', dict)
            HadoopJarStep = dict(Step.pop('HadoopJarStep'))

            _validate_param(HadoopJarStep, 'Jar', string_types)
            new_step['Config']['Jar'] = HadoopJarStep.pop('Jar')

            if 'Args' in HadoopJarStep:
                Args = HadoopJarStep.pop('Args')
                _validate_param_type(Args, (list, tuple))
                for arg in Args:
                    _validate_param_type(arg, string_types)
                new_step['Config']['Args'].extend(Args)

            if 'MainClass' in HadoopJarStep:
                _validate_param(HadoopJarStep, 'MainClass', string_types)
                new_step['Config']['MainClass'] = HadoopJarStep.pop(
                    'MainClass')

            # we don't currently support Properties
            if HadoopJarStep:
                raise NotImplementedError(
                    "mock_boto3 doesn't support these HadoopJarStep params: %s"
                    % ', '.join(sorted(HadoopJarStep)))

            if Step:
                raise NotImplementedError(
                    "mock_boto3 doesn't support these step params: %s" %
                    ', '.join(sorted(Step)))

            new_steps.append(new_step)

        cluster['_Steps'].extend(new_steps)

        # add_job_flow_steps() needs to return step IDs
        return [new_step['Id'] for new_step in new_steps]
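A sketch of a Steps list that passes the validation above (values illustrative; only Name, ActionOnFailure, and HadoopJarStep's Jar, Args, and MainClass are supported by this mock):

Steps = [
    dict(
        Name='example streaming step',
        ActionOnFailure='CANCEL_AND_WAIT',
        HadoopJarStep=dict(
            Jar='command-runner.jar',
            Args=['hadoop-streaming',
                  '-input', 's3://bucket/in',
                  '-output', 's3://bucket/out'],
        ),
    ),
]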
Example no. 45
    def _add_instance_groups(self,
                             operation_name,
                             InstanceGroups,
                             cluster,
                             now=None):
        """Add instance groups from *InstanceGroups* to the mock
        cluster *cluster*.
        """
        _validate_param_type(InstanceGroups, (list, tuple))

        def _error(message):
            return _ValidationException(operation_name, message)

        if now is None:
            now = _boto3_now()

        # currently, this is just a helper method for run_job_flow()
        if cluster.get('_InstanceGroups'):
            raise NotImplementedError(
                "mock_boto3 doesn't support adding instance groups")

        new_igs = []  # don't update _InstanceGroups if there's an error

        roles = set()  # roles already handled

        for i, InstanceGroup in enumerate(InstanceGroups):
            _validate_param_type(InstanceGroup, dict)
            InstanceGroup = dict(InstanceGroup)

            # our new mock instance group
            ig = dict(
                Configurations=[],
                EbsBlockDevices=[],
                Id='ig-FAKE',
                InstanceGroupType='',
                Market='ON_DEMAND',
                RequestedInstanceCount=0,
                RunningInstanceCount=0,
                ShrinkPolicy={},
                Status=dict(
                    State='PROVISIONING',
                    StateChangeReason=dict(Message=''),
                    Timeline=dict(CreationDateTime=now),
                ),
            )

            # InstanceRole (required)
            _validate_param(InstanceGroup, 'InstanceRole',
                            ['MASTER', 'CORE', 'TASK'])
            role = InstanceGroup.pop('InstanceRole')

            # check for duplicate roles
            if role in roles:
                raise _error('Multiple %s instance groups supplied, you'
                             ' must specify exactly one %s instance group' %
                             (role.lower(), role.lower()))
            roles.add(role)

            ig['InstanceGroupType'] = role

            # InstanceType (required)
            _validate_param(InstanceGroup, 'InstanceType', string_types)

            # 3.x AMIs (but not 4.x, etc.) reject m1.small explicitly
            if (InstanceGroup.get('InstanceType') == 'm1.small'
                    and cluster.get('RunningAmiVersion', '').startswith('3.')):
                raise _error(
                    'm1.small instance type is not supported with AMI version'
                    ' %s.' % cluster['RunningAmiVersion'])

            ig['InstanceType'] = InstanceGroup.pop('InstanceType')

            # InstanceCount (required)
            _validate_param(InstanceGroup, 'InstanceCount', integer_types)
            InstanceCount = InstanceGroup.pop('InstanceCount')
            if InstanceCount < 1:
                raise _error(
                    'An instance group must have at least one instance')

            if role == 'MASTER' and InstanceCount != 1:
                raise _error(
                    'A master instance group must specify a single instance')
            ig['RequestedInstanceCount'] = InstanceCount

            # Name
            if 'Name' in InstanceGroup:
                _validate_param(InstanceGroup, 'Name', string_types)
                ig['Name'] = InstanceGroup.pop('Name')

            # Market (default set above)
            if 'Market' in InstanceGroup:
                _validate_param(InstanceGroup, 'Market', string_types)
                if InstanceGroup['Market'] not in ('ON_DEMAND', 'SPOT'):
                    raise _error(
                        "1 validation error detected: value '%s' at"
                        " 'instances.instanceGroups.%d.member.market' failed"
                        " to satisfy constraint: Member must satisfy enum"
                        " value set: [SPOT, ON_DEMAND]" % (
                            InstanceGroup['Market'], i + 1))
                ig['Market'] = InstanceGroup.pop('Market')

            # BidPrice
            if 'BidPrice' in InstanceGroup:
                # not float, surprisingly
                _validate_param(InstanceGroup, 'BidPrice', string_types)

                if ig['Market'] != 'SPOT':
                    raise _error('Attempted to set bid price for on demand'
                                 ' instance group.')

                # simulate bid price validation
                BidPrice = InstanceGroup.pop('BidPrice')
                try:
                    if not float(BidPrice) > 0:
                        raise _error('The bid price is negative or zero.')
                except (TypeError, ValueError):
                    raise _error(
                        'The bid price supplied for an instance group is'
                        ' invalid')

                if '.' in BidPrice and len(BidPrice.split('.', 1)[1]) > 3:
                    raise _error('No more than 3 digits are allowed after'
                                 ' decimal place in bid price')

                ig['BidPrice'] = BidPrice

            if InstanceGroup:
                raise NotImplementedError(
                    'mock_boto3 does not support these InstanceGroup'
                    ' params: %s' % ', '.join(sorted(InstanceGroup)))

            new_igs.append(ig)

        # TASK roles require CORE roles (to host HDFS)
        if 'TASK' in roles and 'CORE' not in roles:
            raise _error(
                'Clusters with task nodes must also define core nodes.')

        # MASTER role is required
        if 'MASTER' not in roles:
            raise _error('Zero master instance groups supplied, you must'
                         ' specify exactly one master instance group')

        cluster['_InstanceGroups'].extend(new_igs)
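A sketch of an InstanceGroups list that satisfies the checks above: exactly one single-instance MASTER group, a CORE group (required whenever TASK appears), and a SPOT bid expressed as a string with at most three decimal places (values illustrative):

InstanceGroups = [
    dict(InstanceRole='MASTER', InstanceType='m4.large', InstanceCount=1),
    dict(InstanceRole='CORE', InstanceType='m4.large', InstanceCount=2),
    dict(InstanceRole='TASK', InstanceType='m4.large', InstanceCount=4,
         Market='SPOT', BidPrice='0.10'),
]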
Example no. 46
    def create_fake_clusters(self):
        self.now = _boto3_now().replace(microsecond=0)
        self.add_mock_s3_data({'my_bucket': {}})

        # create a timestamp the given number of *hours*, *minutes*, etc.
        # in the past
        def ago(**kwargs):
            return self.now - timedelta(**kwargs)

        # helper to build a mock step data structure easily
        def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
                 args=self._DEFAULT_STEP_ARGS,
                 state='COMPLETED',
                 created=None,
                 started=None,
                 ended=None,
                 name='Streaming Step',
                 action_on_failure='TERMINATE_CLUSTER',
                 **kwargs):

            timeline = dict()
            if created:
                timeline['CreationDateTime'] = created
            if started:
                timeline['StartDateTime'] = started
            if ended:
                timeline['EndDateTime'] = ended

            return dict(
                Config=dict(
                    ActionOnFailure=action_on_failure,
                    Args=args,
                    Jar=jar,
                ),
                Status=dict(
                    State=state,
                    Timeline=timeline,
                )
            )

        # empty job
        self.add_mock_emr_cluster(
            dict(
                Id='j-EMPTY',
                TerminationProtected=False,
                Status=dict(
                    State='STARTING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=10)
                    ),
                ),
            )
        )

        # job that's bootstrapping
        self.add_mock_emr_cluster(dict(
            Id='j-BOOTSTRAPPING',
            TerminationProtected=False,
            Status=dict(
                State='BOOTSTRAPPING',
                Timeline=dict(
                    CreationDateTime=ago(hours=10),
                ),
            ),
            _Steps=[step(created=ago(hours=10), state='PENDING')],
        ))

        # currently running job
        self.add_mock_emr_cluster(
            dict(
                Id='j-CURRENTLY_RUNNING',
                TerminationProtected=False,
                Status=dict(
                    State='RUNNING',
                    Timeline=dict(
                        CreationDateTime=ago(hours=4, minutes=15),
                        ReadyDateTime=ago(hours=4, minutes=10)
                    )
                ),
                _Steps=[step(started=ago(hours=4), state='RUNNING')]
            )
        )

        # finished cluster
        self.add_mock_emr_cluster(dict(
            Id='j-DONE',
            TerminationProtected=False,
            Status=dict(
                State='TERMINATED',
                Timeline=dict(
                    CreationDateTime=ago(hours=10),
                    ReadyDateTime=ago(hours=8),
                    EndDateTime=ago(hours=5),
                ),
            ),
            _Steps=[step(started=ago(hours=8), ended=ago(hours=6))],
        ))

        # idle cluster
        self.add_mock_emr_cluster(dict(
            Id='j-DONE_AND_IDLE',
            TerminationProtected=False,
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[step(started=ago(hours=4), ended=ago(hours=2))],
        ))

        # idle cluster with 4.x step format. should still be
        # recognizable as a streaming step
        self.add_mock_emr_cluster(dict(
            Id='j-DONE_AND_IDLE_4_X',
            TerminationProtected=False,
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[step(started=ago(hours=4), ended=ago(hours=2),
                         jar='command-runner.jar',
                         args=['hadoop-streaming'] + self._DEFAULT_STEP_ARGS)],
        ))
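        # note: on 4.x+ AMIs streaming steps run through command-runner.jar
        # with 'hadoop-streaming' prepended to the args, instead of invoking
        # the hadoop-streaming jar directly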

        # idle cluster with an active lock
        self.add_mock_emr_cluster(dict(
            Id='j-IDLE_AND_LOCKED',
            TerminationProtected=False,
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[step(started=ago(hours=4), ended=ago(hours=2))],
        ))
        self.add_mock_s3_data({
            'my_bucket': {
                'locks/j-IDLE_AND_LOCKED/2': b'not_you',
            },
        })
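        # the lock key appears to follow the pattern
        # locks/<cluster id>/<step num>; the value b'not_you' simulates a
        # lock held by some other job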

        # idle cluster with an expired lock
        self.add_mock_emr_cluster(dict(
            Id='j-IDLE_AND_EXPIRED',
            TerminationProtected=False,
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[step(started=ago(hours=4), ended=ago(hours=2))],
        ))
        self.add_mock_s3_data({
            'my_bucket': {
                'locks/j-IDLE_AND_EXPIRED/2': b'not_you',
            },
        }, age=timedelta(minutes=5))
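        # the age= kwarg backdates the mock S3 object's modification time;
        # a lock written 5 minutes ago is presumably stale enough for the
        # terminate-idle tool to treat it as expired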

        # idle cluster with a step that started but never ended
        self.add_mock_emr_cluster(dict(
            Id='j-IDLE_BUT_INCOMPLETE_STEPS',
            TerminationProtected=False,
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[step(started=ago(hours=4))],
        ))

        # custom hadoop streaming jar
        self.add_mock_emr_cluster(dict(
            Id='j-CUSTOM_DONE_AND_IDLE',
            TerminationProtected=False,
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[step(
                started=ago(hours=4),
                ended=ago(hours=4),
                jar=('s3://my_bucket/tmp/somejob/files/'
                     'oddjob-0.0.3-SNAPSHOT-standalone.jar'),
                args=[],
            )],
        ))

        # idle cluster, termination protected
        self.add_mock_emr_cluster(dict(
            Id='j-IDLE_AND_PROTECTED',
            TerminationProtected=True,
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[step(started=ago(hours=4), ended=ago(hours=2))],
        ))
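        # TerminationProtected=True should presumably exempt this cluster
        # from termination even once it has been idle long enough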

        # hadoop debugging without any other steps
        self.add_mock_emr_cluster(dict(
            Id='j-DEBUG_ONLY',
            TerminationProtected=False,
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=3),
                    ReadyDateTime=ago(hours=2, minutes=55),
                ),
            ),
            _Steps=[
                step(jar='command-runner.jar',
                     name='Setup Hadoop Debugging',
                     args=['state-pusher-script'],
                     started=ago(hours=3),
                     ended=ago(hours=2))
            ],
        ))

        # hadoop debugging + actual job
        self.add_mock_emr_cluster(dict(
            Id='j-HADOOP_DEBUGGING',
            TerminationProtected=False,
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=55),
                ),
            ),
            _Steps=[
                step(jar='command-runner.jar',
                     name='Setup Hadoop Debugging',
                     args=['state-pusher-script'],
                     started=ago(hours=5),
                     ended=ago(hours=4)),
                step(started=ago(hours=4), ended=ago(hours=2)),
            ],
        ))

        # should skip cancelled steps
        self.add_mock_emr_cluster(dict(
            Id='j-IDLE_AND_FAILED',
            TerminationProtected=False,
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[
                step(started=ago(hours=4), ended=ago(hours=3), state='FAILED'),
                step(state='CANCELLED'),
            ],
        ))

        # pooled cluster reaching end of full hour
        self.add_mock_emr_cluster(dict(
            _BootstrapActions=[
                dict(Args=[], Name='action 0'),
                dict(
                    Args=['pool-0123456789abcdef0123456789abcdef',
                          'reflecting'],
                    Name='master',
                ),
            ],
            Id='j-POOLED',
            TerminationProtected=False,
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(minutes=55),
                    ReadyDateTime=ago(minutes=50),
                ),
            ),
            Tags=[
                dict(Key='__mrjob_pool_name',
                     Value='reflecting'),
                dict(Key='__mrjob_pool_hash',
                     Value='0123456789abcdef0123456789abcdef'),
            ],
        ))
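        # pool membership is encoded both in the bootstrap action args
        # ('pool-<hash>', <pool name>) and in the __mrjob_pool_name /
        # __mrjob_pool_hash tags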

        # cluster that has had pending jobs but hasn't run them
        self.add_mock_emr_cluster(dict(
            Id='j-PENDING_BUT_IDLE',
            TerminationProtected=False,
            Status=dict(
                State='RUNNING',
                Timeline=dict(
                    CreationDateTime=ago(hours=3),
                    ReadyDateTime=ago(hours=2, minutes=50),
                ),
            ),
            _Steps=[step(created=ago(hours=3), state='PENDING')],
        ))