Example #1
def _to_datetime(iso8601_time):
    """Convert a ISO8601-formatted datetime (from :py:mod:`boto`) to
    a :py:class:`datetime.datetime`."""
    if iso8601_time is None:
        return None

    return iso8601_to_datetime(iso8601_time)
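
A minimal, self-contained sketch of how this helper behaves. The local
iso8601_to_datetime below is only a stand-in for the real helper; the
assumption is that it parses boto-style ISO8601 strings such as
'2014-06-10T05:23:01.000Z'.

from datetime import datetime

def iso8601_to_datetime(iso8601_time):
    # stand-in for the real parser; the format string is an assumption
    return datetime.strptime(iso8601_time, '%Y-%m-%dT%H:%M:%S.%fZ')

def _to_datetime(iso8601_time):
    if iso8601_time is None:
        return None
    return iso8601_to_datetime(iso8601_time)

print(_to_datetime(None))                        # None
print(_to_datetime('2014-06-10T05:23:01.000Z'))  # 2014-06-10 05:23:01
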
Example #2
def _time_last_active(cluster_summary, steps):
    """When did something last happen with the given cluster?

    Things we look at:

    * ``cluster.creationdatetime`` (always set)
    * ``cluster.readydatetime`` (i.e. when bootstrapping finished)
    * ``step.creationdatetime`` for any step
    * ``step.startdatetime`` for any step
    * ``step.enddatetime`` for any step

    This is not really meant to be run on clusters which are currently
    running, or done.
    """
    timestamps = []

    for key in 'creationdatetime', 'readydatetime':
        value = getattr(cluster_summary.status.timeline, key, None)
        if value:
            timestamps.append(value)

    for step in steps:
        for key in 'creationdatetime', 'startdatetime', 'enddatetime':
            value = getattr(step.status.timeline, key, None)
            if value:
                timestamps.append(value)

    # for ISO8601 timestamps, alpha order == chronological order
    last_timestamp = max(timestamps)

    return iso8601_to_datetime(last_timestamp)
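
The max() on raw strings works because ISO8601 timestamps of this form are
fixed-width, zero-padded, and share a timezone, so lexicographic order matches
chronological order. A quick self-contained check (the timestamps are made up):

stamps = ['2014-06-10T05:00:00.000Z',   # cluster created
          '2014-06-10T05:40:00.000Z',   # a step ended
          '2014-06-10T05:12:00.000Z']   # a step started
assert max(stamps) == '2014-06-10T05:40:00.000Z'
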
Example #3
def to_datetime(iso8601_time):
    """Convert a ISO8601-formatted datetime (from :py:mod:`boto`) to
    a :py:class:`datetime.datetime`."""
    if iso8601_time is None:
        return None

    return iso8601_to_datetime(iso8601_time)
def time_last_active(job_flow):
    """When did something last happen with the given job flow?

    Things we look at:

    * ``job_flow.creationdatetime`` (always set)
    * ``job_flow.startdatetime``
    * ``job_flow.readydatetime`` (i.e. when bootstrapping finished)
    * ``step.creationdatetime`` for any step
    * ``step.startdatetime`` for any step
    * ``step.enddatetime`` for any step

    This is not really meant to be run on job flows which are currently
    running, or done.
    """
    timestamps = []

    for key in 'creationdatetime', 'startdatetime', 'readydatetime':
        value = getattr(job_flow, key, None)
        if value:
            timestamps.append(value)

    steps = getattr(job_flow, 'steps', None) or []
    for step in steps:
        for key in 'creationdatetime', 'startdatetime', 'enddatetime':
            value = getattr(step, key, None)
            if value:
                timestamps.append(value)

    # for ISO8601 timestamps, alpha order == chronological order
    last_timestamp = max(timestamps)

    return iso8601_to_datetime(last_timestamp)
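
For illustration, the function can be exercised with stand-in objects shaped
like boto's job flow and step responses. The names, timestamps, and the use of
SimpleNamespace are all assumptions for this sketch, and it presumes
time_last_active and its iso8601_to_datetime helper from above are in scope.

from types import SimpleNamespace

step = SimpleNamespace(creationdatetime='2014-06-10T05:00:05.000Z',
                       startdatetime='2014-06-10T05:12:00.000Z',
                       enddatetime='2014-06-10T05:40:00.000Z')
job_flow = SimpleNamespace(creationdatetime='2014-06-10T05:00:00.000Z',
                           startdatetime='2014-06-10T05:02:00.000Z',
                           readydatetime='2014-06-10T05:10:00.000Z',
                           steps=[step])

print(time_last_active(job_flow))  # latest event: the step's enddatetime
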
Example #7
File: pool.py, Project: Asana/mrjob
def est_time_to_hour(job_flow, now=None):
    """How long before job reaches the end of the next full hour since it
    began. This is important for billing purposes.

    If it happens to be exactly a whole number of hours, we return
    one hour, not zero.
    """
    if now is None:
        now = datetime.utcnow()

    creationdatetime = getattr(job_flow, 'creationdatetime', None)
    startdatetime = getattr(job_flow, 'startdatetime', None)

    if creationdatetime:
        if startdatetime:
            start = iso8601_to_datetime(startdatetime)
        else:
            start = iso8601_to_datetime(job_flow.creationdatetime)
    else:
        # do something reasonable if creationdatetime isn't set
        return timedelta(minutes=60)

    run_time = now - start
    return timedelta(seconds=((-run_time).seconds % 3600.0 or 3600.0))
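
The last line is the subtle part: negating the (positive) run time and reading
.seconds gives the seconds remaining until the next whole hour, and the
`or 3600.0` maps an exact hour boundary to a full hour. A self-contained check
of just that expression:

from datetime import timedelta

run_time = timedelta(minutes=50)   # 10 minutes short of a full hour
print(timedelta(seconds=((-run_time).seconds % 3600.0 or 3600.0)))  # 0:10:00

run_time = timedelta(hours=2)      # exactly on an hour boundary
print(timedelta(seconds=((-run_time).seconds % 3600.0 or 3600.0)))  # 1:00:00
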
Example #9
def _find_long_running_jobs(emr_conn, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param emr_conn: an open EMR connection, used to look up each cluster's
                     steps
    :param cluster_summaries: a list of :py:class:`boto.emr.emrobject.Cluster`
                              objects to inspect.
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running or
                     pending longer than this
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending longer than *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if there
               is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
              :py:class:`datetime.timedelta`
    """
    if now is None:
        now = datetime.utcnow()

    for cs in cluster_summaries:

        # special case for jobs that are taking a long time to bootstrap
        if cs.status.state in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created_timestamp = cs.status.timeline.creationdatetime
            created = iso8601_to_datetime(created_timestamp)

            time_running = now - created

            if time_running >= min_time:
                yield ({
                    'cluster_id': cs.id,
                    'name': cs.name,
                    'state': cs.status.state,
                    'time': time_running
                })

        # the default case: running clusters
        if cs.status.state != 'RUNNING':
            continue

        steps = _list_all_steps(emr_conn, cs.id)

        running_steps = [
            step for step in steps if step.status.state == 'RUNNING'
        ]
        pending_steps = [
            step for step in steps if step.status.state == 'PENDING'
        ]

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:

                start_timestamp = step.status.timeline.startdatetime
                start = iso8601_to_datetime(start_timestamp)

                time_running = now - start

                if time_running >= min_time:
                    yield ({
                        'cluster_id': cs.id,
                        'name': step.name,
                        'state': step.status.state,
                        'time': time_running
                    })

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            step = pending_steps[0]

            # PENDING job should have run starting when the cluster
            # became ready, or the previous step completed
            start_timestamp = cs.status.timeline.readydatetime
            # use a separate loop variable so we don't clobber `step`
            # (the pending step reported below)
            for prev_step in steps:
                if prev_step.status.state == 'COMPLETED':
                    start_timestamp = prev_step.status.timeline.enddatetime

            start = iso8601_to_datetime(start_timestamp)
            time_pending = now - start

            if time_pending >= min_time:
                yield ({
                    'cluster_id': cs.id,
                    'name': step.name,
                    'state': step.status.state,
                    'time': time_pending
                })
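
A sketch of how the yielded dictionaries might be rendered into a report.
format_long_jobs is a hypothetical helper, not part of mrjob, and the record
below is made up but shaped like the dicts yielded above.

from datetime import timedelta

def format_long_jobs(long_jobs):
    # one report line per yielded dict
    return ['%(cluster_id)s: %(name)s (%(state)s for %(time)s)' % job
            for job in long_jobs]

example = {'cluster_id': 'j-SOMECLUSTER', 'name': 'my step',
           'state': 'PENDING', 'time': timedelta(hours=3)}
print('\n'.join(format_long_jobs([example])))
# j-SOMECLUSTER: my step (PENDING for 3:00:00)
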
Example #10
def find_long_running_jobs(job_flows, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param job_flows: a list of :py:class:`boto.emr.emrobject.JobFlow`
                      objects to inspect.
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running or
                     pending longer than this
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending longer than *min_time*, yields
    a dictionary with the following keys:

    * *job_flow_id*: the job flow's unique ID (e.g. ``j-SOMEJOBFLOW``)
    * *name*: name of the step, or the job flow when bootstrapping
    * *step_state*: state of the step, either ``'RUNNING'`` or ``'PENDING'``
    * *time*: amount of time step was running or pending, as a
              :py:class:`datetime.timedelta`
    """
    if now is None:
        now = datetime.utcnow()

    for jf in job_flows:

        # special case for jobs that are taking a long time to bootstrap
        if jf.state == 'BOOTSTRAPPING':
            start_timestamp = jf.startdatetime
            start = iso8601_to_datetime(start_timestamp)

            time_running = now - start

            if time_running >= min_time:
                # we tell bootstrapping info by step_state being empty,
                # and only use job_flow_id and time in the report
                yield({'job_flow_id': jf.jobflowid,
                       'name': jf.name,
                       'step_state': '',
                       'time': time_running})

        # the default case: running job flows
        if jf.state != 'RUNNING':
            continue

        running_steps = [step for step in jf.steps if step.state == 'RUNNING']
        pending_steps = [step for step in jf.steps if step.state == 'PENDING']

        if running_steps:
            # should be only one, but if not, we should know
            for step in running_steps:

                start_timestamp = step.startdatetime
                start = iso8601_to_datetime(start_timestamp)

                time_running = now - start

                if time_running >= min_time:
                    yield({'job_flow_id': jf.jobflowid,
                           'name': step.name,
                           'step_state': step.state,
                           'time': time_running})

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            step = pending_steps[0]

            # PENDING job should have run starting when the job flow
            # became ready, or the previous step completed
            start_timestamp = jf.readydatetime
            # use a separate loop variable so we don't clobber `step`
            # (the pending step reported below)
            for prev_step in jf.steps:
                if prev_step.state == 'COMPLETED':
                    start_timestamp = prev_step.enddatetime

            start = iso8601_to_datetime(start_timestamp)
            time_pending = now - start

            if time_pending >= min_time:
                yield({'job_flow_id': jf.jobflowid,
                       'name': step.name,
                       'step_state': step.state,
                       'time': time_pending})
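
One way to exercise the PENDING branch without touching EMR is to feed in
stand-in objects. Everything below (names, timestamps, the SimpleNamespace
stand-ins) is made up for the sketch, and it assumes find_long_running_jobs
and its iso8601_to_datetime helper are in scope and can parse these timestamps.

from datetime import datetime, timedelta
from types import SimpleNamespace

done = SimpleNamespace(name='step 1', state='COMPLETED',
                       enddatetime='2014-06-10T03:00:00.000Z')
waiting = SimpleNamespace(name='step 2', state='PENDING')
jf = SimpleNamespace(jobflowid='j-MOCKJOBFLOW', name='test flow',
                     state='RUNNING',
                     readydatetime='2014-06-10T02:00:00.000Z',
                     steps=[done, waiting])

now = datetime(2014, 6, 10, 6, 0, 0)
for record in find_long_running_jobs([jf], timedelta(hours=1), now=now):
    print(record)  # step 2 reported as PENDING for 3:00:00
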
Example #11
def find_long_running_jobs(emr_conn, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param emr_conn: an open EMR connection, used to look up each cluster's
                     steps
    :param cluster_summaries: a list of :py:class:`boto.emr.emrobject.Cluster`
                              objects to inspect.
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running or
                     pending longer than this
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending longer than *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if there
               is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
              :py:class:`datetime.timedelta`
    """
    if now is None:
        now = datetime.utcnow()

    for cs in cluster_summaries:

        # special case for jobs that are taking a long time to bootstrap
        if cs.status.state in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created_timestamp = cs.status.timeline.creationdatetime
            created = iso8601_to_datetime(created_timestamp)

            time_running = now - created

            if time_running >= min_time:
                yield({'cluster_id': cs.id,
                       'name': cs.name,
                       'state': cs.status.state,
                       'time': time_running})

        # the default case: running clusters
        if cs.status.state != 'RUNNING':
            continue

        steps = list(_yield_all_steps(emr_conn, cs.id))

        running_steps = [
            step for step in steps if step.status.state == 'RUNNING']
        pending_steps = [
            step for step in steps if step.status.state == 'PENDING']

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:

                start_timestamp = step.status.timeline.startdatetime
                start = iso8601_to_datetime(start_timestamp)

                time_running = now - start

                if time_running >= min_time:
                    yield({'cluster_id': cs.id,
                           'name': step.name,
                           'state': step.status.state,
                           'time': time_running})

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            step = pending_steps[0]

            # PENDING job should have run starting when the cluster
            # became ready, or the previous step completed
            start_timestamp = cs.status.timeline.readydatetime
            # use a separate loop variable so we don't clobber `step`
            # (the pending step reported below)
            for prev_step in steps:
                if prev_step.status.state == 'COMPLETED':
                    start_timestamp = prev_step.status.timeline.enddatetime

            start = iso8601_to_datetime(start_timestamp)
            time_pending = now - start

            if time_pending >= min_time:
                yield({'cluster_id': cs.id,
                       'name': step.name,
                       'state': step.status.state,
                       'time': time_pending})