Beispiel #1
0
def job_flow_to_usage_data(job_flow, basic_summary=None, now=None):
    r"""Break billing/usage information for a job flow down by job.

    :param job_flow: a :py:class:`boto.emr.EmrObject`
    :param basic_summary: a basic summary of the job flow, returned by
                          :py:func:`job_flow_to_basic_summary`. If this
                          is ``None`` (or empty), we'll call
                          :py:func:`job_flow_to_basic_summary` ourselves.
                          The keys read here are *start*, *ran*, *nih*,
                          *end*, *ready*, *label* and *owner*.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a list of dictionaries containing usage information, one for
    bootstrapping, and one for each step that ran or is currently running. If
    the job flow hasn't started yet, return ``[]``.

    Usage dictionaries have the following keys:

    * *end*: when the job finished running, or *now* if it's still running.
      For the bootstrapping step this is when the job flow became ready
      (or, if it never did, when the job flow ended). A step that started
      but has no end time in a finished job flow gets ``end == start``.
    * *end_billing*: the effective end of the job for billing purposes, either
      when the next job starts, the current time if the job
      is still running, or the end of the next full hour
      in the job flow.
    * *nih_billed*: normalized instances hours billed for this job or
      bootstrapping step
    * *nih_used*: normalized instance hours actually used for running
      the job or bootstrapping
    * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`)
    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours billed/used/billed but not used
    * *label*: job's label (usually the module name of the job), or for the
      bootstrapping step, the label of the job flow
    * *owner*: job's owner (usually the user that started it), or for the
      bootstrapping step, the owner of the job flow
    * *start*: when the job or bootstrapping step started, as a
      :py:class:`datetime.datetime`
    * *step_num*: the step number parsed from the step's name, or ``None``
      for the bootstrapping step and for steps whose name doesn't match
      ``STEP_NAME_RE``
    """
    jf = basic_summary or job_flow_to_basic_summary(job_flow)

    if now is None:
        now = datetime.utcnow()

    if not jf['start']:
        return []

    # Figure out billing rate per second for the job, given that
    # normalizedinstancehours is how much we're charged up until
    # the next full hour.
    full_hours = math.ceil(to_secs(jf['ran']) / 60.0 / 60.0)
    if full_hours > 0:
        nih_per_sec = jf['nih'] / (full_hours * 3600.0)
    else:
        # job flow has started but has run for zero time so far; there
        # is nothing to bill yet (and we must not divide by zero)
        nih_per_sec = 0.0

    # Don't actually count a step as billed for the full hour until
    # the job flow finishes. This means that our total "nih_billed"
    # will be less than normalizedinstancehours in the job flow, but it
    # also keeps stats stable for steps that have already finished.
    if jf['end']:
        jf_end_billing = jf['start'] + timedelta(hours=full_hours)
    else:
        jf_end_billing = now

    def _to_nih(secs_by_period):
        # convert a {period: seconds} map to {period: nih}
        return dict((period, nih_per_sec * secs)
                    for period, secs in secs_by_period.items())

    def _billed_but_not_used(billed, used):
        # per-period difference between billed and used; zero entries
        # are left out of the result
        bbnu = {}
        for period, nih_billed in billed.items():
            nih_bbnu = nih_billed - used.get(period, 0.0)
            if nih_bbnu:
                bbnu[period] = nih_bbnu
        return bbnu

    # add a fake step for the job that started the job flow, and credit
    # it for time spent bootstrapping. If the job flow ended without ever
    # becoming ready, bootstrapping stops when the job flow ended (not
    # *now*, which would keep growing on every run).
    intervals = [{
        'label': jf['label'],
        'owner': jf['owner'],
        'start': jf['start'],
        'end': jf['ready'] or jf['end'] or now,
        'step_num': None,
    }]

    for step in (getattr(job_flow, 'steps', None) or ()):
        # we've reached the last step that's actually run
        if not hasattr(step, 'startdatetime'):
            break

        step_start = to_datetime(step.startdatetime)

        step_end = to_datetime(getattr(step, 'enddatetime', None))
        if step_end is None:
            if jf['end']:
                # step started running and was cancelled. credit it for
                # 0 usage
                step_end = step_start
            else:
                # step is still running
                step_end = now

        # tolerate a missing (or None) step name
        m = STEP_NAME_RE.match(getattr(step, 'name', None) or '')
        if m:
            step_label = m.group(1)
            step_owner = m.group(2)
            step_num = int(m.group(6))
        else:
            step_label, step_owner, step_num = None, None, None

        intervals.append({
            'label': step_label,
            'owner': step_owner,
            'start': step_start,
            'end': step_end,
            'step_num': step_num,
        })

    # fill in end_billing: each interval is billed until the next one
    # starts; the last one is billed until the end of the job flow
    for interval, next_interval in zip(intervals, intervals[1:]):
        interval['end_billing'] = next_interval['start']

    intervals[-1]['end_billing'] = jf_end_billing

    # fill normalized usage information
    for interval in intervals:
        start = interval['start']
        end = interval['end']
        end_billing = interval['end_billing']

        interval['nih_used'] = nih_per_sec * to_secs(end - start)
        interval['date_to_nih_used'] = _to_nih(
            subdivide_interval_by_date(start, end))

        interval['nih_billed'] = nih_per_sec * to_secs(end_billing - start)
        interval['date_to_nih_billed'] = _to_nih(
            subdivide_interval_by_date(start, end_billing))

        # time billed but not used
        interval['nih_bbnu'] = interval['nih_billed'] - interval['nih_used']
        interval['date_to_nih_bbnu'] = _billed_but_not_used(
            interval['date_to_nih_billed'], interval['date_to_nih_used'])

    return intervals
Beispiel #2
0
def cluster_to_usage_data(cluster, basic_summary=None, now=None):
    r"""Break billing/usage information for a cluster down by job.

    :param cluster: a :py:class:`boto.emr.EmrObject`
    :param basic_summary: a basic summary of the cluster, returned by
                          :py:func:`cluster_to_basic_summary`. If this
                          is ``None`` (or empty), we'll call
                          :py:func:`cluster_to_basic_summary` ourselves.
                          The keys read here are *created*, *ran*, *nih*,
                          *end*, *ready*, *label* and *owner*.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a list of dictionaries containing usage information, one for
    bootstrapping, and one for each step that ran or is currently running. If
    the cluster hasn't been created yet, return ``[]``.

    Usage dictionaries have the following keys:

    * *end*: when the job finished running, or *now* if it's still running.
      For the bootstrapping step this is when the cluster became ready
      (or, if it never did, when the cluster ended). A step that started
      but has no end time in a finished cluster gets ``end == start``.
    * *end_billing*: the effective end of the job for billing purposes, either
      when the next job starts, the current time if the job
      is still running, or the end of the next full hour
      in the cluster.
    * *nih_billed*: normalized instances hours billed for this job or
      bootstrapping step
    * *nih_used*: normalized instance hours actually used for running
      the job or bootstrapping
    * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`)
    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours billed/used/billed but not used on that date
    * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number
      of normalized instance hours billed/used/billed but not used during
      the hour starting at that time
    * *label*: job's label (usually the module name of the job), or for the
      bootstrapping step, the label of the cluster
    * *owner*: job's owner (usually the user that started it), or for the
      bootstrapping step, the owner of the cluster
    * *start*: when the job or bootstrapping step started, as a
      :py:class:`datetime.datetime`
    * *step_num*: the step number parsed from the step's name, or ``None``
      for the bootstrapping step and for steps whose name doesn't match
      ``STEP_NAME_RE``
    """
    bcs = basic_summary or cluster_to_basic_summary(cluster)

    if now is None:
        now = datetime.utcnow()

    if not bcs['created']:
        return []

    # Figure out billing rate per second for the job, given that
    # normalizedinstancehours is how much we're charged up until
    # the next full hour.
    full_hours = math.ceil(to_secs(bcs['ran']) / 60.0 / 60.0)
    if full_hours > 0:
        nih_per_sec = bcs['nih'] / (full_hours * 3600.0)
    else:
        # cluster exists but has run for zero time so far; there is
        # nothing to bill yet (and we must not divide by zero)
        nih_per_sec = 0.0

    # Don't actually count a step as billed for the full hour until
    # the cluster finishes. This means that our total "nih_billed"
    # will be less than normalizedinstancehours in the cluster, but it
    # also keeps stats stable for steps that have already finished.
    if bcs['end']:
        cluster_end_billing = bcs['created'] + timedelta(hours=full_hours)
    else:
        cluster_end_billing = now

    def _to_nih(secs_by_period):
        # convert a {period: seconds} map to {period: nih}
        return dict((period, nih_per_sec * secs)
                    for period, secs in secs_by_period.items())

    def _billed_but_not_used(billed, used):
        # per-period difference between billed and used; zero entries
        # are left out of the result
        bbnu = {}
        for period, nih_billed in billed.items():
            nih_bbnu = nih_billed - used.get(period, 0.0)
            if nih_bbnu:
                bbnu[period] = nih_bbnu
        return bbnu

    # make a fake step for cluster startup and bootstrapping, so we don't
    # consider that wasted.
    intervals = [{
        'label': bcs['label'],
        'owner': bcs['owner'],
        'start': bcs['created'],
        'end': bcs['ready'] or bcs['end'] or now,
        'step_num': None,
    }]

    # "or ()" also covers a cluster whose steps attribute is None
    for step in (getattr(cluster, 'steps', None) or ()):
        step_status = getattr(step, 'status', None)
        step_timeline = getattr(step_status, 'timeline', None)

        # we've reached the last step that's actually run
        if not hasattr(step_timeline, 'startdatetime'):
            break

        step_start = to_datetime(step_timeline.startdatetime)

        step_end = to_datetime(getattr(step_timeline, 'enddatetime', None))
        if step_end is None:
            if bcs['end']:
                # step started running and was cancelled. credit it for
                # 0 usage
                step_end = step_start
            else:
                # step is still running
                step_end = now

        # tolerate a missing (or None) step name
        m = STEP_NAME_RE.match(getattr(step, 'name', None) or '')
        if m:
            step_label = m.group(1)
            step_owner = m.group(2)
            step_num = int(m.group(6))
        else:
            step_label, step_owner, step_num = None, None, None

        intervals.append({
            'label': step_label,
            'owner': step_owner,
            'start': step_start,
            'end': step_end,
            'step_num': step_num,
        })

    # fill in end_billing: each interval is billed until the next one
    # starts; the last one is billed until the end of the cluster
    for interval, next_interval in zip(intervals, intervals[1:]):
        interval['end_billing'] = next_interval['start']

    intervals[-1]['end_billing'] = cluster_end_billing

    # fill normalized usage information
    for interval in intervals:
        start = interval['start']
        end = interval['end']
        end_billing = interval['end_billing']

        interval['nih_used'] = nih_per_sec * to_secs(end - start)
        interval['date_to_nih_used'] = _to_nih(
            subdivide_interval_by_date(start, end))
        interval['hour_to_nih_used'] = _to_nih(
            subdivide_interval_by_hour(start, end))

        interval['nih_billed'] = nih_per_sec * to_secs(end_billing - start)
        interval['date_to_nih_billed'] = _to_nih(
            subdivide_interval_by_date(start, end_billing))
        interval['hour_to_nih_billed'] = _to_nih(
            subdivide_interval_by_hour(start, end_billing))

        # time billed but not used
        interval['nih_bbnu'] = interval['nih_billed'] - interval['nih_used']
        interval['date_to_nih_bbnu'] = _billed_but_not_used(
            interval['date_to_nih_billed'], interval['date_to_nih_used'])
        interval['hour_to_nih_bbnu'] = _billed_but_not_used(
            interval['hour_to_nih_billed'], interval['hour_to_nih_used'])

    return intervals
Beispiel #3
0
def job_flow_to_usage_data(job_flow, basic_summary=None, now=None):
    r"""Break billing/usage information for a job flow down by job.

    :param job_flow: a :py:class:`boto.emr.EmrObject`
    :param basic_summary: a basic summary of the job flow, returned by
                          :py:func:`job_flow_to_basic_summary`. If this
                          is ``None`` (or empty), we'll call
                          :py:func:`job_flow_to_basic_summary` ourselves.
                          The keys read here are *start*, *ran*, *nih*,
                          *end*, *ready*, *label* and *owner*.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a list of dictionaries containing usage information, one for
    bootstrapping, and one for each step that ran or is currently running. If
    the job flow hasn't started yet, return ``[]``.

    Usage dictionaries have the following keys:

    * *end*: when the job finished running, or *now* if it's still running.
      For the bootstrapping step this is when the job flow became ready
      (or, if it never did, when the job flow ended). A step that started
      but has no end time in a finished job flow gets ``end == start``.
    * *end_billing*: the effective end of the job for billing purposes, either
      when the next job starts, the current time if the job
      is still running, or the end of the next full hour
      in the job flow.
    * *nih_billed*: normalized instances hours billed for this job or
      bootstrapping step
    * *nih_used*: normalized instance hours actually used for running
      the job or bootstrapping
    * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`)
    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours billed/used/billed but not used on that date
    * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number
      of normalized instance hours billed/used/billed but not used during
      the hour starting at that time
    * *label*: job's label (usually the module name of the job), or for the
      bootstrapping step, the label of the job flow
    * *owner*: job's owner (usually the user that started it), or for the
      bootstrapping step, the owner of the job flow
    * *start*: when the job or bootstrapping step started, as a
      :py:class:`datetime.datetime`
    * *step_num*: the step number parsed from the step's name, or ``None``
      for the bootstrapping step and for steps whose name doesn't match
      ``STEP_NAME_RE``
    """
    jf = basic_summary or job_flow_to_basic_summary(job_flow)

    if now is None:
        now = datetime.utcnow()

    if not jf["start"]:
        return []

    # Figure out billing rate per second for the job, given that
    # normalizedinstancehours is how much we're charged up until
    # the next full hour.
    full_hours = math.ceil(to_secs(jf["ran"]) / 60.0 / 60.0)
    if full_hours > 0:
        nih_per_sec = jf["nih"] / (full_hours * 3600.0)
    else:
        # job flow has started but has run for zero time so far; there
        # is nothing to bill yet (and we must not divide by zero)
        nih_per_sec = 0.0

    # Don't actually count a step as billed for the full hour until
    # the job flow finishes. This means that our total "nih_billed"
    # will be less than normalizedinstancehours in the job flow, but it
    # also keeps stats stable for steps that have already finished.
    if jf["end"]:
        jf_end_billing = jf["start"] + timedelta(hours=full_hours)
    else:
        jf_end_billing = now

    def _to_nih(secs_by_period):
        # convert a {period: seconds} map to {period: nih}
        return dict((period, nih_per_sec * secs)
                    for period, secs in secs_by_period.items())

    def _billed_but_not_used(billed, used):
        # per-period difference between billed and used; zero entries
        # are left out of the result
        bbnu = {}
        for period, nih_billed in billed.items():
            nih_bbnu = nih_billed - used.get(period, 0.0)
            if nih_bbnu:
                bbnu[period] = nih_bbnu
        return bbnu

    # add a fake step for the job that started the job flow, and credit
    # it for time spent bootstrapping. If the job flow ended without ever
    # becoming ready, bootstrapping stops when the job flow ended (not
    # *now*, which would keep growing on every run).
    intervals = [{
        "label": jf["label"],
        "owner": jf["owner"],
        "start": jf["start"],
        "end": jf["ready"] or jf["end"] or now,
        "step_num": None,
    }]

    for step in getattr(job_flow, "steps", None) or ():
        # we've reached the last step that's actually run
        if not hasattr(step, "startdatetime"):
            break

        step_start = to_datetime(step.startdatetime)

        step_end = to_datetime(getattr(step, "enddatetime", None))
        if step_end is None:
            if jf["end"]:
                # step started running and was cancelled. credit it for
                # 0 usage
                step_end = step_start
            else:
                # step is still running
                step_end = now

        # tolerate a missing (or None) step name
        m = STEP_NAME_RE.match(getattr(step, "name", None) or "")
        if m:
            step_label = m.group(1)
            step_owner = m.group(2)
            step_num = int(m.group(6))
        else:
            step_label, step_owner, step_num = None, None, None

        intervals.append({
            "label": step_label,
            "owner": step_owner,
            "start": step_start,
            "end": step_end,
            "step_num": step_num,
        })

    # fill in end_billing: each interval is billed until the next one
    # starts; the last one is billed until the end of the job flow
    for interval, next_interval in zip(intervals, intervals[1:]):
        interval["end_billing"] = next_interval["start"]

    intervals[-1]["end_billing"] = jf_end_billing

    # fill normalized usage information
    for interval in intervals:
        start = interval["start"]
        end = interval["end"]
        end_billing = interval["end_billing"]

        interval["nih_used"] = nih_per_sec * to_secs(end - start)
        interval["date_to_nih_used"] = _to_nih(
            subdivide_interval_by_date(start, end))
        interval["hour_to_nih_used"] = _to_nih(
            subdivide_interval_by_hour(start, end))

        interval["nih_billed"] = nih_per_sec * to_secs(end_billing - start)
        interval["date_to_nih_billed"] = _to_nih(
            subdivide_interval_by_date(start, end_billing))
        interval["hour_to_nih_billed"] = _to_nih(
            subdivide_interval_by_hour(start, end_billing))

        # time billed but not used
        interval["nih_bbnu"] = interval["nih_billed"] - interval["nih_used"]
        interval["date_to_nih_bbnu"] = _billed_but_not_used(
            interval["date_to_nih_billed"], interval["date_to_nih_used"])
        interval["hour_to_nih_bbnu"] = _billed_but_not_used(
            interval["hour_to_nih_billed"], interval["hour_to_nih_used"])

    return intervals