Example #1
0
    def test_auto_owner(self):
        """With no explicit owner, the owner field of the job name is
        expected to match the ``USER`` environment variable.

        NOTE(review): this mutates ``os.environ`` and does not restore it
        here; presumably the enclosing test case's setup/teardown takes care
        of that -- confirm.
        """
        # the owner asserted below is expected to come from this variable,
        # so the two values must agree
        os.environ["USER"] = "mcp"
        runner = InlineMRJobRunner(conf_paths=[])
        match = JOB_NAME_RE.match(runner.get_job_name())

        # group 1 is the label, group 2 the owner
        self.assertEqual(match.group(1), "no_script")
        self.assertEqual(match.group(2), "mcp")
Example #2
0
    def test_empty_no_user(self):
        """A runner with no script and no resolvable user yields a job name
        whose label is 'no_script' and whose owner is 'no_user'."""
        # presumably tells the test fixture to make the user lookup fail;
        # confirm against the enclosing test case's setup
        self.getuser_should_fail = True

        job_name = InlineMRJobRunner(conf_paths=[]).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        self.assertEqual(label, 'no_script')
        self.assertEqual(owner, 'no_user')
Example #3
0
    def test_empty_no_user(self):
        """A runner with no script and no resolvable user yields a job name
        whose label is 'no_script' and whose owner is 'no_user'."""
        # presumably tells the test fixture to make the user lookup fail;
        # confirm against the enclosing test case's setup
        self.getuser_should_fail = True

        job_name = InlineMRJobRunner(conf_paths=[]).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        self.assertEqual(label, 'no_script')
        self.assertEqual(owner, 'no_user')
Example #4
0
    def test_owner_and_label_kwargs(self):
        """Explicit ``owner`` and ``label`` keyword arguments are used as the
        owner and label fields of the generated job name."""
        runner_kwargs = dict(conf_path=False, owner='ads', label='ads_chain')
        job_name = LocalMRJobRunner(**runner_kwargs).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        assert_equal(label, 'ads_chain')
        assert_equal(owner, 'ads')
Example #5
0
    def test_empty_no_user(self):
        """A runner with no script and no resolvable user yields a job name
        whose label is 'no_script' and whose owner is 'no_user'."""
        # presumably tells the test fixture to make the user lookup fail;
        # confirm against the enclosing test case's setup
        self.getuser_should_fail = True

        job_name = LocalMRJobRunner(conf_path=False).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        assert_equal(label, 'no_script')
        assert_equal(owner, 'no_user')
Example #6
0
    def test_auto_owner(self):
        """With no explicit owner, the owner field of the job name is
        expected to match the ``USER`` environment variable.

        NOTE(review): this mutates ``os.environ`` and does not restore it
        here; presumably the enclosing test case's setup/teardown takes care
        of that -- confirm.
        """
        # the owner asserted below is expected to come from this variable,
        # so the two values must agree
        os.environ['USER'] = 'mcp'
        runner = LocalMRJobRunner(conf_path=False)
        match = JOB_NAME_RE.match(runner.get_job_name())

        # group 1 is the label, group 2 the owner
        assert_equal(match.group(1), 'no_script')
        assert_equal(match.group(2), 'mcp')
Example #7
0
    def test_owner_and_label_kwargs(self):
        """Explicit ``owner`` and ``label`` keyword arguments are used as the
        owner and label fields of the generated job name."""
        runner_kwargs = dict(conf_paths=[], owner='ads', label='ads_chain')
        job_name = InlineMRJobRunner(**runner_kwargs).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        self.assertEqual(label, 'ads_chain')
        self.assertEqual(owner, 'ads')
Example #8
0
    def test_auto_owner(self):
        """With no explicit owner, the owner field of the job name is
        expected to match the ``USER`` environment variable.

        NOTE(review): this mutates ``os.environ`` and does not restore it
        here; presumably the enclosing test case's setup/teardown takes care
        of that -- confirm.
        """
        # the owner asserted below is expected to come from this variable,
        # so the two values must agree
        os.environ['USER'] = 'mcp'
        runner = InlineMRJobRunner(conf_paths=[])
        match = JOB_NAME_RE.match(runner.get_job_name())

        # group 1 is the label, group 2 the owner
        self.assertEqual(match.group(1), 'no_script')
        self.assertEqual(match.group(2), 'mcp')
Example #9
0
    def test_owner_and_label_switches(self):
        """The ``--owner`` and ``--label`` command-line switches are used as
        the owner and label fields of the generated job name."""
        job = MRTwoStepJob(['--no-conf', '--owner=ads', '--label=ads_chain'])
        job_name = job.make_runner().get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        self.assertEqual(label, 'ads_chain')
        self.assertEqual(owner, 'ads')
Example #10
0
    def test_owner_and_label_kwargs(self):
        """Explicit ``owner`` and ``label`` keyword arguments are used as the
        owner and label fields of the generated job name."""
        runner_kwargs = dict(conf_path=False, owner='ads', label='ads_chain')
        job_name = LocalMRJobRunner(**runner_kwargs).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        assert_equal(label, 'ads_chain')
        assert_equal(owner, 'ads')
Example #11
0
    def test_auto_owner(self):
        """With no explicit owner, the owner field of the job name is
        expected to match the ``USER`` environment variable.

        NOTE(review): this mutates ``os.environ`` and does not restore it
        here; presumably the enclosing test case's setup/teardown takes care
        of that -- confirm.
        """
        # the owner asserted below is expected to come from this variable,
        # so the two values must agree
        os.environ['USER'] = 'mcp'
        runner = LocalMRJobRunner(conf_path=False)
        match = JOB_NAME_RE.match(runner.get_job_name())

        # group 1 is the label, group 2 the owner
        assert_equal(match.group(1), 'no_script')
        assert_equal(match.group(2), 'mcp')
Example #12
0
    def test_empty_no_user(self):
        """A runner with no script and no resolvable user yields a job name
        whose label is 'no_script' and whose owner is 'no_user'."""
        # presumably tells the test fixture to make the user lookup fail;
        # confirm against the enclosing test case's setup
        self.getuser_should_fail = True

        job_name = LocalMRJobRunner(conf_path=False).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        assert_equal(label, 'no_script')
        assert_equal(owner, 'no_user')
Example #13
0
    def test_owner_and_label_kwargs(self):
        """Explicit ``owner`` and ``label`` keyword arguments are used as the
        owner and label fields of the generated job name."""
        runner_kwargs = dict(conf_paths=[], owner='ads', label='ads_chain')
        job_name = InlineMRJobRunner(**runner_kwargs).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        self.assertEqual(label, 'ads_chain')
        self.assertEqual(owner, 'ads')
Example #14
0
    def test_owner_and_label_switches(self):
        """The ``--owner`` and ``--label`` command-line switches are used as
        the owner and label fields of the generated job name."""
        job = MRTwoStepJob(['--no-conf', '--owner=ads', '--label=ads_chain'])
        job_name = job.make_runner().get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        self.assertEqual(label, 'ads_chain')
        self.assertEqual(owner, 'ads')
Example #15
0
    def test_auto_owner(self):
        """With no explicit owner, the owner field of the job name is
        expected to match the ``USER`` environment variable.

        NOTE(review): this mutates ``os.environ`` and does not restore it
        here; presumably the enclosing test case's setup/teardown takes care
        of that -- confirm.
        """
        # the owner asserted below is expected to come from this variable,
        # so the two values must agree
        os.environ['USER'] = 'mcp'
        runner = InlineMRJobRunner(conf_paths=[])
        match = JOB_NAME_RE.match(runner.get_job_name())

        # group 1 is the label, group 2 the owner
        self.assertEqual(match.group(1), 'no_script')
        self.assertEqual(match.group(2), 'mcp')
Example #16
0
    def test_auto_everything(self):
        """With no label or owner given, the job name is expected to carry the
        label 'mr_two_step_job', the owner from the ``USER`` environment
        variable, and a UTC timestamp taken while the test was running.

        NOTE(review): this mutates ``os.environ`` and does not restore it
        here; presumably the enclosing test case's setup/teardown takes care
        of that -- confirm.
        """
        test_start = datetime.datetime.utcnow()

        # the owner asserted below is expected to come from this variable,
        # so the two values must agree
        os.environ["USER"] = "mcp"
        runner = MRTwoStepJob(["--no-conf"]).make_runner()
        match = JOB_NAME_RE.match(runner.get_job_name())

        # group 1 is the label, group 2 the owner
        self.assertEqual(match.group(1), "mr_two_step_job")
        self.assertEqual(match.group(2), "mcp")

        # groups 3 and 4 are the date and time of day; group 5 is microseconds
        job_start = datetime.datetime.strptime(match.group(3) + match.group(4), "%Y%m%d%H%M%S")
        job_start = job_start.replace(microsecond=int(match.group(5)))
        # the name's timestamp must fall within 5 seconds after the test began
        self.assertGreaterEqual(job_start, test_start)
        self.assertLessEqual(job_start - test_start, datetime.timedelta(seconds=5))
Example #17
0
    def test_auto_everything(self):
        """With no label or owner given, the job name is expected to carry the
        label 'mr_two_step_job', the owner from the ``USER`` environment
        variable, and a UTC timestamp taken while the test was running.

        NOTE(review): this mutates ``os.environ`` and does not restore it
        here; presumably the enclosing test case's setup/teardown takes care
        of that -- confirm.
        """
        test_start = datetime.datetime.utcnow()

        # the owner asserted below is expected to come from this variable,
        # so the two values must agree
        os.environ['USER'] = 'mcp'
        runner = MRTwoStepJob(['--no-conf']).make_runner()
        match = JOB_NAME_RE.match(runner.get_job_name())

        # group 1 is the label, group 2 the owner
        assert_equal(match.group(1), 'mr_two_step_job')
        assert_equal(match.group(2), 'mcp')

        # groups 3 and 4 are the date and time of day; group 5 is microseconds
        job_start = datetime.datetime.strptime(
            match.group(3) + match.group(4), '%Y%m%d%H%M%S')
        job_start = job_start.replace(microsecond=int(match.group(5)))
        # the name's timestamp must fall within 5 seconds after the test began
        assert_gte(job_start, test_start)
        assert_lte(job_start - test_start, datetime.timedelta(seconds=5))
Example #18
0
    def test_auto_everything(self):
        """With no label or owner given, the job name is expected to carry the
        label 'mr_two_step_job', the owner from the ``USER`` environment
        variable, and a UTC timestamp taken while the test was running.

        NOTE(review): this mutates ``os.environ`` and does not restore it
        here; presumably the enclosing test case's setup/teardown takes care
        of that -- confirm.
        """
        test_start = datetime.datetime.utcnow()

        # the owner asserted below is expected to come from this variable,
        # so the two values must agree
        os.environ['USER'] = 'mcp'
        runner = MRTwoStepJob(['--no-conf']).make_runner()
        match = JOB_NAME_RE.match(runner.get_job_name())

        # group 1 is the label, group 2 the owner
        assert_equal(match.group(1), 'mr_two_step_job')
        assert_equal(match.group(2), 'mcp')

        # groups 3 and 4 are the date and time of day; group 5 is microseconds
        job_start = datetime.datetime.strptime(
            match.group(3) + match.group(4), '%Y%m%d%H%M%S')
        job_start = job_start.replace(microsecond=int(match.group(5)))
        # the name's timestamp must fall within 5 seconds after the test began
        assert_gte(job_start, test_start)
        assert_lte(job_start - test_start, datetime.timedelta(seconds=5))
Example #19
0
def print_report(options):

    emr_conn = EMRJobRunner(conf_path=options.conf_path).make_emr_conn()

    log.info('getting job flow history...')
    # microseconds just make our report messy
    now = datetime.datetime.utcnow().replace(microsecond=0)

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if options.max_days_ago is not None:
        created_after = now - datetime.timedelta(days=options.max_days_ago)

    job_flows = describe_all_job_flows(emr_conn, created_after=created_after)

    job_flow_infos = []
    for jf in job_flows:
        job_flow_info = {}

        job_flow_info['id'] = jf.jobflowid

        job_flow_info['name'] = jf.name

        job_flow_info['created'] = to_datetime(jf.creationdatetime)

        start_time = to_datetime(getattr(jf, 'startdatetime', None))
        if start_time:
            end_time = to_datetime(getattr(jf, 'enddatetime', None)) or now
            job_flow_info['ran'] = end_time - start_time
        else:
            job_flow_info['ran'] = datetime.timedelta(0)

        job_flow_info['state'] = jf.state

        job_flow_info['num_steps'] = len(jf.steps or [])

        # this looks to be an integer, but let's protect against
        # future changes
        job_flow_info['hours'] = float(jf.normalizedinstancehours)

        # estimate hours billed but not used
        job_flow_info['hours_bbnu'] = (
            job_flow_info['hours'] *
            estimate_proportion_billed_but_not_used(jf))

        # split out mr job name and user
        # jobs flows created by MRJob have names like:
        # mr_word_freq_count.dave.20101103.121249.638552
        match = JOB_NAME_RE.match(jf.name)
        if match:
            job_flow_info['mr_job_name'] = match.group(1)
            job_flow_info['user'] = match.group(2)
        else:
            # not run by mrjob
            job_flow_info['mr_job_name'] = None
            job_flow_info['user'] = None

        job_flow_infos.append(job_flow_info)

    if not job_flow_infos:
        print 'No job flows created in the past two months!'
        return

    earliest = min(info['created'] for info in job_flow_infos)
    latest = max(info['created'] for info in job_flow_infos)

    print 'Total # of Job Flows: %d' % len(job_flow_infos)
    print

    print '* All times are in UTC.'
    print


    print 'Min create time: %s' % earliest
    print 'Max create time: %s' % latest
    print '   Current time: %s' % now
    print

    print '* All usage is measured in Normalized Instance Hours, which are'
    print '  roughly equivalent to running an m1.small instance for an hour.'
    print

    # total compute-unit hours used
    total_hours = sum(info['hours'] for info in job_flow_infos)
    print 'Total Usage: %d' % total_hours
    print

    print '* Time billed but not used is estimated, and may not match'
    print "  Amazon's billing system exactly."
    print

    total_hours_bbnu = sum(info['hours_bbnu'] for info in job_flow_infos)
    print 'Total time billed but not used (waste): %.2f' % total_hours_bbnu
    print

    date_to_hours = defaultdict(float)
    date_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        date_created = info['created'].date()
        date_to_hours[date_created] += info['hours']
        date_to_hours_bbnu[date_created] += info['hours_bbnu']
    print 'Daily statistics:'
    print
    print ' date        usage     waste'
    d = latest.date()
    while d >= earliest.date():
        print ' %10s %6d %9.2f' % (d, date_to_hours[d], date_to_hours_bbnu[d])
        d -= datetime.timedelta(days=1)
    print

    def fmt(mr_job_name_or_user):
        if mr_job_name_or_user:
            return mr_job_name_or_user
        else:
            return '(not started by mrjob)'

    print '* Job flows are considered to belong to the user and job that'
    print '  started them (even if other jobs use the job flow).'
    print

    # Top jobs
    print 'Top jobs, by total usage:'
    mr_job_name_to_hours = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours[info['mr_job_name']] += info['hours']
    for mr_job_name, hours in sorted(mr_job_name_to_hours.iteritems(),
                                     key=lambda (n, h): (-h, n)):
        print '  %6d %s' % (hours, fmt(mr_job_name))
    print

    print 'Top jobs, by time billed but not used:'
    mr_job_name_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours_bbnu[info['mr_job_name']] += info['hours_bbnu']
    for mr_job_name, hours_bbnu in sorted(mr_job_name_to_hours_bbnu.iteritems(),
                                     key=lambda (n, h): (-h, n)):
        print '  %9.2f %s' % (hours_bbnu, fmt(mr_job_name))
    print

    # Top users
    print 'Top users, by total usage:'
    user_to_hours = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours[info['user']] += info['hours']
    for user, hours in sorted(user_to_hours.iteritems(),
                              key=lambda (n, h): (-h, n)):
        print '  %6d %s' % (hours, fmt(user))
    print

    print 'Top users, by time billed but not used:'
    user_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours_bbnu[info['user']] += info['hours_bbnu']
    for user, hours_bbnu in sorted(user_to_hours_bbnu.iteritems(),
                              key=lambda (n, h): (-h, n)):
        print '  %9.2f %s' % (hours_bbnu, fmt(user))
    print

    # Top job flows
    print 'All job flows, by total usage:'
    top_job_flows = sorted(job_flow_infos,
                           key=lambda i: (-i['hours'], i['name']))
    for info in top_job_flows:
        print '  %6d %-15s %s' % (info['hours'], info['id'], info['name'])
    print

    print 'All job flows, by time billed but not used:'
    top_job_flows_bbnu = sorted(job_flow_infos,
                           key=lambda i: (-i['hours_bbnu'], i['name']))
    for info in top_job_flows_bbnu:
        print '  %9.2f %-15s %s' % (
            info['hours_bbnu'], info['id'], info['name'])
    print

    print 'Details for all job flows:'
    print
    print ' id              state         created             steps        time ran  usage     waste   user   name'

    all_job_flows = sorted(job_flow_infos, key=lambda i: i['created'],
                           reverse=True)
    for info in all_job_flows:
        print ' %-15s %-13s %19s %3d %17s %6d %9.2f %8s %s' % (
            info['id'], info['state'], info['created'], info['num_steps'],
            info['ran'], info['hours'], info['hours_bbnu'],
            (info['user'] or ''), fmt(info['mr_job_name']))
Example #20
0
def job_flow_to_basic_summary(job_flow, now=None):
    """Extract fields such as creation time, owner, etc. from the job flow,
    so we can safely reference them without using :py:func:`getattr`.

    :param job_flow: a :py:class:`boto.emr.EmrObject`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with the following keys. These will be ``None`` if the
    corresponding field in the job flow is unavailable (except where a
    different fallback is noted below).

    * *created*: UTC `datetime.datetime` that the job flow was created,
      or ``None``
    * *end*: UTC `datetime.datetime` that the job flow finished, or ``None``
    * *id*: job flow ID, or ``None`` (this should never happen)
    * *label*: The label for the job flow (usually the module name of the
      :py:class:`~mrjob.job.MRJob` script that started it), or
      ``None`` for non-:py:mod:`mrjob` job flows.
    * *name*: job flow name, or ``None`` (this should never happen)
    * *nih*: number of normalized instance hours used by the job flow
      (``0.0`` if the job flow doesn't report any).
    * *num_steps*: Number of steps in the job flow (``0`` if it has none).
    * *owner*: The owner for the job flow (usually the user that started it),
      or ``None`` for non-:py:mod:`mrjob` job flows.
    * *pool*: pool name (e.g. ``'default'``) if the job flow is pooled,
      otherwise ``None``.
    * *ran*: How long the job flow ran, or has been running, as a
      :py:class:`datetime.timedelta`. This will be ``timedelta(0)`` if
      the job flow hasn't started.
    * *ready*: UTC `datetime.datetime` that the job flow finished
      bootstrapping, or ``None``
    * *start*: UTC `datetime.datetime` that the job flow became available, or
      ``None``
    * *state*: The job flow's state as a string (e.g. ``'RUNNING'``)
    """
    if now is None:
        now = datetime.utcnow()

    jf = {}  # summary to fill in

    jf['id'] = getattr(job_flow, 'jobflowid', None)
    jf['name'] = getattr(job_flow, 'name', None)

    jf['created'] = to_datetime(getattr(job_flow, 'creationdatetime', None))
    jf['start'] = to_datetime(getattr(job_flow, 'startdatetime', None))
    jf['ready'] = to_datetime(getattr(job_flow, 'readydatetime', None))
    jf['end'] = to_datetime(getattr(job_flow, 'enddatetime', None))

    # a job flow that is still running is measured up to *now*
    if jf['start']:
        jf['ran'] = (jf['end'] or now) - jf['start']
    else:
        jf['ran'] = timedelta(0)

    jf['state'] = getattr(job_flow, 'state', None)

    jf['num_steps'] = len(getattr(job_flow, 'steps', None) or ())

    # a job flow counts as pooled when its last bootstrap action has exactly
    # two args and the first starts with 'pool-'; the second is the pool name
    jf['pool'] = None
    bootstrap_actions = getattr(job_flow, 'bootstrapactions', None)
    if bootstrap_actions:
        args = [arg.value for arg in bootstrap_actions[-1].args]
        if len(args) == 2 and args[0].startswith('pool-'):
            jf['pool'] = args[1]

    # the name may be None (attribute missing *or* explicitly unset), which
    # the regex can't match against, so fall back to the empty string
    m = JOB_NAME_RE.match(jf['name'] or '')
    if m:
        jf['label'], jf['owner'] = m.group(1), m.group(2)
    else:
        jf['label'], jf['owner'] = None, None

    jf['nih'] = float(getattr(job_flow, 'normalizedinstancehours', '0'))

    return jf
Example #21
0
    def test_empty(self):
        """A runner created with no script and no owner gets the label
        'no_script' and, as owner, whatever ``getpass.getuser()`` returns."""
        job_name = LocalMRJobRunner(conf_path=False).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        assert_equal(label, 'no_script')
        assert_equal(owner, getpass.getuser())
Example #22
0
    def test_empty(self):
        """A runner created with no script and no owner gets the label
        'no_script' and, as owner, whatever ``getpass.getuser()`` returns."""
        job_name = LocalMRJobRunner(conf_path=False).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        assert_equal(label, 'no_script')
        assert_equal(owner, getpass.getuser())
Example #23
0
    def test_auto_label(self):
        """A job run without an explicit label gets 'mr_two_step_job' as its
        label and, as owner, whatever ``getpass.getuser()`` returns."""
        job_name = MRTwoStepJob(['--no-conf']).make_runner().get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        self.assertEqual(label, 'mr_two_step_job')
        self.assertEqual(owner, getpass.getuser())
Example #24
0
    def test_empty(self):
        """A runner created with no script and no owner gets the label
        'no_script' and, as owner, whatever ``getpass.getuser()`` returns."""
        job_name = InlineMRJobRunner(conf_paths=[]).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        self.assertEqual(label, 'no_script')
        self.assertEqual(owner, getpass.getuser())
Example #25
0
def print_report(options):

    emr_conn = EMRJobRunner(conf_path=options.conf_path).make_emr_conn()

    log.info('getting job flow history...')
    # microseconds just make our report messy
    now = datetime.datetime.utcnow().replace(microsecond=0)

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if options.max_days_ago is not None:
        created_after = now - datetime.timedelta(days=options.max_days_ago)

    job_flows = describe_all_job_flows(emr_conn, created_after=created_after)

    job_flow_infos = []
    for jf in job_flows:
        job_flow_info = {}

        job_flow_info['id'] = jf.jobflowid

        job_flow_info['name'] = jf.name

        job_flow_info['created'] = to_datetime(jf.creationdatetime)

        start_time = to_datetime(getattr(jf, 'startdatetime', None))
        if start_time:
            end_time = to_datetime(getattr(jf, 'enddatetime', None)) or now
            job_flow_info['ran'] = end_time - start_time
        else:
            job_flow_info['ran'] = datetime.timedelta(0)

        job_flow_info['state'] = jf.state

        job_flow_info['num_steps'] = len(jf.steps or [])

        # this looks to be an integer, but let's protect against
        # future changes
        job_flow_info['hours'] = float(jf.normalizedinstancehours)

        # estimate hours billed but not used
        job_flow_info['hours_bbnu'] = (
            job_flow_info['hours'] *
            estimate_proportion_billed_but_not_used(jf))

        # split out mr job name and user
        # jobs flows created by MRJob have names like:
        # mr_word_freq_count.dave.20101103.121249.638552
        match = JOB_NAME_RE.match(jf.name)
        if match:
            job_flow_info['mr_job_name'] = match.group(1)
            job_flow_info['user'] = match.group(2)
        else:
            # not run by mrjob
            job_flow_info['mr_job_name'] = None
            job_flow_info['user'] = None

        job_flow_infos.append(job_flow_info)

    if not job_flow_infos:
        print 'No job flows created in the past two months!'
        return

    earliest = min(info['created'] for info in job_flow_infos)
    latest = max(info['created'] for info in job_flow_infos)

    print 'Total # of Job Flows: %d' % len(job_flow_infos)
    print

    print '* All times are in UTC.'
    print

    print 'Min create time: %s' % earliest
    print 'Max create time: %s' % latest
    print '   Current time: %s' % now
    print

    print '* All usage is measured in Normalized Instance Hours, which are'
    print '  roughly equivalent to running an m1.small instance for an hour.'
    print

    # total compute-unit hours used
    total_hours = sum(info['hours'] for info in job_flow_infos)
    print 'Total Usage: %d' % total_hours
    print

    print '* Time billed but not used is estimated, and may not match'
    print "  Amazon's billing system exactly."
    print

    total_hours_bbnu = sum(info['hours_bbnu'] for info in job_flow_infos)
    print 'Total time billed but not used (waste): %.2f' % total_hours_bbnu
    print

    date_to_hours = defaultdict(float)
    date_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        date_created = info['created'].date()
        date_to_hours[date_created] += info['hours']
        date_to_hours_bbnu[date_created] += info['hours_bbnu']
    print 'Daily statistics:'
    print
    print ' date        usage     waste'
    d = latest.date()
    while d >= earliest.date():
        print ' %10s %6d %9.2f' % (d, date_to_hours[d], date_to_hours_bbnu[d])
        d -= datetime.timedelta(days=1)
    print

    def fmt(mr_job_name_or_user):
        if mr_job_name_or_user:
            return mr_job_name_or_user
        else:
            return '(not started by mrjob)'

    print '* Job flows are considered to belong to the user and job that'
    print '  started them (even if other jobs use the job flow).'
    print

    # Top jobs
    print 'Top jobs, by total usage:'
    mr_job_name_to_hours = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours[info['mr_job_name']] += info['hours']
    for mr_job_name, hours in sorted(mr_job_name_to_hours.iteritems(),
                                     key=lambda (n, h): (-h, n)):
        print '  %6d %s' % (hours, fmt(mr_job_name))
    print

    print 'Top jobs, by time billed but not used:'
    mr_job_name_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours_bbnu[info['mr_job_name']] += info['hours_bbnu']
    for mr_job_name, hours_bbnu in sorted(
            mr_job_name_to_hours_bbnu.iteritems(), key=lambda (n, h): (-h, n)):
        print '  %9.2f %s' % (hours_bbnu, fmt(mr_job_name))
    print

    # Top users
    print 'Top users, by total usage:'
    user_to_hours = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours[info['user']] += info['hours']
    for user, hours in sorted(user_to_hours.iteritems(),
                              key=lambda (n, h): (-h, n)):
        print '  %6d %s' % (hours, fmt(user))
    print

    print 'Top users, by time billed but not used:'
    user_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours_bbnu[info['user']] += info['hours_bbnu']
    for user, hours_bbnu in sorted(user_to_hours_bbnu.iteritems(),
                                   key=lambda (n, h): (-h, n)):
        print '  %9.2f %s' % (hours_bbnu, fmt(user))
    print

    # Top job flows
    print 'All job flows, by total usage:'
    top_job_flows = sorted(job_flow_infos,
                           key=lambda i: (-i['hours'], i['name']))
    for info in top_job_flows:
        print '  %6d %-15s %s' % (info['hours'], info['id'], info['name'])
    print

    print 'All job flows, by time billed but not used:'
    top_job_flows_bbnu = sorted(job_flow_infos,
                                key=lambda i: (-i['hours_bbnu'], i['name']))
    for info in top_job_flows_bbnu:
        print '  %9.2f %-15s %s' % (info['hours_bbnu'], info['id'],
                                    info['name'])
    print

    print 'Details for all job flows:'
    print
    print ' id              state         created             steps        time ran  usage     waste   user   name'

    all_job_flows = sorted(job_flow_infos,
                           key=lambda i: i['created'],
                           reverse=True)
    for info in all_job_flows:
        print ' %-15s %-13s %19s %3d %17s %6d %9.2f %8s %s' % (
            info['id'], info['state'], info['created'], info['num_steps'],
            info['ran'], info['hours'], info['hours_bbnu'],
            (info['user'] or ''), fmt(info['mr_job_name']))
Example #26
0
    def test_end_to_end(self):
        """Run MRTwoStepJob on the EMR runner against mock S3/EMR data.

        Feeds input from stdin, a local file and an S3 path, then checks:
        the parsed output, the job flow's state and name, which steps get
        the input/output format switches, how mrjob.tar.gz is registered,
        that temp/output files are cleaned up after the runner exits, and
        that the job flow ends up TERMINATED.
        """
        # read from STDIN, a local file, and a remote file
        stdin = StringIO('foo\nbar\n')

        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'w') as local_input_file:
            local_input_file.write('bar\nqux\n')

        remote_input_path = 's3://walrus/data/foo'
        self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

        # setup fake output
        # (keyed by (job flow ID, step number) -- presumably the step whose
        # output the mock should serve; confirm against the mock's docs)
        self.mock_emr_output = {
            ('j-MOCKJOBFLOW0', 1):
            ['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n']
        }

        mr_job = MRTwoStepJob([
            '-r', 'emr', '-v', '-c', self.mrjob_conf_path, '-',
            local_input_path, remote_input_path, '--hadoop-input-format',
            'FooFormat', '--hadoop-output-format', 'BarFormat'
        ])
        mr_job.sandbox(stdin=stdin)

        local_tmp_dir = None
        results = []

        # taken before the runner exists, so we can detect any change that
        # merely creating the runner makes to mock S3
        mock_s3_fs_snapshot = copy.deepcopy(self.mock_s3_fs)

        with mr_job.make_runner() as runner:
            assert isinstance(runner, EMRJobRunner)

            # make sure that initializing the runner doesn't affect S3
            # (Issue #50)
            assert_equal(mock_s3_fs_snapshot, self.mock_s3_fs)

            runner.run()

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            local_tmp_dir = runner._get_local_tmp_dir()
            # make sure cleanup hasn't happened yet
            assert os.path.exists(local_tmp_dir)
            assert any(runner.ls(runner.get_output_dir()))

            emr_conn = runner.make_emr_conn()
            job_flow = emr_conn.describe_jobflow(runner.get_emr_job_flow_id())
            assert_equal(job_flow.state, 'COMPLETED')
            # the job flow's name should carry the job's label and the
            # current user
            name_match = JOB_NAME_RE.match(job_flow.name)
            assert_equal(name_match.group(1), 'mr_two_step_job')
            assert_equal(name_match.group(2), getpass.getuser())

            # make sure our input and output formats are attached to
            # the correct steps
            assert_in('-inputformat', job_flow.steps[0].args)
            assert_not_in('-outputformat', job_flow.steps[0].args)
            assert_not_in('-inputformat', job_flow.steps[1].args)
            assert_in('-outputformat', job_flow.steps[1].args)

            # make sure mrjob.tar.gz is created and uploaded as
            # a bootstrap file
            assert runner._mrjob_tar_gz_path
            mrjob_tar_gz_file_dicts = [
                file_dict for file_dict in runner._files
                if file_dict['path'] == runner._mrjob_tar_gz_path
            ]

            # exactly one registered file should point at the tarball
            assert_equal(len(mrjob_tar_gz_file_dicts), 1)

            mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
            assert mrjob_tar_gz_file_dict['name']
            assert_equal(mrjob_tar_gz_file_dict.get('bootstrap'), 'file')

            # shouldn't be in PYTHONPATH (we dump it directly in site-packages)
            pythonpath = runner._get_cmdenv().get('PYTHONPATH') or ''
            assert_not_in(mrjob_tar_gz_file_dict['name'],
                          pythonpath.split(':'))

        # output order isn't checked, only its contents
        assert_equal(sorted(results), [(1, 'qux'), (2, 'bar'), (2, 'foo'),
                                       (5, None)])

        # make sure cleanup happens
        assert not os.path.exists(local_tmp_dir)
        assert not any(runner.ls(runner.get_output_dir()))

        # job should get terminated
        emr_conn = runner.make_emr_conn()
        job_flow_id = runner.get_emr_job_flow_id()
        # presumably each call advances the mock job flow by one state
        # change, and 10 calls are enough to reach the final state -- confirm
        for i in range(10):
            emr_conn.simulate_progress(job_flow_id)

        job_flow = emr_conn.describe_jobflow(job_flow_id)
        assert_equal(job_flow.state, 'TERMINATED')
Example #27
0
    def test_empty(self):
        """A runner created with no script and no owner gets the label
        'no_script' and, as owner, whatever ``getpass.getuser()`` returns."""
        job_name = InlineMRJobRunner(conf_paths=[]).get_job_name()
        label, owner = JOB_NAME_RE.match(job_name).group(1, 2)

        self.assertEqual(label, 'no_script')
        self.assertEqual(owner, getpass.getuser())
Example #28
0
 def test_job_name_not_specified(self):
     """When no job name is configured, the runner's ``job_name`` option is
     falsy and the generated name still matches ``JOB_NAME_RE``."""
     with MRWordCount().make_runner() as job_runner:
         configured_name = job_runner._opts['job_name']
         self.assertFalse(configured_name)

         generated_name = job_runner.get_job_name()
         self.assertIsNotNone(JOB_NAME_RE.match(generated_name))
Example #29
0
def job_flow_to_basic_summary(job_flow, now=None):
    """Extract fields such as creation time, owner, etc. from the job flow,
    so we can safely reference them without using :py:func:`getattr`.

    :param job_flow: a :py:class:`boto.emr.EmrObject`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with the following keys. Unless noted otherwise,
    a value is ``None`` if the corresponding field in the job flow is
    unavailable.

    * *created*: UTC `datetime.datetime` that the job flow was created,
      or ``None``
    * *end*: UTC `datetime.datetime` that the job flow finished, or ``None``
    * *id*: job flow ID, or ``None`` (this should never happen)
    * *label*: The label for the job flow (usually the module name of the
      :py:class:`~mrjob.job.MRJob` script that started it), or
      ``None`` for non-:py:mod:`mrjob` job flows.
    * *name*: job flow name, or ``None`` (this should never happen)
    * *nih*: number of normalized instance hours used by the job flow, as a
      float (``0.0`` if the field is missing or empty).
    * *num_steps*: Number of steps in the job flow (``0`` if the job flow
      has no steps or the field is missing).
    * *owner*: The owner for the job flow (usually the user that started it),
      or ``None`` for non-:py:mod:`mrjob` job flows.
    * *pool*: pool name (e.g. ``'default'``) if the job flow is pooled,
      otherwise ``None``.
    * *ran*: How long the job flow ran, or has been running, as a
      :py:class:`datetime.timedelta`. This will be ``timedelta(0)`` if
      the job flow hasn't started.
    * *ready*: UTC `datetime.datetime` that the job flow finished
      bootstrapping, or ``None``
    * *start*: UTC `datetime.datetime` that the job flow became available, or
      ``None``
    * *state*: The job flow's state as a string (e.g. ``'RUNNING'``)
    """
    if now is None:
        now = datetime.utcnow()

    jf = {}  # summary to fill in

    jf['id'] = getattr(job_flow, 'jobflowid', None)
    jf['name'] = getattr(job_flow, 'name', None)

    # timestamps go through to_datetime(); a missing attribute is passed
    # through as None
    jf['created'] = to_datetime(getattr(job_flow, 'creationdatetime', None))
    jf['start'] = to_datetime(getattr(job_flow, 'startdatetime', None))
    jf['ready'] = to_datetime(getattr(job_flow, 'readydatetime', None))
    jf['end'] = to_datetime(getattr(job_flow, 'enddatetime', None))

    if jf['start']:
        # still-running job flows have no end time, so measure up to *now*
        jf['ran'] = (jf['end'] or now) - jf['start']
    else:
        jf['ran'] = timedelta(0)

    jf['state'] = getattr(job_flow, 'state', None)

    jf['num_steps'] = len(getattr(job_flow, 'steps', None) or ())

    # a job flow counts as pooled when its last bootstrap action has exactly
    # two args and the first starts with 'pool-'; the second arg is the
    # pool name
    jf['pool'] = None
    bootstrap_actions = getattr(job_flow, 'bootstrapactions', None)
    if bootstrap_actions:
        args = [arg.value for arg in bootstrap_actions[-1].args]
        if len(args) == 2 and args[0].startswith('pool-'):
            jf['pool'] = args[1]

    # Reuse the name fetched above. Falling back to '' means a name that is
    # missing *or* explicitly None simply fails to match, rather than
    # making re's match() raise TypeError.
    m = JOB_NAME_RE.match(jf['name'] or '')
    if m:
        jf['label'], jf['owner'] = m.group(1), m.group(2)
    else:
        jf['label'], jf['owner'] = None, None

    # float(None) raises TypeError, so treat a missing/None/empty value as 0
    jf['nih'] = float(
        getattr(job_flow, 'normalizedinstancehours', None) or 0)

    return jf
Example #30
0
    def test_end_to_end(self):
        # Run a two-step job on the EMR runner (against the mock S3/EMR
        # fixtures on this test case) with custom Hadoop input/output
        # formats. Input comes from STDIN, a local file, and a remote file.
        fake_stdin = StringIO('foo\nbar\n')

        input_file_path = os.path.join(self.tmp_dir, 'input')
        with open(input_file_path, 'w') as input_file:
            input_file.write('bar\nqux\n')

        s3_input_uri = 's3://walrus/data/foo'
        self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

        # set up fake output
        self.mock_emr_output = {
            ('j-MOCKJOBFLOW0', 1): [
                '1\t"qux"\n2\t"bar"\n',
                '2\t"foo"\n5\tnull\n',
            ],
        }

        job_args = [
            '-r', 'emr', '-v',
            '-c', self.mrjob_conf_path,
            '-', input_file_path, s3_input_uri,
            '--hadoop-input-format', 'FooFormat',
            '--hadoop-output-format', 'BarFormat',
        ]
        job = MRTwoStepJob(job_args)
        job.sandbox(stdin=fake_stdin)

        tmp_dir = None
        output_pairs = []

        s3_before = copy.deepcopy(self.mock_s3_fs)

        with job.make_runner() as emr_runner:
            assert isinstance(emr_runner, EMRJobRunner)

            # merely constructing the runner must leave S3 untouched
            # (Issue #50)
            assert_equal(s3_before, self.mock_s3_fs)

            emr_runner.run()

            for line in emr_runner.stream_output():
                key, value = job.parse_output_line(line)
                output_pairs.append((key, value))

            tmp_dir = emr_runner._get_local_tmp_dir()
            # while the runner is still open, nothing has been cleaned up
            assert os.path.exists(tmp_dir)
            assert any(emr_runner.ls(emr_runner.get_output_dir()))

            conn = emr_runner.make_emr_conn()
            flow = conn.describe_jobflow(emr_runner.get_emr_job_flow_id())
            assert_equal(flow.state, 'COMPLETED')
            name_match = JOB_NAME_RE.match(flow.name)
            assert_equal(name_match.group(1), 'mr_two_step_job')
            assert_equal(name_match.group(2), getpass.getuser())

            # the input format belongs on the first step only, and the
            # output format on the second step only
            first_step_args = flow.steps[0].args
            assert_in('-inputformat', first_step_args)
            assert_not_in('-outputformat', first_step_args)
            second_step_args = flow.steps[1].args
            assert_not_in('-inputformat', second_step_args)
            assert_in('-outputformat', second_step_args)

            # mrjob.tar.gz must have been built and registered exactly
            # once as a bootstrap file
            assert emr_runner._mrjob_tar_gz_path
            tar_gz_entries = []
            for entry in emr_runner._files:
                if entry['path'] == emr_runner._mrjob_tar_gz_path:
                    tar_gz_entries.append(entry)

            assert_equal(len(tar_gz_entries), 1)

            tar_gz_entry = tar_gz_entries[0]
            assert tar_gz_entry['name']
            assert_equal(tar_gz_entry.get('bootstrap'), 'file')

            # shouldn't be in PYTHONPATH (we dump it directly in site-packages)
            python_path = emr_runner._get_cmdenv().get('PYTHONPATH') or ''
            python_path_entries = python_path.split(':')
            assert_not_in(tar_gz_entry['name'], python_path_entries)

        expected_pairs = [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)]
        assert_equal(sorted(output_pairs), expected_pairs)

        # leaving the with-block should have cleaned everything up
        assert not os.path.exists(tmp_dir)
        assert not any(emr_runner.ls(emr_runner.get_output_dir()))

        # job should get terminated
        conn = emr_runner.make_emr_conn()
        flow_id = emr_runner.get_emr_job_flow_id()
        for _ in range(10):
            conn.simulate_progress(flow_id)

        flow = conn.describe_jobflow(flow_id)
        assert_equal(flow.state, 'TERMINATED')
Example #31
0
    def test_auto_label(self):
        # With no explicit label, the job name's first group is expected
        # to be 'mr_two_step_job' (presumably derived from the job's
        # module name -- confirm) and its second group the current user.
        two_step_runner = MRTwoStepJob(['--no-conf']).make_runner()
        job_name = two_step_runner.get_job_name()
        name_match = JOB_NAME_RE.match(job_name)

        self.assertEqual(name_match.group(1), 'mr_two_step_job')
        self.assertEqual(name_match.group(2), getpass.getuser())
Example #32
0
    def test_end_to_end(self):
        # Run a two-step job on the EMR runner (against the mock S3/EMR
        # fixtures on this test case), feeding it input from STDIN, a
        # local file, and a remote file.
        fake_stdin = StringIO('foo\nbar\n')

        input_file_path = os.path.join(self.tmp_dir, 'input')
        with open(input_file_path, 'w') as input_file:
            input_file.write('bar\nqux\n')

        s3_input_uri = 's3://walrus/data/foo'
        self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

        # set up fake output
        self.mock_emr_output = {
            ('j-MOCKJOBFLOW0', 1): [
                '1\t"qux"\n2\t"bar"\n',
                '2\t"foo"\n5\tnull\n',
            ],
        }

        job_args = [
            '-r', 'emr', '-v',
            '-c', self.mrjob_conf_path,
            '-', input_file_path, s3_input_uri,
        ]
        job = MRTwoStepJob(job_args)
        job.sandbox(stdin=fake_stdin)

        tmp_dir = None
        output_pairs = []

        s3_before = copy.deepcopy(self.mock_s3_fs)

        with job.make_runner() as emr_runner:
            assert isinstance(emr_runner, EMRJobRunner)

            # merely constructing the runner must leave S3 untouched
            # (Issue #50)
            assert_equal(s3_before, self.mock_s3_fs)

            emr_runner.run()

            for line in emr_runner.stream_output():
                key, value = job.parse_output_line(line)
                output_pairs.append((key, value))

            tmp_dir = emr_runner._get_local_tmp_dir()
            # while the runner is still open, nothing has been cleaned up
            assert os.path.exists(tmp_dir)
            assert any(emr_runner.ls(emr_runner.get_output_dir()))

            conn = emr_runner.make_emr_conn()
            flow = conn.describe_jobflow(emr_runner.get_emr_job_flow_id())
            assert_equal(flow.state, 'COMPLETED')
            name_match = JOB_NAME_RE.match(flow.name)
            assert_equal(name_match.group(1), 'mr_two_step_job')
            assert_equal(name_match.group(2), getpass.getuser())

        expected_pairs = [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)]
        assert_equal(sorted(output_pairs), expected_pairs)

        # leaving the with-block should have cleaned everything up
        assert not os.path.exists(tmp_dir)
        assert not any(emr_runner.ls(emr_runner.get_output_dir()))

        # job should get terminated
        conn = emr_runner.make_emr_conn()
        flow_id = emr_runner.get_emr_job_flow_id()
        for _ in range(10):
            conn.simulate_progress(flow_id)

        flow = conn.describe_jobflow(flow_id)
        assert_equal(flow.state, 'TERMINATED')