Example no. 1
    def test_create_scratch_uri(self):
        # "walrus" bucket will be ignored; it doesn't start with "mrjob-"
        self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

        runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

        # bucket name should be mrjob- plus 16 random hex digits
        s3_scratch_uri = runner._opts['s3_scratch_uri']
        assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
        assert_equal(s3_scratch_uri[27:], '/tmp/')

        # bucket shouldn't actually exist yet
        scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
        assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

        # need to do something to ensure that the bucket actually gets
        # created. let's launch a (mock) job flow
        jfid = runner.make_persistent_job_flow()
        assert_in(scratch_bucket, self.mock_s3_fs.keys())
        runner.make_emr_conn().terminate_jobflow(jfid)

        # once our scratch bucket is created, we should re-use it
        runner2 = EMRJobRunner(conf_path=False)
        assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)
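
Both this test and the later examples lean on ``parse_s3_uri``. A minimal sketch of what it is assumed to do (the real mrjob helper validates URIs more carefully; this happy-path version is for illustration only):

    def parse_s3_uri(uri):
        """Split 's3://bucket/key' into ('bucket', 'key')."""
        if not uri.startswith('s3://'):
            raise ValueError('not an S3 URI: %r' % uri)
        bucket_name, _, key_name = uri[len('s3://'):].partition('/')
        return bucket_name, key_name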
Example no. 2
def s3_cleanup(glob_path, time_old, dry_run=False, conf_path=None):
    """Delete all files older than *time_old* in *path*.
       If *dry_run* is ``True``, then just log the files that need to be 
       deleted without actually deleting them
       """
    runner = EMRJobRunner(conf_path=conf_path)
    s3_conn = runner.make_s3_conn()

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path in runner.ls(glob_path):
        bucket_name, key_name = parse_s3_uri(path)
        bucket = s3_conn.get_bucket(bucket_name)

        for key in bucket.list(key_name):
            last_modified = iso8601_to_datetime(key.last_modified)
            age = datetime.utcnow() - last_modified
            if age > time_old:
                # Delete it
                log.info('Deleting %s; is %s old' % (key.name, age))
                if not dry_run:
                    key.delete()
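
Since *time_old* is compared against ``datetime.utcnow() - last_modified``, it should be a ``datetime.timedelta``. A hypothetical dry run over the mrjob scratch space (the glob pattern is made up for illustration):

    from datetime import timedelta

    s3_cleanup('s3://mrjob-*/tmp/*', timedelta(days=30), dry_run=True)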
Example no. 3
    def simulate_progress(self, jobflow_id, now=None):
        """Simulate progress on the given job flow. This is automatically
        run when we call describe_jobflow().

        :type jobflow_id: str
        :param jobflow_id: fake job flow ID
        :type now: :py:class:`datetime.datetime`
        :param now: alternate time to use as the current time (should be UTC)
        """
        if now is None:
            now = datetime.datetime.utcnow()

        if self.simulation_steps_left <= 0:
            raise AssertionError(
                'Simulated progress too many times; bailing out')
        self.simulation_steps_left -= 1

        job_flow = self.mock_emr_job_flows[jobflow_id]

        # if job is STARTING, move it along to WAITING
        if job_flow.state == 'STARTING':
            job_flow.state = 'WAITING'
            job_flow.startdatetime = to_iso8601(now)

        # if job is done, don't advance it
        if job_flow.state in ('COMPLETED', 'TERMINATED', 'FAILED'):
            return

        # if SHUTTING_DOWN, finish shutting down
        if job_flow.state == 'SHUTTING_DOWN':
            if job_flow.reason == 'Shut down as step failed':
                job_flow.state = 'FAILED'
            else:
                job_flow.state = 'TERMINATED'
            job_flow.enddatetime = to_iso8601(now)
            return

        # if a step is currently running, advance it
        for step_num, step in enumerate(job_flow.steps):
            # skip steps that are already done
            if step.state in ('COMPLETED', 'FAILED', 'CANCELLED'):
                continue
            if step.name in ('Setup Hadoop Debugging', ):
                step.state = 'COMPLETED'
                continue

            # found currently running step! going to handle it, then exit
            if step.state == 'PENDING':
                step.state = 'RUNNING'
                step.startdatetime = to_iso8601(now)
                return

            assert step.state == 'RUNNING'
            step.enddatetime = to_iso8601(now)

            # check if we're supposed to have an error
            if (jobflow_id, step_num) in self.mock_emr_failures:
                step.state = 'FAILED'
                reason = self.mock_emr_failures[(jobflow_id, step_num)]
                if reason:
                    job_flow.reason = reason
                if step.actiononfailure == 'TERMINATE_JOB_FLOW':
                    job_flow.state = 'SHUTTING_DOWN'
                    if not reason:
                        job_flow.reason = 'Shut down as step failed'
                return

            step.state = 'COMPLETED'

            # create fake output if we're supposed to write to S3
            output_uri = self._get_step_output_uri(step)
            if output_uri and S3_URI_RE.match(output_uri):
                mock_output = self.mock_emr_output.get(
                    (jobflow_id, step_num)) or ['']

                bucket_name, key_name = parse_s3_uri(output_uri)

                # write output to S3
                for i, data in enumerate(mock_output):
                    add_mock_s3_data(self.mock_s3_fs, {
                        bucket_name: {key_name + 'part-%05d' % i: data}})
            elif (jobflow_id, step_num) in self.mock_emr_output:
                raise AssertionError(
                    "can't use output for job flow ID %s, step %d "
                    "(it doesn't output to S3)" %
                    (jobflow_id, step_num))

            # done!
            return

        # no pending steps. shut down job if appropriate
        if job_flow.keepjobflowalivewhennosteps:
            job_flow.state = 'WAITING'
            job_flow.reason = 'Waiting for steps to run'
        else:
            job_flow.state = 'COMPLETED'
            job_flow.reason = 'Steps Completed'
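
The ``mock_emr_failures`` and ``mock_emr_output`` dicts consulted above are both keyed by ``(jobflow_id, step_num)``. A hypothetical test setup (the ``mock_conn`` name is an assumption, not part of the source):

    # make step 0 of this job flow fail and terminate the flow
    mock_conn.mock_emr_failures[(jobflow_id, 0)] = 'Simulated step failure'

    # give step 1 two chunks of fake output; simulate_progress() writes
    # them to <output_uri>part-00000 and <output_uri>part-00001
    mock_conn.mock_emr_output[(jobflow_id, 1)] = ['line1\nline2\n', 'line3\n']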
Example no. 4
    def simulate_progress(self, jobflow_id, now=None):
        """Simulate progress on the given job flow. This is automatically
        run when we call describe_jobflow().

        :type jobflow_id: str
        :param jobflow_id: fake job flow ID
        :type now: :py:class:`datetime.datetime`
        :param now: alternate time to use as the current time (should be UTC)
        """
        if now is None:
            now = datetime.datetime.utcnow()

        if self.simulation_steps_left <= 0:
            raise AssertionError(
                'Simulated progress too many times; bailing out')
        self.simulation_steps_left -= 1

        job_flow = self.mock_emr_job_flows[jobflow_id]

        # if job is STARTING, move it along to WAITING
        if job_flow.state == 'STARTING':
            job_flow.state = 'WAITING'
            job_flow.startdatetime = to_iso8601(now)

        # if job is done, don't advance it
        if job_flow.state in ('COMPLETED', 'TERMINATED', 'FAILED'):
            return

        # if SHUTTING_DOWN, finish shutting down
        if job_flow.state == 'SHUTTING_DOWN':
            if job_flow.reason == 'Shut down as step failed':
                job_flow.state = 'FAILED'
            else:
                job_flow.state = 'TERMINATED'
            job_flow.enddatetime = to_iso8601(now)
            return

        # if a step is currently running, advance it
        for step_num, step in enumerate(job_flow.steps):
            # skip steps that are already done
            if step.state in ('COMPLETED', 'FAILED', 'CANCELLED'):
                continue

            # found currently running step! going to handle it, then exit
            if step.state == 'PENDING':
                step.state = 'RUNNING'
                step.startdatetime = to_iso8601(now)
                return

            assert step.state == 'RUNNING'
            step.enddatetime = to_iso8601(now)

            # check if we're supposed to have an error
            if (jobflow_id, step_num) in self.mock_emr_failures:
                step.state = 'FAILED'
                reason = self.mock_emr_failures[(jobflow_id, step_num)]
                if reason:
                    job_flow.reason = reason
                if step.actiononfailure == 'TERMINATE_JOB_FLOW':
                    job_flow.state = 'SHUTTING_DOWN'
                    if not reason:
                        job_flow.reason = 'Shut down as step failed'
                return

            step.state = 'COMPLETED'

            # create fake output if we're supposed to write to S3
            output_uri = self._get_step_output_uri(step)
            if output_uri and S3_URI_RE.match(output_uri):
                mock_output = self.mock_emr_output.get(
                    (jobflow_id, step_num)) or ['']

                bucket_name, key_name = parse_s3_uri(output_uri)

                # write output to S3
                for i, data in enumerate(mock_output):
                    add_mock_s3_data(
                        self.mock_s3_fs,
                        {bucket_name: {
                            key_name + 'part-%05d' % i: data
                        }})
            elif (jobflow_id, step_num) in self.mock_emr_output:
                raise AssertionError(
                    "can't use output for job flow ID %s, step %d "
                    "(it doesn't output to S3)" % (jobflow_id, step_num))

            # done!
            return

        # no pending steps. shut down job if appropriate
        if job_flow.keepjobflowalivewhennosteps:
            job_flow.state = 'WAITING'
            job_flow.reason = 'Waiting for steps to run'
        else:
            job_flow.state = 'COMPLETED'
            job_flow.reason = 'Steps Completed'
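
The ``to_iso8601`` and ``iso8601_to_datetime`` helpers used throughout are not shown. A plausible minimal pair, assuming the plain ``YYYY-MM-DDTHH:MM:SSZ`` layout the mock code appears to rely on (real S3 timestamps may carry fractional seconds, so the actual parser is likely more lenient):

    import datetime

    ISO8601 = '%Y-%m-%dT%H:%M:%SZ'

    def to_iso8601(dt):
        """Format a (UTC) datetime as an ISO8601 timestamp string."""
        return dt.strftime(ISO8601)

    def iso8601_to_datetime(s):
        """Parse an ISO8601 timestamp string back into a datetime."""
        return datetime.datetime.strptime(s, ISO8601)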