Example #1
    def test_with_all_job_flows(self):
        self.mock_emr_job_flows.update(JOB_FLOWS_BY_ID)
        emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
        emr_conn.run_jobflow('no name', log_uri=None)
        main(['-q', '--no-conf'])
        lines = [line for line in StringIO(self.stdout.getvalue())]
        self.assertEqual(len(lines), len(JOB_FLOWS_BY_ID) - 1)
Example #2
    def test_no_region(self):
        runner = EMRJobRunner(conf_path=False)
        assert_equal(runner.make_emr_conn().endpoint,
                     'elasticmapreduce.amazonaws.com')
        assert_equal(runner.make_s3_conn().endpoint,
                     's3.amazonaws.com')
        assert_equal(runner._aws_region, '')
Example #3
def yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Get relevant job flow information from EMR.

    :param float max_days_ago: If set, don't fetch job flows created longer
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = datetime.utcnow()

    emr_conn = EMRJobRunner(**runner_kwargs).make_emr_conn()

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - timedelta(days=max_days_ago)

    for cluster_summary in _yield_all_clusters(emr_conn, created_after=created_after):
        cluster_id = cluster_summary.id

        cluster = emr_conn.describe_cluster(cluster_id)
        cluster.steps = list(_yield_all_steps(emr_conn, cluster_id))
        cluster.bootstrapactions = list(_yield_all_bootstrap_actions(emr_conn, cluster_id))

        yield cluster
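
A minimal usage sketch, not part of the original source; the option values and the printed attributes are assumptions based on the boto-era response objects handled above.

def print_recent_clusters():
    # look back one week; conf_paths=[] skips mrjob.conf, as several
    # of the tests in this listing do
    for cluster in yield_clusters(max_days_ago=7, conf_paths=[]):
        # .steps and .bootstrapactions were attached by yield_clusters()
        print('%s: %d steps, %d bootstrap actions' % (
            cluster.id, len(cluster.steps), len(cluster.bootstrapactions)))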
Example #4
    def test_create_scratch_uri(self):
        # "walrus" bucket will be ignored; it doesn't start with "mrjob-"
        self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

        runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

        # bucket name should be mrjob- plus 16 random hex digits
        s3_scratch_uri = runner._opts['s3_scratch_uri']
        assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
        assert_equal(s3_scratch_uri[27:], '/tmp/')

        # bucket shouldn't actually exist yet
        scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
        assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

        # need to do something to ensure that the bucket actually gets
        # created. let's launch a (mock) job flow
        jfid = runner.make_persistent_job_flow()
        assert_in(scratch_bucket, self.mock_s3_fs.keys())
        runner.make_emr_conn().terminate_jobflow(jfid)

        # once our scratch bucket is created, we should re-use it
        runner2 = EMRJobRunner(conf_path=False)
        assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)
        s3_scratch_uri = runner._opts['s3_scratch_uri']
Example #5
class MRBossTestCase(MockBotoTestCase):

    def setUp(self):
        super(MRBossTestCase, self).setUp()
        self.make_runner()

    def tearDown(self):
        self.cleanup_runner()
        super(MRBossTestCase, self).tearDown()

    def make_runner(self):
        self.runner = EMRJobRunner(conf_paths=[])
        self.add_mock_s3_data({'walrus': {}})
        self.runner = EMRJobRunner(s3_sync_wait_time=0,
                                   s3_tmp_dir='s3://walrus/tmp',
                                   conf_paths=[])
        self.runner._s3_job_log_uri = BUCKET_URI + LOG_DIR
        self.prepare_runner_for_ssh(self.runner)
        self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')

    def cleanup_runner(self):
        """This method assumes ``prepare_runner_for_ssh()`` was called. That
        method isn't a "proper" setup method because it requires different
        arguments for different tests.
        """
        shutil.rmtree(self.output_dir)
        self.runner.cleanup()

    def test_one_node(self):
        mock_ssh_file('testmaster', 'some_file', b'file contents')

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'), 'r') as f:
            self.assertEqual(f.read().rstrip(), 'file contents')

        self.assertEqual(os.listdir(self.output_dir), ['master'])

    def test_two_nodes(self):
        self.add_slave()
        self.runner._opts['num_ec2_instances'] = 2

        mock_ssh_file('testmaster', 'some_file', b'file contents 1')
        mock_ssh_file('testmaster!testslave0', 'some_file', b'file contents 2')

        self.runner.fs  # force initialization of _ssh_fs

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'), 'r') as f:
            self.assertEqual(f.read().rstrip(), 'file contents 1')

        with open(os.path.join(self.output_dir, 'slave testslave0', 'stdout'),
                  'r') as f:
            self.assertEqual(f.read().strip(), 'file contents 2')

        self.assertEqual(sorted(os.listdir(self.output_dir)),
                         ['master', 'slave testslave0'])
Example #6
    def test_local_bootstrap_action(self):
        # make sure that local bootstrap action scripts get uploaded to S3
        action_path = os.path.join(self.tmp_dir, 'apt-install.sh')
        with open(action_path, 'w') as f:
            f.write('for $pkg in $@; do sudo apt-get install $pkg; done\n')

        bootstrap_actions = [
            action_path + ' python-scipy mysql-server']

        runner = EMRJobRunner(conf_path=False,
                              bootstrap_actions=bootstrap_actions,
                              s3_sync_wait_time=0.01)

        job_flow_id = runner.make_persistent_job_flow()

        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(job_flow_id)
        actions = job_flow.bootstrapactions

        assert_equal(len(actions), 2)

        assert actions[0].path.startswith('s3://mrjob-')
        assert actions[0].path.endswith('/apt-install.sh')
        assert_equal(actions[0].name, 'apt-install.sh')
        assert_equal(actions[0].args, ['python-scipy', 'mysql-server'])

        # check for master bootstrap script
        assert actions[1].path.startswith('s3://mrjob-')
        assert actions[1].path.endswith('b.py')
        assert_equal(actions[1].args, [])
        assert_equal(actions[1].name, 'master')

        # make sure master bootstrap script is on S3
        assert runner.path_exists(actions[1].path)
Example #7
def find_waiting_flow(aws_access_key_id,aws_secret_access_key,ssh_key_pair_file=''):
    # print (aws_access_key_id,aws_secret_access_key)
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows=emr_conn.describe_jobflows()
    job_id='NONE'
    d = {'WAITING':0,'STARTING':1,'RUNNING':2}
    waiting_flows=[]
    for flow in job_flows:
        try:
            if flow.state in d.keys():
                job_id=flow.jobflowid
                ip_address=flow.masterpublicdnsname
                waiting_flows.append([d[flow.state],job_id,ip_address,flow.state])
                if ssh_key_pair_file != '':
                    print 'ssh -i %s hadoop@%s'%(ssh_key_pair_file,ip_address)
                    job_id=flow.jobflowid
        except Exception:
            continue
    waiting_flows = sorted(waiting_flows, key=itemgetter(0))
    waiting_flows = [i[1:] for i in waiting_flows] #An index was added at the beginning for the sorting. Removing that index in this step
    waiting_flows_dict = [{'flow_id':i[0],'node':i[1],'flow_state':i[2]} for i in waiting_flows] #Converting a list of lists to a list of dicts
    
    #Printing
    index = 0
    for flow_dict in waiting_flows_dict:
        print index, flow_dict['flow_id'], flow_dict['node'], flow_dict['flow_state']
        index+=1
    
    return waiting_flows_dict
Example #8
    def test_attach_to_existing_job_flow(self):
        emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()
        # set log_uri to None, so that when we describe the job flow, it
        # won't have the loguri attribute, to test Issue #112
        emr_job_flow_id = emr_conn.run_jobflow(
            name='Development Job Flow', log_uri=None)

        stdin = StringIO('foo\nbar\n')
        self.mock_emr_output = {(emr_job_flow_id, 1): [
            '1\t"bar"\n1\t"foo"\n2\tnull\n']}

        mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                               '-c', self.mrjob_conf_path,
                               '--emr-job-flow-id', emr_job_flow_id])
        mr_job.sandbox(stdin=stdin)

        results = []
        with mr_job.make_runner() as runner:
            runner.run()

            # Issue 182: don't create the bootstrap script when
            # attaching to another job flow
            assert_equal(runner._master_bootstrap_script, None)

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

        assert_equal(sorted(results),
            [(1, 'bar'), (1, 'foo'), (2, None)])
Example #9
def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if args:
        option_parser.error('takes no arguments')

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # create the persistent job
    runner_kwargs = {
        'conf_path': options.conf_path,
        'ec2_instance_type': options.ec2_instance_type,
        'ec2_master_instance_type': options.ec2_master_instance_type,
        'ec2_slave_instance_type': options.ec2_slave_instance_type,
        'label': options.label,
        'num_ec2_instances': options.num_ec2_instances,
        'owner': options.owner,
    }
    runner = EMRJobRunner(**runner_kwargs)
    emr_job_flow_id = runner.make_persistent_job_flow()
    print emr_job_flow_id
Example #10
    def test_spark_script_step_without_mr_job_script(self):
        spark_script_path = self.makefile('a_spark_script.py')
        steps = MRSparkScript(['--script', spark_script_path])._steps_desc()

        runner = EMRJobRunner(steps=steps, stdin=BytesIO())

        runner.run()
        runner.cleanup()
Example #11
    def test_spark_jar_step_without_mr_job_script(self):
        spark_jar_path = self.makefile('fireflies.jar')
        steps = MRSparkJar(['--jar', spark_jar_path])._steps_desc()

        runner = EMRJobRunner(steps=steps, stdin=BytesIO())

        runner.run()
        runner.cleanup()
Example #12
    def test_jar_step_without_mr_job_script(self):
        jar_path = self.makefile('dora.jar')
        steps = MRJustAJar(['--jar', jar_path])._steps_desc()

        runner = EMRJobRunner(steps=steps, stdin=BytesIO(b'backpack'))

        runner.run()
        runner.cleanup()
Example #13
    def test_blank_region(self):
        # blank region should be treated the same as no region
        runner = EMRJobRunner(conf_path=False, aws_region='')
        assert_equal(runner.make_emr_conn().endpoint,
                     'elasticmapreduce.amazonaws.com')
        assert_equal(runner.make_s3_conn().endpoint,
                     's3.amazonaws.com')
        assert_equal(runner._aws_region, '')
Example #14
    def reducer_init(self):
        emr = EMRJobRunner(aws_access_key_id=AWS_ACCESS_KEY,
                           aws_secret_access_key=AWS_SECRET_KEY)
        idf_parts = emr.get_s3_keys('s3://6885public/jeffchan/term-idfs/')
        self.word_to_idf = dict()
        for part in idf_parts:
            # don't shadow the json module with the downloaded contents
            contents = part.get_contents_as_string()
            for line in StringIO.StringIO(contents):
                pair = json.loads(line)
                self.word_to_idf[pair['term']] = pair['idf']
Example #15
    def test_terminate_job_flow(self):
        cluster_id = self.make_cluster(pool_emr_job_flows=True)
        self.monkey_patch_argv('--quiet', '--no-conf', 'j-MOCKCLUSTER0')

        terminate_main()

        emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
        cluster = emr_conn.describe_cluster(cluster_id)
        self.assertEqual(cluster.status.state, 'TERMINATED')
Example #16
    def test_terminate_job_flow(self):
        jf_id = self.make_job_flow(pool_emr_job_flows=True)
        self.monkey_patch_argv('--quiet', '--no-conf', 'j-MOCKJOBFLOW0')

        terminate_main()

        emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
        self.assertEqual(emr_conn.describe_jobflow(jf_id).state,
                         'TERMINATED')
Example #17
    def make_runner(self):
        self.runner = EMRJobRunner(conf_paths=[])
        self.add_mock_s3_data({'walrus': {}})
        self.runner = EMRJobRunner(cloud_fs_sync_secs=0,
                                   cloud_tmp_dir='s3://walrus/tmp',
                                   conf_paths=[])
        self.runner._s3_log_dir_uri = BUCKET_URI + LOG_DIR
        self.prepare_runner_for_ssh(self.runner)
        self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')
Example #18
def main():
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()
    
    if args:
        option_parser.error('takes no arguments')

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    emr_conn = EMRJobRunner().make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 weeks)')
    job_flows = emr_conn.describe_jobflows()
        
    now = datetime.utcnow()

    num_running = 0
    num_idle = 0
    num_done = 0
    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:
        # check if job flow is done
        if hasattr(jf, 'enddatetime'):
            num_done += 1
        # check if job flow is currently running
        elif jf.steps and not hasattr(jf.steps[-1], 'enddatetime'):
            num_running += 1
        # job flow is idle. how long?
        else:
            num_idle += 1
            if jf.steps:
                idle_since = datetime.strptime(
                    jf.steps[-1].enddatetime, ISO8601)
            else:
                idle_since = datetime.strptime(
                    jf.creationdatetime, ISO8601)
            idle_time = now - idle_since

            # don't care about fractions of a second
            idle_time = timedelta(idle_time.days, idle_time.seconds)

            log.debug('Job flow %s (%s) idle for %s' %
                           (jf.jobflowid, jf.name, idle_time))
            if idle_time > timedelta(hours=options.max_hours_idle):
                to_terminate.append(
                    (jf.jobflowid, jf.name, idle_time))

    log.info('Job flow statuses: %d running, %d idle, %d done' %
                  (num_running, num_idle, num_done))

    terminate_and_notify(emr_conn, to_terminate, options)
Example #19
def find_waiting_flow(aws_access_key_id,aws_secret_access_key):
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows=emr_conn.describe_jobflows()
    job_id='NONE'
    for flow in job_flows:
        if flow.state=='WAITING':
            print flow,flow.name,flow.jobflowid,flow.state
            job_id=flow.jobflowid
    return job_id
Example #20
    def test_terminate_cluster(self):
        cluster_id = self.make_cluster(pool_clusters=True)
        self.monkey_patch_argv('--quiet', '--no-conf', 'j-MOCKCLUSTER0')

        terminate_main()

        emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
        cluster = _patched_describe_cluster(emr_conn, cluster_id)
        emr_conn.simulate_progress(cluster_id)
        self.assertEqual(cluster.status.state, 'TERMINATED')
Example #21
    def make_runner(self):
        self.runner = EMRJobRunner(conf_paths=[])
        self.add_mock_s3_data({'walrus': {}})
        self.runner = EMRJobRunner(s3_sync_wait_time=0,
                                   s3_scratch_uri='s3://walrus/tmp',
                                   conf_paths=[])
        self.runner._s3_job_log_uri = BUCKET_URI + LOG_DIR
        self.prepare_runner_for_ssh(self.runner)
        self.runner._enable_slave_ssh_access()
        self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')
Example #22
class MRBossTestCase(MockEMRAndS3TestCase):

    @setup
    def make_runner(self):
        self.runner = EMRJobRunner(conf_path=False)
        self.add_mock_s3_data({'walrus': {}})
        self.runner = EMRJobRunner(s3_sync_wait_time=0,
                                   s3_scratch_uri='s3://walrus/tmp',
                                   conf_path=False)
        self.runner._s3_job_log_uri = BUCKET_URI + LOG_DIR
        self.prepare_runner_for_ssh(self.runner)
        self.runner._enable_slave_ssh_access()
        self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')

    @teardown
    def cleanup_runner(self):
        """This method assumes ``prepare_runner_for_ssh()`` was called. That
        method isn't a "proper" setup method because it requires different
        arguments for different tests.
        """
        shutil.rmtree(self.output_dir)
        self.runner.cleanup()
        self.teardown_ssh()

    def test_one_node(self):
        mock_ssh_file('testmaster', 'some_file', 'file contents')

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'), 'r') as f:
            assert_equal(f.read(), 'file contents\n')

        assert_equal(os.listdir(self.output_dir), ['master'])

    def test_two_nodes(self):
        self.add_slave()
        self.runner._opts['num_ec2_instances'] = 2

        mock_ssh_file('testmaster', 'some_file', 'file contents 1')
        mock_ssh_file('testmaster!testslave0', 'some_file', 'file contents 2')

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'), 'r') as f:
            assert_equal(f.read(), 'file contents 1\n')

        with open(os.path.join(self.output_dir, 'slave testslave0', 'stdout'),
                  'r') as f:
            assert_equal(f.read(), 'file contents 2\n')

        assert_equal(sorted(os.listdir(self.output_dir)),
                     ['master', 'slave testslave0'])
Example #23
    def test_cleanup(self):
        runner = EMRJobRunner(conf_paths=[], s3_sync_wait_time=0.01)

        # add some mock data and change last_modified
        remote_input_path = 's3://walrus/data/'
        self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n',
                                        'data/bar': 'bar\n',
                                        'data/qux': 'qux\n'}})

        s3_conn = runner.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(remote_input_path)
        bucket = s3_conn.get_bucket(bucket_name)

        key_foo = bucket.get_key('data/foo')
        key_bar = bucket.get_key('data/bar')
        key_qux = bucket.get_key('data/qux')
        key_bar.last_modified = datetime.now() - timedelta(days=45)
        key_qux.last_modified = datetime.now() - timedelta(hours=50)

        # make sure keys are there
        assert isinstance(key_foo, MockKey)
        assert isinstance(key_bar, MockKey)
        assert isinstance(key_qux, MockKey)

        s3_cleanup(remote_input_path, timedelta(days=30), dry_run=True,
                   conf_paths=[])

        # dry-run shouldn't delete anything
        assert isinstance(key_foo, MockKey)
        assert isinstance(key_bar, MockKey)
        assert isinstance(key_qux, MockKey)

        s3_cleanup(remote_input_path, timedelta(days=30), conf_paths=[])

        key_foo = bucket.get_key('data/foo')
        key_bar = bucket.get_key('data/bar')
        key_qux = bucket.get_key('data/qux')

        # make sure key_bar is deleted
        assert isinstance(key_foo, MockKey)
        self.assertEqual(key_bar, None)
        assert isinstance(key_qux, MockKey)

        s3_cleanup(remote_input_path, timedelta(hours=48), conf_paths=[])

        key_foo = bucket.get_key('data/foo')
        key_bar = bucket.get_key('data/bar')
        key_qux = bucket.get_key('data/qux')

        # make sure key_qux is deleted
        assert isinstance(key_foo, MockKey)
        self.assertEqual(key_bar, None)
        self.assertEqual(key_qux, None)
Example #24
def main(cl_args=None):
    # parse command-line args
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(**_runner_kwargs(options))
    log.debug('Terminating cluster %s' % options.cluster_id)
    runner.make_emr_client().terminate_job_flows(
        JobFlowIds=[options.cluster_id])
    log.info('Terminated cluster %s' % options.cluster_id)
Example #25
    def test_terminate_pool(self):
        jf_id = self.make_job_flow(pool_emr_job_flows=True)
        emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()

        for i in range(3):
            emr_conn.simulate_progress(jf_id)

        self.monkey_patch_argv("--quiet", "--no-conf", "-t", jf_id)
        self.monkey_patch_stdout()

        pool_main()

        value = self.stdout.getvalue()
        self.assertIn("j-MOCKJOBFLOW0", value)
Example #26
    def test_archive_remote_data(self):
        runner = EMRJobRunner()

        tar_gz_path = runner._dir_archive_path('s3://walrus/archive')
        self.assertEqual(os.path.basename(tar_gz_path), 'archive.tar.gz')

        runner._create_dir_archive('s3://walrus/archive')

        tar_gz = tarfile.open(tar_gz_path, 'r:gz')
        try:
            self.assertEqual(sorted(tar_gz.getnames()),
                             [os.path.join('bar', 'baz'), 'foo'])
        finally:
            tar_gz.close()
Example #27
def find_waiting_flow(aws_access_key_id=None, aws_secret_access_key=None, s3_scratch_uri=None,
                      s3_log_uri=None, ec2_key_pair=None, ec2_key_pair_file=None):
    # If the options are specified then ignore the options in ~/.mrjob.conf
    if aws_access_key_id is not None and aws_secret_access_key is not None and \
       s3_scratch_uri is not None and s3_log_uri is not None and ec2_key_pair is not None and \
       ec2_key_pair_file is not None:

        emr_conn = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                                aws_secret_access_key=aws_secret_access_key,
                                s3_scratch_uri=s3_scratch_uri, s3_log_uri=s3_log_uri,
                                ec2_key_pair=ec2_key_pair,
                                ec2_key_pair_file=ec2_key_pair_file).make_emr_conn()
    # If options are not specified then use the options in ~/.mrjob.conf
    else:
        if not os.path.isfile("%s/.mrjob.conf" % expanduser("~")):
            sys.exit("%s/.mrjob.conf no found" % expanduser("~"))

        emr_conn = EMRJobRunner().make_emr_conn()

    job_flows = emr_conn.describe_jobflows()
    d = {'WAITING': 0, 'STARTING': 1, 'RUNNING': 2}
    waiting_flows = []

    for flow in job_flows:
        try:
            if flow.state in d.keys():
                job_id = flow.jobflowid
                ip_address = flow.masterpublicdnsname
                waiting_flows.append([d[flow.state], job_id, ip_address, flow.state])
                if ec2_key_pair_file != '':
                    print 'ssh -i %s hadoop@%s' % (ec2_key_pair_file, ip_address)
                    job_id = flow.jobflowid
        except Exception:
            continue

    waiting_flows = sorted(waiting_flows, key=itemgetter(0))
    # An index was added at the beginning for the sorting. Removing that index in this step
    waiting_flows = [i[1:] for i in waiting_flows]
    # Converting a list of lists to a list of dicts
    waiting_flows_dict = [{'flow_id': i[0], 'node': i[1], 'flow_state':i[2]} for i in waiting_flows]

    # Printing
    index = 0
    for flow_dict in waiting_flows_dict:
        print index, flow_dict['flow_id'], flow_dict['node'], flow_dict['flow_state']
        index += 1
    
    return waiting_flows_dict
Example #28
def main(cl_args=None):
    # parse command-line args
    option_parser = _make_option_parser()
    options, args = option_parser.parse_args(cl_args)

    if len(args) != 1:
        option_parser.error('This tool takes exactly one argument.')
    cluster_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(**_runner_kwargs(options))
    log.debug('Terminating cluster %s' % cluster_id)
    runner.make_emr_conn().terminate_jobflow(cluster_id)
    log.info('Terminated cluster %s' % cluster_id)
Example #29
def main(cl_args=None):
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args(cl_args)

    if len(args) != 1:
        option_parser.error('This tool takes exactly one argument.')
    emr_job_flow_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(conf_paths=options.conf_paths)
    log.debug('Terminating job flow %s' % emr_job_flow_id)
    runner.make_emr_conn().terminate_jobflow(emr_job_flow_id)
    log.info('Terminated job flow %s' % emr_job_flow_id)
Example #30
def find_waiting_flow(aws_access_key_id,aws_secret_access_key,ssh_key_pair_file=''):
    print (aws_access_key_id,aws_secret_access_key)
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows=emr_conn.describe_jobflows()
    job_id='NONE'
    waiting_flows=[]
    for flow in job_flows:
        if flow.state=='WAITING':
            waiting_flows.append(flow)
            print flow.jobflowid,flow.state
            ip_address=flow.masterpublicdnsname
            if ssh_key_pair_file != '':
                print 'ssh -i %s hadoop@%s'%(ssh_key_pair_file,ip_address)
            job_id=flow.jobflowid
    return job_id
Example #31
def main(cl_args=None):
    # parse command-line args
    option_parser = _make_option_parser()
    options, args = option_parser.parse_args(cl_args)

    if len(args) != 1:
        option_parser.error('This tool takes exactly one argument.')
    cluster_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(**_runner_kwargs(options))
    log.debug('Terminating cluster %s' % cluster_id)
    runner.make_emr_client().terminate_job_flows(
        JobFlowIds=[cluster_id])
    log.info('Terminated cluster %s' % cluster_id)
Example #32
    def make_cluster(self, **kwargs):
        self.add_mock_s3_data({'walrus': {}})
        kwargs.update(
            dict(conf_paths=[],
                 cloud_tmp_dir='s3://walrus/',
                 cloud_fs_sync_secs=0))
        with EMRJobRunner(**kwargs) as runner:
            return runner.make_persistent_cluster()
Example #33
def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Get relevant cluster information from EMR.

    :param float max_days_ago: If set, don't fetch clusters created longer
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    emr_client = EMRJobRunner(**runner_kwargs).make_emr_client()

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - timedelta(days=max_days_ago)

    # use _DELAY to sleep 1 second after each API call (see #1091). Could
    # implement some sort of connection wrapper for this if it becomes more
    # generally useful.
    list_clusters_kwargs = dict(_delay=_DELAY)
    if created_after is not None:
        list_clusters_kwargs['CreatedAfter'] = created_after

    for cluster_summary in _boto3_paginate('Clusters', emr_client,
                                           'list_clusters',
                                           **list_clusters_kwargs):

        cluster_id = cluster_summary['Id']

        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        sleep(_DELAY)

        cluster['Steps'] = list(
            reversed(
                list(
                    _boto3_paginate('Steps',
                                    emr_client,
                                    'list_steps',
                                    ClusterId=cluster_id,
                                    _delay=_DELAY))))

        yield cluster
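
A short usage sketch for the boto3-based variant, not part of the original source; the dict keys follow the describe_cluster response shape assumed above.

def print_recent_clusters():
    for cluster in _yield_clusters(max_days_ago=7, conf_paths=[]):
        # cluster is the dict from describe_cluster()['Cluster'], with a
        # 'Steps' list attached by _yield_clusters()
        print('%s: %s, %d steps' % (
            cluster['Id'], cluster['Status']['State'], len(cluster['Steps'])))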
Example #34
    def make_job_flow(self, **kwargs):
        self.add_mock_s3_data({'walrus': {}})
        kwargs.update(
            dict(conf_paths=[],
                 s3_scratch_uri='s3://walrus/',
                 s3_sync_wait_time=0))
        with EMRJobRunner(**kwargs) as runner:
            return runner.make_persistent_job_flow()
Example #35
    def test_cleanup(self):
        runner = EMRJobRunner(conf_paths=[], cloud_fs_sync_secs=0.01)

        # add some mock data

        # foo is current
        self.add_mock_s3_data(
            {'walrus': {'data/foo': b'foo\n'}})

        # bar and baz are very old (but baz isn't in data/)
        self.add_mock_s3_data(
            {'walrus': {'data/bar': b'bar\n',
                        'other/baz': b'baz\n'}},
            age=timedelta(days=45))

        # qux is a little more than two days old
        self.add_mock_s3_data(
            {'walrus': {'data/qux': b'qux\n'}},
            age=timedelta(hours=50))

        self.assertEqual(
            sorted(runner.fs.ls('s3://walrus/')),
            ['s3://walrus/data/bar', 's3://walrus/data/foo',
             's3://walrus/data/qux', 's3://walrus/other/baz'],
        )

        # try a dry run, which shouldn't delete anything
        _s3_cleanup('s3://walrus/data/', timedelta(days=30), dry_run=True,
                    conf_paths=[])

        self.assertEqual(
            sorted(runner.fs.ls('s3://walrus/')), [
                's3://walrus/data/bar',
                's3://walrus/data/foo',
                's3://walrus/data/qux',
                's3://walrus/other/baz',
            ],
        )
        # now do it for real. should hit bar (baz isn't in data/)
        _s3_cleanup('s3://walrus/data', timedelta(days=30), conf_paths=[])

        self.assertEqual(
            sorted(runner.fs.ls('s3://walrus/')), [
                's3://walrus/data/foo',
                's3://walrus/data/qux',
                's3://walrus/other/baz',
            ],
        )

        # now try to delete qux too
        _s3_cleanup('s3://walrus/data', timedelta(hours=48), conf_paths=[])

        self.assertEqual(
            sorted(runner.fs.ls('s3://walrus/')), [
                's3://walrus/data/foo',
                's3://walrus/other/baz',
            ],
        )
Example #36
    def test_cleanup(self):
        runner = EMRJobRunner(conf_paths=[], s3_sync_wait_time=0.01)

        # add some mock data and change last_modified
        remote_input_path = 's3://walrus/data/'
        self.add_mock_s3_data({
            'walrus': {
                'data/foo': b'foo\n',
                'data/bar': b'bar\n',
                'data/qux': b'qux\n'
            }
        })

        s3_conn = runner.fs.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(remote_input_path)
        bucket = s3_conn.get_bucket(bucket_name)

        key_foo = bucket.get_key('data/foo')
        key_bar = bucket.get_key('data/bar')
        key_qux = bucket.get_key('data/qux')
        key_bar.last_modified = datetime.now() - timedelta(days=45)
        key_qux.last_modified = datetime.now() - timedelta(hours=50)

        # make sure keys are there
        assert isinstance(key_foo, MockKey)
        assert isinstance(key_bar, MockKey)
        assert isinstance(key_qux, MockKey)

        s3_cleanup(remote_input_path,
                   timedelta(days=30),
                   dry_run=True,
                   conf_paths=[])

        # dry-run shouldn't delete anything
        assert isinstance(key_foo, MockKey)
        assert isinstance(key_bar, MockKey)
        assert isinstance(key_qux, MockKey)

        s3_cleanup(remote_input_path, timedelta(days=30), conf_paths=[])

        key_foo = bucket.get_key('data/foo')
        key_bar = bucket.get_key('data/bar')
        key_qux = bucket.get_key('data/qux')

        # make sure key_bar is deleted
        assert isinstance(key_foo, MockKey)
        self.assertEqual(key_bar, None)
        assert isinstance(key_qux, MockKey)

        s3_cleanup(remote_input_path, timedelta(hours=48), conf_paths=[])

        key_foo = bucket.get_key('data/foo')
        key_bar = bucket.get_key('data/bar')
        key_qux = bucket.get_key('data/qux')

        # make sure key_qux is deleted
        assert isinstance(key_foo, MockKey)
        self.assertEqual(key_bar, None)
        self.assertEqual(key_qux, None)
Example #37
    def reducer_init(self):
        self.idfs = {}

        # Iterate through the files in the bucket provided by the user
        if self.options.aws_access_key_id and self.options.aws_secret_access_key:
            emr = EMRJobRunner(
                aws_access_key_id=self.options.aws_access_key_id,
                aws_secret_access_key=self.options.aws_secret_access_key)
        else:
            emr = EMRJobRunner()

        for key in emr.get_s3_keys("s3://" + self.options.idf_loc):
            # Load the whole file first, then read it line-by-line: otherwise,
            # chunks may not be even lines
            for line in StringIO(key.get_contents_as_string()):
                term_idf = JSONValueProtocol.read(line)[
                    1]  # parse the line as a JSON object
                self.idfs[term_idf['term']] = term_idf['idf']
Example #38
def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if args:
        option_parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner_kwargs = options.__dict__.copy()
    del runner_kwargs['quiet']
    del runner_kwargs['verbose']

    runner = EMRJobRunner(**runner_kwargs)
    emr_job_flow_id = runner.make_persistent_job_flow()
    print emr_job_flow_id
Example #39
    def test_terminate_job_flow(self):
        cluster_id = self.make_cluster(pool_emr_job_flows=True)
        self.monkey_patch_argv('--quiet', '--no-conf', 'j-MOCKCLUSTER0')

        terminate_main()

        emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
        cluster = patched_describe_cluster(emr_conn, cluster_id)
        self.assertEqual(cluster.status.state, 'TERMINATED')
Example #40
def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if len(args) != 1:
        option_parser.error('takes exactly one argument')
    emr_job_flow_id = args[0]

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(conf_path=options.conf_path)
    log.debug('Terminating job flow %s' % emr_job_flow_id)
    runner.make_emr_conn().terminate_jobflow(emr_job_flow_id)
    log.info('Terminated job flow %s' % emr_job_flow_id)
Example #41
def find_waiting_flow(aws_access_key_id,
                      aws_secret_access_key,
                      ssh_key_pair_file=''):
    print(aws_access_key_id, aws_secret_access_key)
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows = emr_conn.describe_jobflows()
    job_id = 'NONE'
    waiting_flows = []
    for flow in job_flows:
        if flow.state == 'WAITING':
            waiting_flows.append(flow)
            print flow.jobflowid, flow.state
            ip_address = flow.masterpublicdnsname
            if ssh_key_pair_file != '':
                print 'ssh -i %s hadoop@%s' % (ssh_key_pair_file, ip_address)
            job_id = flow.jobflowid
    return job_id
Example #42
def main(args=None):
    option_parser = make_option_parser()
    try:
        options = parse_args(option_parser, args)
    except OptionError:
        option_parser.error('This tool takes exactly one argument.')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    with EMRJobRunner(**runner_kwargs(options)) as runner:
        perform_actions(options, runner)
Example #43
def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if args:
        option_parser.error('takes no arguments')

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # create the persistent job
    runner_kwargs = options.__dict__.copy()
    del runner_kwargs['quiet']
    del runner_kwargs['verbose']

    runner = EMRJobRunner(**runner_kwargs)
    emr_job_flow_id = runner.make_persistent_job_flow()
    print emr_job_flow_id
Example #44
    def test_bootstrap_files_only_get_uploaded_once(self):
        # just a regression test for Issue #8

        # use self.mrjob_conf_path because it's easier than making a new file
        bootstrap_file = self.mrjob_conf_path

        runner = EMRJobRunner(conf_path=False,
                              bootstrap_files=[bootstrap_file])

        matching_file_dicts = [fd for fd in runner._files
                               if fd['path'] == bootstrap_file]
        assert_equal(len(matching_file_dicts), 1)
Example #45
def collect_active_job_flows(conf_paths):
    """Collect active job flow information from EMR.

    :param str conf_path: Alternate path to read :py:mod:`mrjob.conf` from,
                          or ``False`` to ignore all config files

    Return a list of job flows
    """
    emr_conn = EMRJobRunner(conf_paths=conf_paths).make_emr_conn()
    active_states = ['STARTING', 'BOOTSTRAPPING', 'WAITING', 'RUNNING']

    return describe_all_job_flows(emr_conn, states=active_states)
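
A hypothetical call site (not from the original source) showing how the returned list might be consumed; attribute names follow the boto job flow objects used elsewhere in this listing.

for jf in collect_active_job_flows(conf_paths=[]):
    print('%s\t%s\t%s' % (jf.jobflowid, jf.state, jf.name))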
Example #46
    def test_create_scratch_uri(self):
        # "walrus" bucket will be ignored; it doesn't start with "mrjob-"
        self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

        runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

        # bucket name should be mrjob- plus 16 random hex digits
        s3_scratch_uri = runner._opts['s3_scratch_uri']
        assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
        assert_equal(s3_scratch_uri[27:], '/tmp/')

        # bucket shouldn't actually exist yet
        scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
        assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

        # need to do something to ensure that the bucket actually gets
        # created. let's launch a (mock) job flow
        jfid = runner.make_persistent_job_flow()
        assert_in(scratch_bucket, self.mock_s3_fs.keys())
        runner.make_emr_conn().terminate_jobflow(jfid)

        # once our scratch bucket is created, we should re-use it
        runner2 = EMRJobRunner(conf_path=False)
        assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)
        s3_scratch_uri = runner._opts['s3_scratch_uri']
Example #47
    def test_s3_ls(self):
        runner = EMRJobRunner(s3_scratch_uri='s3://walrus/tmp',
                              conf_path=False)

        self.add_mock_s3_data({'walrus': {'one': '', 'two': '', 'three': ''}})

        assert_equal(
            set(runner._s3_ls('s3://walrus/')),
            set([
                's3://walrus/one',
                's3://walrus/two',
                's3://walrus/three',
            ]))

        assert_equal(set(runner._s3_ls('s3://walrus/t')),
                     set([
                         's3://walrus/two',
                         's3://walrus/three',
                     ]))

        assert_equal(set(runner._s3_ls('s3://walrus/t/')), set([]))

        # if we ask for a nonexistent bucket, we should get some sort
        # of exception (in practice, buckets with random names will
        # probably be owned by other people, and we'll get some sort
        # of permissions error)
        assert_raises(Exception, set, runner._s3_ls('s3://lolcat/'))
Example #48
def main(cl_args=None):
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = {
        k: v
        for k, v in options.__dict__.items()
        if k not in ('quiet', 'verbose', 'step_id')
    }

    runner = EMRJobRunner(**runner_kwargs)
    emr_client = runner.make_emr_client()

    # pick step
    step = _get_step(emr_client, options.cluster_id, options.step_id)

    if not step:
        raise SystemExit(1)

    if step['Status']['State'] != 'FAILED':
        log.warning('step %s has state %s, not FAILED' %
                    (step['Id'], step['Status']['State']))

    # interpret logs
    log.info('Diagnosing step %s (%s)' % (step['Id'], step['Name']))

    log_interpretation = dict(step_id=step['Id'])

    step_type = _infer_step_type(step)

    error = runner._pick_error(log_interpretation, step_type)

    # print error
    if error:
        log.error('Probable cause of failure:\n\n%s\n\n' %
                  _format_error(error))
    else:
        log.warning('No error detected')
Example #49
def s3_cleanup(glob_path, time_old, dry_run=False, conf_paths=None):
    """Delete all files older than *time_old* in *path*.
       If *dry_run* is ``True``, then just log the files that need to be
       deleted without actually deleting them
       """
    runner = EMRJobRunner(conf_paths=conf_paths)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path in runner.ls(glob_path):
        bucket_name, key_name = parse_s3_uri(path)
        bucket = runner.fs.get_bucket(bucket_name)

        for key in bucket.list(key_name):
            last_modified = iso8601_to_datetime(key.last_modified)
            age = datetime.utcnow() - last_modified
            if age > time_old:
                # Delete it
                log.info('Deleting %s; is %s old' % (key.name, age))
                if not dry_run:
                    key.delete()
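
A hedged usage sketch, not part of the original source; the bucket and prefix are placeholders.

from datetime import timedelta

# preview what would be deleted, then remove objects older than 30 days
s3_cleanup('s3://walrus/tmp/', timedelta(days=30), dry_run=True, conf_paths=[])
s3_cleanup('s3://walrus/tmp/', timedelta(days=30), conf_paths=[])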
Example #50
    def test_can_get_all_job_flows(self):
        now = datetime.datetime.utcnow()

        NUM_JOB_FLOWS = 2222
        assert_gt(NUM_JOB_FLOWS, DEFAULT_MAX_JOB_FLOWS_RETURNED)

        for i in range(NUM_JOB_FLOWS):
            jfid = 'j-%04d' % i
            self.mock_emr_job_flows[jfid] = MockEmrObject(
                creationdatetime=to_iso8601(now - datetime.timedelta(minutes=i)),
                jobflowid=jfid)

        emr_conn = EMRJobRunner().make_emr_conn()

        # ordinary describe_jobflows() hits the limit on number of job flows
        some_jfs = emr_conn.describe_jobflows()
        assert_equal(len(some_jfs), DEFAULT_MAX_JOB_FLOWS_RETURNED)

        all_jfs = describe_all_job_flows(emr_conn)
        assert_equal(len(all_jfs), NUM_JOB_FLOWS)
        assert_equal(sorted(jf.jobflowid for jf in all_jfs),
                     [('j-%04d' % i) for i in range(NUM_JOB_FLOWS)])
Example #51
    def setUp(self):
        super(AttemptToUnlockClusterTestCase, self).setUp()

        self.emr_client = self.client('emr')

        self.cluster_id = EMRJobRunner().make_persistent_cluster()
        # get into WAITING state
        self.simulate_emr_progress(self.cluster_id)
        self.simulate_emr_progress(self.cluster_id)

        self.log = self.start(patch('mrjob.pool.log'))

        self.our_key = 'mr_wc.dmarin.20200419.185348.359278'
Example #52
    def test_jar_step_without_mr_job_script(self):
        jar_path = self.makefile('dora.jar')
        steps = MRJustAJar(['--jar', jar_path])._steps_desc()

        runner = EMRJobRunner(steps=steps, stdin=BytesIO(b'backpack'))

        runner.run()
        runner.cleanup()
Example #53
    def test_spark_jar_step_without_mr_job_script(self):
        spark_jar_path = self.makefile('fireflies.jar')
        steps = MRSparkJar(['--jar', spark_jar_path])._steps_desc()

        runner = EMRJobRunner(steps=steps, stdin=BytesIO())

        runner.run()
        runner.cleanup()
Example #54
    def test_spark_script_step_without_mr_job_script(self):
        spark_script_path = self.makefile('a_spark_script.py')
        steps = MRSparkScript(['--script', spark_script_path])._steps_desc()

        runner = EMRJobRunner(steps=steps, stdin=BytesIO())

        runner.run()
        runner.cleanup()
Example #55
def check_sssp_completed():
    emr = EMRJobRunner()
    key = 'hw7/visited.txt'
    c = emr.fs.make_s3_conn()
    b = c.get_bucket(
        'ucb-mids-mls-rajeshthallam')  # substitute your bucket name here
    key_exists = b.get_key(key)

    if key_exists is None:
        return "-1"
    else:
        k = Key(b)
        k.key = key
        return k.get_contents_as_string()
Example #56
def inspect_and_maybe_terminate_job_flows(
    conf_path, max_hours_idle, now, dry_run):

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    job_flows = describe_all_job_flows(emr_conn)

    num_running = 0
    num_idle = 0
    num_done = 0
    num_non_streaming = 0
    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif is_job_flow_non_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            num_idle += 1
            time_idle = time_job_flow_idle(jf, now=now)

            # don't care about fractions of a second
            time_idle = timedelta(time_idle.days, time_idle.seconds)

            log.debug('Job flow %s (%s) idle for %s' %
                      (jf.jobflowid, jf.name, time_idle))
            if time_idle > timedelta(hours=max_hours_idle):
                to_terminate.append(
                    (jf.jobflowid, jf.name, time_idle))

    log.info(
        'Job flow statuses: %d running, %d idle, %d active non-streaming,'
        ' %d done' % (num_running, num_idle, num_non_streaming, num_done))

    terminate_and_notify(emr_conn, to_terminate, dry_run=dry_run)
Example #57
def main():
    runner_kwargs = build_config()
    logger.info('main() -- runner_kwargs: %s', runner_kwargs)
    emr_client = EMRJobRunner(**runner_kwargs)
    cluster_id = emr_client.make_persistent_cluster()
    logger.info('Cluster-id: %s', cluster_id)

    emr_client = EMRJobRunner(
        mr_job_script='src/hive_step.py',
        cluster_id=cluster_id,
        input_paths=['/dev/null'],
        extra_args=['--jar-region', runner_kwargs['region']],
        **runner_kwargs)
    emr_client.run()
Example #58
    def reducer_final(self):
        if self.target_reached == 1:
            sys.stderr.write('Target reached')
            if self.runmode == 'emr':
                sys.stderr.write(self.target_path)
                s3_key = 'hw7/visited.txt'
                emr = EMRJobRunner()
                c = emr.fs.make_s3_conn()
                b = c.get_bucket(self.s3bucket)
                k = Key(b)
                k.key = s3_key
                k.set_contents_from_string(self.target_path)
                # self.write_to_s3(self.options.bucket, s3_key, self.target_path)
            else:
                yield self.target_path.split('\t')[0], self.target_path.split(
                    '\t')[1]
Example #59
    def setUp(self):
        super(AttemptToLockClusterTestCase, self).setUp()

        self.emr_client = self.client('emr')
        self.cluster_id = EMRJobRunner().make_persistent_cluster()
        # get into WAITING state
        self.simulate_emr_progress(self.cluster_id)
        self.simulate_emr_progress(self.cluster_id)

        self.log = self.start(patch('mrjob.pool.log'))
        self.mock_sleep = self.start(patch('time.sleep'))

        self.time = time.time  # save for safekeeping
        self.mock_time = self.start(patch('time.time', side_effect=time.time))

        self.our_key = 'mr_wc.dmarin.20200419.185348.359278'
        self.their_key = 'mr_wc.them.20200419.185348.999999'
Example #60
def _s3_cleanup(glob_path, time_old, dry_run=False, **runner_kwargs):
    """Delete all files older than *time_old* in *path*.

    If *dry_run* is true, then just log the files that need to be
    deleted without actually deleting them
    """
    runner = EMRJobRunner(**runner_kwargs)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path, key in runner.fs._ls(glob_path):
        age = datetime.now(tzutc()) - key.last_modified
        if age > time_old:
            # Delete it
            log.info('Deleting %s; is %s old' % (path, age))
            if not dry_run:
                key.delete()
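
A brief sketch of the newer helper (placeholder bucket, not from the original source); extra keyword arguments are forwarded straight to EMRJobRunner.

from datetime import timedelta

# anything EMRJobRunner accepts (e.g. conf_paths) can be passed through
_s3_cleanup('s3://walrus/tmp/', timedelta(hours=48), dry_run=True, conf_paths=[])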