def test_with_all_job_flows(self):
    self.mock_emr_job_flows.update(JOB_FLOWS_BY_ID)

    emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
    emr_conn.run_jobflow('no name', log_uri=None)

    main(['-q', '--no-conf'])

    lines = [line for line in StringIO(self.stdout.getvalue())]
    self.assertEqual(len(lines), len(JOB_FLOWS_BY_ID) - 1)

def test_no_region(self):
    runner = EMRJobRunner(conf_path=False)
    assert_equal(runner.make_emr_conn().endpoint,
                 'elasticmapreduce.amazonaws.com')
    assert_equal(runner.make_s3_conn().endpoint,
                 's3.amazonaws.com')
    assert_equal(runner._aws_region, '')

def yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Get relevant job flow information from EMR.

    :param float max_days_ago: If set, don't fetch job flows created longer
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = datetime.utcnow()

    emr_conn = EMRJobRunner(**runner_kwargs).make_emr_conn()

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - timedelta(days=max_days_ago)

    for cluster_summary in _yield_all_clusters(
            emr_conn, created_after=created_after):
        cluster_id = cluster_summary.id

        cluster = emr_conn.describe_cluster(cluster_id)
        cluster.steps = list(_yield_all_steps(emr_conn, cluster_id))
        cluster.bootstrapactions = list(
            _yield_all_bootstrap_actions(emr_conn, cluster_id))

        yield cluster

def test_create_scratch_uri(self):
    # "walrus" bucket will be ignored; it doesn't start with "mrjob-"
    self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

    runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

    # bucket name should be mrjob- plus 16 random hex digits
    s3_scratch_uri = runner._opts['s3_scratch_uri']
    assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
    assert_equal(s3_scratch_uri[27:], '/tmp/')

    # bucket shouldn't actually exist yet
    scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
    assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

    # need to do something to ensure that the bucket actually gets
    # created. let's launch a (mock) job flow
    jfid = runner.make_persistent_job_flow()
    assert_in(scratch_bucket, self.mock_s3_fs.keys())
    runner.make_emr_conn().terminate_jobflow(jfid)

    # once our scratch bucket is created, we should re-use it
    runner2 = EMRJobRunner(conf_path=False)
    assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)
    s3_scratch_uri = runner._opts['s3_scratch_uri']

class MRBossTestCase(MockBotoTestCase):

    def setUp(self):
        super(MRBossTestCase, self).setUp()
        self.make_runner()

    def tearDown(self):
        self.cleanup_runner()
        super(MRBossTestCase, self).tearDown()

    def make_runner(self):
        self.runner = EMRJobRunner(conf_paths=[])
        self.add_mock_s3_data({'walrus': {}})
        self.runner = EMRJobRunner(s3_sync_wait_time=0,
                                   s3_tmp_dir='s3://walrus/tmp',
                                   conf_paths=[])
        self.runner._s3_job_log_uri = BUCKET_URI + LOG_DIR
        self.prepare_runner_for_ssh(self.runner)
        self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')

    def cleanup_runner(self):
        """This method assumes ``prepare_runner_for_ssh()`` was called. That
        method isn't a "proper" setup method because it requires different
        arguments for different tests.
        """
        shutil.rmtree(self.output_dir)
        self.runner.cleanup()

    def test_one_node(self):
        mock_ssh_file('testmaster', 'some_file', b'file contents')

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'),
                  'r') as f:
            self.assertEqual(f.read().rstrip(), 'file contents')

        self.assertEqual(os.listdir(self.output_dir), ['master'])

    def test_two_nodes(self):
        self.add_slave()
        self.runner._opts['num_ec2_instances'] = 2

        mock_ssh_file('testmaster', 'some_file', b'file contents 1')
        mock_ssh_file('testmaster!testslave0', 'some_file',
                      b'file contents 2')

        self.runner.fs  # force initialization of _ssh_fs

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'),
                  'r') as f:
            self.assertEqual(f.read().rstrip(), 'file contents 1')

        with open(os.path.join(self.output_dir, 'slave testslave0', 'stdout'),
                  'r') as f:
            self.assertEqual(f.read().strip(), 'file contents 2')

        self.assertEqual(sorted(os.listdir(self.output_dir)),
                         ['master', 'slave testslave0'])

def test_local_bootstrap_action(self):
    # make sure that local bootstrap action scripts get uploaded to S3
    action_path = os.path.join(self.tmp_dir, 'apt-install.sh')
    with open(action_path, 'w') as f:
        f.write('for $pkg in $@; do sudo apt-get install $pkg; done\n')

    bootstrap_actions = [
        action_path + ' python-scipy mysql-server']

    runner = EMRJobRunner(conf_path=False,
                          bootstrap_actions=bootstrap_actions,
                          s3_sync_wait_time=0.01)

    job_flow_id = runner.make_persistent_job_flow()

    emr_conn = runner.make_emr_conn()
    job_flow = emr_conn.describe_jobflow(job_flow_id)
    actions = job_flow.bootstrapactions

    assert_equal(len(actions), 2)

    assert actions[0].path.startswith('s3://mrjob-')
    assert actions[0].path.endswith('/apt-install.sh')
    assert_equal(actions[0].name, 'apt-install.sh')
    assert_equal(actions[0].args, ['python-scipy', 'mysql-server'])

    # check for master bootstrap script
    assert actions[1].path.startswith('s3://mrjob-')
    assert actions[1].path.endswith('b.py')
    assert_equal(actions[1].args, [])
    assert_equal(actions[1].name, 'master')

    # make sure master bootstrap script is on S3
    assert runner.path_exists(actions[1].path)

def find_waiting_flow(aws_access_key_id, aws_secret_access_key,
                      ssh_key_pair_file=''):
    # print (aws_access_key_id, aws_secret_access_key)
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows = emr_conn.describe_jobflows()
    job_id = 'NONE'
    d = {'WAITING': 0, 'STARTING': 1, 'RUNNING': 2}
    waiting_flows = []
    for flow in job_flows:
        try:
            if flow.state in d.keys():
                job_id = flow.jobflowid
                ip_address = flow.masterpublicdnsname
                waiting_flows.append([d[flow.state], job_id, ip_address,
                                      flow.state])
                if ssh_key_pair_file != '':
                    print 'ssh -i %s hadoop@%s' % (ssh_key_pair_file,
                                                   ip_address)
                job_id = flow.jobflowid
        except Exception:
            continue

    waiting_flows = sorted(waiting_flows, key=itemgetter(0))
    # An index was added at the beginning for the sorting. Removing that
    # index in this step
    waiting_flows = [i[1:] for i in waiting_flows]
    # Converting a list of lists to a list of dicts
    waiting_flows_dict = [{'flow_id': i[0], 'node': i[1],
                           'flow_state': i[2]} for i in waiting_flows]

    # Printing
    index = 0
    for flow_dict in waiting_flows_dict:
        print index, flow_dict['flow_id'], flow_dict['node'], \
            flow_dict['flow_state']
        index += 1

    return waiting_flows_dict

def test_attach_to_existing_job_flow(self):
    emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()
    # set log_uri to None, so that when we describe the job flow, it
    # won't have the loguri attribute, to test Issue #112
    emr_job_flow_id = emr_conn.run_jobflow(
        name='Development Job Flow', log_uri=None)

    stdin = StringIO('foo\nbar\n')
    self.mock_emr_output = {(emr_job_flow_id, 1): [
        '1\t"bar"\n1\t"foo"\n2\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '--emr-job-flow-id', emr_job_flow_id])
    mr_job.sandbox(stdin=stdin)

    results = []
    with mr_job.make_runner() as runner:
        runner.run()

        # Issue 182: don't create the bootstrap script when
        # attaching to another job flow
        assert_equal(runner._master_bootstrap_script, None)

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

    assert_equal(sorted(results),
                 [(1, 'bar'), (1, 'foo'), (2, None)])

def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if args:
        option_parser.error('takes no arguments')

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # create the persistent job
    runner_kwargs = {
        'conf_path': options.conf_path,
        'ec2_instance_type': options.ec2_instance_type,
        'ec2_master_instance_type': options.ec2_master_instance_type,
        'ec2_slave_instance_type': options.ec2_slave_instance_type,
        'label': options.label,
        'num_ec2_instances': options.num_ec2_instances,
        'owner': options.owner,
    }
    runner = EMRJobRunner(**runner_kwargs)
    emr_job_flow_id = runner.make_persistent_job_flow()
    print emr_job_flow_id

def test_spark_script_step_without_mr_job_script(self):
    spark_script_path = self.makefile('a_spark_script.py')
    steps = MRSparkScript(['--script', spark_script_path])._steps_desc()

    runner = EMRJobRunner(steps=steps, stdin=BytesIO())
    runner.run()
    runner.cleanup()

def test_spark_jar_step_without_mr_job_script(self):
    spark_jar_path = self.makefile('fireflies.jar')
    steps = MRSparkJar(['--jar', spark_jar_path])._steps_desc()

    runner = EMRJobRunner(steps=steps, stdin=BytesIO())
    runner.run()
    runner.cleanup()

def test_jar_step_without_mr_job_script(self):
    jar_path = self.makefile('dora.jar')
    steps = MRJustAJar(['--jar', jar_path])._steps_desc()

    runner = EMRJobRunner(steps=steps, stdin=BytesIO(b'backpack'))
    runner.run()
    runner.cleanup()

def test_blank_region(self):
    # blank region should be treated the same as no region
    runner = EMRJobRunner(conf_path=False, aws_region='')
    assert_equal(runner.make_emr_conn().endpoint,
                 'elasticmapreduce.amazonaws.com')
    assert_equal(runner.make_s3_conn().endpoint,
                 's3.amazonaws.com')
    assert_equal(runner._aws_region, '')

def reducer_init(self):
    emr = EMRJobRunner(aws_access_key_id=AWS_ACCESS_KEY,
                       aws_secret_access_key=AWS_SECRET_KEY)
    idf_parts = emr.get_s3_keys('s3://6885public/jeffchan/term-idfs/')
    self.word_to_idf = dict()
    for part in idf_parts:
        # don't shadow the json module with the file contents
        contents = part.get_contents_as_string()
        for line in StringIO.StringIO(contents):
            pair = json.loads(line)
            self.word_to_idf[pair['term']] = pair['idf']

def test_terminate_job_flow(self):
    cluster_id = self.make_cluster(pool_emr_job_flows=True)

    self.monkey_patch_argv('--quiet', '--no-conf', 'j-MOCKCLUSTER0')
    terminate_main()

    emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()

    cluster = emr_conn.describe_cluster(cluster_id)
    self.assertEqual(cluster.status.state, 'TERMINATED')

def test_terminate_job_flow(self):
    jf_id = self.make_job_flow(pool_emr_job_flows=True)

    self.monkey_patch_argv('--quiet', '--no-conf', 'j-MOCKJOBFLOW0')
    terminate_main()

    emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
    self.assertEqual(emr_conn.describe_jobflow(jf_id).state, 'TERMINATED')

def make_runner(self):
    self.runner = EMRJobRunner(conf_paths=[])
    self.add_mock_s3_data({'walrus': {}})
    self.runner = EMRJobRunner(cloud_fs_sync_secs=0,
                               cloud_tmp_dir='s3://walrus/tmp',
                               conf_paths=[])
    self.runner._s3_log_dir_uri = BUCKET_URI + LOG_DIR
    self.prepare_runner_for_ssh(self.runner)
    self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')

def main():
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if args:
        option_parser.error('takes no arguments')

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    emr_conn = EMRJobRunner().make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 weeks)')
    job_flows = emr_conn.describe_jobflows()

    now = datetime.utcnow()

    num_running = 0
    num_idle = 0
    num_done = 0
    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:
        # check if job flow is done
        if hasattr(jf, 'enddatetime'):
            num_done += 1
        # check if job flow is currently running
        elif jf.steps and not hasattr(jf.steps[-1], 'enddatetime'):
            num_running += 1
        # job flow is idle. how long?
        else:
            num_idle += 1
            if jf.steps:
                idle_since = datetime.strptime(
                    jf.steps[-1].enddatetime, ISO8601)
            else:
                idle_since = datetime.strptime(
                    jf.creationdatetime, ISO8601)
            idle_time = now - idle_since

            # don't care about fractions of a second
            idle_time = timedelta(idle_time.days, idle_time.seconds)

            log.debug('Job flow %s (%s) idle for %s' %
                      (jf.jobflowid, jf.name, idle_time))

            if idle_time > timedelta(hours=options.max_hours_idle):
                to_terminate.append(
                    (jf.jobflowid, jf.name, idle_time))

    log.info('Job flow statuses: %d running, %d idle, %d done' %
             (num_running, num_idle, num_done))

    terminate_and_notify(emr_conn, to_terminate, options)

def find_waiting_flow(aws_access_key_id, aws_secret_access_key):
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows = emr_conn.describe_jobflows()
    job_id = 'NONE'
    for flow in job_flows:
        if flow.state == 'WAITING':
            print flow, flow.name, flow.jobflowid, flow.state
            job_id = flow.jobflowid
    return job_id

def test_terminate_cluster(self):
    cluster_id = self.make_cluster(pool_clusters=True)

    self.monkey_patch_argv('--quiet', '--no-conf', 'j-MOCKCLUSTER0')
    terminate_main()

    emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()

    cluster = _patched_describe_cluster(emr_conn, cluster_id)
    emr_conn.simulate_progress(cluster_id)
    self.assertEqual(cluster.status.state, 'TERMINATED')

def make_runner(self):
    self.runner = EMRJobRunner(conf_paths=[])
    self.add_mock_s3_data({'walrus': {}})
    self.runner = EMRJobRunner(s3_sync_wait_time=0,
                               s3_scratch_uri='s3://walrus/tmp',
                               conf_paths=[])
    self.runner._s3_job_log_uri = BUCKET_URI + LOG_DIR
    self.prepare_runner_for_ssh(self.runner)
    self.runner._enable_slave_ssh_access()
    self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')

class MRBossTestCase(MockEMRAndS3TestCase):

    @setup
    def make_runner(self):
        self.runner = EMRJobRunner(conf_path=False)
        self.add_mock_s3_data({'walrus': {}})
        self.runner = EMRJobRunner(s3_sync_wait_time=0,
                                   s3_scratch_uri='s3://walrus/tmp',
                                   conf_path=False)
        self.runner._s3_job_log_uri = BUCKET_URI + LOG_DIR
        self.prepare_runner_for_ssh(self.runner)
        self.runner._enable_slave_ssh_access()
        self.output_dir = tempfile.mkdtemp(prefix='mrboss_wd')

    @teardown
    def cleanup_runner(self):
        """This method assumes ``prepare_runner_for_ssh()`` was called. That
        method isn't a "proper" setup method because it requires different
        arguments for different tests.
        """
        shutil.rmtree(self.output_dir)
        self.runner.cleanup()
        self.teardown_ssh()

    def test_one_node(self):
        mock_ssh_file('testmaster', 'some_file', 'file contents')

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'),
                  'r') as f:
            assert_equal(f.read(), 'file contents\n')

        assert_equal(os.listdir(self.output_dir), ['master'])

    def test_two_nodes(self):
        self.add_slave()
        self.runner._opts['num_ec2_instances'] = 2

        mock_ssh_file('testmaster', 'some_file', 'file contents 1')
        mock_ssh_file('testmaster!testslave0', 'some_file',
                      'file contents 2')

        run_on_all_nodes(self.runner, self.output_dir, ['cat', 'some_file'],
                         print_stderr=False)

        with open(os.path.join(self.output_dir, 'master', 'stdout'),
                  'r') as f:
            assert_equal(f.read(), 'file contents 1\n')

        with open(os.path.join(self.output_dir, 'slave testslave0', 'stdout'),
                  'r') as f:
            assert_equal(f.read(), 'file contents 2\n')

        assert_equal(sorted(os.listdir(self.output_dir)),
                     ['master', 'slave testslave0'])

def test_cleanup(self):
    runner = EMRJobRunner(conf_paths=[], s3_sync_wait_time=0.01)

    # add some mock data and change last_modified
    remote_input_path = 's3://walrus/data/'
    self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n',
                                      'data/bar': 'bar\n',
                                      'data/qux': 'qux\n'}})

    s3_conn = runner.make_s3_conn()
    bucket_name, key_name = parse_s3_uri(remote_input_path)
    bucket = s3_conn.get_bucket(bucket_name)
    key_foo = bucket.get_key('data/foo')
    key_bar = bucket.get_key('data/bar')
    key_qux = bucket.get_key('data/qux')
    key_bar.last_modified = datetime.now() - timedelta(days=45)
    key_qux.last_modified = datetime.now() - timedelta(hours=50)

    # make sure keys are there
    assert isinstance(key_foo, MockKey)
    assert isinstance(key_bar, MockKey)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(days=30), dry_run=True,
               conf_paths=[])

    # dry-run shouldn't delete anything
    assert isinstance(key_foo, MockKey)
    assert isinstance(key_bar, MockKey)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(days=30), conf_paths=[])

    key_foo = bucket.get_key('data/foo')
    key_bar = bucket.get_key('data/bar')
    key_qux = bucket.get_key('data/qux')

    # make sure key_bar is deleted
    assert isinstance(key_foo, MockKey)
    self.assertEqual(key_bar, None)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(hours=48), conf_paths=[])

    key_foo = bucket.get_key('data/foo')
    key_bar = bucket.get_key('data/bar')
    key_qux = bucket.get_key('data/qux')

    # make sure key_qux is deleted
    assert isinstance(key_foo, MockKey)
    self.assertEqual(key_bar, None)
    self.assertEqual(key_qux, None)

def main(cl_args=None):
    # parse command-line args
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(**_runner_kwargs(options))
    log.debug('Terminating cluster %s' % options.cluster_id)
    runner.make_emr_client().terminate_job_flows(
        JobFlowIds=[options.cluster_id])
    log.info('Terminated cluster %s' % options.cluster_id)

def test_terminate_pool(self):
    jf_id = self.make_job_flow(pool_emr_job_flows=True)
    emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()
    for i in range(3):
        emr_conn.simulate_progress(jf_id)

    self.monkey_patch_argv("--quiet", "--no-conf", "-t", jf_id)
    self.monkey_patch_stdout()

    pool_main()

    value = self.stdout.getvalue()
    self.assertIn("j-MOCKJOBFLOW0", value)

def test_archive_remote_data(self):
    runner = EMRJobRunner()

    tar_gz_path = runner._dir_archive_path('s3://walrus/archive')
    self.assertEqual(os.path.basename(tar_gz_path), 'archive.tar.gz')
    runner._create_dir_archive('s3://walrus/archive')

    tar_gz = tarfile.open(tar_gz_path, 'r:gz')
    try:
        self.assertEqual(sorted(tar_gz.getnames()),
                         [os.path.join('bar', 'baz'), 'foo'])
    finally:
        tar_gz.close()

def find_waiting_flow(aws_access_key_id=None, aws_secret_access_key=None,
                      s3_scratch_uri=None, s3_log_uri=None,
                      ec2_key_pair=None, ec2_key_pair_file=None):
    # If the options are specified then ignore the options in ~/.mrjob.conf
    if aws_access_key_id is not None and aws_secret_access_key is not None and \
            s3_scratch_uri is not None and s3_log_uri is not None and \
            ec2_key_pair is not None and ec2_key_pair_file is not None:
        emr_conn = EMRJobRunner(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            s3_scratch_uri=s3_scratch_uri,
            s3_log_uri=s3_log_uri,
            ec2_key_pair=ec2_key_pair,
            ec2_key_pair_file=ec2_key_pair_file).make_emr_conn()
    # If options are not specified then use the options in ~/.mrjob.conf
    else:
        if not os.path.isfile("%s/.mrjob.conf" % expanduser("~")):
            sys.exit("%s/.mrjob.conf not found" % expanduser("~"))
        emr_conn = EMRJobRunner().make_emr_conn()

    job_flows = emr_conn.describe_jobflows()
    d = {'WAITING': 0, 'STARTING': 1, 'RUNNING': 2}
    waiting_flows = []
    for flow in job_flows:
        try:
            if flow.state in d.keys():
                job_id = flow.jobflowid
                ip_address = flow.masterpublicdnsname
                waiting_flows.append([d[flow.state], job_id, ip_address,
                                      flow.state])
                if ec2_key_pair_file != '':
                    print 'ssh -i %s hadoop@%s' % (ec2_key_pair_file,
                                                   ip_address)
                job_id = flow.jobflowid
        except Exception:
            continue

    waiting_flows = sorted(waiting_flows, key=itemgetter(0))
    # An index was added at the beginning for the sorting. Removing that
    # index in this step
    waiting_flows = [i[1:] for i in waiting_flows]
    # Converting a list of lists to a list of dicts
    waiting_flows_dict = [{'flow_id': i[0], 'node': i[1],
                           'flow_state': i[2]} for i in waiting_flows]

    # Printing
    index = 0
    for flow_dict in waiting_flows_dict:
        print index, flow_dict['flow_id'], flow_dict['node'], \
            flow_dict['flow_state']
        index += 1

    return waiting_flows_dict

def main(cl_args=None):
    # parse command-line args
    option_parser = _make_option_parser()
    options, args = option_parser.parse_args(cl_args)

    if len(args) != 1:
        option_parser.error('This tool takes exactly one argument.')
    cluster_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(**_runner_kwargs(options))
    log.debug('Terminating cluster %s' % cluster_id)
    runner.make_emr_conn().terminate_jobflow(cluster_id)
    log.info('Terminated cluster %s' % cluster_id)

def main(cl_args=None):
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args(cl_args)

    if len(args) != 1:
        option_parser.error('This tool takes exactly one argument.')
    emr_job_flow_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(conf_paths=options.conf_paths)
    log.debug('Terminating job flow %s' % emr_job_flow_id)
    runner.make_emr_conn().terminate_jobflow(emr_job_flow_id)
    log.info('Terminated job flow %s' % emr_job_flow_id)

def find_waiting_flow(aws_access_key_id, aws_secret_access_key,
                      ssh_key_pair_file=''):
    print (aws_access_key_id, aws_secret_access_key)
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows = emr_conn.describe_jobflows()
    job_id = 'NONE'
    waiting_flows = []
    for flow in job_flows:
        if flow.state == 'WAITING':
            waiting_flows.append(flow)
            print flow.jobflowid, flow.state
            ip_address = flow.masterpublicdnsname
            if ssh_key_pair_file != '':
                print 'ssh -i %s hadoop@%s' % (ssh_key_pair_file, ip_address)
            job_id = flow.jobflowid
    return job_id

def main(cl_args=None):
    # parse command-line args
    option_parser = _make_option_parser()
    options, args = option_parser.parse_args(cl_args)

    if len(args) != 1:
        option_parser.error('This tool takes exactly one argument.')
    cluster_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(**_runner_kwargs(options))
    log.debug('Terminating cluster %s' % cluster_id)
    runner.make_emr_client().terminate_job_flows(
        JobFlowIds=[cluster_id])
    log.info('Terminated cluster %s' % cluster_id)

def make_cluster(self, **kwargs):
    self.add_mock_s3_data({'walrus': {}})
    kwargs.update(dict(conf_paths=[],
                       cloud_tmp_dir='s3://walrus/',
                       cloud_fs_sync_secs=0))
    with EMRJobRunner(**kwargs) as runner:
        return runner.make_persistent_cluster()

def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Get relevant cluster information from EMR.

    :param float max_days_ago: If set, don't fetch clusters created longer
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    emr_client = EMRJobRunner(**runner_kwargs).make_emr_client()

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - timedelta(days=max_days_ago)

    # use _DELAY to sleep 1 second after each API call (see #1091). Could
    # implement some sort of connection wrapper for this if it becomes more
    # generally useful.
    list_clusters_kwargs = dict(_delay=_DELAY)
    if created_after is not None:
        list_clusters_kwargs['CreatedAfter'] = created_after

    for cluster_summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters',
            **list_clusters_kwargs):
        cluster_id = cluster_summary['Id']

        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        sleep(_DELAY)

        cluster['Steps'] = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id, _delay=_DELAY))))

        yield cluster

def make_job_flow(self, **kwargs):
    self.add_mock_s3_data({'walrus': {}})
    kwargs.update(dict(conf_paths=[],
                       s3_scratch_uri='s3://walrus/',
                       s3_sync_wait_time=0))
    with EMRJobRunner(**kwargs) as runner:
        return runner.make_persistent_job_flow()

def test_cleanup(self):
    runner = EMRJobRunner(conf_paths=[], cloud_fs_sync_secs=0.01)

    # add some mock data

    # foo is current
    self.add_mock_s3_data(
        {'walrus': {'data/foo': b'foo\n'}})

    # bar and baz are very old (but baz isn't in data/)
    self.add_mock_s3_data(
        {'walrus': {'data/bar': b'bar\n', 'other/baz': b'baz\n'}},
        age=timedelta(days=45))

    # qux is a little more than two days old
    self.add_mock_s3_data(
        {'walrus': {'data/qux': b'qux\n'}},
        age=timedelta(hours=50))

    self.assertEqual(
        sorted(runner.fs.ls('s3://walrus/')),
        ['s3://walrus/data/bar',
         's3://walrus/data/foo',
         's3://walrus/data/qux',
         's3://walrus/other/baz'],
    )

    # try a dry run, which shouldn't delete anything
    _s3_cleanup('s3://walrus/data/', timedelta(days=30), dry_run=True,
                conf_paths=[])

    self.assertEqual(
        sorted(runner.fs.ls('s3://walrus/')),
        ['s3://walrus/data/bar',
         's3://walrus/data/foo',
         's3://walrus/data/qux',
         's3://walrus/other/baz'],
    )

    # now do it for real. should hit bar (baz isn't in data/)
    _s3_cleanup('s3://walrus/data', timedelta(days=30), conf_paths=[])

    self.assertEqual(
        sorted(runner.fs.ls('s3://walrus/')),
        ['s3://walrus/data/foo',
         's3://walrus/data/qux',
         's3://walrus/other/baz'],
    )

    # now try to delete qux too
    _s3_cleanup('s3://walrus/data', timedelta(hours=48), conf_paths=[])

    self.assertEqual(
        sorted(runner.fs.ls('s3://walrus/')),
        ['s3://walrus/data/foo',
         's3://walrus/other/baz'],
    )

def test_cleanup(self):
    runner = EMRJobRunner(conf_paths=[], s3_sync_wait_time=0.01)

    # add some mock data and change last_modified
    remote_input_path = 's3://walrus/data/'
    self.add_mock_s3_data({
        'walrus': {
            'data/foo': b'foo\n',
            'data/bar': b'bar\n',
            'data/qux': b'qux\n'
        }
    })

    s3_conn = runner.fs.make_s3_conn()
    bucket_name, key_name = parse_s3_uri(remote_input_path)
    bucket = s3_conn.get_bucket(bucket_name)
    key_foo = bucket.get_key('data/foo')
    key_bar = bucket.get_key('data/bar')
    key_qux = bucket.get_key('data/qux')
    key_bar.last_modified = datetime.now() - timedelta(days=45)
    key_qux.last_modified = datetime.now() - timedelta(hours=50)

    # make sure keys are there
    assert isinstance(key_foo, MockKey)
    assert isinstance(key_bar, MockKey)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(days=30), dry_run=True,
               conf_paths=[])

    # dry-run shouldn't delete anything
    assert isinstance(key_foo, MockKey)
    assert isinstance(key_bar, MockKey)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(days=30), conf_paths=[])

    key_foo = bucket.get_key('data/foo')
    key_bar = bucket.get_key('data/bar')
    key_qux = bucket.get_key('data/qux')

    # make sure key_bar is deleted
    assert isinstance(key_foo, MockKey)
    self.assertEqual(key_bar, None)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(hours=48), conf_paths=[])

    key_foo = bucket.get_key('data/foo')
    key_bar = bucket.get_key('data/bar')
    key_qux = bucket.get_key('data/qux')

    # make sure key_qux is deleted
    assert isinstance(key_foo, MockKey)
    self.assertEqual(key_bar, None)
    self.assertEqual(key_qux, None)

def reducer_init(self):
    self.idfs = {}

    # Iterate through the files in the bucket provided by the user
    if self.options.aws_access_key_id and self.options.aws_secret_access_key:
        emr = EMRJobRunner(
            aws_access_key_id=self.options.aws_access_key_id,
            aws_secret_access_key=self.options.aws_secret_access_key)
    else:
        emr = EMRJobRunner()

    for key in emr.get_s3_keys("s3://" + self.options.idf_loc):
        # Load the whole file first, then read it line-by-line: otherwise,
        # chunks may not be even lines
        for line in StringIO(key.get_contents_as_string()):
            # parse the line as a JSON object
            term_idf = JSONValueProtocol.read(line)[1]
            self.idfs[term_idf['term']] = term_idf['idf']

def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if args:
        option_parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner_kwargs = options.__dict__.copy()
    del runner_kwargs['quiet']
    del runner_kwargs['verbose']

    runner = EMRJobRunner(**runner_kwargs)
    emr_job_flow_id = runner.make_persistent_job_flow()
    print emr_job_flow_id

def test_terminate_job_flow(self):
    cluster_id = self.make_cluster(pool_emr_job_flows=True)

    self.monkey_patch_argv('--quiet', '--no-conf', 'j-MOCKCLUSTER0')
    terminate_main()

    emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()

    cluster = patched_describe_cluster(emr_conn, cluster_id)
    self.assertEqual(cluster.status.state, 'TERMINATED')

def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if len(args) != 1:
        option_parser.error('takes exactly one argument')
    emr_job_flow_id = args[0]

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(conf_path=options.conf_path)
    log.debug('Terminating job flow %s' % emr_job_flow_id)
    runner.make_emr_conn().terminate_jobflow(emr_job_flow_id)
    log.info('Terminated job flow %s' % emr_job_flow_id)

def find_waiting_flow(aws_access_key_id, aws_secret_access_key,
                      ssh_key_pair_file=''):
    print(aws_access_key_id, aws_secret_access_key)
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows = emr_conn.describe_jobflows()
    job_id = 'NONE'
    waiting_flows = []
    for flow in job_flows:
        if flow.state == 'WAITING':
            waiting_flows.append(flow)
            print flow.jobflowid, flow.state
            ip_address = flow.masterpublicdnsname
            if ssh_key_pair_file != '':
                print 'ssh -i %s hadoop@%s' % (ssh_key_pair_file, ip_address)
            job_id = flow.jobflowid
    return job_id

def main(args=None):
    option_parser = make_option_parser()
    try:
        options = parse_args(option_parser, args)
    except OptionError:
        option_parser.error('This tool takes exactly one argument.')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    with EMRJobRunner(**runner_kwargs(options)) as runner:
        perform_actions(options, runner)

def main():
    # parse command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if args:
        option_parser.error('takes no arguments')

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # create the persistent job
    runner_kwargs = options.__dict__.copy()
    del runner_kwargs['quiet']
    del runner_kwargs['verbose']

    runner = EMRJobRunner(**runner_kwargs)
    emr_job_flow_id = runner.make_persistent_job_flow()
    print emr_job_flow_id

def test_bootstrap_files_only_get_uploaded_once(self):
    # just a regression test for Issue #8

    # use self.mrjob_conf_path because it's easier than making a new file
    bootstrap_file = self.mrjob_conf_path

    runner = EMRJobRunner(conf_path=False,
                          bootstrap_files=[bootstrap_file])

    matching_file_dicts = [fd for fd in runner._files
                           if fd['path'] == bootstrap_file]
    assert_equal(len(matching_file_dicts), 1)

def collect_active_job_flows(conf_paths):
    """Collect active job flow information from EMR.

    :param conf_paths: Alternate paths to read :py:mod:`mrjob.conf` from, or
                       ``False`` to ignore all config files

    Return a list of job flows
    """
    emr_conn = EMRJobRunner(conf_paths=conf_paths).make_emr_conn()
    active_states = ['STARTING', 'BOOTSTRAPPING', 'WAITING', 'RUNNING']

    return describe_all_job_flows(emr_conn, states=active_states)

def test_s3_ls(self):
    runner = EMRJobRunner(s3_scratch_uri='s3://walrus/tmp',
                          conf_path=False)

    self.add_mock_s3_data({'walrus': {'one': '', 'two': '', 'three': ''}})

    assert_equal(set(runner._s3_ls('s3://walrus/')),
                 set(['s3://walrus/one',
                      's3://walrus/two',
                      's3://walrus/three',
                      ]))

    assert_equal(set(runner._s3_ls('s3://walrus/t')),
                 set(['s3://walrus/two',
                      's3://walrus/three',
                      ]))

    assert_equal(set(runner._s3_ls('s3://walrus/t/')),
                 set([]))

    # if we ask for a nonexistent bucket, we should get some sort
    # of exception (in practice, buckets with random names will
    # probably be owned by other people, and we'll get some sort
    # of permissions error)
    assert_raises(Exception, set, runner._s3_ls('s3://lolcat/'))

def main(cl_args=None):
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = {
        k: v for k, v in options.__dict__.items()
        if k not in ('quiet', 'verbose', 'step_id')
    }

    runner = EMRJobRunner(**runner_kwargs)
    emr_client = runner.make_emr_client()

    # pick step
    step = _get_step(emr_client, options.cluster_id, options.step_id)

    if not step:
        raise SystemExit(1)

    if step['Status']['State'] != 'FAILED':
        log.warning('step %s has state %s, not FAILED' %
                    (step['Id'], step['Status']['State']))

    # interpret logs
    log.info('Diagnosing step %s (%s)' % (step['Id'], step['Name']))

    log_interpretation = dict(step_id=step['Id'])

    step_type = _infer_step_type(step)

    error = runner._pick_error(log_interpretation, step_type)

    # print error
    if error:
        log.error('Probable cause of failure:\n\n%s\n\n' %
                  _format_error(error))
    else:
        log.warning('No error detected')

def s3_cleanup(glob_path, time_old, dry_run=False, conf_paths=None):
    """Delete all files older than *time_old* in *path*.

    If *dry_run* is ``True``, then just log the files that need to be
    deleted without actually deleting them
    """
    runner = EMRJobRunner(conf_paths=conf_paths)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path in runner.ls(glob_path):
        bucket_name, key_name = parse_s3_uri(path)
        bucket = runner.fs.get_bucket(bucket_name)

        for key in bucket.list(key_name):
            last_modified = iso8601_to_datetime(key.last_modified)
            age = datetime.utcnow() - last_modified
            if age > time_old:
                # Delete it
                log.info('Deleting %s; is %s old' % (key.name, age))
                if not dry_run:
                    key.delete()

def test_can_get_all_job_flows(self):
    now = datetime.datetime.utcnow()

    NUM_JOB_FLOWS = 2222
    assert_gt(NUM_JOB_FLOWS, DEFAULT_MAX_JOB_FLOWS_RETURNED)

    for i in range(NUM_JOB_FLOWS):
        jfid = 'j-%04d' % i
        self.mock_emr_job_flows[jfid] = MockEmrObject(
            creationdatetime=to_iso8601(
                now - datetime.timedelta(minutes=i)),
            jobflowid=jfid)

    emr_conn = EMRJobRunner().make_emr_conn()

    # ordinary describe_jobflows() hits the limit on number of job flows
    some_jfs = emr_conn.describe_jobflows()
    assert_equal(len(some_jfs), DEFAULT_MAX_JOB_FLOWS_RETURNED)

    all_jfs = describe_all_job_flows(emr_conn)
    assert_equal(len(all_jfs), NUM_JOB_FLOWS)
    assert_equal(sorted(jf.jobflowid for jf in all_jfs),
                 [('j-%04d' % i) for i in range(NUM_JOB_FLOWS)])

def setUp(self):
    super(AttemptToUnlockClusterTestCase, self).setUp()

    self.emr_client = self.client('emr')
    self.cluster_id = EMRJobRunner().make_persistent_cluster()

    # get into WAITING state
    self.simulate_emr_progress(self.cluster_id)
    self.simulate_emr_progress(self.cluster_id)

    self.log = self.start(patch('mrjob.pool.log'))

    self.our_key = 'mr_wc.dmarin.20200419.185348.359278'

def check_sssp_completed():
    emr = EMRJobRunner()
    key = 'hw7/visited.txt'
    c = emr.fs.make_s3_conn()
    # substitute your bucket name here
    b = c.get_bucket('ucb-mids-mls-rajeshthallam')
    key_exists = b.get_key(key)
    if key_exists is None:
        return "-1"
    else:
        k = Key(b)
        k.key = key
        return k.get_contents_as_string()

def inspect_and_maybe_terminate_job_flows(
        conf_path, max_hours_idle, now, dry_run):

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    job_flows = describe_all_job_flows(emr_conn)

    num_running = 0
    num_idle = 0
    num_done = 0
    num_non_streaming = 0
    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:
        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1
        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif is_job_flow_non_streaming(jf):
            num_non_streaming += 1
        elif is_job_flow_running(jf):
            num_running += 1
        else:
            num_idle += 1
            time_idle = time_job_flow_idle(jf, now=now)

            # don't care about fractions of a second
            time_idle = timedelta(time_idle.days, time_idle.seconds)

            log.debug('Job flow %s (%s) idle for %s' %
                      (jf.jobflowid, jf.name, time_idle))

            if time_idle > timedelta(hours=max_hours_idle):
                to_terminate.append(
                    (jf.jobflowid, jf.name, time_idle))

    log.info(
        'Job flow statuses: %d running, %d idle, %d active non-streaming,'
        ' %d done' % (num_running, num_idle, num_non_streaming, num_done))

    terminate_and_notify(emr_conn, to_terminate, dry_run=dry_run)

def main():
    runner_kwargs = build_config()
    logger.info('main() -- runner_kwargs: %s', runner_kwargs)

    emr_client = EMRJobRunner(**runner_kwargs)
    cluster_id = emr_client.make_persistent_cluster()
    logger.info('Cluster-id: %s', cluster_id)

    emr_client = EMRJobRunner(
        mr_job_script='src/hive_step.py',
        cluster_id=cluster_id,
        input_paths=['/dev/null'],
        extra_args=['--jar-region', runner_kwargs['region']],
        **runner_kwargs)
    emr_client.run()

def reducer_final(self):
    if self.target_reached == 1:
        sys.stderr.write('Target reached')
        if self.runmode == 'emr':
            sys.stderr.write(self.target_path)
            s3_key = 'hw7/visited.txt'
            emr = EMRJobRunner()
            c = emr.fs.make_s3_conn()
            b = c.get_bucket(self.s3bucket)
            k = Key(b)
            k.key = s3_key
            k.set_contents_from_string(self.target_path)
            # self.write_to_s3(self.options.bucket, s3_key, self.target_path)
        else:
            yield (self.target_path.split('\t')[0],
                   self.target_path.split('\t')[1])

def setUp(self):
    super(AttemptToLockClusterTestCase, self).setUp()

    self.emr_client = self.client('emr')
    self.cluster_id = EMRJobRunner().make_persistent_cluster()

    # get into WAITING state
    self.simulate_emr_progress(self.cluster_id)
    self.simulate_emr_progress(self.cluster_id)

    self.log = self.start(patch('mrjob.pool.log'))

    self.mock_sleep = self.start(patch('time.sleep'))

    self.time = time.time  # save for safekeeping
    self.mock_time = self.start(patch('time.time', side_effect=time.time))

    self.our_key = 'mr_wc.dmarin.20200419.185348.359278'
    self.their_key = 'mr_wc.them.20200419.185348.999999'

def _s3_cleanup(glob_path, time_old, dry_run=False, **runner_kwargs):
    """Delete all files older than *time_old* in *path*.

    If *dry_run* is true, then just log the files that need to be
    deleted without actually deleting them
    """
    runner = EMRJobRunner(**runner_kwargs)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path, key in runner.fs._ls(glob_path):
        age = datetime.now(tzutc()) - key.last_modified
        if age > time_old:
            # Delete it
            log.info('Deleting %s; is %s old' % (path, age))
            if not dry_run:
                key.delete()