def test_add_steps_to_flow():
    conn = boto.connect_emr()
    step1 = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    job_id = conn.run_jobflow(name='My jobflow',
                              log_uri='s3://some_bucket/jobflow_logs',
                              steps=[step1])

    job_flow = conn.describe_jobflow(job_id)
    job_flow.state.should.equal('STARTING')
    job_flow.jobflowid.should.equal(job_id)
    job_flow.name.should.equal('My jobflow')
    job_flow.loguri.should.equal('s3://some_bucket/jobflow_logs')

    step2 = StreamingStep(
        name='My wordcount example2',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input2',
        output='s3n://output_bucket/output/wordcount_output2')

    conn.add_jobflow_steps(job_id, [step2])

    job_flow = conn.describe_jobflow(job_id)
    job_step = job_flow.steps[0]
    job_step.name.should.equal('My wordcount example')
    job_step.state.should.equal('STARTING')
    args = [arg.value for arg in job_step.args]
    args.should.equal([
        '-mapper', 's3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        '-reducer', 'aggregate',
        '-input', 's3n://elasticmapreduce/samples/wordcount/input',
        '-output', 's3n://output_bucket/output/wordcount_output',
    ])

    job_step2 = job_flow.steps[1]
    job_step2.name.should.equal('My wordcount example2')
    job_step2.state.should.equal('PENDING')
    args = [arg.value for arg in job_step2.args]
    args.should.equal([
        '-mapper', 's3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
        '-reducer', 'aggregate',
        '-input', 's3n://elasticmapreduce/samples/wordcount/input2',
        '-output', 's3n://output_bucket/output/wordcount_output2',
    ])
def add_step_emr(conn, cluster_id):
    step = StreamingStep(name='MC_Method example',
                         cache_files=['s3n://bucket774/map.py#map.py'],
                         mapper='map.py',
                         input='s3://bucket774/input/',
                         output='s3://bucket774/output/')
    # add_jobflow_steps takes a list of steps
    conn.add_jobflow_steps(cluster_id, [step])
def test_create_instance_groups():
    conn = boto.connect_emr()

    step1 = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    job_id = conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[step1],
    )

    instance_group = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT',
                                   'spot-0.07', '0.07')
    instance_group = conn.add_instance_groups(job_id, [instance_group])
    instance_group_id = instance_group.instancegroupids

    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(6)

    instance_group = job_flow.instancegroups[0]
    instance_group.instancegroupid.should.equal(instance_group_id)
    int(instance_group.instancerunningcount).should.equal(6)
    instance_group.instancerole.should.equal('TASK')
    instance_group.instancetype.should.equal('c1.medium')
    instance_group.market.should.equal('SPOT')
    instance_group.name.should.equal('spot-0.07')
    instance_group.bidprice.should.equal('0.07')
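# For readability, the positional InstanceGroup(...) call in the test above is
# equivalent to the keyword form below. Illustrative sketch only, not part of
# the original test; the variable name task_group is made up.
from boto.emr.instance_group import InstanceGroup

task_group = InstanceGroup(num_instances=6, role='TASK', type='c1.medium',
                           market='SPOT', name='spot-0.07', bidprice='0.07')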
def create_emr(R):
    # Skip HTTPS certificate validation for the EMR endpoint.
    if not boto.config.has_section('Boto'):
        boto.config.add_section('Boto')
    boto.config.set('Boto', 'https_validate_certificates', 'False')

    step = StreamingStep(name='MC_Method example',
                         cache_files=['s3n://bucket774/map.py#map.py'],
                         mapper='map.py',
                         input='s3://bucket774/input/',
                         output='s3://bucket774/output/')

    conn = EmrConnection(access_id, access_key)

    # One on-demand master node, plus R - 1 core nodes when R > 1.
    instance_groups = []
    instance_groups.append(
        InstanceGroup(num_instances=1,
                      role="MASTER",
                      type='m4.large',
                      market="ON_DEMAND",
                      name="Master nodes"))
    if R > 1:
        instance_groups.append(
            InstanceGroup(num_instances=R - 1,
                          role="CORE",
                          type='m4.large',
                          market="ON_DEMAND",
                          name="Slave nodes"))

    cluster_id = conn.run_jobflow(name='test MC_method run',
                                  instance_groups=instance_groups,
                                  enable_debugging=False,
                                  steps=[step],
                                  visible_to_all_users=True,
                                  keep_alive=True,
                                  job_flow_role="EMR_EC2_DefaultRole",
                                  service_role="EMR_DefaultRole",
                                  hadoop_version='2.4.0',
                                  log_uri='s3://bucket774/log')
    return cluster_id, conn
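# Hypothetical usage of the helpers above (create_emr from this snippet,
# add_step_emr from the earlier one): start a 4-instance cluster, queue an
# extra step, then terminate the job flow. The value R=4 is an assumption.
cluster_id, conn = create_emr(4)
add_step_emr(conn, cluster_id)
# ... wait for the steps to finish, then shut the cluster down:
conn.terminate_jobflow(cluster_id)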
def _make_step(self, mapper, reducer, input, output,
               num_mappers=1, num_reducers=1):
    """
    Returns a new step that runs the specified mapper and reducer, reading
    from the specified input and writing to the specified output.
    """
    bucket = self._s3_conn.get_bucket(self._s3_bucket)

    # Clear out current bucket/output contents for team
    keys = bucket.list(prefix=self._get_keyname(output))
    bucket.delete_keys(keys)

    step_name = self._make_name()
    step_args = [
        '-jobconf', 'mapred.map.tasks=%d' % (num_mappers),
        '-jobconf', 'mapred.reduce.tasks=%d' % (num_reducers)
    ]

    return StreamingStep(name=step_name,
                         step_args=step_args,
                         mapper=self._get_s3_team_uri(mapper),
                         reducer=self._get_s3_team_uri(reducer),
                         input=self._get_s3_team_uri(input),
                         output=self._get_s3_team_uri(output))
def setup_and_run_job(self):
    """Runs the Elastic MapReduce job on AWS"""
    step = StreamingStep(
        name='Titanic Machine Learning',
        mapper='s3n://' + EmrProcessing.bucket_name + '/mapper/mapper.py',
        reducer='org.apache.hadoop.mapred.lib.IdentityReducer',
        input='s3n://' + EmrProcessing.bucket_name + '/input/',
        output='s3n://' + EmrProcessing.bucket_name + '/output/')
    self.conn = connect_to_region(self.region_name)
    self.jobid = self.conn.run_jobflow(
        name='Titanic Devp',
        log_uri='s3://' + EmrProcessing.bucket_name + '/jobflow_logs',
        steps=[step])
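# A minimal polling sketch (an assumption, not part of the original class):
# after setup_and_run_job() returns, the job flow can be watched with
# describe_jobflow until it reaches a terminal state. The `processor` instance
# name and the 30-second poll interval are illustrative only.
import time

state = processor.conn.describe_jobflow(processor.jobid).state
while state not in ('COMPLETED', 'FAILED', 'TERMINATED'):
    time.sleep(30)
    state = processor.conn.describe_jobflow(processor.jobid).state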
def createNewStreamingStep(stepName, fileList, outputDirectory, mapperKey,
                           mapperLocation, reducerLocation):
    if reducerLocation != 'aggregate':
        args = ['-files', mapperLocation + ',' + reducerLocation]
        reducerKey = reducerLocation.split('/')[-1]
    else:
        args = ['-files', mapperLocation]
        reducerKey = reducerLocation
    return StreamingStep(name=stepName,
                         mapper=mapperKey,
                         reducer=reducerKey,
                         input=fileList,
                         output=outputDirectory,
                         step_args=args)
def _make_step(self, mapper, reducer, input, output, nm=1, nr=1):
    job_name = self._make_name()
    team_s3 = self._get_s3_url()

    # Clear any previous output for this team before the step runs.
    bucket = self.s3_conn.get_bucket(self.s3_bucket)
    keys = bucket.list(prefix='%s/%s' % (self.team_id, output))
    bucket.delete_keys(map(lambda k: k.name, keys))

    return StreamingStep(name=job_name,
                         step_args=['-jobconf', 'mapred.map.tasks=%d' % nm,
                                    '-jobconf', 'mapred.reduce.tasks=%d' % nr],
                         mapper=team_s3 + mapper,
                         reducer=team_s3 + reducer,
                         input=team_s3 + input,
                         output=team_s3 + output)
def _create_sa_job(emr_conn):
    """Create and start an EMR job asynchronously"""
    step = StreamingStep(name='reTOracle Sentiment Analysis Step',
                         mapper='s3n://retoracle/sa_mapper.py',
                         reducer='s3n://retoracle/sa_reducer.py',
                         input='s3n://retoracle/sa_input',
                         output='s3n://retoracle/sa_output')
    return emr_conn.run_jobflow(
        name='reTOracle Sentiment Analysis Job',
        log_uri='s3://retoracle/jobflow_logs',
        steps=[step],
        num_instances=EMR_NUM_TOTAL_NODES,
        master_instance_type=EMR_TYPE_MASTER_NODE,
        slave_instance_type=EMR_TYPE_SLAVE_NODES,
        ec2_keyname=EMR_KP,
        enable_debugging=EMR_DEBUGGING,
        ami_version=EMR_AMI_VERSION,
    )
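# The EMR_* settings referenced above are defined elsewhere in the module.
# Plausible values might look like the following; every value here is an
# illustrative assumption, not taken from the original project.
EMR_NUM_TOTAL_NODES = 3              # one master plus two core nodes
EMR_TYPE_MASTER_NODE = 'm1.medium'
EMR_TYPE_SLAVE_NODES = 'm1.medium'
EMR_KP = 'retoracle-keypair'         # EC2 key pair name (hypothetical)
EMR_DEBUGGING = True
EMR_AMI_VERSION = 'latest'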
def test_modify_instance_groups():
    conn = boto.connect_emr()

    step1 = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    job_id = conn.run_jobflow(name='My jobflow',
                              log_uri='s3://some_bucket/jobflow_logs',
                              steps=[step1])

    instance_group1 = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT',
                                    'spot-0.07', '0.07')
    instance_group2 = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT',
                                    'spot-0.07', '0.07')
    instance_group = conn.add_instance_groups(
        job_id, [instance_group1, instance_group2])
    instance_group_ids = instance_group.instancegroupids.split(",")

    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(12)
    instance_group = job_flow.instancegroups[0]
    int(instance_group.instancerunningcount).should.equal(6)

    conn.modify_instance_groups(instance_group_ids, [2, 3])

    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(5)
    instance_group1 = [
        group for group in job_flow.instancegroups
        if group.instancegroupid == instance_group_ids[0]
    ][0]
    int(instance_group1.instancerunningcount).should.equal(2)
    instance_group2 = [
        group for group in job_flow.instancegroups
        if group.instancegroupid == instance_group_ids[1]
    ][0]
    int(instance_group2.instancerunningcount).should.equal(3)
def run_streaming_step(self, cluster_id, name, mapper_path, reducer_path,
                       input_path, output_path):
    try:
        # bundle files with the job
        files = []
        if mapper_path != "NONE":
            files.append(mapper_path)
            mapper_path = mapper_path.split("/")[-1]
        if reducer_path != "NONE":
            files.append(reducer_path)
            reducer_path = reducer_path.split("/")[-1]

        # build streaming step
        logging.debug("Launching streaming step with mapper: " + mapper_path +
                      " reducer: " + reducer_path +
                      " and files: " + str(files))
        step = StreamingStep(name=name,
                             step_args=["-files"] + files,
                             mapper=mapper_path,
                             reducer=reducer_path,
                             input=input_path,
                             output=output_path,
                             action_on_failure="CONTINUE")
        return self._run_step(cluster_id, step)
    except Exception:
        logging.error("Running streaming step in cluster " + cluster_id +
                      " failed.")
        return "FAILED"
import logging

from boto.emr.step import StreamingStep

from emr_manager import EmrManager

# script to trigger a test step process
logging.getLogger().setLevel(logging.INFO)

manager = EmrManager()

step = StreamingStep(name="testStep",
                     mapper="s3n://tuiinnovation-holycrap/scripts/mapper.py",
                     reducer="NONE",
                     input="s3n://tuiinnovation-emr/input/SuppliersMonitor.log-20140524.bz2",
                     output="s3n://tuiinnovation-emr/output/",
                     action_on_failure="CONTINUE")

cluster_id = "j-1OSKIJWRXGEJW"
step_id = manager.run_step(cluster_id, step)
logging.info("step " + step_id + " completed in " + cluster_id)
print word

# <codecell>

### Running code with EMR
#emrcon = EmrConnection('<aws access key>', '<aws secret key>')
emrcon = EmrConnection('AKIAJRV3RN6NXQTSSTBA', '3e212d6rs99xtiPgwKnfN1QD30WZk2hJwCWjMcGc')

# <codecell>

# Using EMR's wordcount example
step = StreamingStep(
    name='My wordcount example',
    mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
    reducer='aggregate',
    input='s3n://elasticmapreduce/samples/wordcount/input',
    output='s3n://wambia660fall2013/output/wordcount_output')

# <codecell>

jobid = emrcon.run_jobflow(name='Word Count Example',
                           log_uri='s3://wambia660fall2013/logs',
                           steps=[step])

# <codecell>

print jobid

# <codecell>
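# Once the job flow above reports COMPLETED, the word-count results can be
# listed from the output prefix. A minimal sketch using boto's S3 API, assuming
# boto is already configured with credentials; the bucket name and prefix come
# from the output URI used in the step above.
import boto

s3conn = boto.connect_s3()
bucket = s3conn.get_bucket('wambia660fall2013')
for key in bucket.list(prefix='output/wordcount_output/'):
    print key.name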
def test_create_job_flow():
    conn = boto.connect_emr()

    step1 = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    step2 = StreamingStep(
        name='My wordcount example2',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input2',
        output='s3n://output_bucket/output/wordcount_output2')

    job_id = conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        master_instance_type='m1.medium',
        slave_instance_type='m1.small',
        steps=[step1, step2],
    )

    job_flow = conn.describe_jobflow(job_id)
    job_flow.state.should.equal('STARTING')
    job_flow.jobflowid.should.equal(job_id)
    job_flow.name.should.equal('My jobflow')
    job_flow.masterinstancetype.should.equal('m1.medium')
    job_flow.slaveinstancetype.should.equal('m1.small')
    job_flow.loguri.should.equal('s3://some_bucket/jobflow_logs')
    job_flow.visibletoallusers.should.equal('False')
    int(job_flow.normalizedinstancehours).should.equal(0)

    job_step = job_flow.steps[0]
    job_step.name.should.equal('My wordcount example')
    job_step.state.should.equal('STARTING')
    args = [arg.value for arg in job_step.args]
    args.should.equal([
        '-mapper', 's3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        '-reducer', 'aggregate',
        '-input', 's3n://elasticmapreduce/samples/wordcount/input',
        '-output', 's3n://output_bucket/output/wordcount_output',
    ])

    job_step2 = job_flow.steps[1]
    job_step2.name.should.equal('My wordcount example2')
    job_step2.state.should.equal('PENDING')
    args = [arg.value for arg in job_step2.args]
    args.should.equal([
        '-mapper', 's3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
        '-reducer', 'aggregate',
        '-input', 's3n://elasticmapreduce/samples/wordcount/input2',
        '-output', 's3n://output_bucket/output/wordcount_output2',
    ])
def test_steps():
    input_steps = [
        StreamingStep(
            name="My wordcount example",
            mapper="s3n://elasticmapreduce/samples/wordcount/wordSplitter.py",
            reducer="aggregate",
            input="s3n://elasticmapreduce/samples/wordcount/input",
            output="s3n://output_bucket/output/wordcount_output",
        ),
        StreamingStep(
            name="My wordcount example & co.",
            mapper="s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py",
            reducer="aggregate",
            input="s3n://elasticmapreduce/samples/wordcount/input2",
            output="s3n://output_bucket/output/wordcount_output2",
        ),
    ]
    # TODO: implementation and test for cancel_steps

    conn = boto.connect_emr()
    cluster_id = conn.run_jobflow(steps=[input_steps[0]], **run_jobflow_args)

    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(1)

    conn.add_jobflow_steps(cluster_id, [input_steps[1]])

    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(2)
    for step in jf.steps:
        step.actiononfailure.should.equal("TERMINATE_JOB_FLOW")
        list(arg.value for arg in step.args).should.have.length_of(8)
        step.creationdatetime.should.be.a(str)
        # step.enddatetime.should.be.a(str)
        step.jar.should.equal(
            "/home/hadoop/contrib/streaming/hadoop-streaming.jar")
        step.laststatechangereason.should.be.a(str)
        step.mainclass.should.equal("")
        step.name.should.be.a(str)
        # step.readydatetime.should.be.a(str)
        # step.startdatetime.should.be.a(str)
        step.state.should.be.within(["RUNNING", "PENDING"])

    expected = dict((s.name, s) for s in input_steps)

    steps = conn.list_steps(cluster_id).steps
    for x in steps:
        y = expected[x.name]
        # actiononfailure
        list(arg.value for arg in x.config.args).should.equal([
            "-mapper", y.mapper,
            "-reducer", y.reducer,
            "-input", y.input,
            "-output", y.output,
        ])
        x.config.jar.should.equal(
            "/home/hadoop/contrib/streaming/hadoop-streaming.jar")
        x.config.mainclass.should.equal("")
        # properties
        x.should.have.property("id").should.be.a(str)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(["RUNNING", "PENDING"])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(str)
        # x.status.timeline.enddatetime.should.be.a(str)
        # x.status.timeline.startdatetime.should.be.a(str)

        x = conn.describe_step(cluster_id, x.id)
        list(arg.value for arg in x.config.args).should.equal([
            "-mapper", y.mapper,
            "-reducer", y.reducer,
            "-input", y.input,
            "-output", y.output,
        ])
        x.config.jar.should.equal(
            "/home/hadoop/contrib/streaming/hadoop-streaming.jar")
        x.config.mainclass.should.equal("")
        # properties
        x.should.have.property("id").should.be.a(str)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(["RUNNING", "PENDING"])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(str)
        # x.status.timeline.enddatetime.should.be.a(str)
        # x.status.timeline.startdatetime.should.be.a(str)

    @requires_boto_gte("2.39")
    def test_list_steps_with_states():
        # boto's list_steps prior to 2.39 has a bug that ignores
        # step_states argument.
        steps = conn.list_steps(cluster_id).steps
        step_id = steps[0].id
        steps = conn.list_steps(cluster_id, step_states=["RUNNING"]).steps
        steps.should.have.length_of(1)
        steps[0].id.should.equal(step_id)

    test_list_steps_with_states()
def test_steps():
    input_steps = [
        StreamingStep(
            name='My wordcount example',
            mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
            reducer='aggregate',
            input='s3n://elasticmapreduce/samples/wordcount/input',
            output='s3n://output_bucket/output/wordcount_output'),
        StreamingStep(
            name='My wordcount example2',
            mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
            reducer='aggregate',
            input='s3n://elasticmapreduce/samples/wordcount/input2',
            output='s3n://output_bucket/output/wordcount_output2')
    ]
    # TODO: implementation and test for cancel_steps

    conn = boto.connect_emr()
    cluster_id = conn.run_jobflow(steps=[input_steps[0]], **run_jobflow_args)

    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(1)

    conn.add_jobflow_steps(cluster_id, [input_steps[1]])

    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(2)
    for step in jf.steps:
        step.actiononfailure.should.equal('TERMINATE_JOB_FLOW')
        list(arg.value for arg in step.args).should.have.length_of(8)
        step.creationdatetime.should.be.a(six.string_types)
        # step.enddatetime.should.be.a(six.string_types)
        step.jar.should.equal(
            '/home/hadoop/contrib/streaming/hadoop-streaming.jar')
        step.laststatechangereason.should.be.a(six.string_types)
        step.mainclass.should.equal('')
        step.name.should.be.a(six.string_types)
        # step.readydatetime.should.be.a(six.string_types)
        # step.startdatetime.should.be.a(six.string_types)
        step.state.should.be.within(['STARTING', 'PENDING'])

    expected = dict((s.name, s) for s in input_steps)

    steps = conn.list_steps(cluster_id).steps
    for x in steps:
        y = expected[x.name]
        # actiononfailure
        list(arg.value for arg in x.config.args).should.equal([
            '-mapper', y.mapper,
            '-reducer', y.reducer,
            '-input', y.input,
            '-output', y.output,
        ])
        x.config.jar.should.equal(
            '/home/hadoop/contrib/streaming/hadoop-streaming.jar')
        x.config.mainclass.should.equal('')
        # properties
        x.should.have.property('id').should.be.a(six.string_types)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(['STARTING', 'PENDING'])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(six.string_types)
        # x.status.timeline.enddatetime.should.be.a(six.string_types)
        # x.status.timeline.startdatetime.should.be.a(six.string_types)

        x = conn.describe_step(cluster_id, x.id)
        list(arg.value for arg in x.config.args).should.equal([
            '-mapper', y.mapper,
            '-reducer', y.reducer,
            '-input', y.input,
            '-output', y.output,
        ])
        x.config.jar.should.equal(
            '/home/hadoop/contrib/streaming/hadoop-streaming.jar')
        x.config.mainclass.should.equal('')
        # properties
        x.should.have.property('id').should.be.a(six.string_types)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(['STARTING', 'PENDING'])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(six.string_types)
        # x.status.timeline.enddatetime.should.be.a(six.string_types)
        # x.status.timeline.startdatetime.should.be.a(six.string_types)

    @requires_boto_gte('2.39')
    def test_list_steps_with_states():
        # boto's list_steps prior to 2.39 has a bug that ignores
        # step_states argument.
        steps = conn.list_steps(cluster_id).steps
        step_id = steps[0].id
        steps = conn.list_steps(cluster_id, step_states=['STARTING']).steps
        steps.should.have.length_of(1)
        steps[0].id.should.equal(step_id)

    test_list_steps_with_states()
from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep
import boto

AWS_KEY = 'AKIAIQ7VG4UORIN75ZSA'
AWS_SECRET = 'jzxajGx8gzwX+ymYXJ0/5heCjkPtWLQkICYRn7Vj'

conn = EmrConnection(AWS_KEY, AWS_SECRET)

step = StreamingStep(name='My wordcount example',
                     mapper='s3n://css739/wordcount/bigramSplitter.py',
                     reducer='aggregate',
                     input='s3n://smalldata/wikipedia_titles.txt',
                     output='s3n://css739/wordcount/bigram_count_output2',
                     cache_files=['s3n://css739/wordcount/english_stoplist.py'])

jobid = conn.run_jobflow(name='My jobflow',
                         log_uri='s3n://css739/wordcount/jobflow_logs',
                         steps=[step])

conn.describe_jobflow(jobid).state