Example #1
def test_add_steps_to_flow():
    conn = boto.connect_emr()

    step1 = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    job_id = conn.run_jobflow(name='My jobflow',
                              log_uri='s3://some_bucket/jobflow_logs',
                              steps=[step1])

    job_flow = conn.describe_jobflow(job_id)
    job_flow.state.should.equal('STARTING')
    job_flow.jobflowid.should.equal(job_id)
    job_flow.name.should.equal('My jobflow')
    job_flow.loguri.should.equal('s3://some_bucket/jobflow_logs')

    step2 = StreamingStep(
        name='My wordcount example2',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input2',
        output='s3n://output_bucket/output/wordcount_output2')

    conn.add_jobflow_steps(job_id, [step2])

    job_flow = conn.describe_jobflow(job_id)
    job_step = job_flow.steps[0]
    job_step.name.should.equal('My wordcount example')
    job_step.state.should.equal('STARTING')
    args = [arg.value for arg in job_step.args]
    args.should.equal([
        '-mapper',
        's3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        '-reducer',
        'aggregate',
        '-input',
        's3n://elasticmapreduce/samples/wordcount/input',
        '-output',
        's3n://output_bucket/output/wordcount_output',
    ])

    job_step2 = job_flow.steps[1]
    job_step2.name.should.equal('My wordcount example2')
    job_step2.state.should.equal('PENDING')
    args = [arg.value for arg in job_step2.args]
    args.should.equal([
        '-mapper',
        's3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
        '-reducer',
        'aggregate',
        '-input',
        's3n://elasticmapreduce/samples/wordcount/input2',
        '-output',
        's3n://output_bucket/output/wordcount_output2',
    ])
Example #2
def add_step_emr(conn, cluster_id):
    step = StreamingStep(name='MC_Method example',
                         cache_files=['s3n://bucket774/map.py#map.py'],
                         mapper='map.py',
                         input='s3://bucket774/input/',
                         output='s3://bucket774/output/')
    conn.add_jobflow_steps(cluster_id, step)
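
A hypothetical invocation of the helper above might look like the following; the credentials and the cluster id are placeholders, not values from the original example.

# Hypothetical usage sketch (placeholder credentials and cluster id):
from boto.emr.connection import EmrConnection

conn = EmrConnection('<aws access key>', '<aws secret key>')
add_step_emr(conn, 'j-XXXXXXXXXXXXX')

# The new step appears in the job flow description once EMR registers it.
job_flow = conn.describe_jobflow('j-XXXXXXXXXXXXX')
for step in job_flow.steps:
    print("%s %s" % (step.name, step.state))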
Example #3
def test_create_instance_groups():
    conn = boto.connect_emr()

    step1 = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    job_id = conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[step1],
    )

    instance_group = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07',
                                   '0.07')
    instance_group = conn.add_instance_groups(job_id, [instance_group])
    instance_group_id = instance_group.instancegroupids
    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(6)
    instance_group = job_flow.instancegroups[0]
    instance_group.instancegroupid.should.equal(instance_group_id)
    int(instance_group.instancerunningcount).should.equal(6)
    instance_group.instancerole.should.equal('TASK')
    instance_group.instancetype.should.equal('c1.medium')
    instance_group.market.should.equal('SPOT')
    instance_group.name.should.equal('spot-0.07')
    instance_group.bidprice.should.equal('0.07')
Example #4
def create_emr(R):
    if not boto.config.has_section('Boto'):
        boto.config.add_section('Boto')
    boto.config.set('Boto', 'https_validate_certificates', 'False')
    step = StreamingStep(name='MC_Method example',
                         cache_files=['s3n://bucket774/map.py#map.py'],
                         mapper='map.py',
                         input='s3://bucket774/input/',
                         output='s3://bucket774/output/')
    conn = EmrConnection(access_id, access_key)
    instance_groups = []
    instance_groups.append(
        InstanceGroup(num_instances=1,
                      role="MASTER",
                      type='m4.large',
                      market="ON_DEMAND",
                      name="Master nodes"))
    if R > 1:
        instance_groups.append(
            InstanceGroup(num_instances=R - 1,
                          role="CORE",
                          type='m4.large',
                          market="ON_DEMAND",
                          name="Slave nodes"))
    cluster_id = conn.run_jobflow(name='test MC_method run',
                                  instance_groups=instance_groups,
                                  enable_debugging=False,
                                  steps=[step],
                                  visible_to_all_users=True,
                                  keep_alive=True,
                                  job_flow_role="EMR_EC2_DefaultRole",
                                  service_role="EMR_DefaultRole",
                                  hadoop_version='2.4.0',
                                  log_uri='s3://bucket774/log')
    return cluster_id, conn
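
The function above relies on several module-level names that are not shown (the boto imports plus access_id and access_key). A plausible preamble, with placeholder credentials, could be:

# Assumed preamble for the snippet above (placeholder credentials).
import boto
from boto.emr.connection import EmrConnection
from boto.emr.instance_group import InstanceGroup
from boto.emr.step import StreamingStep

access_id = '<aws access key>'
access_key = '<aws secret key>'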
Example #5
    def _make_step(self,
                   mapper,
                   reducer,
                   input,
                   output,
                   num_mappers=1,
                   num_reducers=1):
        """
        Returns a new step that runs the specified mapper and reducer,
        reading from the specified input and writing to the specified
        output.
        """

        bucket = self._s3_conn.get_bucket(self._s3_bucket)

        # Clear out current bucket/output contents for team
        keys = bucket.list(prefix=self._get_keyname(output))
        bucket.delete_keys(keys)

        step_name = self._make_name()
        step_args = [
            '-jobconf', 'mapred.map.tasks=%d' % num_mappers,
            '-jobconf', 'mapred.reduce.tasks=%d' % num_reducers,
        ]

        return StreamingStep(name=step_name,
                             step_args=step_args,
                             mapper=self._get_s3_team_uri(mapper),
                             reducer=self._get_s3_team_uri(reducer),
                             input=self._get_s3_team_uri(input),
                             output=self._get_s3_team_uri(output))
Example #6
    def setup_and_run_job(self):
        """Runs the Elastic MapReduce job on AWS."""
        step = StreamingStep(
            name='Titanic Machine Learning',
            mapper='s3n://' + EmrProcessing.bucket_name + '/mapper/mapper.py',
            reducer='org.apache.hadoop.mapred.lib.IdentityReducer',
            input='s3n://' + EmrProcessing.bucket_name + '/input/',
            output='s3n://' + EmrProcessing.bucket_name + '/output/')
        self.conn = connect_to_region(self.region_name)
        self.jobid = self.conn.run_jobflow(
            name='Titanic Devp',
            log_uri='s3://' + EmrProcessing.bucket_name + '/jobflow_logs',
            steps=[step])
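
A possible follow-up, sketched here as a standalone helper rather than a method of the (unshown) EmrProcessing class: once the job flow finishes, the output files can be copied to disk with boto's S3 API. The helper name and local directory are assumptions added for illustration.

import os
from boto.s3.connection import S3Connection

def download_emr_output(bucket_name, local_dir='/tmp/emr_output'):
    # Hypothetical helper: copy everything under the output/ prefix to disk.
    bucket = S3Connection().get_bucket(bucket_name)
    if not os.path.exists(local_dir):
        os.makedirs(local_dir)
    for key in bucket.list(prefix='output/'):
        if key.name.endswith('/'):
            continue  # skip directory marker keys
        key.get_contents_to_filename(
            os.path.join(local_dir, os.path.basename(key.name)))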
Example #7
def createNewStreamingStep(stepName, fileList, outputDirectory, mapperKey,
                           mapperLocation, reducerLocation):
    if reducerLocation != 'aggregate':
        args = ['-files', mapperLocation + ',' + reducerLocation]
        reducerKey = reducerLocation.split('/')[-1]
    else:
        args = ['-files', mapperLocation]
        reducerKey = reducerLocation
    return StreamingStep(name=stepName,
                         mapper=mapperKey,
                         reducer=reducerKey,
                         input=fileList,
                         output=outputDirectory,
                         step_args=args)
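
A hypothetical call, with made-up S3 locations, might look like this; because reducerLocation is 'aggregate', only the mapper is shipped via -files and Hadoop's built-in aggregate reducer is used.

# Hypothetical usage (all S3 paths are placeholders):
step = createNewStreamingStep(
    stepName='wordcount step',
    fileList='s3n://example-bucket/input/',
    outputDirectory='s3n://example-bucket/output/',
    mapperKey='mapper.py',
    mapperLocation='s3n://example-bucket/scripts/mapper.py',
    reducerLocation='aggregate')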
Example #8
    def _make_step(self, mapper, reducer, input, output, nm=1, nr=1):

        job_name = self._make_name()
        team_s3 = self._get_s3_url()

        bucket = self.s3_conn.get_bucket(self.s3_bucket)
        keys = bucket.list(prefix='%s/%s' % (self.team_id, output))
        bucket.delete_keys(map(lambda k: k.name, keys))

        return StreamingStep(
            name=job_name,
            step_args=['-jobconf', 'mapred.map.tasks=%d' % nm,
                       '-jobconf', 'mapred.reduce.tasks=%d' % nr],
            mapper=team_s3 + mapper,
            reducer=team_s3 + reducer,
            input=team_s3 + input,
            output=team_s3 + output)
Example #9
def _create_sa_job(emr_conn):
    """Create and start an EMR job asynchronously"""

    step = StreamingStep(name='reTOracle Sentiment Analysis Step',
                         mapper='s3n://retoracle/sa_mapper.py',
                         reducer='s3n://retoracle/sa_reducer.py',
                         input='s3n://retoracle/sa_input',
                         output='s3n://retoracle/sa_output')
    return emr_conn.run_jobflow(
        name='reTOracle Sentiment Analysis Job',
        log_uri='s3://retoracle/jobflow_logs',
        steps=[step],
        num_instances=EMR_NUM_TOTAL_NODES,
        master_instance_type=EMR_TYPE_MASTER_NODE,
        slave_instance_type=EMR_TYPE_SLAVE_NODES,
        ec2_keyname=EMR_KP,
        enable_debugging=EMR_DEBUGGING,
        ami_version=EMR_AMI_VERSION,
    )
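
The module-level constants referenced above (EMR_NUM_TOTAL_NODES, EMR_TYPE_MASTER_NODE, and so on) are not shown in this example; an illustrative configuration block, with made-up values, might look like:

# Illustrative values only -- the original module defines these elsewhere.
EMR_NUM_TOTAL_NODES = 4
EMR_TYPE_MASTER_NODE = 'm1.large'
EMR_TYPE_SLAVE_NODES = 'm1.medium'
EMR_KP = 'my-ec2-keypair'      # name of an existing EC2 key pair
EMR_DEBUGGING = True
EMR_AMI_VERSION = '3.8.0'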
Example #10
def test_modify_instance_groups():
    conn = boto.connect_emr()

    step1 = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    job_id = conn.run_jobflow(name='My jobflow',
                              log_uri='s3://some_bucket/jobflow_logs',
                              steps=[step1])

    instance_group1 = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT',
                                    'spot-0.07', '0.07')
    instance_group2 = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT',
                                    'spot-0.07', '0.07')
    instance_group = conn.add_instance_groups(
        job_id, [instance_group1, instance_group2])
    instance_group_ids = instance_group.instancegroupids.split(",")

    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(12)
    instance_group = job_flow.instancegroups[0]
    int(instance_group.instancerunningcount).should.equal(6)

    conn.modify_instance_groups(instance_group_ids, [2, 3])

    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(5)
    instance_group1 = [
        group for group in job_flow.instancegroups
        if group.instancegroupid == instance_group_ids[0]
    ][0]
    int(instance_group1.instancerunningcount).should.equal(2)
    instance_group2 = [
        group for group in job_flow.instancegroups
        if group.instancegroupid == instance_group_ids[1]
    ][0]
    int(instance_group2.instancerunningcount).should.equal(3)
Example #11
    def run_streaming_step(self, cluster_id, name, mapper_path, reducer_path,
                           input_path, output_path):
        try:
            # Bundle the mapper/reducer scripts with the job so Hadoop can
            # find them on the task nodes.
            files = []
            if mapper_path != "NONE":
                files.append(mapper_path)
                mapper_path = mapper_path.split("/")[-1]
            if reducer_path != "NONE":
                files.append(reducer_path)
                reducer_path = reducer_path.split("/")[-1]
            # Hadoop streaming's -files option expects a single
            # comma-separated argument, so join the bundled files.
            step_args = ["-files", ",".join(files)] if files else None
            logging.debug("Launching streaming step with mapper: " + mapper_path +
                          " reducer: " + reducer_path + " and files: " + str(files))
            step = StreamingStep(name=name,
                                 step_args=step_args,
                                 mapper=mapper_path,
                                 reducer=reducer_path,
                                 input=input_path,
                                 output=output_path,
                                 action_on_failure="CONTINUE")
            return self._run_step(cluster_id, step)
        except Exception:
            logging.error("Running streaming step in cluster " + cluster_id +
                          " failed.")
            return "FAILED"
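
The _run_step helper is not shown in this example. One plausible sketch, assuming the class keeps a boto EmrConnection on self.conn and that the helper blocks until the step finishes, is:

    def _run_step(self, cluster_id, step):
        # Hypothetical sketch of the unshown helper: submit the step, then
        # poll the job flow until the newest step reaches a terminal state.
        import time
        self.conn.add_jobflow_steps(cluster_id, [step])
        while True:
            state = self.conn.describe_jobflow(cluster_id).steps[-1].state
            if state in ("COMPLETED", "FAILED", "CANCELLED"):
                return state
            time.sleep(30)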
Example #12
import logging
from boto.emr.step import StreamingStep
from emr_manager import EmrManager

# script to trigger a test step process
logging.getLogger().setLevel(logging.INFO)
manager = EmrManager()
step = StreamingStep(
    name="testStep",
    mapper="s3n://tuiinnovation-holycrap/scripts/mapper.py",
    reducer="NONE",
    input="s3n://tuiinnovation-emr/input/SuppliersMonitor.log-20140524.bz2",
    output="s3n://tuiinnovation-emr/output/",
    action_on_failure="CONTINUE")
cluster_id = "j-1OSKIJWRXGEJW"
step_id = manager.run_step(cluster_id, step)
logging.info("step " + step_id + " completed in " + cluster_id)
Example #13
    print word

# <codecell>

### Running code with EMR

# Credentials redacted -- substitute your own keys.
emrcon = EmrConnection('<aws access key>', '<aws secret key>')

# <codecell>

# Using EMR's wordcount example
step = StreamingStep(
    name='My wordcount example',
    mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
    reducer='aggregate',
    input='s3n://elasticmapreduce/samples/wordcount/input',
    output='s3n://wambia660fall2013/output/wordcount_output')

# <codecell>

jobid = emrcon.run_jobflow(name='Word Count Example',
                           log_uri='s3://wambia660fall2013/logs',
                           steps=[step])

# <codecell>

print jobid

# <codecell>
Example #14
def test_create_job_flow():
    conn = boto.connect_emr()

    step1 = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    step2 = StreamingStep(
        name='My wordcount example2',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input2',
        output='s3n://output_bucket/output/wordcount_output2')

    job_id = conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        master_instance_type='m1.medium',
        slave_instance_type='m1.small',
        steps=[step1, step2],
    )

    job_flow = conn.describe_jobflow(job_id)
    job_flow.state.should.equal('STARTING')
    job_flow.jobflowid.should.equal(job_id)
    job_flow.name.should.equal('My jobflow')
    job_flow.masterinstancetype.should.equal('m1.medium')
    job_flow.slaveinstancetype.should.equal('m1.small')
    job_flow.loguri.should.equal('s3://some_bucket/jobflow_logs')
    job_flow.visibletoallusers.should.equal('False')
    int(job_flow.normalizedinstancehours).should.equal(0)
    job_step = job_flow.steps[0]
    job_step.name.should.equal('My wordcount example')
    job_step.state.should.equal('STARTING')
    args = [arg.value for arg in job_step.args]
    args.should.equal([
        '-mapper',
        's3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        '-reducer',
        'aggregate',
        '-input',
        's3n://elasticmapreduce/samples/wordcount/input',
        '-output',
        's3n://output_bucket/output/wordcount_output',
    ])

    job_step2 = job_flow.steps[1]
    job_step2.name.should.equal('My wordcount example2')
    job_step2.state.should.equal('PENDING')
    args = [arg.value for arg in job_step2.args]
    args.should.equal([
        '-mapper',
        's3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
        '-reducer',
        'aggregate',
        '-input',
        's3n://elasticmapreduce/samples/wordcount/input2',
        '-output',
        's3n://output_bucket/output/wordcount_output2',
    ])
Example #15
def test_steps():
    input_steps = [
        StreamingStep(
            name="My wordcount example",
            mapper="s3n://elasticmapreduce/samples/wordcount/wordSplitter.py",
            reducer="aggregate",
            input="s3n://elasticmapreduce/samples/wordcount/input",
            output="s3n://output_bucket/output/wordcount_output",
        ),
        StreamingStep(
            name="My wordcount example & co.",
            mapper="s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py",
            reducer="aggregate",
            input="s3n://elasticmapreduce/samples/wordcount/input2",
            output="s3n://output_bucket/output/wordcount_output2",
        ),
    ]

    # TODO: implementation and test for cancel_steps

    conn = boto.connect_emr()
    cluster_id = conn.run_jobflow(steps=[input_steps[0]], **run_jobflow_args)

    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(1)

    conn.add_jobflow_steps(cluster_id, [input_steps[1]])

    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(2)
    for step in jf.steps:
        step.actiononfailure.should.equal("TERMINATE_JOB_FLOW")
        list(arg.value for arg in step.args).should.have.length_of(8)
        step.creationdatetime.should.be.a(str)
        # step.enddatetime.should.be.a(str)
        step.jar.should.equal(
            "/home/hadoop/contrib/streaming/hadoop-streaming.jar")
        step.laststatechangereason.should.be.a(str)
        step.mainclass.should.equal("")
        step.name.should.be.a(str)
        # step.readydatetime.should.be.a(str)
        # step.startdatetime.should.be.a(str)
        step.state.should.be.within(["RUNNING", "PENDING"])

    expected = dict((s.name, s) for s in input_steps)

    steps = conn.list_steps(cluster_id).steps
    for x in steps:
        y = expected[x.name]
        # actiononfailure
        list(arg.value for arg in x.config.args).should.equal([
            "-mapper",
            y.mapper,
            "-reducer",
            y.reducer,
            "-input",
            y.input,
            "-output",
            y.output,
        ])
        x.config.jar.should.equal(
            "/home/hadoop/contrib/streaming/hadoop-streaming.jar")
        x.config.mainclass.should.equal("")
        # properties
        x.should.have.property("id").should.be.a(str)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(["RUNNING", "PENDING"])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(str)
        # x.status.timeline.enddatetime.should.be.a(str)
        # x.status.timeline.startdatetime.should.be.a(str)

        x = conn.describe_step(cluster_id, x.id)
        list(arg.value for arg in x.config.args).should.equal([
            "-mapper",
            y.mapper,
            "-reducer",
            y.reducer,
            "-input",
            y.input,
            "-output",
            y.output,
        ])
        x.config.jar.should.equal(
            "/home/hadoop/contrib/streaming/hadoop-streaming.jar")
        x.config.mainclass.should.equal("")
        # properties
        x.should.have.property("id").should.be.a(str)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(["RUNNING", "PENDING"])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(str)
        # x.status.timeline.enddatetime.should.be.a(str)
        # x.status.timeline.startdatetime.should.be.a(str)

    @requires_boto_gte("2.39")
    def test_list_steps_with_states():
        # boto's list_steps prior to 2.39 has a bug that ignores
        # step_states argument.
        steps = conn.list_steps(cluster_id).steps
        step_id = steps[0].id
        steps = conn.list_steps(cluster_id, step_states=["RUNNING"]).steps
        steps.should.have.length_of(1)
        steps[0].id.should.equal(step_id)

    test_list_steps_with_states()
Example #16
def test_steps():
    input_steps = [
        StreamingStep(
            name='My wordcount example',
            mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
            reducer='aggregate',
            input='s3n://elasticmapreduce/samples/wordcount/input',
            output='s3n://output_bucket/output/wordcount_output'),
        StreamingStep(
            name='My wordcount example2',
            mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
            reducer='aggregate',
            input='s3n://elasticmapreduce/samples/wordcount/input2',
            output='s3n://output_bucket/output/wordcount_output2')
    ]

    # TODO: implementation and test for cancel_steps

    conn = boto.connect_emr()
    cluster_id = conn.run_jobflow(
        steps=[input_steps[0]],
        **run_jobflow_args)

    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(1)

    conn.add_jobflow_steps(cluster_id, [input_steps[1]])

    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(2)
    for step in jf.steps:
        step.actiononfailure.should.equal('TERMINATE_JOB_FLOW')
        list(arg.value for arg in step.args).should.have.length_of(8)
        step.creationdatetime.should.be.a(six.string_types)
        # step.enddatetime.should.be.a(six.string_types)
        step.jar.should.equal(
            '/home/hadoop/contrib/streaming/hadoop-streaming.jar')
        step.laststatechangereason.should.be.a(six.string_types)
        step.mainclass.should.equal('')
        step.name.should.be.a(six.string_types)
        # step.readydatetime.should.be.a(six.string_types)
        # step.startdatetime.should.be.a(six.string_types)
        step.state.should.be.within(['STARTING', 'PENDING'])

    expected = dict((s.name, s) for s in input_steps)

    steps = conn.list_steps(cluster_id).steps
    for x in steps:
        y = expected[x.name]
        # actiononfailure
        list(arg.value for arg in x.config.args).should.equal([
            '-mapper', y.mapper,
            '-reducer', y.reducer,
            '-input', y.input,
            '-output', y.output,
        ])
        x.config.jar.should.equal(
            '/home/hadoop/contrib/streaming/hadoop-streaming.jar')
        x.config.mainclass.should.equal('')
        # properties
        x.should.have.property('id').should.be.a(six.string_types)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(['STARTING', 'PENDING'])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(six.string_types)
        # x.status.timeline.enddatetime.should.be.a(six.string_types)
        # x.status.timeline.startdatetime.should.be.a(six.string_types)

        x = conn.describe_step(cluster_id, x.id)
        list(arg.value for arg in x.config.args).should.equal([
            '-mapper', y.mapper,
            '-reducer', y.reducer,
            '-input', y.input,
            '-output', y.output,
        ])
        x.config.jar.should.equal(
            '/home/hadoop/contrib/streaming/hadoop-streaming.jar')
        x.config.mainclass.should.equal('')
        # properties
        x.should.have.property('id').should.be.a(six.string_types)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(['STARTING', 'PENDING'])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(six.string_types)
        # x.status.timeline.enddatetime.should.be.a(six.string_types)
        # x.status.timeline.startdatetime.should.be.a(six.string_types)

    @requires_boto_gte('2.39')
    def test_list_steps_with_states():
        # boto's list_steps prior to 2.39 has a bug that ignores
        # step_states argument.
        steps = conn.list_steps(cluster_id).steps
        step_id = steps[0].id
        steps = conn.list_steps(cluster_id, step_states=['STARTING']).steps
        steps.should.have.length_of(1)
        steps[0].id.should.equal(step_id)
    test_list_steps_with_states()
Example #17
from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep
import boto

# Credentials redacted -- substitute your own keys.
AWS_KEY = '<aws access key>'
AWS_SECRET = '<aws secret key>'

conn = EmrConnection(AWS_KEY, AWS_SECRET)

step = StreamingStep(
    name='My wordcount example',
    mapper='s3n://css739/wordcount/bigramSplitter.py',
    reducer='aggregate',
    input='s3n://smalldata/wikipedia_titles.txt',
    output='s3n://css739/wordcount/bigram_count_output2',
    cache_files=['s3n://css739/wordcount/english_stoplist.py'])

jobid = conn.run_jobflow(name='My jobflow',
                         log_uri='s3n://css739/wordcount/jobflow_logs',
                         steps=[step])

print(conn.describe_jobflow(jobid).state)
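
describe_jobflow returns a point-in-time snapshot, so callers usually poll until the job flow reaches a terminal state. A minimal polling sketch (the 30-second interval and the set of terminal states checked are assumptions):

import time

while True:
    state = conn.describe_jobflow(jobid).state
    print(state)
    if state in ('COMPLETED', 'FAILED', 'TERMINATED'):
        break
    time.sleep(30)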