Beispiel #1
0
    def get_instance_groups(self):
        """Build the list of InstanceGroup objects used to start a cluster.

        The bid price is derived from ``self.level``, an index into the
        configured ``price_upgrade_rate`` table (clamped to a valid index).
        Groups with a positive bid price are requested as SPOT instances;
        a zero price falls back to ON_DEMAND.
        """
        groups = []
        # self.level is loop-invariant, so clamp it to a valid index once.
        level = max(0, min(self.level,
                           len(self.prop.emr.price_upgrade_rate) - 1))
        for num, group_name, instance_type in self.prop.emr.instance_groups:
            bprice = (self.prop.emr.prices[instance_type] *
                      self.prop.emr.price_upgrade_rate[level])
            name = '%s-%s@%f' % (group_name, 'SPOT', bprice)

            # A zero price means "use an on-demand instance instead".
            if bprice > 0:
                group = InstanceGroup(num, group_name, instance_type,
                                      'SPOT', name, '%.3f' % bprice)
            else:
                group = InstanceGroup(num, group_name, instance_type,
                                      'ON_DEMAND', name)
            groups.append(group)

        return groups
Beispiel #2
0
def create_new_cluster(conn,
                       s3_bucket,
                       cluster_name,
                       keep_alive=True,
                       worker_type="m1.small",
                       worker_count=2):

    # Note: for testing purposes, you can run ami_version 2.4.9 on m1.small instances.
    # For newer Hadoop, use 3.3.1 and m1.medium or m3.xlarge
    # on-demand prices for m1.small and m1.medium are 0.047 and 0.095 respectively
    master_node = "m1.medium"
    ami_version = "3.3.1"
    bid_price = "0.012"
    spot_count = 6
    if worker_type == "m1.small":
        master_node = "m1.small"
        ami_version = "2.4.9"
        bid_price = "0.012"
        spot_count = 2

    instance_groups = []
    instance_groups.append(
        InstanceGroup(name="Main node",
                      role="MASTER",
                      num_instances=1,
                      type=master_node,
                      market="ON_DEMAND"))
    instance_groups.append(
        InstanceGroup(name="Worker nodes",
                      role="CORE",
                      num_instances=worker_count,
                      type=worker_type,
                      market="ON_DEMAND"))
    instance_groups.append(
        InstanceGroup(name="Optional spot-price nodes",
                      role="TASK",
                      num_instances=spot_count,
                      type="m1.medium",
                      market="SPOT",
                      bidprice=bid_price))

    cluster_id = conn.run_jobflow(cluster_name,
                                  instance_groups=instance_groups,
                                  action_on_failure='CANCEL_AND_WAIT',
                                  keep_alive=keep_alive,
                                  enable_debugging=True,
                                  log_uri="s3://{0}/logs/".format(s3_bucket),
                                  ami_version=ami_version,
                                  bootstrap_actions=[],
                                  additional_info=None,
                                  ec2_keyname="hadoop-seminar-emr",
                                  visible_to_all_users=True,
                                  job_flow_role="EMR_EC2_DefaultRole",
                                  service_role="EMR_DefaultRole")

    conn.add_tags(cluster_id, {'Name': "EMR Cluster " + cluster_id})

    print "Starting cluster", cluster_id, cluster_name
    if keep_alive:
        print "Note: cluster will be left running, remember to terminate it manually after you are done!"
def create_emr(R):
    """Launch an EMR jobflow running the MC_Method streaming step.

    R is the total node count: one MASTER plus R - 1 CORE nodes.
    Returns a (cluster_id, connection) tuple.
    """
    # Disable certificate validation for the EMR endpoint.
    if not boto.config.has_section('Boto'):
        boto.config.add_section('Boto')
    boto.config.set('Boto', 'https_validate_certificates', 'False')

    step = StreamingStep(name='MC_Method example',
                         cache_files=['s3n://bucket774/map.py#map.py'],
                         mapper='map.py',
                         input='s3://bucket774/input/',
                         output='s3://bucket774/output/')

    conn = EmrConnection(access_id, access_key)

    groups = [
        InstanceGroup(num_instances=1,
                      role="MASTER",
                      type='m4.large',
                      market="ON_DEMAND",
                      name="Master nodes"),
    ]
    # Core nodes exist only when more than one instance was requested.
    if R > 1:
        groups.append(
            InstanceGroup(num_instances=R - 1,
                          role="CORE",
                          type='m4.large',
                          market="ON_DEMAND",
                          name="Slave nodes"))

    cluster_id = conn.run_jobflow(name='test MC_method run',
                                  instance_groups=groups,
                                  enable_debugging=False,
                                  steps=[step],
                                  visible_to_all_users=True,
                                  keep_alive=True,
                                  job_flow_role="EMR_EC2_DefaultRole",
                                  service_role="EMR_DefaultRole",
                                  hadoop_version='2.4.0',
                                  log_uri='s3://bucket774/log')
    return cluster_id, conn
Beispiel #4
0
 def test_bidprice_string(self):
     """
     Test InstanceGroup init works with bidprice type = string.
     """
     instance_group = InstanceGroup(1, 'MASTER', 'm1.small',
                                    'SPOT', 'master', bidprice='1.1')
     # assertEquals is a deprecated alias (removed in Python 3.12);
     # use assertEqual.
     self.assertEqual('1.1', instance_group.bidprice)
Beispiel #5
0
 def test_bidprice_Decimal(self):
     """
     Test InstanceGroup init works with bidprice type = Decimal.
     """
     instance_group = InstanceGroup(1, 'MASTER', 'm1.small',
                                    'SPOT', 'master', bidprice=Decimal(1.10))
     # Only compare the first 4 chars: Decimal(1.10) carries float noise.
     # assertEquals is a deprecated alias (removed in Python 3.12);
     # use assertEqual.
     self.assertEqual('1.10', instance_group.bidprice[:4])
 def test_bidprice_missing_spot(self):
     """
     Test InstanceGroup init raises ValueError when market==spot and
     bidprice is not specified.
     """
     # Call form of assertRaisesRegexp: pass the callable and its args.
     self.assertRaisesRegexp(ValueError, 'bidprice must be specified',
                             InstanceGroup,
                             1, 'MASTER', 'm1.small', 'SPOT', 'master')
 def test_bidprice_missing_ondemand(self):
     """
     Test InstanceGroup init accepts a missing bidprice arg, when market is
     ON_DEMAND.
     """
     # Constructing without a bidprice must simply not raise.
     InstanceGroup(1, 'MASTER', 'm1.small', 'ON_DEMAND', 'master')
Beispiel #8
0
def test_create_instance_groups():
    """An instance group added to a running jobflow shows up, fully
    populated, in describe_jobflows output."""
    conn = boto.connect_emr()

    wordcount_step = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    job_id = conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[wordcount_step],
    )

    task_group = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07',
                               '0.07')
    response = conn.add_instance_groups(job_id, [task_group])
    group_id = response.instancegroupids

    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(6)
    created = job_flow.instancegroups[0]
    created.instancegroupid.should.equal(group_id)
    int(created.instancerunningcount).should.equal(6)
    created.instancerole.should.equal('TASK')
    created.instancetype.should.equal('c1.medium')
    created.market.should.equal('SPOT')
    created.name.should.equal('spot-0.07')
    created.bidprice.should.equal('0.07')
 def test_bidprice_float(self):
     """
     Test InstanceGroup init works with bidprice type = float.
     """
     instance_group = InstanceGroup(1,
                                    'parent',
                                    'm1.small',
                                    'SPOT',
                                    'parent',
                                    bidprice=1.1)
     # assertEquals is a deprecated alias (removed in Python 3.12);
     # use assertEqual.
     self.assertEqual('1.1', instance_group.bidprice)
Beispiel #10
0
def test_modify_instance_groups():
    """modify_instance_groups resizes existing groups and the new counts
    are visible via describe_jobflows."""
    conn = boto.connect_emr()

    wordcount_step = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    job_id = conn.run_jobflow(name='My jobflow',
                              log_uri='s3://some_bucket/jobflow_logs',
                              steps=[wordcount_step])

    # Two identical 6-node task groups.
    task_groups = [
        InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')
        for _ in range(2)
    ]
    response = conn.add_instance_groups(job_id, task_groups)
    group_ids = response.instancegroupids.split(",")

    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(12)
    int(job_flow.instancegroups[0].instancerunningcount).should.equal(6)

    # Shrink the two groups to 2 and 3 instances respectively.
    conn.modify_instance_groups(group_ids, [2, 3])

    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(5)
    groups_by_id = {group.instancegroupid: group
                    for group in job_flow.instancegroups}
    int(groups_by_id[group_ids[0]].instancerunningcount).should.equal(2)
    int(groups_by_id[group_ids[1]].instancerunningcount).should.equal(3)
Beispiel #11
0
def launch_cluster(script_name,
                   keep_alive=False,
                   instance_types=None,
                   subnet_id=None):
    """Launch a new EMR cluster and return its jobflow id.

    instance_types is either None (defaults to 3 x m2.4xlarge) or a
    string of the form "TYPE" or "TYPE:COUNT".

    Raises ValueError if instance_types cannot be parsed.
    """
    if instance_types is None:
        instance_type = 'm2.4xlarge'
        instance_count = 3
    else:
        # Raw string so \d is a regex digit class, not a string escape.
        match = re.match(r'^([^:]+)(:\d+)?$', instance_types)
        if not match:
            raise ValueError('invalid instance types: %s' % instance_types)
        instance_type, count_part = match.groups()
        # BUG FIX: the ":COUNT" suffix is optional in the regex, but the
        # old code unconditionally did instance_count[1:] and crashed with
        # a TypeError on None when the suffix was absent.  Fall back to
        # the same default count used for the None case.
        instance_count = int(count_part[1:]) if count_part else 3
    instance_groups = [
        InstanceGroup(1, 'MASTER', instance_type, 'ON_DEMAND', 'MASTER_GROUP'),
        InstanceGroup(instance_count, 'CORE', instance_type, 'ON_DEMAND',
                      'CORE_GROUP')
    ]
    bootstrap_actions = [
        BootstrapAction('install-pig', install_pig_script, [pig_version]),
    ]
    api_params = {}
    if subnet_id is not None:
        api_params['Instances.Ec2SubnetId'] = subnet_id
    name = name_prefix + '-' + script_name
    jobid = emr_conn.run_jobflow(name=name,
                                 keep_alive=keep_alive,
                                 ami_version=ami_version,
                                 visible_to_all_users=True,
                                 ec2_keyname=ec2_keyname,
                                 service_role='EMR_DefaultRole',
                                 job_flow_role='EMR_EC2_DefaultRole',
                                 log_uri=log_uri,
                                 action_on_failure='CONTINUE',
                                 instance_groups=instance_groups,
                                 bootstrap_actions=bootstrap_actions,
                                 api_params=api_params)
    print('launched %s (%s)' % (name, jobid))
    return jobid
Beispiel #12
0

# Common keyword arguments shared by the run_jobflow calls in these tests.
run_jobflow_args = {
    'job_flow_role': 'EMR_EC2_DefaultRole',
    'keep_alive': True,
    'log_uri': 's3://some_bucket/jobflow_logs',
    'master_instance_type': 'c1.medium',
    'name': 'My jobflow',
    'num_instances': 2,
    'service_role': 'EMR_DefaultRole',
    'slave_instance_type': 'c1.medium',
}


# Instance-group fixtures.  Positional args are
# (num_instances, role, type, market, name[, bidprice]).
input_instance_groups = [
    InstanceGroup(*spec) for spec in (
        (1, 'MASTER', 'c1.medium', 'ON_DEMAND', 'master'),
        (3, 'CORE', 'c1.medium', 'ON_DEMAND', 'core'),
        (6, 'TASK', 'c1.large', 'SPOT', 'task-1', '0.07'),
        (10, 'TASK', 'c1.xlarge', 'SPOT', 'task-2', '0.05'),
    )
]


@mock_emr_deprecated
def test_describe_cluster():
    conn = boto.connect_emr()
    args = run_jobflow_args.copy()
    args.update(dict(
        api_params={
            'Applications.member.1.Name': 'Spark',
            'Applications.member.1.Version': '2.4.2',
            'Configurations.member.1.Classification': 'yarn-site',
Beispiel #13
0
                "-input",
                "s3://elasticmapreduce/samples/wordcount/input2",
                "-output",
                "s3://output_bucket/output/wordcount_output2",
                "-reducer",
                "aggregate",
            ],
            "Jar":
            "command-runner.jar",
        },
        "Name": "My wordcount example2",
    },
]

# Instance-group fixtures.  Positional args are
# (num_instances, role, type, market, name).
input_instance_groups = [
    InstanceGroup(*spec) for spec in (
        (1, "MASTER", "c1.medium", "ON_DEMAND", "master"),
        (3, "CORE", "c1.medium", "ON_DEMAND", "core"),
        (6, "TASK", "c1.large", "ON_DEMAND", "task"),
    )
]

# Request payload for creating a scheduled test EMR cluster.
cluster_create_request = dict(
    api_request_id="test_emr_create",
    sub_type="nonkerb",
    role="testing",
    account="example_account",
    # Epoch-seconds suffix keeps each scheduled run's cluster name unique.
    name="scheduled-testing-%d" % int(time.time()),
    core_instance_count="1",
    task_instance_count="3",
    task_ebs_vol_size="180",
    custom_ami_id="ami-075ac68c1cf8ba1c8",
)
Beispiel #14
0
from boto.emr.connection import EmrConnection
from boto.emr.instance_group import InstanceGroup

# Description:
# The InstanceGroup object can be useful for customizing
# the nodes of an EMR(Elastic Map Reduce) job.

# build up our instance groups
namenode_instance_group = InstanceGroup(num_instances=1,
                                        role="MASTER",
                                        type="c1.xlarge",
                                        market="ON_DEMAND",
                                        name="MASTER_GROUP")

# BUG FIX: this group was declared with role="MASTER"/name="MASTER_GROUP"
# (copy-pasted from the group above), and as a SPOT group it lacked the
# mandatory bidprice -- boto's InstanceGroup raises ValueError for a SPOT
# market without one.
core_nodes = InstanceGroup(num_instances=20,
                           role="CORE",
                           type="c1.xlarge",
                           market="SPOT",
                           name="CORE_GROUP",
                           bidprice="0.25")  # TODO: tune this spot bid price

task_nodes = InstanceGroup(num_instances=10,
                           role="TASK",
                           type="c1.xlarge",
                           market="ON_DEMAND",
                           name="INITIAL_TASK_GROUP")

instance_groups = [namenode_instance_group, core_nodes, task_nodes]


# run the job
conn = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")
Beispiel #15
0
import datetime
import os

import boto
from boto.emr.instance_group import InstanceGroup
from boto.emr.step import InstallPigStep, PigStep


conn = boto.connect_emr()

instance_groups = [
    InstanceGroup(1, 'MASTER', 'm1.small', 'SPOT', '[email protected]', '0.10'),
    InstanceGroup(2, 'CORE', 'm1.small', 'SPOT', '[email protected]', '0.10'),
]

pig_file = 's3://elasticmapreduce/samples/pig-apache/do-reports2.pig'
INPUT = 's3://elasticmapreduce/samples/pig-apache/input/'
OUTPUT = ('s3://org.unencrypted.emr.output/apache_sample/%s' %
          datetime.datetime.utcnow().strftime("%s"))

print """\
Running pig job with settings:

    SCRIPT={script}
    INPUT={input}
    OUPUT={output}
""".format(script=pig_file, input=INPUT, output=OUTPUT)

pig_args = ['-p', 'INPUT=%s' % INPUT,
            '-p', 'OUTPUT=%s' % OUTPUT]
Beispiel #16
0
        usage()

for p, v in params.iteritems():
	print "param:" + `p`+ " value:" + `v`

conn = boto.connect_emr(params['aws_key'],params['secret'])

bootstrap_step1 = BootstrapAction("install_cc", "s3://commoncrawl-public/config64.sh",[params['aws_key'], params['secret']])
bootstrap_step2 = BootstrapAction("configure_hadoop", "s3://elasticmapreduce/bootstrap-actions/configure-hadoop",
	[
	"-m","mapred.tasktracker.map.tasks.maximum=8",
	"-m","mapred.child.java.opts=-XX:ErrorFile=/tmp/hs_err_${mapred.tip.id}.log -Xmx700m -XX:+UseParNewGC -XX:ParallelGCThreads=8 -XX:NewSize=100m -XX:+UseConcMarkSweepGC -XX:+UseTLAB -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:CMSIncrementalDutyCycle=10"
	])
bootstrap_step3 = BootstrapAction("configure_jobtrackerheap", "s3://elasticmapreduce/bootstrap-actions/configure-daemons",["--jobtracker-heap-size=12096"])

namenode_instance_group = InstanceGroup(1,"MASTER","c1.xlarge","ON_DEMAND","MASTER_GROUP")
core_instance_group = InstanceGroup(params['num_core'],"CORE","c1.xlarge","ON_DEMAND","CORE_GROUP")

instance_groups=[]
if params['num_spot'] <= 0:
	instance_groups=[namenode_instance_group,core_instance_group]
else:
	
	if not params['spot_bid_price']:
		print '\nERROR:You must specify a spot bid price to use spot instances!'
		usage()
	
	spot_instance_group = InstanceGroup(params['num_spot'],"TASK","c1.xlarge","SPOT","INITIAL_TASK_GROUP",params['spot_bid_price'])
		
	instance_groups=[namenode_instance_group,core_instance_group,spot_instance_group]
Beispiel #17
0
from moto import mock_emr_deprecated
from tests.helpers import requires_boto_gte

# Common keyword arguments shared by the run_jobflow calls in these tests.
run_jobflow_args = {
    "job_flow_role": "EMR_EC2_DefaultRole",
    "keep_alive": True,
    "log_uri": "s3://some_bucket/jobflow_logs",
    "master_instance_type": "c1.medium",
    "name": "My jobflow",
    "num_instances": 2,
    "service_role": "EMR_DefaultRole",
    "slave_instance_type": "c1.medium",
}

# Instance-group fixtures.  Positional args are
# (num_instances, role, type, market, name[, bidprice]).
input_instance_groups = [
    InstanceGroup(*spec) for spec in (
        (1, "MASTER", "c1.medium", "ON_DEMAND", "master"),
        (3, "CORE", "c1.medium", "ON_DEMAND", "core"),
        (6, "TASK", "c1.large", "SPOT", "task-1", "0.07"),
        (10, "TASK", "c1.xlarge", "SPOT", "task-2", "0.05"),
    )
]


# Has boto3 equivalent
@mock_emr_deprecated
def test_describe_cluster():
    conn = boto.connect_emr()
    args = run_jobflow_args.copy()
    args.update(
        dict(
            api_params={
                "Applications.member.1.Name": "Spark",
Beispiel #18
0
def create_emr_cluster(cr):
    """
    @PARAM:  cr -- cluster configuration reader object (exposes get_config)
    Creates an EMR cluster given a set of configuration parameters,
    submits an s3-dist-cp data-loading step, then polls until the cluster
    reaches a usable or terminal state.
    Return:  EMR Cluster ID on success, the string "ERROR" if the cluster
             shut down or failed during startup.
    """

    #region = cr.get_config("aws_region")
    #conn = boto.emr.connect_to_region(region)
    # Connect to the configured region via an explicitly-built endpoint.
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region=RegionInfo(name=cr.get_config("aws_region"),
                          endpoint=cr.get_config("aws_region") +
                          ".elasticmapreduce.amazonaws.com"))

    #  Create list of instance groups:  master, core, and task
    instance_groups = []
    instance_groups.append(
        InstanceGroup(num_instances=cr.get_config("emr_master_node_count"),
                      role="MASTER",
                      type=cr.get_config("emr_master_node_type"),
                      market=cr.get_config("emr_market_type"),
                      name="Master Node"))

    instance_groups.append(
        InstanceGroup(num_instances=cr.get_config("emr_core_node_count"),
                      role="CORE",
                      type=cr.get_config("emr_core_node_type"),
                      market=cr.get_config("emr_market_type"),
                      name="Core Node"))

    #  Only create task nodes if specifically asked for
    if cr.get_config("emr_task_node_count") > 0:
        instance_groups.append(
            InstanceGroup(num_instances=cr.get_config("emr_task_node_count"),
                          role="TASK",
                          type=cr.get_config("emr_task_node_type"),
                          market=cr.get_config("emr_market_type"),
                          name="Task Node"))

    print "Creating EMR Cluster with instance groups: {0}".format(
        instance_groups)

    #  Use these params to add overrides, these will go away in Boto3
    api_params = {
        "Instances.Ec2SubnetId": cr.get_config("aws_subnet_id"),
        "ReleaseLabel": cr.get_config("emr_version")
    }

    #  Add an s3-dist-cp step to load the automation test data into HDFS
    step_args = [
        "s3-dist-cp", "--s3Endpoint=s3-us-west-1.amazonaws.com",
        "--src=s3://alpine-qa/automation/automation_test_data/",
        "--dest=hdfs:///automation_test_data", "--srcPattern=.*[a-zA-Z,]+"
    ]
    step = JarStep(name="s3distcp for data loading",
                   jar="command-runner.jar",
                   step_args=step_args,
                   action_on_failure="CONTINUE")

    cluster_id = conn.run_jobflow(
        cr.get_config("emr_cluster_name"),
        instance_groups=instance_groups,
        action_on_failure="TERMINATE_JOB_FLOW",
        keep_alive=True,
        enable_debugging=True,
        log_uri=cr.get_config("emr_log_uri"),
        #hadoop_version = "Amazon 2.7.2",
        #ReleaseLabel = "emr-5.0.0",
        #ami_version = "5.0.0",
        steps=[step],
        bootstrap_actions=[],
        ec2_keyname=cr.get_config("ec2_keyname"),
        visible_to_all_users=True,
        job_flow_role="EMR_EC2_DefaultRole",
        service_role="EMR_DefaultRole",
        api_params=api_params)

    print "EMR Cluster created, cluster id: {0}".format(cluster_id)
    # Poll every 5 s until the cluster reaches COMPLETED, SHUTTING_DOWN,
    # FAILED, or WAITING.
    state = conn.describe_cluster(cluster_id).status.state
    while state != u'COMPLETED' and state != u'SHUTTING_DOWN' and state != u'FAILED' and state != u'WAITING':
        #sleeping to recheck for status.
        time.sleep(5)
        state = conn.describe_cluster(cluster_id).status.state
        print "State is: {0}, sleeping 5s...".format(state)

    if state == u'SHUTTING_DOWN' or state == u'FAILED':
        return "ERROR"

    #Check if the state is WAITING. Then launch the next steps
    if state == u'WAITING':
        #Finding the master node dns of EMR cluster
        master_dns = conn.describe_cluster(cluster_id).masterpublicdnsname
        print "DNS Name: {0}".format(master_dns)
        return cluster_id
    # NOTE(review): if the loop exits in the COMPLETED state, control falls
    # through and the function implicitly returns None -- confirm callers
    # handle that case.