def get_instance_groups(self):
    '''Get instance groups to start a cluster.

    The bid price is derived from self.level, which selects an entry in
    the price upgrade table to apply on top of the base instance price.
    '''
    instance_groups = []
    for group in self.prop.emr.instance_groups:
        (num, group_name, instance_type) = group
        # Clamp level into the valid index range:
        # 0 <= level < len(price_upgrade_rate).
        level = max(0, min(self.level,
                           len(self.prop.emr.price_upgrade_rate) - 1))
        bprice = (self.prop.emr.prices[instance_type] *
                  self.prop.emr.price_upgrade_rate[level])
        name = '%s-%s@%f' % (group_name, 'SPOT', bprice)

        # Fall back to an on-demand instance when the computed price is zero.
        if bprice > 0:
            ig = InstanceGroup(num, group_name, instance_type, 'SPOT', name,
                               '%.3f' % bprice)
        else:
            ig = InstanceGroup(num, group_name, instance_type, 'ON_DEMAND',
                               name)

        instance_groups.append(ig)
    return instance_groups
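# A minimal worked example of the bid-price rule above. The base price and
# upgrade table are illustrative assumptions, not values from the snippet:
# the level is clamped into the table's index range, then
# bprice = base price * upgrade rate.
prices = {'m1.small': 0.047}
price_upgrade_rate = [1.0, 1.2, 1.5]
level = 5                                                # requested, out of range
level = max(0, min(level, len(price_upgrade_rate) - 1))  # clamped to 2
bprice = prices['m1.small'] * price_upgrade_rate[level]  # 0.047 * 1.5 = 0.0705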
def create_new_cluster(conn, s3_bucket, cluster_name, keep_alive=True,
                       worker_type="m1.small", worker_count=2):
    # Note: for testing purposes, you can run ami_version 2.4.9 on m1.small
    # instances. For newer Hadoop, use 3.3.1 and m1.medium or m3.xlarge.
    # On-demand prices for m1.small and m1.medium are 0.047 and 0.095
    # respectively.
    master_node = "m1.medium"
    ami_version = "3.3.1"
    bid_price = "0.012"
    spot_count = 6
    if worker_type == "m1.small":
        master_node = "m1.small"
        ami_version = "2.4.9"
        bid_price = "0.012"
        spot_count = 2

    instance_groups = []
    instance_groups.append(
        InstanceGroup(name="Main node", role="MASTER", num_instances=1,
                      type=master_node, market="ON_DEMAND"))
    instance_groups.append(
        InstanceGroup(name="Worker nodes", role="CORE",
                      num_instances=worker_count, type=worker_type,
                      market="ON_DEMAND"))
    instance_groups.append(
        InstanceGroup(name="Optional spot-price nodes", role="TASK",
                      num_instances=spot_count, type="m1.medium",
                      market="SPOT", bidprice=bid_price))

    cluster_id = conn.run_jobflow(cluster_name,
                                  instance_groups=instance_groups,
                                  action_on_failure='CANCEL_AND_WAIT',
                                  keep_alive=keep_alive,
                                  enable_debugging=True,
                                  log_uri="s3://{0}/logs/".format(s3_bucket),
                                  ami_version=ami_version,
                                  bootstrap_actions=[],
                                  additional_info=None,
                                  ec2_keyname="hadoop-seminar-emr",
                                  visible_to_all_users=True,
                                  job_flow_role="EMR_EC2_DefaultRole",
                                  service_role="EMR_DefaultRole")
    conn.add_tags(cluster_id, {'Name': "EMR Cluster " + cluster_id})

    print "Starting cluster", cluster_id, cluster_name
    if keep_alive:
        print "Note: cluster will be left running, remember to terminate it manually after you are done!"
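# A hedged usage sketch for create_new_cluster(): connect with boto's legacy
# EMR API and start a small test cluster. The region and bucket name are
# illustrative assumptions, not values taken from the function above.
import boto.emr

conn = boto.emr.connect_to_region("us-east-1")
create_new_cluster(conn, s3_bucket="my-emr-bucket",
                   cluster_name="test-cluster", keep_alive=False)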
def create_emr(R):
    if not boto.config.has_section('Boto'):
        boto.config.add_section('Boto')
    boto.config.set('Boto', 'https_validate_certificates', 'False')

    step = StreamingStep(name='MC_Method example',
                         cache_files=['s3n://bucket774/map.py#map.py'],
                         mapper='map.py',
                         input='s3://bucket774/input/',
                         output='s3://bucket774/output/')
    conn = EmrConnection(access_id, access_key)

    instance_groups = []
    instance_groups.append(
        InstanceGroup(num_instances=1, role="MASTER", type='m4.large',
                      market="ON_DEMAND", name="Master nodes"))
    if R > 1:
        instance_groups.append(
            InstanceGroup(num_instances=R - 1, role="CORE", type='m4.large',
                          market="ON_DEMAND", name="Slave nodes"))

    cluster_id = conn.run_jobflow(name='test MC_method run',
                                  instance_groups=instance_groups,
                                  enable_debugging=False,
                                  steps=[step],
                                  visible_to_all_users=True,
                                  keep_alive=True,
                                  job_flow_role="EMR_EC2_DefaultRole",
                                  service_role="EMR_DefaultRole",
                                  hadoop_version='2.4.0',
                                  log_uri='s3://bucket774/log')
    return cluster_id, conn
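# A hedged usage sketch for create_emr(): R is the total node count (one
# master plus R - 1 core nodes). Polling via describe_jobflow() is an
# illustration; the function above only returns the id and connection.
cluster_id, conn = create_emr(4)
status = conn.describe_jobflow(cluster_id)
print status.state  # e.g. STARTING, RUNNING, WAITING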
def test_bidprice_string(self):
    """
    Test InstanceGroup init works with bidprice type = string.
    """
    instance_group = InstanceGroup(1, 'MASTER', 'm1.small', 'SPOT', 'master',
                                   bidprice='1.1')
    self.assertEquals('1.1', instance_group.bidprice)
def test_bidprice_Decimal(self):
    """
    Test InstanceGroup init works with bidprice type = Decimal.
    """
    # Decimal(1.10) is built from a float, so it is inexact; comparing the
    # first four characters sidesteps the trailing digits.
    instance_group = InstanceGroup(1, 'MASTER', 'm1.small', 'SPOT', 'master',
                                   bidprice=Decimal(1.10))
    self.assertEquals('1.10', instance_group.bidprice[:4])
def test_bidprice_missing_spot(self):
    """
    Test InstanceGroup init raises ValueError when market == SPOT
    and bidprice is not specified.
    """
    with self.assertRaisesRegexp(ValueError, 'bidprice must be specified'):
        InstanceGroup(1, 'MASTER', 'm1.small', 'SPOT', 'master')
def test_bidprice_missing_ondemand(self):
    """
    Test InstanceGroup init accepts a missing bidprice arg
    when market is ON_DEMAND.
    """
    instance_group = InstanceGroup(1, 'MASTER', 'm1.small', 'ON_DEMAND',
                                   'master')
def test_create_instance_groups():
    conn = boto.connect_emr()

    step1 = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    job_id = conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[step1],
    )

    instance_group = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT',
                                   'spot-0.07', '0.07')
    instance_group = conn.add_instance_groups(job_id, [instance_group])
    instance_group_id = instance_group.instancegroupids

    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(6)

    instance_group = job_flow.instancegroups[0]
    instance_group.instancegroupid.should.equal(instance_group_id)
    int(instance_group.instancerunningcount).should.equal(6)
    instance_group.instancerole.should.equal('TASK')
    instance_group.instancetype.should.equal('c1.medium')
    instance_group.market.should.equal('SPOT')
    instance_group.name.should.equal('spot-0.07')
    instance_group.bidprice.should.equal('0.07')
def test_bidprice_float(self):
    """
    Test InstanceGroup init works with bidprice type = float.
    """
    instance_group = InstanceGroup(1, 'parent', 'm1.small', 'SPOT', 'parent',
                                   bidprice=1.1)
    self.assertEquals('1.1', instance_group.bidprice)
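# Taken together, the bidprice tests above exercise the positional signature
# InstanceGroup(num_instances, role, type, market, name, bidprice=None):
# strings pass through unchanged, while floats and Decimals are stringified.
# A minimal sketch of two equivalent spot groups (values illustrative):
g1 = InstanceGroup(1, 'TASK', 'm1.small', 'SPOT', 'task', bidprice='1.1')
g2 = InstanceGroup(1, 'TASK', 'm1.small', 'SPOT', 'task', bidprice=1.1)
assert g1.bidprice == g2.bidprice == '1.1'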
def test_modify_instance_groups():
    conn = boto.connect_emr()

    step1 = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    job_id = conn.run_jobflow(name='My jobflow',
                              log_uri='s3://some_bucket/jobflow_logs',
                              steps=[step1])

    instance_group1 = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT',
                                    'spot-0.07', '0.07')
    instance_group2 = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT',
                                    'spot-0.07', '0.07')
    instance_group = conn.add_instance_groups(
        job_id, [instance_group1, instance_group2])
    instance_group_ids = instance_group.instancegroupids.split(",")

    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(12)
    instance_group = job_flow.instancegroups[0]
    int(instance_group.instancerunningcount).should.equal(6)

    conn.modify_instance_groups(instance_group_ids, [2, 3])

    job_flow = conn.describe_jobflows()[0]
    int(job_flow.instancecount).should.equal(5)
    instance_group1 = [
        group for group in job_flow.instancegroups
        if group.instancegroupid == instance_group_ids[0]
    ][0]
    int(instance_group1.instancerunningcount).should.equal(2)
    instance_group2 = [
        group for group in job_flow.instancegroups
        if group.instancegroupid == instance_group_ids[1]
    ][0]
    int(instance_group2.instancerunningcount).should.equal(3)
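# The resize call exercised above, in isolation: modify_instance_groups()
# takes parallel lists of group ids and new target sizes. The ids below are
# illustrative placeholders.
conn.modify_instance_groups(['ig-ABC123', 'ig-DEF456'], [2, 3])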
def launch_cluster(script_name, keep_alive=False, instance_types=None,
                   subnet_id=None):
    '''Launch a new cluster.'''
    if instance_types is None:
        instance_type = 'm2.4xlarge'
        instance_count = 3
    else:
        match = re.match(r'^([^:]+)(:\d+)?$', instance_types)
        if not match:
            raise ValueError('invalid instance types: %s' % instance_types)
        instance_type, instance_count = match.groups()
        # The ":count" suffix is optional; fall back to the default of 3.
        instance_count = int(instance_count[1:]) if instance_count else 3

    instance_groups = [
        InstanceGroup(1, 'MASTER', instance_type, 'ON_DEMAND', 'MASTER_GROUP'),
        InstanceGroup(instance_count, 'CORE', instance_type, 'ON_DEMAND',
                      'CORE_GROUP')
    ]
    bootstrap_actions = [
        BootstrapAction('install-pig', install_pig_script, [pig_version]),
    ]

    api_params = {}
    if subnet_id is not None:
        api_params['Instances.Ec2SubnetId'] = subnet_id

    name = name_prefix + '-' + script_name
    jobid = emr_conn.run_jobflow(name=name,
                                 keep_alive=keep_alive,
                                 ami_version=ami_version,
                                 visible_to_all_users=True,
                                 ec2_keyname=ec2_keyname,
                                 service_role='EMR_DefaultRole',
                                 job_flow_role='EMR_EC2_DefaultRole',
                                 log_uri=log_uri,
                                 action_on_failure='CONTINUE',
                                 instance_groups=instance_groups,
                                 bootstrap_actions=bootstrap_actions,
                                 api_params=api_params)
    print('launched %s (%s)' % (name, jobid))
    return jobid
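# A hedged usage sketch for launch_cluster(): instance_types is either
# "type" or "type:count". Both calls assume the module-level globals used
# above (emr_conn, name_prefix, ami_version, ec2_keyname, log_uri,
# install_pig_script, pig_version) are already defined.
launch_cluster('daily-report')  # defaults: 3 m2.4xlarge core nodes
launch_cluster('daily-report', instance_types='m1.large:5', keep_alive=True)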
run_jobflow_args = dict(
    job_flow_role='EMR_EC2_DefaultRole',
    keep_alive=True,
    log_uri='s3://some_bucket/jobflow_logs',
    master_instance_type='c1.medium',
    name='My jobflow',
    num_instances=2,
    service_role='EMR_DefaultRole',
    slave_instance_type='c1.medium',
)

input_instance_groups = [
    InstanceGroup(1, 'MASTER', 'c1.medium', 'ON_DEMAND', 'master'),
    InstanceGroup(3, 'CORE', 'c1.medium', 'ON_DEMAND', 'core'),
    InstanceGroup(6, 'TASK', 'c1.large', 'SPOT', 'task-1', '0.07'),
    InstanceGroup(10, 'TASK', 'c1.xlarge', 'SPOT', 'task-2', '0.05'),
]


@mock_emr_deprecated
def test_describe_cluster():
    conn = boto.connect_emr()
    args = run_jobflow_args.copy()
    args.update(dict(
        api_params={
            'Applications.member.1.Name': 'Spark',
            'Applications.member.1.Version': '2.4.2',
            'Configurations.member.1.Classification': 'yarn-site',
"-input", "s3://elasticmapreduce/samples/wordcount/input2", "-output", "s3://output_bucket/output/wordcount_output2", "-reducer", "aggregate", ], "Jar": "command-runner.jar", }, "Name": "My wordcount example2", }, ] input_instance_groups = [ InstanceGroup(1, "MASTER", "c1.medium", "ON_DEMAND", "master"), InstanceGroup(3, "CORE", "c1.medium", "ON_DEMAND", "core"), InstanceGroup(6, "TASK", "c1.large", "ON_DEMAND", "task") ] cluster_create_request = { "api_request_id": "test_emr_create", "sub_type": "nonkerb", "role": "testing", "account": "example_account", "name": f"scheduled-testing-{int(time.time())}", "core_instance_count": "1", "task_instance_count": "3", "task_ebs_vol_size": "180", "custom_ami_id": "ami-075ac68c1cf8ba1c8" }
from boto.emr.connection import EmrConnection
from boto.emr.instance_group import InstanceGroup

# Description:
# The InstanceGroup object can be useful for customizing
# the nodes of an EMR (Elastic MapReduce) job.

# Build up our instance groups.
namenode_instance_group = InstanceGroup(num_instances=1,
                                        role="MASTER",
                                        type="c1.xlarge",
                                        market="ON_DEMAND",
                                        name="MASTER_GROUP")
core_nodes = InstanceGroup(num_instances=20,
                           role="CORE",
                           type="c1.xlarge",
                           market="SPOT",
                           name="CORE_GROUP",
                           bidprice="0.25")  # SPOT groups require a bid; value illustrative
task_nodes = InstanceGroup(num_instances=10,
                           role="TASK",
                           type="c1.xlarge",
                           market="ON_DEMAND",
                           name="INITIAL_TASK_GROUP")
instance_groups = [namenode_instance_group, core_nodes, task_nodes]

# run the job
conn = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")
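# A hedged continuation of the snippet above: the "run the job" comment
# suggests a run_jobflow() call follows. The job name and log bucket are
# illustrative placeholders, not values from the original.
job_id = conn.run_jobflow(name="InstanceGroup example",
                          log_uri="s3://<log-bucket>/logs",
                          instance_groups=instance_groups)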
import datetime
import os

import boto
from boto.emr.instance_group import InstanceGroup
from boto.emr.step import InstallPigStep, PigStep

conn = boto.connect_emr()

instance_groups = [
    InstanceGroup(1, 'MASTER', 'm1.small', 'SPOT', 'master@0.10', '0.10'),
    InstanceGroup(2, 'CORE', 'm1.small', 'SPOT', 'core@0.10', '0.10'),
]

pig_file = 's3://elasticmapreduce/samples/pig-apache/do-reports2.pig'
INPUT = 's3://elasticmapreduce/samples/pig-apache/input/'
OUTPUT = ('s3://org.unencrypted.emr.output/apache_sample/%s' %
          datetime.datetime.utcnow().strftime("%s"))

print """\
Running pig job with settings:
  SCRIPT={script}
  INPUT={input}
  OUTPUT={output}
""".format(script=pig_file, input=INPUT, output=OUTPUT)

pig_args = ['-p', 'INPUT=%s' % INPUT,
            '-p', 'OUTPUT=%s' % OUTPUT]
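# A hedged sketch of the run that presumably follows: InstallPigStep and
# PigStep are imported above but unused in the fragment. The job name is
# illustrative.
steps = [InstallPigStep(),
         PigStep('Run do-reports2.pig', pig_file=pig_file, pig_args=pig_args)]
job_id = conn.run_jobflow(name='Pig apache-logs sample',
                          steps=steps,
                          instance_groups=instance_groups,
                          keep_alive=False)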
    usage()

for p, v in params.iteritems():
    print "param:" + repr(p) + " value:" + repr(v)

conn = boto.connect_emr(params['aws_key'], params['secret'])

bootstrap_step1 = BootstrapAction("install_cc",
                                  "s3://commoncrawl-public/config64.sh",
                                  [params['aws_key'], params['secret']])
bootstrap_step2 = BootstrapAction(
    "configure_hadoop",
    "s3://elasticmapreduce/bootstrap-actions/configure-hadoop",
    [
        "-m", "mapred.tasktracker.map.tasks.maximum=8",
        "-m", "mapred.child.java.opts=-XX:ErrorFile=/tmp/hs_err_${mapred.tip.id}.log -Xmx700m -XX:+UseParNewGC -XX:ParallelGCThreads=8 -XX:NewSize=100m -XX:+UseConcMarkSweepGC -XX:+UseTLAB -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:CMSIncrementalDutyCycle=10"
    ])
bootstrap_step3 = BootstrapAction(
    "configure_jobtrackerheap",
    "s3://elasticmapreduce/bootstrap-actions/configure-daemons",
    ["--jobtracker-heap-size=12096"])

namenode_instance_group = InstanceGroup(1, "MASTER", "c1.xlarge",
                                        "ON_DEMAND", "MASTER_GROUP")
core_instance_group = InstanceGroup(params['num_core'], "CORE", "c1.xlarge",
                                    "ON_DEMAND", "CORE_GROUP")

instance_groups = []
if params['num_spot'] <= 0:
    instance_groups = [namenode_instance_group, core_instance_group]
else:
    if not params['spot_bid_price']:
        print '\nERROR: You must specify a spot bid price to use spot instances!'
        usage()
    spot_instance_group = InstanceGroup(params['num_spot'], "TASK",
                                        "c1.xlarge", "SPOT",
                                        "INITIAL_TASK_GROUP",
                                        params['spot_bid_price'])
    instance_groups = [namenode_instance_group, core_instance_group,
                       spot_instance_group]
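# A hedged continuation: the bootstrap actions and instance groups built
# above would typically feed a run_jobflow() call like this one. The job
# name and log bucket are illustrative placeholders.
job_id = conn.run_jobflow(name='commoncrawl job',
                          log_uri='s3://<log-bucket>/logs',
                          bootstrap_actions=[bootstrap_step1, bootstrap_step2,
                                             bootstrap_step3],
                          instance_groups=instance_groups)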
from moto import mock_emr_deprecated
from tests.helpers import requires_boto_gte

run_jobflow_args = dict(
    job_flow_role="EMR_EC2_DefaultRole",
    keep_alive=True,
    log_uri="s3://some_bucket/jobflow_logs",
    master_instance_type="c1.medium",
    name="My jobflow",
    num_instances=2,
    service_role="EMR_DefaultRole",
    slave_instance_type="c1.medium",
)

input_instance_groups = [
    InstanceGroup(1, "MASTER", "c1.medium", "ON_DEMAND", "master"),
    InstanceGroup(3, "CORE", "c1.medium", "ON_DEMAND", "core"),
    InstanceGroup(6, "TASK", "c1.large", "SPOT", "task-1", "0.07"),
    InstanceGroup(10, "TASK", "c1.xlarge", "SPOT", "task-2", "0.05"),
]


# Has boto3 equivalent
@mock_emr_deprecated
def test_describe_cluster():
    conn = boto.connect_emr()
    args = run_jobflow_args.copy()
    args.update(
        dict(
            api_params={
                "Applications.member.1.Name": "Spark",
def create_emr_cluster(cr):
    """
    @PARAM: Cluster configuration reader object

    Creates an EMR cluster given a set of configuration parameters.

    Return: EMR cluster ID
    """
    #region = cr.get_config("aws_region")
    #conn = boto.emr.connect_to_region(region)
    conn = EmrConnection(
        cr.get_config("aws_access_key"), cr.get_config("aws_secret_key"),
        region=RegionInfo(name=cr.get_config("aws_region"),
                          endpoint=cr.get_config("aws_region") +
                          ".elasticmapreduce.amazonaws.com"))

    # Create list of instance groups: master, core, and task.
    instance_groups = []
    instance_groups.append(
        InstanceGroup(num_instances=cr.get_config("emr_master_node_count"),
                      role="MASTER",
                      type=cr.get_config("emr_master_node_type"),
                      market=cr.get_config("emr_market_type"),
                      name="Master Node"))
    instance_groups.append(
        InstanceGroup(num_instances=cr.get_config("emr_core_node_count"),
                      role="CORE",
                      type=cr.get_config("emr_core_node_type"),
                      market=cr.get_config("emr_market_type"),
                      name="Core Node"))

    # Only create task nodes if specifically asked for.
    if cr.get_config("emr_task_node_count") > 0:
        instance_groups.append(
            InstanceGroup(num_instances=cr.get_config("emr_task_node_count"),
                          role="TASK",
                          type=cr.get_config("emr_task_node_type"),
                          market=cr.get_config("emr_market_type"),
                          name="Task Node"))

    print "Creating EMR Cluster with instance groups: {0}".format(
        instance_groups)

    # Use these params to add overrides; these will go away in Boto3.
    api_params = {
        "Instances.Ec2SubnetId": cr.get_config("aws_subnet_id"),
        "ReleaseLabel": cr.get_config("emr_version")
    }

    # Add step to load data.
    step_args = [
        "s3-dist-cp", "--s3Endpoint=s3-us-west-1.amazonaws.com",
        "--src=s3://alpine-qa/automation/automation_test_data/",
        "--dest=hdfs:///automation_test_data", "--srcPattern=.*[a-zA-Z,]+"
    ]
    step = JarStep(name="s3distcp for data loading",
                   jar="command-runner.jar",
                   step_args=step_args,
                   action_on_failure="CONTINUE")

    cluster_id = conn.run_jobflow(
        cr.get_config("emr_cluster_name"),
        instance_groups=instance_groups,
        action_on_failure="TERMINATE_JOB_FLOW",
        keep_alive=True,
        enable_debugging=True,
        log_uri=cr.get_config("emr_log_uri"),
        #hadoop_version = "Amazon 2.7.2",
        #ReleaseLabel = "emr-5.0.0",
        #ami_version = "5.0.0",
        steps=[step],
        bootstrap_actions=[],
        ec2_keyname=cr.get_config("ec2_keyname"),
        visible_to_all_users=True,
        job_flow_role="EMR_EC2_DefaultRole",
        service_role="EMR_DefaultRole",
        api_params=api_params)
    print "EMR Cluster created, cluster id: {0}".format(cluster_id)

    # Poll until the cluster reaches a ready or terminal state.
    state = conn.describe_cluster(cluster_id).status.state
    while state not in (u'COMPLETED', u'SHUTTING_DOWN', u'FAILED', u'WAITING'):
        # Sleep before rechecking the status.
        time.sleep(5)
        state = conn.describe_cluster(cluster_id).status.state
        print "State is: {0}, sleeping 5s...".format(state)

    if state in (u'SHUTTING_DOWN', u'FAILED'):
        return "ERROR"

    # If the state is WAITING, the cluster is ready for the next steps.
    if state == u'WAITING':
        # Find the master node DNS name of the EMR cluster.
        master_dns = conn.describe_cluster(cluster_id).masterpublicdnsname
        print "DNS Name: {0}".format(master_dns)
        return cluster_id
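# A hedged usage sketch for create_emr_cluster(): ConfigReader is a
# hypothetical stand-in for the configuration reader object the function
# expects; only its get_config(key) interface is assumed.
class ConfigReader(object):
    def __init__(self, conf):
        self.conf = conf

    def get_config(self, key):
        return self.conf[key]

cr = ConfigReader({
    "aws_access_key": "<access-key>",
    "aws_secret_key": "<secret-key>",
    "aws_region": "us-west-1",
    # ... remaining aws_* and emr_* keys used by the function above ...
})
cluster_id = create_emr_cluster(cr)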