Code Example #1
def test_bootstrap_actions():
    bootstrap_actions = [
        BootstrapAction(
            name="bs1",
            path="path/to/script",
            bootstrap_action_args=["arg1", "arg2&arg3"],
        ),
        BootstrapAction(name="bs2",
                        path="path/to/anotherscript",
                        bootstrap_action_args=[]),
    ]

    conn = boto.connect_emr()
    cluster_id = conn.run_jobflow(bootstrap_actions=bootstrap_actions,
                                  **run_jobflow_args)

    jf = conn.describe_jobflow(cluster_id)
    for x, y in zip(jf.bootstrapactions, bootstrap_actions):
        x.name.should.equal(y.name)
        x.path.should.equal(y.path)
        list(o.value for o in x.args).should.equal(y.args())

    resp = conn.list_bootstrap_actions(cluster_id)
    for i, y in enumerate(bootstrap_actions):
        x = resp.actions[i]
        x.name.should.equal(y.name)
        x.scriptpath.should.equal(y.path)
        list(arg.value for arg in x.args).should.equal(y.args())
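
This test targets moto's mocked EMR backend and uses the sure assertion style; it also relies on a run_jobflow_args dict defined elsewhere in the test module. A minimal sketch of that assumed scaffolding (argument values are illustrative, not taken from the original module):

# Assumed test scaffolding for the example above; values are placeholders.
import boto
from boto.emr.bootstrap_action import BootstrapAction
from moto import mock_emr
import sure  # noqa -- enables the .should.equal assertion style

run_jobflow_args = dict(
    name="cluster",
    log_uri="s3://some-bucket/jobflow-logs",
    keep_alive=True,
    master_instance_type="c1.medium",
    slave_instance_type="c1.medium",
    num_instances=2,
    job_flow_role="EMR_EC2_DefaultRole",
    service_role="EMR_DefaultRole",
)

# The test function itself would be decorated with @mock_emr so that
# boto.connect_emr() talks to the in-memory mock instead of real AWS.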
Code Example #2
def get_bootstrap_actions(self):
    '''Get the list of bootstrap actions from the job property.'''
    actions = []
    for bootstrap_action in self.prop.emr.bootstrap_actions:
        # Each entry must provide at least a name and a script path.
        assert len(bootstrap_action) >= 2, \
            'Wrong bootstrap action definition: ' + str(bootstrap_action)
        actions.append(BootstrapAction(bootstrap_action[0],
                                       bootstrap_action[1],
                                       bootstrap_action[2:]))
    return actions
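
For this getter to work, self.prop.emr.bootstrap_actions presumably holds sequences of the form [name, path, arg1, ...]; anything shorter than two items trips the assert. A sketch of that assumed configuration (names and values are illustrative):

# Hypothetical property contents consumed by get_bootstrap_actions() above.
emr_bootstrap_actions = [
    ['install-pig', 's3://<my-bucket>/install-pig.sh', '0.12.0'],
    ['configure-hadoop',
     's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
     '-m', 'mapred.tasktracker.map.tasks.maximum=8'],
]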
Code Example #3
def run_emr(profile, input_path, output_path, errors_path, log_path,
            ec2_keyname):
    c = boto.connect_s3(profile_name=profile)
    jar_bucket = c.get_bucket(input_path.split("/")[0])
    r = get_valid_region(jar_bucket.get_location())

    bootstrap_actions = [
        BootstrapAction("Install Spark",
                        "s3://support.elasticmapreduce/spark/install-spark",
                        ["-x"])
    ]

    args = [
        "/home/hadoop/spark/bin/spark-submit",
        "--deploy-mode",
        "cluster",
        "--master",
        "yarn-cluster",
        "--class",
        "com.snowplowanalytics.schemaguru.sparkjob.SchemaDeriveJob",
        "s3://snowplow-hosted-assets/schema-guru/spark/" + JAR_FILE,
        "--ndjson",  # Assuming your source files contain many JSONs each, one per line
        "--errors-path",
        "s3n://" + errors_path,  # trailing slash is required
        "--output",
        "s3n://" + output_path,  # ...here too
        "s3n://" + input_path,  # ...here too
    ]
    steps = [
        InstallHiveStep(),
        ScriptRunnerStep("Run SchemaDeriveJob", step_args=args)
    ]

    conn = boto.emr.connect_to_region(r, profile_name=profile)
    job_id = conn.run_jobflow(name="Schema Derive Spark",
                              log_uri="s3://" + log_path,
                              ec2_keyname=ec2_keyname,
                              master_instance_type="m3.xlarge",
                              slave_instance_type="m3.xlarge",
                              num_instances=3,
                              enable_debugging=True,
                              ami_version="3.8",
                              steps=steps,
                              bootstrap_actions=bootstrap_actions,
                              job_flow_role="EMR_EC2_DefaultRole",
                              service_role="EMR_DefaultRole")
    print("Started jobflow " + job_id)
Code Example #4
def launch_cluster(script_name,
                   keep_alive=False,
                   instance_types=None,
                   subnet_id=None):
    '''launch new cluster'''
    if instance_types is None:
        instance_type = 'm2.4xlarge'
        instance_count = 3
    else:
        # instance_types has the form '<type>[:<count>]', e.g. 'm3.xlarge:5'.
        match = re.match(r'^([^:]+)(:\d+)?$', instance_types)
        if not match:
            raise ValueError('invalid instance types: %s' % instance_types)
        instance_type, count_suffix = match.groups()
        # The ':<count>' part is optional; fall back to the default of 3 core nodes.
        instance_count = int(count_suffix[1:]) if count_suffix else 3
    instance_groups = [
        InstanceGroup(1, 'MASTER', instance_type, 'ON_DEMAND', 'MASTER_GROUP'),
        InstanceGroup(instance_count, 'CORE', instance_type, 'ON_DEMAND',
                      'CORE_GROUP')
    ]
    bootstrap_actions = [
        BootstrapAction('install-pig', install_pig_script, [pig_version]),
    ]
    api_params = {}
    if subnet_id is not None:
        api_params['Instances.Ec2SubnetId'] = subnet_id
    name = name_prefix + '-' + script_name
    jobid = emr_conn.run_jobflow(name=name,
                                 keep_alive=keep_alive,
                                 ami_version=ami_version,
                                 visible_to_all_users=True,
                                 ec2_keyname=ec2_keyname,
                                 service_role='EMR_DefaultRole',
                                 job_flow_role='EMR_EC2_DefaultRole',
                                 log_uri=log_uri,
                                 action_on_failure='CONTINUE',
                                 instance_groups=instance_groups,
                                 bootstrap_actions=bootstrap_actions,
                                 api_params=api_params)
    print('launched %s (%s)' % (name, jobid))
    return jobid
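
instance_types is a single '<type>[:<count>]' string parsed by the regex above; hypothetical calls (assuming the module-level settings such as emr_conn, name_prefix, install_pig_script, pig_version, ami_version, ec2_keyname and log_uri are defined elsewhere) could look like:

# Hypothetical usage; relies on module-level configuration not shown in the snippet.
launch_cluster('wordcount.pig')                                # defaults: m2.4xlarge, 3 core nodes
launch_cluster('wordcount.pig', instance_types='m3.xlarge:5')  # 1 master + 5 core m3.xlarge nodes
launch_cluster('wordcount.pig', keep_alive=True,
               subnet_id='subnet-0123abcd')                    # launch into a specific VPC subnet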
Code Example #5
def run_emr(profile, bucket, ec2_keyname, vpc_subnet_id):
    c = boto.connect_s3(profile_name=profile)
    b = c.get_bucket(bucket)
    r = get_valid_region(b.get_location())

    bootstrap_actions = [
        BootstrapAction("Install Spark",
                        "s3://support.elasticmapreduce/spark/install-spark",
                        ["-x"])
    ]

    args = [
        "/home/hadoop/spark/bin/spark-submit", "--deploy-mode", "cluster",
        "--master", "yarn-cluster", "--class",
        "com.snowplowanalytics.spark.WordCountJob",
        "s3://" + bucket + "/jar/" + JAR_FILE,
        "s3n://" + bucket + "/" + HELLO_TXT, "s3n://" + bucket + "/out"
    ]
    steps = [
        InstallHiveStep(),
        ScriptRunnerStep("Run WordCountJob", step_args=args)
    ]

    conn = boto.emr.connect_to_region(r, profile_name=profile)
    job_id = conn.run_jobflow(name="Spark Example Project",
                              log_uri="s3://" + bucket + "logs",
                              ec2_keyname=ec2_keyname,
                              master_instance_type="m3.xlarge",
                              slave_instance_type="m3.xlarge",
                              num_instances=3,
                              enable_debugging=True,
                              ami_version="3.6",
                              steps=steps,
                              bootstrap_actions=bootstrap_actions,
                              job_flow_role="EMR_EC2_DefaultRole",
                              service_role="EMR_DefaultRole")
    print "Started jobflow " + job_id
Code Example #6
File: emr_traffic.py  Project: Verbify/verbify
 def _bootstrap_actions(cls):
     name = cls.BOOTSTRAP_NAME
     path = cls.BOOTSTRAP_SCRIPT
     bootstrap_action_args = [g.TRAFFIC_SRC_DIR, g.tracking_secret]
     bootstrap = BootstrapAction(name, path, bootstrap_action_args)
     return [bootstrap]
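
The method above reads its name and script path from class attributes and its arguments from the application's global config object g, so the surrounding class presumably looks roughly like the sketch below (class name and attribute values are placeholders, not taken from the project):

# Hypothetical surrounding class; only what the method relies on is shown.
class TrafficJob(object):
    BOOTSTRAP_NAME = 'traffic bootstrap'
    BOOTSTRAP_SCRIPT = 's3://<my-bucket>/traffic_bootstrap.sh'

    @classmethod
    def _bootstrap_actions(cls):
        ...  # body as shown above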
Code Example #7
    if o == '--test':
        params['test_mode'] = True

required = ['aws_key', 'secret', 'keypair']

for pname in required:
    if not params.get(pname, None):
        print '\nERROR: %s is required' % pname
        usage()

for p, v in params.iteritems():
    print "param:" + repr(p) + " value:" + repr(v)

conn = boto.connect_emr(params['aws_key'], params['secret'])

# bootstrap actions: install the CommonCrawl setup script, tune Hadoop's
# mapper slots and child-JVM GC flags, and enlarge the jobtracker heap
bootstrap_step1 = BootstrapAction("install_cc",
                                  "s3://commoncrawl-public/config64.sh",
                                  [params['aws_key'], params['secret']])
bootstrap_step2 = BootstrapAction(
    "configure_hadoop",
    "s3://elasticmapreduce/bootstrap-actions/configure-hadoop",
    ["-m", "mapred.tasktracker.map.tasks.maximum=8",
     "-m", "mapred.child.java.opts=-XX:ErrorFile=/tmp/hs_err_${mapred.tip.id}.log -Xmx700m -XX:+UseParNewGC -XX:ParallelGCThreads=8 -XX:NewSize=100m -XX:+UseConcMarkSweepGC -XX:+UseTLAB -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:CMSIncrementalDutyCycle=10"])
bootstrap_step3 = BootstrapAction(
    "configure_jobtrackerheap",
    "s3://elasticmapreduce/bootstrap-actions/configure-daemons",
    ["--jobtracker-heap-size=12096"])

namenode_instance_group = InstanceGroup(1, "MASTER", "c1.xlarge",
                                        "ON_DEMAND", "MASTER_GROUP")
core_instance_group = InstanceGroup(params['num_core'], "CORE", "c1.xlarge",
                                    "ON_DEMAND", "CORE_GROUP")

instance_groups = []
if params['num_spot'] <= 0:
    instance_groups = [namenode_instance_group, core_instance_group]
else:
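    # (The original snippet breaks off here.) One plausible continuation, with
    # params['spot_bid'] as a hypothetical key holding the spot bid price,
    # adds a SPOT task group alongside the on-demand groups:
    spot_task_instance_group = InstanceGroup(params['num_spot'], "TASK",
                                             "c1.xlarge", "SPOT",
                                             "SPOT_TASK_GROUP",
                                             bidprice=params['spot_bid'])
    instance_groups = [namenode_instance_group, core_instance_group,
                       spot_task_instance_group]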
Code Example #8
from boto.emr.bootstrap_action import BootstrapAction
from boto.emr.connection import EmrConnection

# Description:
# BootstrapAction is an object representing a bootstrap action in Elastic Map
# Reduce (EMR), a script that gets run before the EMR job executes.

# initialize a bootstrap action
bootstrapSetup = BootstrapAction("Bootstrap Name",
                                 "s3://<my-bucket>/<my-bootstrap-action>",
                                 ["arg1=hello", "arg2=world"])

# initialize emr connection
emr_job = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")

# run emr job flow with the defined bootstrap action; run_jobflow also
# requires a job flow name and returns the id of the new job flow
job_id = emr_job.run_jobflow(name="<my-jobflow-name>",
                             bootstrap_actions=[bootstrapSetup])
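
run_jobflow hands back the id of the new job flow; a typical follow-up (not part of the original example) is to poll describe_jobflow until the flow reaches a terminal state:

import time

# wait for the job flow to finish, checking its state every 30 seconds
while True:
    state = emr_job.describe_jobflow(job_id).state
    print("job flow %s is %s" % (job_id, state))
    if state in ("COMPLETED", "FAILED", "TERMINATED"):
        break
    time.sleep(30)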
Code Example #9
File: emr_traffic.py  Project: PhearTheCeal/reddit
 def __init__(self):
     # __init__ of a BootstrapAction subclass: it points at EMR's stock
     # "memory intensive" configuration script and passes no extra arguments.
     name = 'memory intensive'
     path = 's3://elasticmapreduce/bootstrap-actions/' \
            'configurations/latest/memory-intensive'
     args = []
     BootstrapAction.__init__(self, name, path, args)