コード例 #1
0
ファイル: run_cluster.py プロジェクト: valeter/nlp-site
def start_hadoop_cluster(nodenum):
    """Start a keep-alive EMR job flow named 'udk' and return its id.

    The job flow runs the ``configure-hadoop`` bootstrap action and a single
    ``copy-jar`` step that is given the S3 location of ``pipeline.pear`` and
    the target path ``/mnt/pipeline.pear``.  Termination protection is
    switched on once the job flow has been created.

    Args:
        nodenum: Number of instances to request for the job flow.

    Returns:
        The job flow id on success, or the string ``"none"`` when the job
        flow could not be created (the failure is logged with a traceback).
    """
    # Local import: the file's top-level import block is not visible here.
    import logging

    bucket_name = "udk-bucket"

    try:
        # Each '-m' passes one key=value setting to configure-hadoop.
        hadoop_params = ['-m', 'mapred.tasktracker.map.tasks.maximum=1',
                         '-m', 'mapred.child.java.opts=-Xmx10g']
        configure_hadoop_action = BootstrapAction(
            'configure_hadoop',
            's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
            hadoop_params)

        emr_connection = EmrConnection()
        copy_jar_step = JarStep(
            name='copy-jar',
            jar='s3n://' + bucket_name + '/copy-to-hdfs.jar',
            step_args=['s3n://' + bucket_name + '/pipeline.pear',
                       '/mnt/pipeline.pear'])

        jobflow_id = emr_connection.run_jobflow(
            name='udk',
            log_uri='s3://' + bucket_name + '/jobflow_logs',
            master_instance_type='m2.xlarge',
            slave_instance_type='m2.xlarge',
            num_instances=nodenum,
            keep_alive=True,
            enable_debugging=False,
            bootstrap_actions=[configure_hadoop_action],
            hadoop_version='1.0.3',
            steps=[copy_jar_step])
    except Exception:
        logging.exception("Could not start Hadoop cluster (nodenum=%s)",
                          nodenum)
        return "none"

    # The cluster already exists at this point.  If enabling protection
    # fails we still return the id, otherwise the caller would lose track of
    # a running (and billed) cluster.
    try:
        emr_connection.set_termination_protection(jobflow_id, True)
    except Exception:
        logging.exception(
            "Job flow %s started but termination protection was not enabled",
            jobflow_id)

    return jobflow_id
コード例 #2
0
ファイル: terminate_cluster.py プロジェクト: valeter/nlp-site
def terminate(cluster_id):
    """Terminate the EMR job flow identified by ``cluster_id``.

    Termination protection is switched off before the terminate request is
    sent.

    Args:
        cluster_id: Id of the job flow to shut down.

    Returns:
        True when both calls succeeded; False when any exception was raised
        (the exception is printed).
    """
    try:
        emr_connection = EmrConnection()
        # Lift protection first so the terminate request is not refused.
        emr_connection.set_termination_protection(cluster_id, False)
        emr_connection.terminate_jobflow(cluster_id)
        return True
    except Exception as e:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(e)
        return False
コード例 #3
0
    #InstanceGroup(1, 'TASK', 'm1.small', 'ON_DEMAND', 'Task'),
    InstanceGroup(1, 'CORE', 'm1.small', 'ON_DEMAND', 'Core')
]

# Launch the word-count job flow.  The instance layout comes from
# `instance_groups`, which is used instead of the num_instances /
# master_instance_type / slave_instance_type arguments.
jf_id = emr.run_jobflow(log_uri='s3://%s/logs' % bucket_name,
                        name='wc jobflow',
                        steps=[wc_step],
                        instance_groups=instance_groups,
                        job_flow_role='EMR_EC2_DefaultRole',
                        service_role='EMR_DefaultRole',
                        action_on_failure='CONTINUE',
                        visible_to_all_users=True,
                        ami_version='2.4',
                        hadoop_version='1.0.3',
                        keep_alive=True)

emr.set_termination_protection(jf_id, True)

print(jf_id)

# States after which the job flow can no longer reach COMPLETED; polling
# past them would loop forever.
# NOTE(review): with keep_alive=True the job flow presumably idles in
# WAITING once its steps finish -- confirm whether polling should stop
# there as well.
_FINAL_STATES = ('COMPLETED', 'FAILED', 'TERMINATED')

# Poll every 30 seconds, printing a timestamped state line each time.
while True:
    jf = emr.describe_jobflow(jf_id)
    print("%s %s" % (datetime.now(), jf.state))
    if jf.state in _FINAL_STATES:
        break
    sleep(30)
コード例 #4
0
class EmrLauncher(object):
    """Launch an EMR cluster through boto and wait until it is usable.

    The cluster is created with four bootstrap actions (Hadoop
    configuration, Ganglia, a custom script, Impala) and two steps (Hive
    and Pig installation).
    """

    # Job flow states at which polling stops.
    _SETTLED_STATES = (u'COMPLETED', u'SHUTTING_DOWN', u'FAILED',
                       u'TERMINATED', u'WAITING')
    # Settled states that mean the cluster is not usable.
    _FAILURE_STATES = (u'SHUTTING_DOWN', u'FAILED', u'TERMINATED')

    def __init__(self):
        """Set the launcher configuration and open the EMR connection."""
        self.zone_name = "ap-southeast-1"
        # NOTE(review): placeholder values -- real credentials should come
        # from the environment or boto configuration, not from source code.
        self.access_key = "xxxxxx"
        self.private_key = "xxxxxxx"
        self.ec2_keyname = "xxxxxxxx"
        self.base_bucket = "s3://emr-bucket/"
        self.bootstrap_script = "custom-bootstrap.sh"
        self.log_dir = "Logs"
        # Seconds to sleep between two state polls.
        self.emr_status_wait = 20
        self.cluster_name = "MyFirstEmrCluster"

        self.log_bucket_name = self.base_bucket + self.log_dir
        self.bootstrap_script_name = self.base_bucket + self.bootstrap_script

        # Establishing EmrConnection
        self.conn = EmrConnection(
            self.access_key, self.private_key,
            region=RegionInfo(
                name=self.zone_name,
                endpoint=self.zone_name + '.elasticmapreduce.amazonaws.com'))

    def _wait_for_settled_state(self, jobid):
        """Poll the job flow until it reaches a settled state; return it."""
        state = self.conn.describe_jobflow(jobid).state
        while state not in self._SETTLED_STATES:
            # Sleep before re-checking the status.
            time.sleep(int(self.emr_status_wait))
            state = self.conn.describe_jobflow(jobid).state
        return state

    def launch_emr_cluster(self, master_type, slave_type, num_instance, ami_version):
        """Create the cluster and block until it settles.

        Args:
            master_type: Instance type of the master node.
            slave_type: Instance type of the slave nodes.
            num_instance: Total number of instances.
            ami_version: AMI version passed to ``run_jobflow``.

        Returns:
            "SUCCESS" when the cluster reaches WAITING, "ERROR" when it ends
            in SHUTTING_DOWN, FAILED or TERMINATED, "FAILED" when an
            exception was raised, and None when it settles in COMPLETED.
        """
        try:
            # Custom bootstrap step
            bootstrap_step = BootstrapAction("CustomBootStrap", self.bootstrap_script_name, None)

            # HDFS block size of 256 MB; dfs.block.size is given in bytes.
            block_size_conf = 'dfs.block.size=%d' % (256 * 1024 * 1024)
            # Every '-h' must be followed by exactly one key=value pair.
            hadoop_config_params = ['-h', block_size_conf]
            hadoop_config_bootstrapper = BootstrapAction(
                'hadoop-config',
                's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
                hadoop_config_params)

            # Bootstrapping Ganglia
            hadoop_monitor_bootstrapper = BootstrapAction(
                'ganglia-config',
                's3://elasticmapreduce/bootstrap-actions/install-ganglia', '')

            # Bootstrapping Impala
            impala_install_params = ['--install-impala', '--base-path', 's3://elasticmapreduce',
                                     '--impala-version', 'latest']
            bootstrap_impala_install_step = BootstrapAction(
                "ImpalaInstall",
                "s3://elasticmapreduce/libs/impala/setup-impala",
                impala_install_params)

            # Hive and Pig installation steps
            hive_install_step = InstallHiveStep()
            pig_install_step = InstallPigStep()

            # Launching the cluster
            jobid = self.conn.run_jobflow(
                self.cluster_name,
                self.log_bucket_name,
                bootstrap_actions=[hadoop_config_bootstrapper, hadoop_monitor_bootstrapper,
                                   bootstrap_step, bootstrap_impala_install_step],
                ec2_keyname=self.ec2_keyname,
                steps=[hive_install_step, pig_install_step],
                keep_alive=True,
                action_on_failure='CANCEL_AND_WAIT',
                master_instance_type=master_type,
                slave_instance_type=slave_type,
                num_instances=num_instance,
                ami_version=ami_version)

            # Enabling the termination protection
            self.conn.set_termination_protection(jobid, True)

            # Checking the state of the EMR cluster
            state = self._wait_for_settled_state(jobid)

            if state in self._FAILURE_STATES:
                logging.error("Launching EMR cluster failed")
                return "ERROR"

            if state == u'WAITING':
                # Finding the master node DNS of the EMR cluster
                master_dns = self.conn.describe_jobflow(jobid).masterpublicdnsname
                logging.info("Launched EMR Cluster Successfully")
                logging.info("Master node DNS of EMR %s", master_dns)
                return "SUCCESS"

            # COMPLETED: neither a success nor an error result is reported.
            return None
        except Exception:
            logging.exception("Launching EMR cluster failed")
            return "FAILED"

    def main(self):
        """Launch a three-node m3.xlarge cluster and log the outcome."""
        try:
            master_type = 'm3.xlarge'
            slave_type = 'm3.xlarge'
            num_instance = 3
            ami_version = '2.4.8'

            emr_status = self.launch_emr_cluster(master_type, slave_type, num_instance, ami_version)
            if emr_status == 'SUCCESS':
                logging.info("Emr cluster launched successfully")
            else:
                logging.error("Emr launching failed")
        except Exception:
            logging.exception("Emr launching failed")