def start_hadoop_cluster(nodenum):
    """Launch a termination-protected EMR jobflow with *nodenum* instances.

    Bootstraps Hadoop with one map slot per tasktracker and a 10 GB child
    JVM heap, then queues a startup step that copies the pipeline PEAR
    file from S3 into HDFS.

    Parameters
    ----------
    nodenum : int
        Total number of instances (master + slaves) for the cluster.

    Returns
    -------
    str
        The EMR jobflow id on success, or the literal string "none" on
        any failure (sentinel kept for backward compatibility).
    """
    try:
        # Limit each tasktracker to a single map slot and give child JVMs
        # a 10 GB heap, applied via the stock configure-hadoop bootstrap.
        hadoop_params = ['-m', 'mapred.tasktracker.map.tasks.maximum=1',
                         '-m', 'mapred.child.java.opts=-Xmx10g']
        configure_hadoop_action = BootstrapAction(
            'configure_hadoop',
            's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
            hadoop_params)

        emr_connection = EmrConnection()
        bucket_name = "udk-bucket"

        # Single startup step: copy the pipeline PEAR from S3 into HDFS.
        copy_jar_step = JarStep(
            name='copy-jar',
            jar='s3n://' + bucket_name + '/copy-to-hdfs.jar',
            step_args=['s3n://' + bucket_name + '/pipeline.pear',
                       '/mnt/pipeline.pear'])
        steps = [copy_jar_step]

        jobflow_id = emr_connection.run_jobflow(
            name='udk',
            log_uri='s3://udk-bucket/jobflow_logs',
            master_instance_type='m2.xlarge',
            slave_instance_type='m2.xlarge',
            num_instances=nodenum,
            keep_alive=True,
            enable_debugging=False,
            bootstrap_actions=[configure_hadoop_action],
            hadoop_version='1.0.3',
            steps=steps)

        # Protect the long-running cluster from accidental termination.
        emr_connection.set_termination_protection(jobflow_id, True)
        return jobflow_id
    except Exception:
        # NOTE(review): failures are deliberately signalled by the sentinel
        # string "none" rather than an exception; callers rely on this.
        return "none"
def terminate(cluster_id):
    """Disable termination protection on *cluster_id*, then terminate it.

    Parameters
    ----------
    cluster_id : str
        The EMR jobflow id to shut down.

    Returns
    -------
    bool
        True on success; False on any failure (the exception is printed,
        not re-raised — callers rely on the boolean contract).
    """
    try:
        emr_connection = EmrConnection()
        # Protection must be lifted first, or terminate_jobflow is rejected.
        emr_connection.set_termination_protection(cluster_id, False)
        emr_connection.terminate_jobflow(cluster_id)
        return True
    except Exception as e:
        print(e)
        return False
# NOTE(review): this span begins mid-expression — the leading ']' closes an
# instance_groups list whose opening line is outside this view, so the code
# below is left byte-identical.  It submits a word-count jobflow on the
# given instance groups with the default EMR IAM roles, enables termination
# protection, prints the jobflow id, then polls describe_jobflow every 30 s
# until the flow reports COMPLETED.  Python 2 syntax (print statement).
#InstanceGroup(1, 'TASK', 'm1.small', 'ON_DEMAND', 'Task'), InstanceGroup(1, 'CORE', 'm1.small', 'ON_DEMAND', 'Core') ] jf_id = emr.run_jobflow(log_uri='s3://%s/logs' %(bucket_name), name='wc jobflow', steps=[wc_step], #num_instances=NUM_INSTANCES, #master_instance_type='m1.small', #slave_instance_type='m1.small', instance_groups=instance_groups, job_flow_role = 'EMR_EC2_DefaultRole', #bootstrap_actions=[bootstrap_step], service_role = 'EMR_DefaultRole', action_on_failure='CONTINUE', visible_to_all_users="True", ami_version = '2.4', hadoop_version='1.0.3', keep_alive=True) emr.set_termination_protection(jf_id, True) print jf_id while True: jf = emr.describe_jobflow(jf_id) #print "[%s] %s" % (datetime.now().strftime("%Y-%m-%d %T"), jf.state) print datetime.now(), jf.state if jf.state == 'COMPLETED': break sleep(30)
class EmrLauncher(object):
    """Launches a termination-protected EMR cluster with a tuned Hadoop
    config, Ganglia monitoring, Impala, Hive and Pig installed."""

    def __init__(self):
        """Store connection settings and open an EmrConnection.

        NOTE(review): the original wrapped this body in a ``try:`` with no
        visible ``except`` clause (a syntax error as written); the dangling
        ``try`` has been removed so construction errors propagate normally.
        """
        self.zone_name = "ap-southeast-1"
        self.access_key = "xxxxxx"
        self.private_key = "xxxxxxx"
        self.ec2_keyname = "xxxxxxxx"
        self.base_bucket = "s3://emr-bucket/"
        self.bootstrap_script = "custom-bootstrap.sh"
        self.log_dir = "Logs"
        self.emr_status_wait = 20  # seconds between jobflow state polls
        self.cluster_name = "MyFirstEmrCluster"

        # Establishing EmrConnection against the region's EMR endpoint.
        self.conn = EmrConnection(
            self.access_key, self.private_key,
            region=RegionInfo(
                name=self.zone_name,
                endpoint=self.zone_name + '.elasticmapreduce.amazonaws.com'))

        self.log_bucket_name = self.base_bucket + self.log_dir
        self.bootstrap_script_name = self.base_bucket + self.bootstrap_script

    def launch_emr_cluster(self, master_type, slave_type, num_instance,
                           ami_version):
        """Launch the cluster and block until it reaches a steady state.

        Parameters
        ----------
        master_type, slave_type : str
            EC2 instance types for master and slave nodes.
        num_instance : int
            Total number of instances.
        ami_version : str
            EMR AMI version to launch.

        Returns
        -------
        str or None
            "SUCCESS" when the cluster reaches WAITING, "ERROR" when it
            shuts down or fails, "FAILED" on an exception.  A COMPLETED
            state falls through and returns None (original behaviour).
        """
        try:
            # Custom bootstrap step (user-supplied script, no arguments).
            bootstrap_step = BootstrapAction(
                "CustomBootStrap", self.bootstrap_script_name, None)

            # Raise the HDFS block size to 256 MB.  dfs.block.size is
            # specified in BYTES, so 256 MB = 268435456; the original
            # passed the bare value 256 (i.e. 256 bytes), contradicting
            # its own comment.  A dangling trailing '-h' flag with no
            # value was also dropped from the parameter list.
            block_size_conf = 'dfs.block.size=268435456'
            hadoop_config_params = ['-h', block_size_conf]
            hadoop_config_bootstrapper = BootstrapAction(
                'hadoop-config',
                's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
                hadoop_config_params)

            # Bootstrapping Ganglia for cluster monitoring.
            hadoop_monitor_bootstrapper = BootstrapAction(
                'ganglia-config',
                's3://elasticmapreduce/bootstrap-actions/install-ganglia',
                '')

            # Bootstrapping Impala.
            impala_install_params = ['--install-impala', '--base-path',
                                     's3://elasticmapreduce',
                                     '--impala-version', 'latest']
            bootstrap_impala_install_step = BootstrapAction(
                "ImpalaInstall",
                "s3://elasticmapreduce/libs/impala/setup-impala",
                impala_install_params)

            # Hive and Pig installation steps.
            hive_install_step = InstallHiveStep()
            pig_install_step = InstallPigStep()

            # Launching the cluster.
            jobid = self.conn.run_jobflow(
                self.cluster_name, self.log_bucket_name,
                bootstrap_actions=[hadoop_config_bootstrapper,
                                   hadoop_monitor_bootstrapper,
                                   bootstrap_step,
                                   bootstrap_impala_install_step],
                ec2_keyname=self.ec2_keyname,
                steps=[hive_install_step, pig_install_step],
                keep_alive=True,
                action_on_failure='CANCEL_AND_WAIT',
                master_instance_type=master_type,
                slave_instance_type=slave_type,
                num_instances=num_instance,
                ami_version=ami_version)

            # Enabling termination protection on the new cluster.
            self.conn.set_termination_protection(jobid, True)

            # Poll until the jobflow leaves its transient startup states.
            state = self.conn.describe_jobflow(jobid).state
            while state not in (u'COMPLETED', u'SHUTTING_DOWN',
                                u'FAILED', u'WAITING'):
                time.sleep(int(self.emr_status_wait))
                state = self.conn.describe_jobflow(jobid).state

            if state in (u'SHUTTING_DOWN', u'FAILED'):
                logging.error("Launching EMR cluster failed")
                return "ERROR"

            # WAITING means the cluster is up and idle, ready for steps.
            if state == u'WAITING':
                master_dns = self.conn.describe_jobflow(
                    jobid).masterpublicdnsname
                logging.info("Launched EMR Cluster Successfully")
                logging.info("Master node DNS of EMR " + master_dns)
                return "SUCCESS"
            # COMPLETED falls through and returns None, as before.
        except Exception:
            # Narrowed from a bare ``except:``; log with traceback.
            logging.exception("Launching EMR cluster failed")
            return "FAILED"

    def main(self):
        """Launch a 3-node m3.xlarge cluster on AMI 2.4.8 and log the outcome."""
        try:
            master_type = 'm3.xlarge'
            slave_type = 'm3.xlarge'
            num_instance = 3
            ami_version = '2.4.8'

            emr_status = self.launch_emr_cluster(
                master_type, slave_type, num_instance, ami_version)
            if emr_status == 'SUCCESS':
                logging.info("Emr cluster launched successfully")
            else:
                logging.error("Emr launching failed")
        except Exception:
            logging.exception("Emr launching failed")