def run(self):
    """Launch the Hive job on a new EMR cluster and block until it is done.

    Copies the input data, uploads the generated Hive script, starts a job
    flow with an install-Hive step followed by the script step, then waits
    for completion and logs the output location.
    """
    # Hive deletes/moves its input, so give it a copy of the data source.
    copy_s3_file(self.input_path, self.data_path)

    # Render the Hive script and push it to S3.
    self._generate_and_upload_hive_script()

    logger.info("Waiting {} seconds for S3 eventual consistency".format(
        self.s3_sync_wait_time))
    time.sleep(self.s3_sync_wait_time)

    # TODO more options like setting aws region
    conn = EmrConnection(self.aws_access_key_id,
                         self.aws_secret_access_key)

    hive_steps = [
        InstallHiveStep(self.hive_version),
        HiveStep(self.job_name, self.script_path),
    ]
    cluster_options = dict(
        action_on_failure='CANCEL_AND_WAIT',
        master_instance_type=self.master_instance_type,
        slave_instance_type=self.slave_instance_type,
        ami_version=self.ami_version,
        num_instances=self.num_instances,
    )

    jobid = conn.run_jobflow(self.job_name, self.log_path, **cluster_options)
    conn.add_jobflow_steps(jobid, hive_steps)

    self._wait_for_job_to_complete(conn, jobid)
    logger.info("Output file is in: {0}".format(self.output_path))
def create_emr(R):
    """Start a keep-alive EMR cluster of R nodes running the MC_Method step.

    One on-demand m4.large MASTER node is always requested; when R > 1 the
    remaining R - 1 nodes are requested as a CORE group.

    Returns:
        (cluster_id, conn) -- the id returned by run_jobflow and the
        EmrConnection that was used to create the cluster.
    """
    # NOTE(review): this switches off HTTPS certificate validation for boto;
    # confirm that this is still required.
    if not boto.config.has_section('Boto'):
        boto.config.add_section('Boto')
    boto.config.set('Boto', 'https_validate_certificates', 'False')

    step = StreamingStep(
        name='MC_Method example',
        cache_files=['s3n://bucket774/map.py#map.py'],
        mapper='map.py',
        input='s3://bucket774/input/',
        output='s3://bucket774/output/',
    )

    conn = EmrConnection(access_id, access_key)

    # (node count, role, display name) for every group to request.
    group_specs = [(1, "MASTER", "Master nodes")]
    if R > 1:
        group_specs.append((R - 1, "CORE", "Slave nodes"))

    instance_groups = [
        InstanceGroup(num_instances=count,
                      role=role,
                      type='m4.large',
                      market="ON_DEMAND",
                      name=label)
        for count, role, label in group_specs
    ]

    jobflow_options = dict(
        name='test MC_method run',
        instance_groups=instance_groups,
        enable_debugging=False,
        steps=[step],
        visible_to_all_users=True,
        keep_alive=True,
        job_flow_role="EMR_EC2_DefaultRole",
        service_role="EMR_DefaultRole",
        hadoop_version='2.4.0',
        log_uri='s3://bucket774/log',
    )
    cluster_id = conn.run_jobflow(**jobflow_options)
    return cluster_id, conn
def start_hadoop_cluster(nodenum):
    """Start a keep-alive Hadoop 1.0.3 job flow named 'udk' with nodenum nodes.

    The cluster is configured through the configure-hadoop bootstrap action
    (one map task per tracker, 10 GB child heap), runs a single step that
    copies pipeline.pear from the bucket to /mnt/pipeline.pear, and has
    termination protection switched on.

    Arguments:
        nodenum     int     number of m2.xlarge instances to request.

    Returns:
        The job flow id on success, or the string "none" when any step of the
        launch raised. The failure is logged instead of being dropped.
    """
    import logging

    bucket_name = "udk-bucket"
    try:
        hadoop_params = ['-m', 'mapred.tasktracker.map.tasks.maximum=1',
                         '-m', 'mapred.child.java.opts=-Xmx10g']
        configure_hadoop_action = BootstrapAction(
            'configure_hadoop',
            's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
            hadoop_params)

        emr_connection = EmrConnection()

        copy_jar_step = JarStep(
            name='copy-jar',
            jar='s3n://' + bucket_name + '/copy-to-hdfs.jar',
            step_args=['s3n://' + bucket_name + '/pipeline.pear',
                       '/mnt/pipeline.pear'])

        jobflow_id = emr_connection.run_jobflow(
            name='udk',
            log_uri='s3://' + bucket_name + '/jobflow_logs',
            master_instance_type='m2.xlarge',
            slave_instance_type='m2.xlarge',
            num_instances=nodenum,
            keep_alive=True,
            enable_debugging=False,
            bootstrap_actions=[configure_hadoop_action],
            hadoop_version='1.0.3',
            steps=[copy_jar_step])
        emr_connection.set_termination_protection(jobflow_id, True)
        return jobflow_id
    except Exception as e:
        # Callers rely on the "none" sentinel, so keep it, but leave a trace
        # of what went wrong.
        logging.getLogger(__name__).error(
            "Failed to start Hadoop cluster: %s", e)
        return "none"
def run(self):
    """Launch the Hive job on a new EMR cluster and block until it is done.

    Copies the input data, uploads the generated Hive script, starts a job
    flow (with the configured IAM instance profile and service role) that
    installs Hive and runs the script, then waits for completion and logs
    the output location.
    """
    # Hive deletes/moves its input, so give it a copy of the data source.
    copy_s3_file(self.input_path, self.data_path)

    # Render the Hive script and push it to S3.
    self._generate_and_upload_hive_script()

    logger.info("Waiting {} seconds for S3 eventual consistency".format(
        self.s3_sync_wait_time))
    time.sleep(self.s3_sync_wait_time)

    # TODO more options like setting aws region
    conn = EmrConnection(self.aws_access_key_id,
                         self.aws_secret_access_key)

    hive_steps = [
        InstallHiveStep(self.hive_version),
        HiveStep(self.job_name, self.script_path),
    ]
    cluster_options = dict(
        action_on_failure='CANCEL_AND_WAIT',
        master_instance_type=self.master_instance_type,
        slave_instance_type=self.slave_instance_type,
        ami_version=self.ami_version,
        num_instances=self.num_instances,
        job_flow_role=self.iam_instance_profile,
        service_role=self.iam_service_role,
    )

    cluster_id = conn.run_jobflow(self.job_name, self.log_path,
                                  **cluster_options)
    conn.add_jobflow_steps(cluster_id, hive_steps)
    logger.info("Job started on cluster {0}".format(cluster_id))

    self._wait_for_job_to_complete(conn, cluster_id)
    logger.info("Output file is in: {0}".format(self.output_path))
class EMR:
    """Small walkthrough of the boto EMR API.

    Typical order of calls: creating_a_connection(), creating_streaming_job(),
    creating_jobflows() and finally terminating_jobflows(job_id).
    """

    def creating_a_connection(self, aws_access_key_id='',
                              aws_secret_access_key=''):
        """Open the EMR connection and keep it on self.conn.

        Arguments:
            aws_access_key_id       string  AWS access key (defaults to the
                                            empty string, as before).
            aws_secret_access_key   string  AWS secret key (defaults to the
                                            empty string, as before).
        """
        from boto.emr.connection import EmrConnection
        # To target a specific region, boto.emr.connect_to_region('eu-west-1')
        # can be used instead.
        self.conn = EmrConnection(aws_access_key_id, aws_secret_access_key)

    def creating_streaming_job(self):
        """Build the streaming step and keep it on self.step."""
        from boto.emr.step import StreamingStep
        self.step = StreamingStep(
            name='my bigdata task',
            mapper='s3n://eth-src/raw_to_stations.py',
            reducer='s3n://eth-src/stations_to_features.py',
            input='s3n://eth-input/2007.csv',
            output='s3n://eth-middle/2007')

    def creating_jobflows(self):
        """Launch a two-node job flow that runs self.step.

        Requires creating_a_connection() and creating_streaming_job() to have
        been called first.

        Returns:
            The id of the new job flow, which is what terminating_jobflows()
            expects. The state reported right after launch is kept on
            self.job_state.
        """
        job_id = self.conn.run_jobflow(
            name='My jobflow',
            log_uri='s3://eth-log/jobflow_logs',
            master_instance_type='m3.xlarge',
            slave_instance_type='m1.large',
            num_instances=2,
            steps=[self.step],
            ami_version='3.3.1')

        # The original looked the state up and threw it away; keep it so the
        # caller can inspect it.
        status = self.conn.describe_jobflow(job_id)
        self.job_state = status.state
        return job_id

    def terminating_jobflows(self, job_id):
        """Terminate the job flow identified by job_id."""
        self.conn.terminate_jobflow(job_id)
# Input shards split_A.txt .. split_M.txt, all stored under s3n://smalldata/.
files_short = ['split_%s.txt' % letter for letter in 'ABCDEFGHIJKLM']
input_files = ['s3n://smalldata/' + name for name in files_short]

# Streaming step that builds the inverted index over all shards.
# NOTE(review): the reducer points at the *mapper* script
# (inv-index-mapper.py) -- confirm whether a separate reducer was intended.
# Alternative single-file input: 's3n://smalldata/wikipedia_titles.txt'.
# A stop-list could be shipped via
# cache_files=['s3n://css739/invindex/english_stoplist.py'].
step = StreamingStep(
    name='Inverted Index ',
    mapper='s3n://css739/invIndex/inv-index-mapper.py',
    reducer='s3n://css739/invIndex/inv-index-mapper.py',
    input=input_files,
    output='s3n://css739/invIndex/invindex_output2',
)

jobid = conn.run_jobflow(
    name='Inverted Index',
    log_uri='s3n://css739/invIndex/jobflow_logs',
    steps=[step],
)

# Bare expression: only displays the state in an interactive session.
conn.describe_jobflow(jobid).state
step2 = JarStep(name='Run Hive Script', jar='s3://elasticmapreduce/libs/script-runner/script-runner.jar', step_args=['s3://elasticmapreduce/libs/hive/hive-script', '--run-hive-script', '--args', '-f', 's3://dphive/mmhadooprollup.hql', '-d', 'INPUT=s3://mmlogs', '-d', 'OUTPUT=s3://dphiveoutput']) jobname = 'MM Logs Jobflow %s' %dt.datetime.now() jobid = conne.run_jobflow(name=jobname, log_uri='s3://dphive/debug/', ec2_keyname='dpaws', master_instance_type='c1.medium', slave_instance_type='c1.medium', num_instances=3, steps=[step1, step2]) while True: status = conne.describe_jobflow(jobid) if status.state == 'STARTING': time.sleep(10) elif status.state == 'RUNNING': time.sleep(10) elif status.state == 'WAITING': time.sleep(10) elif status.state == 'TERMINATED':
def create_emr_cluster(cr):
    """
    @PARAM: Cluster configuration reader object

    Creates an EMR cluster given a set of configuration parameters and blocks
    until the cluster is either ready for work or has stopped.

    The cluster is created with one s3-dist-cp step that loads the automation
    test data into HDFS. Values are read from ``cr.get_config(<key>)``.

    Return: EMR Cluster ID, or the string "ERROR" when the cluster is
    shutting down, failed or terminated instead of reaching WAITING.
    """
    region_name = cr.get_config("aws_region")
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region=RegionInfo(
            name=region_name,
            endpoint=region_name + ".elasticmapreduce.amazonaws.com"))

    market = cr.get_config("emr_market_type")

    # Create list of instance groups: master, core, and task
    instance_groups = [
        InstanceGroup(num_instances=cr.get_config("emr_master_node_count"),
                      role="MASTER",
                      type=cr.get_config("emr_master_node_type"),
                      market=market,
                      name="Master Node"),
        InstanceGroup(num_instances=cr.get_config("emr_core_node_count"),
                      role="CORE",
                      type=cr.get_config("emr_core_node_type"),
                      market=market,
                      name="Core Node"),
    ]

    # Only create task nodes if specifically asked for
    task_node_count = cr.get_config("emr_task_node_count")
    if task_node_count > 0:
        instance_groups.append(
            InstanceGroup(num_instances=task_node_count,
                          role="TASK",
                          type=cr.get_config("emr_task_node_type"),
                          market=market,
                          name="Task Node"))

    print("Creating EMR Cluster with instance groups: {0}".format(
        instance_groups))

    # Use these params to add overrides, these will go away in Boto3
    api_params = {
        "Instances.Ec2SubnetId": cr.get_config("aws_subnet_id"),
        "ReleaseLabel": cr.get_config("emr_version"),
    }

    # Add step to load data
    step_args = [
        "s3-dist-cp",
        "--s3Endpoint=s3-us-west-1.amazonaws.com",
        "--src=s3://alpine-qa/automation/automation_test_data/",
        "--dest=hdfs:///automation_test_data",
        "--srcPattern=.*[a-zA-Z,]+",
    ]
    step = JarStep(name="s3distcp for data loading",
                   jar="command-runner.jar",
                   step_args=step_args,
                   action_on_failure="CONTINUE")

    # The release is selected through api_params["ReleaseLabel"], so
    # hadoop_version / ami_version are deliberately not passed.
    cluster_id = conn.run_jobflow(
        cr.get_config("emr_cluster_name"),
        instance_groups=instance_groups,
        action_on_failure="TERMINATE_JOB_FLOW",
        keep_alive=True,
        enable_debugging=True,
        log_uri=cr.get_config("emr_log_uri"),
        steps=[step],
        bootstrap_actions=[],
        ec2_keyname=cr.get_config("ec2_keyname"),
        visible_to_all_users=True,
        job_flow_role="EMR_EC2_DefaultRole",
        service_role="EMR_DefaultRole",
        api_params=api_params)
    print("EMR Cluster created, cluster id: {0}".format(cluster_id))

    # describe_cluster reports TERMINATING / TERMINATED /
    # TERMINATED_WITH_ERRORS when a cluster goes away; without them in the
    # exit set the loop below would poll forever. The older job-flow state
    # names are kept for compatibility.
    failure_states = (u'SHUTTING_DOWN', u'FAILED', u'TERMINATING',
                      u'TERMINATED', u'TERMINATED_WITH_ERRORS')
    settled_states = (u'COMPLETED', u'WAITING') + failure_states

    state = conn.describe_cluster(cluster_id).status.state
    while state not in settled_states:
        # sleeping to recheck for status.
        time.sleep(5)
        state = conn.describe_cluster(cluster_id).status.state
        print("State is: {0}, sleeping 5s...".format(state))

    if state in failure_states:
        return "ERROR"

    # Check if the state is WAITING. Then launch the next steps
    if state == u'WAITING':
        # Finding the master node dns of EMR cluster
        master_dns = conn.describe_cluster(cluster_id).masterpublicdnsname
        print("DNS Name: {0}".format(master_dns))

    return cluster_id
) k.close() k = Key(b) k.key = "reducer.py" k.set_contents_from_filename( "/Users/winteram/Documents/Teaching/WebAnalytics_2013S/BIA660-2013S/course_docs/20130319/reducer.py" ) k.close() ### Running code with EMR # emrcon = EmrConnection('<aws access key>', '<aws secret key>') emrcon = EmrConnection("0CY3BC386720ZYZNWZ02", "Jv37SHb/XNeqpY8vMrGeclcL6abfKHKd9Eeh5fmy") step = StreamingStep( name="Alcohol Step", mapper="s3n://bia660-winter/mapper.py", reducer="s3n://bia660-winter/reducer.py", input="s3://datasets.elasticmapreduce/ngrams/books/20090715/eng-us-all/3gram/data", output="s3n://bia660-winter/output/alcohol_religion", ) jobid = emrcon.run_jobflow( name="Alcohol Religion 10", log_uri="s3://bia660-winter/logfiles", steps=[step], num_instances=4 ) print "Job created: %s" % jobid status = emrcon.describe_jobflow(jobid) print status.state
class Rankmaniac:
    '''Rankmaniac Wrapper

    This class provides a simple wrapper around the Amazon Web Services SDK.
    It should provide all the functionality required in terms of MapReduce,
    so students don't need to worry about learning the EMR and S3 API.
    '''

    def __init__(self, team_id, access_key, secret_key):
        '''Rankmaniac class constructor

        Creates a new instance of the Rankmaniac Wrapper for a specific team.

        Arguments:
            team_id         string      the team ID.
            access_key      string      AWS access key.
            secret_key      string      AWS secret key.
        '''
        # Set first so __del__ stays safe even if a connection below raises.
        self.job_id = None

        self.s3_bucket = 'cs144caltech'
        self.team_id = team_id
        self.emr_conn = EmrConnection(access_key, secret_key)
        self.s3_conn = S3Connection(access_key, secret_key)

    def __del__(self):
        # getattr: the attribute is missing when __init__ never ran/finished.
        if getattr(self, 'job_id', None):
            self.terminate_job()

    def submit_job(self, mapper, reducer, input, output, num_map=1,
                   num_reduce=1):
        '''Submit a new MapReduce job

        Submits a new MapReduce job with a single step. To add more steps,
        call add_step. To terminate this job, call terminate_job.

        Arguments:
            mapper          string      path to the mapper, relative to your
                                        data directory.
            reducer         string      path to the reducer, relative to your
                                        data directory.
            input           string      path to the input data, relative to
                                        your data directory. To specify a
                                        directory as input, ensure your path
                                        contains a trailing /.
            output          string      path to the desired output directory.
            num_map         int         number of map tasks for this job.
            num_reduce      int         number of reduce tasks for this job.

        Raises:
            Exception if this wrapper already tracks a running job.
        '''
        if self.job_id:
            raise Exception('There currently already exists a running job.')

        job_name = self._make_name()
        step = self._make_step(mapper, reducer, input, output, num_map,
                               num_reduce)
        self.job_id = self.emr_conn.run_jobflow(
            name=job_name,
            steps=[step],
            num_instances=1,
            log_uri=self._get_s3_url() + 'job_logs',
            keep_alive=True)

    def terminate_job(self):
        '''Terminate a running MapReduce job

        Stops the current running job.

        Raises:
            Exception if no job is running.
        '''
        if not self.job_id:
            raise Exception('No job is running.')

        self.emr_conn.terminate_jobflow(self.job_id)
        self.job_id = None

    def get_job(self):
        '''Gets the running job details

        Returns:
            JobFlow object with relevant fields:
                state       string      the state of the job flow, either
                                        COMPLETED | FAILED | TERMINATED
                                        RUNNING | SHUTTING_DOWN | STARTING
                                        WAITING | BOOTSTRAPPING
                steps       list(Step)  a list of the step details in the
                                        workflow. A Step has the relevant
                                        fields:
                                            status          string
                                            startdatetime   string
                                            enddatetime     string

        Note: Amazon has an upper-limit on the frequency with which you can
              call this function; we have had success with calling it once
              every 10 seconds.

        Raises:
            Exception if no job is running.
        '''
        if not self.job_id:
            raise Exception('No job is running.')

        return self.emr_conn.describe_jobflow(self.job_id)

    def add_step(self, mapper, reducer, input, output, num_map=1,
                 num_reduce=1):
        '''Add a step to an existing job

        Adds a new step to an already running job flow.

        Note: any given job flow can support up to 256 steps. To workaround
              this limitation, you can always choose to submit a new job
              once the current job completes.

        Arguments:
            mapper          string      path to the mapper, relative to your
                                        data directory.
            reducer         string      path to the reducer, relative to your
                                        data directory.
            input           string      path to the input data, relative to
                                        your data directory. To specify a
                                        directory as input, ensure your path
                                        contains a trailing /.
            output          string      path to the desired output directory.
            num_map         int         number of map tasks for this step.
            num_reduce      int         number of reduce tasks for this step.

        Raises:
            Exception if no job is running.
        '''
        if not self.job_id:
            raise Exception('No job is running.')

        step = self._make_step(mapper, reducer, input, output, num_map,
                               num_reduce)
        self.emr_conn.add_jobflow_steps(self.job_id, [step])

    def upload(self, in_dir='data'):
        '''Upload local data to S3

        Uploads the files in the specified directory to S3, where it can be
        used by Elastic MapReduce. Everything already stored under the team's
        prefix is deleted first.

        Note: this method only uploads files in the root of in_dir. It does
              NOT scan through subdirectories.

        Arguments:
            in_dir          string      optional, defaults to 'data'. Uses
                                        this directory as the base directory
                                        from which to upload.
        '''
        bucket = self.s3_conn.get_bucket(self.s3_bucket)
        self._delete_prefix(bucket, '%s/' % self.team_id)

        for file_name in os.listdir(in_dir):
            local_path = os.path.join(in_dir, file_name)
            if not os.path.isfile(local_path):
                continue
            key = Key(bucket)
            # S3 keys always use '/', whatever the local path separator is.
            key.key = '%s/%s' % (self.team_id, file_name)
            key.set_contents_from_filename(local_path)

    def download(self, out_dir='data'):
        '''Download S3 data to local directory

        Downloads S3 data to the specified directory.

        Note: this method DOES download the entire directory hierarchy as
              given by S3. It will create subdirectories as needed.

        Arguments:
            out_dir         string      optional, defaults to 'data'.
                                        Downloads files to this directory.
        '''
        bucket = self.s3_conn.get_bucket(self.s3_bucket)
        keys = bucket.list(prefix='%s/' % self.team_id)
        for key in keys:
            # Keys ending in '/' are directory placeholders, not files.
            if key.name.endswith('/'):
                continue

            # Drop the leading team-id component of the key.
            fp = os.path.join(out_dir, '/'.join(key.name.split('/')[1:]))
            fp_dir = os.path.dirname(fp)
            if os.path.exists(fp):
                os.remove(fp)
            elif fp_dir and not os.path.exists(fp_dir):
                os.makedirs(fp_dir)
            key.get_contents_to_filename(fp)

    def _make_name(self):
        '''Return "<team_id>-<MM-DD-YYYY HH:MM:SS>" for the current local time.'''
        # %S is seconds; the former %s is a non-standard directive.
        return '%s-%s' % (self.team_id,
                          strftime('%m-%d-%Y %H:%M:%S', localtime()))

    def _make_step(self, mapper, reducer, input, output, nm=1, nr=1):
        '''Build a StreamingStep for the given team-relative paths.

        Any existing S3 keys under the output prefix are deleted first.
        '''
        job_name = self._make_name()
        team_s3 = self._get_s3_url()

        bucket = self.s3_conn.get_bucket(self.s3_bucket)
        self._delete_prefix(bucket, '%s/%s' % (self.team_id, output))

        return StreamingStep(
            name=job_name,
            step_args=['-jobconf', 'mapred.map.tasks=%d' % nm,
                       '-jobconf', 'mapred.reduce.tasks=%d' % nr],
            mapper=team_s3 + mapper,
            reducer=team_s3 + reducer,
            input=team_s3 + input,
            output=team_s3 + output)

    def _delete_prefix(self, bucket, prefix):
        '''Delete every key in bucket whose name starts with prefix.'''
        bucket.delete_keys([k.name for k in bucket.list(prefix=prefix)])

    def _get_s3_url(self):
        '''Return the s3n:// URL of this team's directory (trailing slash).'''
        return 's3n://%s/%s/' % (self.s3_bucket, self.team_id)
from boto.emr.connection import EmrConnection

# Description:
# EmrConnection can be used to create a new emr job

# initialize emr connection
conn = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")

# run job flow with 10 instances
# (run_jobflow requires a job flow name as its first argument)
conn.run_jobflow(name="My jobflow",
                 num_instances=10,
                 master_instance_type="m1.small",
                 slave_instance_type="m1.small")
market="ON_DEMAND", name="Main node")) instance_groups.append(InstanceGroup( num_instances=6, role="CORE", type="m1.large", market="ON_DEMAND", name="node")) instance_groups.append(InstanceGroup( num_instances=6, role="TASK", type="m1.large", market="SPOT", name="spot node", bidprice="0.004")) job_id = conn.run_jobflow( 'MyCluster', instance_groups=instance_groups, action_on_failure='TERMINATE_JOB_FLOW', keep_alive=False, enable_debugging=True, log_uri='s3://'+holder+'/log', hadoop_version=None, ami_version="2.4.9", steps=[step], bootstrap_actions=[], ec2_keyname='euireland1kp', visible_to_all_users=True, job_flow_role="EMR_EC2_DefaultRole", service_role="EMR_DefaultRole")
# <codecell>

# Streaming word count, modelled on EMR's wordcount sample. The stock sample
# lives at s3n://elasticmapreduce/samples/wordcount/ (wordSplitter.py, input);
# here our own mapper, reducer and input from BUCKET are used instead.
step = StreamingStep(
    name="hwu9 wordcount example",
    mapper="s3n://" + BUCKET + "/mapper.py",
    reducer="s3n://" + BUCKET + "/reducer.py",
    input="s3n://" + BUCKET + "/test.txt",
    output="s3n://" + BUCKET + "/output/wordcount_output",
)

# <codecell>

jobid = emrcon.run_jobflow(
    name="hwu9 Word Count Example",
    log_uri="s3://" + BUCKET + "/logs",
    steps=[step],
)

# <codecell>

print(jobid)

# Watch the job from a separate process; block on the queue until
# check_status reports a result.
result_queue = multiprocessing.Queue()
process = multiprocessing.Process(
    target=check_status,
    args=[emrcon, jobid, result_queue],
)
process.start()
result = result_queue.get()

# <codecell>
class Rankmaniac:
    '''Rankmaniac Wrapper

    This class provides a simple wrapper around the Amazon Web Services SDK.
    It should provide all the functionality required in terms of MapReduce,
    so students don't need to worry about learning the EMR and S3 API.
    '''

    def __init__(self, team_id, access_key, secret_key):
        '''Rankmaniac class constructor

        Creates a new instance of the Rankmaniac Wrapper for a specific team.

        Arguments:
            team_id         string      the team ID.
            access_key      string      AWS access key.
            secret_key      string      AWS secret key.
        '''
        # Set first so __del__ stays safe even if a connection below raises.
        self.job_id = None

        self.s3_bucket = 'cs144caltech'
        self.team_id = team_id
        self.emr_conn = EmrConnection(access_key, secret_key)
        self.s3_conn = S3Connection(access_key, secret_key)

    def __del__(self):
        # getattr: the attribute is missing when __init__ never ran/finished.
        if getattr(self, 'job_id', None):
            self.terminate_job()

    def submit_job(self, mapper, reducer, input, output, num_map=1,
                   num_reduce=1):
        '''Submit a new MapReduce job

        Submits a new MapReduce job with a single step. To add more steps,
        call add_step. To terminate this job, call terminate_job.

        Arguments:
            mapper          string      path to the mapper, relative to your
                                        data directory.
            reducer         string      path to the reducer, relative to your
                                        data directory.
            input           string      path to the input data, relative to
                                        your data directory. To specify a
                                        directory as input, ensure your path
                                        contains a trailing /.
            output          string      path to the desired output directory.
            num_map         int         number of map tasks for this job.
            num_reduce      int         number of reduce tasks for this job.

        Raises:
            Exception if this wrapper already tracks a running job.
        '''
        if self.job_id:
            raise Exception('There currently already exists a running job.')

        job_name = self._make_name()
        step = self._make_step(mapper, reducer, input, output, num_map,
                               num_reduce)
        self.job_id = self.emr_conn.run_jobflow(
            name=job_name,
            steps=[step],
            num_instances=1,
            log_uri=self._get_s3_url() + 'job_logs',
            keep_alive=True)

    def terminate_job(self):
        '''Terminate a running MapReduce job

        Stops the current running job.

        Raises:
            Exception if no job is running.
        '''
        if not self.job_id:
            raise Exception('No job is running.')

        self.emr_conn.terminate_jobflow(self.job_id)
        self.job_id = None

    def get_job(self):
        '''Gets the running job details

        Returns:
            JobFlow object with relevant fields:
                state       string      the state of the job flow, either
                                        COMPLETED | FAILED | TERMINATED
                                        RUNNING | SHUTTING_DOWN | STARTING
                                        WAITING | BOOTSTRAPPING
                steps       list(Step)  a list of the step details in the
                                        workflow. A Step has the relevant
                                        fields:
                                            status          string
                                            startdatetime   string
                                            enddatetime     string

        Note: Amazon has an upper-limit on the frequency with which you can
              call this function; we have had success with calling it once
              every 10 seconds.

        Raises:
            Exception if no job is running.
        '''
        if not self.job_id:
            raise Exception('No job is running.')

        return self.emr_conn.describe_jobflow(self.job_id)

    def add_step(self, mapper, reducer, input, output, num_map=1,
                 num_reduce=1):
        '''Add a step to an existing job

        Adds a new step to an already running job flow.

        Note: any given job flow can support up to 256 steps. To workaround
              this limitation, you can always choose to submit a new job
              once the current job completes.

        Arguments:
            mapper          string      path to the mapper, relative to your
                                        data directory.
            reducer         string      path to the reducer, relative to your
                                        data directory.
            input           string      path to the input data, relative to
                                        your data directory. To specify a
                                        directory as input, ensure your path
                                        contains a trailing /.
            output          string      path to the desired output directory.
            num_map         int         number of map tasks for this step.
            num_reduce      int         number of reduce tasks for this step.

        Raises:
            Exception if no job is running.
        '''
        if not self.job_id:
            raise Exception('No job is running.')

        step = self._make_step(mapper, reducer, input, output, num_map,
                               num_reduce)
        self.emr_conn.add_jobflow_steps(self.job_id, [step])

    def upload(self, in_dir='data'):
        '''Upload local data to S3

        Uploads the files in the specified directory to S3, where it can be
        used by Elastic MapReduce. Everything already stored under the team's
        prefix is deleted first.

        Note: this method only uploads files in the root of in_dir. It does
              NOT scan through subdirectories.

        Arguments:
            in_dir          string      optional, defaults to 'data'. Uses
                                        this directory as the base directory
                                        from which to upload.
        '''
        bucket = self.s3_conn.get_bucket(self.s3_bucket)
        self._delete_prefix(bucket, '%s/' % self.team_id)

        for file_name in os.listdir(in_dir):
            local_path = os.path.join(in_dir, file_name)
            if not os.path.isfile(local_path):
                continue
            key = Key(bucket)
            # S3 keys always use '/', whatever the local path separator is.
            key.key = '%s/%s' % (self.team_id, file_name)
            key.set_contents_from_filename(local_path)

    def download(self, out_dir='data'):
        '''Download S3 data to local directory

        Downloads S3 data to the specified directory.

        Note: this method DOES download the entire directory hierarchy as
              given by S3. It will create subdirectories as needed.

        Arguments:
            out_dir         string      optional, defaults to 'data'.
                                        Downloads files to this directory.
        '''
        bucket = self.s3_conn.get_bucket(self.s3_bucket)
        keys = bucket.list(prefix='%s/' % self.team_id)
        for key in keys:
            # Keys ending in '/' are directory placeholders, not files.
            if key.name.endswith('/'):
                continue

            # Drop the leading team-id component of the key.
            fp = os.path.join(out_dir, '/'.join(key.name.split('/')[1:]))
            fp_dir = os.path.dirname(fp)
            if os.path.exists(fp):
                os.remove(fp)
            elif fp_dir and not os.path.exists(fp_dir):
                os.makedirs(fp_dir)
            key.get_contents_to_filename(fp)

    def _make_name(self):
        '''Return "<team_id>-<MM-DD-YYYY HH:MM:SS>" for the current local time.'''
        # %S is seconds; the former %s is a non-standard directive.
        return '%s-%s' % (self.team_id,
                          strftime('%m-%d-%Y %H:%M:%S', localtime()))

    def _make_step(self, mapper, reducer, input, output, nm=1, nr=1):
        '''Build a StreamingStep for the given team-relative paths.

        Any existing S3 keys under the output prefix are deleted first.
        '''
        job_name = self._make_name()
        team_s3 = self._get_s3_url()

        bucket = self.s3_conn.get_bucket(self.s3_bucket)
        self._delete_prefix(bucket, '%s/%s' % (self.team_id, output))

        return StreamingStep(
            name=job_name,
            step_args=['-jobconf', 'mapred.map.tasks=%d' % nm,
                       '-jobconf', 'mapred.reduce.tasks=%d' % nr],
            mapper=team_s3 + mapper,
            reducer=team_s3 + reducer,
            input=team_s3 + input,
            output=team_s3 + output)

    def _delete_prefix(self, bucket, prefix):
        '''Delete every key in bucket whose name starts with prefix.'''
        bucket.delete_keys([k.name for k in bucket.list(prefix=prefix)])

    def _get_s3_url(self):
        '''Return the s3n:// URL of this team's directory (trailing slash).'''
        return 's3n://%s/%s/' % (self.s3_bucket, self.team_id)
class EmrClient(object):
    """Helper around boto's EmrConnection for launching an EMR cluster,
    adding Pig steps to it and shutting it down, polling until each
    operation has settled.
    """

    # The Hadoop version to use
    HADOOP_VERSION = '1.0.3'

    # The AMI version to use
    AMI_VERSION = '2.4.7'

    # Interval to wait between polls to EMR cluster in seconds
    CLUSTER_OPERATION_RESULTS_POLLING_SECONDS = 10

    # Timeout for EMR creation and ramp up in seconds
    CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS = 60 * 30

    def __init__(self, region_name='us-east-1', aws_access_key_id=None,
                 aws_secret_access_key=None):
        """Open an EMR connection to region_name.

        Credentials that are not passed in are read from the [aws] section
        of the luigi configuration.
        """
        # If the access key is not specified, get it from the luigi
        # config.cfg file
        if not aws_access_key_id:
            aws_access_key_id = luigi.configuration.get_config().get(
                'aws', 'aws_access_key_id')
        if not aws_secret_access_key:
            aws_secret_access_key = luigi.configuration.get_config().get(
                'aws', 'aws_secret_access_key')

        # Create the region in which to run
        region_endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
        region = RegionInfo(name=region_name, endpoint=region_endpoint)

        self.emr_connection = EmrConnection(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region=region)

    def launch_emr_cluster(self, cluster_name, log_uri, ec2_keyname=None,
                           master_type='m1.small', core_type='m1.small',
                           num_instances=2, hadoop_version=HADOOP_VERSION,
                           ami_version=AMI_VERSION):
        """Start a keep-alive cluster without steps and wait until it is ready.

        hadoop_version and ami_version are now passed through to boto; they
        used to be accepted but ignored in favour of the class constants
        (which are still their default values).

        Returns:
            The job flow id once the cluster has reached WAITING.

        Raises:
            RuntimeError if the cluster fails to start or does not become
            ready within the timeout.
        """
        jobflow_id = self.emr_connection.run_jobflow(
            name=cluster_name,
            log_uri=log_uri,
            ec2_keyname=ec2_keyname,
            master_instance_type=master_type,
            slave_instance_type=core_type,
            num_instances=num_instances,
            keep_alive=True,
            enable_debugging=True,
            hadoop_version=hadoop_version,
            steps=[],
            ami_version=ami_version)

        # Log important information
        status = self.emr_connection.describe_jobflow(jobflow_id)
        logger.info('Creating new cluster %s with following details',
                    status.name)
        logger.info('jobflow ID:\t%s', status.jobflowid)
        logger.info('Log URI:\t%s', status.loguri)
        logger.info('Master Instance Type:\t%s', status.masterinstancetype)

        # A cluster of size 1 does not have any slave instances
        if hasattr(status, 'slaveinstancetype'):
            logger.info('Slave Instance Type:\t%s', status.slaveinstancetype)

        logger.info('Number of Instances:\t%s', status.instancecount)
        logger.info('Hadoop Version:\t%s', status.hadoopversion)
        logger.info('AMI Version:\t%s', status.amiversion)
        logger.info('Keep Alive:\t%s', status.keepjobflowalivewhennosteps)

        return self._poll_until_cluster_ready(jobflow_id)

    def add_pig_step(self, jobflow_id, pig_file, name='Pig Script',
                     pig_versions='latest', pig_args=None):
        """Add a Pig step to the cluster and wait until it is WAITING again.

        pig_args defaults to an empty argument list.

        Returns:
            The job flow id.
        """
        # None instead of a shared mutable [] default.
        if pig_args is None:
            pig_args = []

        pig_step = PigStep(name=name,
                           pig_file=pig_file,
                           pig_versions=pig_versions,
                           pig_args=pig_args)

        self.emr_connection.add_jobflow_steps(jobflow_id, [pig_step])

        # Poll until the cluster is done working
        return self._poll_until_cluster_ready(jobflow_id)

    def shutdown_emr_cluster(self, jobflow_id):
        """Terminate the cluster and wait until it has shut down.

        Returns whatever _poll_until_cluster_shutdown returns.
        """
        self.emr_connection.terminate_jobflow(jobflow_id)
        return self._poll_until_cluster_shutdown(jobflow_id)

    def get_jobflow_id(self):
        """Return the id of the first cluster that is WAITING for work.

        Raises IndexError when no cluster is in the WAITING state.
        """
        waiting = self.emr_connection.list_clusters(cluster_states=['WAITING'])
        return waiting.clusters[0].id

    def get_master_dns(self):
        """
        Get the master node's public address
        """
        # Get the jobflow ID (this used to call get_master_dns itself and
        # recursed without end).
        jobflow_id = self.get_jobflow_id()

        # Use the jobflow ID to get the status
        status = self.emr_connection.describe_jobflow(jobflow_id)

        # Return the master's public dns
        return status.masterpublicdnsname

    def _poll_until_cluster_ready(self, jobflow_id):
        """Poll until the cluster is WAITING and return jobflow_id.

        Raises RuntimeError if the cluster stops instead, or on timeout.
        """
        stopped_states = (u'COMPLETED', u'SHUTTING_DOWN', u'FAILED',
                          u'TERMINATED')
        timeout = EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS
        start_time = time.time()

        while time.time() - start_time < timeout:
            state = self.emr_connection.describe_jobflow(jobflow_id).state

            if state == u'WAITING':
                logger.info('Cluster intialized and is WAITING for work')
                return jobflow_id

            if state in stopped_states:
                logger.error('Error starting cluster; status: %s', state)
                # Poll until cluster shutdown
                self._poll_until_cluster_shutdown(jobflow_id)
                raise RuntimeError('Error, cluster failed to start')

            logger.debug('Cluster state: %s', state)
            time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)

        # TODO shutdown cluster
        raise RuntimeError('Timed out waiting for EMR cluster to be active')

    def _poll_until_cluster_shutdown(self, jobflow_id):
        """Poll until the cluster reports TERMINATED, COMPLETED or FAILED.

        Returns False in all three cases.
        NOTE(review): the original had an unreachable ``return True`` at the
        end, so a clean shutdown may have been meant to return True; the
        existing return value is kept because shutdown_emr_cluster exposes
        it to callers -- confirm the intended contract.

        Raises RuntimeError on timeout.
        """
        timeout = EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS
        start_time = time.time()

        while time.time() - start_time < timeout:
            state = self.emr_connection.describe_jobflow(jobflow_id).state

            if state in (u'TERMINATED', u'COMPLETED'):
                logger.info('Cluster successfully shutdown with status: %s',
                            state)
                return False

            if state == u'FAILED':
                logger.error('Cluster shutdown with FAILED status')
                return False

            logger.debug('Cluster state: %s', state)
            time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)

        # TODO shutdown cluster
        raise RuntimeError('Timed out waiting for EMR cluster to shut down')
def main(args):
    """Run the NLP pipeline job flow on EMR and wait for it to finish.

    For each cluster size i (currently only 2) a job flow is started in
    bucket nlp-<i> that chains the preprocessing, Tika, copy-jar and UIMA
    steps for each corpus size j (currently only 12). The function polls
    every 5 seconds until the job flow is COMPLETED, FAILED or TERMINATED
    and prints the elapsed time.

    Arguments:
        args    list    command line arguments (currently unused).

    Returns:
        True once every job flow has reached a final state.
    """
    for i in range(2, 3, 2):
        start_time = time.time()
        node_tag = str(i).strip()
        bucket = 's3n://nlp-' + node_tag
        corpus_sizes = [str(j).strip() for j in range(12, 13, 12)]

        emr_connection = EmrConnection()

        preprocessing_steps = [
            JarStep(name='prerocessing-' + node_tag,
                    jar=bucket + '/init/behemoth-core.jar',
                    step_args=['com.digitalpebble.behemoth.util.CorpusGenerator',
                               '-i', bucket + '/' + size + '/texts',
                               '-o', bucket + '/' + size + '/bcorpus'])
            for size in corpus_sizes]

        tika_steps = [
            JarStep(name='tika-' + node_tag,
                    jar=bucket + '/init/behemoth-tika.jar',
                    step_args=['com.digitalpebble.behemoth.tika.TikaDriver',
                               '-i', bucket + '/' + size + '/bcorpus',
                               '-o', bucket + '/' + size + '/tcorpus'])
            for size in corpus_sizes]

        copy_jar_steps = [
            JarStep(name='copy-jar-' + node_tag,
                    jar=bucket + '/init/copy-to-hdfs.jar',
                    step_args=[bucket + '/init/pipeline.pear',
                               '/mnt/pipeline.pear'])
            for size in corpus_sizes]

        uima_steps = [
            JarStep(name='uima-' + node_tag,
                    jar=bucket + '/init/behemoth-uima.jar',
                    step_args=['com.digitalpebble.behemoth.uima.UIMADriver',
                               bucket + '/' + size + '/tcorpus',
                               '/mnt/ucorpus',
                               '/mnt/pipeline.pear'])
            for size in corpus_sizes]

        steps = []
        steps.extend(preprocessing_steps)
        steps.extend(tika_steps)
        steps.extend(copy_jar_steps)
        steps.extend(uima_steps)
        # NOTE(review): extract_result_steps is not defined in this function;
        # unless it exists at module level this line raises NameError --
        # confirm where it is supposed to come from.
        steps.extend(extract_result_steps)

        hadoop_params = ['-m', 'mapred.tasktracker.map.tasks.maximum=1',
                         '-m', 'mapred.child.java.opts=-Xmx10g']
        configure_hadoop_action = BootstrapAction(
            'configure_hadoop',
            's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
            hadoop_params)

        jobid = emr_connection.run_jobflow(
            name='nlp-cloud-' + node_tag,
            log_uri='s3://nlp-' + node_tag + '/jobflow_logs',
            master_instance_type='m2.xlarge',
            slave_instance_type='m2.xlarge',
            num_instances=i,
            keep_alive=False,
            enable_debugging=False,
            bootstrap_actions=[configure_hadoop_action],
            hadoop_version='1.0.3',
            steps=steps)

        termination_statuses = [u'COMPLETED', u'FAILED', u'TERMINATED']
        while True:
            time.sleep(5)
            status = emr_connection.describe_jobflow(jobid)
            if status.state in termination_statuses:
                print('Job finished for %s nodes' % i)
                break

        print('%s  seconds elapsed' % (time.time() - start_time))

    return True


if __name__ == '__main__':
    args = sys.argv
    if check_args(args):
        if main(args):
            # Report success before exiting; sys.exit() used to come first
            # and made this message unreachable.
            print('Work successfully finished')
            sys.exit()
        else:
            print('Could not finish work')
            sys.exit(1)
    else:
        print(USAGE_MESSAGE)
        sys.exit(2)
from boto.emr.bootstrap_action import BootstrapAction
from boto.emr.connection import EmrConnection

# Description:
# BootstrapAction is an object representing a bootstrap action in Elastic Map
# Reduce (EMR), a script that gets run before the EMR job executes.

# initialize a bootstrap action
bootstrapSetup = BootstrapAction("Bootstrap Name",
                                 "s3://<my-bucket>/<my-bootstrap-action>",
                                 ["arg1=hello", "arg2=world"])

# initialize emr connection
emr_job = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")

# run emr job flow with defined bootstrap action
# (`name` is a required argument of run_jobflow; omitting it raises TypeError)
emr_job.run_jobflow(name="<job-flow-name>",
                    bootstrap_actions=[bootstrapSetup])
emrcon = EmrConnection(ACCESS, SECRET)

# <codecell>

# Using EMR's wordcount example
step_options = {
    'name': JOB_NAME,
    'mapper': MY_MAPPER_URI,
    'reducer': MY_REDUCER_URI,
    'input': MY_INPUT_URI,
    'output': MY_OUTPUT_URI,
}
step = StreamingStep(**step_options)

# <codecell>

jobid = emrcon.run_jobflow(
    name=JOB_NAME,
    log_uri=MY_LOG_URI,
    steps=[step],
    num_instances=1,
)

# <codecell>

print(jobid)

# Watch the job flow from a separate process; check_status receives the
# queue and the script blocks on it below.
result_queue = multiprocessing.Queue()
process = multiprocessing.Process(
    target=check_status,
    args=(emrcon, jobid, result_queue),
)
process.start()

# Block until a result arrives on the queue, then terminate the job flow.
result = result_queue.get()
emrcon.terminate_jobflow(jobid)
class EmrLauncher(object):
    """Launch a long-running EMR cluster with Hive, Pig, Impala and Ganglia.

    Connection settings, bucket locations and the cluster name are fixed in
    the constructor; instance types, instance count and AMI version are
    passed to ``launch_emr_cluster``.
    """

    # HDFS block size in bytes (256 MB). ``dfs.block.size`` is expressed in
    # bytes, so the literal ``256`` would not give a 256 MB block.
    HDFS_BLOCK_SIZE_BYTES = 256 * 1024 * 1024

    # Default constructor of the class.
    def __init__(self):
        """Set the configuration and open the EMR connection.

        Errors from the connection setup propagate to the caller; the
        original ``try`` had no handler and was a syntax error.
        """
        self.zone_name = "ap-southeast-1"
        self.access_key = "xxxxxx"
        self.private_key = "xxxxxxx"
        self.ec2_keyname = "xxxxxxxx"
        self.base_bucket = "s3://emr-bucket/"
        self.bootstrap_script = "custom-bootstrap.sh"
        self.log_dir = "Logs"
        self.emr_status_wait = 20
        self.cluster_name = "MyFirstEmrCluster"
        # Establishing EmrConnection
        self.conn = EmrConnection(
            self.access_key,
            self.private_key,
            region=RegionInfo(
                name=self.zone_name,
                endpoint=self.zone_name + '.elasticmapreduce.amazonaws.com'))
        self.log_bucket_name = self.base_bucket + self.log_dir
        self.bootstrap_script_name = self.base_bucket + self.bootstrap_script

    def launch_emr_cluster(self, master_type, slave_type, num_instance,
                           ami_version):
        """Start the cluster and wait until it is idle or has failed.

        Args:
            master_type: EC2 instance type of the master node.
            slave_type: EC2 instance type of the slave nodes.
            num_instance: total number of instances.
            ami_version: EMR AMI version.

        Returns:
            "SUCCESS" when the cluster reaches WAITING, "ERROR" when it
            reaches any other terminal state, and "FAILED" when an
            exception was raised while launching or polling.
        """
        try:
            # Custom Bootstrap step
            bootstrap_step = BootstrapAction(
                "CustomBootStrap", self.bootstrap_script_name, None)

            # Modifying block size to 256 MB. Each '-h' flag must be
            # followed by a key=value pair, so the dangling trailing '-h'
            # was dropped.
            block_size_conf = 'dfs.block.size=%d' % self.HDFS_BLOCK_SIZE_BYTES
            hadoop_config_params = ['-h', block_size_conf]
            hadoop_config_bootstrapper = BootstrapAction(
                'hadoop-config',
                's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
                hadoop_config_params)

            # Bootstrapping Ganglia
            hadoop_monitor_bootstrapper = BootstrapAction(
                'ganglia-config',
                's3://elasticmapreduce/bootstrap-actions/install-ganglia',
                '')

            # Bootstrapping Impala
            impala_install_params = ['--install-impala',
                                     '--base-path', 's3://elasticmapreduce',
                                     '--impala-version', 'latest']
            bootstrap_impala_install_step = BootstrapAction(
                "ImpalaInstall",
                "s3://elasticmapreduce/libs/impala/setup-impala",
                impala_install_params)

            # Hive installation
            hive_install_step = InstallHiveStep()

            # Pig Installation
            pig_install_step = InstallPigStep()

            # Launching the cluster
            jobid = self.conn.run_jobflow(
                self.cluster_name,
                self.log_bucket_name,
                bootstrap_actions=[hadoop_config_bootstrapper,
                                   hadoop_monitor_bootstrapper,
                                   bootstrap_step,
                                   bootstrap_impala_install_step],
                ec2_keyname=self.ec2_keyname,
                steps=[hive_install_step, pig_install_step],
                keep_alive=True,
                action_on_failure='CANCEL_AND_WAIT',
                master_instance_type=master_type,
                slave_instance_type=slave_type,
                num_instances=num_instance,
                ami_version=ami_version)

            # Enabling the termination protection
            self.conn.set_termination_protection(jobid, True)

            # Checking the state of EMR cluster
            terminal_states = (u'COMPLETED', u'SHUTTING_DOWN', u'FAILED',
                               u'WAITING')
            state = self.conn.describe_jobflow(jobid).state
            while state not in terminal_states:
                # sleeping to recheck for status.
                time.sleep(int(self.emr_status_wait))
                state = self.conn.describe_jobflow(jobid).state

            # Only WAITING means the cluster is up and idle.
            if state != u'WAITING':
                logging.error("Launching EMR cluster failed")
                return "ERROR"

            # Finding the master node dns of EMR cluster
            master_dns = self.conn.describe_jobflow(jobid).masterpublicdnsname
            logging.info("Launched EMR Cluster Successfully")
            logging.info("Master node DNS of EMR " + master_dns)
            return "SUCCESS"
        except Exception:
            # Keep the best-effort contract (a status string rather than an
            # exception) but record the traceback instead of hiding it.
            logging.exception("Launching EMR cluster failed")
            return "FAILED"

    def main(self):
        """Launch a three-node m3.xlarge cluster and log the outcome."""
        try:
            master_type = 'm3.xlarge'
            slave_type = 'm3.xlarge'
            num_instance = 3
            ami_version = '2.4.8'
            emr_status = self.launch_emr_cluster(
                master_type, slave_type, num_instance, ami_version)
            if emr_status == 'SUCCESS':
                logging.info("Emr cluster launched successfully")
            else:
                logging.error("Emr launching failed")
        except Exception:
            logging.exception("Emr launching failed")
class EmrClient(object):
    """Thin client around boto's EmrConnection for a long-running cluster.

    It can launch a cluster, add Pig steps to it, shut it down, and poll
    until an operation has finished.
    """

    # The Hadoop version to use
    HADOOP_VERSION = '1.0.3'

    # The AMI version to use
    AMI_VERSION = '2.4.7'

    # Interval to wait between polls to EMR cluster in seconds
    CLUSTER_OPERATION_RESULTS_POLLING_SECONDS = 10

    # Timeout for EMR creation and ramp up in seconds
    CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS = 60 * 30

    def __init__(self, region_name='us-east-1', aws_access_key_id=None,
                 aws_secret_access_key=None):
        """Open the EMR connection for ``region_name``.

        Credentials that are not passed in are read from the ``aws``
        section of the luigi configuration.
        """
        # If the access key is not specified, get it from the luigi
        # config.cfg file
        if not aws_access_key_id:
            aws_access_key_id = luigi.configuration.get_config().get(
                'aws', 'aws_access_key_id')

        if not aws_secret_access_key:
            aws_secret_access_key = luigi.configuration.get_config().get(
                'aws', 'aws_secret_access_key')

        # Create the region in which to run
        region_endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
        region = RegionInfo(name=region_name, endpoint=region_endpoint)

        self.emr_connection = EmrConnection(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region=region)

    def launch_emr_cluster(self, cluster_name, log_uri, ec2_keyname=None,
                           master_type='m1.small', core_type='m1.small',
                           num_instances=2, hadoop_version='1.0.3',
                           ami_version='2.4.7'):
        """Start a keep-alive cluster and wait until it is WAITING.

        ``hadoop_version`` and ``ami_version`` are now passed on to EMR;
        they used to be ignored in favour of the class constants, which
        hold the same values as the parameter defaults.

        Returns:
            The job flow id of the new cluster.

        Raises:
            RuntimeError: the cluster failed to start or the wait timed out.
        """
        jobflow_id = self.emr_connection.run_jobflow(
            name=cluster_name,
            log_uri=log_uri,
            ec2_keyname=ec2_keyname,
            master_instance_type=master_type,
            slave_instance_type=core_type,
            num_instances=num_instances,
            keep_alive=True,
            enable_debugging=True,
            hadoop_version=hadoop_version,
            steps=[],
            ami_version=ami_version)

        # Log important information
        status = self.emr_connection.describe_jobflow(jobflow_id)

        logger.info('Creating new cluster %s with following details'
                    % status.name)
        logger.info('jobflow ID:\t%s' % status.jobflowid)
        logger.info('Log URI:\t%s' % status.loguri)
        logger.info('Master Instance Type:\t%s' % status.masterinstancetype)

        # A cluster of size 1 does not have any slave instances
        if hasattr(status, 'slaveinstancetype'):
            logger.info('Slave Instance Type:\t%s' % status.slaveinstancetype)

        logger.info('Number of Instances:\t%s' % status.instancecount)
        logger.info('Hadoop Version:\t%s' % status.hadoopversion)
        logger.info('AMI Version:\t%s' % status.amiversion)
        logger.info('Keep Alive:\t%s' % status.keepjobflowalivewhennosteps)

        return self._poll_until_cluster_ready(jobflow_id)

    def add_pig_step(self, jobflow_id, pig_file, name='Pig Script',
                     pig_versions='latest', pig_args=None):
        """Add a Pig step to the cluster and wait until it is WAITING again.

        ``pig_args`` defaults to no arguments; ``None`` replaces the former
        mutable ``[]`` default.

        Returns:
            The job flow id.
        """
        pig_step = PigStep(
            name=name,
            pig_file=pig_file,
            pig_versions=pig_versions,
            pig_args=[] if pig_args is None else pig_args,
        )

        self.emr_connection.add_jobflow_steps(jobflow_id, [pig_step])

        # Poll until the cluster is done working
        return self._poll_until_cluster_ready(jobflow_id)

    def shutdown_emr_cluster(self, jobflow_id):
        """Terminate the cluster and wait for it to shut down.

        Returns:
            True when the cluster ended as TERMINATED or COMPLETED, False
            when it ended as FAILED.
        """
        self.emr_connection.terminate_jobflow(jobflow_id)
        return self._poll_until_cluster_shutdown(jobflow_id)

    def get_jobflow_id(self):
        """Return the id of the first cluster that is WAITING for work.

        Raises IndexError when no cluster is in the WAITING state.
        """
        return self.emr_connection.list_clusters(
            cluster_states=['WAITING']).clusters[0].id

    def get_master_dns(self):
        """Get the master node's public address."""
        # Get the jobflow ID (this used to call get_master_dns() itself and
        # recursed until the interpreter gave up)
        jobflow_id = self.get_jobflow_id()

        # Use the jobflow ID to get the status
        status = self.emr_connection.describe_jobflow(jobflow_id)

        # Return the master's public dns
        return status.masterpublicdnsname

    def _poll_until_cluster_ready(self, jobflow_id):
        """Poll until the cluster is WAITING.

        Returns:
            The job flow id.

        Raises:
            RuntimeError: the cluster reached COMPLETED, SHUTTING_DOWN,
                FAILED or TERMINATED, or the timeout elapsed.
        """
        failure_states = (u'COMPLETED', u'SHUTTING_DOWN', u'FAILED',
                          u'TERMINATED')
        deadline = (time.time()
                    + EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS)

        while time.time() < deadline:
            # Get the state
            state = self.emr_connection.describe_jobflow(jobflow_id).state

            if state == u'WAITING':
                logger.info('Cluster intialized and is WAITING for work')
                return jobflow_id

            if state in failure_states:
                logger.error('Error starting cluster; status: %s' % state)

                # Poll until cluster shutdown
                self._poll_until_cluster_shutdown(jobflow_id)
                raise RuntimeError('Error, cluster failed to start')

            logger.debug('Cluster state: %s' % state)
            time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)

        # TODO shutdown cluster
        raise RuntimeError('Timed out waiting for EMR cluster to be active')

    def _poll_until_cluster_shutdown(self, jobflow_id):
        """Poll until the cluster has shut down.

        Returns:
            True for TERMINATED or COMPLETED, False for FAILED. The success
            case used to return False as well, leaving the trailing
            ``return True`` unreachable.

        Raises:
            RuntimeError: the timeout elapsed first.
        """
        deadline = (time.time()
                    + EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS)

        while time.time() < deadline:
            # Get the state
            state = self.emr_connection.describe_jobflow(jobflow_id).state

            if state in (u'TERMINATED', u'COMPLETED'):
                logger.info('Cluster successfully shutdown with status: %s'
                            % state)
                return True

            if state == u'FAILED':
                logger.error('Cluster shutdown with FAILED status')
                return False

            logger.debug('Cluster state: %s' % state)
            time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)

        # TODO shutdown cluster
        raise RuntimeError('Timed out waiting for EMR cluster to shut down')
install_hive_step = step.InstallHiveStep(hive_versions='0.11.0.1')

# <codecell>

# Hive script plus the INPUT/OUTPUT definitions handed to it.
ngram_hive_args = [
    '-d INPUT=s3://datasets.elasticmapreduce/ngrams/books/20090715/eng-us-all/1gram/',
    '-d OUTPUT=s3://wambia660fall2013/output/',
]
names1gram = step.HiveStep(
    "fullNgramNamesBoto",
    's3://wambia660fall2013/fullNgramNamesBoto.hql',
    hive_args=ngram_hive_args,
)

# <codecell>

# Install Hive first, then run the script, on four m1.medium instances.
jobid = emrcon.run_jobflow(
    name='Names 1gram boto v3',
    log_uri='s3://wambia660fall2013/logs/',
    steps=[install_hive_step, names1gram],
    enable_debugging=True,
    master_instance_type='m1.medium',
    slave_instance_type='m1.medium',
    num_instances=4,
    hadoop_version='1.0.3',
)

# <codecell>

print(jobid)

# <codecell>

status = emrcon.describe_jobflow(jobid)
print(status.state)

# <codecell>
]""" #num_instances, role, type, market, name, instance_groups = [ InstanceGroup(1, 'MASTER', 'm1.small', 'ON_DEMAND', 'Master'), #InstanceGroup(1, 'TASK', 'm1.small', 'ON_DEMAND', 'Task'), InstanceGroup(1, 'CORE', 'm1.small', 'ON_DEMAND', 'Core') ] jf_id = emr.run_jobflow(log_uri='s3://%s/logs' %(bucket_name), name='wc jobflow', steps=[wc_step], #num_instances=NUM_INSTANCES, #master_instance_type='m1.small', #slave_instance_type='m1.small', instance_groups=instance_groups, job_flow_role = 'EMR_EC2_DefaultRole', #bootstrap_actions=[bootstrap_step], service_role = 'EMR_DefaultRole', action_on_failure='CONTINUE', visible_to_all_users="True", ami_version = '2.4', hadoop_version='1.0.3', keep_alive=True) emr.set_termination_protection(jf_id, True) print jf_id while True: jf = emr.describe_jobflow(jf_id) #print "[%s] %s" % (datetime.now().strftime("%Y-%m-%d %T"), jf.state)
import os

import boto
from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep

# SECURITY: the access key and secret used to be hard-coded here. Secrets
# must not live in source control; any key that was committed should be
# revoked. They are now read from the environment. When a variable is
# unset, None is passed and boto falls back to its own credential lookup.
AWS_KEY = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET = os.environ.get('AWS_SECRET_ACCESS_KEY')

conn = EmrConnection(AWS_KEY, AWS_SECRET)

# Streaming step: bigram splitter as mapper, built-in 'aggregate' reducer.
step = StreamingStep(
    name='My wordcount example',
    mapper='s3n://css739/wordcount/bigramSplitter.py',
    reducer='aggregate',
    input='s3n://smalldata/wikipedia_titles.txt',
    output='s3n://css739/wordcount/bigram_count_output2',
    cache_files=['s3n://css739/wordcount/english_stoplist.py'])

jobid = conn.run_jobflow(name='My jobflow',
                         log_uri='s3n://css739/wordcount/jobflow_logs',
                         steps=[step])

# The value is discarded when this runs as a script; the expression only
# shows the state in an interactive session.
conn.describe_jobflow(jobid).state
'3e212d6rs99xtiPgwKnfN1QD30WZk2hJwCWjMcGc') # <codecell> # Using EMR's wordcount example step = StreamingStep( name='My wordcount example', mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py', reducer='aggregate', input='s3n://elasticmapreduce/samples/wordcount/input', output='s3n://wambia660fall2013/output/wordcount_output') # <codecell> jobid = emrcon.run_jobflow(name='Word Count Example', log_uri='s3://wambia660fall2013/logs', steps=[step]) # <codecell> print jobid # <codecell> import re # <codecell> for word in b.list(): keystring = str(word.key) if re.match(keystring, 'part-00000'):
def create_emr_cluster(cr):
    """Create an EMR cluster from a configuration reader and wait for it.

    The cluster is created with a master group, a core group and, when
    ``emr_task_node_count`` is greater than zero, a task group. One
    s3-dist-cp step is submitted with it. The function then polls the
    cluster state every five seconds until it is idle or has ended.

    Args:
        cr: cluster configuration reader object; values are read through
            ``cr.get_config(<key>)``.

    Returns:
        The EMR cluster id, or the string "ERROR" when the cluster shut
        down, failed or was terminated while starting.
    """
    # region = cr.get_config("aws_region")
    # conn = boto.emr.connect_to_region(region)
    region_name = cr.get_config("aws_region")
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region=RegionInfo(
            name=region_name,
            endpoint=region_name + ".elasticmapreduce.amazonaws.com"))

    # Create list of instance groups: master, core, and task
    market = cr.get_config("emr_market_type")
    instance_groups = [
        InstanceGroup(
            num_instances=cr.get_config("emr_master_node_count"),
            role="MASTER",
            type=cr.get_config("emr_master_node_type"),
            market=market,
            name="Master Node"),
        InstanceGroup(
            num_instances=cr.get_config("emr_core_node_count"),
            role="CORE",
            type=cr.get_config("emr_core_node_type"),
            market=market,
            name="Core Node"),
    ]

    # Only create task nodes if specifically asked for
    if cr.get_config("emr_task_node_count") > 0:
        instance_groups.append(InstanceGroup(
            num_instances=cr.get_config("emr_task_node_count"),
            role="TASK",
            type=cr.get_config("emr_task_node_type"),
            market=market,
            name="Task Node"))

    print("Creating EMR Cluster with instance groups: {0}".format(
        instance_groups))

    # Use these params to add overrides, these will go away in Boto3
    api_params = {"Instances.Ec2SubnetId": cr.get_config("aws_subnet_id"),
                  "ReleaseLabel": cr.get_config("emr_version")}

    # Add step to load data
    step_args = ["s3-dist-cp",
                 "--s3Endpoint=s3-us-west-1.amazonaws.com",
                 "--src=s3://alpine-qa/automation/automation_test_data/",
                 "--dest=hdfs:///automation_test_data",
                 "--srcPattern=.*[a-zA-Z,]+"]
    step = JarStep(name="s3distcp for data loading",
                   jar="command-runner.jar",
                   step_args=step_args,
                   action_on_failure="CONTINUE")

    cluster_id = conn.run_jobflow(
        cr.get_config("emr_cluster_name"),
        instance_groups=instance_groups,
        action_on_failure="TERMINATE_JOB_FLOW",
        keep_alive=True,
        enable_debugging=True,
        log_uri=cr.get_config("emr_log_uri"),
        # hadoop_version = "Amazon 2.7.2",
        # ReleaseLabel = "emr-5.0.0",
        # ami_version = "5.0.0",
        steps=[step],
        bootstrap_actions=[],
        ec2_keyname=cr.get_config("ec2_keyname"),
        visible_to_all_users=True,
        job_flow_role="EMR_EC2_DefaultRole",
        service_role="EMR_DefaultRole",
        api_params=api_params)

    print("EMR Cluster created, cluster id: {0}".format(cluster_id))

    # describe_cluster reports cluster states (TERMINATING, TERMINATED,
    # TERMINATED_WITH_ERRORS), not only the legacy job-flow states checked
    # before. Without them a terminated cluster kept this loop spinning
    # forever.
    failure_states = (u'SHUTTING_DOWN', u'FAILED', u'TERMINATING',
                      u'TERMINATED', u'TERMINATED_WITH_ERRORS')
    final_states = failure_states + (u'COMPLETED', u'WAITING')

    state = conn.describe_cluster(cluster_id).status.state
    while state not in final_states:
        # sleeping to recheck for status.
        time.sleep(5)
        state = conn.describe_cluster(cluster_id).status.state
        print("State is: {0}, sleeping 5s...".format(state))

    if state in failure_states:
        return "ERROR"

    # Check if the state is WAITING. Then launch the next steps
    if state == u'WAITING':
        # Finding the master node dns of EMR cluster
        master_dns = conn.describe_cluster(cluster_id).masterpublicdnsname
        print("DNS Name: {0}".format(master_dns))
    return cluster_id
class EMRCluster(object):
    '''Representation of an EMR cluster.

    Wraps one EMR job flow: starting it, submitting a jar step and polling
    it, and terminating it (optionally moving to the next price level).

    TODO: add bridge to boto interface for unit test.
    '''
    emr_status_delay = 10  # in sec
    emr_status_max_delay = 60  # in sec
    emr_status_max_error = 30  # number of errors
    emr_max_idle = 10 * 60  # 10 min (in sec)
    rate_limit_lock = RateLimitLock()

    def __init__(self, prop):
        '''Constructor, initialize EMR connection.'''
        self.prop = prop
        self.conn = EmrConnection(self.prop.ec2.key, self.prop.ec2.secret)
        self.jobid = None
        self.retry = 0
        self.level = 0
        self.last_update = -1

    @property
    def priority(self):
        '''The priority used in EMRManager. The lower value, the higher priority.

        Returns 1 while no job flow is attached and 0 otherwise.
        '''
        with EMRCluster.rate_limit_lock:
            if self.jobid is None:
                return 1
            return 0

    def get_instance_groups(self):
        '''Get instance groups to start a cluster.

        It calculates the price with self.level, which indicates the price
        upgrades from the original price.
        '''
        instance_groups = []
        for group in self.prop.emr.instance_groups:
            (num, group_name, instance_type) = group
            # Clamp so that 0 <= level < len(price_upgrade_rate).
            level = max(0, min(self.level,
                               len(self.prop.emr.price_upgrade_rate) - 1))
            bprice = (self.prop.emr.prices[instance_type]
                      * self.prop.emr.price_upgrade_rate[level])
            name = '%s-%s@%f' % (group_name, 'SPOT', bprice)

            # Use on-demand instance if prices are zero.
            if bprice > 0:
                ig = InstanceGroup(num, group_name, instance_type, 'SPOT',
                                   name, '%.3f' % bprice)
            else:
                ig = InstanceGroup(num, group_name, instance_type,
                                   'ON_DEMAND', name)
            instance_groups.append(ig)
        return instance_groups

    def get_bootstrap_actions(self):
        '''Get list of bootstrap actions from property.

        Each entry is (name, path, *args); at least name and path are
        required.
        '''
        actions = []
        for bootstrap_action in self.prop.emr.bootstrap_actions:
            assert len(bootstrap_action) >= 2, \
                'Wrong bootstrap action definition: ' + str(bootstrap_action)
            actions.append(BootstrapAction(bootstrap_action[0],
                                           bootstrap_action[1],
                                           bootstrap_action[2:]))
        return actions

    @synchronized
    def start(self):
        '''Start a EMR cluster.

        Raises ValueError when emr.project_name is not set.
        '''
        # emr.project_name is required
        if self.prop.emr.project_name is None:
            raise ValueError('emr.project_name is not set')

        self.last_update = time.time()
        with EMRCluster.rate_limit_lock:
            self.jobid = self.conn.run_jobflow(
                name=self.prop.emr.cluster_name,
                ec2_keyname=self.prop.emr.keyname,
                log_uri=self.prop.emr.log_uri,
                ami_version=self.prop.emr.ami_version,
                bootstrap_actions=self.get_bootstrap_actions(),
                keep_alive=True,
                action_on_failure='CONTINUE',
                api_params={'VisibleToAllUsers': 'true'},
                instance_groups=self.get_instance_groups())
        message('Job flow created: %s', self.jobid)

        # Tag EC2 instances to allow future analysis
        tags = {
            'FlowControl': 'Briefly',
            'Project': self.prop.emr.project_name
        }
        if self.prop.emr.tags is not None:
            assert isinstance(self.prop.emr.tags, dict)
            # User tags override the defaults. update() gives the same
            # result as dict(a.items() + b.items()) but does not depend on
            # items() returning a list.
            tags.update(self.prop.emr.tags)
        self.conn.add_tags(self.jobid, tags)

    @synchronized
    def terminate(self, level_upgrade=0):
        '''Terminate this EMR cluster.

        level_upgrade is added to the price level used for the next start.
        Termination is attempted up to three times.
        '''
        if self.jobid is None:
            return

        self.level += level_upgrade  # upgrade to another price level

        message('Terminate jobflow: %s', self.jobid)
        for i in range(3):
            try:
                with EMRCluster.rate_limit_lock:
                    self.conn.terminate_jobflow(self.jobid)
                break
            except Exception:
                message('Unable to terminate job flow: %s', self.jobid)
                message(traceback.format_exc())
        # We have to set jobid as None to create new cluster;
        # otherwise, run_steps will keep launching jobs on the bad cluster.
        self.jobid = None

    def is_idle(self):
        '''Check if this EMR cluster is idle?'''
        return (self.jobid is not None) and (
            (time.time() - self.last_update) > self.emr_max_idle)

    def get_steps(self, node):
        '''Get the jar step from the node.'''
        step = JarStep(name=node.config.sub(node.config.emr.step_name,
                                            node_hash=node.hash()),
                       main_class=node.config.main_class,
                       jar=node.config.hadoop.jar,
                       action_on_failure='CONTINUE',
                       step_args=node.process_args(*node.config.args))
        return [step]

    def get_step_index(self, step_id):
        '''Get the index of a step given step_id (1 based)'''
        steps = [
            step.id
            for step in reversed(self.conn.list_steps(self.jobid).steps)
            if step.status is not None
        ]
        # revert the index since latest step is on top of the list
        return steps.index(step_id) + 1

    def run_steps(self, node, wait=True):
        '''Main loop to execute a node.

        It will block until step complete or failure, and will raise
        exception for failures so that the step will be retried.

        With wait=False the step is only submitted. HadoopFailure is then
        no longer raised for the never-polled status; it used to be raised
        unconditionally in that case.

        TODO: add timeouts for each step?
        TODO: dynamic increase cluster size?
        '''
        if not self.jobid:
            self.start()

        try:
            with EMRCluster.rate_limit_lock:
                # Here we just add single step. And get the step_id for
                # following checks.
                step_id = self.conn.add_jobflow_steps(
                    self.jobid, self.get_steps(node)).stepids[0].value
                assert step_id is not None
        except Exception:
            node.log('Unable to add jobflow steps: %s', node.hash())
            node.log('%s', traceback.format_exc())
            raise HadoopFailure()

        status_error_counter = 0
        step_status = 'PENDING'
        step_index = None
        step_start = time.time()

        # notify the node with status.
        node.notify_status('Running on EMR: %s', self.jobid)

        while wait and step_status in ['PENDING', 'RUNNING']:
            try:
                # wait first for the status turning to 'RUNNING' from
                # 'WAITING'. Exponential delay for errors.
                # Cap delay to a predefined limit.
                delay = min(self.emr_status_delay * (2**status_error_counter),
                            self.emr_status_max_delay)
                time.sleep(delay)

                # Keep current cluster alive.
                self.last_update = time.time()

                # Get current cluster status. May raise exception due to
                # EMR request throttle.
                cluster_state = self.conn.describe_cluster(
                    self.jobid).status.state

                if step_index is None:
                    step_index = self.get_step_index(step_id)
                    node.log('Step #: %d', step_index)
                    node.log('Log URI: %s/%s/steps/%d/',
                             node.config.emr.log_uri, self.jobid, step_index)

                step_status = self.conn.describe_step(
                    self.jobid, step_id).status.state
                status_error_counter = 0  # reset counter
                node.log("%s: %s %s", self.jobid, cluster_state, step_status)

                if cluster_state in [
                        'TERMINATING', 'TERMINATED', 'TERMINATED_WITH_ERRORS'
                ]:  # cluster kill (maybe due to spot price), upgrade.
                    self.terminate(1)
                    break

                if (time.time() - step_start) > node.config.emr.step_timeout:
                    # Step running too long? EMR cluster idle.
                    node.log('Step running too long. Restart with new cluster')
                    self.terminate()
                    break

            except KeyboardInterrupt:
                raise
            except Exception:
                node.log('EMR loop exception: %d error(s)',
                         status_error_counter)
                status_error_counter += 1
                if status_error_counter > self.emr_status_max_error:
                    self.terminate()
                    node.log('Too many errors in EMR loop')
                    node.log('Exception: %s', traceback.format_exc())
                    raise

        # Only a step that was actually waited for can be judged a failure.
        if wait and step_status != 'COMPLETED':
            raise HadoopFailure()
#emrcon = EmrConnection('<aws access key>', '<aws secret key>') emrcon = EmrConnection('AKIAJRV3RN6NXQTSSTBA', '3e212d6rs99xtiPgwKnfN1QD30WZk2hJwCWjMcGc') # <codecell> # Using EMR's wordcount example step = StreamingStep(name='My wordcount example', mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py', reducer='aggregate', input='s3n://elasticmapreduce/samples/wordcount/input', output='s3n://wambia660fall2013/output/wordcount_output') # <codecell> jobid = emrcon.run_jobflow(name='Word Count Example', log_uri='s3://wambia660fall2013/logs', steps=[step]) # <codecell> print jobid # <codecell> import re # <codecell> for word in b.list(): keystring = str(word.key) if re.match(keystring,'part-00000'):
# EmrConn() args: aws_access_key_id=None aws_secret_access_key=None,
emr = EmrConnection(
    aws_access_key_id=credentials['aws_access_key_id'],
    aws_secret_access_key=credentials['aws_secret_access_key'])
print("logged in / made new emr?")
raw_input()  # pause until the user presses Enter

# Python files must be hosted on s3 and linked to for execution.
## [ ] TODO(emmagras): Check the docs for StreamingStep and understand
## the arguments below.
## args for StreamingStep: name, mapper uri, reducer uri=None,
## combiner uri=None, action_on_failure='TERMINATE_JOB_FLOW',
## cache_files=None, cache_archives=None, step_args=None,
## input=None, output=None,
## jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar'
wc_step = StreamingStep(
    'wc text',
    's3://elasticmapreduce/samples/wordcount/wordSplitter.py',
    'aggregate',
    input='s3://elasticmapreduce/samples/wordcount/input',
    output='s3://wc-test-bucket/output/%s' % job_ts)

jf_id = emr.run_jobflow('wc jobflow', 's3n://emr-debug/%s' % job_ts,
                        steps=[wc_step])

# Poll every ten seconds. FAILED and TERMINATED are final as well; testing
# only for COMPLETED left the loop running forever after a failed job.
while True:
    jf = emr.describe_jobflow(jf_id)
    print("[%s] %s" % (datetime.now().strftime("%Y-%m-%d %T"), jf.state))
    if jf.state in ('COMPLETED', 'FAILED', 'TERMINATED'):
        break
    time.sleep(10)
class Rankmaniac:
    """
    (wrapper class)

    This class presents a simple wrapper around the AWS SDK. It strives
    to provide all the functionality required to run map-reduce
    (Hadoop) on Amazon. This way the students do not need to worry about
    learning the API for Amazon S3 and EMR, and instead can focus on
    computing pagerank quickly!
    """

    DefaultRegionName = 'us-west-2'
    DefaultRegionEndpoint = 'elasticmapreduce.us-west-2.amazonaws.com'

    def __init__(self, team_id, access_key, secret_key,
                 bucket='cs144students'):
        """
        (constructor)

        Creates a new instance of the Rankmaniac class for a specific
        team using the provided credentials.

        Arguments:
            team_id       <str>     the team identifier, which may
                                    differ slightly from the actual team
                                    name.

            access_key    <str>     the AWS access key identifier.
            secret_key    <str>     the AWS secret access key.

        Keyword arguments:
            bucket        <str>     the S3 bucket name.
        """

        region = RegionInfo(None, self.DefaultRegionName,
                            self.DefaultRegionEndpoint)

        self._s3_bucket = bucket
        self._s3_conn = S3Connection(access_key, secret_key)
        self._emr_conn = EmrConnection(access_key, secret_key, region=region)

        self.team_id = team_id
        self.job_id = None

        self._reset()
        self._num_instances = 1

    def _reset(self):
        """
        Resets the internal state of the job and submission.
        """

        self._iter_no = 0
        self._infile = None
        self._last_outdir = None

        self._last_process_step_iter_no = -1
        self._is_done = False

    def __del__(self):
        """
        (destructor)

        Terminates the map-reduce job if any, and closes the connections
        to Amazon S3 and EMR.

        Attributes are looked up defensively: when the constructor failed
        part-way, some of them were never set and the destructor must not
        raise AttributeError.
        """

        if getattr(self, 'job_id', None) is not None:
            self.terminate()

        for conn_name in ('_s3_conn', '_emr_conn'):
            conn = getattr(self, conn_name, None)
            if conn is not None:
                conn.close()

    def __enter__(self):
        """
        Used for `with` syntax. Simply returns this instance since the
        set-up has all been done in the constructor.
        """

        return self

    def __exit__(self, type, value, traceback):
        """
        Refer to __del__().
        """

        self.__del__()
        return False  # do not swallow any exceptions

    def upload(self, indir='data'):
        """
        Uploads the local data to Amazon S3 under the configured bucket
        and key prefix (the team identifier). This way the code can be
        accessed by Amazon EMR to compute pagerank.

        Keyword arguments:
            indir       <str>       the base directory from which to
                                    upload contents.

        Special notes:
            This method only uploads **files** in the specified
            directory. It does not scan through subdirectories.

            WARNING! This method removes all previous (or ongoing)
            submission results, so it is unsafe to call while a job is
            already running (and possibly started elsewhere).
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        bucket = self._s3_conn.get_bucket(self._s3_bucket)

        # Clear out current bucket contents for team
        keys = bucket.list(prefix=self._get_keyname())
        bucket.delete_keys(keys)

        for filename in os.listdir(indir):
            relpath = os.path.join(indir, filename)
            if os.path.isfile(relpath):
                keyname = self._get_keyname(filename)
                key = bucket.new_key(keyname)
                key.set_contents_from_filename(relpath)

    def set_infile(self, filename):
        """
        Sets the data file to use for the first iteration of the
        pagerank step in the map-reduce job.
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        self._infile = filename

    def do_iter(self, pagerank_mapper, pagerank_reducer,
                process_mapper, process_reducer,
                pagerank_output=None, process_output=None,
                num_pagerank_mappers=1, num_pagerank_reducers=1):
        """
        Adds a pagerank step and a process step to the current job.

        The first iteration reads the file set with set_infile(); later
        iterations read the previous iteration's process output. A new
        job is submitted when none is running, otherwise the two steps
        are appended to the running job.
        """

        num_process_mappers = 1
        num_process_reducers = 1

        if self._iter_no == 0:
            pagerank_input = self._infile
        elif self._iter_no > 0:
            pagerank_input = self._last_outdir

        if pagerank_output is None:
            pagerank_output = self._get_default_outdir('pagerank')

        # Output from the pagerank step becomes input to process step
        process_input = pagerank_output

        if process_output is None:
            process_output = self._get_default_outdir('process')

        pagerank_step = self._make_step(pagerank_mapper, pagerank_reducer,
                                        pagerank_input, pagerank_output,
                                        num_pagerank_mappers,
                                        num_pagerank_reducers)

        process_step = self._make_step(process_mapper, process_reducer,
                                       process_input, process_output,
                                       num_process_mappers,
                                       num_process_reducers)

        steps = [pagerank_step, process_step]
        if self.job_id is None:
            self._submit_new_job(steps)
        else:
            self._emr_conn.add_jobflow_steps(self.job_id, steps)

        # Store `process_output` directory so it can be used in
        # subsequent iteration
        self._last_outdir = process_output
        self._iter_no += 1

    def is_done(self):
        """
        Returns `True` if the map-reduce job is done, and `False`
        otherwise.

        For all process-step output files that have not been fetched,
        gets the first part of the output file, and checks whether its
        contents begins with the string 'FinalRank'.

        Special notes:
            WARNING! The usage of this method in your code requires that
            you used the default output directories in all calls to
            do_iter().
        """

        # Cache the result so we can return immediately without hitting
        # any of the Amazon APIs
        if self._is_done:
            return True

        iter_no = self._get_last_process_step_iter_no()
        if iter_no < 0:
            return False

        while self._last_process_step_iter_no < iter_no:
            self._last_process_step_iter_no += 1
            i = self._last_process_step_iter_no

            outdir = self._get_default_outdir('process', iter_no=i)
            keyname = self._get_keyname(outdir, 'part-00000')

            bucket = self._s3_conn.get_bucket(self._s3_bucket)
            key = Key(bucket=bucket, name=keyname)
            contents = key.next()  # get first chunk of the output file

            if contents.startswith('FinalRank'):
                self._is_done = True  # cache result
                break

        return self._is_done

    def is_alive(self):
        """
        Checks whether the jobflow has completed, failed, or been
        terminated.

        Special notes:
            WARNING! This method should only be called **after**
            is_done() in order to be able to distinguish between the
            cases where the map-reduce job has outputted 'FinalRank'
            on its final iteration and has a 'COMPLETED' state.
        """

        jobflow = self.describe()
        if jobflow.state in ('COMPLETED', 'FAILED', 'TERMINATED'):
            return False

        return True

    def terminate(self):
        """
        Terminates a running map-reduce job.
        """

        if not self.job_id:
            raise RankmaniacError('No job is running.')

        self._emr_conn.terminate_jobflow(self.job_id)
        self.job_id = None

        self._reset()

    def download(self, outdir='results'):
        """
        Downloads the results from Amazon S3 to the local directory.

        Keyword arguments:
            outdir      <str>       the base directory to which to
                                    download contents.

        Special notes:
            This method downloads all keys (files) from the configured
            bucket for this particular team. It creates subdirectories
            as needed.
        """

        bucket = self._s3_conn.get_bucket(self._s3_bucket)
        keys = bucket.list(prefix=self._get_keyname())
        for key in keys:
            keyname = key.name
            # Ignore folder keys
            if '$' not in keyname:
                suffix = keyname.split('/')[1:]  # removes team identifier
                filename = os.path.join(outdir, *suffix)
                dirname = os.path.dirname(filename)

                if not os.path.exists(dirname):
                    os.makedirs(dirname)

                key.get_contents_to_filename(filename)

    def describe(self):
        """
        Gets the current map-reduce job details.

        Returns a boto.emr.emrobject.JobFlow object.

        Special notes:
            The JobFlow object has the following relevant fields.
                state       <str>           the state of the job flow,
                                            either COMPLETED
                                                 | FAILED
                                                 | TERMINATED
                                                 | RUNNING
                                                 | SHUTTING_DOWN
                                                 | STARTING
                                                 | WAITING

                steps       <list(boto.emr.emrobject.Step)>
                            a list of the step details in the workflow.

            The Step object has the following relevant fields.
                state               <str>       the state of the step.

                startdatetime       <str>       the start time of the
                                                job.

                enddatetime         <str>       the end time of the job.

            WARNING! Amazon has an upper-limit on the frequency with
            which you can call this method; we have had success with
            calling it at most once every 10 seconds.
        """

        if not self.job_id:
            raise RankmaniacError('No job is running.')

        return self._emr_conn.describe_jobflow(self.job_id)

    def _get_last_process_step_iter_no(self):
        """
        Returns the iteration number of the most recent process step of
        the job flow that has completed, or -1 when none has.

        Steps alternate pagerank, process, pagerank, process, ..., so the
        process steps sit at the odd indices.
        """

        steps = self.describe().steps
        i = 1

        while i < len(steps):
            step = steps[i]
            if step.state != 'COMPLETED':
                break

            i += 2

        # Floor division keeps the result an integer under true division
        # as well; it is used as an iteration number and formatted into
        # key names.
        return i // 2 - 1

    def _get_default_outdir(self, name, iter_no=None):
        """
        Returns the default output directory, which is 'iter_no/name/'.
        """

        if iter_no is None:
            iter_no = self._iter_no

        # Return iter_no/name/ **with** the trailing slash
        return '%s/%s/' % (iter_no, name)

    def _submit_new_job(self, steps):
        """
        Submits a new job to run on Amazon EMR.
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        job_name = self._make_name()
        num_instances = self._num_instances
        log_uri = self._get_s3_team_uri('job_logs')
        self.job_id = self._emr_conn.run_jobflow(name=job_name,
                                                 steps=steps,
                                                 num_instances=num_instances,
                                                 log_uri=log_uri)

    def _make_step(self, mapper, reducer, input, output,
                   num_mappers=1, num_reducers=1):
        """
        Returns a new step that runs the specified mapper and reducer,
        reading from the specified input and writing to the specified
        output.

        Any existing keys under the output prefix are deleted first.
        """

        bucket = self._s3_conn.get_bucket(self._s3_bucket)

        # Clear out current bucket/output contents for team
        keys = bucket.list(prefix=self._get_keyname(output))
        bucket.delete_keys(keys)

        step_name = self._make_name()
        step_args = [
            '-jobconf', 'mapred.map.tasks=%d' % (num_mappers),
            '-jobconf', 'mapred.reduce.tasks=%d' % (num_reducers)
        ]

        return StreamingStep(name=step_name,
                             step_args=step_args,
                             mapper=self._get_s3_team_uri(mapper),
                             reducer=self._get_s3_team_uri(reducer),
                             input=self._get_s3_team_uri(input),
                             output=self._get_s3_team_uri(output))

    def _make_name(self):
        """
        Returns '<team_id> <month-day-year hour:minute:second>' for the
        current local time.
        """

        return strftime('%%s %m-%d-%Y %H:%M:%S', localtime()) % (self.team_id)

    def _get_keyname(self, *args):
        """
        Returns the key name to use in the grading bucket (for the
        particular team).

            'team_id/...'
        """

        return '%s/%s' % (self.team_id, '/'.join(args))

    def _get_s3_team_uri(self, *args):
        """
        Returns the Amazon S3 URI for the team submissions.
        """

        return 's3n://%s/%s' % (self._s3_bucket, self._get_keyname(*args))
# the nodes of an EMR(Elastic Map Reduce) job.

# build up our instance groups
namenode_instance_group = InstanceGroup(num_instances=1,
                                        role="MASTER",
                                        type="c1.xlarge",
                                        market="ON_DEMAND",
                                        name="MASTER_GROUP")

# The core group was declared with role="MASTER" and name="MASTER_GROUP"
# (copy-paste from the group above). A SPOT group also needs a bid price.
core_nodes = InstanceGroup(num_instances=20,
                           role="CORE",
                           type="c1.xlarge",
                           market="SPOT",
                           name="CORE_GROUP",
                           bidprice="<spot-bid-price>")

task_nodes = InstanceGroup(num_instances=10,
                           role="TASK",
                           type="c1.xlarge",
                           market="ON_DEMAND",
                           name="INITIAL_TASK_GROUP")

instance_groups = [namenode_instance_group, core_nodes, task_nodes]

# run the job
conn = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")
conn.run_jobflow(name="My Job Flow", instance_groups=instance_groups)
class EMRCluster(object):
    '''One EMR job flow driven through a boto EmrConnection.

    The instance keeps the id of the job flow it started (``jobid``) and a
    price ``level`` that selects an entry of ``prop.emr.price_upgrade_rate``
    when bid prices are computed.

    TODO: add bridge to boto interface for unit test.
    '''

    emr_status_delay = 10      # in sec
    emr_status_max_delay = 60  # in sec
    emr_status_max_error = 30  # number of errors
    emr_max_idle = 10 * 60     # 10 min (in sec)
    rate_limit_lock = RateLimitLock()

    def __init__(self, prop):
        '''Store the properties and open the EMR connection.'''
        self.prop = prop
        self.conn = EmrConnection(self.prop.ec2.key, self.prop.ec2.secret)
        self.jobid = None
        self.retry = 0
        self.level = 0
        self.last_update = -1

    @property
    def priority(self):
        '''Priority used in EMRManager; a lower value is a higher priority.

        Returns 1 while no job flow has been started, 0 otherwise.
        '''
        with EMRCluster.rate_limit_lock:
            return 1 if self.jobid is None else 0

    def get_instance_groups(self):
        '''Build the InstanceGroup list used to start a cluster.

        The bid price of each group is the configured price of its instance
        type multiplied by the upgrade rate selected by ``self.level``
        (clamped to the valid index range of the rate table).
        '''
        groups = []
        for num, group_name, instance_type in self.prop.emr.instance_groups:
            rates = self.prop.emr.price_upgrade_rate
            # Clamp so that 0 <= level < len(rates).
            level = min(self.level, len(rates) - 1)
            if level < 0:
                level = 0

            bprice = self.prop.emr.prices[instance_type] * rates[level]
            name = '%s-%s@%f' % (group_name, 'SPOT', bprice)

            if bprice > 0:
                group = InstanceGroup(num, group_name, instance_type,
                                      'SPOT', name, '%.3f' % bprice)
            else:
                # A zero (or negative) price means: use on-demand instances.
                group = InstanceGroup(num, group_name, instance_type,
                                      'ON_DEMAND', name)
            groups.append(group)
        return groups

    def get_bootstrap_actions(self):
        '''Turn ``prop.emr.bootstrap_actions`` into BootstrapAction objects.

        Each entry supplies at least two items; the first two are passed as
        the leading BootstrapAction arguments and the rest as its argument
        list.
        '''
        result = []
        for spec in self.prop.emr.bootstrap_actions:
            assert len(spec) >= 2, 'Wrong bootstrap action definition: ' + str(spec)
            extra_args = spec[2:]
            result.append(BootstrapAction(spec[0], spec[1], extra_args))
        return result

    @synchronized
    def start(self):
        '''Start a EMR cluster and tag it.

        Raises ValueError when ``emr.project_name`` is not configured.
        '''
        emr = self.prop.emr
        # emr.project_name is required
        if emr.project_name is None:
            raise ValueError('emr.project_name is not set')

        self.last_update = time.time()
        # NOTE(review): only the run_jobflow call is kept under the rate
        # limit lock here; confirm against the original layout that tagging
        # is meant to happen outside of it.
        with EMRCluster.rate_limit_lock:
            self.jobid = self.conn.run_jobflow(
                name=emr.cluster_name,
                ec2_keyname=emr.keyname,
                log_uri=emr.log_uri,
                ami_version=emr.ami_version,
                bootstrap_actions=self.get_bootstrap_actions(),
                keep_alive=True,
                action_on_failure='CONTINUE',
                api_params={'VisibleToAllUsers': 'true'},
                instance_groups=self.get_instance_groups())

        message('Job flow created: %s', self.jobid)

        # Tag EC2 instances to allow future analysis; user supplied tags
        # override the two defaults on key collisions.
        tags = {'FlowControl': 'Briefly', 'Project': emr.project_name}
        extra_tags = self.prop.emr.tags
        if extra_tags is not None:
            assert isinstance(extra_tags, dict)
            tags.update(extra_tags.items())
        self.conn.add_tags(self.jobid, tags)

    @synchronized
    def terminate(self, level_upgrade=0):
        '''Terminate this EMR cluster, optionally raising the price level.

        Termination is attempted up to three times; failures are reported
        through ``message`` and never raised.
        '''
        if self.jobid is None:
            return

        self.level += level_upgrade  # upgrade to another price level

        message('Terminate jobflow: %s', self.jobid)
        for _attempt in range(3):
            try:
                with EMRCluster.rate_limit_lock:
                    self.conn.terminate_jobflow(self.jobid)
                break
            except Exception:
                message('Unable to terminate job flow: %s', self.jobid)
                message(traceback.format_exc())

        # We have to set jobid as None to create new cluster;
        # otherwise, run_steps will keep launching jobs on the bad cluster.
        self.jobid = None
class EmrManager(object):
    """Launch an EMR cluster and run steps on it through boto.

    Every public ``run_*`` method and ``launch_cluster`` blocks until the
    cluster/step reaches a final state and reports problems through its
    return value (``"FAILED"``, ``"ERROR"``, ``"NOT_FOUND"``) instead of
    raising.
    """

    # Cluster states after which ``launch_cluster`` gives up.
    _FAILED_CLUSTER_STATES = (u'SHUTTING_DOWN', u'FAILED', u'TERMINATED')
    # Cluster states that end the launch polling loop.
    _SETTLED_CLUSTER_STATES = _FAILED_CLUSTER_STATES + (u'COMPLETED', u'WAITING')
    # Step states reported as a failed step.  CANCELLED and INTERRUPTED are
    # final too; without them the polling loop would never end.
    _FAILED_STEP_STATES = (u'FAILED', u'CANCELLED', u'INTERRUPTED')
    # Step states (including the two internal markers produced by
    # ``_find_step_state``) that end the step polling loop.
    _SETTLED_STEP_STATES = _FAILED_STEP_STATES + (u'COMPLETED', u'NOT_FOUND', u'ERROR')

    def __init__(self, parameters):
        """Read the settings from ``parameters`` and open the EMR connection.

        ``parameters`` must provide the keys ``region_name``, ``access_key``,
        ``secret_key``, ``ec2_keypair_name``, ``base_bucket``, ``log_dir``,
        ``emr_status_wait``, ``step_status_wait`` and ``emr_cluster_name``.
        When one of them cannot be read the error is logged and the process
        exits with a non-zero status.
        """
        try:
            self.region_name = parameters["region_name"]
            self.access_key = parameters["access_key"]
            self.secret_key = parameters["secret_key"]
            self.ec2_keypair_name = parameters["ec2_keypair_name"]
            self.base_bucket = parameters["base_bucket"]
            self.log_dir = parameters["log_dir"]
            self.emr_status_wait = parameters["emr_status_wait"]
            self.step_status_wait = parameters["step_status_wait"]
            self.emr_cluster_name = parameters["emr_cluster_name"]
        except Exception:
            # exc_info records which key was missing or unreadable.
            logging.error("Something went wrong initializing EmrManager", exc_info=True)
            # Exit with a failure status; a bare sys.exit() reports success.
            sys.exit(1)

        # Establishing EmrConnection
        self.connection = EmrConnection(
            self.access_key,
            self.secret_key,
            region=RegionInfo(
                name=self.region_name,
                endpoint=self.region_name + '.elasticmapreduce.amazonaws.com'))
        self.log_bucket_name = self.base_bucket + self.log_dir

    def launch_cluster(self, master_type, slave_type, num_instances, ami_version):
        """Start a keep-alive cluster and wait until it is ready.

        Returns the cluster id once the cluster is WAITING, ``"ERROR"`` when
        it ends up SHUTTING_DOWN, FAILED or TERMINATED, ``"FAILED"`` when any
        call raises, and ``None`` if the cluster reports COMPLETED.
        """
        try:
            cluster_id = self.connection.run_jobflow(
                self.emr_cluster_name,
                self.log_bucket_name,
                ec2_keyname=self.ec2_keypair_name,
                keep_alive=True,
                action_on_failure='CANCEL_AND_WAIT',
                master_instance_type=master_type,
                slave_instance_type=slave_type,
                num_instances=num_instances,
                ami_version=ami_version)
            logging.info("Launching cluster: " + cluster_id + ". Please be patient. Check the status of your cluster in your AWS Console")

            # Poll until the cluster is usable or definitely gone.
            state = self.connection.describe_jobflow(cluster_id).state
            while state not in self._SETTLED_CLUSTER_STATES:
                time.sleep(int(self.emr_status_wait))
                state = self.connection.describe_jobflow(cluster_id).state
                logging.info("Creating cluster " + cluster_id + ". Status: " + state)

            if state in self._FAILED_CLUSTER_STATES:
                logging.error("Launching EMR cluster failed")
                return "ERROR"

            if state == u'WAITING':
                # The cluster is idle and ready to accept steps.
                master_dns = self.connection.describe_jobflow(cluster_id).masterpublicdnsname
                logging.info("Launched EMR Cluster Successfully with cluster id:" + cluster_id)
                logging.info("Master node DNS of EMR " + master_dns)
                return cluster_id

            # COMPLETED: nothing is returned, as before.
            return None
        except Exception:
            logging.error("Launching EMR cluster failed", exc_info=True)
            return "FAILED"

    def run_scripting_step(self, cluster_id, name, script_path):
        """Run ``script_path`` as a script-runner step; see ``_run_step``."""
        try:
            step = ScriptRunnerStep(name=name,
                                    step_args=[script_path],
                                    action_on_failure="CONTINUE")
            return self._run_step(cluster_id, step)
        except Exception:
            logging.error("Running scripting step in cluster " + cluster_id + " failed.", exc_info=True)
            return "FAILED"

    def run_streaming_step(self, cluster_id, name, mapper_path, reducer_path, input_path, output_path):
        """Run a Hadoop streaming step; see ``_run_step`` for the result.

        ``mapper_path`` / ``reducer_path`` may be the string ``"NONE"``;
        any other value is shipped with the job and referenced by its base
        name.
        """
        try:
            # bundle files with the job
            files = []
            if mapper_path != "NONE":
                files.append(mapper_path)
                mapper_path = mapper_path.split("/")[-1]
            if reducer_path != "NONE":
                files.append(reducer_path)
                reducer_path = reducer_path.split("/")[-1]

            # -files takes ONE comma-separated value; separate arguments
            # would be parsed as unrelated options, and a lone "-files"
            # without a value is invalid.
            step_args = ["-files", ",".join(files)] if files else []

            logging.debug("Launching streaming step with mapper: " + mapper_path + " reducer: " + reducer_path + " and files: " + str(files))
            step = StreamingStep(name=name,
                                 step_args=step_args,
                                 mapper=mapper_path,
                                 reducer=reducer_path,
                                 input=input_path,
                                 output=output_path,
                                 action_on_failure="CONTINUE")
            return self._run_step(cluster_id, step)
        except Exception:
            logging.error("Running streaming step in cluster " + cluster_id + " failed.", exc_info=True)
            return "FAILED"

    def run_jar_step(self, cluster_id, name, jar_path, class_name, input_path, output_path):
        """Run ``class_name`` from ``jar_path``; see ``_run_step``."""
        try:
            # build jar step
            logging.debug("Launching jar step with jar: " + jar_path + " class name: " + class_name + " input: " + input_path + " and output: " + output_path)
            step = JarStep(name=name,
                           jar=jar_path,
                           step_args=[class_name, input_path, output_path])
            return self._run_step(cluster_id, step)
        except Exception:
            logging.error("Running jar step in cluster " + cluster_id + " failed.", exc_info=True)
            return "FAILED"

    def _run_step(self, cluster_id, step):
        """Submit ``step`` and block until it reaches a final state.

        Returns the step id on success, ``"FAILED"`` when the step failed,
        was cancelled or was interrupted, ``"NOT_FOUND"`` when the step is
        missing from the step listing and ``"ERROR"`` when the listing
        itself could not be fetched.
        """
        step_list = self.connection.add_jobflow_steps(cluster_id, [step])
        step_id = step_list.stepids[0].value

        logging.info("Starting step " + step_id + " in cluster " + cluster_id + ". Please be patient. Check the progress of the job in your AWS Console")

        # Poll until the step settles.
        state = self._find_step_state(cluster_id, step_id)
        while state not in self._SETTLED_STEP_STATES:
            time.sleep(int(self.step_status_wait))
            state = self._find_step_state(cluster_id, step_id)
            logging.info("Starting step " + step_id + " in cluster " + cluster_id + ". Status: " + state)

        if state in self._FAILED_STEP_STATES:
            logging.error("Step " + step_id + " failed in cluster: " + cluster_id)
            return "FAILED"
        if state == u'NOT_FOUND':
            logging.error("Step " + step_id + " could not be found in cluster: " + cluster_id)
            return "NOT_FOUND"
        if state == u'ERROR':
            logging.error("Step " + step_id + " produced an error in _find_step_state in cluster: " + cluster_id)
            return "ERROR"

        # Only COMPLETED is left.
        logging.info("Step " + step_id + " succesfully completed in cluster: " + cluster_id)
        return step_id

    def _find_step_state(self, cluster_id, step_id):
        """Return the state of ``step_id`` as listed for ``cluster_id``.

        Returns ``"NOT_FOUND"`` when the step is not in the listing and
        ``"ERROR"`` when the listing cannot be retrieved or read.
        """
        try:
            step_summary_list = self.connection.list_steps(cluster_id)
            for step_summary in step_summary_list.steps:
                if step_summary.id == step_id:
                    return step_summary.status.state
            return "NOT_FOUND"
        except Exception:
            logging.error("Could not read the state of step " + step_id, exc_info=True)
            return "ERROR"

    def terminate_cluster(self, cluster_id):
        """Ask EMR to terminate ``cluster_id``."""
        self.connection.terminate_jobflow(cluster_id)
class Rankmaniac:
    """
    (wrapper class)

    This class presents a simple wrapper around the AWS SDK. It strives
    to provide all the functionality required to run map-reduce
    (Hadoop) on Amazon. This way the students do not need to worry about
    learning the API for Amazon S3 and EMR, and instead can focus on
    computing pagerank quickly!
    """

    DefaultRegionName = 'us-west-2'
    DefaultRegionEndpoint = 'elasticmapreduce.us-west-2.amazonaws.com'

    def __init__(self, team_id, access_key, secret_key,
                 bucket='cs144students'):
        """
        (constructor)

        Creates a new instance of the Rankmaniac class for a specific
        team using the provided credentials.

        Arguments:
            team_id       <str>     the team identifier, which may
                                    differ slightly from the actual
                                    team name.
            access_key    <str>     the AWS access key identifier.
            secret_key    <str>     the AWS secret access key.

        Keyword arguments:
            bucket        <str>     the S3 bucket name.
        """

        region = RegionInfo(None, self.DefaultRegionName,
                            self.DefaultRegionEndpoint)

        self._s3_bucket = bucket
        self._s3_conn = S3Connection(access_key, secret_key)
        self._emr_conn = EmrConnection(access_key, secret_key, region=region)

        self.team_id = team_id
        self.job_id = None

        self._reset()
        self._num_instances = 1

    def _reset(self):
        """
        Resets the internal state of the job and submission.
        """

        self._iter_no = 0
        self._infile = None
        self._last_outdir = None

        self._last_process_step_iter_no = -1
        self._is_done = False

    def __del__(self):
        """
        (destructor)

        Terminates the map-reduce job if any, and closes the connections
        to Amazon S3 and EMR.
        """

        if self.job_id is not None:
            self.terminate()

        self._s3_conn.close()
        self._emr_conn.close()

    def __enter__(self):
        """
        Used for `with` syntax. Simply returns this instance since the
        set-up has all been done in the constructor.
        """

        return self

    def __exit__(self, type, value, traceback):
        """
        Refer to __del__().
        """

        self.__del__()
        return False  # do not swallow any exceptions

    def upload(self, indir='data'):
        """
        Uploads the local data to Amazon S3 under the configured bucket
        and key prefix (the team identifier). This way the code can be
        accessed by Amazon EMR to compute pagerank.

        Keyword arguments:
            indir       <str>       the base directory from which to
                                    upload contents.

        Raises:
            RankmaniacError         if a job is already running.

        Special notes:
            This method only uploads **files** in the specified
            directory. It does not scan through subdirectories.

            WARNING! This method removes all previous (or ongoing)
            submission results, so it is unsafe to call while a job is
            already running (and possibly started elsewhere).
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        bucket = self._s3_conn.get_bucket(self._s3_bucket)

        # Clear out current bucket contents for team
        keys = bucket.list(prefix=self._get_keyname())
        bucket.delete_keys(keys)

        for filename in os.listdir(indir):
            relpath = os.path.join(indir, filename)
            if os.path.isfile(relpath):
                keyname = self._get_keyname(filename)
                key = bucket.new_key(keyname)
                key.set_contents_from_filename(relpath)

    def set_infile(self, filename):
        """
        Sets the data file to use for the first iteration of the
        pagerank step in the map-reduce job.

        Raises:
            RankmaniacError         if a job is already running.
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        self._infile = filename

    def do_iter(self, pagerank_mapper, pagerank_reducer,
                process_mapper, process_reducer,
                pagerank_output=None, process_output=None,
                num_pagerank_mappers=1, num_pagerank_reducers=1):
        """
        Adds a pagerank step and a process step to the current job.

        The first iteration reads the file given to set_infile(); every
        later iteration reads the process-step output of the previous
        one. Output directories default to 'iter_no/pagerank/' and
        'iter_no/process/'.

        Raises:
            RankmaniacError         if no input file was set before the
                                    first iteration.
        """

        num_process_mappers = 1
        num_process_reducers = 1

        if self._iter_no == 0:
            # Fail before any S3 key is deleted or any step is built;
            # without an input file the step URI cannot be formed.
            if self._infile is None:
                raise RankmaniacError('No input file set; call '
                                      'set_infile() first.')
            pagerank_input = self._infile
        else:
            pagerank_input = self._last_outdir

        if pagerank_output is None:
            pagerank_output = self._get_default_outdir('pagerank')

        # Output from the pagerank step becomes input to process step
        process_input = pagerank_output

        if process_output is None:
            process_output = self._get_default_outdir('process')

        pagerank_step = self._make_step(pagerank_mapper, pagerank_reducer,
                                        pagerank_input, pagerank_output,
                                        num_pagerank_mappers,
                                        num_pagerank_reducers)

        process_step = self._make_step(process_mapper, process_reducer,
                                       process_input, process_output,
                                       num_process_mappers,
                                       num_process_reducers)

        steps = [pagerank_step, process_step]
        if self.job_id is None:
            self._submit_new_job(steps)
        else:
            self._emr_conn.add_jobflow_steps(self.job_id, steps)

        # Store `process_output` directory so it can be used in
        # subsequent iteration
        self._last_outdir = process_output
        self._iter_no += 1

    def is_done(self):
        """
        Returns `True` if the map-reduce job is done, and `False`
        otherwise.

        For all process-step output files that have not been fetched,
        gets the first part of the output file, and checks whether its
        contents begins with the string 'FinalRank'.

        Special notes:
            WARNING! The usage of this method in your code requires that
            you used the default output directories in all calls to
            do_iter().
        """

        # Cache the result so we can return immediately without hitting
        # any of the Amazon APIs
        if self._is_done:
            return True

        iter_no = self._get_last_process_step_iter_no()
        if iter_no < 0:
            return False

        # Nothing new has completed since the last call.
        if self._last_process_step_iter_no >= iter_no:
            return False

        # Look the bucket up once instead of once per inspected output.
        bucket = self._s3_conn.get_bucket(self._s3_bucket)

        while self._last_process_step_iter_no < iter_no:
            self._last_process_step_iter_no += 1
            i = self._last_process_step_iter_no

            outdir = self._get_default_outdir('process', iter_no=i)
            keyname = self._get_keyname(outdir, 'part-00000')

            key = Key(bucket=bucket, name=keyname)
            contents = key.next()  # get first chunk of the output file

            if contents.startswith('FinalRank'):
                self._is_done = True  # cache result
                break

        return self._is_done

    def is_alive(self):
        """
        Checks whether the jobflow has completed, failed, or been
        terminated; returns `False` in those cases and `True` otherwise.

        Special notes:
            WARNING! This method should only be called **after**
            is_done() in order to be able to distinguish between the
            cases where the map-reduce job has outputted 'FinalRank'
            on its final iteration and has a 'COMPLETED' state.
        """

        jobflow = self.describe()
        if jobflow.state in ('COMPLETED', 'FAILED', 'TERMINATED'):
            return False

        return True

    def terminate(self):
        """
        Terminates a running map-reduce job.

        Raises:
            RankmaniacError         if no job is running.
        """

        if not self.job_id:
            raise RankmaniacError('No job is running.')

        self._emr_conn.terminate_jobflow(self.job_id)
        self.job_id = None

        self._reset()

    def download(self, outdir='results'):
        """
        Downloads the results from Amazon S3 to the local directory.

        Keyword arguments:
            outdir      <str>       the base directory to which to
                                    download contents.

        Special notes:
            This method downloads all keys (files) from the configured
            bucket for this particular team. It creates subdirectories
            as needed.
        """

        bucket = self._s3_conn.get_bucket(self._s3_bucket)
        keys = bucket.list(prefix=self._get_keyname())
        for key in keys:
            keyname = key.name
            # Ignore folder keys
            if '$' not in keyname:
                suffix = keyname.split('/')[1:]  # removes team identifier
                filename = os.path.join(outdir, *suffix)
                dirname = os.path.dirname(filename)

                if not os.path.exists(dirname):
                    os.makedirs(dirname)

                key.get_contents_to_filename(filename)

    def describe(self):
        """
        Gets the current map-reduce job details.

        Returns a boto.emr.emrobject.JobFlow object.

        Raises:
            RankmaniacError         if no job is running.

        Special notes:
            The JobFlow object has the following relevant fields.
                state       <str>           the state of the job flow,
                                            either COMPLETED
                                                 | FAILED
                                                 | TERMINATED
                                                 | RUNNING
                                                 | SHUTTING_DOWN
                                                 | STARTING
                                                 | WAITING

                steps       <list(boto.emr.emrobject.Step)>
                            a list of the step details in the workflow.

            The Step object has the following relevant fields.
                state               <str>       the state of the step.

                startdatetime       <str>       the start time of the
                                                job.

                enddatetime         <str>       the end time of the job.

            WARNING! Amazon has an upper-limit on the frequency with
            which you can call this method; we have had success with
            calling it at most once every 10 seconds.
        """

        if not self.job_id:
            raise RankmaniacError('No job is running.')

        return self._emr_conn.describe_jobflow(self.job_id)

    def _get_last_process_step_iter_no(self):
        """
        Returns the iteration number of the most recent process step of
        the job flow that has been completed, or -1 if there is none.
        """

        steps = self.describe().steps
        # Steps alternate pagerank, process, pagerank, process, ... so
        # the process steps sit at the odd indices.
        i = 1

        while i < len(steps):
            step = steps[i]
            if step.state != 'COMPLETED':
                break

            i += 2

        # Floor division keeps the result an integer on every Python
        # version; it is used to build directory names in is_done().
        return i // 2 - 1

    def _get_default_outdir(self, name, iter_no=None):
        """
        Returns the default output directory, which is 'iter_no/name/'.
        """

        if iter_no is None:
            iter_no = self._iter_no

        # Return iter_no/name/ **with** the trailing slash
        return '%s/%s/' % (iter_no, name)

    def _submit_new_job(self, steps):
        """
        Submits a new job to run on Amazon EMR.

        Raises:
            RankmaniacError         if a job is already running.
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        job_name = self._make_name()
        num_instances = self._num_instances
        log_uri = self._get_s3_team_uri('job_logs')
        self.job_id = self._emr_conn.run_jobflow(name=job_name,
                                                 steps=steps,
                                                 num_instances=num_instances,
                                                 log_uri=log_uri)

    def _make_step(self, mapper, reducer, input, output,
                   num_mappers=1, num_reducers=1):
        """
        Returns a new step that runs the specified mapper and reducer,
        reading from the specified input and writing to the specified
        output.

        Any keys already stored under the output location are deleted
        first.
        """

        bucket = self._s3_conn.get_bucket(self._s3_bucket)

        # Clear out current bucket/output contents for team
        keys = bucket.list(prefix=self._get_keyname(output))
        bucket.delete_keys(keys)

        step_name = self._make_name()
        step_args = ['-jobconf', 'mapred.map.tasks=%d' % (num_mappers),
                     '-jobconf', 'mapred.reduce.tasks=%d' % (num_reducers)]

        return StreamingStep(name=step_name,
                             step_args=step_args,
                             mapper=self._get_s3_team_uri(mapper),
                             reducer=self._get_s3_team_uri(reducer),
                             input=self._get_s3_team_uri(input),
                             output=self._get_s3_team_uri(output))

    def _make_name(self):
        """
        Returns a name made of the team identifier followed by the
        current local date and time.
        """

        # '%%s' survives strftime as '%s', which is then filled with the
        # team identifier.
        return strftime('%%s %m-%d-%Y %H:%M:%S', localtime()) % (self.team_id)

    def _get_keyname(self, *args):
        """
        Returns the key name to use in the grading bucket (for the
        particular team).

            'team_id/...'
        """

        return '%s/%s' % (self.team_id, '/'.join(args))

    def _get_s3_team_uri(self, *args):
        """
        Returns the Amazon S3 URI for the team submissions.
        """

        return 's3n://%s/%s' % (self._s3_bucket, self._get_keyname(*args))
from boto.emr.connection import EmrConnection
from boto.emr.step import InstallPigStep, PigStep

# Credentials and job locations.  The empty values have to be filled in
# before the script is run.
AWS_ACCESS_KEY = ''  # REQUIRED
AWS_SECRET_KEY = ''  # REQUIRED
INPUT = 's3://elasticmapreduce/samples/pig-apache/input/access_log_1'
OUTPUT = ''  # REQUIRED, S3 bucket for job output

conn = EmrConnection(AWS_ACCESS_KEY, AWS_SECRET_KEY)

# The sample Pig script takes its input and output locations as -p
# parameters.
pig_file = 's3://elasticmapreduce/samples/pig-apache/do-reports2.pig'
pig_args = [
    '-p', 'INPUT=' + INPUT,
    '-p', 'OUTPUT=' + OUTPUT,
]
pig_step = PigStep('Process Reports', pig_file, pig_args=pig_args)

# The Pig install step is listed ahead of the step that runs the script.
steps = [InstallPigStep()]
steps.append(pig_step)

# Launch a two-instance job flow that is not kept alive after its steps.
conn.run_jobflow(
    name='report test',
    steps=steps,
    hadoop_version='0.20.205',
    ami_version='latest',
    num_instances=2,
    keep_alive=False,
)
# Open the EMR and S3 connections with the same credentials.
emr_conn = EmrConnection(aws_access_key, aws_secret_key)
s3_conn = S3Connection(aws_access_key, aws_secret_key)

# Put the local mapper.py into the bucket under the key 'mapper.py'.
bucket = s3_conn.create_bucket(s3_bkt)
k = Key(bucket)
k.key = 'mapper.py'
k.set_contents_from_filename('mapper.py')

# Locations inside the bucket: the uploaded mapper, the job output and the
# job logs.
mapper_uri = 's3n://{0}/mapper.py'.format(s3_bkt)
output_uri = 's3n://{0}/output'.format(s3_bkt)
log_uri = 's3n://{0}/log'.format(s3_bkt)

# Streaming step: our mapper with the 'aggregate' reducer over the public
# word-count sample input.
wc_step = StreamingStep(
    name='My Hello World Count',
    mapper=mapper_uri,
    reducer='aggregate',
    input='s3n://elasticmapreduce/samples/wordcount/input',
    output=output_uri,
)

# Start a job flow containing just that step.
jobid = emr_conn.run_jobflow(
    name='My hello word count job',
    log_uri=log_uri,
    steps=[wc_step],
)

# Hand the job id to the status checker.
status_check(emr_conn, jobid)
def main():
    """Run the CloudRS + CloudBrush pipeline on a fresh EMR cluster.

    Command line arguments (``sys.argv``):
        1  AWS access key (its lower-cased form is also the bucket name)
        2  AWS secret key
        3  directory holding the jars and the s3-mp-upload/download scripts
        4  local input file
        5  local output file
        6  number of core nodes
        7  prefix for the per-run S3/HDFS names
        8  read length
        9  k-mer size

    The input is uploaded, the three steps are run on the cluster, the
    assembled result is downloaded when the job completes, and the
    per-run S3 keys are deleted.  Exits with an error message when the
    input upload fails.
    """
    aws_access = sys.argv[1]
    aws_secert = sys.argv[2]
    jar_path = sys.argv[3]
    input_filename = sys.argv[4]
    output_filename = sys.argv[5]
    nodes = int(sys.argv[6])
    slots = 7 * nodes
    run_tag = sys.argv[7] + "_" + str(os.getpid())
    s3_in = run_tag + "_in"
    s3_out = run_tag + "_out"
    s3_asm = run_tag + "_asm"
    readlen = int(sys.argv[8])
    kmer = int(sys.argv[9])

    bucket_name = aws_access.lower()
    # Single source of truth for the CloudBrush jar key: the step below
    # must reference exactly the key the jar is uploaded under.
    brush_jar_key = 'CloudbrushGPU.jar'

    # connect to S3
    s3_conn = S3Connection(aws_access, aws_secert)
    mybucket = s3_conn.create_bucket(bucket_name)
    print("\nConnection created")

    # upload the jars
    k = Key(mybucket)
    k.key = 'ReadStackCorrector.jar'
    k.set_contents_from_filename(jar_path + 'ReadStackCorrector.jar')
    k.key = brush_jar_key
    k.set_contents_from_filename(jar_path + 'CloudbrushGPU-GPU.jar')

    # upload the input with the parallel uploader.  The command is passed
    # as an argument list (no shell), so file names and credentials are
    # never re-parsed by a shell; stderr is merged into stdout as "2>&1"
    # did before.
    print("\nStarting Upload")
    s3_path = 's3://%s/%s' % (bucket_name, s3_in)
    upload_status = subprocess.call(
        ['python', '%s/s3-mp-upload.py' % jar_path,
         input_filename, s3_path, aws_access, aws_secert, '-f'],
        stderr=subprocess.STDOUT)
    if upload_status != 0:
        # Do not start a cluster that has no input to work on.
        sys.exit('Upload of %s to %s failed (exit status %d)'
                 % (input_filename, s3_path, upload_status))

    # connect to EMR
    emr_conn = EmrConnection(aws_access, aws_secert)
    instance_groups = [
        InstanceGroup(1, 'MASTER', 'm1.medium', 'ON_DEMAND', '[email protected]', '0.4'),
        InstanceGroup(nodes, 'CORE', 'g2.2xlarge', 'ON_DEMAND', '[email protected]', '0.4'),
    ]

    # perform CloudRS
    step1 = JarStep(name='CloudRS',
                    jar='s3n://%s/ReadStackCorrector.jar' % bucket_name,
                    step_args=['-in', 's3n://%s/%s' % (bucket_name, s3_in),
                               '-out', s3_out,
                               '-slots', slots,
                               '-javaopts', '-Xmx960m'])

    # perform CloudBrush
    step2 = JarStep(name='CloudBrush',
                    jar='s3n://%s/%s' % (bucket_name, brush_jar_key),
                    step_args=['-reads', s3_out,
                               '-asm', s3_asm,
                               '-readlen', readlen,
                               '-k', kmer,
                               '-slots', slots,
                               '-javaopts', '-Xmx960m'])

    # copy from hdfs to S3
    step3 = JarStep(name='S3DistCp',
                    jar='/home/hadoop/lib/emr-s3distcp-1.0.jar',
                    step_args=['--src', 'hdfs:///user/hadoop/%s' % s3_asm,
                               '--dest', 's3://%s/%s' % (bucket_name, s3_asm),
                               '--groupBy', '.*(part).*'])

    jobid = emr_conn.run_jobflow(name='CloudBrush',
                                 log_uri='s3://%s/jobflow_logs' % bucket_name,
                                 ami_version='latest',
                                 hadoop_version='2.4.0',
                                 keep_alive=False,
                                 visible_to_all_users=True,
                                 steps=[step1, step2, step3],
                                 instance_groups=instance_groups)

    state = emr_conn.describe_jobflow(jobid).state
    print("job state =  %s" % (state,))
    print("job id =  %s" % (jobid,))

    # Poll until the job flow reaches a final state.  TERMINATED is final
    # as well; waiting for COMPLETED alone would never end in that case.
    while state not in (u'COMPLETED', u'FAILED', u'TERMINATED'):
        print(time.asctime(time.localtime()))
        time.sleep(30)
        state = emr_conn.describe_jobflow(jobid).state
        print("job state =  %s" % (state,))
        print("job id =  %s" % (jobid,))

    if state == u'FAILED':
        print('FAILED!!!!')
    elif state == u'TERMINATED':
        print('TERMINATED!!!!')

    # download the result with the parallel downloader
    if state == u'COMPLETED':
        s3_path = 's3://%s/%s/part0' % (bucket_name, s3_asm)
        subprocess.call(
            ['python', '%s/s3-mp-download.py' % jar_path,
             s3_path, output_filename, aws_access, aws_secert, '-f'],
            stderr=subprocess.STDOUT)

    # delete file in S3
    # NOTE(review): the cleanup is run for every final state; confirm
    # against the original layout that it was not meant to be limited to
    # completed jobs.
    k.key = s3_in
    k.delete()
    k.key = "%s/part0" % (s3_asm)
    k.delete()