# Submit a Hadoop streaming step to Amazon EMR through boto, reusing a
# cluster that is already in the WAITING state when one exists, then poll
# the job flow until it reaches a state in which nothing more will happen.
#
# Relies on names defined elsewhere in this file: EmrConnection,
# StreamingStep, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, bucket_name,
# mapper, reducer, input_folder, output_folder, jobname and the time module.
#
# print(...) is used with a single argument throughout, which behaves the
# same as the print statement on Python 2 and also parses on Python 3.

# States after which polling stops. WAITING/COMPLETED are the success cases;
# FAILED/TERMINATED are included so that a job flow that dies does not leave
# this script looping forever.
FINISHED_STATES = ('WAITING', 'COMPLETED', 'FAILED', 'TERMINATED')

# Seconds to sleep between two state lookups.
POLL_INTERVAL_SECONDS = 10

print("Init emr connection")
conn = EmrConnection(aws_access_key_id=AWS_ACCESS_KEY_ID,
                     aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

print("Setting up streamStep")
# A timestamp is appended so that every run writes to a fresh output folder.
result = output_folder + str(time.time())
step = StreamingStep(name='My word example',
                     mapper='s3n://%s/%s' % (bucket_name, mapper),
                     reducer='s3n://%s/%s' % (bucket_name, reducer),
                     input='s3n://%s/%s' % (bucket_name, input_folder),
                     output='s3n://%s/%s' % (bucket_name, result))

# To run another job on the same cluster later, keep the job id and keep
# adding steps to it. Otherwise keep_alive must be set to False so the job
# (and its EC2 instances) stops after running.
# It is probably a good idea to do all calculations and then terminate the
# job: Amazon charges hourly, so a job needed for only 5 minutes still costs
# a full hour. Reusing a job also avoids the 5-7 minutes it takes for the
# instances to be set up.
waiting_clusters = conn.list_clusters(cluster_states="WAITING").clusters
if waiting_clusters:
    # Only the emptiness of the cluster list decides between reuse and
    # start; an error raised while adding the step is no longer mistaken
    # for "no cluster available".
    jobid = waiting_clusters[0].id
    print("We have an existing job waiting - Id: %s" % jobid)
    conn.add_jobflow_steps(jobid, [step])
else:
    print("Starting ERM job %s" % jobname)
    jobid = conn.run_jobflow(name=jobname,
                             steps=[step],
                             log_uri="s3://" + bucket_name + "/logs/",
                             enable_debugging=True,
                             keep_alive=True)

# Wait for 10s to let the states refresh before the first lookup.
# NOTE(review): when an existing WAITING cluster is reused, it may still
# report WAITING here if the new step has not started yet, which would end
# the loop immediately - confirm whether a longer initial delay is needed.
time.sleep(POLL_INTERVAL_SECONDS)
status = conn.describe_jobflow(jobid).state
while status not in FINISHED_STATES:
    status = conn.describe_jobflow(jobid).state
    print("Status: %s, Job Id: %s" % (status, jobid))
    time.sleep(POLL_INTERVAL_SECONDS)
# mapred.max.split.size=1 # mapred.min.split.size # mapred.map.tasks = 20 # mapred.task.timeout=800000 # mapred.tasktracker.map.tasks.maximum # mapred.tasktracker.reduce.tasks.maximum # Define an action that will bootstrap these machines and install our needed # dependencies bootstrap_action = BootstrapAction("Install Dependiences", 's3://%s/%s' % (bucket_emr, file_bootstrapper), None) # if an existing cluster is running, let's use that running_clusters = conn.list_clusters(cluster_states="WAITING").clusters if len(running_clusters) > 0: job_id = running_clusters[0].id print "A waiting cluster is already setup. Running Job on : %s" % job_id conn.add_jobflow_steps(job_id, [step]) else: print 'No waiting EMR clusters found. Starting new cluster with name: %s' % jobname answer = raw_input('Do you want to start a new cluster? [N/y]: ') if 'y' not in answer.lower(): print 'exiting' exit() print 'creating a new EMR cluster' job_id = conn.run_jobflow(name=jobname, steps=[step],