step = StreamingStep(name='My word example', mapper='s3n://%s/%s' % (bucket_name, mapper), reducer='s3n://%s/%s' % (bucket_name, reducer), input='s3n://%s/%s' % (bucket_name, input_folder), output='s3n://%s/%s' % (bucket_name, result)) # to start a job if we need it again, we have to save job id and we can just keep adding steps to it. Otherwise we must set keep_alive to false to stop the job (and EC2 instance) after running. # Good idea is probably to do all your calculations and then terminate the job. Amazon charges hourly, so if we need a job just for 5 min, we still pay for hour. Also with reusing jobs, we reduce the # waiting for instance to get setup(5-7min). try: jobid = conn.list_clusters(cluster_states="WAITING").clusters[0].id print "We have an existing job waiting - Id: %s" % jobid conn.add_jobflow_steps(jobid, [step]) except IndexError, e: print "Starting ERM job %s" % jobname jobid = conn.run_jobflow(name=jobname, steps=[step], log_uri="s3://"+bucket_name+"/logs/", enable_debugging = True, keep_alive=True) #Wait for 5s to refresh the states time.sleep(10) status = conn.describe_jobflow(jobid).state while not (status in ['WAITING', 'COMPLETED']): status = conn.describe_jobflow(jobid).state print "Status: %s, Job Id: %s" % (status, jobid) time.sleep(10) print "Job finished" #Wait for few s to write the files time.sleep(10)
# NOTE(review): this is an alternate version of the EMR launch logic above
# (it uses job_id/bucket_emr instead of jobid/bucket_name).  The fragment is
# incomplete: the `else:` below belongs to an if-statement whose header (the
# check that found a WAITING cluster and set job_id) was lost when the file
# was mangled onto one line - recover it from version control before running.
# NOTE(review): the trailing poll loop exits only on WAITING/COMPLETED, so it
# never terminates if the jobflow reaches FAILED or TERMINATED - confirm
# whether those terminal states should also break the loop.
print "A waiting cluster is already setup. Running Job on : %s" % job_id conn.add_jobflow_steps(job_id, [step]) else: print 'No waiting EMR clusters found. Starting new cluster with name: %s' % jobname answer = raw_input('Do you want to start a new cluster? [N/y]: ') if 'y' not in answer.lower(): print 'exiting' exit() print 'creating a new EMR cluster' job_id = conn.run_jobflow(name=jobname, steps=[step], log_uri="s3://"+bucket_emr+"/logs/", bootstrap_actions=[bootstrap_action], enable_debugging = True, keep_alive=True, master_instance_type='m1.small', # I like m3.xlarge this b/c networking == high and cost is $.35/instance/hour, slave_instance_type='m1.small', num_instances=4, ec2_keyname=EC2_KEYNAME) #instance_groups = # OPTIONAL: Add Spot Support) # OPTIONAL: Add Spot Support #ig = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07') #c.add_instance_groups(jf.jobflowid, ig) time.sleep(20) status = conn.describe_jobflow(job_id).state while status not in ['WAITING', 'COMPLETED']: status = conn.describe_jobflow(job_id).state print "Status: %s for Job Id: %s" % (status, job_id)