def connect_emr(aws_access_key_id=None, aws_secret_access_key=None, **kwargs):
    """
    :type aws_access_key_id: string
    :param aws_access_key_id: Your AWS Access Key ID

    :type aws_secret_access_key: string
    :param aws_secret_access_key: Your AWS Secret Access Key

    :rtype: :class:`boto.emr.EmrConnection`
    :return: A connection to Elastic mapreduce
    """
    from boto.emr import EmrConnection
    return EmrConnection(aws_access_key_id, aws_secret_access_key, **kwargs)
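# A minimal usage sketch for connect_emr() above; the credential strings are
# hypothetical placeholders and would normally come from your boto config or
# environment rather than being hard-coded.
if __name__ == '__main__':
    emr_conn = connect_emr(aws_access_key_id='MY_ACCESS_KEY',
                           aws_secret_access_key='MY_SECRET_KEY')
    # EmrConnection exposes the Elastic MapReduce API, e.g. listing clusters.
    for cluster in emr_conn.list_clusters().clusters:
        print cluster.id, cluster.name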
print "Reducer uploaded" print "Creating input file %s" % testfile create_input_file(testfile, 0, 50) print "Input file created" print "Uploading input to bucket. Input: %s" % testfile upload_to_bucket(testfile, input_folder) print "Creating input file %s" % testfile2 create_input_file(testfile2, 25, 61) print "Input file created" print "Uploading input to bucket. Input: %s" % testfile2 upload_to_bucket(testfile2, input_folder) print "Init emr connection" conn = EmrConnection(aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY) print "Setting up streamStep" result = output_folder + str(time.time()) step = StreamingStep(name='My word example', mapper='s3n://%s/%s' % (bucket_name, mapper), reducer='s3n://%s/%s' % (bucket_name, reducer), input='s3n://%s/%s' % (bucket_name, input_folder), output='s3n://%s/%s' % (bucket_name, result)) # to start a job if we need it again, we have to save job id and we can just keep adding steps to it. Otherwise we must set keep_alive to false to stop the job (and EC2 instance) after running. # Good idea is probably to do all your calculations and then terminate the job. Amazon charges hourly, so if we need a job just for 5 min, we still pay for hour. Also with reusing jobs, we reduce the # waiting for instance to get setup(5-7min). try: jobid = conn.list_clusters(cluster_states="WAITING").clusters[0].id print "We have an existing job waiting - Id: %s" % jobid
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# snippet-sourcedescription:[emrfs-boto-step.py demonstrates how to add a step to an EMR cluster that adds objects in an Amazon S3 bucket to the default EMRFS metadata table.]
# snippet-service:[elasticmapreduce]
# snippet-keyword:[Python]
# snippet-keyword:[Amazon EMR]
# snippet-keyword:[Code Sample]
# snippet-keyword:[add_jobflow_steps]
# snippet-sourcetype:[snippet]
# snippet-sourcedate:[2019-01-31]
# snippet-sourceauthor:[AWS]
# snippet-start:[emr.python.addstep.emrfs]
from boto.emr import EmrConnection, connect_to_region, JarStep

emr = EmrConnection()
connect_to_region("us-west-1")

myStep = JarStep(name='Boto EMRFS Sync',
                 jar='s3://elasticmapreduce/libs/script-runner/script-runner.jar',
                 action_on_failure="CONTINUE",
                 step_args=['/home/hadoop/bin/emrfs',
                            'sync', 's3://elasticmapreduce/samples/cloudfront'])

stepId = emr.add_jobflow_steps("j-2AL4XXXXXX5T9",
                               steps=[myStep]).stepids[0].value
# snippet-end:[emr.python.addstep.emrfs]
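# Not part of the AWS sample above: a hedged sketch of polling the added step until
# it leaves the PENDING/RUNNING states, using boto's describe_step(). The cluster id
# is the same placeholder used in the sample, and the 30-second poll interval is an
# arbitrary choice.
import time

while True:
    state = emr.describe_step("j-2AL4XXXXXX5T9", stepId).status.state
    if state not in ("PENDING", "RUNNING"):
        print "Step %s finished with state %s" % (stepId, state)
        break
    time.sleep(30)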
def conn(self):
    if self._conn:
        return self._conn
    self._conn = EmrConnection(region=self._region)
    return self._conn
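# A minimal sketch showing the lazy-connection method above in context; the class
# name and the way the region is resolved via boto.emr.regions() are assumptions,
# not taken from the original code.
from boto.emr import EmrConnection, regions

class EmrClient(object):
    def __init__(self, region_name='us-east-1'):
        # EmrConnection expects a RegionInfo object, so look the region up by name.
        self._region = next(r for r in regions() if r.name == region_name)
        self._conn = None

    def conn(self):
        if self._conn:
            return self._conn
        self._conn = EmrConnection(region=self._region)
        return self._conn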
file_input = "manifest.wet.txt" # Upload these files bucket = upload_files(bucket_input, [file_input]) bucket = upload_files(bucket_emr, [file_mapper, file_reducer, file_bootstrapper]) # Name our cluster jobname = "Common Crawl Cruncher" # Location for EMR's log & output files output_folder = "output/" print "Initializing EMR Connection..." conn = EmrConnection(aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY) print "Setting Up Hadoop Streaming Step..." result_folder = output_folder + str(datetime.now()).split('.')[0].replace(' ','_') step = StreamingStep(name='URL Cruncher', mapper='s3n://%s/%s' % (bucket_emr, file_mapper), reducer='s3n://%s/%s' % (bucket_emr, file_reducer), #input='s3n://%s/%s'% (input_bucket, input_path), input='s3n://%s/%s'% (bucket_input, file_input), output='s3n://%s/%s' % (bucket_emr, result_folder), action_on_failure='CANCEL_AND_WAIT', step_args = ["-jobconf", "mapred.map.tasks=24", "mapred.reduce.tasks=2"] ) # Other possible step args include: # mapred.max.split.size=1