Example #1
def connect_emr(aws_access_key_id=None, aws_secret_access_key=None, **kwargs):
    """
    :type aws_access_key_id: string
    :param aws_access_key_id: Your AWS Access Key ID

    :type aws_secret_access_key: string
    :param aws_secret_access_key: Your AWS Secret Access Key

    :rtype: :class:`boto.emr.EmrConnection`
    :return: A connection to Elastic MapReduce
    """
    from boto.emr import EmrConnection
    return EmrConnection(aws_access_key_id, aws_secret_access_key, **kwargs)
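A minimal usage sketch for the helper above; the credentials are placeholders and the describe_jobflows call is just an illustrative sanity check, not part of the original snippet:

# Illustrative only: placeholder credentials, not real keys.
conn = connect_emr(aws_access_key_id='AKIAEXAMPLEKEY',
                   aws_secret_access_key='EXAMPLESECRET')
print conn.describe_jobflows()  # list the account's existing job flows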
Example #2
print "Reducer uploaded"

print "Creating input file %s" % testfile
create_input_file(testfile, 0, 50)
print "Input file created"
print "Uploading input to bucket. Input: %s" % testfile
upload_to_bucket(testfile, input_folder)

print "Creating input file %s" % testfile2
create_input_file(testfile2, 25, 61)
print "Input file created"
print "Uploading input to bucket. Input: %s" % testfile2
upload_to_bucket(testfile2, input_folder)

print "Init emr connection"
conn = EmrConnection(aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

print "Setting up streamStep"
result = output_folder + str(time.time())
step = StreamingStep(name='My word example',
                     mapper='s3n://%s/%s' % (bucket_name, mapper),
                     reducer='s3n://%s/%s' % (bucket_name, reducer),
                     input='s3n://%s/%s' % (bucket_name, input_folder),
                     output='s3n://%s/%s' % (bucket_name, result))

# To reuse a job flow we have to save its job id and keep adding steps to it; otherwise
# keep_alive must be set to False so the job (and its EC2 instances) shuts down after running.
# It is usually a good idea to run all calculations and then terminate the job flow: Amazon bills
# by the hour, so a job flow used for only 5 minutes is still charged for the full hour. Reusing a
# job flow also avoids the 5-7 minute wait for new instances to be provisioned.
try:
    jobid = conn.list_clusters(cluster_states=["WAITING"]).clusters[0].id
    print "We have an existing job waiting - Id: %s" % jobid
except IndexError:
    jobid = None
    print "No waiting job flow found"
Example #3
# snippet-sourcedescription:[emrfs-boto-step.py demonstrates how to add a step to an EMR cluster that adds objects in an Amazon S3 bucket to the default EMRFS metadata table.]
# snippet-service:[elasticmapreduce]
# snippet-keyword:[Python]
# snippet-keyword:[Amazon EMR]
# snippet-keyword:[Code Sample]
# snippet-keyword:[add_jobflow_steps]
# snippet-sourcetype:[snippet]
# snippet-sourcedate:[2019-01-31]
# snippet-sourceauthor:[AWS]
# snippet-start:[emr.python.addstep.emrfs]
from boto.emr import EmrConnection, connect_to_region, JarStep

# Connect to Amazon EMR in the target region.
emr = connect_to_region("us-west-1")

myStep = JarStep(
    name='Boto EMRFS Sync',
    jar='s3://elasticmapreduce/libs/script-runner/script-runner.jar',
    action_on_failure="CONTINUE",
    step_args=[
        '/home/hadoop/bin/emrfs', 'sync',
        's3://elasticmapreduce/samples/cloudfront'
    ])

stepId = emr.add_jobflow_steps("j-2AL4XXXXXX5T9",
                               steps=[myStep]).stepids[0].value
# snippet-end:[emr.python.addstep.emrfs]
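A possible follow-up using the captured stepId: poll until the step reaches a terminal state. describe_step is a real boto EmrConnection call, but the polling interval and the status attribute access shown here are assumptions sketched from boto's usual response objects:

import time

# Illustrative polling loop for the EMRFS sync step added above.
while True:
    step = emr.describe_step("j-2AL4XXXXXX5T9", stepId)
    state = step.status.state
    print "EMRFS sync step state: %s" % state
    if state in ("COMPLETED", "FAILED", "CANCELLED", "INTERRUPTED"):
        break
    time.sleep(30)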
Example #4
File: emr.py Project: karthich/dart
 def conn(self):
     # Lazily create and cache a single EmrConnection for the configured region.
     if self._conn:
         return self._conn
     self._conn = EmrConnection(region=self._region)
     return self._conn
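For context, a hypothetical wrapper showing how such a lazily-cached connection is typically set up and used; the class name, constructor, and region handling are assumptions for illustration only:

from boto.emr import EmrConnection

class EmrClient(object):
    # Hypothetical owner of a conn() method like the one above.
    def __init__(self, region=None):
        self._region = region
        self._conn = None

    def conn(self):
        # The connection is created on first use, then reused.
        if self._conn:
            return self._conn
        self._conn = EmrConnection(region=self._region)
        return self._conn

client = EmrClient()
print client.conn().describe_jobflows()  # same connection object on every call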
Example #5
  file_input = "manifest.wet.txt"

  # Upload these files
  bucket = upload_files(bucket_input, [file_input])
  bucket = upload_files(bucket_emr, [file_mapper, file_reducer, file_bootstrapper])



  # Name our cluster
  jobname = "Common Crawl Cruncher"
  # Location for EMR's log & output files
  output_folder = "output/"


  print "Initializing EMR Connection..."
  conn = EmrConnection(aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

  print "Setting Up Hadoop Streaming Step..."
  result_folder = output_folder + str(datetime.now()).split('.')[0].replace(' ','_')
  step = StreamingStep(name='URL Cruncher',
                   mapper='s3n://%s/%s' % (bucket_emr, file_mapper),
                   reducer='s3n://%s/%s' % (bucket_emr, file_reducer),
                   #input='s3n://%s/%s'% (input_bucket, input_path),
                   input='s3n://%s/%s'% (bucket_input, file_input),
                   output='s3n://%s/%s' % (bucket_emr, result_folder),
                   action_on_failure='CANCEL_AND_WAIT',
                   step_args = ["-jobconf", "mapred.map.tasks=24",
                                "-jobconf", "mapred.reduce.tasks=2"]
                   )

  # Other possible step args include:
  #   mapred.max.split.size=1
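The snippet ends before the step is submitted; a hedged sketch of launching the job flow with it follows (run_jobflow is a real boto call, but the instance types, count, debugging flag, and log location are illustrative assumptions):

  print "Launching Job Flow..."
  # Illustrative submission of the streaming step defined above.
  jobid = conn.run_jobflow(name=jobname,
                           log_uri='s3n://%s/%s' % (bucket_emr, output_folder),
                           steps=[step],
                           num_instances=3,
                           master_instance_type='m1.large',
                           slave_instance_type='m1.large',
                           enable_debugging=True)
  print "Job Flow started - Id: %s" % jobid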