def run(self, terms, inject=None, **kwargs):
    """Look up the EC2 instance id of an EMR cluster's master node."""
    region, cluster_id = terms
    dns = emr.connect_to_region(region).describe_cluster(
        cluster_id).masterpublicdnsname
    return [ec2.connect_to_region(region).get_all_instances(
        filters={"dns-name": dns})[0].instances[0].id]
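# A standalone sketch of the same lookup done directly with boto (the method
# above appears to be an Ansible lookup plugin; this rewrite assumes only
# boto's EMR and EC2 APIs, and the region/cluster id below are placeholders):
from boto import ec2, emr

def emr_master_instance_id(region, cluster_id):
    # Resolve the master node's public DNS name, then find the matching
    # EC2 reservation to recover the instance id.
    dns = emr.connect_to_region(region).describe_cluster(
        cluster_id).masterpublicdnsname
    reservation = ec2.connect_to_region(region).get_all_instances(
        filters={"dns-name": dns})[0]
    return reservation.instances[0].id

# Example: emr_master_instance_id('us-east-1', 'j-2AL4XXXXXX5T9')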
def setup_and_run_job(self):
    """Runs the Elastic MapReduce job on AWS."""
    step = StreamingStep(
        name='Titanic Machine Learning',
        mapper='s3n://' + EmrProcessing.bucket_name + '/mapper/mapper.py',
        reducer='org.apache.hadoop.mapred.lib.IdentityReducer',
        input='s3n://' + EmrProcessing.bucket_name + '/input/',
        output='s3n://' + EmrProcessing.bucket_name + '/output/')
    self.conn = connect_to_region(self.region_name)
    self.jobid = self.conn.run_jobflow(
        name='Titanic Devp',
        log_uri='s3://' + EmrProcessing.bucket_name + '/jobflow_logs',
        steps=[step])
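# A minimal follow-up sketch (not part of the original class): poll the job
# flow started by setup_and_run_job() until it reaches a terminal state.
# describe_jobflow() and the state names come from boto's EMR API; the
# 30-second interval is an arbitrary choice.
import time

def wait_for_job_completion(self):
    """Blocks until the job flow finishes and returns its final state."""
    while True:
        state = self.conn.describe_jobflow(self.jobid).state
        if state in ('COMPLETED', 'FAILED', 'TERMINATED'):
            return state
        time.sleep(30)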
clean_env()
self.conf = '%s/%s/%s.ini' % (os.getenv('AWS_CRED_DIR'), acct_name, acct_name)
try:
    boto.config.load_credential_file(self.conf)
except IOError, msg:
    print >> sys.stderr, 'ERROR: %s' % msg
    return False
# Dispatch on the requested service; self.conn stays None for unknown
# service names so the error check below works instead of raising
# AttributeError on an unset attribute.
self.conn = None
if service == 's3':
    self.conn = s3.connect_to_region(region)
elif service == 'ec2':
    self.conn = ec2.connect_to_region(region)
elif service == 'rds':
    self.conn = rds.connect_to_region(region)
elif service == 'rds2':
    self.conn = rds2.connect_to_region(region)
elif service == 'elb':
    self.conn = elb.connect_to_region(region)
elif service == 'sqs':
    self.conn = sqs.connect_to_region(region)
elif service == 'emr':
    self.conn = emr.connect_to_region(region)
elif service == 'route53':
    self.conn = route53.connect_to_region(region)
elif service == 'iam':
    # IAM is a global service; boto uses the special 'universal' region.
    self.conn = iam.connect_to_region('universal')
if not self.conn:
    print >> sys.stderr, 'ERROR: Unknown service'
    return False
return self.conn
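# A usage sketch for the credential/connection dispatcher above. The fragment
# does not show its enclosing method, so the object and method names below
# (creds, get_conn) are hypothetical:
#
#   conn = creds.get_conn('prod', 'emr', 'us-east-1')
#   if conn:
#       for flow in conn.describe_jobflows():
#           print flow.jobflowid, flow.state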
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--job', '-j', required=True, type=cluster_id,
                        help='Job Id')
    parser.add_argument('--input', required=True, help='Input file')
    parser.add_argument('--code', required=True, help='Code path')
    parser.add_argument('--output', required=True, help='Output path')
    parser.add_argument('--src-bucket', required=True, dest='src',
                        help='Source S3 bucket')
    parser.add_argument('--dst-bucket', required=True, dest='dst',
                        help='Destination S3 bucket')
    parser.add_argument('--src-profile',
                        help='Boto profile used for source connection')
    parser.add_argument('--dst-profile',
                        help='Boto profile used for destination connection')
    parser.add_argument('--dry-run', action="store_true",
                        help='Do everything but execute steps')
    parser.add_argument('--diff', action="store_true",
                        help='Run diff before copy')
    parser.add_argument('--no-copy', action="store_true",
                        help='Do everything but copy')
    parser.add_argument('--list-only', action="store_true",
                        help='Run list step only')
    parser.add_argument('--uuid', help='Set uuid instead of generating it')
    parser.add_argument('--region',
                        help='Override default region for emr connection')
    parser.add_argument('--profile', help='Profile to use for emr connection')
    parser.add_argument('--verbose', '-v', action="store_true",
                        help='Log to console')
    parser.add_argument('--debug', '-d', action="store_true",
                        help='Enable debug logging')
    args = parser.parse_args()

    ## Set region if defined, otherwise fall back on profile.
    ## Default to ~/.aws/credentials, then .boto
    if args.region:
        region = args.region
    elif args.profile:
        region = boto.config.get('%s' % args.profile, 'region',
                                 boto.config.get('profile %s' % args.profile,
                                                 'region', None))
    else:
        region = boto.config.get(DEFAULT_AWS_SEC, 'region',
                                 boto.config.get(DEFAULT_BOTO_SEC,
                                                 'region', None))

    if args.uuid:
        uuid = args.uuid
    else:
        uuid = int(time())
    rundir = "/%s" % uuid
    print "RunDir: hdfs://%s" % rundir

    ## Setup logging
    logger = logging.getLogger('rsync')
    logger.setLevel(logging.INFO)
    sh = logging.handlers.SysLogHandler(
        "/var/run/syslog" if platform.system() == 'Darwin' else "/dev/log",
        logging.handlers.SysLogHandler.LOG_LOCAL0)
    sh.setFormatter(logging.Formatter('%(module)s[%(process)d]: %(message)s'))
    logger.addHandler(sh)
    if args.debug:
        logger.setLevel(logging.DEBUG)
    if args.verbose:
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setFormatter(logging.Formatter(
            '%(asctime)s %(module)s[%(process)d]: %(message)s',
            datefmt='%b %d %H:%M:%S'))
        logger.addHandler(ch)

    try:
        src_endpoint = bucket_endpoint(
            boto.connect_s3(profile_name=args.src_profile), args.src)
        dst_endpoint = bucket_endpoint(
            boto.connect_s3(profile_name=args.dst_profile), args.dst)
    except:
        sys.exit(1)

    ## Create Commands
    srclist_cmd = LIST_CMD.format(args.src, src_endpoint)
    dstlist_cmd = LIST_CMD.format(args.dst, dst_endpoint)
    multipartlist_cmd = MULTIPARTLIST_CMD.format(args.src, src_endpoint,
                                                 args.dst, dst_endpoint)
    objectcopy_cmd = OBJECTCOPY_CMD.format(args.src, src_endpoint,
                                           args.dst, dst_endpoint)
    multipartcomplete_cmd = MULTIPARTCOMPLETE_CMD.format(args.dst,
                                                         dst_endpoint)

    ## Add Profiles
    if args.src_profile is not None:
        srclist_cmd += " --profile %s" % args.src_profile
        multipartlist_cmd += " --src-profile %s" % args.src_profile
        objectcopy_cmd += " --src-profile %s" % args.src_profile
    if args.dst_profile is not None:
        dstlist_cmd += " --profile %s" % args.dst_profile
        multipartlist_cmd += " --dst-profile %s" % args.dst_profile
        objectcopy_cmd += " --dst-profile %s" % args.dst_profile
        multipartcomplete_cmd += " --profile %s" % args.dst_profile

    ## Inputs
    partfile = os.path.split(args.input)[1]
    files = FILES.format(args.code.rstrip('\\'))
    hivefile = "%s/diff.q" % args.code.rstrip('\\')

    ## HDFS Outputs
    srclist = "%s/%s.list" % (rundir, args.src)
    dstlist = "%s/%s.list" % (rundir, args.dst)
    hivelist = "%s/diff.list" % rundir
    mpartlist = "%s/%s.multipartlist" % (rundir, args.src)

    ## HDFS inputs
    multipartlist_input = hivelist if args.diff else srclist

    ## Job output
    output = "%s.%s" % (os.path.join(args.output, partfile), uuid)

    ## Job steps; the single-pass while/break loop lets step construction
    ## stop early for --list-only and --no-copy.
    jobsteps = []
    while True:
        jobsteps.append(
            bucketlist("list.%s.%s" % (args.src, partfile), files,
                       srclist_cmd, args.input, srclist, debug=args.debug))
        if args.list_only:
            break
        if args.diff:
            jobsteps.append(
                bucketlist("list.%s.%s" % (args.dst, partfile), files,
                           dstlist_cmd, args.input, dstlist,
                           debug=args.debug))
            jobsteps.append(
                hive("hive.%s.%s" % (args.src, partfile), hivefile,
                     srclist, dstlist, hivelist))
        if args.no_copy:
            print "CopyInput:%s" % multipartlist_input
            break
        jobsteps.append(
            multipartlist("mlist.%s.%s" % (args.src, partfile), files,
                          multipartlist_cmd, multipartlist_input, mpartlist,
                          debug=args.debug))
        jobsteps.append(
            objectcopy("copy.%s.%s" % (args.src, partfile), files,
                       objectcopy_cmd, mpartlist, output,
                       multipartcomplete_cmd, debug=args.debug))
        print "Output:%s" % output
        ## Exit loop
        break

    if args.debug or args.dry_run:
        pp = pprint.PrettyPrinter(indent=4)
        for i in jobsteps:
            pp.pprint(i.__dict__)
    if args.dry_run:
        sys.exit(0)

    ## Run job
    response = emr.connect_to_region(
        region, profile_name=args.profile).add_jobflow_steps(args.job,
                                                             jobsteps)
    for i in response.stepids:
        print "Added step %s" % i.value
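## A hedged follow-up sketch: each step id printed above can also be polled
## with describe_step() until it reaches a terminal state. The variables
## mirror those in main(); the 30-second interval is arbitrary, and sleep is
## assumed to be imported from the time module:
##
##   conn = emr.connect_to_region(region, profile_name=args.profile)
##   for i in response.stepids:
##       while True:
##           state = conn.describe_step(args.job, i.value).status.state
##           if state in ('COMPLETED', 'FAILED', 'CANCELLED'):
##               print "Step %s finished: %s" % (i.value, state)
##               break
##           sleep(30)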
def main():
    (opts, action, cluster_name) = parse_args()
    try:
        emrconn = emr.connect_to_region(opts.region)
        cluster = emrconn.describe_cluster(opts.cid)
        conn = cluster.connection
    except Exception as e:
        print >> stderr, e
        sys.exit(1)

    # Select an AZ at random if it was not specified.
    # if opts.zone == "":
    #     opts.zone = random.choice(conn.get_all_zones()).name

    if action == "launch":
        (master_nodes, slave_nodes) = get_existing_cluster(
            conn, opts, cluster_name)
        # wait_for_cluster(conn, opts.wait, master_nodes, slave_nodes)
        setup_cluster(conn, master_nodes, slave_nodes, opts, True)

    # elif action == "destroy":
    #     response = raw_input("Are you sure you want to destroy the cluster " +
    #                          cluster_name + "?\nALL DATA ON ALL NODES WILL BE LOST!!\n" +
    #                          "Destroy cluster " + cluster_name + " (y/N): ")
    #     if response == "y":
    #         (master_nodes, slave_nodes) = get_existing_cluster(
    #             conn, opts, cluster_name, die_on_error=False)
    #         print "Terminating master..."
    #         for inst in master_nodes:
    #             inst.terminate()
    #         print "Terminating slaves..."
    #         for inst in slave_nodes:
    #             inst.terminate()
    #         # Delete security groups as well
    #         if opts.delete_groups:
    #             print "Deleting security groups (this will take some time)..."
    #             group_names = [cluster_name + "-master", cluster_name + "-slaves"]
    #             attempt = 1
    #             while attempt <= 3:
    #                 print "Attempt %d" % attempt
    #                 groups = [g for g in conn.get_all_security_groups()
    #                           if g.name in group_names]
    #                 success = True
    #                 # Delete individual rules in all groups before deleting
    #                 # groups to remove dependencies between them
    #                 for group in groups:
    #                     print "Deleting rules in security group " + group.name
    #                     for rule in group.rules:
    #                         for grant in rule.grants:
    #                             success &= group.revoke(
    #                                 ip_protocol=rule.ip_protocol,
    #                                 from_port=rule.from_port,
    #                                 to_port=rule.to_port,
    #                                 src_group=grant)
    #                 # Sleep for AWS eventual-consistency to catch up, and for
    #                 # instances to terminate
    #                 time.sleep(30)  # Yes, it does have to be this long :-(
    #                 for group in groups:
    #                     try:
    #                         conn.delete_security_group(group.name)
    #                         print "Deleted security group " + group.name
    #                     except boto.exception.EC2ResponseError:
    #                         success = False
    #                         print "Failed to delete security group " + group.name
    #                 # Unfortunately, group.revoke() returns True even if a rule
    #                 # was not deleted, so this needs to be rerun if something fails
    #                 if success:
    #                     break
    #                 attempt += 1
    #             if not success:
    #                 print "Failed to delete all security groups after 3 tries."
    #                 print "Try re-running in a few minutes."

    # elif action == "login":
    #     (master_nodes, slave_nodes) = get_existing_cluster(
    #         conn, opts, cluster_name)
    #     master = master_nodes[0].public_dns_name
    #     print "Logging into master " + master + "..."
    #     proxy_opt = ""
    #     if opts.proxy_port != None:
    #         proxy_opt = "-D " + opts.proxy_port
    #     subprocess.check_call(
    #         "ssh -o StrictHostKeyChecking=no -i %s %s %s@%s" %
    #         (opts.identity_file, proxy_opt, opts.user, master), shell=True)

    # elif action == "get-master":
    #     (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name)
    #     print master_nodes[0].public_dns_name

    # elif action == "stop":
    #     response = raw_input("Are you sure you want to stop the cluster " +
    #                          cluster_name + "?\nDATA ON EPHEMERAL DISKS WILL BE LOST, " +
    #                          "BUT THE CLUSTER WILL KEEP USING SPACE ON\n" +
    #                          "AMAZON EBS IF IT IS EBS-BACKED!!\n" +
    #                          "Stop cluster " + cluster_name + " (y/N): ")
    #     if response == "y":
    #         (master_nodes, slave_nodes) = get_existing_cluster(
    #             conn, opts, cluster_name, die_on_error=False)
    #         print "Stopping master..."
    #         for inst in master_nodes:
    #             if inst.state not in ["shutting-down", "terminated"]:
    #                 inst.stop()
    #         print "Stopping slaves..."
    #         for inst in slave_nodes:
    #             if inst.state not in ["shutting-down", "terminated"]:
    #                 inst.stop()

    # elif action == "start":
    #     (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name)
    #     print "Starting slaves..."
    #     for inst in slave_nodes:
    #         if inst.state not in ["shutting-down", "terminated"]:
    #             inst.start()
    #     print "Starting master..."
    #     for inst in master_nodes:
    #         if inst.state not in ["shutting-down", "terminated"]:
    #             inst.start()
    #     wait_for_cluster(conn, opts.wait, master_nodes, slave_nodes)
    #     setup_cluster(conn, master_nodes, slave_nodes, opts, False)

    else:
        print >> stderr, "Invalid action: %s" % action
        sys.exit(1)
def run_emr(bucket, prefix):
    conn = emr.connect_to_region('us-west-2')
    steps = [
        emr.JarStep('makedir', jar=SCRIPT_RUNNER,
                    step_args=[HADOOP, 'fs', '-mkdir', '-p', '/data/']),
        emr.JarStep('download_to_client', jar=SCRIPT_RUNNER,
                    step_args=[HADOOP, 'fs', '-get',
                               's3://' + bucket + '/' + prefix + '/institution_data.csv',
                               '/home/hadoop/institution_data.csv']),
        emr.JarStep('download_to_hdfs', jar=SCRIPT_RUNNER,
                    step_args=[HADOOP, 'fs', '-cp',
                               's3://' + bucket + '/' + prefix + '/raster_data.csv',
                               '/data/raster_data.csv']),
        emr.JarStep('download_jar_to_client', jar=SCRIPT_RUNNER,
                    step_args=[HADOOP, 'fs', '-get',
                               's3://' + bucket + '/' + prefix + '/jar.jar',
                               '/home/hadoop/jar.jar']),
        emr.JarStep('run_spark', jar=SCRIPT_RUNNER,
                    step_args=['/home/hadoop/spark/bin/spark-submit',
                               '--files', '/home/hadoop/institution_data.csv',
                               '--master', 'yarn-cluster',
                               '--class', 'com.pairwise.PairwiseDistance',
                               '--num-executors', '10',
                               '/home/hadoop/jar.jar',
                               '/home/hadoop/institution_data.csv',
                               '/data/raster_data.csv',
                               '/data/output', '10']),
        emr.JarStep('get_output', jar=COPY,
                    step_args=['--src', '/data/output',
                               '--dest', 's3://' + bucket + '/' + prefix + '/output'])]
    bootstrap_actions = [
        emr.BootstrapAction(
            'install-spark',
            path='file:///usr/share/aws/emr/install-spark/install-spark',
            bootstrap_action_args=['-x'])]
    jid = conn.run_jobflow('pairwise_distance',
                           log_uri='s3://' + bucket + '/' + prefix + '/logs',
                           master_instance_type='m3.xlarge',
                           slave_instance_type='m3.xlarge',
                           num_instances=5,
                           enable_debugging=True,
                           ami_version='3.8',
                           visible_to_all_users=True,
                           steps=steps,
                           bootstrap_actions=bootstrap_actions)
    logger.info("Running jobflow: " + jid)
    while True:
        time.sleep(15)
        state = conn.describe_cluster(jid).status.state
        logger.info("Jobflow " + jid + ": " + state)
        if state == 'TERMINATED':
            break
        elif state == 'TERMINATED_WITH_ERRORS':
            sys.exit(1)
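# The SCRIPT_RUNNER, HADOOP, and COPY constants used above are not shown in
# this snippet. Plausible values, assuming the stock EMR script-runner jar,
# the AMI 3.x hadoop binary, and the bundled s3distcp jar (assumptions, not
# confirmed by the source):
SCRIPT_RUNNER = 's3://us-west-2.elasticmapreduce/libs/script-runner/script-runner.jar'
HADOOP = '/home/hadoop/bin/hadoop'
COPY = '/home/hadoop/lib/emr-s3distcp-1.0.jar'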
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# snippet-sourcedescription:[emrfs-boto-step.py demonstrates how to add a step to an EMR cluster that adds objects in an Amazon S3 bucket to the default EMRFS metadata table.]
# snippet-service:[elasticmapreduce]
# snippet-keyword:[Python]
# snippet-keyword:[Amazon EMR]
# snippet-keyword:[Code Sample]
# snippet-keyword:[add_jobflow_steps]
# snippet-sourcetype:[snippet]
# snippet-sourcedate:[2019-01-31]
# snippet-sourceauthor:[AWS]

# snippet-start:[emr.python.addstep.emrfs]
from boto.emr import connect_to_region, JarStep

# Connect to Amazon EMR in the cluster's region and keep the connection;
# the regional connection object is what add_jobflow_steps() is called on.
emr = connect_to_region("us-west-1")

myStep = JarStep(
    name='Boto EMRFS Sync',
    jar='s3://elasticmapreduce/libs/script-runner/script-runner.jar',
    action_on_failure="CONTINUE",
    step_args=[
        '/home/hadoop/bin/emrfs',
        'sync',
        's3://elasticmapreduce/samples/cloudfront'
    ])

stepId = emr.add_jobflow_steps(
    "j-2AL4XXXXXX5T9", steps=[myStep]).stepids[0].value
# snippet-end:[emr.python.addstep.emrfs]
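# A hedged follow-up sketch (not part of the AWS snippet): the returned step
# id can be polled with describe_step() until the EMRFS sync reaches a
# terminal state. The 15-second interval is arbitrary.
import time

while True:
    state = emr.describe_step("j-2AL4XXXXXX5T9", stepId).status.state
    print "Step %s: %s" % (stepId, state)
    if state in ('COMPLETED', 'FAILED', 'CANCELLED'):
        break
    time.sleep(15)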