def __init__(self, team_id, access_key, secret_key, bucket='cs144students'):
    """(constructor)

    Creates a new instance of the Rankmaniac class for a specific team
    using the provided credentials.

    Arguments:
        team_id     <str>  the team identifier, which may differ
                           slightly from the actual team name.
        access_key  <str>  the AWS access key identifier.
        secret_key  <str>  the AWS secret access key.

    Keyword arguments:
        bucket      <str>  the S3 bucket name.
    """

    region = RegionInfo(None, self.DefaultRegionName,
                        self.DefaultRegionEndpoint)

    self._s3_bucket = bucket
    self._s3_conn = S3Connection(access_key, secret_key)
    self._emr_conn = EmrConnection(access_key, secret_key, region=region)

    self.team_id = team_id
    self.job_id = None

    self._reset()
    self._num_instances = 1
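# Illustrative usage sketch for the constructor above: the docstring names
# the class Rankmaniac; the team id and credential strings below are
# placeholders, not real values.
rankmaniac = Rankmaniac('team-01',
                        '<aws-access-key-id>',
                        '<aws-secret-access-key>',
                        bucket='cs144students')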
def get_job_flow_objects(conf_path, max_days_ago=None, now=None):
    """Get relevant job flow information from EMR.

    Args:
        conf_path: a string that is either None or an alternate path from
            which to load the configuration file.
        max_days_ago: a float; if set, don't fetch job flows created more
            than this many days ago.
        now: the current UTC time as a datetime.datetime object;
            defaults to the current time.

    Returns:
        job_flows: a list of boto job flow objects.
    """
    if now is None:
        now = datetime.datetime.utcnow()

    emr_conn = EmrConnection()

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - datetime.timedelta(days=max_days_ago)

    return describe_all_job_flows(emr_conn, created_after=created_after)
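# Illustrative call (a sketch): fetch job flows created within the last
# two days using the default configuration path, then print boto's
# jobflowid and state attributes for each.
recent_flows = get_job_flow_objects(conf_path=None, max_days_ago=2.0)
for flow in recent_flows:
    print flow.jobflowid, flow.state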
def create_emr(R):
    if not boto.config.has_section('Boto'):
        boto.config.add_section('Boto')
    boto.config.set('Boto', 'https_validate_certificates', 'False')
    step = StreamingStep(name='MC_Method example',
                         cache_files=['s3n://bucket774/map.py#map.py'],
                         mapper='map.py',
                         input='s3://bucket774/input/',
                         output='s3://bucket774/output/')
    conn = EmrConnection(access_id, access_key)
    instance_groups = []
    instance_groups.append(
        InstanceGroup(num_instances=1,
                      role="MASTER",
                      type='m4.large',
                      market="ON_DEMAND",
                      name="Master nodes"))
    if R > 1:
        instance_groups.append(
            InstanceGroup(num_instances=R - 1,
                          role="CORE",
                          type='m4.large',
                          market="ON_DEMAND",
                          name="Slave nodes"))
    cluster_id = conn.run_jobflow(name='test MC_method run',
                                  instance_groups=instance_groups,
                                  enable_debugging=False,
                                  steps=[step],
                                  visible_to_all_users=True,
                                  keep_alive=True,
                                  job_flow_role="EMR_EC2_DefaultRole",
                                  service_role="EMR_DefaultRole",
                                  hadoop_version='2.4.0',
                                  log_uri='s3://bucket774/log')
    return cluster_id, conn
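# Illustrative driver for create_emr() above (a sketch): launch a
# three-node cluster and check its initial state. access_id and
# access_key are the module-level credentials create_emr() assumes.
cluster_id, conn = create_emr(3)
print conn.describe_jobflow(cluster_id).state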
def __init__(self, prop):
    '''Constructor, initialize EMR connection.'''
    self.prop = prop
    self.conn = EmrConnection(self.prop.ec2.key, self.prop.ec2.secret)
    self.jobid = None
    self.retry = 0
    self.level = 0
    self.last_update = -1
def __init__(self, spec_filename="spec.json"):
    import boto
    from boto.emr.connection import EmrConnection, RegionInfo

    super(EmrRuntime, self).__init__(spec_filename)

    p = self.settings.Param
    self.s3_conn = boto.connect_s3(p.AWS_ACCESS_KEY_ID,
                                   p.AWS_ACCESS_KEY_SECRET)
    self.s3_bucket = self.s3_conn.get_bucket(p.S3_BUCKET)
    self.region = p.AWS_Region
    self.emr_conn = EmrConnection(
        p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET,
        region=RegionInfo(name=self.region,
                          endpoint=self.region +
                          '.elasticmapreduce.amazonaws.com'))
    self.job_flow_id = p.EMR_jobFlowId
def __init__(self, team_id, access_key, secret_key):
    '''Rankmaniac class constructor

    Creates a new instance of the Rankmaniac Wrapper for a specific
    team.

    Arguments:
        team_id     string  the team ID.
        access_key  string  AWS access key.
        secret_key  string  AWS secret key.
    '''
    self.s3_bucket = 'cs144caltech'
    self.team_id = team_id
    self.emr_conn = EmrConnection(access_key, secret_key)
    self.s3_conn = S3Connection(access_key, secret_key)
    self.job_id = None
def get_internal_ips_from_emr(cluster_id, cr):
    """Retrieves a list of internal IP addresses for a given EMR cluster."""
    # Open connection to EMR
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region=RegionInfo(name=cr.get_config("aws_region"),
                          endpoint=cr.get_config("aws_region") +
                          ".elasticmapreduce.amazonaws.com"))

    # Build list of internal ips from the list_instances EMR API
    emr_internal_ips = []
    emr_instances = conn.list_instances(cluster_id).instances
    for instance in emr_instances:
        emr_internal_ips.append(instance.privateipaddress)

    return emr_internal_ips
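# Illustrative call (a sketch): the cluster id below is a placeholder,
# and cr is assumed to be the same configuration reader used above.
for ip in get_internal_ips_from_emr('j-XXXXXXXXXXXXX', cr):
    print ip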
def __init__(self, parameters):
    try:
        self.region_name = parameters["region_name"]
        self.access_key = parameters["access_key"]
        self.secret_key = parameters["secret_key"]
        self.ec2_keypair_name = parameters["ec2_keypair_name"]
        self.base_bucket = parameters["base_bucket"]
        self.log_dir = parameters["log_dir"]
        self.emr_status_wait = parameters["emr_status_wait"]
        self.step_status_wait = parameters["step_status_wait"]
        self.emr_cluster_name = parameters["emr_cluster_name"]
    except KeyError as e:
        logging.error("Missing parameter while initializing EmrManager: %s", e)
        sys.exit()

    # Establish the EMR connection
    self.connection = EmrConnection(
        self.access_key, self.secret_key,
        region=RegionInfo(name=self.region_name,
                          endpoint=self.region_name +
                          '.elasticmapreduce.amazonaws.com'))

    self.log_bucket_name = self.base_bucket + self.log_dir
def __init__(self, region_name='us-east-1', aws_access_key_id=None,
             aws_secret_access_key=None):
    # If the keys are not specified, fall back to the [aws] section of
    # the luigi configuration file
    if not aws_access_key_id:
        aws_access_key_id = luigi.configuration.get_config().get(
            'aws', 'aws_access_key_id')
    if not aws_secret_access_key:
        aws_secret_access_key = luigi.configuration.get_config().get(
            'aws', 'aws_secret_access_key')

    # Create the region in which to run
    region_endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
    region = RegionInfo(name=region_name, endpoint=region_endpoint)

    self.emr_connection = EmrConnection(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region=region)
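# A minimal sketch of the configuration fallback used above, assuming a
# luigi config file with an [aws] section whose key names match the
# lookups in the constructor.
import luigi.configuration

config = luigi.configuration.get_config()
access_key = config.get('aws', 'aws_access_key_id')
secret_key = config.get('aws', 'aws_secret_access_key')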
def run(self):
    """Run the Hive job on the EMR cluster."""
    # copy the data source to a new object
    # (Hive deletes/moves the original)
    copy_s3_file(self.input_path, self.data_path)

    # and create the hive script
    self._generate_and_upload_hive_script()

    logger.info("Waiting {} seconds for S3 eventual consistency".format(
        self.s3_sync_wait_time))
    time.sleep(self.s3_sync_wait_time)

    # TODO more options like setting aws region
    conn = EmrConnection(self.aws_access_key_id,
                         self.aws_secret_access_key)

    setup_step = InstallHiveStep(self.hive_version)
    run_step = HiveStep(self.job_name, self.script_path)

    cluster_id = conn.run_jobflow(
        self.job_name,
        self.log_path,
        action_on_failure='CANCEL_AND_WAIT',
        master_instance_type=self.master_instance_type,
        slave_instance_type=self.slave_instance_type,
        ami_version=self.ami_version,
        num_instances=self.num_instances,
        job_flow_role=self.iam_instance_profile,
        service_role=self.iam_service_role)

    conn.add_jobflow_steps(cluster_id, [setup_step, run_step])

    logger.info("Job started on cluster {0}".format(cluster_id))
    self._wait_for_job_to_complete(conn, cluster_id)
    logger.info("Output file is in: {0}".format(self.output_path))
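# The connection and step classes used in run() come from classic boto;
# a minimal import sketch:
from boto.emr.connection import EmrConnection
from boto.emr.step import InstallHiveStep, HiveStep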
def __init__(self, user=EMR_USER, key=EMR_KEY):
    self.conn = EmrConnection(user, key)
                        PageviewsBySubredditAndPath, PageviewsByLanguage,
                        ClickthroughsByCodename, TargetedClickthroughsByCodename,
                        AdImpressionsByCodename, TargetedImpressionsByCodename)

RAW_LOG_DIR = g.RAW_LOG_DIR
PROCESSED_DIR = g.PROCESSED_DIR
AGGREGATE_DIR = g.AGGREGATE_DIR
AWS_LOG_DIR = g.AWS_LOG_DIR

# the "or None" business is so that a blank string becomes None to cause boto
# to look for credentials in other places.
s3_connection = S3Connection(g.TRAFFIC_ACCESS_KEY or None,
                             g.TRAFFIC_SECRET_KEY or None)
emr_connection = EmrConnection(g.TRAFFIC_ACCESS_KEY or None,
                               g.TRAFFIC_SECRET_KEY or None)

traffic_categories = (SitewidePageviews, PageviewsBySubreddit,
                      PageviewsBySubredditAndPath, PageviewsByLanguage,
                      ClickthroughsByCodename, TargetedClickthroughsByCodename,
                      AdImpressionsByCodename, TargetedImpressionsByCodename)

traffic_subdirectories = {
    SitewidePageviews: 'sitewide',
    PageviewsBySubreddit: 'subreddit',
    PageviewsBySubredditAndPath: 'srpath',
    PageviewsByLanguage: 'lang',
    ClickthroughsByCodename: 'clicks',
    TargetedClickthroughsByCodename: 'clicks_targeted',
    AdImpressionsByCodename: 'thing',
    TargetedImpressionsByCodename: 'thingtarget',
import os
import sys
import dateutil.parser
from dateutil import tz
from boto.emr.connection import EmrConnection
from boto.s3.connection import S3Connection
from ucsd_bigdata.credentials import Credentials
import gzip

if __name__ == "__main__":
    credentials = Credentials()
    aws_access_key_id = credentials.aws_access_key_id
    aws_secret_access_key = credentials.aws_secret_access_key

    emr_conn = EmrConnection(aws_access_key_id, aws_secret_access_key)

    # List EMR Clusters
    clusters = emr_conn.list_clusters(cluster_states=["RUNNING", "WAITING"])

    for index, cluster in enumerate(clusters.clusters):
        print "[%s] %s" % (index, cluster.id)

    # if there is a command line arg, use it for the cluster_id
    if len(sys.argv) > 1:
        cluster_id = sys.argv[1]
    else:
        if len(clusters.clusters) == 0:
            sys.exit("No EMR clusters running.")
        # raw_input (not input) so the selection is read as a string
        selected_cluster = raw_input("Select a Cluster: ")
        cluster_id = clusters.clusters[int(selected_cluster)].id
k = Key(b)
k.key = 'reducer.py'
k.set_contents_from_filename('/Users/winteram/Documents/Teaching/reducer.py')
k.close()

# <codecell>

for word in b.list():
    print word

# <codecell>

### Running code with EMR
# (credentials redacted -- substitute your own AWS keys)
emrcon = EmrConnection('<aws access key>', '<aws secret key>')

# <codecell>

# Using EMR's wordcount example
step = StreamingStep(
    name='My wordcount example',
    mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
    reducer='aggregate',
    input='s3n://elasticmapreduce/samples/wordcount/input',
    output='s3n://wambia660fall2013/output/wordcount_output')

# <codecell>

jobid = emrcon.run_jobflow(name='Word Count Example',
                           log_uri='s3://wambia660fall2013/logs',
def _emr_connect(self):
    """Connect to EMR."""
    self.emr_conn = EmrConnection(
        aws_access_key_id=self.access_key_id,
        aws_secret_access_key=self.secret_access_key)
def create_emr_cluster(cr):
    """
    @PARAM: Cluster configuration reader object

    Creates an EMR cluster given a set of configuration parameters

    Return: EMR Cluster ID
    """
    #region = cr.get_config("aws_region")
    #conn = boto.emr.connect_to_region(region)
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region=RegionInfo(name=cr.get_config("aws_region"),
                          endpoint=cr.get_config("aws_region") +
                          ".elasticmapreduce.amazonaws.com"))

    # Create list of instance groups: master, core, and task
    instance_groups = []
    instance_groups.append(
        InstanceGroup(num_instances=cr.get_config("emr_master_node_count"),
                      role="MASTER",
                      type=cr.get_config("emr_master_node_type"),
                      market=cr.get_config("emr_market_type"),
                      name="Master Node"))
    instance_groups.append(
        InstanceGroup(num_instances=cr.get_config("emr_core_node_count"),
                      role="CORE",
                      type=cr.get_config("emr_core_node_type"),
                      market=cr.get_config("emr_market_type"),
                      name="Core Node"))

    # Only create task nodes if specifically asked for
    if cr.get_config("emr_task_node_count") > 0:
        instance_groups.append(
            InstanceGroup(num_instances=cr.get_config("emr_task_node_count"),
                          role="TASK",
                          type=cr.get_config("emr_task_node_type"),
                          market=cr.get_config("emr_market_type"),
                          name="Task Node"))

    print "Creating EMR Cluster with instance groups: {0}".format(
        instance_groups)

    # Use these params to add overrides; these will go away in Boto3
    api_params = {
        "Instances.Ec2SubnetId": cr.get_config("aws_subnet_id"),
        "ReleaseLabel": cr.get_config("emr_version")
    }

    # Add step to load data
    step_args = [
        "s3-dist-cp",
        "--s3Endpoint=s3-us-west-1.amazonaws.com",
        "--src=s3://alpine-qa/automation/automation_test_data/",
        "--dest=hdfs:///automation_test_data",
        "--srcPattern=.*[a-zA-Z,]+"
    ]
    step = JarStep(name="s3distcp for data loading",
                   jar="command-runner.jar",
                   step_args=step_args,
                   action_on_failure="CONTINUE")

    cluster_id = conn.run_jobflow(
        cr.get_config("emr_cluster_name"),
        instance_groups=instance_groups,
        action_on_failure="TERMINATE_JOB_FLOW",
        keep_alive=True,
        enable_debugging=True,
        log_uri=cr.get_config("emr_log_uri"),
        #hadoop_version = "Amazon 2.7.2",
        #ReleaseLabel = "emr-5.0.0",
        #ami_version = "5.0.0",
        steps=[step],
        bootstrap_actions=[],
        ec2_keyname=cr.get_config("ec2_keyname"),
        visible_to_all_users=True,
        job_flow_role="EMR_EC2_DefaultRole",
        service_role="EMR_DefaultRole",
        api_params=api_params)

    print "EMR Cluster created, cluster id: {0}".format(cluster_id)

    state = conn.describe_cluster(cluster_id).status.state
    while state not in (u'COMPLETED', u'SHUTTING_DOWN', u'FAILED', u'WAITING'):
        # sleeping to recheck for status
        time.sleep(5)
        state = conn.describe_cluster(cluster_id).status.state
        print "State is: {0}, sleeping 5s...".format(state)

    if state == u'SHUTTING_DOWN' or state == u'FAILED':
        return "ERROR"

    # Check if the state is WAITING, then launch the next steps
    if state == u'WAITING':
        # Find the master node DNS of the EMR cluster
        master_dns = conn.describe_cluster(cluster_id).masterpublicdnsname
        print "DNS Name: {0}".format(master_dns)
        return cluster_id
from boto.emr.connection import EmrConnection
from boto.emr.step import InstallPigStep, PigStep

AWS_ACCESS_KEY = ''  # REQUIRED
AWS_SECRET_KEY = ''  # REQUIRED
conn = EmrConnection(AWS_ACCESS_KEY, AWS_SECRET_KEY)

pig_file = 's3://elasticmapreduce/samples/pig-apache/do-reports2.pig'
INPUT = 's3://elasticmapreduce/samples/pig-apache/input/access_log_1'
OUTPUT = ''  # REQUIRED, S3 bucket for job output

pig_args = ['-p', 'INPUT=%s' % INPUT,
            '-p', 'OUTPUT=%s' % OUTPUT]
pig_step = PigStep('Process Reports', pig_file, pig_args=pig_args)
steps = [InstallPigStep(), pig_step]

conn.run_jobflow(name='report test', steps=steps,
                 hadoop_version='0.20.205', ami_version='latest',
                 num_instances=2, keep_alive=False)
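# Hedged variant of the launch above: run_jobflow() returns the job flow
# id, which can be polled with describe_jobflow() until the flow reaches
# a terminal state. A sketch:
import time

jobflow_id = conn.run_jobflow(name='report test', steps=steps,
                              hadoop_version='0.20.205',
                              ami_version='latest',
                              num_instances=2, keep_alive=False)
while conn.describe_jobflow(jobflow_id).state not in ('COMPLETED',
                                                      'FAILED',
                                                      'TERMINATED'):
    time.sleep(30)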
def create_data_source_variable(cluster_id, cr):
    """
    Creates a data source variable .json file using the cluster_id of an
    EMR cluster

    @PARAM: cluster_id: ID of an EMR cluster

    return: True if success, creates a file in the pwd 'default_emr.json'

    Object created should look like:

    HADOOP_DATA_SOURCE_NAME="emr_data_source"
    HADOOP_DATA_SOURCE_DISTRO="Cloudera CDH5.4-5.7"
    HADOOP_DATA_SOURCE_HOST="emr_master_dns_hostname"
    HADOOP_DATA_SOURCE_PORT=8020
    HADOOP_DATA_SOURCE_USER="******"
    HADOOP_DATA_SOURCE_GROUP="hadoop"
    HADOOP_DATA_SOURCE_JT_HOST="emr_master_dns_hostname"
    HADOOP_DATA_SOURCE_JT_PORT=8032
    CONNECTION_PARAMETERS='[{"key":"mapreduce.jobhistory.address", "value":"0.0.0.0:10020"}, ' \
        '{"key":"mapreduce.jobhistory.webapp.address", "value":"cdh5hakerberosnn.alpinenow.local:19888"}, ' \
        '{"key":"yarn.app.mapreduce.am.staging-dir", "value":"/tmp/hadoop-yarn/staging"}, ' \
        '{"key":"yarn.resourcemanager.admin.address", "value":"cdh5hakerberosnn.alpinenow.local:8033"}, ' \
        '{"key":"yarn.resourcemanager.resource-tracker.address", "value":"cdh5hakerberosnn.alpinenow.local:8031"}, ' \
        '{"key":"yarn.resourcemanager.scheduler.address", "value":"cdh5hakerberosnn.alpinenow.local:8030"}]'
    """
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region=RegionInfo(name=cr.get_config("aws_region"),
                          endpoint=cr.get_config("aws_region") +
                          ".elasticmapreduce.amazonaws.com"))

    emr_cluster = conn.describe_cluster(cluster_id)
    master_dns_hostname = emr_cluster.masterpublicdnsname

    # Build up connection parameters
    conn_params = []
    conn_params.append({"key": "mapreduce.jobhistory.address",
                        "value": "{0}:10020".format(master_dns_hostname)})
    conn_params.append({"key": "mapreduce.jobhistory.webapp.address",
                        "value": "{0}:19888".format(master_dns_hostname)})
    conn_params.append({"key": "yarn.app.mapreduce.am.staging-dir",
                        "value": "/user"})
    conn_params.append({"key": "yarn.resourcemanager.admin.address",
                        "value": "{0}:8033".format(master_dns_hostname)})
    conn_params.append({"key": "yarn.resourcemanager.scheduler.address",
                        "value": "{0}:8030".format(master_dns_hostname)})

    conn_params_str = "CONNECTION_PARAMETERS=\"{0}\"".format(conn_params)
    email_str = "EMAIL=\"avalanche_{0}.alpinenow.com\"".format(
        random.randint(1, 99999))

    with open("emr_default.conf", "w") as f:
        f.writelines("HADOOP_DATA_SOURCE_NAME=\"{0}\"\n".format(
            cr.get_config("emr_cluster_name")))
        f.writelines("HADOOP_DATA_SOURCE_DISTRO=\"{0}\"\n".format("Amazon EMR5"))
        f.writelines("HADOOP_DATA_SOURCE_HOST=\"{0}\"\n".format(master_dns_hostname))
        f.writelines("HADOOP_DATA_SOURCE_PORT=\"8020\"\n")
        f.writelines("HADOOP_DATA_SOURCE_USER=\"hdfs\"\n")
        f.writelines("HADOOP_DATA_SOURCE_GROUP=\"hadoop\"\n")
        f.writelines("HADOOP_DATA_SOURCE_JT_HOST=\"{0}\"\n".format(master_dns_hostname))
        f.writelines("HADOOP_DATA_SOURCE_JT_PORT=\"8032\"\n")
        f.writelines(email_str)
        f.writelines(conn_params_str)
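# Illustrative end-to-end driver (a sketch) combining create_emr_cluster()
# and create_data_source_variable() above; cr is assumed to be the
# configuration reader passed in from the calling script.
import sys

cluster_id = create_emr_cluster(cr)
if cluster_id == "ERROR":
    sys.exit("EMR cluster failed to start")
create_data_source_variable(cluster_id, cr)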
from boto.emr.bootstrap_action import BootstrapAction
from boto.emr.connection import EmrConnection

# Description:
# BootstrapAction is an object representing a bootstrap action in Elastic
# MapReduce (EMR): a script that gets run before the EMR job executes.

# initialize a bootstrap action
bootstrapSetup = BootstrapAction("Bootstrap Name",
                                 "s3://<my-bucket>/<my-bootstrap-action>",
                                 ["arg1=hello", "arg2=world"])

# initialize emr connection
emr_job = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")

# run emr job flow with the defined bootstrap action
# (run_jobflow also requires a name for the flow)
emr_job.run_jobflow(name="Example job flow",
                    bootstrap_actions=[bootstrapSetup])
from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep
import boto

# (credentials redacted -- substitute your own AWS keys)
AWS_KEY = '<aws-access-key-id>'
AWS_SECRET = '<aws-secret-access-key>'

conn = EmrConnection(AWS_KEY, AWS_SECRET)

step = StreamingStep(name='My wordcount example',
                     mapper='s3n://css739/wordcount/bigramSplitter.py',
                     reducer='aggregate',
                     input='s3n://smalldata/wikipedia_titles.txt',
                     output='s3n://css739/wordcount/bigram_count_output2',
                     cache_files=['s3n://css739/wordcount/english_stoplist.py'])

jobid = conn.run_jobflow(name='My jobflow',
                         log_uri='s3n://css739/wordcount/jobflow_logs',
                         steps=[step])

conn.describe_jobflow(jobid).state
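# Hedged polling sketch: instead of checking the state once, re-check
# every 30 seconds until the flow reaches a terminal state.
import time

while conn.describe_jobflow(jobid).state not in ('COMPLETED', 'FAILED',
                                                 'TERMINATED'):
    time.sleep(30)
print conn.describe_jobflow(jobid).state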
def post(self):
    if not boto.config.has_section('Boto'):
        boto.config.add_section('Boto')
    boto.config.set('Boto', 'https_validate_certificates', 'False')

    note = ''
    data_para = [0, 0, 0, 0, 0]

    s3_connection = S3Connection(access_id, access_key)
    bucket = s3_connection.get_bucket('bucket774')
    k = Key(bucket)
    k.key = 'temp_para.json'
    temp_para = json.loads(k.get_contents_as_string())

    if (temp_para[6] == 1):
        k.key = 'cluster_id'
        cluster_id = k.get_contents_as_string()
        conn = EmrConnection(access_id, access_key)
        if (temp_para[7] == 0):
            status = conn.describe_cluster(cluster_id)
            if (status.status.state == 'WAITING'):
                PYdata = get_output()
                conn.terminate_jobflow(cluster_id)
                data = in_circle_to_pi(PYdata, temp_para[0])
                k.key = 'temp_para.json'
                temp_para[6] = 0
                k.set_contents_from_string(json.dumps(temp_para))
                data_para[0:4] = temp_para[0:4]
                data_para[4] = json.loads(data)[-1]
                note = 'last emr job done, result has been updated'
                save_result(data, json.dumps(data_para))
            else:
                note = "last emr calculation hasn't finished, please wait."
                k.key = 'record.json'
                data = k.get_contents_as_string()
                k.key = 'record_para.json'
                data_para_json = k.get_contents_as_string()
                data_para = json.loads(data_para_json)
        elif (temp_para[7] == 1):
            status = conn.describe_cluster(cluster_id)
            if (status.status.state == 'WAITING'):
                k.key = 'temp_data.json'
                PYdata = np.array(json.loads(k.get_contents_as_string()))
                PYdata += get_output()
                if (round(np.sum(PYdata) / (temp_para[3] * temp_para[5]),
                          temp_para[4]) == round(math.pi, temp_para[4])):
                    for i in range(1, len(PYdata)):
                        PYdata[i] += PYdata[i - 1]
                        PYdata[i - 1] /= temp_para[0] * (i) * temp_para[5]
                    PYdata[len(PYdata) - 1] /= temp_para[0] * len(PYdata) * temp_para[5]
                    data = json.dumps(PYdata.tolist())  # convert numpy array to list
                    k.key = 'temp_para.json'
                    temp_para[6] = 0
                    k.set_contents_from_string(json.dumps(temp_para))
                    data_para[0:4] = temp_para[0:4]
                    data_para[4] = json.loads(data)[-1]
                    conn.terminate_jobflow(cluster_id)
                    note = 'last emr job done, result has been updated'
                    save_result(data, json.dumps(data_para))
                else:
                    note = str(np.sum(PYdata)) + ',' + str(temp_para[3]) + \
                        ',' + str(temp_para[5])
                    add_step_emr(conn, cluster_id)
                    save_temp_result(PYdata)
                    for key in bucket.list(prefix='output/'):
                        key.delete()
                    temp_para[5] += 1
                    k.key = 'temp_para.json'
                    k.set_contents_from_string(json.dumps(temp_para))
                    #note = "didn't reach the given accuracy in the last run, keep working"
                k.key = 'record.json'
                data = k.get_contents_as_string()
                k.key = 'record_para.json'
                data_para_json = k.get_contents_as_string()
                data_para = json.loads(data_para_json)
            else:
                note = "last emr calculation hasn't finished, please wait."
                k.key = 'record.json'
                data = k.get_contents_as_string()
                k.key = 'record_para.json'
                data_para_json = k.get_contents_as_string()
                data_para = json.loads(data_para_json)
    else:
        k.key = 'record.json'
        data = k.get_contents_as_string()
        k.key = 'record_para.json'
        data_para_json = k.get_contents_as_string()
        data_para = json.loads(data_para_json)

    doRender(self, 'chart.htm', {
        'Data': data,
        'shots_each_threat': data_para[0],
        'R': data_para[1],
        'Q': data_para[2],
        'pi': math.pi,
        'shots': data_para[3],
        'result': data_para[4],
        'note': note
    })