def create_emr(R):
    if not boto.config.has_section('Boto'):
        boto.config.add_section('Boto')
    boto.config.set('Boto', 'https_validate_certificates', 'False')
    step = StreamingStep(name='MC_Method example',
                         cache_files=['s3n://bucket774/map.py#map.py'],
                         mapper='map.py',
                         input='s3://bucket774/input/',
                         output='s3://bucket774/output/')
    conn = EmrConnection(access_id, access_key)
    instance_groups = []
    instance_groups.append(
        InstanceGroup(num_instances=1,
                      role="MASTER",
                      type='m4.large',
                      market="ON_DEMAND",
                      name="Master nodes"))
    if R > 1:
        instance_groups.append(
            InstanceGroup(num_instances=R - 1,
                          role="CORE",
                          type='m4.large',
                          market="ON_DEMAND",
                          name="Slave nodes"))
    cluster_id = conn.run_jobflow(name='test MC_method run',
                                  instance_groups=instance_groups,
                                  enable_debugging=False,
                                  steps=[step],
                                  visible_to_all_users=True,
                                  keep_alive=True,
                                  job_flow_role="EMR_EC2_DefaultRole",
                                  service_role="EMR_DefaultRole",
                                  hadoop_version='2.4.0',
                                  log_uri='s3://bucket774/log')
    return cluster_id, conn
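A minimal driver sketch for the example above, assuming the module-level names it relies on (access_id, access_key and the boto imports) look roughly like this; the credential strings are placeholders:

import boto
from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep
from boto.emr.instance_group import InstanceGroup

access_id = 'YOUR_AWS_ACCESS_KEY_ID'        # placeholder
access_key = 'YOUR_AWS_SECRET_ACCESS_KEY'   # placeholder

if __name__ == '__main__':
    # Launch a 4-instance cluster (1 master + 3 core) and print its job flow id.
    cluster_id, conn = create_emr(4)
    print(cluster_id)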
Example #2
def add_steps(cluster_id, key):
	try:
		emr_connection = EmrConnection()
		emr_connection.add_jobflow_steps(cluster_id, get_steps(key, key))
		return True
	except Exception:
		return False
Example #3
def start_hadoop_cluster(nodenum):
	try:
		hadoop_params = ['-m','mapred.tasktracker.map.tasks.maximum=1',
		          '-m', 'mapred.child.java.opts=-Xmx10g']
		configure_hadoop_action = BootstrapAction('configure_hadoop', 's3://elasticmapreduce/bootstrap-actions/configure-hadoop', hadoop_params)

		emr_connection = EmrConnection()
		bucket_name = "udk-bucket"
		steps = []
		copy_jar_step = JarStep(name='copy-jar',
			jar='s3n://' + bucket_name + '/copy-to-hdfs.jar',
			step_args=['s3n://' + bucket_name + '/pipeline.pear',
				'/mnt/pipeline.pear'])
		steps.append(copy_jar_step)

		jobflow_id = emr_connection.run_jobflow(name='udk',
			log_uri='s3://udk-bucket/jobflow_logs',
			master_instance_type='m2.xlarge',
			slave_instance_type='m2.xlarge',
			num_instances=nodenum,
			keep_alive=True,
			enable_debugging=False,
			bootstrap_actions=[configure_hadoop_action],
			hadoop_version='1.0.3',
			steps=steps)
		emr_connection.set_termination_protection(jobflow_id, True)
		
		return jobflow_id
	except Exception:
		return "none"
Example #4
    def run(self):
        """Run the Hive job on EMR cluster
        """
        #  copy the data source to a new object
        #  (Hive deletes/moves the original)
        copy_s3_file(self.input_path, self.data_path)

        # and create the hive script
        self._generate_and_upload_hive_script()

        logger.info("Waiting {} seconds for S3 eventual consistency".format(
                    self.s3_sync_wait_time))
        time.sleep(self.s3_sync_wait_time)

        # TODO more options like setting aws region
        conn = EmrConnection(self.aws_access_key_id,
                             self.aws_secret_access_key)

        setup_step = InstallHiveStep(self.hive_version)
        run_step = HiveStep(self.job_name, self.script_path)

        jobid = conn.run_jobflow(
            self.job_name,
            self.log_path,
            action_on_failure='CANCEL_AND_WAIT',
            master_instance_type=self.master_instance_type,
            slave_instance_type=self.slave_instance_type,
            ami_version=self.ami_version,
            num_instances=self.num_instances)

        conn.add_jobflow_steps(jobid, [setup_step, run_step])

        self._wait_for_job_to_complete(conn, jobid)

        logger.info("Output file is in: {0}".format(self.output_path))
Example #5
    def __init__(self,
                 team_id,
                 access_key,
                 secret_key,
                 bucket='cs144students'):
        """
        (constructor)

        Creates a new instance of the Rankmaniac class for a specific
        team using the provided credentials.

        Arguments:
            team_id       <str>     the team identifier, which may
                                    differ slightly from the actual team
                                    name.

            access_key    <str>     the AWS access key identifier.
            secret_key    <str>     the AWS secret access key.

        Keyword arguments:
            bucket        <str>     the S3 bucket name.
        """

        region = RegionInfo(None, self.DefaultRegionName,
                            self.DefaultRegionEndpoint)

        self._s3_bucket = bucket
        self._s3_conn = S3Connection(access_key, secret_key)
        self._emr_conn = EmrConnection(access_key, secret_key, region=region)

        self.team_id = team_id
        self.job_id = None

        self._reset()
        self._num_instances = 1
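A hedged construction sketch; DefaultRegionName and DefaultRegionEndpoint are class attributes defined elsewhere in this class (see the fuller listing in Example #30 below), and the credentials are placeholders:

rm = Rankmaniac('team42', 'ACCESS_KEY_ID', 'SECRET_ACCESS_KEY', bucket='cs144students')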
Example #6
    def __init__(self, prop):
        '''Constructor, initialize EMR connection.'''
        self.prop = prop
        self.conn = EmrConnection(self.prop.ec2.key, self.prop.ec2.secret)
        self.jobid = None
        self.retry = 0
        self.level = 0
        self.last_update = -1
Example #7
def get_cluster_status(cluster_id):
	try:
		emr_connection = EmrConnection()
		flow = emr_connection.describe_jobflow(cluster_id)
		if flow is None:
			return "none"
		return flow.state
	except Exception:
		return "none"
Example #8
def terminate(cluster_id):
	try:
		emr_connection = EmrConnection()
		emr_connection.set_termination_protection(cluster_id, False)
		emr_connection.terminate_jobflow(cluster_id)
		return True
	except Exception as e:
		print e
		return False
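A hedged sketch tying the helpers above together; EmrConnection() with no arguments uses boto's configured credentials, and the cluster id is a placeholder:

cluster_id = 'j-XXXXXXXXXXXXX'          # placeholder job flow id
state = get_cluster_status(cluster_id)
if state in ('WAITING', 'RUNNING'):
    terminate(cluster_id)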
Example #9
def create_data_source_variable(cluster_id, cr):
    """
    Creates a data source variable file using the cluster_id of an EMR cluster.
    @PARAM:  cluster_id:  ID of an EMR cluster
    return:  writes the file 'emr_default.conf' to the current working directory

    Object created should look like:

    HADOOP_DATA_SOURCE_NAME="emr_data_source"
    HADOOP_DATA_SOURCE_DISTRO="Cloudera CDH5.4-5.7"
    HADOOP_DATA_SOURCE_HOST="emr_master_dns_hostname"
    HADOOP_DATA_SOURCE_PORT=8020
    HADOOP_DATA_SOURCE_USER="******"
    HADOOP_DATA_SOURCE_GROUP="hadoop"
    HADOOP_DATA_SOURCE_JT_HOST="emr_master_dns_hostname"
    HADOOP_DATA_SOURCE_JT_PORT=8032
    CONNECTION_PARAMETERS='[{"key":"mapreduce.jobhistory.address", "value":"0.0.0.0:10020"}, ' \
                            '{"key":"mapreduce.jobhistory.webapp.address", "value":"cdh5hakerberosnn.alpinenow.local:19888"}, ' \
                            '{"key":"yarn.app.mapreduce.am.staging-dir", "value":"/tmp/hadoop-yarn/staging"}, ' \
                            '{"key":"yarn.resourcemanager.admin.address", "value":"cdh5hakerberosnn.alpinenow.local:8033"}, ' \
                            '{"key":"yarn.resourcemanager.resource-tracker.address", "value":"cdh5hakerberosnn.alpinenow.local:8031"}, ' \
                            '{"key":"yarn.resourcemanager.scheduler.address", "value":"cdh5hakerberosnn.alpinenow.local:8030"}]'

    """
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region = RegionInfo(name = cr.get_config("aws_region"),
            endpoint = cr.get_config("aws_region") + ".elasticmapreduce.amazonaws.com" ))

    emr_cluster = conn.describe_cluster(cluster_id)
    master_dns_hostname = emr_cluster.masterpublicdnsname

    # Build up connection parameters
    conn_params = []
    conn_params.append({"key": "mapreduce.jobhistory.address", "value": "{0}:10020".format(master_dns_hostname)})
    conn_params.append({"key": "mapreduce.jobhistory.webapp.address", "value": "{0}:19888".format(master_dns_hostname)})
    conn_params.append({"key": "yarn.app.mapreduce.am.staging-dir", "value": "/user"})
    conn_params.append({"key": "yarn.resourcemanager.admin.address", "value": "{0}:8033".format(master_dns_hostname)})
    conn_params.append({"key": "yarn.resourcemanager.scheduler.address", "value": "{0}:8030".format(master_dns_hostname)})
    conn_params_str = "CONNECTION_PARAMETERS=\"{0}\"".format(conn_params)
    email_str = "EMAIL=\"avalanche_{0}.alpinenow.com\"".format(random.randint(1,99999))

    with open("emr_default.conf", "w") as f:
        f.writelines("HADOOP_DATA_SOURCE_NAME=\"{0}\"\n".format(cr.get_config("emr_cluster_name")))
        f.writelines("HADOOP_DATA_SOURCE_DISTRO=\"{0}\"\n".format("Amazon EMR5"))
        f.writelines("HADOOP_DATA_SOURCE_HOST=\"{0}\"\n".format(master_dns_hostname))
        f.writelines("HADOOP_DATA_SOURCE_POST=\"8020\"\n")
        f.writelines("HADOOP_DATA_SOURCE_USER=\"hdfs\"\n")
        f.writelines("HADOOP_DATA_SOURCE_GROUP=\"hadoop\"\n")
        f.writelines("HADOOP_DATA_SOURCE_JT_HOST=\"{0}\"\n".format(master_dns_hostname))
        f.writelines("HADOOP_DATA_SOURCE_JT_PORT=\"8032\"\n")
        f.writelines(email_str + "\n")
        f.writelines(conn_params_str + "\n")
Example #10
def get_job_flow_objects(conf_path, max_days_ago=None, now=None):
    """Get relevant job flow information from EMR.

    Args:
        conf_path: is a string that is either None or has an alternate
            path to load the configuration file.

        max_days_ago: A float where if set, dont fetch job flows created
            longer than this many days ago.

        now: the current UTC time as a datetime.datetime object.
            defaults to the current time.
    Returns:
        job_flows: A list of boto job flow objects.
    """
    if now is None:
        now = datetime.datetime.utcnow()
    emr_conn = EmrConnection()
    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - datetime.timedelta(days=max_days_ago)

    return describe_all_job_flows(emr_conn, created_after=created_after)
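A hedged usage sketch; describe_all_job_flows is assumed to come from the surrounding project (mrjob ships a helper of the same name), and EmrConnection() relies on boto's credential discovery:

job_flows = get_job_flow_objects(conf_path=None, max_days_ago=2)
for jf in job_flows:
    print "%s  %s" % (jf.jobflowid, jf.state)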
Example #11
    def __init__(self, team_id, access_key, secret_key,
                 bucket='cs144students'):
        """
        (constructor)

        Creates a new instance of the Rankmaniac class for a specific
        team using the provided credentials.

        Arguments:
            team_id       <str>     the team identifier, which may
                                    differ slightly from the actual team
                                    name.

            access_key    <str>     the AWS access key identifier.
            secret_key    <str>     the AWS secret access key.

        Keyword arguments:
            bucket        <str>     the S3 bucket name.
        """

        region = RegionInfo(None, self.DefaultRegionName,
                            self.DefaultRegionEndpoint)

        self._s3_bucket = bucket
        self._s3_conn = S3Connection(access_key, secret_key)
        self._emr_conn = EmrConnection(access_key, secret_key, region=region)

        self.team_id = team_id
        self.job_id = None

        self._reset()
        self._num_instances = 1
Example #12
    def __init__(self, prop):
        '''Constructor, initialize EMR connection.'''
        self.prop = prop
        self.conn = EmrConnection(self.prop.ec2.key, self.prop.ec2.secret)
        self.jobid = None
        self.retry = 0
        self.level = 0
        self.last_update = -1
Example #13
    def __init__(self, team_id, access_key, secret_key):
        '''Rankmaniac class constructor

        Creates a new instance of the Rankmaniac Wrapper for a specific
        team.

        Arguments:
            team_id         string      the team ID.
            access_key      string      AWS access key.
            secret_key      string      AWS secret key.
        '''

        self.s3_bucket = 'cs144caltech'

        self.team_id = team_id
        self.emr_conn = EmrConnection(access_key, secret_key)
        self.s3_conn = S3Connection(access_key, secret_key)
        self.job_id = None
Example #14
def get_internal_ips_from_emr(cluster_id, cr):
    """
    Retrieves a list of internal IP addresses for a given EMR cluster
    """

    #  Open connection to EMR
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region = RegionInfo(name = cr.get_config("aws_region"),
            endpoint = cr.get_config("aws_region") + ".elasticmapreduce.amazonaws.com" ))

    #  Build list of internal ips from list_instances EMR API
    emr_internal_ips = []
    emr_instances = conn.list_instances(cluster_id).instances
    for instance in emr_instances:
        emr_internal_ips.append(instance.privateipaddress)

    return emr_internal_ips
Example #15
def get_internal_ips_from_emr(cluster_id, cr):
    """
    Retrieves a list of internal IP addresses for a given EMR cluster
    """

    #  Open connection to EMR
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region=RegionInfo(name=cr.get_config("aws_region"),
                          endpoint=cr.get_config("aws_region") +
                          ".elasticmapreduce.amazonaws.com"))

    #  Build list of internal ips from list_instances EMR API
    emr_internal_ips = []
    emr_instances = conn.list_instances(cluster_id).instances
    for instance in emr_instances:
        emr_internal_ips.append(instance.privateipaddress)

    return emr_internal_ips
Example #16
    def __init__(self, parameters):
        try: 
            self.region_name = parameters["region_name"]
            self.access_key = parameters["access_key"]
            self.secret_key = parameters["secret_key"]
            self.ec2_keypair_name = parameters["ec2_keypair_name"]
            self.base_bucket = parameters["base_bucket"]
            self.log_dir = parameters["log_dir"]
            self.emr_status_wait = parameters["emr_status_wait"]
            self.step_status_wait = parameters["step_status_wait"]
            self.emr_cluster_name = parameters["emr_cluster_name"]
        except KeyError as e:
            logging.error("EmrManager initialization failed: missing parameter %s" % e)
            sys.exit(1)

        # Establishing EmrConnection
        self.connection = EmrConnection(self.access_key, self.secret_key,
                             region=RegionInfo(name=self.region_name,
                             endpoint=self.region_name + '.elasticmapreduce.amazonaws.com'))

        self.log_bucket_name = self.base_bucket + self.log_dir
Example #17
    def __init__(self,
                 region_name='us-east-1',
                 aws_access_key_id=None,
                 aws_secret_access_key=None):

        # If the access key is not specified, get it from the luigi config.cfg file
        if not aws_access_key_id:
            aws_access_key_id = luigi.configuration.get_config().get(
                'aws', 'aws_access_key_id')

        if not aws_secret_access_key:
            aws_secret_access_key = luigi.configuration.get_config().get(
                'aws', 'aws_secret_access_key')

        # Create the region in which to run
        region_endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
        region = RegionInfo(name=region_name, endpoint=region_endpoint)

        self.emr_connection = EmrConnection(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region=region)
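A hedged sketch of constructing the client above; the class name EmrClient matches the fuller listing in Example #34, and with no explicit keys the constructor falls back to the [aws] section of the luigi configuration:

client = EmrClient(region_name='us-west-2')
print(client.emr_connection)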
Example #18
    def run(self):
        """Run the Hive job on EMR cluster
        """
        #  copy the data source to a new object
        #  (Hive deletes/moves the original)
        copy_s3_file(self.input_path, self.data_path)

        # and create the hive script
        self._generate_and_upload_hive_script()

        logger.info("Waiting {} seconds for S3 eventual consistency".format(
            self.s3_sync_wait_time))
        time.sleep(self.s3_sync_wait_time)

        # TODO more options like setting aws region
        conn = EmrConnection(self.aws_access_key_id,
                             self.aws_secret_access_key)

        setup_step = InstallHiveStep(self.hive_version)
        run_step = HiveStep(self.job_name, self.script_path)

        cluster_id = conn.run_jobflow(
            self.job_name,
            self.log_path,
            action_on_failure='CANCEL_AND_WAIT',
            master_instance_type=self.master_instance_type,
            slave_instance_type=self.slave_instance_type,
            ami_version=self.ami_version,
            num_instances=self.num_instances,
            job_flow_role=self.iam_instance_profile,
            service_role=self.iam_service_role)

        conn.add_jobflow_steps(cluster_id, [setup_step, run_step])

        logger.info("Job started on cluster {0}".format(cluster_id))

        self._wait_for_job_to_complete(conn, cluster_id)

        logger.info("Output file is in: {0}".format(self.output_path))
Example #19
    def __init__(self, spec_filename="spec.json"):
        import boto
        from boto.emr.connection import EmrConnection, RegionInfo

        super(HiveRuntime, self).__init__(spec_filename)
        p = self.settings.Param
        self.s3_conn = boto.connect_s3(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET)
        self.s3_bucket = self.s3_conn.get_bucket(p.S3_BUCKET)
        self.region = p.AWS_Region
        self.emr_conn = EmrConnection(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET,
                region = RegionInfo(name = self.region,
                    endpoint = self.region + '.elasticmapreduce.amazonaws.com'))
        self.job_flow_id = p.EMR_jobFlowId
Example #20
    def __init__(self, spec_filename="spec.json"):
        import boto
        from boto.emr.connection import EmrConnection, RegionInfo

        super(EmrRuntime, self).__init__(spec_filename)
        p = self.settings.Param
        self.s3_conn = boto.connect_s3(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET)
        self.s3_bucket = self.s3_conn.get_bucket(p.S3_BUCKET)
        self.region = p.AWS_Region
        self.emr_conn = EmrConnection(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET,
                                      region = RegionInfo(name = self.region,
                                                          endpoint = self.region + '.elasticmapreduce.amazonaws.com'))
        self.job_flow_id = p.EMR_jobFlowId
Example #21
class EMR:
    def creating_a_connection(self):
        #Creating a connection
        from boto.emr.connection import EmrConnection
        self.conn = EmrConnection('', '')

    def creating_streaming_job(self):
        #Creating Streaming JobFlow Steps
        from boto.emr.step import StreamingStep
        self.step = StreamingStep(name='my bigdata task',
            mapper='s3n://eth-src/raw_to_stations.py',
            #mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
            reducer='s3n://eth-src/stations_to_features.py',
            #reducer='aggregate',
            input='s3n://eth-input/2007.csv',
            #input='s3n://elasticmapreduce/samples/wordcount/input',
            output='s3n://eth-middle/2007')

    def creating_jobflows(self):
        #Creating JobFlows
        #import boto.emr
        #self.conn = boto.emr.connect_to_region('eu-west-1')
        job_id = self.conn.run_jobflow(name='My jobflow',
                log_uri='s3://eth-log/jobflow_logs',
                master_instance_type='m3.xlarge',
                slave_instance_type='m1.large',
                num_instances=2,
                steps=[self.step],
                ami_version='3.3.1'
                )

        status = self.conn.describe_jobflow(job_id)
        status.state

    def terminating_jobflows(self, job_id):
        #Terminating JobFlows
        #self.conn = boto.emr.connect_to_region('eu-west-1')
        self.conn.terminate_jobflow(job_id)
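A minimal driver for the helper class above; the empty-string credentials inside creating_a_connection are placeholders that must be replaced with real keys:

emr = EMR()
emr.creating_a_connection()
emr.creating_streaming_job()
emr.creating_jobflows()   # launches the job flow with the streaming step above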
Example #22
def main(argv):

  # load the config
  config = ConfigParser()
  config.read(os.path.join(os.path.split(argv[0])[0], 'config.ini'))

  # load AWS config
  awsConfig = ConfigParser()
  awsConfig.read(config.get('Common','aws'))

  aws_access_key = awsConfig.get('AWS','aws_access_key')
  aws_secret_key = awsConfig.get('AWS','aws_secret_key')
  event_bucket = awsConfig.get('AWS','event_bucket')
  output_bucket = awsConfig.get('AWS','emr_output_bucket')
  script_bucket = awsConfig.get('AWS','script_bucket')
  
  jobId = argv[1]

  emrConnection = EmrConnection(aws_access_key, aws_secret_key)

  s3Connection = S3Connection(aws_access_key, aws_secret_key)

  # clean s3 output
  bucket = s3Connection.get_bucket(output_bucket)
  for key in bucket.get_all_keys(prefix=BUCKET_KEY):
    bucket.delete_key(key)

  step = StreamingStep(name='Foursquare event deduper',
                      mapper='s3://%s/dedup_mapper.py foursquare' % script_bucket,
                      reducer='s3://%s/dedup_reducer.py' % script_bucket,
                      input='s3://%s/normalized' % event_bucket,
                      output='s3://%s/%s' % (output_bucket,BUCKET_KEY),
                      action_on_failure='CONTINUE')

  emrConnection.add_jobflow_steps(jobId, step)

  print 'Successfully started streaming steps'
Example #23
File: emr.py  Project: ainestal/magpie
class EMRInventory():
    def __init__(self, region='eu-west-1'):
        regionEMR = self.get_emr_region(region)
        self.emrConnection = EmrConnection(region=regionEMR)

    def list_current_resources(self, region='eu-west-1'):
        jobFlows = self.emrConnection.describe_jobflows()
        for jobFlow in jobFlows:
            print jobFlow.jobflowid

    def get_emr_region(self, region='eu-west-1'):
        regionEndpoint = '%s.elasticmapreduce.amazonaws.com' % region
        regionEMR = RegionInfo (name=region,
                                endpoint=regionEndpoint)
        return regionEMR
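A hedged usage sketch; EmrConnection(region=...) with no access keys relies on boto's credential discovery (environment variables or a boto config file):

inventory = EMRInventory(region='eu-west-1')
inventory.list_current_resources()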
Example #24
class EmrJarRuntime(ZetRuntime):
    def __init__(self, spec_filename="spec.json"):
        import boto
        from boto.emr.connection import EmrConnection, RegionInfo

        # super(ZetRuntime, self).__init__()
        # TODO
        self.settings = get_settings_from_file(spec_filename)

        p = self.settings.Param
        self.s3_conn = boto.connect_s3(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET)
        self.s3_bucket = self.s3_conn.get_bucket(p.S3_BUCKET)
        self.region = p.AWS_Region
        self.emr_conn = EmrConnection(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET,
                region = RegionInfo(name = self.region,
                    endpoint = self.region + '.elasticmapreduce.amazonaws.com'))
        self.job_flow_id = p.EMR_jobFlowId

    def get_s3_working_dir(self, path=""):
        ps = self.settings
        glb_vars = ps.GlobalParam
        return os.path.join('zetjob', glb_vars['userName'], "job%s" % glb_vars['jobId'], "blk%s" % glb_vars['blockId'], path)

    def execute(self, jar_path, args):
        from boto.emr.step import JarStep

        s3_jar_path = s3_upload(self.s3_bucket, self.get_s3_working_dir(jar_path), jar_path)
        # s3_jar_path = "s3://run-jars/jar/mahout-core-1.0-SNAPSHOT-job.jar"
        print("Uploading jar to s3 : %s -> %s" % (jar_path, s3_jar_path))

        print("Add jobflow step")
        step = JarStep(name='cl_filter', jar=s3_jar_path, step_args=args)
        self.emr_conn.add_jobflow_steps(self.job_flow_id, steps=[step])

        print("Waiting jobflow step done")
        emr_wait_job(self.emr_conn, self.job_flow_id)
Example #25
File: emr_client.py  Project: mbrio/Luigi
    def __init__(self, region_name='us-east-1', aws_access_key_id=None, aws_secret_access_key=None):
 
        # If the access key is not specified, get it from the luigi config.cfg file
        if not aws_access_key_id:
            aws_access_key_id = luigi.configuration.get_config().get('aws', 'aws_access_key_id')
 
        if not aws_secret_access_key:
            aws_secret_access_key = luigi.configuration.get_config().get('aws', 'aws_secret_access_key')
 
 
        # Create the region in which to run
        region_endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
        region = RegionInfo(name=region_name, endpoint=region_endpoint)
 
        self.emr_connection = EmrConnection(aws_access_key_id=aws_access_key_id,
                                            aws_secret_access_key=aws_secret_access_key,
                                            region=region)
Example #26
    def __init__(self, team_id, access_key, secret_key):
        '''Rankmaniac class constructor

        Creates a new instance of the Rankmaniac Wrapper for a specific
        team.

        Arguments:
            team_id         string      the team ID.
            access_key      string      AWS access key.
            secret_key      string      AWS secret key.
        '''

        self.s3_bucket = 'cs144caltech'

        self.team_id = team_id
        self.emr_conn = EmrConnection(access_key, secret_key)
        self.s3_conn = S3Connection(access_key, secret_key)
        self.job_id = None
Example #27
    def __init__(self):
        try:
            self.zone_name = "ap-southeast-1"
            self.access_key = "xxxxxx"
            self.private_key = "xxxxxxx"
            self.ec2_keyname = "xxxxxxxx"
            self.base_bucket = "s3://emr-bucket/"
            self.bootstrap_script = "custom-bootstrap.sh"
            self.log_dir = "Logs"
            self.emr_status_wait = 20
            self.conn = ""
            self.cluster_name = "MyFirstEmrCluster"

            # Establishing EmrConnection
            self.conn = EmrConnection(self.access_key, self.private_key,
                                 region=RegionInfo(name=self.zone_name,
                                 endpoint=self.zone_name + '.elasticmapreduce.amazonaws.com'))


            self.log_bucket_name = self.base_bucket + self.log_dir
            self.bootstrap_script_name = self.base_bucket + self.bootstrap_script
Example #28
k = Key(b)
k.key = 'reducer.py'
k.set_contents_from_filename('/Users/winteram/Documents/Teaching/reducer.py')
k.close()

# <codecell>

for word in b.list():
    print word

# <codecell>

### Running code with EMR

emrcon = EmrConnection('<aws access key>', '<aws secret key>')

# <codecell>

# Using EMR's wordcount example
step = StreamingStep(
    name='My wordcount example',
    mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
    reducer='aggregate',
    input='s3n://elasticmapreduce/samples/wordcount/input',
    output='s3n://wambia660fall2013/output/wordcount_output')

# <codecell>

jobid = emrcon.run_jobflow(name='Word Count Example',
                           log_uri='s3://wambia660fall2013/logs',
                           steps=[step])
Example #29
class EMRCluster(object):
    '''Representation of an EMR cluster.

    TODO: add a bridge to the boto interface for unit tests.
    '''
    emr_status_delay = 10  # in sec
    emr_status_max_delay = 60  # in sec
    emr_status_max_error = 30  # number of errors
    emr_max_idle = 10 * 60  # 10 min (in sec)
    rate_limit_lock = RateLimitLock()

    def __init__(self, prop):
        '''Constructor, initialize EMR connection.'''
        self.prop = prop
        self.conn = EmrConnection(self.prop.ec2.key, self.prop.ec2.secret)
        self.jobid = None
        self.retry = 0
        self.level = 0
        self.last_update = -1

    @property
    def priority(self):
        '''The priority used in EMRManager.

        The lower the value, the higher the priority.
        '''
        with EMRCluster.rate_limit_lock:
            if self.jobid is None:
                return 1
            return 0

    def get_instance_groups(self):
        '''Get instance groups to start a cluster.

        The price is computed from self.level, which indicates how many
        price upgrades have been applied to the original price.
        '''
        instance_groups = []
        for group in self.prop.emr.instance_groups:
            (num, group_name, instance_type) = group
            level = max(0,
                        min(self.level,
                            len(self.prop.emr.price_upgrade_rate) -
                            1))  # 0 <= level < len(...)
            bprice = self.prop.emr.prices[
                instance_type] * self.prop.emr.price_upgrade_rate[level]
            name = '%s-%s@%f' % (group_name, 'SPOT', bprice)

            # Use on-demand instance if prices are zero.
            if bprice > 0:
                ig = InstanceGroup(num, group_name, instance_type, 'SPOT',
                                   name, '%.3f' % bprice)
            else:
                ig = InstanceGroup(num, group_name, instance_type, 'ON_DEMAND',
                                   name)

            instance_groups.append(ig)

        return instance_groups

    def get_bootstrap_actions(self):
        '''Get list of bootstrap actions from property'''
        actions = []
        for bootstrap_action in self.prop.emr.bootstrap_actions:
            assert len(bootstrap_action
                       ) >= 2, 'Wrong bootstrap action definition: ' + str(
                           bootstrap_action)
            actions.append(
                BootstrapAction(bootstrap_action[0], bootstrap_action[1],
                                bootstrap_action[2:]))
        return actions

    @synchronized
    def start(self):
        '''Start an EMR cluster.'''
        # emr.project_name is required
        if self.prop.emr.project_name is None:
            raise ValueError('emr.project_name is not set')

        self.last_update = time.time()
        with EMRCluster.rate_limit_lock:
            self.jobid = self.conn.run_jobflow(
                name=self.prop.emr.cluster_name,
                ec2_keyname=self.prop.emr.keyname,
                log_uri=self.prop.emr.log_uri,
                ami_version=self.prop.emr.ami_version,
                bootstrap_actions=self.get_bootstrap_actions(),
                keep_alive=True,
                action_on_failure='CONTINUE',
                api_params={'VisibleToAllUsers': 'true'},
                instance_groups=self.get_instance_groups())
        message('Job flow created: %s', self.jobid)

        # Tag EC2 instances to allow future analysis
        tags = {
            'FlowControl': 'Briefly',
            'Project': self.prop.emr.project_name
        }
        if self.prop.emr.tags is not None:
            assert isinstance(self.prop.emr.tags, dict)
            tags = dict(tags.items() + self.prop.emr.tags.items())
        self.conn.add_tags(self.jobid, tags)

    @synchronized
    def terminate(self, level_upgrade=0):
        '''Terminate this EMR cluster.'''
        if self.jobid is None:
            return

        self.level += level_upgrade  # upgrade to another price level

        message('Terminate jobflow: %s', self.jobid)
        for i in range(3):
            try:
                with EMRCluster.rate_limit_lock:
                    self.conn.terminate_jobflow(self.jobid)
                break
            except Exception as e:
                message('Unable to terminate job flow: %s', self.jobid)
                message(traceback.format_exc())
        # We have to set jobid as None to create new cluster;
        # otherwise, run_steps will keep launching jobs on the bad cluster.
        self.jobid = None

    def is_idle(self):
        '''Check whether this EMR cluster is idle.'''
        return (self.jobid is not None) and (
            (time.time() - self.last_update) > self.emr_max_idle)

    def get_steps(self, node):
        '''Get the jar step from the node.'''
        step = JarStep(name=node.config.sub(node.config.emr.step_name,
                                            node_hash=node.hash()),
                       main_class=node.config.main_class,
                       jar=node.config.hadoop.jar,
                       action_on_failure='CONTINUE',
                       step_args=node.process_args(*node.config.args))
        return [step]

    def get_step_index(self, step_id):
        '''Get the index of a step given step_id (1 based)'''
        steps = [
            step.id
            for step in reversed(self.conn.list_steps(self.jobid).steps)
            if step.status is not None
        ]

        # revert the index since latest step is on top of the list
        return steps.index(step_id) + 1

    def run_steps(self, node, wait=True):
        '''Main loop to execute a node.

        Blocks until the step completes or fails, and raises an
        exception on failure so that the step will be retried.
        TODO: add timeouts for each step?
        TODO: dynamically increase cluster size?
        '''
        if not self.jobid:
            self.start()

        try:
            with EMRCluster.rate_limit_lock:
                # Add a single step and get its step_id for the following checks.
                step_id = self.conn.add_jobflow_steps(
                    self.jobid, self.get_steps(node)).stepids[0].value
                assert step_id is not None
        except Exception as e:
            node.log('Unable to add jobflow steps: %s', node.hash())
            node.log('%s', traceback.format_exc())
            raise HadoopFailure()

        status_error_counter = 0
        step_status = 'PENDING'
        step_index = None
        step_start = time.time()

        # notify the node with status.
        node.notify_status('Running on EMR: %s', self.jobid)

        while wait and step_status in ['PENDING', 'RUNNING']:
            try:
                # wait first for the status turning to 'RUNNING' from 'WAITING'. Exponential delay for errors.
                # Cap delay to a predefined limit.
                delay = min(self.emr_status_delay * (2**status_error_counter),
                            self.emr_status_max_delay)
                time.sleep(delay)

                # Keep current cluster alive.
                self.last_update = time.time()

                # Get current cluster status. May raise exception due to EMR request throttle.
                cluster_state = self.conn.describe_cluster(
                    self.jobid).status.state

                if step_index is None:
                    step_index = self.get_step_index(step_id)
                    node.log('Step #: %d', step_index)
                    node.log('Log URI: %s/%s/steps/%d/',
                             node.config.emr.log_uri, self.jobid, step_index)

                step_status = self.conn.describe_step(self.jobid,
                                                      step_id).status.state
                status_error_counter = 0  # reset counter
                node.log("%s: %s %s", self.jobid, cluster_state, step_status)

                if cluster_state in [
                        'TERMINATING', 'TERMINATED', 'TERMINATED_WITH_ERRORS'
                ]:  # cluster kill (maybe due to spot price), upgrade.
                    self.terminate(1)
                    break

                if (
                        time.time() - step_start
                ) > node.config.emr.step_timeout:  # Step running too long? EMR cluster idle.
                    node.log('Step running too long. Restart with new cluster')
                    self.terminate()
                    break

            except KeyboardInterrupt:
                raise
            except Exception as e:
                node.log('EMR loop exception: %d error(s)',
                         status_error_counter)
                status_error_counter += 1
                if status_error_counter > self.emr_status_max_error:
                    self.terminate()
                    node.log('Too many errors in EMR loop')
                    node.log('Exception: %s', traceback.format_exc())
                    raise

        if step_status != 'COMPLETED':
            raise HadoopFailure()
Example #30
class Rankmaniac:
    """
    (wrapper class)

    This class presents a simple wrapper around the AWS SDK. It strives
    to provide all the functionality required to run map-reduce
    (Hadoop) on Amazon. This way the students do not need to worry about
    learning the API for Amazon S3 and EMR, and instead can focus on
    computing pagerank quickly!
    """

    DefaultRegionName = 'us-west-2'
    DefaultRegionEndpoint = 'elasticmapreduce.us-west-2.amazonaws.com'

    def __init__(self,
                 team_id,
                 access_key,
                 secret_key,
                 bucket='cs144students'):
        """
        (constructor)

        Creates a new instance of the Rankmaniac class for a specific
        team using the provided credentials.

        Arguments:
            team_id       <str>     the team identifier, which may
                                    differ slightly from the actual team
                                    name.

            access_key    <str>     the AWS access key identifier.
            secret_key    <str>     the AWS secret access key.

        Keyword arguments:
            bucket        <str>     the S3 bucket name.
        """

        region = RegionInfo(None, self.DefaultRegionName,
                            self.DefaultRegionEndpoint)

        self._s3_bucket = bucket
        self._s3_conn = S3Connection(access_key, secret_key)
        self._emr_conn = EmrConnection(access_key, secret_key, region=region)

        self.team_id = team_id
        self.job_id = None

        self._reset()
        self._num_instances = 1

    def _reset(self):
        """
        Resets the internal state of the job and submission.
        """

        self._iter_no = 0
        self._infile = None
        self._last_outdir = None

        self._last_process_step_iter_no = -1
        self._is_done = False

    def __del__(self):
        """
        (destructor)

        Terminates the map-reduce job if any, and closes the connections
        to Amazon S3 and EMR.
        """

        if self.job_id is not None:
            self.terminate()

        self._s3_conn.close()
        self._emr_conn.close()

    def __enter__(self):
        """
        Used for `with` syntax. Simply returns this instance since the
        set-up has all been done in the constructor.
        """

        return self

    def __exit__(self, type, value, traceback):
        """
        Refer to __del__().
        """

        self.__del__()
        return False  # do not swallow any exceptions

    def upload(self, indir='data'):
        """
        Uploads the local data to Amazon S3 under the configured bucket
        and key prefix (the team identifier). This way the code can be
        accessed by Amazon EMR to compute pagerank.

        Keyword arguments:
            indir       <str>       the base directory from which to
                                    upload contents.

        Special notes:
            This method only uploads **files** in the specified
            directory. It does not scan through subdirectories.

            WARNING! This method removes all previous (or ongoing)
            submission results, so it is unsafe to call while a job is
            already running (and possibly started elsewhere).
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        bucket = self._s3_conn.get_bucket(self._s3_bucket)

        # Clear out current bucket contents for team
        keys = bucket.list(prefix=self._get_keyname())
        bucket.delete_keys(keys)

        for filename in os.listdir(indir):
            relpath = os.path.join(indir, filename)
            if os.path.isfile(relpath):
                keyname = self._get_keyname(filename)
                key = bucket.new_key(keyname)
                key.set_contents_from_filename(relpath)

    def set_infile(self, filename):
        """
        Sets the data file to use for the first iteration of the
        pagerank step in the map-reduce job.
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        self._infile = filename

    def do_iter(self,
                pagerank_mapper,
                pagerank_reducer,
                process_mapper,
                process_reducer,
                pagerank_output=None,
                process_output=None,
                num_pagerank_mappers=1,
                num_pagerank_reducers=1):
        """
        Adds a pagerank step and a process step to the current job.
        """

        num_process_mappers = 1
        num_process_reducers = 1

        if self._iter_no == 0:
            pagerank_input = self._infile
        elif self._iter_no > 0:
            pagerank_input = self._last_outdir

        if pagerank_output is None:
            pagerank_output = self._get_default_outdir('pagerank')

        # Output from the pagerank step becomes input to process step
        process_input = pagerank_output

        if process_output is None:
            process_output = self._get_default_outdir('process')

        pagerank_step = self._make_step(pagerank_mapper, pagerank_reducer,
                                        pagerank_input, pagerank_output,
                                        num_pagerank_mappers,
                                        num_pagerank_reducers)

        process_step = self._make_step(process_mapper, process_reducer,
                                       process_input, process_output,
                                       num_process_mappers,
                                       num_process_reducers)

        steps = [pagerank_step, process_step]
        if self.job_id is None:
            self._submit_new_job(steps)
        else:
            self._emr_conn.add_jobflow_steps(self.job_id, steps)

        # Store `process_output` directory so it can be used in
        # subsequent iteration
        self._last_outdir = process_output
        self._iter_no += 1

    def is_done(self):
        """
        Returns `True` if the map-reduce job is done, and `False`
        otherwise.

        For all process-step output files that have not been fetched,
        gets the first part of the output file and checks whether its
        contents begin with the string 'FinalRank'.

        Special notes:
            WARNING! The usage of this method in your code requires
            that you used the default output directories in all calls
            to do_iter().
        """

        # Cache the result so we can return immediately without hitting
        # any of the Amazon APIs
        if self._is_done:
            return True

        iter_no = self._get_last_process_step_iter_no()
        if iter_no < 0:
            return False

        while self._last_process_step_iter_no < iter_no:
            self._last_process_step_iter_no += 1
            i = self._last_process_step_iter_no

            outdir = self._get_default_outdir('process', iter_no=i)
            keyname = self._get_keyname(outdir, 'part-00000')

            bucket = self._s3_conn.get_bucket(self._s3_bucket)
            key = Key(bucket=bucket, name=keyname)
            contents = key.next()  # get first chunk of the output file

            if contents.startswith('FinalRank'):
                self._is_done = True  # cache result
                break

        return self._is_done

    def is_alive(self):
        """
        Checks whether the jobflow has completed, failed, or been
        terminated.

        Special notes:
            WARNING! This method should only be called **after**
            is_done() in order to be able to distinguish between the
            cases where the map-reduce job has outputted 'FinalRank'
            on its final iteration and has a 'COMPLETED' state.
        """

        jobflow = self.describe()
        if jobflow.state in ('COMPLETED', 'FAILED', 'TERMINATED'):
            return False

        return True

    def terminate(self):
        """
        Terminates a running map-reduce job.
        """

        if not self.job_id:
            raise RankmaniacError('No job is running.')

        self._emr_conn.terminate_jobflow(self.job_id)
        self.job_id = None

        self._reset()

    def download(self, outdir='results'):
        """
        Downloads the results from Amazon S3 to the local directory.

        Keyword arguments:
            outdir      <str>       the base directory to which to
                                    download contents.

        Special notes:
            This method downloads all keys (files) from the configured
            bucket for this particular team. It creates subdirectories
            as needed.
        """

        bucket = self._s3_conn.get_bucket(self._s3_bucket)
        keys = bucket.list(prefix=self._get_keyname())
        for key in keys:
            keyname = key.name
            # Ignore folder keys
            if '$' not in keyname:
                suffix = keyname.split('/')[1:]  # removes team identifier
                filename = os.path.join(outdir, *suffix)
                dirname = os.path.dirname(filename)

                if not os.path.exists(dirname):
                    os.makedirs(dirname)

                key.get_contents_to_filename(filename)

    def describe(self):
        """
        Gets the current map-reduce job details.

        Returns a boto.emr.emrobject.JobFlow object.

        Special notes:
            The JobFlow object has the following relevant fields.
                state       <str>           the state of the job flow,
                                            either COMPLETED
                                                 | FAILED
                                                 | TERMINATED
                                                 | RUNNING
                                                 | SHUTTING_DOWN
                                                 | STARTING
                                                 | WAITING

                steps       <list(boto.emr.emrobject.Step)>
                            a list of the step details in the workflow.

            The Step object has the following relevant fields.
                state               <str>       the state of the step.

                startdatetime       <str>       the start time of the
                                                job.

                enddatetime         <str>       the end time of the job.

            WARNING! Amazon has an upper-limit on the frequency with
            which you can call this method; we have had success with
            calling it at most once every 10 seconds.
        """

        if not self.job_id:
            raise RankmaniacError('No job is running.')

        return self._emr_conn.describe_jobflow(self.job_id)

    def _get_last_process_step_iter_no(self):
        """
        Returns the iteration number of the most recent process step of
        the job flow that has completed.
        """

        steps = self.describe().steps
        i = 1

        while i < len(steps):
            step = steps[i]
            if step.state != 'COMPLETED':
                break

            i += 2

        return i / 2 - 1

    def _get_default_outdir(self, name, iter_no=None):
        """
        Returns the default output directory, which is 'iter_no/name/'.
        """

        if iter_no is None:
            iter_no = self._iter_no

        # Return iter_no/name/ **with** the trailing slash
        return '%s/%s/' % (iter_no, name)

    def _submit_new_job(self, steps):
        """
        Submits a new job to run on Amazon EMR.
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        job_name = self._make_name()
        num_instances = self._num_instances
        log_uri = self._get_s3_team_uri('job_logs')
        self.job_id = self._emr_conn.run_jobflow(name=job_name,
                                                 steps=steps,
                                                 num_instances=num_instances,
                                                 log_uri=log_uri)

    def _make_step(self,
                   mapper,
                   reducer,
                   input,
                   output,
                   num_mappers=1,
                   num_reducers=1):
        """
        Returns a new step that runs the specified mapper and reducer,
        reading from the specified input and writing to the specified
        output.
        """

        bucket = self._s3_conn.get_bucket(self._s3_bucket)

        # Clear out current bucket/output contents for team
        keys = bucket.list(prefix=self._get_keyname(output))
        bucket.delete_keys(keys)

        step_name = self._make_name()
        step_args = [
            '-jobconf',
            'mapred.map.tasks=%d' % (num_mappers), '-jobconf',
            'mapred.reduce.tasks=%d' % (num_reducers)
        ]

        return StreamingStep(name=step_name,
                             step_args=step_args,
                             mapper=self._get_s3_team_uri(mapper),
                             reducer=self._get_s3_team_uri(reducer),
                             input=self._get_s3_team_uri(input),
                             output=self._get_s3_team_uri(output))

    def _make_name(self):
        return strftime('%%s %m-%d-%Y %H:%M:%S', localtime()) % (self.team_id)

    def _get_keyname(self, *args):
        """
        Returns the key name to use in the grading bucket (for the
        particular team).

            'team_id/...'
        """

        return '%s/%s' % (self.team_id, '/'.join(args))

    def _get_s3_team_uri(self, *args):
        """
        Returns the Amazon S3 URI for the team submissions.
        """

        return 's3n://%s/%s' % (self._s3_bucket, self._get_keyname(*args))
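A hedged end-to-end sketch of how the class above is meant to be driven; the team id, credentials, file names and iteration count are placeholders:

import time

with Rankmaniac('team42', 'ACCESS_KEY_ID', 'SECRET_ACCESS_KEY') as r:
    r.upload('data')                 # push mappers/reducers and input to S3
    r.set_infile('input.txt')
    for _ in range(5):               # queue a few pagerank + process iterations
        r.do_iter('pagerank_map.py', 'pagerank_reduce.py',
                  'process_map.py', 'process_reduce.py')
    while not r.is_done() and r.is_alive():
        time.sleep(20)               # describe_jobflow is rate limited; poll slowly
    r.download('results')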
Example #31
import os
import sys
import dateutil.parser
from dateutil import tz
from boto.emr.connection import EmrConnection
from boto.s3.connection import S3Connection
from ucsd_bigdata.credentials import Credentials
import gzip

if __name__ == "__main__":
    credentials = Credentials()
    aws_access_key_id = credentials.aws_access_key_id
    aws_secret_access_key = credentials.aws_secret_access_key

    emr_conn = EmrConnection(aws_access_key_id, aws_secret_access_key)

    # List EMR Clusters
    clusters = emr_conn.list_clusters(cluster_states=["RUNNING", "WAITING"])

    for index, cluster in enumerate(clusters.clusters):
        print "[%s] %s" % (index, cluster.id)

    # if there is a command line arg, use it for the cluster_id
    if len(sys.argv) > 1:
        cluster_id = sys.argv[1]
    else:
        if len(clusters.clusters) == 0:
            sys.exit("No EMR clusters running.")
        selected_cluster = input("Select a Cluster: ")
        cluster_id = clusters.clusters[int(selected_cluster)].id
Example #32
class Rankmaniac:
    '''Rankmaniac Wrapper

    This class provides a simple wrapper around the Amazon Web Services SDK.
    It should provide all the functionality required in terms of MapReduce,
    so students don't need to worry about learning the EMR and S3 API.
    '''
    def __init__(self, team_id, access_key, secret_key):
        '''Rankmaniac class constructor

        Creates a new instance of the Rankmaniac Wrapper for a specific
        team.

        Arguments:
            team_id         string      the team ID.
            access_key      string      AWS access key.
            secret_key      string      AWS secret key.
        '''

        self.s3_bucket = 'cs144caltech'

        self.team_id = team_id
        self.emr_conn = EmrConnection(access_key, secret_key)
        self.s3_conn = S3Connection(access_key, secret_key)
        self.job_id = None

    def __del__(self):

        if self.job_id:
            self.terminate_job()

    def submit_job(self,
                   mapper,
                   reducer,
                   input,
                   output,
                   num_map=1,
                   num_reduce=1):
        '''Submit a new MapReduce job

        Submits a new MapReduce job with a single step. To add more steps,
        call add_step. To terminate this job, call terminate_job.
        
        Arguments:
            mapper          string      path to the mapper, relative to
                                        your data directory.
            reducer         string      path to the reducer, relative to
                                        your data directory.
            input           string      path to the input data, relative to
                                        your data directory. To specify a
                                        directory as input, ensure your path
                                        contains a trailing /.
            output          string      path to the desired output directory.
            num_map         int         number of map tasks for this job.
            num_reduce      int         number of reduce tasks for this job.
        '''

        if self.job_id:
            raise Exception('A job is already running.')

        job_name = self._make_name()
        step = self._make_step(mapper, reducer, input, output, num_map,
                               num_reduce)
        self.job_id = \
          self.emr_conn.run_jobflow(name=job_name,
                                    steps=[step],
                                    num_instances=1,
                                    log_uri=self._get_s3_url() + 'job_logs',
                                    keep_alive=True)

    def terminate_job(self):
        '''Terminate a running MapReduce job

        Stops the current running job.
        '''

        if not self.job_id:
            raise Exception('No job is running.')

        self.emr_conn.terminate_jobflow(self.job_id)
        self.job_id = None

    def get_job(self):
        '''Gets the running job details

        Returns:
            JobFlow object with relevant fields:
                state           string      the state of the job flow, either
                                            COMPLETED | FAILED | TERMINATED
                                            RUNNING | SHUTTING_DOWN | STARTING
                                            WAITING | BOOTSTRAPPING
                steps           list(Step)  a list of the step details in the
                                            workflow. A Step has the relevant
                                            fields:
                                                status              string
                                                startdatetime       string
                                                enddatetime         string

        Note: Amazon has an upper-limit on the frequency with which you can
              call this function; we have had success with calling it once
              every 10 seconds.
        '''

        if not self.job_id:
            raise Exception('No job is running.')

        return self.emr_conn.describe_jobflow(self.job_id)

    def add_step(self,
                 mapper,
                 reducer,
                 input,
                 output,
                 num_map=1,
                 num_reduce=1):
        '''Add a step to an existing job

        Adds a new step to an already running job flow.

        Note: any given job flow can support up to 256 steps. To work around
              this limitation, you can always choose to submit a new job
              once the current job completes.
        
        Arguments:
            mapper          string      path to the mapper, relative to
                                        your data directory.
            reducer         string      path to the reducer, relative to
                                        your data directory.
            input           string      path to the input data, relative to
                                        your data directory. To specify a
                                        directory as input, ensure your path
                                        contains a trailing /.
            output          string      path to the desired output directory.
        '''

        if not self.job_id:
            raise Exception('No job is running.')

        step = self._make_step(mapper, reducer, input, output, num_map,
                               num_reduce)
        self.emr_conn.add_jobflow_steps(self.job_id, [step])

    def upload(self, in_dir='data'):
        '''Upload local data to S3

        Uploads the files in the specified directory to S3, where it can be
        used by Elastic MapReduce.

        Note: this method only uploads files in the root of in_dir. It does
              NOT scan through subdirectories.

        Arguments:
            in_dir          string      optional, defaults to 'data'. Uses
                                        this directory as the base directory
                                        from which to upload.
        '''

        bucket = self.s3_conn.get_bucket(self.s3_bucket)
        keys = bucket.list(prefix='%s/' % self.team_id)
        bucket.delete_keys(map(lambda k: k.name, keys))

        to_upload = [(os.path.join(in_dir, file_name),
                      os.path.join(self.team_id, file_name))
                     for file_name in os.listdir(in_dir)
                     if os.path.isfile(os.path.join(in_dir, file_name))]

        for l, r in to_upload:
            key = Key(bucket)
            key.key = r
            key.set_contents_from_filename(l)

    def download(self, out_dir='data'):
        '''Download S3 data to local directory

        Downloads S3 data to the specified directory.

        Note: this method DOES download the entire directory hierarchy as
              given by S3. It will create subdirectories as needed.

        Arguments:
            out_dir         string      optional, defaults to 'data'. Downloads
                                        files to this directory.
        '''

        bucket = self.s3_conn.get_bucket(self.s3_bucket)
        keys = bucket.list(prefix='%s/' % self.team_id)
        for key in keys:
            fp = os.path.join(out_dir, '/'.join(key.name.split('/')[1:]))
            fp_dir = os.path.dirname(fp)
            if os.path.exists(fp):
                os.remove(fp)
            elif not os.path.exists(fp_dir):
                os.makedirs(fp_dir)
            key.get_contents_to_filename(fp)

    def _make_name(self):

        return '%s-%s' % (self.team_id,
                          strftime('%m-%d-%Y %H:%M:%S', localtime()))

    def _make_step(self, mapper, reducer, input, output, nm=1, nr=1):

        job_name = self._make_name()
        team_s3 = self._get_s3_url()

        bucket = self.s3_conn.get_bucket(self.s3_bucket)
        keys = bucket.list(prefix='%s/%s' % (self.team_id, output))
        bucket.delete_keys(map(lambda k: k.name, keys))

        return \
            StreamingStep(name=job_name,
                          step_args=
                              ['-jobconf', 'mapred.map.tasks=%d' % nm,
                               '-jobconf', 'mapred.reduce.tasks=%d' % nr],
                          mapper=team_s3 + mapper,
                          reducer=team_s3 + reducer,
                          input=team_s3 + input,
                          output=team_s3 + output)

    def _get_s3_url(self):

        return 's3n://%s/%s/' % (self.s3_bucket, self.team_id)
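# A hedged driver sketch for the wrapper methods above. It assumes `mr` is an
# already-constructed instance of this class with a job flow started; the
# 10-second sleep follows the guidance in get_job()'s docstring.
import time

def wait_for_job(mr, poll_seconds=10):
    # Poll get_job() until the job flow leaves its active states.
    while True:
        jobflow = mr.get_job()
        if jobflow.state in ('COMPLETED', 'FAILED', 'TERMINATED'):
            return jobflow.state
        time.sleep(poll_seconds)

# Illustrative call sequence (file and directory names are placeholders, not
# values from the original source):
#   mr.upload('data')
#   mr.add_step('mapper.py', 'reducer.py', 'input/', 'output/')
#   state = wait_for_job(mr)
#   mr.download('data')
#   mr.terminate_job()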
Example #33
File: __init__.py  Project: gmohre/awsbook
 def __init__(self, user=EMR_USER, key=EMR_KEY):
     self.conn = EmrConnection(user, key)
Example #34
class EmrClient(object):

    # The Hadoop version to use
    HADOOP_VERSION = '1.0.3'

    # The AMI version to use
    AMI_VERSION = '2.4.7'

    # Interval to wait between polls to EMR cluster in seconds
    CLUSTER_OPERATION_RESULTS_POLLING_SECONDS = 10

    # Timeout for EMR creation and ramp up in seconds
    CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS = 60 * 30

    def __init__(self,
                 region_name='us-east-1',
                 aws_access_key_id=None,
                 aws_secret_access_key=None):

        # If the access key is not specified, get it from the luigi config.cfg file
        if not aws_access_key_id:
            aws_access_key_id = luigi.configuration.get_config().get(
                'aws', 'aws_access_key_id')

        if not aws_secret_access_key:
            aws_secret_access_key = luigi.configuration.get_config().get(
                'aws', 'aws_secret_access_key')

        # Create the region in which to run
        region_endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
        region = RegionInfo(name=region_name, endpoint=region_endpoint)

        self.emr_connection = EmrConnection(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region=region)

    def launch_emr_cluster(
        self,
        cluster_name,
        log_uri,
        ec2_keyname=None,
        master_type='m1.small',
        core_type='m1.small',
        num_instances=2,
        hadoop_version='1.0.3',
        ami_version='2.4.7',
    ):

        # TODO Remove
        # install_pig_step = InstallPigStep()

        jobflow_id = self.emr_connection.run_jobflow(
            name=cluster_name,
            log_uri=log_uri,
            ec2_keyname=ec2_keyname,
            master_instance_type=master_type,
            slave_instance_type=core_type,
            num_instances=num_instances,
            keep_alive=True,
            enable_debugging=True,
            hadoop_version=EmrClient.HADOOP_VERSION,
            steps=[],
            ami_version=EmrClient.AMI_VERSION)

        # Log important information
        status = self.emr_connection.describe_jobflow(jobflow_id)

        logger.info('Creating new cluster %s with following details' %
                    status.name)
        logger.info('jobflow ID:\t%s' % status.jobflowid)
        logger.info('Log URI:\t%s' % status.loguri)
        logger.info('Master Instance Type:\t%s' % status.masterinstancetype)

        # A cluster of size 1 does not have any slave instances
        if hasattr(status, 'slaveinstancetype'):
            logger.info('Slave Instance Type:\t%s' % status.slaveinstancetype)

        logger.info('Number of Instances:\t%s' % status.instancecount)
        logger.info('Hadoop Version:\t%s' % status.hadoopversion)
        logger.info('AMI Version:\t%s' % status.amiversion)
        logger.info('Keep Alive:\t%s' % status.keepjobflowalivewhennosteps)

        return self._poll_until_cluster_ready(jobflow_id)

    def add_pig_step(self,
                     jobflow_id,
                     pig_file,
                     name='Pig Script',
                     pig_versions='latest',
                     pig_args=[]):

        pig_step = PigStep(
            name=name,
            pig_file=pig_file,
            pig_versions=pig_versions,
            pig_args=pig_args,
            # action_on_failure='CONTINUE',
        )

        self.emr_connection.add_jobflow_steps(jobflow_id, [pig_step])

        # Poll until the cluster is done working
        return self._poll_until_cluster_ready(jobflow_id)

    def shutdown_emr_cluster(self, jobflow_id):

        self.emr_connection.terminate_jobflow(jobflow_id)
        return self._poll_until_cluster_shutdown(jobflow_id)

    def get_jobflow_id(self):
        # Get the id of the cluster that is WAITING for work
        return self.emr_connection.list_clusters(
            cluster_states=['WAITING']).clusters[0].id

    def get_master_dns(self):
        """
        Get the master node's public address
        """
        # Get the jobflow ID
        jobflow_id = self.get_jobflow_id()

        # Use the jobflow ID to get the status
        status = self.emr_connection.describe_jobflow(jobflow_id)

        # Return the master's public dns
        return status.masterpublicdnsname

    def _poll_until_cluster_ready(self, jobflow_id):

        start_time = time.time()

        is_cluster_ready = False

        while (not is_cluster_ready) and (
                time.time() - start_time <
                EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS):
            # Get the state
            state = self.emr_connection.describe_jobflow(jobflow_id).state

            if state == u'WAITING':
                logger.info('Cluster initialized and is WAITING for work')
                is_cluster_ready = True

            elif (state == u'COMPLETED') or \
                 (state == u'SHUTTING_DOWN') or \
                 (state == u'FAILED') or \
                 (state == u'TERMINATED'):

                logger.error('Error starting cluster; status: %s' % state)

                # Poll until cluster shutdown
                self._poll_until_cluster_shutdown(jobflow_id)
                raise RuntimeError('Error, cluster failed to start')

            else:
                logger.debug('Cluster state: %s' % state)
                time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)

        if not is_cluster_ready:
            # TODO shutdown cluster
            raise RuntimeError(
                'Timed out waiting for EMR cluster to be active')

        return jobflow_id

    def _poll_until_cluster_shutdown(self, jobflow_id):
        start_time = time.time()

        is_cluster_shutdown = False

        while (not is_cluster_shutdown) and (
                time.time() - start_time <
                EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS):
            # Get the state
            state = self.emr_connection.describe_jobflow(jobflow_id).state

            if (state == u'TERMINATED') or (state == u'COMPLETED'):
                logger.info('Cluster successfully shut down with status: %s' %
                            state)
                return True
            elif state == u'FAILED':
                logger.error('Cluster shut down with FAILED status')
                return False
            else:
                logger.debug('Cluster state: %s' % state)
                time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)

        if not is_cluster_shutdown:
            # TODO shutdown cluster
            raise RuntimeError(
                'Timed out waiting for EMR cluster to shut down')

        return True
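# A hedged usage sketch for the EmrClient class above. The bucket, key pair and
# Pig script below are placeholders, not values from the original source;
# credentials are read from the luigi configuration as in __init__ above.
client = EmrClient(region_name='us-east-1')
jobflow_id = client.launch_emr_cluster(cluster_name='example-cluster',
                                       log_uri='s3://<my-bucket>/emr-logs',
                                       ec2_keyname='<my-keypair>',
                                       num_instances=3)
client.add_pig_step(jobflow_id, 's3://<my-bucket>/scripts/report.pig')
client.shutdown_emr_cluster(jobflow_id)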
Example #35
from boto.emr.connection import EmrConnection
from boto.emr.step import InstallPigStep, PigStep

AWS_ACCESS_KEY = ''  # REQUIRED
AWS_SECRET_KEY = ''  # REQUIRED
conn = EmrConnection(AWS_ACCESS_KEY, AWS_SECRET_KEY)

pig_file = 's3://elasticmapreduce/samples/pig-apache/do-reports2.pig'
INPUT = 's3://elasticmapreduce/samples/pig-apache/input/access_log_1'
OUTPUT = ''  # REQUIRED, S3 bucket for job output

pig_args = ['-p', 'INPUT=%s' % INPUT, '-p', 'OUTPUT=%s' % OUTPUT]
pig_step = PigStep('Process Reports', pig_file, pig_args=pig_args)
steps = [InstallPigStep(), pig_step]

jobid = conn.run_jobflow(name='report test',
                         steps=steps,
                         hadoop_version='0.20.205',
                         ami_version='latest',
                         num_instances=2,
                         keep_alive=False)
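# Since keep_alive=False, the cluster tears itself down once the steps finish.
# A hedged sketch of waiting for that result (polling no more than about once
# every 10 seconds to respect EMR's API rate limits):
import time

state = conn.describe_jobflow(jobid).state
while state not in (u'COMPLETED', u'FAILED', u'TERMINATED'):
    time.sleep(10)
    state = conn.describe_jobflow(jobid).state
print "Job flow %s finished with state %s" % (jobid, state)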
Example #36
                               PageviewsBySubredditAndPath,
                               PageviewsByLanguage, ClickthroughsByCodename,
                               TargetedClickthroughsByCodename,
                               AdImpressionsByCodename,
                               TargetedImpressionsByCodename)

RAW_LOG_DIR = g.RAW_LOG_DIR
PROCESSED_DIR = g.PROCESSED_DIR
AGGREGATE_DIR = g.AGGREGATE_DIR
AWS_LOG_DIR = g.AWS_LOG_DIR

# the "or None" business is so that a blank string becomes None to cause boto
# to look for credentials in other places.
s3_connection = S3Connection(g.TRAFFIC_ACCESS_KEY or None, g.TRAFFIC_SECRET_KEY
                             or None)
emr_connection = EmrConnection(g.TRAFFIC_ACCESS_KEY or None,
                               g.TRAFFIC_SECRET_KEY or None)

traffic_categories = (SitewidePageviews, PageviewsBySubreddit,
                      PageviewsBySubredditAndPath, PageviewsByLanguage,
                      ClickthroughsByCodename, TargetedClickthroughsByCodename,
                      AdImpressionsByCodename, TargetedImpressionsByCodename)

traffic_subdirectories = {
    SitewidePageviews: 'sitewide',
    PageviewsBySubreddit: 'subreddit',
    PageviewsBySubredditAndPath: 'srpath',
    PageviewsByLanguage: 'lang',
    ClickthroughsByCodename: 'clicks',
    TargetedClickthroughsByCodename: 'clicks_targeted',
    AdImpressionsByCodename: 'thing',
    TargetedImpressionsByCodename: 'thingtarget',
Example #37
import os
import sys
import dateutil.parser
from dateutil import tz
from boto.emr.connection import EmrConnection
from boto.s3.connection import S3Connection
from ucsd_bigdata.credentials import Credentials
import gzip


if __name__ == "__main__":
    credentials = Credentials()
    aws_access_key_id = credentials.aws_access_key_id
    aws_secret_access_key = credentials.aws_secret_access_key

    emr_conn = EmrConnection(aws_access_key_id, aws_secret_access_key)

    # List EMR Clusters
    clusters = emr_conn.list_clusters(cluster_states=["RUNNING", "WAITING"])

    for index, cluster in enumerate(clusters.clusters):
        print "[%s] %s" % (index, cluster.id)

    # if there is a command line arg, use it for the cluster_id
    if len(sys.argv) > 1:
        cluster_id = sys.argv[1]
    else:
        if len(clusters.clusters) == 0:
            sys.exit("No EMR clusters running.")
        selected_cluster = raw_input("Select a Cluster: ")
        cluster_id = clusters.clusters[int(selected_cluster)].id
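# A hedged continuation of the script above: once cluster_id is known, the same
# connection can describe the cluster and list its steps (attribute names
# follow boto's EMR objects; the print layout is illustrative).
cluster = emr_conn.describe_cluster(cluster_id)
print "Selected cluster: %s (%s)" % (cluster.name, cluster.status.state)

steps = emr_conn.list_steps(cluster_id)
for step in steps.steps:
    print "  step %s: %s [%s]" % (step.id, step.name, step.status.state)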
Example #38
File: boto.py  Project: todatamining/db1
k = Key(b)
k.key = 'reducer.py'
k.set_contents_from_filename('/Users/winteram/Documents/Teaching/reducer.py')
k.close()

# <codecell>

for word in b.list():
    print word

# <codecell>

### Running code with EMR

emrcon = EmrConnection('<aws access key>', '<aws secret key>')

# <codecell>

# Using EMR's wordcount example
step = StreamingStep(name='My wordcount example',
	mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
	reducer='aggregate', 
	input='s3n://elasticmapreduce/samples/wordcount/input',
	output='s3n://wambia660fall2013/output/wordcount_output')

# <codecell>

jobid = emrcon.run_jobflow(name='Word Count Example', 
                           log_uri='s3://wambia660fall2013/logs',
                           steps=[step])
Example #39
 def _emr_connect(self):
     """Connect to emr.
     """
     self.emr_conn = EmrConnection(
         aws_access_key_id=self.access_key_id,
         aws_secret_access_key=self.secret_access_key)
Example #40
def create_emr_cluster(cr):
    """
    @PARAM:  Cluster configuration reader object
    Creates an EMR cluster given a set of configuration parameters
    Return:  EMR Cluster ID
    """

    #region = cr.get_config("aws_region")
    #conn = boto.emr.connect_to_region(region)
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region = RegionInfo(name = cr.get_config("aws_region"),
                            endpoint = cr.get_config("aws_region") + ".elasticmapreduce.amazonaws.com" ))


    #  Create list of instance groups:  master, core, and task
    instance_groups = []
    instance_groups.append(InstanceGroup(
        num_instances = cr.get_config("emr_master_node_count"),
        role = "MASTER",
        type = cr.get_config("emr_master_node_type"),
        market = cr.get_config("emr_market_type"),
        name = "Master Node" ))

    instance_groups.append(InstanceGroup(
        num_instances = cr.get_config("emr_core_node_count"),
        role = "CORE",
        type = cr.get_config("emr_core_node_type"),
        market = cr.get_config("emr_market_type"),
        name = "Core Node" ))

    #  Only create task nodes if specifically asked for
    if cr.get_config("emr_task_node_count") > 0:
        instance_groups.append(InstanceGroup(
            num_instances = cr.get_config("emr_task_node_count"),
            role = "TASK",
            type = cr.get_config("emr_task_node_type"),
            market = cr.get_config("emr_market_type"),
            name = "Task Node" ))

    print "Creating EMR Cluster with instance groups: {0}".format(instance_groups)

    #  Use these params to add overrides; these will go away in Boto3
    api_params = {"Instances.Ec2SubnetId": cr.get_config("aws_subnet_id"), "ReleaseLabel": cr.get_config("emr_version")}

    #  Add step to load data
    step_args = ["s3-dist-cp","--s3Endpoint=s3-us-west-1.amazonaws.com","--src=s3://alpine-qa/automation/automation_test_data/","--dest=hdfs:///automation_test_data","--srcPattern=.*[a-zA-Z,]+"]
    step = JarStep(name = "s3distcp for data loading",
                jar = "command-runner.jar",
                step_args = step_args,
                action_on_failure = "CONTINUE"
                )

    cluster_id = conn.run_jobflow(
        cr.get_config("emr_cluster_name"),
        instance_groups = instance_groups,
        action_on_failure = "TERMINATE_JOB_FLOW",
        keep_alive = True,
        enable_debugging = True,
        log_uri = cr.get_config("emr_log_uri"),
        #hadoop_version = "Amazon 2.7.2",
        #ReleaseLabel = "emr-5.0.0",
        #ami_version = "5.0.0",
        steps = [step],
        bootstrap_actions = [],
        ec2_keyname = cr.get_config("ec2_keyname"),
        visible_to_all_users = True,
        job_flow_role = "EMR_EC2_DefaultRole",
        service_role = "EMR_DefaultRole",
        api_params = api_params )

    print "EMR Cluster created, cluster id: {0}".format(cluster_id)
    state = conn.describe_cluster(cluster_id).status.state
    while state not in (u'COMPLETED', u'SHUTTING_DOWN', u'FAILED', u'WAITING'):
        #sleeping to recheck for status.
        time.sleep(5)
        state = conn.describe_cluster(cluster_id).status.state
        print "State is: {0}, sleeping 5s...".format(state)

    if state == u'SHUTTING_DOWN' or state == u'FAILED':
        return "ERROR"

    #Check if the state is WAITING. Then launch the next steps
    if state == u'WAITING':
        #Finding the master node dns of EMR cluster
        master_dns = conn.describe_cluster(cluster_id).masterpublicdnsname
        print "DNS Name: {0}".format(master_dns)
        return cluster_id
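# A hedged sketch of the configuration-reader object create_emr_cluster()
# expects: anything exposing get_config(key) works. The values below are
# placeholders, not settings from the original project.
class DictConfigReader(object):
    def __init__(self, config):
        self.config = config

    def get_config(self, key):
        return self.config[key]

cr = DictConfigReader({
    "aws_access_key": "<aws-access-key-id>",
    "aws_secret_key": "<aws-secret-access-key>",
    "aws_region": "us-west-1",
    "aws_subnet_id": "subnet-00000000",
    "emr_version": "emr-5.0.0",
    "emr_cluster_name": "automation-cluster",
    "emr_log_uri": "s3://<my-bucket>/emr-logs",
    "emr_master_node_count": 1,
    "emr_master_node_type": "m3.xlarge",
    "emr_core_node_count": 2,
    "emr_core_node_type": "m3.xlarge",
    "emr_task_node_count": 0,
    "emr_task_node_type": "m3.xlarge",
    "emr_market_type": "ON_DEMAND",
    "ec2_keyname": "<my-keypair>",
})
cluster_id = create_emr_cluster(cr)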
Example #41
Created by Brian Tomasette on 2011-11-02.
Copyright (c) 2011 __DoublePositive__. All rights reserved.
"""

import re
import time
import datetime as dt
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from boto.ses import SESConnection
from boto.emr.connection import EmrConnection
from boto.emr.step import JarStep
conn = S3Connection('<aws-access-key-id>', '<aws-secret-access-key>')
conns = SESConnection('<aws-access-key-id>', '<aws-secret-access-key>')
conne = EmrConnection('<aws-access-key-id>', '<aws-secret-access-key>')


def send(subject, body, to):
	conns.send_email('*****@*****.**', subject, body, to)



message = ''
success_message = ''

email_content = 'The following files have been archived and a successful Hadoop job was run:\n\n\n'


step1 = JarStep(name='Setup Hive',
               jar='s3://elasticmapreduce/libs/script-runner/script-runner.jar',
Example #42
# <codecell>

b = s3con.get_bucket('wambia660fall2013')

# <codecell>

k = Key(b)
k.key = 'fullNgramNamesBoto.hql'
k.set_contents_from_filename('/Users/winteram/Documents/Teaching/BIA_Fall2013/fullNgramNamesBoto.hql')
k.close()

# <codecell>

### Will run Hive via EMR
emrcon = EmrConnection('<aws access key>', '<aws secret key>')

# <codecell>

install_hive_step = step.InstallHiveStep(hive_versions='0.11.0.1')

# <codecell>

names1gram = step.HiveStep("fullNgramNamesBoto",
                           's3://wambia660fall2013/fullNgramNamesBoto.hql',
                           hive_args=['-d INPUT=s3://datasets.elasticmapreduce/ngrams/books/20090715/eng-us-all/1gram/', 
                                      '-d OUTPUT=s3://wambia660fall2013/output/'])

# <codecell>

jobid = emrcon.run_jobflow(name='Names 1gram boto v3', 
Example #43
class EmrManager(object):
 
    # Default constructor of the class. Uses default parameters if not provided.
    def __init__(self, parameters):
        try: 
            self.region_name = parameters["region_name"]
            self.access_key = parameters["access_key"]
            self.secret_key = parameters["secret_key"]
            self.ec2_keypair_name = parameters["ec2_keypair_name"]
            self.base_bucket = parameters["base_bucket"]
            self.log_dir = parameters["log_dir"]
            self.emr_status_wait = parameters["emr_status_wait"]
            self.step_status_wait = parameters["step_status_wait"]
            self.emr_cluster_name = parameters["emr_cluster_name"]
        except:
            logging.error("Something went wrong initializing EmrManager")
            sys.exit()

        # Establishing EmrConnection
        self.connection = EmrConnection(self.access_key, self.secret_key,
                             region=RegionInfo(name=self.region_name,
                             endpoint=self.region_name + '.elasticmapreduce.amazonaws.com'))

        self.log_bucket_name = self.base_bucket + self.log_dir
 
    #Method for launching the EMR cluster
    def launch_cluster(self, master_type, slave_type, num_instances, ami_version):
        try:
            #Launching the cluster
            cluster_id = self.connection.run_jobflow(
                             self.emr_cluster_name,
                             self.log_bucket_name,
                             ec2_keyname=self.ec2_keypair_name,
                             keep_alive=True,
                             action_on_failure = 'CANCEL_AND_WAIT',
                             master_instance_type=master_type,
                             slave_instance_type=slave_type,
                             num_instances=num_instances,
                             ami_version=ami_version)

            logging.info("Launching cluster: " + cluster_id + ". Please be patient. Check the status of your cluster in your AWS Console")

            # Checking the state of EMR cluster
            state = self.connection.describe_jobflow(cluster_id).state
            while state not in (u'COMPLETED', u'SHUTTING_DOWN', u'FAILED', u'WAITING'):
                #sleeping to recheck for status.
                time.sleep(int(self.emr_status_wait))
                state = self.connection.describe_jobflow(cluster_id).state
                logging.info("Creating cluster " + cluster_id + ". Status: " + state)
 
            if state == u'SHUTTING_DOWN' or state == u'FAILED':
                logging.error("Launching EMR cluster failed")
                return "ERROR"
 
            #Check if the state is WAITING. Then launch the next steps
            if state == u'WAITING':
                #Finding the master node dns of EMR cluster
                master_dns = self.connection.describe_jobflow(cluster_id).masterpublicdnsname
                logging.info("Launched EMR Cluster Successfully with cluster id:" + cluster_id)
                logging.info("Master node DNS of EMR " + master_dns)
                return cluster_id
        except:
            logging.error("Launching EMR cluster failed")
            return "FAILED"

    # run scripting step in cluster
    def run_scripting_step(self, cluster_id, name, script_path):
        try:
            step = ScriptRunnerStep(name=name, 
                                    step_args=[script_path],
                                    action_on_failure="CONTINUE")
            return self._run_step(cluster_id, step)
        except:
            logging.error("Running scripting step in cluster " + cluster_id + " failed.")
            return "FAILED"

    # run streaming step in cluster
    def run_streaming_step(self, cluster_id, name, mapper_path, reducer_path, input_path, output_path):
        try:
            # bundle files with the job
            files = []
            if mapper_path != "NONE":
                files.append(mapper_path)
                mapper_path = mapper_path.split("/")[-1]
            if reducer_path != "NONE":
                files.append(reducer_path)
                reducer_path = reducer_path.split("/")[-1]
            # build streaming step
            logging.debug("Launching streaming step with mapper: " + mapper_path + " reducer: " + reducer_path + " and files: " + str(files))
            step = StreamingStep(name=name,
                                    step_args=["-files"] + files, 
                                    mapper=mapper_path, 
                                    reducer=reducer_path, 
                                    input=input_path, 
                                    output=output_path, 
                                    action_on_failure="CONTINUE")
            return self._run_step(cluster_id, step)            
        except:
            logging.error("Running streaming step in cluster " + cluster_id + " failed.")
            return "FAILED"

    # run mapreduce jar step in cluster
    def run_jar_step(self, cluster_id, name, jar_path, class_name, input_path, output_path):
        try:
            # build streaming step
            logging.debug("Launching jar step with jar: " + jar_path + " class name: " + class_name + " input: " + input_path + " and output: " + output_path)
            step = JarStep(name=name,
                            jar=jar_path, 
                            step_args= [class_name,
                                        input_path,
                                        output_path])
            return self._run_step(cluster_id, step)            
        except:
            logging.error("Running jar step in cluster " + cluster_id + " failed.")
            return "FAILED"

    def _run_step(self, cluster_id, step):
        step_list = self.connection.add_jobflow_steps(cluster_id, [step])
        step_id = step_list.stepids[0].value

        logging.info("Starting step " + step_id + " in cluster " + cluster_id + ". Please be patient. Check the progress of the job in your AWS Console")

        # Checking the state of the step
        state = self._find_step_state(cluster_id, step_id)
        while state not in (u'NOT_FOUND', u'ERROR', u'FAILED', u'COMPLETED'):
            #sleeping to recheck for status.
            time.sleep(int(self.step_status_wait))
            state = self._find_step_state(cluster_id, step_id)
            logging.info("Starting step " + step_id + " in cluster " + cluster_id + ". Status: " + state)

        if state == u'FAILED':
            logging.error("Step " + step_id + " failed in cluster: " + cluster_id)
            return "FAILED"
        if state == u'NOT_FOUND':
            logging.error("Step " + step_id + " could not be found in cluster: " + cluster_id)
            return "NOT_FOUND"
        if state == u'ERROR':
            logging.error("Step " + step_id + " produced an error in _find_step_state in cluster: " + cluster_id)
            return "ERROR"

        #Check if the state is WAITING. Then launch the next steps
        if state == u'COMPLETED':
            #Finding the master node dns of EMR cluster
            logging.info("Step " + step_id + " succesfully completed in cluster: " + cluster_id)
            return step_id


    def _find_step_state(self, cluster_id, step_id):
        try:
            step_summary_list = self.connection.list_steps(cluster_id)
            for step_summary in step_summary_list.steps:
                if step_summary.id == step_id:
                    return step_summary.status.state
            return "NOT_FOUND"
        except:
            return "ERROR"

    #Method for terminating the EMR cluster
    def terminate_cluster(self, cluster_id):
        self.connection.terminate_jobflow(cluster_id)
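# A hedged usage sketch for EmrManager. The parameter values are placeholders;
# the real project presumably loads them from its own configuration.
params = {
    "region_name": "us-east-1",
    "access_key": "<aws-access-key-id>",
    "secret_key": "<aws-secret-access-key>",
    "ec2_keypair_name": "<my-keypair>",
    "base_bucket": "s3://<my-bucket>/",
    "log_dir": "emr-logs",
    "emr_status_wait": 30,
    "step_status_wait": 30,
    "emr_cluster_name": "example-cluster",
}
manager = EmrManager(params)
cluster_id = manager.launch_cluster("m3.xlarge", "m3.xlarge", 3, "3.11.0")
manager.run_streaming_step(cluster_id, "wordcount",
                           "s3://<my-bucket>/mapper.py",
                           "s3://<my-bucket>/reducer.py",
                           "s3://<my-bucket>/input/",
                           "s3://<my-bucket>/output/")
manager.terminate_cluster(cluster_id)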
Example #44
class Rankmaniac:
    """
    (wrapper class)

    This class presents a simple wrapper around the AWS SDK. It strives
    to provide all the functionality required to run map-reduce
    (Hadoop) on Amazon. This way the students do not need to worry about
    learning the API for Amazon S3 and EMR, and instead can focus on
    computing pagerank quickly!
    """

    DefaultRegionName = 'us-west-2'
    DefaultRegionEndpoint = 'elasticmapreduce.us-west-2.amazonaws.com'

    def __init__(self, team_id, access_key, secret_key,
                 bucket='cs144students'):
        """
        (constructor)

        Creates a new instance of the Rankmaniac class for a specific
        team using the provided credentials.

        Arguments:
            team_id       <str>     the team identifier, which may differ
                                    slightly from the actual team name.

            access_key    <str>     the AWS access key identifier.
            secret_key    <str>     the AWS secret access key.

        Keyword arguments:
            bucket        <str>     the S3 bucket name.
        """

        region = RegionInfo(None, self.DefaultRegionName,
                            self.DefaultRegionEndpoint)

        self._s3_bucket = bucket
        self._s3_conn = S3Connection(access_key, secret_key)
        self._emr_conn = EmrConnection(access_key, secret_key, region=region)

        self.team_id = team_id
        self.job_id = None

        self._reset()
        self._num_instances = 1

    def _reset(self):
        """
        Resets the internal state of the job and submission.
        """

        self._iter_no = 0
        self._infile = None
        self._last_outdir = None

        self._last_process_step_iter_no = -1
        self._is_done = False

    def __del__(self):
        """
        (destructor)

        Terminates the map-reduce job if any, and closes the connections
        to Amazon S3 and EMR.
        """

        if self.job_id is not None:
            self.terminate()

        self._s3_conn.close()
        self._emr_conn.close()

    def __enter__(self):
        """
        Used for `with` syntax. Simply returns this instance since the
        set-up has all been done in the constructor.
        """

        return self

    def __exit__(self, type, value, traceback):
        """
        Refer to __del__().
        """

        self.__del__()
        return False # do not swallow any exceptions

    def upload(self, indir='data'):
        """
        Uploads the local data to Amazon S3 under the configured bucket
        and key prefix (the team identifier). This way the code can be
        accessed by Amazon EMR to compute pagerank.

        Keyword arguments:
            indir       <str>       the base directory from which to
                                    upload contents.

        Special notes:
            This method only uploads **files** in the specified
            directory. It does not scan through subdirectories.

            WARNING! This method removes all previous (or ongoing)
            submission results, so it is unsafe to call while a job is
            already running (and possibly started elsewhere).
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        bucket = self._s3_conn.get_bucket(self._s3_bucket)

        # Clear out current bucket contents for team
        keys = bucket.list(prefix=self._get_keyname())
        bucket.delete_keys(keys)

        for filename in os.listdir(indir):
            relpath = os.path.join(indir, filename)
            if os.path.isfile(relpath):
                keyname = self._get_keyname(filename)
                key = bucket.new_key(keyname)
                key.set_contents_from_filename(relpath)

    def set_infile(self, filename):
        """
        Sets the data file to use for the first iteration of the
        pagerank step in the map-reduce job.
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        self._infile = filename

    def do_iter(self, pagerank_mapper, pagerank_reducer,
                process_mapper, process_reducer,
                pagerank_output=None, process_output=None,
                num_pagerank_mappers=1, num_pagerank_reducers=1):
        """
        Adds a pagerank step and a process step to the current job.
        """

        num_process_mappers = 1
        num_process_reducers = 1

        if self._iter_no == 0:
            pagerank_input = self._infile
        elif self._iter_no > 0:
            pagerank_input = self._last_outdir

        if pagerank_output is None:
            pagerank_output = self._get_default_outdir('pagerank')

        # Output from the pagerank step becomes input to process step
        process_input = pagerank_output

        if process_output is None:
            process_output = self._get_default_outdir('process')

        pagerank_step = self._make_step(pagerank_mapper, pagerank_reducer,
                                        pagerank_input, pagerank_output,
                                        num_pagerank_mappers,
                                        num_pagerank_reducers)

        process_step = self._make_step(process_mapper, process_reducer,
                                       process_input, process_output,
                                       num_process_mappers,
                                       num_process_reducers)

        steps = [pagerank_step, process_step]
        if self.job_id is None:
            self._submit_new_job(steps)
        else:
            self._emr_conn.add_jobflow_steps(self.job_id, steps)

        # Store `process_output` directory so it can be used in
        # subsequent iteration
        self._last_outdir = process_output
        self._iter_no += 1

    def is_done(self):
        """
        Returns `True` if the map-reduce job is done, and `False`
        otherwise.

        For all process-step output files that have not been fetched,
        gets the first part of the output file, and checks whether its
        contents begins with the string 'FinalRank'.

        Special notes:
            WARNING! The usage of this method in your code requires that
            that you used the default output directories in all calls
            to do_iter().
        """

        # Cache the result so we can return immediately without hitting
        # any of the Amazon APIs
        if self._is_done:
            return True

        iter_no = self._get_last_process_step_iter_no()
        if iter_no < 0:
            return False

        while self._last_process_step_iter_no < iter_no:
            self._last_process_step_iter_no += 1
            i = self._last_process_step_iter_no

            outdir = self._get_default_outdir('process', iter_no=i)
            keyname = self._get_keyname(outdir, 'part-00000')

            bucket = self._s3_conn.get_bucket(self._s3_bucket)
            key = Key(bucket=bucket, name=keyname)
            contents = key.next() # get first chunk of the output file

            if contents.startswith('FinalRank'):
                self._is_done = True # cache result
                break

        return self._is_done

    def is_alive(self):
        """
        Checks whether the jobflow has completed, failed, or been
        terminated.

        Special notes:
            WARNING! This method should only be called **after**
            is_done() in order to be able to distinguish between the
            cases where the map-reduce job has outputted 'FinalRank'
            on its final iteration and has a 'COMPLETED' state.
        """

        jobflow = self.describe()
        if jobflow.state in ('COMPLETED', 'FAILED', 'TERMINATED'):
            return False

        return True

    def terminate(self):
        """
        Terminates a running map-reduce job.
        """

        if not self.job_id:
            raise RankmaniacError('No job is running.')

        self._emr_conn.terminate_jobflow(self.job_id)
        self.job_id = None

        self._reset()

    def download(self, outdir='results'):
        """
        Downloads the results from Amazon S3 to the local directory.

        Keyword arguments:
            outdir      <str>       the base directory to which to
                                    download contents.

        Special notes:
            This method downloads all keys (files) from the configured
            bucket for this particular team. It creates subdirectories
            as needed.
        """

        bucket = self._s3_conn.get_bucket(self._s3_bucket)
        keys = bucket.list(prefix=self._get_keyname())
        for key in keys:
            keyname = key.name
            # Ignore folder keys
            if '$' not in keyname:
                suffix = keyname.split('/')[1:] # removes team identifier
                filename = os.path.join(outdir, *suffix)
                dirname = os.path.dirname(filename)

                if not os.path.exists(dirname):
                    os.makedirs(dirname)

                key.get_contents_to_filename(filename)

    def describe(self):
        """
        Gets the current map-reduce job details.

        Returns a boto.emr.emrobject.JobFlow object.

        Special notes:
            The JobFlow object has the following relevant fields.
                state       <str>           the state of the job flow,
                                            either COMPLETED
                                                 | FAILED
                                                 | TERMINATED
                                                 | RUNNING
                                                 | SHUTTING_DOWN
                                                 | STARTING
                                                 | WAITING

                steps       <list(boto.emr.emrobject.Step)>
                            a list of the step details in the workflow.

            The Step object has the following relevant fields.
                state               <str>       the state of the step.

                startdatetime       <str>       the start time of the
                                                job.

                enddatetime         <str>       the end time of the job.

            WARNING! Amazon has an upper-limit on the frequency with
            which you can call this method; we have had success with
            calling it at most once every 10 seconds.
        """

        if not self.job_id:
            raise RankmaniacError('No job is running.')

        return self._emr_conn.describe_jobflow(self.job_id)

    def _get_last_process_step_iter_no(self):
        """
        Returns the most recent process-step of the job flow that has
        completed.
        """

        steps = self.describe().steps
        i = 1

        while i < len(steps):
            step = steps[i]
            if step.state != 'COMPLETED':
                break

            i += 2

        return i / 2 - 1

    def _get_default_outdir(self, name, iter_no=None):
        """
        Returns the default output directory, which is 'iter_no/name/'.
        """

        if iter_no is None:
            iter_no = self._iter_no

        # Return iter_no/name/ **with** the trailing slash
        return '%s/%s/' % (iter_no, name)

    def _submit_new_job(self, steps):
        """
        Submits a new job to run on Amazon EMR.
        """

        if self.job_id is not None:
            raise RankmaniacError('A job is already running.')

        job_name = self._make_name()
        num_instances = self._num_instances
        log_uri = self._get_s3_team_uri('job_logs')
        self.job_id = self._emr_conn.run_jobflow(name=job_name,
                                                 steps=steps,
                                                 num_instances=num_instances,
                                                 log_uri=log_uri)

    def _make_step(self, mapper, reducer, input, output,
                   num_mappers=1, num_reducers=1):
        """
        Returns a new step that runs the specified mapper and reducer,
        reading from the specified input and writing to the specified
        output.
        """

        bucket = self._s3_conn.get_bucket(self._s3_bucket)

        # Clear out current bucket/output contents for team
        keys = bucket.list(prefix=self._get_keyname(output))
        bucket.delete_keys(keys)

        step_name = self._make_name()
        step_args = ['-jobconf', 'mapred.map.tasks=%d' % (num_mappers),
                     '-jobconf', 'mapred.reduce.tasks=%d' % (num_reducers)]

        return StreamingStep(name=step_name,
                            step_args=step_args,
                            mapper=self._get_s3_team_uri(mapper),
                            reducer=self._get_s3_team_uri(reducer),
                            input=self._get_s3_team_uri(input),
                            output=self._get_s3_team_uri(output))

    def _make_name(self):
        return strftime('%%s %m-%d-%Y %H:%M:%S', localtime()) % (self.team_id)

    def _get_keyname(self, *args):
        """
        Returns the key name to use in the grading bucket (for the
        particular team).

            'team_id/...'
        """

        return '%s/%s' % (self.team_id, '/'.join(args))

    def _get_s3_team_uri(self, *args):
        """
        Returns the Amazon S3 URI for the team submissions.
        """

        return 's3n://%s/%s' % (self._s3_bucket, self._get_keyname(*args))
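# A hedged usage sketch of the Rankmaniac wrapper above. The team id,
# credentials, file names and iteration count are placeholders, not values
# from the original source; the `with` block ensures the job flow is
# terminated and the connections are closed on exit.
import time

with Rankmaniac('team_example', '<aws-access-key-id>',
                '<aws-secret-access-key>') as r:
    r.upload('data')
    r.set_infile('input.txt')
    for _ in range(5):
        r.do_iter('pagerank_map.py', 'pagerank_reduce.py',
                  'process_map.py', 'process_reduce.py')
    while not r.is_done() and r.is_alive():
        time.sleep(20)
    r.download('results')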
Example #45
def create_data_source_variable(cluster_id, cr):
    """
    Creates a data source variable file using the cluster_id of an EMR cluster
    @PARAM:  cluster_id:  ID of an EMR cluster
    return:  True if success; creates 'emr_default.conf' in the pwd

    Object created should look like:

    HADOOP_DATA_SOURCE_NAME="emr_data_source"
    HADOOP_DATA_SOURCE_DISTRO="Cloudera CDH5.4-5.7"
    HADOOP_DATA_SOURCE_HOST="emr_master_dns_hostname"
    HADOOP_DATA_SOURCE_PORT=8020
    HADOOP_DATA_SOURCE_USER="******"
    HADOOP_DATA_SOURCE_GROUP="hadoop"
    HADOOP_DATA_SOURCE_JT_HOST="emr_master_dns_hostname"
    HADOOP_DATA_SOURCE_JT_PORT=8032
    CONNECTION_PARAMETERS='[{"key":"mapreduce.jobhistory.address", "value":"0.0.0.0:10020"}, ' \
                            '{"key":"mapreduce.jobhistory.webapp.address", "value":"cdh5hakerberosnn.alpinenow.local:19888"}, ' \
                            '{"key":"yarn.app.mapreduce.am.staging-dir", "value":"/tmp/hadoop-yarn/staging"}, ' \
                            '{"key":"yarn.resourcemanager.admin.address", "value":"cdh5hakerberosnn.alpinenow.local:8033"}, ' \
                            '{"key":"yarn.resourcemanager.resource-tracker.address", "value":"cdh5hakerberosnn.alpinenow.local:8031"}, ' \
                            '{"key":"yarn.resourcemanager.scheduler.address", "value":"cdh5hakerberosnn.alpinenow.local:8030"}]'

    """
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region=RegionInfo(name=cr.get_config("aws_region"),
                          endpoint=cr.get_config("aws_region") +
                          ".elasticmapreduce.amazonaws.com"))

    emr_cluster = conn.describe_cluster(cluster_id)
    master_dns_hostname = emr_cluster.masterpublicdnsname

    # Build up connection parameters
    conn_params = []
    conn_params.append({
        "key": "mapreduce.jobhistory.address",
        "value": "{0}:10020".format(master_dns_hostname)
    })
    conn_params.append({
        "key": "mapreduce.jobhistory.webapp.address",
        "value": "{0}:19888".format(master_dns_hostname)
    })
    conn_params.append({
        "key": "yarn.app.mapreduce.am.staging-dir",
        "value": "/user"
    })
    conn_params.append({
        "key": "yarn.resourcemanager.admin.address",
        "value": "{0}:8033".format(master_dns_hostname)
    })
    conn_params.append({
        "key": "yarn.resourcemanager.scheduler.address",
        "value": "{0}:8030".format(master_dns_hostname)
    })
    conn_params_str = "CONNECTION_PARAMETERS=\"{0}\"".format(conn_params)
    email_str = "EMAIL=\"avalanche_{0}.alpinenow.com\"".format(
        random.randint(1, 99999))

    with open("emr_default.conf", "w") as f:
        f.writelines("HADOOP_DATA_SOURCE_NAME=\"{0}\"\n".format(
            cr.get_config("emr_cluster_name")))
        f.writelines(
            "HADOOP_DATA_SOURCE_DISTRO=\"{0}\"\n".format("Amazon EMR5"))
        f.writelines(
            "HADOOP_DATA_SOURCE_HOST=\"{0}\"\n".format(master_dns_hostname))
        f.writelines("HADOOP_DATA_SOURCE_POST=\"8020\"\n")
        f.writelines("HADOOP_DATA_SOURCE_USER=\"hdfs\"\n")
        f.writelines("HADOOP_DATA_SOURCE_GROUP=\"hadoop\"\n")
        f.writelines(
            "HADOOP_DATA_SOURCE_JT_HOST=\"{0}\"\n".format(master_dns_hostname))
        f.writelines("HADOOP_DATA_SOURCE_JT_PORT=\"8032\"\n")
        f.writelines(email_str)
        f.writelines(conn_params_str)
Example #46
from boto.emr.connection import EmrConnection

# Description:
# EmrConnection can be used to create a new emr job

# initialize emr connection
conn = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")

# run job flow with 10 instances
jobflow_id = conn.run_jobflow(num_instances=10,
                              master_instance_type="m1.small",
                              slave_instance_type="m1.small")
Example #47
class EMRCluster(object):
  '''Representation of an EMR cluster.
     TODO: add bridge to boto interface for unit test.
  '''
  emr_status_delay = 10      # in sec
  emr_status_max_delay = 60  # in sec
  emr_status_max_error = 30  # number of errors
  emr_max_idle = 10 * 60     # 10 min (in sec)
  rate_limit_lock = RateLimitLock()

  def __init__(self, prop):
    '''Constructor, initialize EMR connection.'''
    self.prop = prop
    self.conn = EmrConnection(self.prop.ec2.key, self.prop.ec2.secret)
    self.jobid = None
    self.retry = 0
    self.level = 0
    self.last_update = -1

  @property
  def priority(self):
    '''The priority used in EMRManager.
       The lower value, the higher priority.
    '''
    with EMRCluster.rate_limit_lock:
      if self.jobid is None:
        return 1
      return 0

  def get_instance_groups(self):
    '''Get instance groups to start a cluster.
       It calculates the price with self.level, which indicates the
       price upgrades from the original price.
    '''
    instance_groups = []
    for group in self.prop.emr.instance_groups:
      (num, group_name, instance_type) = group
      level = max(0, min(self.level, len(self.prop.emr.price_upgrade_rate) - 1))  # 0 <= level < len(...)
      bprice = self.prop.emr.prices[instance_type] * self.prop.emr.price_upgrade_rate[level]
      name = '%s-%s@%f' % (group_name, 'SPOT', bprice)

      # Use on-demand instance if prices are zero.
      if bprice > 0:
        ig = InstanceGroup(num, group_name, instance_type, 'SPOT', name, '%.3f' % bprice)
      else:
        ig = InstanceGroup(num, group_name, instance_type, 'ON_DEMAND', name)

      instance_groups.append(ig)      

    return instance_groups

  def get_bootstrap_actions(self):
    '''Get list of bootstrap actions from property'''
    actions = []
    for bootstrap_action in self.prop.emr.bootstrap_actions:
      assert len(bootstrap_action) >= 2, 'Wrong bootstrap action definition: ' + str(bootstrap_action)
      actions.append(BootstrapAction(bootstrap_action[0], bootstrap_action[1], bootstrap_action[2:]))
    return actions

  @synchronized
  def start(self):
    '''Start a EMR cluster.'''
    # emr.project_name is required
    if self.prop.emr.project_name is None:
      raise ValueError('emr.project_name is not set')

    self.last_update = time.time()
    with EMRCluster.rate_limit_lock:
      self.jobid = self.conn.run_jobflow(name=self.prop.emr.cluster_name,
                                         ec2_keyname=self.prop.emr.keyname,
                                         log_uri=self.prop.emr.log_uri,
                                         ami_version=self.prop.emr.ami_version,
                                         bootstrap_actions=self.get_bootstrap_actions(),
                                         keep_alive=True,
                                         action_on_failure='CONTINUE',
                                         api_params={'VisibleToAllUsers': 'true'},
                                         instance_groups=self.get_instance_groups())
    message('Job flow created: %s', self.jobid)

    # Tag EC2 instances to allow future analysis
    tags = {'FlowControl': 'Briefly',
            'Project': self.prop.emr.project_name}
    if self.prop.emr.tags is not None:
      assert isinstance(self.prop.emr.tags, dict)
      tags = dict(tags.items() + self.prop.emr.tags.items())
    self.conn.add_tags(self.jobid, tags)

  @synchronized
  def terminate(self, level_upgrade=0):
    '''Terminate this EMR cluster.'''
    if self.jobid is None:
      return

    self.level += level_upgrade # upgrade to another price level

    message('Terminate jobflow: %s', self.jobid)
    for i in xrange(3):
      try:
        with EMRCluster.rate_limit_lock:
          self.conn.terminate_jobflow(self.jobid)
        break
      except Exception, e:
        message('Unable to terminate job flow: %s', self.jobid)
        message(traceback.format_exc())
    # We have to set jobid as None to create new cluster;
    # otherwise, run_steps will keep launching jobs on the bad cluster.
    self.jobid = None
Example #48
def create_emr_cluster(cr):
    """
    @PARAM:  Cluster configuration reader object
    Creates an EMR cluster given a set of configuration parameters
    Return:  EMR Cluster ID
    """

    #region = cr.get_config("aws_region")
    #conn = boto.emr.connect_to_region(region)
    conn = EmrConnection(
        cr.get_config("aws_access_key"),
        cr.get_config("aws_secret_key"),
        region=RegionInfo(name=cr.get_config("aws_region"),
                          endpoint=cr.get_config("aws_region") +
                          ".elasticmapreduce.amazonaws.com"))

    #  Create list of instance groups:  master, core, and task
    instance_groups = []
    instance_groups.append(
        InstanceGroup(num_instances=cr.get_config("emr_master_node_count"),
                      role="MASTER",
                      type=cr.get_config("emr_master_node_type"),
                      market=cr.get_config("emr_market_type"),
                      name="Master Node"))

    instance_groups.append(
        InstanceGroup(num_instances=cr.get_config("emr_core_node_count"),
                      role="CORE",
                      type=cr.get_config("emr_core_node_type"),
                      market=cr.get_config("emr_market_type"),
                      name="Core Node"))

    #  Only create task nodes if specifically asked for
    if cr.get_config("emr_task_node_count") > 0:
        instance_groups.append(
            InstanceGroup(num_instances=cr.get_config("emr_task_node_count"),
                          role="TASK",
                          type=cr.get_config("emr_task_node_type"),
                          market=cr.get_config("emr_market_type"),
                          name="Task Node"))

    print "Creating EMR Cluster with instance groups: {0}".format(
        instance_groups)

    #  Use these params to add overrides; these will go away in Boto3
    api_params = {
        "Instances.Ec2SubnetId": cr.get_config("aws_subnet_id"),
        "ReleaseLabel": cr.get_config("emr_version")
    }

    #  Add step to load data
    step_args = [
        "s3-dist-cp", "--s3Endpoint=s3-us-west-1.amazonaws.com",
        "--src=s3://alpine-qa/automation/automation_test_data/",
        "--dest=hdfs:///automation_test_data", "--srcPattern=.*[a-zA-Z,]+"
    ]
    step = JarStep(name="s3distcp for data loading",
                   jar="command-runner.jar",
                   step_args=step_args,
                   action_on_failure="CONTINUE")

    cluster_id = conn.run_jobflow(
        cr.get_config("emr_cluster_name"),
        instance_groups=instance_groups,
        action_on_failure="TERMINATE_JOB_FLOW",
        keep_alive=True,
        enable_debugging=True,
        log_uri=cr.get_config("emr_log_uri"),
        #hadoop_version = "Amazon 2.7.2",
        #ReleaseLabel = "emr-5.0.0",
        #ami_version = "5.0.0",
        steps=[step],
        bootstrap_actions=[],
        ec2_keyname=cr.get_config("ec2_keyname"),
        visible_to_all_users=True,
        job_flow_role="EMR_EC2_DefaultRole",
        service_role="EMR_DefaultRole",
        api_params=api_params)

    print "EMR Cluster created, cluster id: {0}".format(cluster_id)
    state = conn.describe_cluster(cluster_id).status.state
    while state not in (u'COMPLETED', u'SHUTTING_DOWN', u'FAILED', u'WAITING'):
        #sleeping to recheck for status.
        time.sleep(5)
        state = conn.describe_cluster(cluster_id).status.state
        print "State is: {0}, sleeping 5s...".format(state)

    if state == u'SHUTTING_DOWN' or state == u'FAILED':
        return "ERROR"

    #Check if the state is WAITING. Then launch the next steps
    if state == u'WAITING':
        #Finding the master node dns of EMR cluster
        master_dns = conn.describe_cluster(cluster_id).masterpublicdnsname
        print "DNS Name: {0}".format(master_dns)
        return cluster_id
Example #49
from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep
import boto

AWS_KEY = '<aws-access-key-id>'
AWS_SECRET = '<aws-secret-access-key>'

conn = EmrConnection(AWS_KEY, AWS_SECRET)

filelist="""split_!.txt
split_".txt
split_$.txt
split_%.txt
split_&.txt
split_'.txt
split_(.txt
split_).txt
split_*.txt
split_+.txt
split_,.txt
split_-.txt
split_0.txt
split_1.txt
split_2.txt
split_3.txt
split_4.txt
split_5.txt
split_6.txt
split_7.txt
split_8.txt
split_9.txt
Example #50
class EmrHiveRuntime(HiveRuntime):
    def __init__(self, spec_filename="spec.json"):
        import boto
        from boto.emr.connection import EmrConnection, RegionInfo

        super(HiveRuntime, self).__init__(spec_filename)
        p = self.settings.Param
        self.s3_conn = boto.connect_s3(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET)
        self.s3_bucket = self.s3_conn.get_bucket(p.S3_BUCKET)
        self.region = p.AWS_Region
        self.emr_conn = EmrConnection(p.AWS_ACCESS_KEY_ID, p.AWS_ACCESS_KEY_SECRET,
                region = RegionInfo(name = self.region,
                    endpoint = self.region + '.elasticmapreduce.amazonaws.com'))
        self.job_flow_id = p.EMR_jobFlowId

    def get_s3_working_dir(self, path=""):
        ps = self.settings
        glb_vars = ps.GlobalParam
        return os.path.join('zetjob', glb_vars['userName'], "job%s" % glb_vars['jobId'], "blk%s" % glb_vars['blockId'], path)

    def get_emr_job_name(self):
        ps = self.settings
        glb_vars = ps.GlobalParam
        return os.path.join('zetjob', glb_vars['userName'], "job%s" % glb_vars['jobId'], "blk%s" % glb_vars['blockId'])

    def s3_upload_dir(self, local_dir):
        print("EmrHiveRuntime.s3_uploader()")
        print("s3_upload_dir :::: %s" % local_dir)
        s3_upload_dir = self.get_s3_working_dir(local_dir)
        ext_files = [f for f in sorted(os.listdir(local_dir)) if os.path.isfile(os.path.join(local_dir,f))]
        for f in ext_files:
            f_local = os.path.join(local_dir, f)
            f_remote = os.path.join(s3_upload_dir, local_dir, f)
            f_remote_full = os.path.join("s3://", self.s3_bucket.name, f_remote)

            print("S3 Upload      :: %s ====> %s" % (f_local, s3_upload_dir))
            print("S3 remote_full :: %s" % f_remote_full)
            yield s3_upload(self.s3_bucket, f_remote, f_local)

    def files_uploader(self, local_dir):
        return self.s3_upload_dir(local_dir)

    def clean_s3_working_dir(self):
        s3_working_dir = self.get_s3_working_dir()
        if not s3_delete(self.s3_bucket, s3_working_dir):
            # TODO : refactor to 'HiveException'
            raise Exception("Can not clean s3 path : %s" % s3_working_dir)

    def clean_working_dir(self):
        self.clean_s3_working_dir()

    def emr_execute_hive(self, s3_hive_script):
        from boto.emr.step import HiveStep
        hive_step = HiveStep(name=self.get_emr_job_name(), hive_file=s3_hive_script)
        self.emr_conn.add_jobflow_steps(self.job_flow_id, steps=[hive_step])
        emr_wait_job(self.emr_conn, self.job_flow_id)

    def execute(self, main_hive_script, generated_hive_script=None):
        self.clean_working_dir()
        hive_script_local = self.generate_script(main_hive_script, generated_hive_script)

        s3_working_dir = self.get_s3_working_dir()
        hive_script_remote = os.path.join(s3_working_dir, os.path.basename(hive_script_local))
        hive_script_remote_full = s3_upload(self.s3_bucket, hive_script_remote, hive_script_local)
        print(hive_script_remote_full)
        print("EmrHiveRuntime.execute()")
        self.emr_execute_hive(hive_script_remote_full)
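# emr_wait_job(), s3_upload() and s3_delete() are project helpers that are not
# shown in this excerpt. A hedged sketch of what emr_wait_job() could look like
# with boto's EMR API (an illustration, not the project's actual code):
import time

def emr_wait_job(emr_conn, job_flow_id, poll_seconds=30):
    """Block until no step on the job flow is pending or running."""
    while True:
        states = [s.status.state
                  for s in emr_conn.list_steps(job_flow_id).steps]
        if not any(state in (u'PENDING', u'RUNNING') for state in states):
            return states
        time.sleep(poll_seconds)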
Example #51
from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep
import boto

AWS_KEY='<aws-access-key-id>'
AWS_SECRET='<aws-secret-access-key>'

conn = EmrConnection(AWS_KEY, AWS_SECRET)

step = StreamingStep(name='My wordcount example',
                      mapper='s3n://css739/wordcount/bigramSplitter.py',
                      reducer='aggregate',
                      input='s3n://smalldata/wikipedia_titles.txt',
                      output='s3n://css739/wordcount/bigram_count_output2',
                      cache_files=['s3n://css739/wordcount/english_stoplist.py'])
                      
                      
jobid = conn.run_jobflow(name='My jobflow', log_uri='s3n://css739/wordcount/jobflow_logs',steps=[step])

conn.describe_jobflow(jobid).state
Example #52
# Define the master, core, and task nodes of an EMR (Elastic MapReduce) job.
from boto.emr.connection import EmrConnection
from boto.emr.instance_group import InstanceGroup

# build up our instance groups
namenode_instance_group = InstanceGroup(num_instances=1,
                                        role="MASTER",
                                        type="c1.xlarge",
                                        market="ON_DEMAND",
                                        name="MASTER_GROUP")

core_nodes = InstanceGroup(num_instances=20,
                           role="CORE",
                           type="c1.xlarge",
                           market="SPOT",
                           name="CORE_GROUP",
                           bidprice="0.25")  # example bid price; boto requires one when market="SPOT"

task_nodes = InstanceGroup(num_instances=10,
                           role="TASK",
                           type="c1.xlarge",
                           market="ON_DEMAND",
                           name="INITIAL_TASK_GROUP")

instance_groups = [namenode_instance_group, core_nodes, task_nodes]


# run the job
conn = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")
conn.run_jobflow(name="My Job Flow",
                 instance_groups=instance_groups)
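
# Capacity can also be changed after launch: boto's add_instance_groups adds new
# groups to a running job flow.  A minimal sketch; the job flow id, group size and
# bid price below are illustrative assumptions:
extra_task_nodes = InstanceGroup(num_instances=5,
                                 role="TASK",
                                 type="c1.xlarge",
                                 market="SPOT",
                                 name="EXTRA_TASK_GROUP",
                                 bidprice="0.25")
conn.add_instance_groups("<job-flow-id>", [extra_task_nodes])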


Example #53
from boto.emr.bootstrap_action import BootstrapAction
from boto.emr.connection import EmrConnection

# Description:
# BootstrapAction is an object representing a bootstrap action in Elastic Map
# Reduce (EMR), a script that gets run before the EMR job executes.

# initialize a bootstrap action
bootstrapSetup = BootstrapAction("Bootstrap Name",
                                 "s3://<my-bucket>/<my-bootstrap-action>",
                                 ["arg1=hello", "arg2=world"])

# initialize emr connection
emr_job = EmrConnection("<aws-access-key-id>", "<aws-secret-access-key>")

# run emr job flow with defined bootstrap action
emr_job.run_jobflow(name="<my-job-flow>",  # a job flow name is required
                    bootstrap_actions=[bootstrapSetup])
Example #54
k.set_contents_from_filename(
    "/Users/winteram/Documents/Teaching/WebAnalytics_2013S/BIA660-2013S/course_docs/20130319/mapper.py"
)
k.close()

k = Key(b)
k.key = "reducer.py"
k.set_contents_from_filename(
    "/Users/winteram/Documents/Teaching/WebAnalytics_2013S/BIA660-2013S/course_docs/20130319/reducer.py"
)
k.close()


### Running code with EMR
emrcon = EmrConnection('<aws access key>', '<aws secret key>')

step = StreamingStep(
    name="Alcohol Step",
    mapper="s3n://bia660-winter/mapper.py",
    reducer="s3n://bia660-winter/reducer.py",
    input="s3://datasets.elasticmapreduce/ngrams/books/20090715/eng-us-all/3gram/data",
    output="s3n://bia660-winter/output/alcohol_religion",
)

jobid = emrcon.run_jobflow(
    name="Alcohol Religion 10", log_uri="s3://bia660-winter/logfiles", steps=[step], num_instances=4
)
print "Job created: %s" % jobid

status = emrcon.describe_jobflow(jobid)
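# The returned job flow description exposes the current state, e.g.:
print "Current state: %s" % status.state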
Example #55
# add your Amazon credentials to your bash environment using the variable names read below
import os
import time

from boto.emr.connection import EmrConnection
from boto.emr.step import StreamingStep
from boto.s3.connection import S3Connection
from boto.s3.key import Key

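# e.g. something along these lines in ~/.bashrc (values are placeholders):
#   export S3_BKT=<your-bucket-name>
#   export AWSAccessKeyId=<aws-access-key-id>
#   export AWSSecretKeyId=<aws-secret-access-key>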
s3_bkt = os.environ['S3_BKT']
aws_access_key = os.environ['AWSAccessKeyId']
aws_secret_key = os.environ['AWSSecretKeyId']

def status_check(emr_conn, jobid):
  status = 0
  while status not in ['COMPLETED', 'FAILED', 'TERMINATED' ]:
    status = emr_conn.describe_jobflow(jobid).state
    print 'running %s: state is %s' % (jobid, status)
    time.sleep(30)


if __name__ == '__main__':
  #connect to s3 and emr
  emr_conn = EmrConnection(aws_access_key, aws_secret_key)
  s3_conn = S3Connection(aws_access_key, aws_secret_key)

  #upload mapper
  bucket = s3_conn.create_bucket(s3_bkt)
  k = Key(bucket)
  k.key = 'mapper.py'
  k.set_contents_from_filename('mapper.py')

  #where data comes from
  mapper_uri = 's3n://%s/mapper.py' % (s3_bkt)
  output_uri = 's3n://%s/output' % (s3_bkt)
  log_uri = 's3n://%s/log' % (s3_bkt)

  #configure the step
  wc_step = StreamingStep(name='My Hello World Count',
    mapper=mapper_uri,
    reducer='aggregate',                      # assumes the built-in aggregate reducer
    input='s3n://<my-input-bucket>/input',    # placeholder: point at your own input data
    output=output_uri)

  #run the job flow and wait for it to reach a terminal state
  jobid = emr_conn.run_jobflow(name='My Hello World Count',
    log_uri=log_uri,
    steps=[wc_step])
  status_check(emr_conn, jobid)

Example #56
    def post(self):
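        # Check whether an EMR Monte-Carlo job is in flight (temp_para[6] == 1).
        # When the cluster is WAITING, collect its output, update the pi estimate
        # and either finish or submit another step; otherwise just re-render the
        # last saved result.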
        if not boto.config.has_section('Boto'):
            boto.config.add_section('Boto')
        boto.config.set('Boto', 'https_validate_certificates', 'False')
        note = ''
        data_para = [0, 0, 0, 0, 0]
        s3_connection = S3Connection(access_id, access_key)
        bucket = s3_connection.get_bucket('bucket774')
        k = Key(bucket)
        k.key = 'temp_para.json'
        temp_para = json.loads(k.get_contents_as_string())
        if (temp_para[6] == 1):
            k.key = 'cluster_id'
            cluster_id = k.get_contents_as_string()
            conn = EmrConnection(access_id, access_key)
            if (temp_para[7] == 0):
                status = conn.describe_cluster(cluster_id)
                if (status.status.state == 'WAITING'):
                    PYdata = get_output()
                    conn.terminate_jobflow(cluster_id)
                    data = in_circle_to_pi(PYdata, temp_para[0])
                    k.key = 'temp_para.json'
                    temp_para[6] = 0
                    k.set_contents_from_string(json.dumps(temp_para))
                    data_para[0:4] = temp_para[0:4]
                    data_para[4] = json.loads(data)[-1]
                    note = 'last EMR job done, result has been updated'
                    save_result(data, json.dumps(data_para))

                else:
                    note = 'last EMR calculation has not finished, please wait.'
                    k.key = 'record.json'
                    data = k.get_contents_as_string()
                    k.key = 'record_para.json'
                    data_para_json = k.get_contents_as_string()
                    data_para = json.loads(data_para_json)
            elif (temp_para[7] == 1):
                status = conn.describe_cluster(cluster_id)
                if (status.status.state == 'WAITING'):
                    k.key = 'temp_data.json'
                    PYdata = np.array(json.loads(k.get_contents_as_string()))
                    PYdata += get_output()
                    if (round(
                            np.sum(PYdata) / (temp_para[3] * temp_para[5]),
                            temp_para[4]) == round(math.pi, temp_para[4])):
                        for i in range(1, len(PYdata)):
                            PYdata[i] += PYdata[i - 1]
                            PYdata[i - 1] /= temp_para[0] * (i) * temp_para[5]
                        PYdata[len(PYdata) -
                               1] /= temp_para[0] * len(PYdata) * temp_para[5]
                        data = json.dumps(
                            PYdata.tolist())  # convert numpy array to list

                        k.key = 'temp_para.json'
                        temp_para[6] = 0
                        k.set_contents_from_string(json.dumps(temp_para))
                        data_para[0:4] = temp_para[0:4]
                        data_para[4] = json.loads(data)[-1]
                        conn.terminate_jobflow(cluster_id)
                        note = 'last EMR job done, result has been updated'
                        save_result(data, json.dumps(data_para))
                    else:
                        note = str(np.sum(PYdata)) + ',' + str(
                            temp_para[3]) + ',' + str(temp_para[5])
                        add_step_emr(conn, cluster_id)
                        save_temp_result(PYdata)
                        for key in bucket.list(prefix='output/'):
                            key.delete()
                        temp_para[5] += 1
                        k.key = 'temp_para.json'
                        k.set_contents_from_string(json.dumps(temp_para))
                        #note = 'have not reached the given accuracy in the last run, keep working'
                        k.key = 'record.json'
                        data = k.get_contents_as_string()
                        k.key = 'record_para.json'
                        data_para_json = k.get_contents_as_string()
                        data_para = json.loads(data_para_json)
                else:
                    note = 'last EMR calculation has not finished, please wait.'
                    k.key = 'record.json'
                    data = k.get_contents_as_string()
                    k.key = 'record_para.json'
                    data_para_json = k.get_contents_as_string()
                    data_para = json.loads(data_para_json)
        else:
            k.key = 'record.json'
            data = k.get_contents_as_string()
            k.key = 'record_para.json'
            data_para_json = k.get_contents_as_string()
            data_para = json.loads(data_para_json)

        doRender(
            self, 'chart.htm', {
                'Data': data,
                'shots_each_threat': data_para[0],
                'R': data_para[1],
                'Q': data_para[2],
                'pi': math.pi,
                'shots': data_para[3],
                'result': data_para[4],
                'note': note
            })