예제 #1
0
import dateutil.parser
from dateutil import tz
from boto.emr.connection import EmrConnection
from boto.s3.connection import S3Connection
from ucsd_bigdata.credentials import Credentials
import gzip

if __name__ == "__main__":
    # Pick an EMR cluster id: either from the first command line argument or
    # interactively from the list of RUNNING/WAITING clusters, then print it.

    # `sys` is used below (argv / exit) but is not among the imports visible
    # at the top of this script, so bring it into scope here.
    import sys

    # On Python 2 the builtin input() *evaluates* what the user types;
    # raw_input() returns the text unchanged. Fall back to input() on
    # Python 3, where raw_input no longer exists and input() is safe.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    credentials = Credentials()
    aws_access_key_id = credentials.aws_access_key_id
    aws_secret_access_key = credentials.aws_secret_access_key

    emr_conn = EmrConnection(aws_access_key_id, aws_secret_access_key)

    # List EMR Clusters
    clusters = emr_conn.list_clusters(cluster_states=["RUNNING", "WAITING"])
    available = clusters.clusters

    # Single-argument print(...) behaves identically on Python 2 and 3.
    for index, cluster in enumerate(available):
        print("[%s] %s" % (index, cluster.id))

    # if there is a command line arg, use it for the cluster_id
    if len(sys.argv) > 1:
        cluster_id = sys.argv[1]
    else:
        if len(available) == 0:
            sys.exit("No EMR clusters running.")
        selected_cluster = read_line("Select a Cluster: ")
        try:
            cluster_id = available[int(selected_cluster)].id
        except (ValueError, IndexError):
            # Non-numeric or out-of-range answer: exit with a readable
            # message instead of a traceback.
            sys.exit("Invalid cluster selection: %r" % (selected_cluster,))

    print(cluster_id)
예제 #2
0
class EmrClient(object):
    """Helper for launching, driving and shutting down an EMR job flow.

    Holds an ``EmrConnection`` in ``self.emr_connection`` and adds blocking
    polling on top of it, so the public methods only return once the cluster
    has reached the requested state (or raise on failure / timeout).
    """

    # The Hadoop version to use
    HADOOP_VERSION = '1.0.3'

    # The AMI version to use
    AMI_VERSION = '2.4.7'

    # Interval to wait between polls to EMR cluster in seconds
    CLUSTER_OPERATION_RESULTS_POLLING_SECONDS = 10

    # Timeout for EMR creation and ramp up in seconds
    CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS = 60 * 30

    # States from which a cluster will not become WAITING any more
    _NOT_STARTABLE_STATES = (
        u'COMPLETED', u'SHUTTING_DOWN', u'FAILED', u'TERMINATED')

    # States that indicate a shutdown finished without error
    _CLEAN_SHUTDOWN_STATES = (u'TERMINATED', u'COMPLETED')

    def __init__(self,
                 region_name='us-east-1',
                 aws_access_key_id=None,
                 aws_secret_access_key=None):
        """Open the EMR connection for ``region_name``.

        :param region_name: AWS region, used to build the EMR endpoint name.
        :param aws_access_key_id: access key; when falsy it is read from the
            ``[aws]`` section of the luigi configuration.
        :param aws_secret_access_key: secret key; when falsy it is read from
            the ``[aws]`` section of the luigi configuration.
        """
        # If the access key is not specified, get it from the luigi config.cfg file
        if not aws_access_key_id:
            aws_access_key_id = luigi.configuration.get_config().get(
                'aws', 'aws_access_key_id')

        if not aws_secret_access_key:
            aws_secret_access_key = luigi.configuration.get_config().get(
                'aws', 'aws_secret_access_key')

        # Create the region in which to run
        region_endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
        region = RegionInfo(name=region_name, endpoint=region_endpoint)

        self.emr_connection = EmrConnection(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region=region)

    def launch_emr_cluster(
        self,
        cluster_name,
        log_uri,
        ec2_keyname=None,
        master_type='m1.small',
        core_type='m1.small',
        num_instances=2,
        hadoop_version='1.0.3',
        ami_version='2.4.7',
    ):
        """Start a keep-alive job flow and block until it is WAITING.

        :param cluster_name: name given to the job flow.
        :param log_uri: location passed to EMR as the log URI.
        :param ec2_keyname: optional EC2 key pair name.
        :param master_type: instance type of the master node.
        :param core_type: instance type of the slave nodes.
        :param num_instances: total number of instances.
        :param hadoop_version: Hadoop version to request; the default matches
            ``EmrClient.HADOOP_VERSION``.
        :param ami_version: AMI version to request; the default matches
            ``EmrClient.AMI_VERSION``.
        :returns: the id of the new job flow.
        :raises RuntimeError: if the cluster fails to start or does not
            become ready before the timeout.
        """
        # The caller-supplied versions are honoured here; previously they
        # were silently ignored in favour of the class constants.
        jobflow_id = self.emr_connection.run_jobflow(
            name=cluster_name,
            log_uri=log_uri,
            ec2_keyname=ec2_keyname,
            master_instance_type=master_type,
            slave_instance_type=core_type,
            num_instances=num_instances,
            keep_alive=True,
            enable_debugging=True,
            hadoop_version=hadoop_version,
            steps=[],
            ami_version=ami_version)

        # Log important information
        status = self.emr_connection.describe_jobflow(jobflow_id)

        logger.info('Creating new cluster %s with following details',
                    status.name)
        logger.info('jobflow ID:\t%s', status.jobflowid)
        logger.info('Log URI:\t%s', status.loguri)
        logger.info('Master Instance Type:\t%s', status.masterinstancetype)

        # A cluster of size 1 does not have any slave instances
        if hasattr(status, 'slaveinstancetype'):
            logger.info('Slave Instance Type:\t%s', status.slaveinstancetype)

        logger.info('Number of Instances:\t%s', status.instancecount)
        logger.info('Hadoop Version:\t%s', status.hadoopversion)
        logger.info('AMI Version:\t%s', status.amiversion)
        logger.info('Keep Alive:\t%s', status.keepjobflowalivewhennosteps)

        return self._poll_until_cluster_ready(jobflow_id)

    def add_pig_step(self,
                     jobflow_id,
                     pig_file,
                     name='Pig Script',
                     pig_versions='latest',
                     pig_args=None):
        """Add a Pig step to ``jobflow_id`` and block until it is WAITING again.

        :param jobflow_id: id of the job flow to add the step to.
        :param pig_file: Pig script handed to ``PigStep``.
        :param name: display name of the step.
        :param pig_versions: Pig version selector handed to ``PigStep``.
        :param pig_args: optional list of arguments for the script; ``None``
            means no arguments.
        :returns: ``jobflow_id``.
        :raises RuntimeError: if the cluster leaves the usable states or the
            wait times out.
        """
        # Avoid a shared mutable default: build a fresh list per call.
        if pig_args is None:
            pig_args = []

        pig_step = PigStep(
            name=name,
            pig_file=pig_file,
            pig_versions=pig_versions,
            pig_args=pig_args,
        )

        self.emr_connection.add_jobflow_steps(jobflow_id, [pig_step])

        # Poll until the cluster is done working
        return self._poll_until_cluster_ready(jobflow_id)

    def shutdown_emr_cluster(self, jobflow_id):
        """Terminate ``jobflow_id`` and wait for the shutdown to finish.

        :returns: ``True`` if the cluster ended TERMINATED/COMPLETED,
            ``False`` if it ended FAILED.
        :raises RuntimeError: if the shutdown does not finish before the
            timeout.
        """
        self.emr_connection.terminate_jobflow(jobflow_id)
        return self._poll_until_cluster_shutdown(jobflow_id)

    def get_jobflow_id(self):
        """Return the id of the first cluster that is WAITING for work.

        :raises IndexError: if no cluster is currently WAITING.
        """
        waiting = self.emr_connection.list_clusters(cluster_states=['WAITING'])
        return waiting.clusters[0].id

    def get_master_dns(self):
        """
        Get the master node's public address
        """
        # Get the jobflow ID (this used to call get_master_dns() itself,
        # which recursed without end)
        jobflow_id = self.get_jobflow_id()

        # Use the jobflow ID to get the status
        status = self.emr_connection.describe_jobflow(jobflow_id)

        # Return the master's public dns
        return status.masterpublicdnsname

    def _poll_until_cluster_ready(self, jobflow_id):
        """Poll the job flow until it reports WAITING.

        :returns: ``jobflow_id`` once the cluster is WAITING.
        :raises RuntimeError: if the cluster reaches a state it cannot start
            from (after waiting for it to shut down), or if the timeout
            elapses first.
        """
        deadline = (time.time() +
                    EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS)

        while time.time() < deadline:
            # Get the state
            state = self.emr_connection.describe_jobflow(jobflow_id).state

            if state == u'WAITING':
                logger.info('Cluster intialized and is WAITING for work')
                return jobflow_id

            if state in EmrClient._NOT_STARTABLE_STATES:
                logger.error('Error starting cluster; status: %s', state)

                # Poll until cluster shutdown
                self._poll_until_cluster_shutdown(jobflow_id)
                raise RuntimeError('Error, cluster failed to start')

            logger.debug('Cluster state: %s', state)
            time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)

        # TODO shutdown cluster
        raise RuntimeError(
            'Timed out waiting for EMR cluster to be active')

    def _poll_until_cluster_shutdown(self, jobflow_id):
        """Poll the job flow until it has shut down.

        :returns: ``True`` when the cluster ended TERMINATED or COMPLETED,
            ``False`` when it ended FAILED.
        :raises RuntimeError: if the timeout elapses before the cluster
            reaches one of those states.
        """
        deadline = (time.time() +
                    EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS)

        while time.time() < deadline:
            # Get the state
            state = self.emr_connection.describe_jobflow(jobflow_id).state

            if state in EmrClient._CLEAN_SHUTDOWN_STATES:
                logger.info('Cluster successfully shutdown with status: %s',
                            state)
                # Report success; this branch used to return False, which
                # made a clean shutdown indistinguishable from a failed one.
                return True

            if state == u'FAILED':
                logger.error('Cluster shutdown with FAILED status')
                return False

            logger.debug('Cluster state: %s', state)
            time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)

        # TODO shutdown cluster
        raise RuntimeError(
            'Timed out waiting for EMR cluster to shut down')
예제 #3
0
from dateutil import tz
from boto.emr.connection import EmrConnection
from boto.s3.connection import S3Connection
from ucsd_bigdata.credentials import Credentials
import gzip


if __name__ == "__main__":
    # Resolve the EMR cluster id to work with: take it from the first command
    # line argument when given, otherwise let the user choose from the
    # RUNNING/WAITING clusters. The chosen id is printed at the end.

    # `sys` is needed for argv / exit but is not among the imports visible
    # at the top of this script, so import it here.
    import sys

    # Python 2's input() evaluates the typed text as an expression; use
    # raw_input() there and fall back to input() where raw_input is gone.
    try:
        prompt = raw_input
    except NameError:
        prompt = input

    credentials = Credentials()
    aws_access_key_id = credentials.aws_access_key_id
    aws_secret_access_key = credentials.aws_secret_access_key

    emr_conn = EmrConnection(aws_access_key_id, aws_secret_access_key)

    # List EMR Clusters
    clusters = emr_conn.list_clusters(cluster_states=["RUNNING", "WAITING"])
    candidates = clusters.clusters

    # print(...) with one argument gives the same output on Python 2 and 3.
    for index, cluster in enumerate(candidates):
        print("[%s] %s" % (index, cluster.id))

    # if there is a command line arg, use it for the cluster_id
    if len(sys.argv) > 1:
        cluster_id = sys.argv[1]
    else:
        if len(candidates) == 0:
            sys.exit("No EMR clusters running.")
        selected_cluster = prompt("Select a Cluster: ")
        try:
            cluster_id = candidates[int(selected_cluster)].id
        except (ValueError, IndexError):
            # Answer was not a number or not one of the listed indexes.
            sys.exit("Invalid cluster selection: %r" % (selected_cluster,))

    print(cluster_id)
예제 #4
0
파일: emr_client.py 프로젝트: mbrio/Luigi
class EmrClient(object):
    """Convenience wrapper for running work on an EMR job flow.

    Keeps an ``EmrConnection`` in ``self.emr_connection`` and layers blocking
    polling over it: the public methods return only when the cluster is in
    the expected state, and raise on failure or timeout.
    """

    # The Hadoop version to use
    HADOOP_VERSION = '1.0.3'

    # The AMI version to use
    AMI_VERSION = '2.4.7'

    # Interval to wait between polls to EMR cluster in seconds
    CLUSTER_OPERATION_RESULTS_POLLING_SECONDS = 10

    # Timeout for EMR creation and ramp up in seconds
    CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS = 60 * 30

    # States from which the cluster can no longer reach WAITING
    _START_FAILURE_STATES = (
        u'COMPLETED', u'SHUTTING_DOWN', u'FAILED', u'TERMINATED')

    # States reported after a shutdown that finished without error
    _SHUTDOWN_SUCCESS_STATES = (u'TERMINATED', u'COMPLETED')

    def __init__(self, region_name='us-east-1', aws_access_key_id=None, aws_secret_access_key=None):
        """Create the EMR connection for ``region_name``.

        :param region_name: AWS region used to build the EMR endpoint name.
        :param aws_access_key_id: access key; read from the ``[aws]`` section
            of the luigi configuration when falsy.
        :param aws_secret_access_key: secret key; read from the ``[aws]``
            section of the luigi configuration when falsy.
        """
        # If the access key is not specified, get it from the luigi config.cfg file
        if not aws_access_key_id:
            aws_access_key_id = luigi.configuration.get_config().get('aws', 'aws_access_key_id')

        if not aws_secret_access_key:
            aws_secret_access_key = luigi.configuration.get_config().get('aws', 'aws_secret_access_key')

        # Create the region in which to run
        region_endpoint = u'elasticmapreduce.%s.amazonaws.com' % (region_name)
        region = RegionInfo(name=region_name, endpoint=region_endpoint)

        self.emr_connection = EmrConnection(aws_access_key_id=aws_access_key_id,
                                            aws_secret_access_key=aws_secret_access_key,
                                            region=region)

    def launch_emr_cluster(self, cluster_name, log_uri, ec2_keyname=None, master_type='m1.small', core_type='m1.small', num_instances=2, hadoop_version='1.0.3', ami_version='2.4.7', ):
        """Start a keep-alive job flow and wait until it is WAITING.

        :param cluster_name: name given to the job flow.
        :param log_uri: location passed to EMR as the log URI.
        :param ec2_keyname: optional EC2 key pair name.
        :param master_type: instance type of the master node.
        :param core_type: instance type of the slave nodes.
        :param num_instances: total number of instances.
        :param hadoop_version: Hadoop version to request (default equals
            ``EmrClient.HADOOP_VERSION``).
        :param ami_version: AMI version to request (default equals
            ``EmrClient.AMI_VERSION``).
        :returns: the id of the new job flow.
        :raises RuntimeError: if the cluster fails to start or is not ready
            before the timeout.
        """
        # Pass the requested versions through; they used to be ignored and
        # replaced by the class constants.
        jobflow_id = self.emr_connection.run_jobflow(name=cluster_name,
                                                     log_uri=log_uri,
                                                     ec2_keyname=ec2_keyname,
                                                     master_instance_type=master_type,
                                                     slave_instance_type=core_type,
                                                     num_instances=num_instances,
                                                     keep_alive=True,
                                                     enable_debugging=True,
                                                     hadoop_version=hadoop_version,
                                                     steps=[],
                                                     ami_version=ami_version)

        # Log important information
        status = self.emr_connection.describe_jobflow(jobflow_id)

        logger.info('Creating new cluster %s with following details', status.name)
        logger.info('jobflow ID:\t%s', status.jobflowid)
        logger.info('Log URI:\t%s', status.loguri)
        logger.info('Master Instance Type:\t%s', status.masterinstancetype)

        # A cluster of size 1 does not have any slave instances
        if hasattr(status, 'slaveinstancetype'):
            logger.info('Slave Instance Type:\t%s', status.slaveinstancetype)

        logger.info('Number of Instances:\t%s', status.instancecount)
        logger.info('Hadoop Version:\t%s', status.hadoopversion)
        logger.info('AMI Version:\t%s', status.amiversion)
        logger.info('Keep Alive:\t%s', status.keepjobflowalivewhennosteps)

        return self._poll_until_cluster_ready(jobflow_id)

    def add_pig_step(self, jobflow_id, pig_file, name='Pig Script', pig_versions='latest', pig_args=None):
        """Add a Pig step to ``jobflow_id`` and wait until it is WAITING again.

        :param jobflow_id: id of the job flow to add the step to.
        :param pig_file: Pig script handed to ``PigStep``.
        :param name: display name of the step.
        :param pig_versions: Pig version selector handed to ``PigStep``.
        :param pig_args: optional list of script arguments; ``None`` means
            no arguments.
        :returns: ``jobflow_id``.
        :raises RuntimeError: if the cluster leaves the usable states or the
            wait times out.
        """
        # A list literal as default would be shared between calls.
        if pig_args is None:
            pig_args = []

        pig_step = PigStep(name=name,
                           pig_file=pig_file,
                           pig_versions=pig_versions,
                           pig_args=pig_args)

        self.emr_connection.add_jobflow_steps(jobflow_id, [pig_step])

        # Poll until the cluster is done working
        return self._poll_until_cluster_ready(jobflow_id)

    def shutdown_emr_cluster(self, jobflow_id):
        """Terminate ``jobflow_id`` and wait for it to shut down.

        :returns: ``True`` if the cluster ended TERMINATED/COMPLETED,
            ``False`` if it ended FAILED.
        :raises RuntimeError: if the shutdown does not finish before the
            timeout.
        """
        self.emr_connection.terminate_jobflow(jobflow_id)
        return self._poll_until_cluster_shutdown(jobflow_id)

    def get_jobflow_id(self):
        """Return the id of the first cluster that is WAITING for work.

        :raises IndexError: if no cluster is currently WAITING.
        """
        waiting = self.emr_connection.list_clusters(cluster_states=['WAITING'])
        return waiting.clusters[0].id

    def get_master_dns(self):
        """
        Get the master node's public address
        """
        # Get the jobflow ID. The previous code called get_master_dns() here,
        # i.e. itself, and therefore never returned.
        jobflow_id = self.get_jobflow_id()

        # Use the jobflow ID to get the status
        status = self.emr_connection.describe_jobflow(jobflow_id)

        # Return the master's public dns
        return status.masterpublicdnsname

    def _poll_until_cluster_ready(self, jobflow_id):
        """Poll the job flow until it reports WAITING.

        :returns: ``jobflow_id`` once the cluster is WAITING.
        :raises RuntimeError: if the cluster reaches a state it cannot start
            from (after waiting for it to shut down), or if the timeout
            elapses first.
        """
        deadline = time.time() + EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS

        while time.time() < deadline:
            # Get the state
            state = self.emr_connection.describe_jobflow(jobflow_id).state

            if state == u'WAITING':
                logger.info('Cluster intialized and is WAITING for work')
                return jobflow_id

            if state in EmrClient._START_FAILURE_STATES:
                logger.error('Error starting cluster; status: %s', state)

                # Poll until cluster shutdown
                self._poll_until_cluster_shutdown(jobflow_id)
                raise RuntimeError('Error, cluster failed to start')

            logger.debug('Cluster state: %s', state)
            time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)

        # TODO shutdown cluster
        raise RuntimeError('Timed out waiting for EMR cluster to be active')

    def _poll_until_cluster_shutdown(self, jobflow_id):
        """Poll the job flow until it has shut down.

        :returns: ``True`` when the cluster ended TERMINATED or COMPLETED,
            ``False`` when it ended FAILED.
        :raises RuntimeError: if the timeout elapses before the cluster
            reaches one of those states.
        """
        deadline = time.time() + EmrClient.CLUSTER_OPERATION_RESULTS_TIMEOUT_SECONDS

        while time.time() < deadline:
            # Get the state
            state = self.emr_connection.describe_jobflow(jobflow_id).state

            if state in EmrClient._SHUTDOWN_SUCCESS_STATES:
                logger.info('Cluster successfully shutdown with status: %s', state)
                # Success is reported as True; this branch used to return
                # False, the same value as the FAILED branch below.
                return True

            if state == u'FAILED':
                logger.error('Cluster shutdown with FAILED status')
                return False

            logger.debug('Cluster state: %s', state)
            time.sleep(EmrClient.CLUSTER_OPERATION_RESULTS_POLLING_SECONDS)

        # TODO shutdown cluster
        raise RuntimeError('Timed out waiting for EMR cluster to shut down')