Example #1
    def is_dir_test(self):
        s = 's3://foo/bar/baz.csv'
        self.assertFalse(is_dir(s))
        s = 's3://foo/bar/baz/'
        self.assertTrue(is_dir(s))
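
The is_dir helper itself isn't shown on this page. A minimal sketch, assuming it simply treats an S3 URI with a trailing slash as a 'directory'; using re.search also preserves the behavior that is_dir(None) raises TypeError, which the constructors in Examples #2 and #3 catch:

import re

def is_dir(path):
    # Assumed reconstruction, not the library's actual code: an S3
    # 'directory' is any URI ending in '/'. re.search raises TypeError
    # when path is None, matching the try/except TypeError below.
    return bool(re.search(r'/$', path))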
Example #2
    def __init__(self, job_name=None, input_path=None, hive_query=None,
                 output_dir=None, scratch_uri=None, log_path=None,
                 ami_version=None, hive_version=None, num_instances=None,
                 master_instance_type=None, slave_instance_type=None,
                 aws_access_key_id=None, aws_secret_access_key=None,
                 s3_sync_wait_time=5, check_emr_status_every=30,
                 temp_dir=None):

        self.job_name = job_name
        self.job_id = self._generate_job_id()
        self.start_time = time.time()

        # AWS credentials: use the arguments if supplied (and copy them
        # into ENV); otherwise fall back to the existing ENV values
        if aws_access_key_id:
            os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
            self.aws_access_key_id = aws_access_key_id
        else:
            logger.debug("Getting AWS access key from ENV")
            self.aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']

        if aws_secret_access_key:
            os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
            self.aws_secret_access_key = aws_secret_access_key
        else:
            logger.debug("Getting AWS secret key from ENV")
            self.aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']

        self.s3_sync_wait_time = s3_sync_wait_time
        self.check_emr_status_every = check_emr_status_every

        # I/O for job data
        self.input_path = input_path
        self.output_dir = output_dir

        # is the input multiple files in a 'directory'?
        try:
            self.input_is_dir = is_dir(input_path)
        except TypeError:
            self.input_is_dir = False

        # the Hive script object
        self.hive_query = hive_query

        # EMR options
        self.master_instance_type = master_instance_type
        self.slave_instance_type = slave_instance_type
        self.ami_version = ami_version
        self.hive_version = hive_version
        self.num_instances = num_instances

        # S3 'scratch' directory
        if scratch_uri:
            self.base_path = scratch_uri
            os.environ['S3_SCRATCH_URI'] = scratch_uri
        else:
            self.base_path = os.environ['S3_SCRATCH_URI']

        # allow alternate logging path
        self.log_path = log_path or self.base_path + 'logs/'
        # other temp files live under a jobID prefix
        self.job_files = self.base_path + self.job_id + '/'
        self.data_path = self.job_files + 'data'
        if self.input_is_dir:
            self.data_path += '/'
        self.table_path = self.job_files + 'tables/'
        self.script_path = self.job_files + 'script.hql'
        self.output_path = self.output_dir or self.job_files + 'output/'

        # a local temp dir is used to write the script
        self.local_script_file = get_script_file_location(self.job_id,
                                                          temp_dir)

        logger.info("JobID {0}, started at {1}".format(self.job_id,
                                                       self.start_time))
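
For context, a hedged usage sketch, not from the source: the class name HiveJob and every argument value here are illustrative only. Note that scratch_uri needs a trailing '/', since log_path, job_files, and the other paths are built by plain string concatenation onto base_path:

job = HiveJob(
    job_name='daily_report',
    input_path='s3://foo/bar/baz/',         # trailing '/' -> input_is_dir is True
    hive_query='SELECT COUNT(*) FROM logs;',
    scratch_uri='s3://my-bucket/scratch/',  # must end in '/'
    aws_access_key_id='AKIA...',            # placeholders; omit to fall back to ENV
    aws_secret_access_key='...',
)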
Example #3
    def __init__(self,
                 job_name=None,
                 input_path=None,
                 hive_query=None,
                 output_dir=None,
                 scratch_uri=None,
                 log_path=None,
                 ami_version=None,
                 hive_version=None,
                 num_instances=None,
                 master_instance_type=None,
                 slave_instance_type=None,
                 iam_instance_profile=None,
                 iam_service_role=None,
                 aws_access_key_id=None,
                 aws_secret_access_key=None,
                 s3_sync_wait_time=5,
                 check_emr_status_every=30,
                 temp_dir=None):

        self.job_name = job_name
        self.job_id = self._generate_job_id()
        self.start_time = time.time()

        # AWS credentials: use the arguments if supplied (and copy them
        # into ENV); otherwise fall back to the existing ENV values
        if aws_access_key_id:
            os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
            self.aws_access_key_id = aws_access_key_id
        else:
            logger.debug("Getting AWS access key from ENV")
            self.aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']

        if aws_secret_access_key:
            os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
            self.aws_secret_access_key = aws_secret_access_key
        else:
            logger.debug("Getting AWS secret key from ENV")
            self.aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']

        self.s3_sync_wait_time = s3_sync_wait_time
        self.check_emr_status_every = check_emr_status_every

        # I/O for job data
        self.input_path = input_path
        self.output_dir = output_dir

        # is the input multiple files in a 'directory'?
        try:
            self.input_is_dir = is_dir(input_path)
        except TypeError:
            self.input_is_dir = False

        # the Hive script object
        self.hive_query = hive_query

        # EMR options
        self.master_instance_type = master_instance_type
        self.slave_instance_type = slave_instance_type
        self.ami_version = ami_version
        self.hive_version = hive_version
        self.num_instances = num_instances
        self.iam_instance_profile = iam_instance_profile
        self.iam_service_role = iam_service_role

        # S3 'scratch' directory
        if scratch_uri:
            self.base_path = scratch_uri
            os.environ['S3_SCRATCH_URI'] = scratch_uri
        else:
            self.base_path = os.environ['S3_SCRATCH_URI']

        # allow alternate logging path
        self.log_path = log_path or self.base_path + 'logs/'
        # other temp files live under a jobID prefix
        self.job_files = self.base_path + self.job_id + '/'
        self.data_path = self.job_files + 'data'
        if self.input_is_dir:
            self.data_path += '/'
        self.table_path = self.job_files + 'tables/'
        self.script_path = self.job_files + 'script.hql'
        self.output_path = self.output_dir or self.job_files + 'output/'

        # a local temp dir is used to write the script
        self.local_script_file = get_script_file_location(
            self.job_id, temp_dir)

        logger.info("JobID {0}, started at {1}".format(self.job_id,
                                                       self.start_time))
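
Example #3 is the same constructor as Example #2, extended with iam_instance_profile and iam_service_role and storing them alongside the other EMR options. Amazon EMR requires an EC2 instance profile and a service role when launching clusters on newer release versions, which is presumably why the signature grew these two parameters.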