def is_dir_test(self):
    s = 's3://foo/bar/baz.csv'
    self.assertFalse(is_dir(s))

    s = 's3://foo/bar/baz/'
    self.assertTrue(is_dir(s))
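
# `is_dir` itself is not shown in this section. A minimal sketch consistent
# with the assertions above (assumption: S3 has no real directories, so a
# trailing slash marks a 'directory' prefix) might look like this:
def is_dir(path):
    # Slicing (rather than .endswith) makes is_dir(None) raise TypeError,
    # which __init__ below catches to mean "no input path".
    return path[-1:] == '/'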
def __init__(self, job_name=None, input_path=None, hive_query=None,
             output_dir=None, scratch_uri=None, log_path=None,
             ami_version=None, hive_version=None, num_instances=None,
             master_instance_type=None, slave_instance_type=None,
             aws_access_key_id=None, aws_secret_access_key=None,
             s3_sync_wait_time=5, check_emr_status_every=30,
             temp_dir=None):
    self.job_name = job_name
    self.job_id = self._generate_job_id()
    self.start_time = time.time()

    # AWS credentials can come from arguments or environment;
    # set AWS credentials if supplied (override ENV)
    if aws_access_key_id:
        os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
        self.aws_access_key_id = aws_access_key_id
    else:
        logger.debug("Getting AWS access key from ENV")
        self.aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']

    if aws_secret_access_key:
        os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
        self.aws_secret_access_key = aws_secret_access_key
    else:
        logger.debug("Getting AWS secret key from ENV")
        self.aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']

    self.s3_sync_wait_time = s3_sync_wait_time
    self.check_emr_status_every = check_emr_status_every

    # I/O for job data
    self.input_path = input_path
    self.output_dir = output_dir

    # is the input multiple files in a 'directory'?
    try:
        self.input_is_dir = is_dir(input_path)
    except TypeError:
        self.input_is_dir = False

    # the Hive script object
    self.hive_query = hive_query

    # EMR options
    self.master_instance_type = master_instance_type
    self.slave_instance_type = slave_instance_type
    self.ami_version = ami_version
    self.hive_version = hive_version
    self.num_instances = num_instances

    # S3 'scratch' directory
    if scratch_uri:
        self.base_path = scratch_uri
        os.environ['S3_SCRATCH_URI'] = scratch_uri
    else:
        self.base_path = os.environ['S3_SCRATCH_URI']

    # allow alternate logging path
    self.log_path = log_path or self.base_path + 'logs/'

    # other temp files live in a jobID bucket
    self.job_files = self.base_path + self.job_id + '/'
    self.data_path = self.job_files + 'data'
    if self.input_is_dir:
        self.data_path += '/'
    self.table_path = self.job_files + 'tables/'
    self.script_path = self.job_files + 'script.hql'
    self.output_path = self.output_dir or self.job_files + 'output/'

    # a local temp dir is used to write the script
    self.local_script_file = get_script_file_location(self.job_id, temp_dir)

    logger.info("JobID {0}, started at {1}".format(self.job_id,
                                                   self.start_time))
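
# `get_script_file_location` is referenced above but not defined in this
# section. A plausible sketch, assuming it only joins a per-job filename
# onto temp_dir (falling back to the system temp directory):
import os
import tempfile

def get_script_file_location(job_id, temp_dir=None):
    # Hypothetical helper: the local path where the generated Hive script
    # for this job is written before being uploaded to S3.
    base = temp_dir or tempfile.gettempdir()
    return os.path.join(base, '{0}.hql'.format(job_id))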
def __init__(self, job_name=None, input_path=None, hive_query=None,
             output_dir=None, scratch_uri=None, log_path=None,
             ami_version=None, hive_version=None, num_instances=None,
             master_instance_type=None, slave_instance_type=None,
             iam_instance_profile=None, iam_service_role=None,
             aws_access_key_id=None, aws_secret_access_key=None,
             s3_sync_wait_time=5, check_emr_status_every=30,
             temp_dir=None):
    self.job_name = job_name
    self.job_id = self._generate_job_id()
    self.start_time = time.time()

    # AWS credentials can come from arguments or environment;
    # set AWS credentials if supplied (override ENV)
    if aws_access_key_id:
        os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
        self.aws_access_key_id = aws_access_key_id
    else:
        logger.debug("Getting AWS access key from ENV")
        self.aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']

    if aws_secret_access_key:
        os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
        self.aws_secret_access_key = aws_secret_access_key
    else:
        logger.debug("Getting AWS secret key from ENV")
        self.aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']

    self.s3_sync_wait_time = s3_sync_wait_time
    self.check_emr_status_every = check_emr_status_every

    # I/O for job data
    self.input_path = input_path
    self.output_dir = output_dir

    # is the input multiple files in a 'directory'?
    try:
        self.input_is_dir = is_dir(input_path)
    except TypeError:
        self.input_is_dir = False

    # the Hive script object
    self.hive_query = hive_query

    # EMR options
    self.master_instance_type = master_instance_type
    self.slave_instance_type = slave_instance_type
    self.ami_version = ami_version
    self.hive_version = hive_version
    self.num_instances = num_instances
    self.iam_instance_profile = iam_instance_profile
    self.iam_service_role = iam_service_role

    # S3 'scratch' directory
    if scratch_uri:
        self.base_path = scratch_uri
        os.environ['S3_SCRATCH_URI'] = scratch_uri
    else:
        self.base_path = os.environ['S3_SCRATCH_URI']

    # allow alternate logging path
    self.log_path = log_path or self.base_path + 'logs/'

    # other temp files live in a jobID bucket
    self.job_files = self.base_path + self.job_id + '/'
    self.data_path = self.job_files + 'data'
    if self.input_is_dir:
        self.data_path += '/'
    self.table_path = self.job_files + 'tables/'
    self.script_path = self.job_files + 'script.hql'
    self.output_path = self.output_dir or self.job_files + 'output/'

    # a local temp dir is used to write the script
    self.local_script_file = get_script_file_location(
        self.job_id, temp_dir)

    logger.info("JobID {0}, started at {1}".format(self.job_id,
                                                   self.start_time))
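
# Hypothetical usage of the constructor above. `HiveJobRunner`,
# `my_hive_query`, and the bucket names are placeholders; the enclosing
# class is not shown in this section.
runner = HiveJobRunner(
    job_name='daily_report',
    input_path='s3://my-bucket/input/',          # trailing slash => directory input
    hive_query=my_hive_query,                    # a Hive script object (assumed)
    scratch_uri='s3://my-bucket/scratch/',       # derived paths assume a trailing '/'
    num_instances=3,
    master_instance_type='m1.large',
    slave_instance_type='m1.large',
    iam_instance_profile='EMR_EC2_DefaultRole',  # AWS default EMR roles
    iam_service_role='EMR_DefaultRole',
)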