def get_script_file_location_test(self):
    """get_script_file_location() honours APIARIST_TMP_DIR, else uses /tmp/.

    Fix: the original deleted APIARIST_TMP_DIR and left "/bar/baz/" set on
    exit, leaking environment state into any test that runs afterwards.
    The var is now saved up front and restored in a finally block.
    """
    # Remove the var (remembering its prior value) so the default path
    # branch is exercised first.
    saved = os.environ.pop('APIARIST_TMP_DIR', None)
    try:
        l, j = "/foo/bar", "abcdef1234567890"
        # no ENV var, no explicit dir -> /tmp/ default
        self.assertEqual(get_script_file_location(j), '/tmp/' + j + '.hql')
        # explicit dir argument is used verbatim (no separator is added)
        self.assertEqual(get_script_file_location(j, l), l + j + '.hql')
        # test with ENV var
        os.environ['APIARIST_TMP_DIR'] = "/bar/baz/"
        self.assertEqual(get_script_file_location(j),
                         '/bar/baz/' + j + '.hql')
    finally:
        # Restore the caller's environment whether or not assertions passed.
        if saved is None:
            os.environ.pop('APIARIST_TMP_DIR', None)
        else:
            os.environ['APIARIST_TMP_DIR'] = saved
def __init__(self, job_name=None, input_path=None, hive_query=None,
             output_dir=None, temp_dir=None, no_output=False,
             retain_hive_table=False):
    """Set up a local Hive job: identity, scratch-dir layout and query.

    All job paths are derived from the local scratch directory; an
    explicit ``output_dir`` overrides the default output location.
    """
    # TODO test for Hive installation
    self.job_name = job_name
    self.job_id = self._generate_job_id()
    self.start_time = time.time()

    # I/O for job data
    self.scratch_dir = self.get_local_scratch_dir(temp_dir)
    self.stream_output = not no_output
    self.data_path = self.scratch_dir + 'data'
    self.table_path = self.scratch_dir + 'table'
    self.input_path = os.path.abspath(input_path)
    if output_dir:
        # caller-supplied output dir gets a per-job subdirectory
        self.output_dir = os.path.abspath(output_dir) + '/' + self.job_id
    else:
        self.output_dir = self.scratch_dir + 'output'

    # the Hive script object
    self.hive_query = hive_query
    self.local_script_file = get_script_file_location(
        self.job_id, self.scratch_dir)
    self.retain_hive_table = retain_hive_table
def __init__(self, job_name=None, input_path=None, hive_query=None,
             output_dir=None, temp_dir=None, no_output=False,
             retain_hive_table=False):
    """Initialise a local job: id, start time, scratch paths and query."""
    # TODO test for Hive installation
    self.job_name = job_name
    self.job_id = self._generate_job_id()
    self.start_time = time.time()

    # I/O for job data: everything hangs off the local scratch dir
    scratch = self.get_local_scratch_dir(temp_dir)
    self.scratch_dir = scratch
    self.stream_output = not no_output
    self.data_path = scratch + 'data'
    self.table_path = scratch + 'table'
    self.input_path = os.path.abspath(input_path)
    self.output_dir = (
        os.path.abspath(output_dir) + '/' + self.job_id
        if output_dir
        else scratch + 'output'
    )

    # the Hive script object
    self.hive_query = hive_query
    self.local_script_file = get_script_file_location(self.job_id, scratch)
    self.retain_hive_table = retain_hive_table
def __init__(self, job_name=None, input_path=None, hive_query=None,
             output_dir=None, scratch_uri=None, log_path=None,
             ami_version=None, hive_version=None, num_instances=None,
             master_instance_type=None, slave_instance_type=None,
             iam_instance_profile=None, iam_service_role=None,
             aws_access_key_id=None, aws_secret_access_key=None,
             s3_sync_wait_time=5, check_emr_status_every=30, temp_dir=None):
    """Set up an EMR-backed Hive job.

    Resolves AWS credentials (explicit arguments override and also export
    to the process environment), records EMR cluster/polling options, and
    derives all S3 paths for this job from a scratch base path plus the
    generated job id.

    :param input_path: job input; may be a multi-file 'directory'
        (detected via ``is_dir``) or a single file.
    :param output_dir: explicit output location; defaults to a path
        under the job's S3 scratch area.
    :param scratch_uri: S3 base path; falls back to the
        ``S3_SCRATCH_URI`` env var (raises KeyError if neither is set).
    :param aws_access_key_id, aws_secret_access_key: credentials; when
        omitted, read from the environment (raises KeyError if unset).
    :param s3_sync_wait_time: seconds to wait for S3 syncing.
    :param check_emr_status_every: EMR status polling interval (seconds).
    :param temp_dir: local directory for the generated Hive script file.
    """
    self.job_name = job_name
    self.job_id = self._generate_job_id()
    self.start_time = time.time()
    # AWS credentials can come from arguments or environment
    # set AWS credentials if supplied (override ENV)
    if aws_access_key_id:
        # explicit key is also exported to the environment
        os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
        self.aws_access_key_id = aws_access_key_id
    else:
        logger.debug("Getting AWS access key from ENV")
        # KeyError here means no credentials were supplied at all
        self.aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
    if aws_secret_access_key:
        os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
        self.aws_secret_access_key = aws_secret_access_key
    else:
        logger.debug("Getting AWS secret key from ENV")
        self.aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']
    self.s3_sync_wait_time = s3_sync_wait_time
    self.check_emr_status_every = check_emr_status_every
    # I/O for job data
    self.input_path = input_path
    self.output_dir = output_dir
    # is the input multiple files in a 'directory'?
    try:
        self.input_is_dir = is_dir(input_path)
    except TypeError:
        # input_path was None (or not path-like): treat as a single file
        self.input_is_dir = False
    # the Hive script object
    self.hive_query = hive_query
    # EMR options
    self.master_instance_type = master_instance_type
    self.slave_instance_type = slave_instance_type
    self.ami_version = ami_version
    self.hive_version = hive_version
    self.num_instances = num_instances
    self.iam_instance_profile = iam_instance_profile
    self.iam_service_role = iam_service_role
    # S3 'scratch' directory
    if scratch_uri:
        self.base_path = scratch_uri
        # exported so the rest of the process sees the same scratch URI
        os.environ['S3_SCRATCH_URI'] = scratch_uri
    else:
        self.base_path = os.environ['S3_SCRATCH_URI']
    # allow alternate logging path
    # NOTE(review): concatenation assumes base_path ends with '/' — confirm
    self.log_path = log_path or self.base_path + 'logs/'
    # other temp files live in a jobID bucket
    self.job_files = self.base_path + self.job_id + '/'
    self.data_path = self.job_files + 'data'
    if self.input_is_dir:
        # directory inputs get a trailing slash
        self.data_path += '/'
    self.table_path = self.job_files + 'tables/'
    self.script_path = self.job_files + 'script.hql'
    # explicit output_dir wins over the default under job_files
    self.output_path = self.output_dir or self.job_files + 'output/'
    # a local temp dir is used to write the script
    self.local_script_file = get_script_file_location(
        self.job_id, temp_dir)
    logger.info("JobID {0}, started at {1}".format(self.job_id,
                                                   self.start_time))
def __init__(self, job_name=None, input_path=None, hive_query=None,
             output_dir=None, scratch_uri=None, log_path=None,
             ami_version=None, hive_version=None, num_instances=None,
             master_instance_type=None, slave_instance_type=None,
             aws_access_key_id=None, aws_secret_access_key=None,
             s3_sync_wait_time=5, check_emr_status_every=30, temp_dir=None):
    """Set up an EMR-backed Hive job (variant without IAM role options).

    Resolves AWS credentials (explicit arguments override and also export
    to the process environment), records EMR cluster/polling options, and
    derives all S3 paths for this job from a scratch base path plus the
    generated job id.

    :param input_path: job input; may be a multi-file 'directory'
        (detected via ``is_dir``) or a single file.
    :param output_dir: explicit output location; defaults to a path
        under the job's S3 scratch area.
    :param scratch_uri: S3 base path; falls back to the
        ``S3_SCRATCH_URI`` env var (raises KeyError if neither is set).
    :param aws_access_key_id, aws_secret_access_key: credentials; when
        omitted, read from the environment (raises KeyError if unset).
    :param s3_sync_wait_time: seconds to wait for S3 syncing.
    :param check_emr_status_every: EMR status polling interval (seconds).
    :param temp_dir: local directory for the generated Hive script file.
    """
    self.job_name = job_name
    self.job_id = self._generate_job_id()
    self.start_time = time.time()
    # AWS credentials can come from arguments or environment
    # set AWS credentials if supplied (override ENV)
    if aws_access_key_id:
        # explicit key is also exported to the environment
        os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
        self.aws_access_key_id = aws_access_key_id
    else:
        logger.debug("Getting AWS access key from ENV")
        # KeyError here means no credentials were supplied at all
        self.aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
    if aws_secret_access_key:
        os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
        self.aws_secret_access_key = aws_secret_access_key
    else:
        logger.debug("Getting AWS secret key from ENV")
        self.aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']
    self.s3_sync_wait_time = s3_sync_wait_time
    self.check_emr_status_every = check_emr_status_every
    # I/O for job data
    self.input_path = input_path
    self.output_dir = output_dir
    # is the input multiple files in a 'directory'?
    try:
        self.input_is_dir = is_dir(input_path)
    except TypeError:
        # input_path was None (or not path-like): treat as a single file
        self.input_is_dir = False
    # the Hive script object
    self.hive_query = hive_query
    # EMR options
    self.master_instance_type = master_instance_type
    self.slave_instance_type = slave_instance_type
    self.ami_version = ami_version
    self.hive_version = hive_version
    self.num_instances = num_instances
    # S3 'scratch' directory
    if scratch_uri:
        self.base_path = scratch_uri
        # exported so the rest of the process sees the same scratch URI
        os.environ['S3_SCRATCH_URI'] = scratch_uri
    else:
        self.base_path = os.environ['S3_SCRATCH_URI']
    # allow alternate logging path
    # NOTE(review): concatenation assumes base_path ends with '/' — confirm
    self.log_path = log_path or self.base_path + 'logs/'
    # other temp files live in a jobID bucket
    self.job_files = self.base_path + self.job_id + '/'
    self.data_path = self.job_files + 'data'
    if self.input_is_dir:
        # directory inputs get a trailing slash
        self.data_path += '/'
    self.table_path = self.job_files + 'tables/'
    self.script_path = self.job_files + 'script.hql'
    # explicit output_dir wins over the default under job_files
    self.output_path = self.output_dir or self.job_files + 'output/'
    # a local temp dir is used to write the script
    self.local_script_file = get_script_file_location(self.job_id, temp_dir)
    logger.info("JobID {0}, started at {1}".format(self.job_id,
                                                   self.start_time))