def reload_read_request_queues(
        job_description_file, job_ids, redis_host, redis_port, redis_db,
        skip_phase_zero, skip_phase_one, phase_zero_sample_size):
    # Load the job description and connect to the coordinator's redis
    # database.
    with open(job_description_file, 'r') as fp:
        job_description = json.load(fp)

    coordinator_db = redis_utils.CoordinatorDB(redis_host, redis_port, redis_db)

    # Gather the paths of the job's input files.
    input_files = input_file_utils.gather_input_file_paths(
        coordinator_db, job_description["input_directory"])

    # Only reload read requests for phases that aren't being skipped.
    phases = []
    if not skip_phase_zero:
        phases.append(0)
    if not skip_phase_one:
        phases.append(1)

    read_requests = input_file_utils.generate_read_requests(
        input_files, phase_zero_sample_size, job_ids, phases)

    input_file_utils.load_read_requests(coordinator_db, read_requests)
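# A minimal sketch of how reload_read_request_queues might be driven from the
# command line; the flag names and defaults below are illustrative assumptions,
# not the project's actual CLI.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Repopulate worker read request queues for existing jobs")
    parser.add_argument("job_description_file",
                        help="JSON file describing the job")
    parser.add_argument("job_ids", type=int, nargs="+",
                        help="IDs of the jobs whose queues should be reloaded")
    parser.add_argument("--redis_host", default="localhost")
    parser.add_argument("--redis_port", type=int, default=6379)
    parser.add_argument("--redis_db", type=int, default=0)
    parser.add_argument("--skip_phase_zero", action="store_true")
    parser.add_argument("--skip_phase_one", action="store_true")
    parser.add_argument("--phase_zero_sample_size", type=int, default=100000)

    args = parser.parse_args()
    reload_read_request_queues(**vars(args))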
def setup_new_job(self, job_description, job_id):
    # Put intermediates and outputs for this job in their own
    # sub-directories.
    job_description["intermediate_directory"] += "/job_%d" % job_id
    job_description["output_directory"] += "/job_%d" % job_id

    self.coordinator_db.new_job_info(job_id, job_description)

    # Extract a list of all input files from the directory
    # corresponding to the input location.
    input_dir = job_description["input_directory"]

    max_input_files_per_disk = None
    if "max_input_files_per_disk" in job_description:
        max_input_files_per_disk = \
            job_description["max_input_files_per_disk"]

    input_path_struct = gather_input_file_paths(
        self.coordinator_db, input_dir, max_input_files_per_disk)

    if input_path_struct is not None:
        (worker_inputs, total_input_size) = input_path_struct
    else:
        worker_inputs = None
        total_input_size = 0

    input_files_error = (worker_inputs is None or len(worker_inputs) == 0 or
                         total_input_size == 0)

    if worker_inputs is None:
        error_msg = "Unable to list input directory '%s'" % (input_dir)
    elif len(worker_inputs) == 0:
        error_msg = ("Didn't find any input files in directory '%s' " %
                     (input_dir))
    elif total_input_size == 0:
        error_msg = "Total length of all input files is 0B"

    if input_files_error:
        self.fail_job(job_id, error_msg)
        log.error("Job %d failed: %s" % (job_id, error_msg))
        self.coordinator_db.update_job_status(
            job_id, {"fail_message": error_msg}, post_status="Failed")
        return None

    self.coordinator_db.update_job_status(
        job_id, {"total_input_size_bytes": total_input_size})

    return worker_inputs
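# Illustrative-only sketch (not the coordinator's actual control flow) of how
# setup_new_job could feed the read request pipeline shown above. start_job is
# a hypothetical method name, generate_read_requests / load_read_requests are
# assumed to be importable alongside gather_input_file_paths, and worker_inputs
# is assumed to be in the form generate_read_requests expects.
def start_job(self, job_description, job_id, phase_zero_sample_size):
    worker_inputs = self.setup_new_job(job_description, job_id)
    if worker_inputs is None:
        # setup_new_job has already marked the job as failed and logged why.
        return False

    # Issue read requests for phase zero (sampling) and phase one, then hand
    # them to the workers through the coordinator database.
    read_requests = generate_read_requests(
        worker_inputs, phase_zero_sample_size, [job_id], [0, 1])
    load_read_requests(self.coordinator_db, read_requests)
    return True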