def reload_read_request_queues(job_description_file, job_ids, redis_host,
                               redis_port, redis_db, skip_phase_zero,
                               skip_phase_one, phase_zero_sample_size):
    """Regenerate read requests for the given jobs and load them again.

    The job description (a JSON file) is parsed to find the job's
    ``input_directory``. The input files gathered for that directory are
    turned into read requests for every phase that is not skipped, and the
    requests are loaded through the coordinator DB.

    Args:
        job_description_file: Path to the JSON job description; it must
            contain an ``input_directory`` entry.
        job_ids: Job IDs handed to generate_read_requests().
        redis_host: Host used to construct the CoordinatorDB.
        redis_port: Port used to construct the CoordinatorDB.
        redis_db: Database used to construct the CoordinatorDB.
        skip_phase_zero: When truthy, no requests are generated for phase 0.
        skip_phase_one: When truthy, no requests are generated for phase 1.
        phase_zero_sample_size: Passed through to generate_read_requests().
    """
    with open(job_description_file, 'r') as description_fp:
        description = json.load(description_fp)

    db = redis_utils.CoordinatorDB(redis_host, redis_port, redis_db)

    inputs = input_file_utils.gather_input_file_paths(
        db, description["input_directory"])

    # Phases 0 and 1 are the only ones considered here; keep the ones
    # that were not explicitly skipped, in phase order.
    phase_skips = ((0, skip_phase_zero), (1, skip_phase_one))
    phases = [phase for phase, skipped in phase_skips if not skipped]

    # NOTE(review): the arguments are positional -- confirm the order
    # matches the current generate_read_requests() signature.
    requests = input_file_utils.generate_read_requests(
        inputs, phase_zero_sample_size, job_ids, phases)

    input_file_utils.load_read_requests(db, requests)
def reload_read_request_queues(job_description_file, job_ids, redis_host,
                               redis_port, redis_db, skip_phase_zero,
                               skip_phase_one, phase_zero_sample_size):
    """Regenerate read requests for the given jobs and load them again.

    The job description (a JSON file) is parsed to find the job's
    ``input_directory``. The input files gathered for that directory are
    turned into read requests for every phase that is not skipped, and the
    requests are loaded through the coordinator DB.

    Args:
        job_description_file: Path to the JSON job description; it must
            contain an ``input_directory`` entry.
        job_ids: Job IDs handed to generate_read_requests().
        redis_host: Host used to construct the CoordinatorDB.
        redis_port: Port used to construct the CoordinatorDB.
        redis_db: Database used to construct the CoordinatorDB.
        skip_phase_zero: When truthy, no requests are generated for phase 0.
        skip_phase_one: When truthy, no requests are generated for phase 1.
        phase_zero_sample_size: Passed through to generate_read_requests().
    """
    with open(job_description_file, 'r') as description_fp:
        description = json.load(description_fp)

    db = redis_utils.CoordinatorDB(redis_host, redis_port, redis_db)

    inputs = input_file_utils.gather_input_file_paths(
        db, description["input_directory"])

    # Phases 0 and 1 are the only ones considered here; keep the ones
    # that were not explicitly skipped, in phase order.
    phase_skips = ((0, skip_phase_zero), (1, skip_phase_one))
    phases = [phase for phase, skipped in phase_skips if not skipped]

    # NOTE(review): the arguments are positional -- confirm the order
    # matches the current generate_read_requests() signature.
    requests = input_file_utils.generate_read_requests(
        inputs, phase_zero_sample_size, job_ids, phases)

    input_file_utils.load_read_requests(db, requests)
def run_batch(self, batch_jobs, batch_inputs):
    """Prepare and launch a new batch made up of the given jobs.

    Creates the batch's log directory (copying the stage/structure
    descriptions and the config file into it), works out which phases
    need barriers and read requests, loads the read requests through the
    coordinator DB, records the batch and its jobs as started, and finally
    queues the batch on the node coordinators.

    Args:
        batch_jobs: IDs of the jobs in the batch. Must be non-empty: the
            first job's parameters supply the per-job overrides.
        batch_inputs: Inputs for the batch's jobs, passed to
            generate_read_requests() as ``job_inputs``.
    """
    def config_value(key, convert, default):
        # Converted config entry when the key is present, else the default.
        if key in self.config:
            return convert(self.config[key])
        return default

    batch_id = self.coordinator_db.next_batch_id

    log.info("Running batch %d with the following job(s): %s" %
             (batch_id, ', '.join(map(str, batch_jobs))))

    # Create log directory for the current batch
    batch_logs = create_batch_directory(self.log_directory, batch_id)

    # Copy description files to the log directory
    description_dir = os.path.join(
        os.path.dirname(__file__), os.pardir, os.pardir, os.pardir,
        "tritonsort", "mapreduce", "description")
    for description_file in ("stages.json", "structure.json"):
        shutil.copy(
            os.path.join(description_dir, description_file), batch_logs)

    # Copy config file to log directory
    shutil.copy(self.config_file, batch_logs)

    self.ready_for_next_batch = False

    # Pull out relevant phase zero parameters. By default 100% of the
    # input is sampled, with one sample point (the prefix) per file.
    phase_zero_sample_rate = config_value("SAMPLE_RATE", float, 1)
    phase_zero_sample_points_per_file = config_value(
        "SAMPLES_PER_FILE", int, 1)
    fixed_key_length = config_value("MAP_INPUT_FIXED_KEY_LENGTH", int, None)
    fixed_value_length = config_value(
        "MAP_INPUT_FIXED_VALUE_LENGTH", int, None)

    # If the application config file (yaml) or the job spec file (json)
    # skips a phase, we should not load read requests for that phase. The
    # job spec file should override the application config file.
    skip_keys = ("SKIP_PHASE_ZERO", "SKIP_PHASE_ONE",
                 "SKIP_PHASE_TWO", "SKIP_PHASE_THREE")
    skip = dict((key, 0) for key in skip_keys)
    for key in skip_keys:
        if key in self.config and self.config[key]:
            skip[key] = 1

    # The run_job.py script verifies that all jobs in the batch have the
    # same value of these skip parameters in the job specs, so we can just
    # check the first one.
    job_params = self.coordinator_db.job_params(batch_jobs[0])
    for key, value in job_params.items():
        if key in skip:
            # Stored as-is; only its truthiness is consulted below.
            skip[key] = value
        elif key == "MAP_INPUT_FIXED_KEY_LENGTH":
            fixed_key_length = int(value)
        elif key == "MAP_INPUT_FIXED_VALUE_LENGTH":
            fixed_value_length = int(value)

    # A fixed tuple length is only known when both parts are fixed.
    fixed_tuple_length = None
    if fixed_key_length is not None and fixed_value_length is not None:
        fixed_tuple_length = fixed_key_length + fixed_value_length

    use_replication = (
        "OUTPUT_REPLICATION_LEVEL" in self.config and
        int(self.config["OUTPUT_REPLICATION_LEVEL"]) > 1)

    # Phases zero and one are included unless skipped. If we're using
    # replication, phases two and three will have network transfer, so
    # they are included too: barriers guarantee sockets are connected.
    phase_rules = (
        (0, skip["SKIP_PHASE_ZERO"], True),
        (1, skip["SKIP_PHASE_ONE"], True),
        (2, skip["SKIP_PHASE_TWO"], use_replication),
        (3, skip["SKIP_PHASE_THREE"], use_replication),
    )
    phases = [phase for phase, skipped, wanted in phase_rules
              if not skipped and wanted]

    # Setup barriers
    self.coordinator_db.create_barriers(phases, batch_id, batch_jobs)

    # Generate read requests for the jobs in the batch
    read_requests = generate_read_requests(
        job_inputs=batch_inputs,
        phase_zero_sample_rate=phase_zero_sample_rate,
        phase_zero_sample_points_per_file=phase_zero_sample_points_per_file,
        tuple_start_offset=fixed_tuple_length,
        job_ids=batch_jobs,
        phases=phases)

    # Load read requests into read request queue for each worker
    load_read_requests(self.coordinator_db, read_requests)

    start_time = time.time()

    # Mark phase zero as starting now.
    # NOTE(review): this happens even when phase zero is skipped above --
    # confirm that is intended.
    self.coordinator_db.begin_phase(batch_id, "phase_zero")
    self.batch_phase_info[batch_id] = ("phase_zero", 0, start_time)

    log.info("Running phase_zero...")
    print_keyboard_commands()

    for job_id in batch_jobs:
        self.coordinator_db.update_job_status(
            job_id, {
                "start_time": str(start_time),
                "batch_id": batch_id,
                "date": time.asctime(),
            })

    self.coordinator_db.add_jobs_to_batch(batch_id, batch_jobs)
    self.coordinator_db.mark_batch_incomplete(batch_id)

    # Setting current_batch will cause all node coordinators to start work
    # on that batch
    self.coordinator_db.add_batch_to_node_coordinator_batch_queues(batch_id)
def run_batch(self, batch_jobs, batch_inputs):
    """Prepare and launch a new batch made up of the given jobs.

    Creates the batch's log directory (copying the stage/structure
    descriptions and the config file into it), works out which phases
    need barriers and read requests, loads the read requests through the
    coordinator DB, records the batch and its jobs as started, and finally
    queues the batch on the node coordinators.

    Args:
        batch_jobs: IDs of the jobs in the batch. Must be non-empty: the
            first job's parameters supply the per-job overrides.
        batch_inputs: Inputs for the batch's jobs, passed to
            generate_read_requests() as ``job_inputs``.
    """
    def config_value(key, convert, default):
        # Converted config entry when the key is present, else the default.
        if key in self.config:
            return convert(self.config[key])
        return default

    batch_id = self.coordinator_db.next_batch_id

    log.info("Running batch %d with the following job(s): %s" %
             (batch_id, ', '.join(map(str, batch_jobs))))

    # Create log directory for the current batch
    batch_logs = create_batch_directory(self.log_directory, batch_id)

    # Copy description files to the log directory
    description_dir = os.path.join(
        os.path.dirname(__file__), os.pardir, os.pardir, os.pardir,
        "tritonsort", "mapreduce", "description")
    for description_file in ("stages.json", "structure.json"):
        shutil.copy(
            os.path.join(description_dir, description_file), batch_logs)

    # Copy config file to log directory
    shutil.copy(self.config_file, batch_logs)

    self.ready_for_next_batch = False

    # Pull out relevant phase zero parameters. By default 100% of the
    # input is sampled, with one sample point (the prefix) per file.
    phase_zero_sample_rate = config_value("SAMPLE_RATE", float, 1)
    phase_zero_sample_points_per_file = config_value(
        "SAMPLES_PER_FILE", int, 1)
    fixed_key_length = config_value("MAP_INPUT_FIXED_KEY_LENGTH", int, None)
    fixed_value_length = config_value(
        "MAP_INPUT_FIXED_VALUE_LENGTH", int, None)

    # If the application config file (yaml) or the job spec file (json)
    # skips a phase, we should not load read requests for that phase. The
    # job spec file should override the application config file.
    skip_keys = ("SKIP_PHASE_ZERO", "SKIP_PHASE_ONE",
                 "SKIP_PHASE_TWO", "SKIP_PHASE_THREE")
    skip = dict((key, 0) for key in skip_keys)
    for key in skip_keys:
        if key in self.config and self.config[key]:
            skip[key] = 1

    # The run_job.py script verifies that all jobs in the batch have the
    # same value of these skip parameters in the job specs, so we can just
    # check the first one.
    job_params = self.coordinator_db.job_params(batch_jobs[0])
    for key, value in job_params.items():
        if key in skip:
            # Stored as-is; only its truthiness is consulted below.
            skip[key] = value
        elif key == "MAP_INPUT_FIXED_KEY_LENGTH":
            fixed_key_length = int(value)
        elif key == "MAP_INPUT_FIXED_VALUE_LENGTH":
            fixed_value_length = int(value)

    # A fixed tuple length is only known when both parts are fixed.
    fixed_tuple_length = None
    if fixed_key_length is not None and fixed_value_length is not None:
        fixed_tuple_length = fixed_key_length + fixed_value_length

    use_replication = (
        "OUTPUT_REPLICATION_LEVEL" in self.config and
        int(self.config["OUTPUT_REPLICATION_LEVEL"]) > 1)

    # Phases zero and one are included unless skipped. If we're using
    # replication, phases two and three will have network transfer, so
    # they are included too: barriers guarantee sockets are connected.
    phase_rules = (
        (0, skip["SKIP_PHASE_ZERO"], True),
        (1, skip["SKIP_PHASE_ONE"], True),
        (2, skip["SKIP_PHASE_TWO"], use_replication),
        (3, skip["SKIP_PHASE_THREE"], use_replication),
    )
    phases = [phase for phase, skipped, wanted in phase_rules
              if not skipped and wanted]

    # Setup barriers
    self.coordinator_db.create_barriers(phases, batch_id, batch_jobs)

    # Generate read requests for the jobs in the batch
    read_requests = generate_read_requests(
        job_inputs=batch_inputs,
        phase_zero_sample_rate=phase_zero_sample_rate,
        phase_zero_sample_points_per_file=phase_zero_sample_points_per_file,
        tuple_start_offset=fixed_tuple_length,
        job_ids=batch_jobs,
        phases=phases)

    # Load read requests into read request queue for each worker
    load_read_requests(self.coordinator_db, read_requests)

    start_time = time.time()

    # Mark phase zero as starting now.
    # NOTE(review): this happens even when phase zero is skipped above --
    # confirm that is intended.
    self.coordinator_db.begin_phase(batch_id, "phase_zero")
    self.batch_phase_info[batch_id] = ("phase_zero", 0, start_time)

    log.info("Running phase_zero...")
    print_keyboard_commands()

    for job_id in batch_jobs:
        self.coordinator_db.update_job_status(
            job_id, {
                "start_time": str(start_time),
                "batch_id": batch_id,
                "date": time.asctime(),
            })

    self.coordinator_db.add_jobs_to_batch(batch_id, batch_jobs)
    self.coordinator_db.mark_batch_incomplete(batch_id)

    # Setting current_batch will cause all node coordinators to start work
    # on that batch
    self.coordinator_db.add_batch_to_node_coordinator_batch_queues(batch_id)