def run_batch(self, batch_jobs, batch_inputs):
    batch_id = self.coordinator_db.next_batch_id

    log.info("Running batch %d with the following job(s): %s" %
             (batch_id, ', '.join(map(str, batch_jobs))))

    # Create log directory for the current batch
    batch_logs = create_batch_directory(self.log_directory, batch_id)

    # Copy description files to the log directory
    description_dir = os.path.join(
        os.path.dirname(__file__), os.pardir, os.pardir, os.pardir,
        "tritonsort", "mapreduce", "description")

    shutil.copy(os.path.join(description_dir, "stages.json"), batch_logs)
    shutil.copy(os.path.join(description_dir, "structure.json"), batch_logs)

    # Copy config file to log directory
    shutil.copy(self.config_file, batch_logs)

    self.ready_for_next_batch = False

    # Pull out relevant phase zero parameters
    phase_zero_sample_rate = 1  # Sample 100% by default
    if "SAMPLE_RATE" in self.config:
        phase_zero_sample_rate = float(self.config["SAMPLE_RATE"])

    phase_zero_sample_points_per_file = 1  # Sample prefixes by default
    if "SAMPLES_PER_FILE" in self.config:
        phase_zero_sample_points_per_file = \
            int(self.config["SAMPLES_PER_FILE"])

    fixed_key_length = None
    if "MAP_INPUT_FIXED_KEY_LENGTH" in self.config:
        fixed_key_length = int(self.config["MAP_INPUT_FIXED_KEY_LENGTH"])

    fixed_value_length = None
    if "MAP_INPUT_FIXED_VALUE_LENGTH" in self.config:
        fixed_value_length = \
            int(self.config["MAP_INPUT_FIXED_VALUE_LENGTH"])

    # If the application config file (yaml) or the job spec file (json)
    # skips a phase, we should not load read requests for that phase.
    # The job spec file overrides the application config file.
    skip_phase_zero = 0
    skip_phase_one = 0
    skip_phase_two = 0
    skip_phase_three = 0
    if "SKIP_PHASE_ZERO" in self.config and self.config["SKIP_PHASE_ZERO"]:
        skip_phase_zero = 1
    if "SKIP_PHASE_ONE" in self.config and self.config["SKIP_PHASE_ONE"]:
        skip_phase_one = 1
    if "SKIP_PHASE_TWO" in self.config and self.config["SKIP_PHASE_TWO"]:
        skip_phase_two = 1
    if "SKIP_PHASE_THREE" in self.config and \
            self.config["SKIP_PHASE_THREE"]:
        skip_phase_three = 1

    # The run_job.py script verifies that all jobs in the batch have the
    # same values for these skip parameters in their job specs, so we
    # only need to check the first job.
    for key, value in (
            self.coordinator_db.job_params(batch_jobs[0]).items()):
        if key == "SKIP_PHASE_ZERO":
            skip_phase_zero = value
        if key == "SKIP_PHASE_ONE":
            skip_phase_one = value
        if key == "SKIP_PHASE_TWO":
            skip_phase_two = value
        if key == "SKIP_PHASE_THREE":
            skip_phase_three = value
        if key == "MAP_INPUT_FIXED_KEY_LENGTH":
            fixed_key_length = int(value)
        if key == "MAP_INPUT_FIXED_VALUE_LENGTH":
            fixed_value_length = int(value)

    fixed_tuple_length = None
    if fixed_key_length is not None and fixed_value_length is not None:
        fixed_tuple_length = fixed_key_length + fixed_value_length

    use_replication = False
    if "OUTPUT_REPLICATION_LEVEL" in self.config and \
            int(self.config["OUTPUT_REPLICATION_LEVEL"]) > 1:
        use_replication = True

    phases = []
    if not skip_phase_zero:
        phases.append(0)
    if not skip_phase_one:
        phases.append(1)
    if not skip_phase_two and use_replication:
        # If we're using replication, phase two will have network
        # transfer; use barriers to guarantee sockets are connected.
        phases.append(2)
    if not skip_phase_three and use_replication:
        # If we're using replication, phase three will have network
        # transfer; use barriers to guarantee sockets are connected.
        phases.append(3)

    # Set up barriers
    self.coordinator_db.create_barriers(phases, batch_id, batch_jobs)

    # Generate read requests for the jobs in the batch
    read_requests = generate_read_requests(
        job_inputs=batch_inputs,
        phase_zero_sample_rate=phase_zero_sample_rate,
        phase_zero_sample_points_per_file=phase_zero_sample_points_per_file,
        tuple_start_offset=fixed_tuple_length,
        job_ids=batch_jobs,
        phases=phases)

    # Load read requests into the read request queue for each worker
    load_read_requests(self.coordinator_db, read_requests)

    start_time = time.time()

    # Mark phase zero as starting now.
    self.coordinator_db.begin_phase(batch_id, "phase_zero")
    self.batch_phase_info[batch_id] = ("phase_zero", 0, start_time)
    log.info("Running phase_zero...")
    print_keyboard_commands()

    for job_id in batch_jobs:
        self.coordinator_db.update_job_status(
            job_id, {"start_time": str(start_time),
                     "batch_id": batch_id,
                     "date": time.asctime()})

    self.coordinator_db.add_jobs_to_batch(batch_id, batch_jobs)
    self.coordinator_db.mark_batch_incomplete(batch_id)

    # Setting current_batch will cause all node coordinators to start
    # work on that batch
    self.coordinator_db.add_batch_to_node_coordinator_batch_queues(batch_id)
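
# For reference, a minimal sketch of what the create_batch_directory
# helper (used above and in run() below, but defined elsewhere in this
# repo) is assumed to do. The "batch_%d" naming is an assumption made
# purely to illustrate the per-batch log layout.
def create_batch_directory_sketch(log_directory, batch_id):
    # Each batch logs into its own subdirectory of the log directory so
    # that description files, configs, and per-phase logs don't collide
    # across batches.
    batch_dir = os.path.join(log_directory, "batch_%d" % (batch_id))
    if not os.path.exists(batch_dir):
        os.makedirs(batch_dir)
    return batch_dir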
def run(self):
    # Any pending batches won't be processed by this client
    self.coordinator_db.clear_batch_queue(self.hostname)

    remaining_live_retries = 10

    # Make sure the entire cluster is ping-able
    nodes = list(self.coordinator_db.live_nodes)
    self.coordinator_db.wait_for_ping_request(self.hostname)

    # Issue fping command to the entire cluster, building up the plumbum
    # invocation one host at a time (-u prints only unreachable targets)
    log.info("Pinging %s" % nodes)
    command = fping["-u"]
    for node in nodes:
        command = command[node]
    unreachable_nodes = command()
    # Command output is unicode; normalize to a plain string
    unreachable_nodes = unreachable_nodes.encode("ascii")
    log.info("Unreachable nodes: %s" % unreachable_nodes)

    # Report results to the cluster coordinator
    self.coordinator_db.send_ping_reply(self.hostname, unreachable_nodes)

    while True:
        # Re-grab my node ID, the list of nodes, and the number of
        # intermediate disks on each node
        nodes = list(self.coordinator_db.live_nodes)
        nodes.sort()

        try:
            node_id = nodes.index(self.hostname)
            remaining_live_retries = 10
        except ValueError:
            error_message = (
                "Can't find my hostname (%s) in the list of valid "
                "nodes" % (self.hostname))
            log.error(error_message)

            # Sleep for a little while and try again
            remaining_live_retries -= 1
            if remaining_live_retries == 0:
                raise RuntimeError(error_message)
            else:
                time.sleep(1)
                continue

        intermediate_disk_counts = []
        for node in nodes:
            intermediate_disk_counts.append(
                len(self.coordinator_db.local_disks(node)))

        # Make sure we have the same number of intermediate disks on
        # each node.
        if len(set(intermediate_disk_counts)) != 1:
            error_message = (
                "All nodes should have the same number of intermediate "
                "disks, but counts are %s" % (intermediate_disk_counts))
            log.error(error_message)
            raise RuntimeError(error_message)

        num_intermediate_disks = intermediate_disk_counts[0]

        node_ips = [self.coordinator_db.ipv4_address(node)
                    for node in nodes]
        log.info("Node IPs: %s" % (node_ips))
        log.info("My node ID: %d" % (node_id))
        self.ip_address = node_ips[node_id]

        # Get IPs for all interfaces
        node_interface_ips = [self.coordinator_db.interfaces(node)
                              for node in nodes]

        intermediate_disks = self.coordinator_db.local_disks(self.hostname)

        # If we're writing output to local disks, we need to know what
        # those local disks are
        output_disks = self.coordinator_db.io_disks(self.hostname)

        # Get the next batch number from the coordinator
        log.info("Waiting for the next batch ...")
        self.current_batch = (
            self.coordinator_db.blocking_wait_for_next_batch(
                self.hostname))
        log.info("Running batch %d" % (self.current_batch))

        # Make a temporary directory to hold logical disk counts and
        # partition information; put a nonce in the directory name to
        # avoid collisions. Store it on this node's first intermediate
        # disk to avoid running into /tmp size limits.
        tmp_files_dir = os.path.join(
            intermediate_disks[0],
            "%(username)s_tempfiles_batch_%(batch_number)d_%(nonce)x" % {
                "username": self.username,
                "batch_number": self.current_batch,
                "nonce": self.batch_nonce})

        assert not os.path.exists(tmp_files_dir)
        os.makedirs(tmp_files_dir)

        # Construct log directory based on current batch
        base_log_dir = create_batch_directory(
            self.log_directory, self.current_batch)

        batch_jobs = self.coordinator_db.batch_jobs(self.current_batch)

        # Determine which phases we're running based on the app config
        # and the first job's job spec
        job_params = self.coordinator_db.job_params(batch_jobs[0])

        skip_params = [
            "SKIP_PHASE_ZERO", "SKIP_PHASE_ONE", "SKIP_PHASE_TWO",
            "SKIP_PHASE_THREE"]
        skipped_phases = {}
        for param in skip_params:
            # By default don't skip the phase
            skipped_phases[param] = False
            # First load app config
            if param in self.config:
                skipped_phases[param] = self.config[param]
            # Then load job spec
            if param in job_params:
                skipped_phases[param] = job_params[param]

        # Special case for daytona minutesort
        daytona_minutesort = False
        if "DAYTONA_MINUTESORT" in job_params and \
                job_params["DAYTONA_MINUTESORT"]:
            daytona_minutesort = True
            skipped_phases["SKIP_PHASE_ZERO"] = False
            skipped_phases["SKIP_PHASE_ONE"] = True
            skipped_phases["SKIP_PHASE_TWO"] = True
            skipped_phases["SKIP_PHASE_THREE"] = True

        # Need to make a disk-backed boundary list file for each job in
        # the batch, and retrieve any boundary list files for jobs that
        # those jobs are recovering
        global_boundary_list_files = self.lookup_global_boundary_lists(
            batch_jobs, base_log_dir)

        if isinstance(global_boundary_list_files, int):
            # There was some sort of error while grabbing the boundary
            # file for the returned job; abort this batch
            self.fail_current_batch(
                "Couldn't fetch global boundary list files for job %d" %
                (global_boundary_list_files))
            self.coordinator_db.node_completed_batch(
                self.hostname, self.current_batch)
            continue

        # If any part of the batch fails, we should skip all subsequent
        # parts, but still clean up appropriately
        continue_batch = True

        logical_disk_counts_files = {}
        boundary_list_files = {}

        command_params = {
            "OUTPUT_DISK_LIST": ','.join(output_disks),
            "MYPEERID": node_id,
            "MY_IP_ADDRESS": self.ip_address,
            "PEER_LIST": ','.join(node_interface_ips),
            "NUM_INTERFACES": self.num_interfaces,
            "CONFIG": self.config_file,
            "DEFAULT_CONFIG": self.default_config,
            "SKIP_PHASE_ONE": 1,
            "SKIP_PHASE_TWO": 1,
            "SKIP_PHASE_THREE": 1,
            "COORDINATOR.HOSTNAME": self.redis_host,
            "COORDINATOR.PORT": self.redis_port,
            "COORDINATOR.DB": self.redis_db,
            "BATCH_ID": str(self.current_batch),
            "NUM_INPUT_DISKS": len(
                self.coordinator_db.io_disks(self.hostname))
            }

        if not skipped_phases["SKIP_PHASE_ZERO"]:
            # Execute phase zero for each job in the batch
            for job_id in batch_jobs:
                if not continue_batch:
                    break

                phase_zero_log_dir = os.path.join(
                    base_log_dir, "phase_zero_job_%d" % (job_id))

                logical_disk_counts_file = os.path.join(
                    tmp_files_dir, "logical_disk_counts.%d" % (job_id))
                logical_disk_counts_files[job_id] = \
                    logical_disk_counts_file

                boundary_list_file = os.path.join(
                    tmp_files_dir, "boundary_list.%d" % (job_id))
                boundary_list_files[job_id] = boundary_list_file

                command_params["LOG_DIR"] = phase_zero_log_dir
                command_params["LOGICAL_DISK_COUNTS_FILE"] = \
                    logical_disk_counts_file
                command_params["BOUNDARY_LIST_FILE"] = boundary_list_file
                command_params["JOB_IDS"] = str(job_id)

                # Use a distinct loop variable so we don't clobber the
                # job_id of the job we're currently running.
                for recovered_job_id in global_boundary_list_files:
                    param_name = (
                        "DISK_BACKED_BOUNDARY_LIST.%d" % (recovered_job_id))
                    command_params[param_name] = (
                        global_boundary_list_files[recovered_job_id])

                if daytona_minutesort:
                    for minutesort_job_id, filename in \
                            boundary_list_files.items():
                        command_params[
                            "BOUNDARY_LIST_FILE.%d" %
                            (minutesort_job_id)] = filename

                # Pull in any parameters that may have been set for this
                # job, overriding the parameters set above
                for key, value in (
                        self.coordinator_db.job_params(job_id).items()):
                    command_params[key] = value

                continue_batch = self._run_themis(
                    self.themis_binary, command_params, phase_zero_log_dir)

                # Copy one of the logical disk counts files to a
                # well-known location
                if continue_batch and node_id == 0:
                    if os.path.exists(logical_disk_counts_file):
                        shutil.copy(
                            logical_disk_counts_file,
                            os.path.join(
                                phase_zero_log_dir,
                                os.path.basename(logical_disk_counts_file)))
                    else:
                        log.error(
                            "Can't find logical disk counts file '%s'" %
                            (logical_disk_counts_file))

        # Notify redis that we're done with phase zero
        self.coordinator_db.phase_completed(
            self.current_batch, self.ip_address, "phase_zero")

        if not skipped_phases["SKIP_PHASE_ONE"]:
            # Execute phase one with all jobs at once
            if continue_batch:
                phase_one_log_dir = os.path.join(base_log_dir, "phase_one")

                if "BOUNDARY_LIST_FILE" in command_params:
                    del command_params["BOUNDARY_LIST_FILE"]
                if "LOGICAL_DISK_COUNTS_FILE" in command_params:
                    del command_params["LOGICAL_DISK_COUNTS_FILE"]
                if "SKIP_PHASE_ONE" in command_params:
                    del command_params["SKIP_PHASE_ONE"]

                command_params["SKIP_PHASE_ZERO"] = 1
                command_params["SKIP_PHASE_TWO"] = 1
                command_params["SKIP_PHASE_THREE"] = 1
                command_params["JOB_IDS"] = ','.join(map(str, batch_jobs))
                command_params["LOG_DIR"] = phase_one_log_dir

                for job_id, filename in logical_disk_counts_files.items():
                    command_params[
                        "LOGICAL_DISK_COUNTS_FILE.%d" % (job_id)] = filename
                for job_id, filename in boundary_list_files.items():
                    command_params[
                        "BOUNDARY_LIST_FILE.%d" % (job_id)] = filename

                for job_id in batch_jobs:
                    # Pull in any parameters that may have been set for
                    # this job, overriding the parameters set above.
                    # TODO(MC): This doesn't work for multiple jobs.
                    for key, value in (
                            self.coordinator_db.job_params(job_id).items()):
                        command_params[key] = value

                continue_batch = self._run_themis(
                    self.themis_binary, command_params, phase_one_log_dir)

        # Notify redis that we're done with phase one
        self.coordinator_db.phase_completed(
            self.current_batch, self.ip_address, "phase_one")

        if not skipped_phases["SKIP_PHASE_TWO"]:
            # Execute phase two with all jobs at once
            if continue_batch:
                phase_two_log_dir = os.path.join(base_log_dir, "phase_two")

                if "SKIP_PHASE_TWO" in command_params:
                    del command_params["SKIP_PHASE_TWO"]
                command_params["SKIP_PHASE_ZERO"] = 1
                command_params["SKIP_PHASE_ONE"] = 1
                command_params["SKIP_PHASE_THREE"] = 1
                command_params["LOG_DIR"] = phase_two_log_dir

                # Execute phase two
                continue_batch = self._run_themis(
                    self.themis_binary + "_phase_two", command_params,
                    phase_two_log_dir)

        # Notify redis that we're done with phase two
        self.coordinator_db.phase_completed(
            self.current_batch, self.ip_address, "phase_two")

        if not skipped_phases["SKIP_PHASE_THREE"]:
            # Execute phase three for each job in the batch
            for job_id in batch_jobs:
                if not continue_batch:
                    break

                phase_three_log_dir = os.path.join(
                    base_log_dir, "phase_three_job_%d" % (job_id))

                if "SKIP_PHASE_THREE" in command_params:
                    del command_params["SKIP_PHASE_THREE"]
                command_params["SKIP_PHASE_ZERO"] = 1
                command_params["SKIP_PHASE_ONE"] = 1
                command_params["SKIP_PHASE_TWO"] = 1
                command_params["LOG_DIR"] = phase_three_log_dir

                # Execute phase three
                continue_batch = self._run_themis(
                    self.themis_binary, command_params, phase_three_log_dir)

        # Notify redis that we're done with phase three
        self.coordinator_db.phase_completed(
            self.current_batch, self.ip_address, "phase_three")

        if continue_batch:
            log.info("Batch %d succeeded" % (self.current_batch))
        else:
            log.info("Batch %d failed" % (self.current_batch))

        # Done processing this batch
        self.coordinator_db.node_completed_batch(
            self.hostname, self.current_batch)
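
# For reference, a minimal sketch of the kind of wrapper _run_themis is
# assumed to be: launch the binary with the accumulated command
# parameters, capture its output under log_dir, and report success as a
# boolean so callers can update continue_batch. The "-KEY value" flag
# format and the stdout.log filename are assumptions for illustration
# only; the real method is defined elsewhere in this class and also
# handles failure reporting to the coordinator.
def _run_themis_sketch(binary, command_params, log_dir):
    import subprocess

    # Flatten the parameter dict into command-line arguments
    # (assumed flag syntax)
    args = [binary]
    for key, value in sorted(command_params.items()):
        args.extend(["-%s" % (key), str(value)])

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Run the binary, redirecting its output to a per-phase log file
    with open(os.path.join(log_dir, "stdout.log"), "w") as log_file:
        return_code = subprocess.call(
            args, stdout=log_file, stderr=subprocess.STDOUT)

    # Treat a zero exit code as success
    return return_code == 0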