def __init__(self, redis_host, redis_port, redis_db, config, themis_binary,
             log_directory, keepalive_refresh, keepalive_timeout, profiler,
             profiler_options, ld_preload, interfaces):
    self.redis_host = redis_host
    self.redis_port = redis_port
    self.redis_db = redis_db
    self.config_file = config

    with open(config, 'r') as fp:
        self.config = yaml.load(fp)

    self.themis_binary = themis_binary
    self.log_directory = log_directory
    self.keepalive_refresh = keepalive_refresh
    self.keepalive_timeout = keepalive_timeout
    self.profiler = profiler
    self.profiler_options = profiler_options
    self.ld_preload = ld_preload
    self.batch_nonce = random.randint(0, 1000000000)
    self.batch_phase_info = {}
    self.interfaces = interfaces

    self.node_coordinator_log_dir = os.path.join(
        log_directory, "node_coordinators")

    self.coordinator_db = redis_utils.CoordinatorDB(
        redis_host, redis_port, redis_db)

    self.ssh_command = utils.ssh_command()
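# A minimal construction sketch (illustrative only): the enclosing class is
# not shown in this excerpt, so the name ClusterCoordinator and every
# argument value below are assumptions.
def _example_construct_coordinator():
    return ClusterCoordinator(             # hypothetical class name
        "localhost", 6379, 0,              # redis_host, redis_port, redis_db
        "cluster_config.yaml",             # YAML config file (assumed path)
        "/opt/themis/bin/themis",          # themis_binary (assumed path)
        "/var/log/themis",                 # log_directory (assumed path)
        5, 30,                             # keepalive refresh/timeout (secs)
        None, "",                          # profiler, profiler_options
        None, ["eth0"])                    # ld_preload, interfaces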
def parallel_ssh(hosts, script_path, script_options, verbose,
                 print_stdout=False):
    # Script options are either a single string, or a list of strings,
    # one per host.
    if not isinstance(script_options, str):
        # Must be a list of the same length as hosts
        if not isinstance(script_options, list):
            sys.exit(
                "Script options must either be a string or a list of options "
                "one per host. Got %s" % script_options)
        if len(script_options) != len(hosts):
            sys.exit(
                "Script options list must be same length as hosts (%d) but "
                "got %s" % (len(hosts), script_options))

    # Launch all ssh commands in parallel.
    print "Running %s in parallel on %d hosts..." % (script_path, len(hosts))
    pending_commands = []
    for host_ID, host in enumerate(hosts):
        command_template = ('%(ssh)s %(host)s "%(script_path)s '
                            '%(script_options)s"')

        options_string = script_options
        if isinstance(options_string, list):
            # Pick the option for this host
            options_string = options_string[host_ID]

        command = command_template % {
            "ssh": utils.ssh_command(),
            "host": host,
            "script_path": script_path,
            "script_options": options_string,
            }

        command_object = subprocess.Popen(
            command, shell=True, stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
        pending_commands.append((command_object, command, host))

    # Wait for each one to complete.
    failed_hosts = []
    for (command_object, command_string, host) in pending_commands:
        (stdout, stderr) = command_object.communicate()

        if verbose:
            print "%s: %s" % (host, command_string)

        if command_object.returncode != 0:
            print "  FAILURE - returned %d:" % command_object.returncode
            print "  stdout:"
            if stdout is not None:
                for line in stdout.split('\n'):
                    print "    %s" % line
            # stderr is redirected to stdout above, so this is normally None.
            print "  stderr:"
            if stderr is not None:
                for line in stderr.split('\n'):
                    print "    %s" % line
            failed_hosts.append(host)
        else:
            if verbose:
                print "  SUCCESS!"
            if print_stdout:
                print "  stdout:"
                if stdout is not None:
                    for line in stdout.split('\n'):
                        print "    %s" % line

    if len(failed_hosts) > 0:
        print ""
        print "Parallel %s failed on hosts:" % script_path
        print "  %s" % failed_hosts
        return False

    return True
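# Usage sketch for parallel_ssh (illustrative only): the host names and the
# script path below are assumptions. The function itself assumes passwordless
# ssh via utils.ssh_command() and that script_path already exists on every
# host.
def _example_parallel_ssh():
    hosts = ["node1", "node2", "node3"]
    # A single option string is reused on every host.
    parallel_ssh(hosts, "/opt/themis/scripts/setup.sh", "--check", True)
    # A list supplies one option string per host; it must match len(hosts).
    parallel_ssh(hosts, "/opt/themis/scripts/setup.sh",
                 ["--id 0", "--id 1", "--id 2"], True, print_stdout=True)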
def run_benchmark(
        binary, config, batch_directory, phase_directory, profiler,
        profiler_options, peer_list, node_list, per_peer_config,
        dump_core_directory, solo_mode, vnstat_interface, params):
    # Get ssh username.
    username = read_conf_file("cluster.conf", "cluster", "username")

    # Add command line parameters to binary
    binary = "%s -LOG_DIR %s" % (binary, phase_directory)

    if dump_core_directory is not None:
        binary = "cd %s; ulimit -c unlimited; %s" % (
            dump_core_directory, binary)

    processes = []
    start_time = time.time()
    for index, ip in enumerate(node_list):
        # Now start themis binaries
        if solo_mode:
            # Act as if you are the only peer in the cluster.
            peer_binary = "%s -PEER_LIST %s" % (binary, ip)
        else:
            # Use the entire set of peers and designate yourself as
            # one of them.
            peer_binary = "%s -PEER_LIST %s -MYPEERID %d" % (
                binary, peer_list, index)

        if per_peer_config:
            # Append the IP address to the config file name
            peer_binary = "%s -CONFIG %s.%s" % (peer_binary, config, ip)
        else:
            peer_binary = "%s -CONFIG %s" % (peer_binary, config)

        # Override config file with specified parameters
        if params:
            peer_binary = "%s %s" % (peer_binary, params)

        if profiler == "operf":
            # Use the batch directory as the operf session dir
            session_dir = os.path.join(
                batch_directory, "oprofile_data.%s" % ip)
            # Note: this parallel_ssh takes a different argument list than
            # the variant defined above; it is assumed to be imported from
            # the cluster utilities.
            parallel_ssh(
                None, "mkdir -p %s" % session_dir, username, node_list,
                False, True, False)
            peer_binary = "%s %s --session-dir=%s %s" % (
                profiler, profiler_options, session_dir, peer_binary)
        elif profiler is not None:
            # Some other profiler, just prepend it to the binary
            peer_binary = "%s %s %s" % (
                profiler, profiler_options, peer_binary)

        # Run the node-local benchmark script.
        vnstat_param_string = ""
        if vnstat_interface is not None:
            vnstat_param_string = "--vnstat_interface %s" % vnstat_interface

        command = '%s %s "%s/run_benchmark_local.py %s %s \'%s\'"' % (
            ssh_command(), ip, THEMIS_SCRIPTS_DIR, vnstat_param_string,
            phase_directory, peer_binary)

        processes.append((subprocess.Popen(command, shell=True), ip))

    print "%d tasks launched on %s\n" % (len(processes), time.asctime())

    elapsed_times = []
    completed_ips = []
    num_nodes = len(processes)
    # Poll the outstanding ssh processes until every node has finished.
    while len(processes) > 0:
        for process, ip in processes:
            process.poll()
            if process.returncode is not None:
                elapsed_time = time.time() - start_time
                process.communicate()
                processes.remove((process, ip))
                elapsed_times.append(elapsed_time)
                completed_ips.append(ip)
                print "Node %s completed in %.2f seconds (%d / %d)" % (
                    ip, elapsed_time, len(elapsed_times), num_nodes)
                break

    stop_time = time.time()
    return (stop_time - start_time, elapsed_times, completed_ips)
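# Invocation sketch for run_benchmark (illustrative only): every path, host
# address, and parameter value below is an assumption; peer_list is taken to
# be the comma-separated form of node_list.
def _example_run_benchmark():
    nodes = ["10.0.0.1", "10.0.0.2"]
    total_time, per_node_times, finished = run_benchmark(
        binary="/opt/themis/bin/benchmark",
        config="/opt/themis/conf/benchmark.yaml",
        batch_directory="/mnt/logs/batch_0",
        phase_directory="/mnt/logs/batch_0/phase_0",
        profiler=None, profiler_options="",
        peer_list=",".join(nodes), node_list=nodes,
        per_peer_config=False, dump_core_directory=None,
        solo_mode=False, vnstat_interface="eth0", params="")
    print "total: %.2f s, slowest node: %.2f s" % (
        total_time, max(per_node_times))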
def gather_local_file_paths(
        coordinator_db, input_dir, max_input_files_per_disk):
    ssh_command = utils.ssh_command()
    hosts = coordinator_db.live_nodes

    ssh_command_template = string.Template(
        "%(ssh_command)s ${host} '%(script_path)s/list_local_files.py "
        "${disks}'" % {
            "ssh_command": ssh_command,
            "script_path": os.path.abspath(os.path.dirname(__file__))
            })

    pending_commands = []

    for host in hosts:
        disks = list(coordinator_db.io_disks(host))
        disks.sort()
        local_dirs = map(lambda x: "%s/%s" % (x, input_dir), disks)
        cmd = ssh_command_template.substitute(
            host=host, disks=' '.join(local_dirs))

        # Keep the command string alongside the Popen object so failures can
        # be reported (Popen itself does not expose the command).
        pending_commands.append(
            (host, cmd,
             subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)))

    worker_inputs = {}
    total_input_size = 0

    for hostname, cmd, command in pending_commands:
        worker_inputs[hostname] = {}

        stdout, stderr = command.communicate()

        if command.returncode != 0:
            log.error("Command '%s' failed with error %d" % (
                cmd, command.returncode))
            return None

        file_paths = json.loads(stdout)

        if file_paths is None:
            log.error(("Input directory '%s' doesn't exist on all of host "
                       "%s's input disks") % (input_dir, hostname))
            return None

        for i, file_list in enumerate(file_paths):
            worker_inputs[hostname][i] = []

            num_files = 0
            for filename, file_length in file_list:
                # Allow the user to cap the number of input files for
                # testing.
                if (max_input_files_per_disk is None or
                        num_files < max_input_files_per_disk):
                    file_url = "local://%s%s" % (hostname, filename)
                    worker_inputs[hostname][i].append(
                        (file_url, file_length))
                    total_input_size += file_length
                    num_files += 1

    return (worker_inputs, total_input_size)
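# Usage sketch for gather_local_file_paths (illustrative only): assumes a
# live CoordinatorDB whose io_disks() paths contain an "inputs" subdirectory
# on every node; the directory name and the cap of 4 files per disk are
# assumptions.
def _example_gather_inputs(coordinator_db):
    result = gather_local_file_paths(coordinator_db, "inputs", 4)
    if result is None:
        print "Failed to list input files"
        return
    worker_inputs, total_input_size = result
    print "%d bytes of input across %d hosts" % (
        total_input_size, len(worker_inputs))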