def upload_logs():
    cluster_ID, log_directory, provider = read_conf_file(
        "cluster.conf", "cluster", ["id", "log_directory", "provider"])

    if provider == "amazon":
        S3_bucket = read_conf_file("amazon.conf", "amazon", "bucket")
        parallel_ssh["-m"]["aws --profile themis s3 sync --exact-timestamps "
                           "%s s3://%s/cluster_%s/themis_logs" % (
                               log_directory, S3_bucket, cluster_ID)]()
    elif provider == "google":
        bucket = read_conf_file("google.conf", "google", "bucket")
        # gsutil appears to be buggy and can fail randomly, so keep trying
        # until it succeeds. Run the command 3 times even if it succeeds to
        # make sure all files get uploaded.
        for i in xrange(3):
            done = False
            while not done:
                try:
                    parallel_ssh["-m"]["gsutil -m rsync -r %s gs://%s/cluster_%s/"
                                       "themis_logs" % (
                                           log_directory, bucket, cluster_ID)]()
                    done = True
                except ProcessExecutionError as e:
                    pass
    else:
        print >>sys.stderr, "Unknown provider %s" % provider
        return 1
    return 0

def download_logs():
    cluster_ID, log_directory, provider = read_conf_file(
        "cluster.conf", "cluster", ["id", "log_directory", "provider"])
    log_directory = os.path.expanduser(log_directory)

    if provider == "amazon":
        S3_bucket = read_conf_file("amazon.conf", "amazon", "bucket")
        aws = plumbum.local["aws"]
        aws["--profile"]["themis"]["s3"]["sync"]["--exact-timestamps"]\
            ["s3://%s/cluster_%s/themis_logs" % (S3_bucket, cluster_ID)]\
            [log_directory]()
    elif provider == "google":
        bucket = read_conf_file("google.conf", "google", "bucket")
        # gsutil appears to be buggy and can fail randomly, so keep trying
        # until it succeeds. In fact, even a successful run might not download
        # all files, so run the command 3 times.
        for i in xrange(3):
            done = False
            gsutil = plumbum.local["gsutil"]
            while not done:
                try:
                    gsutil["-m"]["rsync"]["-r"]\
                        ["gs://%s/cluster_%s/themis_logs" % (bucket, cluster_ID)]\
                        [log_directory]()
                    done = True
                except ProcessExecutionError as e:
                    pass
    else:
        print >>sys.stderr, "Unknown provider %s" % provider
        return 1
    return 0

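# Note on the idiom in the two functions above: with plumbum, indexing a
# command object (e.g. aws["--profile"]["themis"]) appends arguments, and the
# trailing () actually executes the command, raising ProcessExecutionError on
# a nonzero exit code. That is why the retry loops catch that exception.
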
def build_themis_rc(dump_core, port, database):
    # Coerce to ints in case these arrived as strings (e.g. from the command
    # line); both are formatted with %d below.
    port = int(port)
    database = int(database)
    # Read the cluster config to get master address.
    master, keyfile = read_conf_file(
        "cluster.conf", "cluster", ["master_internal_address", "private_key"])
    keyfile = os.path.join(os.path.expanduser("~"), ".ssh", keyfile)

    if os.path.exists(THEMIS_RC):
        os.unlink(THEMIS_RC)

    # .themisrc is written in yaml, so build it manually.
    with open(THEMIS_RC, "w") as themisrc_file:
        themisrc_file.write("ssh:\n")
        themisrc_file.write(" key: \"%s\"\n\n" % keyfile)
        if dump_core:
            themisrc_file.write("dump_core: true\n\n")
        else:
            themisrc_file.write("dump_core: false\n\n")
        themisrc_file.write("redis:\n")
        themisrc_file.write(" host: \"%s\"\n" % master)
        themisrc_file.write(" port: %d\n" % port)
        themisrc_file.write(" db: %d\n\n" % database)
    return 0

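# For reference, a build_themis_rc(False, 6379, 0) call writes a .themisrc
# like the following (the key path and redis host are illustrative):
#
#   ssh:
#    key: "/home/themis/.ssh/id_rsa"
#
#   dump_core: false
#
#   redis:
#    host: "10.0.0.1"
#    port: 6379
#    db: 0
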
def sync_config_files():
    cluster_ID, config_directory, provider = read_conf_file(
        "cluster.conf", "cluster", ["id", "config_directory", "provider"])

    if provider == "amazon":
        S3_bucket = read_conf_file("amazon.conf", "amazon", "bucket")
        aws = local["aws"]
        # First upload local config files.
        aws["--profile"]["themis"]["s3"]["sync"]["--exact-timestamps"]\
            [os.path.expanduser(config_directory)]\
            ["s3://%s/cluster_%s/themis_config" % (S3_bucket, cluster_ID)]()
        # Then download config files to all nodes.
        parallel_ssh["-m"]["aws --profile themis s3 sync --exact-timestamps "
                           "s3://%s/cluster_%s/themis_config %s" % (
                               S3_bucket, cluster_ID, config_directory)]()
    elif provider == "google":
        bucket = read_conf_file("google.conf", "google", "bucket")
        gsutil = local["gsutil"]
        # First upload local config files.
        gsutil["-m"]["cp"]["-r"]\
            [os.path.expanduser(config_directory)]\
            ["gs://%s/cluster_%s" % (bucket, cluster_ID)]()
        # Then download config files to all nodes.
        # gsutil appears to be buggy and can fail randomly, so keep trying
        # until it succeeds. Run the command 5 times even if it succeeds to
        # make sure all files get synced.
        for i in xrange(5):
            done = False
            while not done:
                try:
                    parallel_ssh["-m"]["gsutil -m rsync -r -c gs://%s/cluster_%s/"
                                       "themis_config %s" % (
                                           bucket, cluster_ID, config_directory)]()
                    done = True
                except ProcessExecutionError as e:
                    pass
    else:
        print >>sys.stderr, "Unknown provider %s" % provider
        return 1
    return 0

def main():
    log_directory = read_conf_file("cluster.conf", "cluster", "log_directory")
    log_directory = os.path.expanduser(log_directory)
    log_directory = os.path.join(log_directory, "networkbench")

    parser = argparse.ArgumentParser(
        description="Harness for network benchmark application")
    parser.add_argument(
        "--config", "-c", help="config file to use for the benchmark "
        "(default: %(default)s)",
        default=os.path.join(BENCHMARK_DIR, "config.yaml"), type=str)
    parser.add_argument(
        "--log_directory", "-l",
        help="directory containing logs for an experiment "
        "(default: %(default)s)", default=log_directory)
    parser.add_argument(
        "--profiler", help="path to the binary of a profiling tool to use, "
        "for example valgrind or operf")
    parser.add_argument(
        "--profiler_options", help="options surrounded by quotes to pass to "
        "the profiler", type=str, default="")
    parser.add_argument(
        "--iterations", "-i", help="run the benchmark this many times "
        "(default: %(default)s)", type=int, default=1)
    parser.add_argument(
        "--sleep", "-s", help="sleep this many seconds between iterations "
        "(default: %(default)s)", type=int, default=0)
    parser.add_argument(
        "--per_peer_config", help="use separate config files for each peer, "
        "by appending the peer's IP address to the config file name: .A.B.C.D",
        action="store_true", default=False)
    parser.add_argument(
        "--dump_core_directory", "-d", help="dump core file to this directory "
        "if the benchmark crashes", default=None)
    parser.add_argument(
        "peer_ips", help="comma delimited list of host IPs to use for "
        "benchmarking")
    parser.add_argument(
        "--remote_connections_only", "-r", help="Only send to remote peers, "
        "instead of sending all-to-all, which includes localhost",
        action="store_true", default=False)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    binary = os.path.join(BENCHMARK_DIR, "networkbench")
    delete_output = False
    solo_mode = False
    stage_stats = "sender,receiver"
    params = "-REMOTE_CONNECTIONS_ONLY %d" % (args.remote_connections_only)

    run_benchmark_iterations(
        binary, args.log_directory, args.config, args.peer_ips, args.profiler,
        args.profiler_options, args.iterations, args.sleep, delete_output,
        args.per_peer_config, args.dump_core_directory, solo_mode, stage_stats,
        args.interfaces, params)

def main():
    parser = argparse.ArgumentParser(description="Mount Themis disks")
    disk_mountpoint = read_conf_file(
        "cluster.conf", "cluster", "disk_mountpoint")
    parser.add_argument(
        "--mountpoint", default=disk_mountpoint,
        help="Mount point for disks. Default %(default)s")
    parser.add_argument(
        "--format_disks", action="store_true", help="Format disks with XFS")
    parser.add_argument(
        "--partitions", action="store_true", help="If true, assume that the "
        "devices listed in node.conf are partitions and don't run fdisk.")

    args = parser.parse_args()
    return mount_disks(**vars(args))

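# Typical invocations, assuming this script is saved as mount_disks.py (the
# script name is not shown in this excerpt):
#
#   python mount_disks.py                 # mount devices at the configured mountpoint
#   python mount_disks.py --format_disks  # repartition with fdisk and format first
#   python mount_disks.py --partitions    # treat node.conf devices as partitions
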
def main():
    # Read the cluster config to get cluster ID.
    parser = ConfigParser.SafeConfigParser()
    parser.read(CLUSTER_CONFIG)
    cluster_ID = int(parser.get("cluster", "id"))
    provider = parser.get("cluster", "provider")
    zone = read_conf_file("%s.conf" % provider, provider, "zone")

    # Store master address information.
    master = get_cluster_status(provider, cluster_ID, zone)["master"]
    if master is None:
        print >>sys.stderr, "Could not find master hostname"
        return 1

    # Set master hostname in cluster.conf
    parser.set("cluster", "master_internal_address", master[0])
    parser.set("cluster", "master_external_address", master[1])
    with open(CLUSTER_CONFIG, "w") as config_file:
        parser.write(config_file)
    return 0

def main():
    log_directory = read_conf_file("cluster.conf", "cluster", "log_directory")
    log_directory = os.path.expanduser(log_directory)
    log_directory = os.path.join(log_directory, "storagebench")

    parser = argparse.ArgumentParser(
        description="Harness for storage benchmark application")
    parser.add_argument(
        "--config", "-c", help="config file to use for the benchmark "
        "(default: %(default)s)",
        default=os.path.join(BENCHMARK_DIR, "config.yaml"), type=str)
    parser.add_argument(
        "--log_directory", "-l",
        help="directory containing logs for an experiment "
        "(default: %(default)s)", default=log_directory)
    parser.add_argument(
        "--profiler", help="path to the binary of a profiling tool to use, "
        "for example valgrind or operf")
    parser.add_argument(
        "--profiler_options", help="options surrounded by quotes to pass to "
        "the profiler", type=str, default="")
    parser.add_argument(
        "--iterations", "-i", help="run the benchmark this many times "
        "(default: %(default)s)", type=int, default=1)
    parser.add_argument(
        "--sleep", "-s", help="sleep this many seconds between iterations "
        "(default: %(default)s)", type=int, default=0)
    parser.add_argument(
        "--delete_output", help="delete output files after run completes",
        action="store_true", default=False)
    parser.add_argument(
        "--per_peer_config", help="use separate config files for each peer, "
        "by appending the peer's IP address to the config file name: .A.B.C.D",
        action="store_true", default=False)
    parser.add_argument(
        "--dump_core_directory", "-d", help="dump core file to this directory "
        "if the benchmark crashes", default=None)
    parser.add_argument(
        "--read_only", "-r", help="Only read files, don't write",
        action="store_true", default=False)
    parser.add_argument(
        "--write_only", "-w", help="Only write (generate) files, don't read",
        action="store_true", default=False)
    parser.add_argument(
        "peer_ips", help="comma delimited list of host IPs to use for "
        "benchmarking")

    args = parser.parse_args()

    binary = os.path.join(BENCHMARK_DIR, "storagebench")

    # Run the storage benchmark individually on each machine as if it were
    # its own cluster of size 1.
    solo_mode = True

    if args.read_only and args.write_only:
        sys.exit("Cannot specify both read-only and write-only")

    if args.write_only:
        read = 0
    else:
        read = 1

    if args.read_only:
        write = 0
    else:
        write = 1

    if read == 1 and write == 1:
        stage_stats = "reader,writer"
    elif read == 1 and write == 0:
        stage_stats = "reader"
    elif read == 0 and write == 1:
        stage_stats = "writer"
    else:
        sys.exit("Cannot specify both read-only and write-only")

    # Pass read/write params to Themis.
    params = "-READ %d -WRITE %d" % (read, write)
    print params

    run_benchmark_iterations(
        binary, args.log_directory, args.config, args.peer_ips, args.profiler,
        args.profiler_options, args.iterations, args.sleep, args.delete_output,
        args.per_peer_config, args.dump_core_directory, solo_mode, stage_stats,
        None, params)

def mount_disks(format_disks, mountpoint, partitions):
    # Get comma delimited list of devices.
    devices = read_conf_file("node.conf", "node", "devices")
    devices = devices.split(",")
    devices = [d for d in devices if len(d) > 0]
    username = read_conf_file("cluster.conf", "cluster", "username")

    # Set up the top-level mount point.
    sudo[mkdir["-p"][mountpoint]]()
    sudo[chown]["%s:%s" % (username, username)][mountpoint]()

    mkfs_commands = []
    for device in devices:
        # Unmount ALL partitions connected to this device.
        num_mounted = (mount | grep["-c"][device])(retcode=(0, 1))
        num_mounted = int(num_mounted.strip())
        while num_mounted > 0:
            # Unmount device
            mounted_device = \
                (mount | grep[device] | head["-n1"] | awk["{print $1}"])()
            mounted_device = mounted_device.strip()
            print "Unmounting %s" % mounted_device
            sudo[umount[mounted_device]]()
            num_mounted -= 1

        # Format device
        if format_disks:
            if not partitions:
                print "Creating new partition for %s" % device
                # Feed fdisk an interactive script: delete the existing
                # partition (d), create a new primary partition 1 spanning
                # the whole device (n, p, 1, defaults), and write the
                # partition table (w).
                (sudo[fdisk[device]] << "d\nn\np\n1\n\n\nw")()
                # It appears that the fdisk command returns before the
                # partition is usable...
                time.sleep(2)

            print "Creating xfs file system"
            if not partitions:
                # Use partition 1 on the device.
                partition = "%s1" % device
            else:
                # The device itself is a partition.
                partition = device
            mkfs_commands.append(sudo[mkfsxfs]["-f"][partition] & BG)

    for command in mkfs_commands:
        command.wait()
        if command.returncode != 0:
            print >>sys.stderr, command.stderr
            sys.exit(command.returncode)

    # Now mount all devices.
    disk_index = 0
    for device in devices:
        # Set up per-disk mount point.
        disk_mountpoint = os.path.join(mountpoint, "disk_%d" % disk_index)
        print "Mounting %s at %s" % (device, disk_mountpoint)
        mkdir["-p"][disk_mountpoint]()
        sudo[chown]["%s:%s" % (username, username)][disk_mountpoint]()

        # Mount disk
        if not partitions:
            # Use partition 1 on the device.
            partition = "%s1" % device
        else:
            # The device itself is a partition.
            partition = device
        sudo[mount["-o"]["noatime,discard"][partition][disk_mountpoint]]()
        sudo[chown]["%s:%s" % (username, username)][disk_mountpoint]()
        disk_index += 1

def run_benchmark(
        binary, config, batch_directory, phase_directory, profiler,
        profiler_options, peer_list, node_list, per_peer_config,
        dump_core_directory, solo_mode, vnstat_interface, params):
    # Get ssh username.
    username = read_conf_file("cluster.conf", "cluster", "username")

    # Add command line parameters to binary.
    binary = "%s -LOG_DIR %s" % (binary, phase_directory)

    if dump_core_directory is not None:
        binary = "cd %s; ulimit -c unlimited; %s" % (
            dump_core_directory, binary)

    processes = []
    start_time = time.time()
    for index, ip in enumerate(node_list):
        # Now start themis binaries.
        if solo_mode:
            # Act as if you are the only peer in the cluster.
            peer_binary = "%s -PEER_LIST %s" % (binary, ip)
        else:
            # Use the entire set of peers and designate yourself as one of
            # them.
            peer_binary = "%s -PEER_LIST %s -MYPEERID %d" % (
                binary, peer_list, index)

        if per_peer_config:
            # Append the IP address to the config file name.
            peer_binary = "%s -CONFIG %s.%s" % (peer_binary, config, ip)
        else:
            peer_binary = "%s -CONFIG %s" % (peer_binary, config)

        # Override config file with specified parameters.
        if params:
            peer_binary = "%s %s" % (peer_binary, params)

        if profiler == "operf":
            # Use the batch directory as the operf session dir.
            session_dir = os.path.join(
                batch_directory, "oprofile_data.%s" % ip)
            parallel_ssh(
                None, "mkdir -p %s" % session_dir, username, node_list,
                False, True, False)
            peer_binary = "%s %s --session-dir=%s %s" % (
                profiler, profiler_options, session_dir, peer_binary)
        elif profiler is not None:
            # Some other profiler; just prepend it to the binary.
            peer_binary = "%s %s %s" % (
                profiler, profiler_options, peer_binary)

        # Run the node-local benchmark script.
        vnstat_param_string = ""
        if vnstat_interface is not None:
            vnstat_param_string = "--vnstat_interface %s" % vnstat_interface
        command = '%s %s "%s/run_benchmark_local.py %s %s \'%s\'"' % (
            ssh_command(), ip, THEMIS_SCRIPTS_DIR, vnstat_param_string,
            phase_directory, peer_binary)
        processes.append((subprocess.Popen(command, shell=True), ip))

    print "%d tasks launched on %s\n" % (len(processes), time.asctime())

    elapsed_times = []
    completed_ips = []
    num_nodes = len(processes)
    while len(processes) > 0:
        for process, ip in processes:
            process.poll()
            if process.returncode is not None:
                elapsed_time = time.time() - start_time
                process.communicate()
                processes.remove((process, ip))
                elapsed_times.append(elapsed_time)
                completed_ips.append(ip)
                print "Node %s completed in %.2f seconds (%d / %d)" % (
                    ip, elapsed_time, len(elapsed_times), num_nodes)
                break

    stop_time = time.time()
    return (stop_time - start_time, elapsed_times, completed_ips)

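# For illustration, with solo_mode=False, no profiler, no vnstat interface,
# and no extra params, the remote command built above for the peer at index 0
# looks roughly like the following (hosts and paths are made up):
#
#   <ssh_command()> 10.0.0.1 "<THEMIS_SCRIPTS_DIR>/run_benchmark_local.py \
#       /logs/batch_0/phase_one '/path/to/networkbench \
#       -LOG_DIR /logs/batch_0/phase_one \
#       -PEER_LIST 10.0.0.1,10.0.0.2 -MYPEERID 0 -CONFIG config.yaml'"
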
def run_benchmark_iterations(
        binary, log_directory, config, peer_ips, profiler, profiler_options,
        iterations, sleep, delete_output, per_peer_config,
        dump_core_directory, solo_mode, stage_stats, interfaces, params=""):
    # Get ssh username and themis directory.
    username, themis_directory = read_conf_file(
        "cluster.conf", "cluster", ["username", "themis_directory"])
    themis_directory = os.path.expanduser(themis_directory)
    # Get cloud provider if applicable.
    provider = read_conf_file("cluster.conf", "cluster", "provider")

    if interfaces is None:
        vnstat_interface = None
    else:
        interface_list = filter(lambda x: len(x) > 0, interfaces.split(","))
        vnstat_interface = interface_list[0]

    if not os.path.exists(config):
        sys.exit("Config file %s does not exist." % config)

    with open(config, "r") as fp:
        app_config = yaml.load(fp)

    # If we're using more than 1 network interface per peer, the peer list is
    # going to look like:
    #   Peer1_interface1, Peer1_interface2, Peer2_interface1, ...
    # In this case, we only want to launch the benchmark once per peer, so
    # make sure we only look at the first interface for each peer, and let
    # the application itself deal with the other interfaces.
    num_interfaces = 1
    if "NUM_INTERFACES" in app_config:
        num_interfaces = app_config["NUM_INTERFACES"]

    # Remove trailing comma if any from the IP list. This will be the string
    # we pass into the benchmark binary.
    peer_list = peer_ips.rstrip(",")
    # If we're using multiple interfaces, only launch the benchmark once per
    # node.
    node_list = peer_list.split(",")[::num_interfaces]

    # Look for description files in the same directory as the binary.
    binary_dir = os.path.dirname(binary)
    description_directory = os.path.join(binary_dir, "description")
    if not os.path.exists(description_directory):
        sys.exit("Could not find description directory %s" % (
            description_directory))

    # Check for the phase name. For simplicity we're going to require that
    # the benchmark have only 1 phase.
    description = Description(description_directory)
    phases = description.getPhaseList()
    if len(phases) != 1:
        sys.exit("Benchmark must have exactly one phase. Got %s" % phases)
    phase_name = phases[0]

    data_size_per_node = int(
        app_config["BENCHMARK_DATA_SIZE_PER_NODE"][phase_name])
    data_size = data_size_per_node * len(node_list)

    total_throughputs = {}
    if stage_stats is not None:
        stage_stats = stage_stats.split(",")
        for stage in stage_stats:
            total_throughputs[stage] = 0.0

    node_benchmark_throughputs = []
    for i in xrange(iterations):
        # Pick a unique batch ID.
        batch = 0
        while os.path.exists(os.path.join(log_directory, "batch_%d" % batch)):
            batch += 1
        batch_directory = os.path.join(log_directory, "batch_%d" % batch)

        # Create directories.
        phase_directory = os.path.join(batch_directory, phase_name)
        parallel_ssh(
            None, "mkdir -p %s" % phase_directory, username, node_list,
            False, True, False)

        # Copy description files and create phase directory.
        if not os.path.exists(batch_directory):
            os.makedirs(batch_directory)
        shutil.copy(
            os.path.join(description_directory, "stages.json"),
            batch_directory)
        shutil.copy(
            os.path.join(description_directory, "structure.json"),
            batch_directory)
        os.chmod(os.path.join(batch_directory, "stages.json"), 0777)
        os.chmod(os.path.join(batch_directory, "structure.json"), 0777)

        # Copy config file.
        shutil.copyfile(config, os.path.join(batch_directory, "config.yaml"))

        print "\nLogging to %s" % (batch_directory)
        print "Running %s with batch ID %d on %d nodes..." % (
            phase_name, batch, len(node_list))

        (elapsed, elapsed_times, completed_ips) = run_benchmark(
            binary, config, batch_directory, phase_directory, profiler,
            profiler_options, peer_list, node_list, per_peer_config,
            dump_core_directory, solo_mode, vnstat_interface, params)

        # Compute overall throughput.
        throughput = (data_size / elapsed) / 1000000
        per_node_throughput = (data_size_per_node / elapsed) / 1000000
        print "Completed in %.2f seconds." % elapsed
        print " Throughput: %.2f MB/s" % throughput
        print " Per-server: %.2f MB/s" % per_node_throughput

        # Record individual throughputs.
        throughputs = [(data_size_per_node / x) / 1000000
                       for x in elapsed_times]
        node_benchmark_throughputs += throughputs

        # Dump these results to a file in the batch directory.
        results_file = open(os.path.join(batch_directory, "results"), "w")
        results_file.write(
            "Runtime: %.2f seconds\nThroughput: %.2f MB/s\nPer-server: "
            "%.2f MB/s\n\n" % (elapsed, throughput, per_node_throughput))
        results_file.write("Node throughputs: %s\n\n" % throughputs)
        for ip, elapsed_time, throughput in zip(
                completed_ips, elapsed_times, throughputs):
            results_file.write(
                "Node %s completed in %.2f seconds (%.2f MB/s)\n" % (
                    ip, elapsed_time, throughput))
        results_file.write("\n")

        if stage_stats is not None:
            # Compute runtime stat throughputs.
            done = False
            while not done:
                # Upload all logs.
                upload_logs()
                # Download logs locally.
                download_logs()
                try:
                    runtime_info = gather_runtime_info(batch_directory, False)
                    done = True
                except ValueError:
                    print "Runtime info script failed. Retrying log " \
                        "upload/downloads."

            stage_info = runtime_info[0]["stages"]
            node_throughputs = {}
            for worker_info in stage_info:
                stats_info = worker_info["stats_info"]
                # We only want to look at the overall stats, which include
                # all nodes (hostname or worker ID won't be specified).
                if len(stats_info) == 1:
                    stage_name = stats_info["stage"]
                    if stage_name in stage_stats:
                        # This is one of the stages we care about.
                        node_throughputs[stage_name] = \
                            worker_info["observed_processing_rate_per_node"]
                        total_throughputs[stage_name] += \
                            node_throughputs[stage_name]
            # Print throughputs in the correct order.
            for stage_name in stage_stats:
                print " %s throughput: %.2f MB/s/node" % (
                    stage_name, node_throughputs[stage_name])
                results_file.write("%s throughput: %.2f MB/s\n" % (
                    stage_name, node_throughputs[stage_name]))

        results_file.close()

        if delete_output and "OUTPUT_DISK_LIST" in app_config and \
                phase_name in app_config["OUTPUT_DISK_LIST"]:
            output_disk_list = app_config["OUTPUT_DISK_LIST"][phase_name]
            output_disks = output_disk_list.split(",")
            for disk in output_disks:
                print "Clearing %s" % disk
                parallel_ssh(
                    None, "rm -rf %s" % disk, username, node_list, False,
                    False, False)

        if sleep > 0 and i != iterations - 1:
            print "Sleeping %d seconds" % sleep
            time.sleep(sleep)

    print "\nCompleted %d iterations\n" % iterations
    # Format node throughputs.
    node_benchmark_throughput_strings = [
        "%.2f" % x for x in node_benchmark_throughputs]
    print " Node throughputs (MB/s):"
    print " %s" % node_benchmark_throughput_strings
    print " Average node throughput: %.2f MB/s" % (
        numpy.mean(node_benchmark_throughputs))
    print " Standard deviation: %.2f MB/s" % (
        numpy.std(node_benchmark_throughputs))
    print " Min node throughput: %.2f MB/s" % (
        numpy.min(node_benchmark_throughputs))
    print " Max node throughput: %.2f MB/s\n" % (
        numpy.max(node_benchmark_throughputs))

    if stage_stats is not None:
        for stage_name in stage_stats:
            print " Average %s throughput: %.2f MB/s/node" % (
                stage_name, total_throughputs[stage_name] / iterations)

import utils

sys.path.append(
    os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir, "job_runner")))
import redis_utils

sys.path.append(
    os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir, os.pardir)))
import constants
from common import unitconversion

coordinator_db = None
username = None

disk_mountpoint = read_conf_file("cluster.conf", "cluster", "disk_mountpoint")
username = read_conf_file("cluster.conf", "cluster", "username")
themis_directory = read_conf_file(
    "cluster.conf", "cluster", "themis_directory")
# Display the master's external address on the status page.
master_address = read_conf_file(
    "cluster.conf", "cluster", "master_external_address")

MOUNT_SCRIPT = os.path.join(
    themis_directory, "src", "scripts", "themis", "cluster", "mount_disks.py")
UPDATE_SCRIPT = os.path.join(
    themis_directory, "src", "scripts", "themis", "cluster", "update_repo.py")

generate_command = None
generate_data_size = None


def datetimeformat(value, format="%m-%d-%Y %H:%M:%S"):
    return time.strftime(format, time.localtime(float(value)))

def mount_disks(format_disks, mountpoint, partitions):
    # Get comma delimited list of devices.
    devices = read_conf_file("node.conf", "node", "devices")
    devices = devices.split(",")
    devices = [d for d in devices if len(d) > 0]
    username = read_conf_file("cluster.conf", "cluster", "username")

    # Set up the top-level mount point.
    sudo[mkdir["-p"][mountpoint]]()
    sudo[chown]["%s:%s" % (username, username)][mountpoint]()

    mkfs_commands = []
    for device in devices:
        # Unmount ALL partitions connected to this device.
        num_mounted = (mount | grep["-c"][device])(retcode=(0, 1))
        num_mounted = int(num_mounted.strip())
        while num_mounted > 0:
            # Unmount device
            mounted_device = \
                (mount | grep[device] | head["-n1"] | awk["{print $1}"])()
            mounted_device = mounted_device.strip()
            print "Unmounting %s" % mounted_device
            sudo[umount[mounted_device]]()
            num_mounted -= 1

        # Format device
        if format_disks:
            if not partitions and "by-id" not in device:
                print "Creating new partition for %s" % device
                (sudo[fdisk[device]] << "d\nn\np\n1\n\n\nw")()
                # It appears that the fdisk command returns before the
                # partition is usable...
                time.sleep(2)

            print "Creating ext4 file system"
            if not partitions and "by-id" not in device:
                # Use partition 1 on the device.
                partition = "%s1" % device
            else:
                # The device itself is a partition.
                partition = device

            # Persistent devices can use fast formatting.
            if "persist" in device:
                extra_opt = "lazy_itable_init=0,lazy_journal_init=0,discard"
                mkfs_commands.append(
                    sudo[mkfsext4]["-F"]["-E"][extra_opt][partition] & BG)
            else:
                mkfs_commands.append(sudo[mkfsext4]["-F"][partition] & BG)

    for command in mkfs_commands:
        command.wait()
        if command.returncode != 0:
            print >>sys.stderr, command.stderr
            sys.exit(command.returncode)

    # Now mount all devices.
    disk_index = 0
    persist_disk_index = 0
    for device in devices:
        # Set up per-disk mount point.
        if "persist" in device:
            disk_basename = "disk_persist_%d" % persist_disk_index
        else:
            disk_basename = "disk_%d" % disk_index
        disk_mountpoint = os.path.join(mountpoint, disk_basename)
        print "Mounting %s at %s" % (device, disk_mountpoint)
        mkdir["-p"][disk_mountpoint]()
        sudo[chown]["%s:%s" % (username, username)][disk_mountpoint]()

        # Mount disk
        if not partitions and "by-id" not in device:
            # Use partition 1 on the device.
            partition = "%s1" % device
        else:
            # The device itself is a partition.
            partition = device
        sudo[mount["-o"]["discard,defaults,dioread_nolock,noatime"]
            [partition][disk_mountpoint]]()
        sudo[chown]["%s:%s" % (username, username)][disk_mountpoint]()

        if "persist" in device:
            persist_disk_index += 1
        else:
            disk_index += 1

def parallel_ssh(
        redis_client, command, username, hosts, ignore_bad_hosts, master,
        verbose=False):
    pending_commands = []
    stdout_dict = {}
    stderr_dict = {}
    start_time = time.time()
    try:
        if type(command) == list:
            command = ' '.join(command)

        if command == "":
            print >>sys.stderr, "Cannot run empty command."
            return (1, stdout_dict, stderr_dict)

        if verbose:
            print "Running %s in parallel." % command

        if hosts is None:
            # User did not specify host list override, so ask redis.
            hosts = redis_client.smembers("nodes")
            if hosts is None:
                print >>sys.stderr, "Error extracting host list from " \
                    "redis database"
                return (1, stdout_dict, stderr_dict)
        else:
            hosts = set(hosts)

        if master:
            # Also run on the master node.
            master_address = read_conf_file(
                "cluster.conf", "cluster", "master_internal_address")
            if verbose:
                print "Including master %s" % master_address
            hosts.add(master_address)

        temp_dir = "/tmp/run-script-%s-%s-%08x" % (
            username, time.strftime("%Y-%m-%d-%H%M.%S"),
            random.randint(0, (16 ** 8) - 1))
        if os.path.exists(temp_dir):
            print >>sys.stderr, (
                "Temporary directory %s already (and extremely improbably) "
                "exists; aborting" % (temp_dir))
            return (1, stdout_dict, stderr_dict)

        os.makedirs(temp_dir)

        hosts_file = os.path.join(temp_dir, "hosts")
        with open(hosts_file, 'w') as fp:
            fp.write('\n'.join(hosts) + '\n')

        stderr_dir = os.path.join(temp_dir, "stderr")
        stdout_dir = os.path.join(temp_dir, "stdout")
        for dirname in stderr_dir, stdout_dir:
            os.makedirs(dirname, 0755)

        for host in hosts:
            try:
                ssh_client = paramiko.SSHClient()
                ssh_client.set_missing_host_key_policy(
                    paramiko.AutoAddPolicy())
                ssh_client.connect(host, username=username)
                channel = ssh_client.get_transport().open_session()
                channel.get_pty()
                channel.exec_command(command)
                pending_commands.append((host, ssh_client, channel))
                if verbose:
                    print "Launching remote command on %s (%d / %d)" % (
                        host, len(pending_commands), len(hosts))
            except socket.gaierror as error:
                if not ignore_bad_hosts:
                    raise error

        host_failed = False
        for host in hosts:
            stdout_dict[host] = ""
            stderr_dict[host] = ""

        while len(pending_commands) > 0:
            completed_commands = []
            for host, ssh_client, channel in pending_commands:
                # Check for commands that have finished.
                if channel.exit_status_ready():
                    num_completed_nodes = len(hosts) - len(pending_commands)
                    # This node just completed.
                    num_completed_nodes += 1
                    elapsed_time = time.time() - start_time
                    if verbose:
                        print "%s completed remote command in %.2f seconds " \
                            "(%d / %d)" % (
                                host, elapsed_time, num_completed_nodes,
                                len(hosts))

                    return_code = channel.recv_exit_status()
                    if return_code != 0:
                        print >>sys.stderr, "%s FAILED:" % (host)
                        host_failed = True

                    # Save stdout and stderr to file and to dicts.
                    stdout_file = os.path.join(stdout_dir, host)
                    with open(stdout_file, "w") as fp:
                        while channel.recv_ready():
                            stdout = channel.recv(1024)
                            fp.write(stdout)
                            stdout_dict[host] += stdout

                    stderr_file = os.path.join(stderr_dir, host)
                    with open(stderr_file, "w") as fp:
                        while channel.recv_stderr_ready():
                            stderr = channel.recv_stderr(1024)
                            fp.write(stderr)
                            stderr_dict[host] += stderr
                    if return_code != 0 and verbose:
                        # Echo the failed host's full stderr.
                        sys.stderr.write(stderr_dict[host])

                    ssh_client.close()
                    completed_commands.append((host, ssh_client, channel))

            for completed in completed_commands:
                pending_commands.remove(completed)
            time.sleep(1)

        pending_commands = []
        if host_failed:
            return (1, stdout_dict, stderr_dict)
        else:
            return (0, stdout_dict, stderr_dict)
    except KeyboardInterrupt:
        print >>sys.stderr, "\nCaught keyboard interrupt\n"
        return (1, stdout_dict, stderr_dict)
    finally:
        # Cleanly stop any pending commands.
        remaining_hosts = len(pending_commands)
        if remaining_hosts > 0:
            for host, ssh_client, channel in pending_commands:
                print >>sys.stderr, (
                    "Killing pending command on host '%s' ..." % (host))
                ssh_client.close()
            elapsed_time = time.time() - start_time
            print "Remaining %d commands terminated at %.2f seconds." % (
                remaining_hosts, elapsed_time)
            pending_commands = []

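# A minimal usage sketch (the command and username here are illustrative):
# run a command on every node plus the master, then inspect per-host stderr
# on failure.
#
#   status, stdouts, stderrs = parallel_ssh(
#       redis_client, "df -h", "themis", None, False, True, True)
#   if status != 0:
#       for host, err in stderrs.iteritems():
#           if err:
#               print "%s: %s" % (host, err)
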