def main():
    """Parse command-line arguments and run the network benchmark harness."""
    # The default log directory comes from cluster.conf; benchmark logs go
    # into a "networkbench" subdirectory of it.
    default_log_dir = read_conf_file("cluster.conf", "cluster", "log_directory")
    default_log_dir = os.path.join(
        os.path.expanduser(default_log_dir), "networkbench")

    parser = argparse.ArgumentParser(
        description="Harness for network benchmark application")
    parser.add_argument(
        "--config", "-c", type=str,
        default=os.path.join(BENCHMARK_DIR, "config.yaml"),
        help="config file to use for the benchmark (default: %(default)s)")
    parser.add_argument(
        "--log_directory", "-l", default=default_log_dir,
        help="directory containing logs for an experiment "
        "(default: %(default)s)")
    parser.add_argument(
        "--profiler",
        help="path to the binary of a profiling tool to use, for example "
        "valgrind or operf")
    parser.add_argument(
        "--profiler_options", type=str, default="",
        help="options surrounded by quotes to pass to the profiler")
    parser.add_argument(
        "--iterations", "-i", type=int, default=1,
        help="run the benchmark this many times (default: %(default)s)")
    parser.add_argument(
        "--sleep", "-s", type=int, default=0,
        help="sleep this many seconds between iterations "
        "(default: %(default)s)")
    parser.add_argument(
        "--per_peer_config", action="store_true", default=False,
        help="use separate config files for each peer, by appending the "
        "peer's IP address to the config file name: .A.B.C.D")
    parser.add_argument(
        "--dump_core_directory", "-d", default=None,
        help="dump core file to this directory if the benchmark crashes")
    parser.add_argument(
        "peer_ips",
        help="comma delimited list of host IPs to use for benchmarking")
    parser.add_argument(
        "--remote_connections_only", "-r", action="store_true", default=False,
        help="Only send to remote peers, instead of sending all-to-all, "
        "which includes localhost")
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    # The benchmark binary lives alongside its default config file.
    benchmark_binary = os.path.join(BENCHMARK_DIR, "networkbench")
    # %d renders the boolean flag as 0 or 1 for the benchmark binary.
    extra_params = "-REMOTE_CONNECTIONS_ONLY %d" % (args.remote_connections_only)

    run_benchmark_iterations(
        benchmark_binary, args.log_directory, args.config, args.peer_ips,
        args.profiler, args.profiler_options, args.iterations, args.sleep,
        False,                      # delete_output
        args.per_peer_config, args.dump_core_directory,
        False,                      # solo_mode
        "sender,receiver",          # stage_stats
        args.interfaces, extra_params)
def main():
    """Build the CLI for the generic benchmark runner and dispatch it."""
    parser = argparse.ArgumentParser(
        description="Run a benchmark application on a collection of nodes.")

    # Required positional arguments.
    parser.add_argument("binary", help="benchmark application binary")
    parser.add_argument(
        "log_directory", help="directory containing logs for an experiment")
    parser.add_argument("config", help="config file to use for the benchmark.")
    parser.add_argument(
        "peer_ips",
        help="comma delimited list of host IPs to use for benchmarking")

    # Optional profiling / run-control flags.
    parser.add_argument(
        "--profiler",
        help="path to the binary of a profiling tool to use, for example "
        "valgrind or operf")
    parser.add_argument(
        "--profiler_options", type=str, default="",
        help="options surrounded by quotes to pass to the profiler")
    parser.add_argument(
        "--iterations", "-i", type=int, default=1,
        help="run the benchmark this many times (default: %(default)s)")
    parser.add_argument(
        "--sleep", "-s", type=int, default=0,
        help="sleep this many seconds between iterations "
        "(default: %(default)s)")
    parser.add_argument(
        "--delete_output", action="store_true", default=False,
        help="delete output files after run completes")
    parser.add_argument(
        "--per_peer_config", action="store_true", default=False,
        help="use separate config files for each peer, by appending the "
        "peer's IP address to the config file name: .A.B.C.D")
    parser.add_argument(
        "--dump_core_directory", "-d", default=None,
        help="dump core file to this directory if the benchmark crashes")
    parser.add_argument(
        "--solo_mode", action="store_true", default=False,
        help="run the benchmark on all peers, but run each peer as if it's "
        "its own cluster of size 1.")
    parser.add_argument(
        "--stage_stats",
        help="comma delimited list of stage names to show runtime stats for "
        "upon completion")
    parser.add_argument(
        "--params", type=str, default="",
        help="params that will override the config file")
    add_interfaces_params(parser)

    args = parser.parse_args()
    # Forward every parsed option straight through as keyword arguments;
    # argument names deliberately match the runner's parameter names.
    run_benchmark_iterations(**vars(args))
def main():
    """Entry point: parse runner options and execute the benchmark."""
    arg_parser = argparse.ArgumentParser(
        description="Run a benchmark application on a collection of nodes.")

    # Positionals first, in the order callers must supply them.
    arg_parser.add_argument(
        "binary", help="benchmark application binary")
    arg_parser.add_argument(
        "log_directory", help="directory containing logs for an experiment")
    arg_parser.add_argument(
        "config", help="config file to use for the benchmark.")
    arg_parser.add_argument(
        "peer_ips", help="comma delimited list of host IPs to use for "
        "benchmarking")

    # Optional switches.
    arg_parser.add_argument(
        "--profiler", help="path to the binary of a profiling tool to use, "
        "for example valgrind or operf")
    arg_parser.add_argument(
        "--profiler_options", help="options surrounded by quotes to pass to "
        "the profiler", type=str, default="")
    arg_parser.add_argument(
        "--iterations", "-i", help="run the benchmark this many times "
        "(default: %(default)s)", type=int, default=1)
    arg_parser.add_argument(
        "--sleep", "-s", help="sleep this many seconds between iterations "
        "(default: %(default)s)", type=int, default=0)
    arg_parser.add_argument(
        "--delete_output", help="delete output files after run completes",
        action="store_true", default=False)
    arg_parser.add_argument(
        "--per_peer_config", help="use separate config files for each peer, "
        "by appending the peer's IP address to the config file name: .A.B.C.D",
        action="store_true", default=False)
    arg_parser.add_argument(
        "--dump_core_directory", "-d", help="dump core file to this "
        "directory if the benchmark crashes", default=None)
    arg_parser.add_argument(
        "--solo_mode", help="run the benchmark on all peers, but run each "
        "peer as if it's its own cluster of size 1.",
        action="store_true", default=False)
    arg_parser.add_argument(
        "--stage_stats", help="comma delimited list of stage names to show "
        "runtime stats for upon completion")
    arg_parser.add_argument(
        "--params", help="params that will override the config file",
        type=str, default="")
    add_interfaces_params(arg_parser)

    parsed = arg_parser.parse_args()
    # Option names mirror run_benchmark_iterations' parameters, so the
    # namespace can be splatted directly.
    run_benchmark_iterations(**vars(parsed))
def main(): # Load cluster.conf parser = ConfigParser.SafeConfigParser() parser.read(CLUSTER_CONF) # Get default log directory log_directory = parser.get("cluster", "log_directory") parser = argparse.ArgumentParser( description="coordinates the execution of Themis jobs") parser.add_argument("themis_binary", help="path to the Themis binary") parser.add_argument("config", help="a YAML file giving configuration " "options for Themis") parser.add_argument("--log_directory", "-l", help="the directory in which to store coordinator logs " "(default: %(default)s)", default=log_directory) parser.add_argument("--keepalive_refresh", help="the length of time node " "coordinators should wait between refreshing keepalive " "information (default: %(default)s seconds)", type=int, default=2) parser.add_argument("--keepalive_timeout", help="the amount of time that " "must pass without receiving a keepalive message from " "a node coordinator before the cluster coordinator " "considers that node to be dead (default: %(default)s " "seconds)", type=int, default=10) parser.add_argument("--profiler", help="path to the binary of a profiling" "tool to use, for example valgrind or operf") parser.add_argument("--profiler_options", help="options surrounded by " "quotes to pass to the profiler") parser.add_argument("--ld_preload", help="Path to a library to be " "preloaded using LD_PRELOAD.") utils.add_redis_params(parser) utils.add_interfaces_params(parser) args = parser.parse_args() args.config = os.path.abspath(args.config) args.log_directory = create_log_directory(args.log_directory) log.info("Logging to %s" % (args.log_directory)) job_status_gui = None job_status_gui_out_fp = None resource_monitor_gui = None resource_monitor_gui_out_fp = None coordinator = None try: # To make the status GUI port distinct for each user but deterministic # for a single user, use 2000 + (the md5 hash of the user's username # mod 1000) as the web GUI's port number username_md5sum = hashlib.md5() 
username_md5sum.update(getpass.getuser()) job_status_gui_port = ( (int(username_md5sum.hexdigest(), 16) % 1000 + 2000) / 10) * 10 resource_monitor_gui_port = ( (int(username_md5sum.hexdigest(), 16) % 1000 + 3200) / 10) * 10 print "" # Start the resource monitor web GUI resource_monitor_gui, resource_monitor_gui_out_fp = \ start_resource_monitor_gui(args, resource_monitor_gui_port) # Start the job status web GUI job_status_gui, job_status_gui_out_fp = start_job_status_gui( args, job_status_gui_port) print "" coordinator = ClusterCoordinator(**vars(args)) coordinator.run() finally: if job_status_gui is not None: log.info("Stopping job status GUI (PID %d)" % (job_status_gui.pid)) os.killpg(job_status_gui.pid, signal.SIGTERM) job_status_gui.wait() if job_status_gui_out_fp is not None: job_status_gui_out_fp.flush() job_status_gui_out_fp.close() if resource_monitor_gui is not None: log.info("Stopping resource monitor GUI (PID %d)" % ( resource_monitor_gui.pid)) os.killpg(resource_monitor_gui.pid, signal.SIGTERM) resource_monitor_gui.wait() if resource_monitor_gui_out_fp is not None: resource_monitor_gui_out_fp.flush() resource_monitor_gui_out_fp.close() if coordinator is not None: log.info("Stopping node coordinators") coordinator.stop_node_coordinators()
def main(): # Load cluster.conf parser = ConfigParser.SafeConfigParser() parser.read(CLUSTER_CONF) # Get default log directory log_directory = parser.get("cluster", "log_directory") parser = argparse.ArgumentParser( description="coordinates the execution of Themis jobs") parser.add_argument("themis_binary", help="path to the Themis binary") parser.add_argument("config", help="a YAML file giving configuration " "options for Themis") parser.add_argument( "--log_directory", "-l", help="the directory in which to store coordinator logs " "(default: %(default)s)", default=log_directory) parser.add_argument( "--keepalive_refresh", help="the length of time node " "coordinators should wait between refreshing keepalive " "information (default: %(default)s seconds)", type=int, default=2) parser.add_argument("--keepalive_timeout", help="the amount of time that " "must pass without receiving a keepalive message from " "a node coordinator before the cluster coordinator " "considers that node to be dead (default: %(default)s " "seconds)", type=int, default=10) parser.add_argument("--profiler", help="path to the binary of a profiling" "tool to use, for example valgrind or operf") parser.add_argument("--profiler_options", help="options surrounded by " "quotes to pass to the profiler") parser.add_argument("--ld_preload", help="Path to a library to be " "preloaded using LD_PRELOAD.") utils.add_redis_params(parser) utils.add_interfaces_params(parser) args = parser.parse_args() args.config = os.path.abspath(args.config) args.log_directory = create_log_directory(args.log_directory) log.info("Logging to %s" % (args.log_directory)) job_status_gui = None job_status_gui_out_fp = None resource_monitor_gui = None resource_monitor_gui_out_fp = None coordinator = None try: # To make the status GUI port distinct for each user but deterministic # for a single user, use 2000 + (the md5 hash of the user's username # mod 1000) as the web GUI's port number username_md5sum = hashlib.md5() 
username_md5sum.update(getpass.getuser()) job_status_gui_port = ( (int(username_md5sum.hexdigest(), 16) % 1000 + 2000) / 10) * 10 resource_monitor_gui_port = ( (int(username_md5sum.hexdigest(), 16) % 1000 + 3200) / 10) * 10 print "" # Start the resource monitor web GUI resource_monitor_gui, resource_monitor_gui_out_fp = \ start_resource_monitor_gui(args, resource_monitor_gui_port) # Start the job status web GUI job_status_gui, job_status_gui_out_fp = start_job_status_gui( args, job_status_gui_port) print "" coordinator = ClusterCoordinator(**vars(args)) coordinator.run() finally: if job_status_gui is not None: log.info("Stopping job status GUI (PID %d)" % (job_status_gui.pid)) os.killpg(job_status_gui.pid, signal.SIGTERM) job_status_gui.wait() if job_status_gui_out_fp is not None: job_status_gui_out_fp.flush() job_status_gui_out_fp.close() if resource_monitor_gui is not None: log.info("Stopping resource monitor GUI (PID %d)" % (resource_monitor_gui.pid)) os.killpg(resource_monitor_gui.pid, signal.SIGTERM) resource_monitor_gui.wait() if resource_monitor_gui_out_fp is not None: resource_monitor_gui_out_fp.flush() resource_monitor_gui_out_fp.close() if coordinator is not None: log.info("Stopping node coordinators") coordinator.stop_node_coordinators()
def main():
    """Parse arguments, set up logging and signal handling, and run one
    node coordinator.

    On any failure the current batch is marked failed (unless the failure is
    a SystemExit); keepalive refreshing is always stopped on the way out.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config", help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("log_directory", help="the base log directory where "
                        "the job runner stores its logs")
    parser.add_argument("batch_nonce", type=int,
                        help="the nonce for all batches executed by this "
                        "node coordinator")
    parser.add_argument(
        "--keepalive_refresh", type=int,
        help="the interval, in seconds, between refreshes of the key that "
        "this node coordinator uses to tell the cluster coordinator that "
        "it's still alive")
    parser.add_argument("--keepalive_timeout", type=int, default=10,
                        help="the amount of time that must pass without "
                        "receiving a keepalive message from this node "
                        "coordinator before the cluster coordinator "
                        "considers it to be dead (default: %(default)s "
                        "seconds)")
    # BUG FIX: the original joined "profiling" and "tool" with no space,
    # printing "profilingtool" in --help output.
    parser.add_argument("--profiler",
                        help="path to the binary of a profiling "
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options", help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload", help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")
    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()
    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    # One log file per host; move any leftover from a previous run aside.
    node_coordinator_log = os.path.join(
        args.log_directory, "node_coordinators",
        "%s.log" % (socket.getfqdn()))
    utils.backup_if_exists(node_coordinator_log)
    logging.basicConfig(
        format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s",
        datefmt="%m-%d %H:%M:%S", filename=node_coordinator_log)

    coordinator = None

    def signal_handler(signal_id, frame):
        # Kill the whole process group so child processes die with us.
        log.error("Caught signal %s" % (str(signal_id)))
        os.killpg(0, signal.SIGKILL)
        sys.exit(1)

    signal.signal(signal.SIGUSR1, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        coordinator = NodeCoordinator(**vars(args))
        coordinator.run()
    except:  # deliberately bare: SystemExit is inspected below
        # Log and print the exception we just caught
        exception_info = sys.exc_info()
        exception = exception_info[1]

        log.exception(exception)
        traceback.print_exception(*exception_info)

        if (not isinstance(exception, SystemExit)) and coordinator is not None:
            log.error("Marking current batch as failed")
            coordinator.fail_current_batch(
                "Node coordinator error: " + str(exception_info[1]))
        # NOTE(review): the exception is swallowed here, so the process still
        # exits with status 0 after a failure — confirm this is intentional.
    finally:
        if coordinator is not None:
            coordinator.stop_keepalive()
def main():
    """Node coordinator entry point: parse args, wire up per-host logging
    and fatal-signal handling, then run the NodeCoordinator.

    Marks the current batch failed on unexpected errors (SystemExit is
    exempt) and always stops the keepalive thread before returning.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config", help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("log_directory", help="the base log directory where "
                        "the job runner stores its logs")
    parser.add_argument("batch_nonce", help="the nonce for all batches "
                        "executed by this node coordinator", type=int)
    parser.add_argument("--keepalive_refresh", type=int,
                        help="the interval, in seconds, between refreshes "
                        "of the key that this node coordinator uses to tell "
                        "the cluster coordinator that it's still alive")
    parser.add_argument("--keepalive_timeout", type=int, default=10,
                        help="the amount of time that must pass without "
                        "receiving a keepalive message from this node "
                        "coordinator before the cluster coordinator "
                        "considers it to be dead (default: %(default)s "
                        "seconds)")
    # BUG FIX: insert the missing space so --help reads "profiling tool"
    # rather than "profilingtool".
    parser.add_argument("--profiler",
                        help="path to the binary of a profiling "
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options", help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload", help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")
    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()
    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    # Per-host log file, keyed by fully-qualified domain name.
    node_coordinator_log = os.path.join(
        args.log_directory, "node_coordinators",
        "%s.log" % (socket.getfqdn()))
    utils.backup_if_exists(node_coordinator_log)
    logging.basicConfig(
        format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s",
        datefmt="%m-%d %H:%M:%S", filename=node_coordinator_log)

    coordinator = None

    def signal_handler(signal_id, frame):
        # Take the whole process group down so spawned children don't linger.
        log.error("Caught signal %s" % (str(signal_id)))
        os.killpg(0, signal.SIGKILL)
        sys.exit(1)

    signal.signal(signal.SIGUSR1, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        coordinator = NodeCoordinator(**vars(args))
        coordinator.run()
    except:  # bare on purpose — SystemExit is filtered out below
        # Log and print the exception we just caught
        exception_info = sys.exc_info()
        exception = exception_info[1]

        log.exception(exception)
        traceback.print_exception(*exception_info)

        if (not isinstance(exception, SystemExit)) and coordinator is not None:
            log.error("Marking current batch as failed")
            coordinator.fail_current_batch(
                "Node coordinator error: " + str(exception_info[1]))
        # NOTE(review): the error is not re-raised, so the process exit code
        # stays 0 even on failure — verify callers rely on batch state, not
        # the exit code.
    finally:
        if coordinator is not None:
            coordinator.stop_keepalive()