def start_node_coordinators(self):
    """Launch a node_coordinator.py process on every known node via ssh.

    Any previously running coordinators are stopped first. For each host in
    ``self.coordinator_db.known_nodes``, this backs up old stdout/stderr
    logs, registers a keepalive key, and runs the coordinator remotely in
    the background (``nohup ... &``) through ``self.ssh_command``.

    Side effects: sets ``self.known_nodes`` and ``self.total_nodes``;
    creates log files under ``self.node_coordinator_log_dir``.

    Raises subprocess.CalledProcessError if any ssh invocation fails.
    """
    # Stop any running coordinators first!
    self.stop_node_coordinators()

    # Conditionally specify the profiler option
    profiler_option = ""

    if self.profiler is not None:
        profiler_option = "--profiler %s" % self.profiler

        if self.profiler_options is not None:
            # We already use single quotes in the ssh command so we need to
            # use this bash voodoo detailed in
            # http://stackoverflow.com/a/1250279/470771
            profiler_option = "%s --profiler_options '\"'\"'%s'\"'\"'" % (
                profiler_option, self.profiler_options)

    # Conditionally specify LD_PRELOAD library
    # NOTE: trailing space is intentional -- the template concatenates this
    # directly before the next option.
    ld_preload = ""

    if self.ld_preload is not None:
        ld_preload = "--ld_preload %s "% self.ld_preload

    # Start node coordinators on each node.
    # ${host}, ${log_dir}, ${stdout_file} and ${stderr_file} are filled in
    # per-host below via Template.substitute; everything else is baked in
    # now with %-formatting.
    # NOTE(review): the remote command sources ~/.bash_profile -- assumes it
    # exists on every node; confirm for the deployment environment.
    ssh_command_template = string.Template(
        ("%s ${host} 'source /etc/profile; source ~/.bash_profile; "
         "mkdir -p %s; nohup "
         "%s --redis_port=%d --redis_db=%d "
         "--redis_host=%s --keepalive_refresh=%d --keepalive_timeout=%d %s "
         "%s --interfaces %s %s %s ${log_dir} %s 1>${stdout_file} "
         "2>${stderr_file} &'") % (
            self.ssh_command, self.node_coordinator_log_dir,
            os.path.join(SCRIPT_DIR, "node_coordinator.py"),
            self.redis_port, self.redis_db, self.redis_host,
            self.keepalive_refresh, self.keepalive_timeout,
            profiler_option, ld_preload, self.interfaces,
            self.themis_binary, self.config_file, self.batch_nonce))

    self.known_nodes = self.coordinator_db.known_nodes
    self.total_nodes = len(self.known_nodes)

    for host in self.known_nodes:
        # Create log directory for node coordinator
        node_coordinator_stdout_file = os.path.join(
            self.node_coordinator_log_dir, "stdout-%s.log" % (host))
        node_coordinator_stderr_file = os.path.join(
            self.node_coordinator_log_dir, "stderr-%s.log" % (host))

        # Preserve logs from any previous run rather than clobbering them.
        for log_filename in [
            node_coordinator_stdout_file, node_coordinator_stderr_file]:
            utils.backup_if_exists(log_filename)

        ssh_cmd = ssh_command_template.substitute(
            host=host,
            stdout_file = node_coordinator_stdout_file,
            stderr_file = node_coordinator_stderr_file,
            log_dir = self.log_directory)

        # Create a keepalive key for this node coordinator
        self.coordinator_db.create_keepalive(host)

        log.info("Starting node coordinator on '%s'" % (host))
        subprocess.check_call(ssh_cmd, shell=True)
def start_job_status_gui(args, gui_port):
    """Launch the job status web GUI as a child process.

    Output from the GUI is redirected to ``web_gui.log`` under
    ``args.log_directory`` (any existing log is backed up first).

    Returns a (process object, open log file handle) tuple; the caller is
    responsible for both.
    """
    gui_log = os.path.join(args.log_directory, "web_gui.log")
    utils.backup_if_exists(gui_log)
    log_fp = open(gui_log, "w")

    script = os.path.join(os.path.dirname(__file__), "job_status.py")
    command = (
        "%s --redis_port=%d --redis_db=%d --redis_host=%s --port=%d %s") % (
            script, args.redis_port, args.redis_db, args.redis_host,
            gui_port, args.log_directory)

    process = spawn_gui_and_check_bind(
        command, gui_port, log_fp, "job status GUI")

    return (process, log_fp)
def start_resource_monitor_gui(args, gui_port):
    """Launch the resource monitor web GUI as a child process.

    Reads MONITOR_PORT from the YAML config at ``args.config`` so the GUI
    knows which port the per-node resource monitors listen on. GUI output
    is redirected to ``resource_monitor_gui.log`` under
    ``args.log_directory`` (existing logs are backed up first).

    Returns a (process object, open log file handle) tuple; the caller is
    responsible for both.
    """
    # Use safe_load: yaml.load without an explicit Loader is deprecated
    # (and an error in PyYAML >= 6) and can construct arbitrary Python
    # objects from the config file.
    with open(args.config, 'r') as fp:
        app_config = yaml.safe_load(fp)

    node_resource_monitor_port = app_config["MONITOR_PORT"]

    cmd_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir,
                     "resource_monitor_gui", "resource_monitor_gui.py"))

    cmd = ("%s --redis_port=%d --redis_db=%d --redis_host=%s --port=%d %d") % (
        cmd_path, args.redis_port, args.redis_db, args.redis_host, gui_port,
        node_resource_monitor_port)

    log_file = os.path.join(args.log_directory, "resource_monitor_gui.log")
    utils.backup_if_exists(log_file)
    out_fp = open(log_file, "w")

    cmd_obj = spawn_gui_and_check_bind(
        cmd, gui_port, out_fp, "resource monitor GUI")

    return (cmd_obj, out_fp)
def start_resource_monitor_gui(args, gui_port):
    """Launch the resource monitor web GUI as a child process.

    NOTE(review): this re-defines start_resource_monitor_gui, which already
    appears earlier in this file; at import time the later definition wins.
    Confirm which copy is intended and delete the other.

    Reads MONITOR_PORT from the YAML config at ``args.config``, redirects
    GUI output to ``resource_monitor_gui.log`` under ``args.log_directory``
    (existing logs are backed up first), and returns a
    (process object, open log file handle) tuple owned by the caller.
    """
    # Use safe_load: yaml.load without an explicit Loader is deprecated
    # (and an error in PyYAML >= 6) and can construct arbitrary Python
    # objects from the config file.
    with open(args.config, 'r') as fp:
        app_config = yaml.safe_load(fp)

    node_resource_monitor_port = app_config["MONITOR_PORT"]

    cmd_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir,
                     "resource_monitor_gui", "resource_monitor_gui.py"))

    cmd = ("%s --redis_port=%d --redis_db=%d --redis_host=%s --port=%d %d") % (
        cmd_path, args.redis_port, args.redis_db, args.redis_host, gui_port,
        node_resource_monitor_port)

    log_file = os.path.join(args.log_directory, "resource_monitor_gui.log")
    utils.backup_if_exists(log_file)
    out_fp = open(log_file, "w")

    cmd_obj = spawn_gui_and_check_bind(cmd, gui_port, out_fp,
                                       "resource monitor GUI")

    return (cmd_obj, out_fp)
def main():
    """Entry point for the node coordinator.

    Parses command-line arguments, sets up per-node logging, installs
    signal handlers that kill the whole process group, then constructs and
    runs a NodeCoordinator. Any exception (other than SystemExit) marks
    the current batch as failed; the keepalive is always stopped on exit.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config", help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("log_directory", help="the base log directory where "
                        "the job runner stores its logs")
    parser.add_argument("batch_nonce", help="the nonce for all batches "
                        "executed by this node coordinator", type=int)
    parser.add_argument(
        "--keepalive_refresh", help="the interval, in seconds, "
        "between refreshes of the key that this node "
        "coordinator uses to tell the cluster coordinator that "
        "it's still alive", type=int)
    parser.add_argument("--keepalive_timeout", help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "this node coordinator before the cluster coordinator "
                        "considers it to be dead (default: %(default)s "
                        "seconds)", type=int, default=10)
    # Fixed missing space: the implicitly-concatenated help string used to
    # render as "profilingtool".
    parser.add_argument("--profiler", help="path to the binary of a profiling "
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options", help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload", help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    # One log file per host, named after the fully-qualified domain name.
    node_coordinator_log = os.path.join(
        args.log_directory, "node_coordinators",
        "%s.log" % (socket.getfqdn()))

    utils.backup_if_exists(node_coordinator_log)

    logging.basicConfig(
        format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s",
        datefmt="%m-%d %H:%M:%S",
        filename=node_coordinator_log)

    coordinator = None

    def signal_handler(signal_id, frame):
        # Kill the entire process group so Themis children die with us.
        log.error("Caught signal %s" % (str(signal_id)))
        os.killpg(0, signal.SIGKILL)
        sys.exit(1)

    signal.signal(signal.SIGUSR1, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        coordinator = NodeCoordinator(**vars(args))
        coordinator.run()
    except:
        # Log and print the exception you just caught. Bare except is
        # deliberate: SystemExit is inspected below and must not mark the
        # batch as failed.
        exception_info = sys.exc_info()

        exception = exception_info[1]

        log.exception(exception)
        traceback.print_exception(*exception_info)

        if (not isinstance(exception, SystemExit)) and coordinator is not None:
            log.error("Marking current batch as failed")
            coordinator.fail_current_batch(
                "Node coordinator error: " + str(exception_info[1]))
    finally:
        if coordinator is not None:
            coordinator.stop_keepalive()
def _run_themis(self, binary, command_params, log_dir):
    """Run the Themis binary once and report success.

    Builds a shell command from ``binary`` plus ``command_params`` (each
    key/value becomes ``-KEY value``), optionally wrapped in a profiler
    and/or LD_PRELOAD, runs it with output redirected to per-host
    stdout/stderr logs under ``log_dir``, and starts/stops sar, iostat and
    vnstat monitors around the run. If the binary exits non-zero, the
    current batch is failed with the captured stderr; if core dumps are
    enabled in .themisrc, any core file is renamed by batch number.

    Returns True iff the binary exited with status 0.
    Mutates ``command_params`` (adds INTERMEDIATE_DISK_LIST).
    """
    # Refresh the current set of local disks, which may have changed if
    # disks failed during a previous phase
    intermediate_disks = self.coordinator_db.local_disks(self.hostname)
    assert len(intermediate_disks) > 0

    command_params["INTERMEDIATE_DISK_LIST"] = ','.join(intermediate_disks)

    if not os.path.exists(log_dir):
        try:
            os.makedirs(log_dir)
        except:
            # Directory already exists
            pass

    # Start sar, iostat, and vnstat logging.
    # Only run vnstat on the first interface for simplicity.
    # NOTE(review): filter() returns an iterator on Python 3, so
    # interface_list[0] would raise TypeError there -- this code assumes
    # Python 2 semantics; confirm the target runtime.
    interface_list = filter(lambda x: len(x) > 0,
                            self.interfaces.split(','))
    monitors = monitor_utils.start_monitors(log_dir, self.hostname,
                                            interface_list[0])

    # Check core dump settings
    dump_core = False
    themisrc = utils.get_themisrc()

    dump_core = ("dump_core" in themisrc and themisrc["dump_core"])

    # Render "-KEY value" pairs for every command parameter.
    params_string = ' '.join(
        map(lambda x: "-%s %s" % (x[0], str(x[1])),
            command_params.items()))

    # If the user specified a profiling tool, run that instead and pass the
    # binary to its first argument.
    if self.profiler is not None:
        profiler_options = ""

        if self.profiler_options is not None:
            profiler_options = self.profiler_options

        if self.profiler == "operf":
            # Use the log directory as the operf session dir
            session_dir = os.path.join(os.path.dirname(log_dir), "oprofile",
                                       os.path.basename(log_dir),
                                       self.hostname)
            if not os.path.exists(session_dir):
                os.makedirs(session_dir)
            binary = "%s %s --session-dir=%s %s" % (
                self.profiler, profiler_options, session_dir, binary)
        else:
            # Some other profiler, just prepend it to the binary
            binary = "%s %s %s" % (self.profiler, profiler_options, binary)

    # If the user specified a library to LD_PRELOAD, set the environment
    # variable before running the binary.
    if self.ld_preload is not None:
        binary = "LD_PRELOAD=%s %s" % (self.ld_preload, binary)

    command = ' '.join((binary, params_string))
    # NOTE(review): logged at ERROR level although it is not an error --
    # presumably to guarantee visibility at any log level; confirm intent.
    log.error(command)

    # Create a file containing the command being run
    cmd_log_file = os.path.join(log_dir, "%s.cmd" % (socket.getfqdn()))
    with open(cmd_log_file, 'w') as fp:
        fp.write(command)
        fp.flush()

    core_path = None

    if dump_core:
        # Should be running in the context of one of this host's local
        # disks so that if we dump core, it gets dumped to space that can
        # hold it
        local_disks = self.coordinator_db.local_disks(self.hostname)

        if len(local_disks) > 0:
            run_dir = local_disks[0]
        else:
            run_dir = "/tmp"

        run_dir = os.path.join(run_dir, self.username)

        # The kernel's core_pattern determines the core file's name.
        with open("/proc/sys/kernel/core_pattern", "r") as fp:
            core_filename = fp.read().strip()

        core_path = os.path.abspath(os.path.join(run_dir, core_filename))
        utils.backup_if_exists(core_path)

        if not os.path.exists(run_dir):
            os.makedirs(run_dir)

        command = "cd %s; ulimit -c unlimited; %s" % (run_dir, command)

    stdout_file = os.path.join(log_dir, "stdout-%s.log" % (self.hostname))
    stderr_file = os.path.join(log_dir, "stderr-%s.log" % (self.hostname))

    for filename in [stdout_file, stderr_file]:
        utils.backup_if_exists(filename)

    out_fp = open(stdout_file, 'w')
    err_fp = open(stderr_file, 'w')

    cmd_obj = subprocess.Popen(command, shell=True, stdout=out_fp,
                               stderr=err_fp)
    # Block until the Themis run completes.
    cmd_obj.communicate()

    out_fp.flush()
    out_fp.close()
    err_fp.flush()
    err_fp.close()

    # Terminate sar, iostat, and vnstat
    monitor_utils.stop_monitors(*monitors)

    if cmd_obj.returncode != 0:
        log.error("Themis exited with status %d", cmd_obj.returncode)
        if dump_core:
            assert core_path is not None

            # Identify the core file by its batch number
            if os.path.exists(core_path):
                core_path_with_batch = os.path.join(
                    os.path.dirname(core_path),
                    "core.batch_%d" % (self.current_batch))
                shutil.move(core_path, core_path_with_batch)

        # Report the captured stderr as the batch failure reason.
        with open(stderr_file, 'r') as fp:
            error_msg = fp.read()
        self.fail_current_batch(error_msg)
        log.error(error_msg)

    return cmd_obj.returncode == 0
def main():
    """Entry point for the node coordinator.

    NOTE(review): this re-defines main(), which already appears earlier in
    this file; the later definition wins at import time — confirm which
    copy is intended.

    Parses command-line arguments, sets up per-node logging, installs
    signal handlers that kill the whole process group, then constructs and
    runs a NodeCoordinator. Any exception (other than SystemExit) marks
    the current batch as failed; the keepalive is always stopped on exit.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config", help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("log_directory", help="the base log directory where "
                        "the job runner stores its logs")
    parser.add_argument("batch_nonce", help="the nonce for all batches "
                        "executed by this node coordinator", type=int)
    parser.add_argument("--keepalive_refresh", help="the interval, in seconds, "
                        "between refreshes of the key that this node "
                        "coordinator uses to tell the cluster coordinator that "
                        "it's still alive", type=int)
    parser.add_argument("--keepalive_timeout", help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "this node coordinator before the cluster coordinator "
                        "considers it to be dead (default: %(default)s "
                        "seconds)", type=int, default=10)
    # Fixed missing space: the implicitly-concatenated help string used to
    # render as "profilingtool".
    parser.add_argument("--profiler", help="path to the binary of a profiling "
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options", help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload", help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    # One log file per host, named after the fully-qualified domain name.
    node_coordinator_log = os.path.join(
        args.log_directory, "node_coordinators",
        "%s.log" % (socket.getfqdn()))

    utils.backup_if_exists(node_coordinator_log)

    logging.basicConfig(
        format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s",
        datefmt="%m-%d %H:%M:%S",
        filename=node_coordinator_log)

    coordinator = None

    def signal_handler(signal_id, frame):
        # Kill the entire process group so Themis children die with us.
        log.error("Caught signal %s" % (str(signal_id)))
        os.killpg(0, signal.SIGKILL)
        sys.exit(1)

    signal.signal(signal.SIGUSR1, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        coordinator = NodeCoordinator(**vars(args))
        coordinator.run()
    except:
        # Log and print the exception you just caught. Bare except is
        # deliberate: SystemExit is inspected below and must not mark the
        # batch as failed.
        exception_info = sys.exc_info()

        exception = exception_info[1]

        log.exception(exception)
        traceback.print_exception(*exception_info)

        if (not isinstance(exception, SystemExit)) and coordinator is not None:
            log.error("Marking current batch as failed")
            coordinator.fail_current_batch(
                "Node coordinator error: " + str(exception_info[1]))
    finally:
        if coordinator is not None:
            coordinator.stop_keepalive()
def _run_themis(self, binary, command_params, log_dir):
    """Run the Themis binary once and report success.

    NOTE(review): this re-defines _run_themis, which already appears
    earlier in this file (the earlier copy also logs the command and the
    exit status); the later definition wins at import time — confirm which
    copy is intended.

    Builds a shell command from ``binary`` plus ``command_params`` (each
    key/value becomes ``-KEY value``), optionally wrapped in a profiler
    and/or LD_PRELOAD, runs it with output redirected to per-host
    stdout/stderr logs under ``log_dir``, and starts/stops sar, iostat and
    vnstat monitors around the run. If the binary exits non-zero, the
    current batch is failed with the captured stderr; if core dumps are
    enabled in .themisrc, any core file is renamed by batch number.

    Returns True iff the binary exited with status 0.
    Mutates ``command_params`` (adds INTERMEDIATE_DISK_LIST).
    """
    # Refresh the current set of local disks, which may have changed if
    # disks failed during a previous phase
    intermediate_disks = self.coordinator_db.local_disks(self.hostname)
    assert len(intermediate_disks) > 0

    command_params["INTERMEDIATE_DISK_LIST"] = ','.join(
        intermediate_disks)

    if not os.path.exists(log_dir):
        try:
            os.makedirs(log_dir)
        except:
            # Directory already exists
            pass

    # Start sar, iostat, and vnstat logging.
    # Only run vnstat on the first interface for simplicity.
    # NOTE(review): filter() returns an iterator on Python 3, so
    # interface_list[0] would raise TypeError there -- this code assumes
    # Python 2 semantics; confirm the target runtime.
    interface_list = filter(
        lambda x: len(x) > 0, self.interfaces.split(','))
    monitors = monitor_utils.start_monitors(
        log_dir, self.hostname, interface_list[0])

    # Check core dump settings
    dump_core = False
    themisrc = utils.get_themisrc()

    dump_core = ("dump_core" in themisrc and themisrc["dump_core"])

    # Render "-KEY value" pairs for every command parameter.
    params_string = ' '.join(
        map(lambda x: "-%s %s" % (x[0], str(x[1])),
            command_params.items()))

    # If the user specified a profiling tool, run that instead and pass the
    # binary to its first argument.
    if self.profiler is not None:
        profiler_options = ""

        if self.profiler_options is not None:
            profiler_options = self.profiler_options

        if self.profiler == "operf":
            # Use the log directory as the operf session dir
            session_dir = os.path.join(
                os.path.dirname(log_dir), "oprofile",
                os.path.basename(log_dir), self.hostname)
            if not os.path.exists(session_dir):
                os.makedirs(session_dir)
            binary = "%s %s --session-dir=%s %s" % (
                self.profiler, profiler_options, session_dir, binary)
        else:
            # Some other profiler, just prepend it to the binary
            binary = "%s %s %s" % (self.profiler, profiler_options, binary)

    # If the user specified a library to LD_PRELOAD, set the environment
    # variable before running the binary.
    if self.ld_preload is not None:
        binary = "LD_PRELOAD=%s %s" % (self.ld_preload, binary)

    command = ' '.join((binary, params_string))

    # Create a file containing the command being run
    cmd_log_file = os.path.join(log_dir, "%s.cmd" % (socket.getfqdn()))
    with open(cmd_log_file, 'w') as fp:
        fp.write(command)

    core_path = None

    if dump_core:
        # Should be running in the context of one of this host's local
        # disks so that if we dump core, it gets dumped to space that can
        # hold it
        local_disks = self.coordinator_db.local_disks(self.hostname)

        if len(local_disks) > 0:
            run_dir = local_disks[0]
        else:
            run_dir = "/tmp"

        run_dir = os.path.join(run_dir, self.username)

        # The kernel's core_pattern determines the core file's name.
        with open("/proc/sys/kernel/core_pattern", "r") as fp:
            core_filename = fp.read().strip()

        core_path = os.path.abspath(os.path.join(run_dir, core_filename))
        utils.backup_if_exists(core_path)

        if not os.path.exists(run_dir):
            os.makedirs(run_dir)

        command = "cd %s; ulimit -c unlimited; %s" % (run_dir, command)

    stdout_file = os.path.join(log_dir, "stdout-%s.log" % (self.hostname))
    stderr_file = os.path.join(log_dir, "stderr-%s.log" % (self.hostname))

    for filename in [stdout_file, stderr_file]:
        utils.backup_if_exists(filename)

    out_fp = open(stdout_file, 'w')
    err_fp = open(stderr_file, 'w')

    cmd_obj = subprocess.Popen(
        command, shell=True, stdout=out_fp, stderr=err_fp)
    # Block until the Themis run completes.
    cmd_obj.communicate()

    out_fp.flush()
    out_fp.close()
    err_fp.flush()
    err_fp.close()

    # Terminate sar, iostat, and vnstat
    monitor_utils.stop_monitors(*monitors)

    if cmd_obj.returncode != 0:
        if dump_core:
            assert core_path is not None

            # Identify the core file by its batch number
            if os.path.exists(core_path):
                core_path_with_batch = os.path.join(
                    os.path.dirname(core_path),
                    "core.batch_%d" % (self.current_batch))
                shutil.move(core_path, core_path_with_batch)

        # Report the captured stderr as the batch failure reason.
        with open(stderr_file, 'r') as fp:
            error_msg = fp.read()
        self.fail_current_batch(error_msg)
        log.error(error_msg)

    return cmd_obj.returncode == 0