def start_node_coordinators(self):
        """Restart a node coordinator on every known node over ssh.

        Any coordinators that are already running are stopped first. The set
        of hosts is read from ``self.coordinator_db.known_nodes`` and stored
        in ``self.known_nodes`` (with its size in ``self.total_nodes``). For
        each host the previous stdout/stderr logs are backed up, a keepalive
        key is created, and the launch command is run through the shell;
        ``subprocess.check_call`` raises if that command exits non-zero.
        """
        # Stop any running coordinators first!
        self.stop_node_coordinators()

        # Optional profiler flags forwarded to node_coordinator.py
        profiler_flags = ""
        if self.profiler is not None:
            profiler_flags = "--profiler %s" % self.profiler
            if self.profiler_options is not None:
                # The remote command is already wrapped in single quotes, so
                # the options are quoted with the bash idiom described in
                # http://stackoverflow.com/a/1250279/470771
                profiler_flags = "%s --profiler_options '\"'\"'%s'\"'\"'" % (
                    profiler_flags, self.profiler_options)

        # Optional LD_PRELOAD library flag (note the trailing space)
        if self.ld_preload is None:
            preload_flag = ""
        else:
            preload_flag = "--ld_preload %s " % self.ld_preload

        # Everything that is identical for all hosts is filled in now with
        # %-formatting; the ${...} placeholders are left for the per-host
        # string.Template substitution below.
        coordinator_script = os.path.join(SCRIPT_DIR, "node_coordinator.py")
        command_format = (
            "%s ${host} 'source /etc/profile; source ~/.bash_profile; "
            "mkdir -p %s; nohup "
            "%s --redis_port=%d --redis_db=%d "
            "--redis_host=%s --keepalive_refresh=%d --keepalive_timeout=%d %s "
            "%s --interfaces %s %s %s ${log_dir} %s 1>${stdout_file} "
            "2>${stderr_file} &'")
        launch_template = string.Template(command_format % (
            self.ssh_command,
            self.node_coordinator_log_dir,
            coordinator_script,
            self.redis_port,
            self.redis_db,
            self.redis_host,
            self.keepalive_refresh,
            self.keepalive_timeout,
            profiler_flags,
            preload_flag,
            self.interfaces,
            self.themis_binary,
            self.config_file,
            self.batch_nonce))

        self.known_nodes = self.coordinator_db.known_nodes
        self.total_nodes = len(self.known_nodes)

        for host in self.known_nodes:
            stdout_path = os.path.join(
                self.node_coordinator_log_dir, "stdout-%s.log" % host)
            stderr_path = os.path.join(
                self.node_coordinator_log_dir, "stderr-%s.log" % host)

            # Keep the logs of any previous run before they are overwritten
            utils.backup_if_exists(stdout_path)
            utils.backup_if_exists(stderr_path)

            launch_command = launch_template.substitute({
                "host": host,
                "stdout_file": stdout_path,
                "stderr_file": stderr_path,
                "log_dir": self.log_directory})

            # Create a keepalive key for this node coordinator
            self.coordinator_db.create_keepalive(host)

            log.info("Starting node coordinator on '%s'" % (host))
            subprocess.check_call(launch_command, shell=True)
def start_job_status_gui(args, gui_port):
    """Launch the job status web GUI, logging its output to ``web_gui.log``.

    Args:
        args: parsed arguments providing ``log_directory``, ``redis_port``,
            ``redis_db`` and ``redis_host``.
        gui_port: port number passed to the GUI via ``--port``.

    Returns:
        A ``(cmd_obj, out_fp)`` tuple: the value returned by
        ``spawn_gui_and_check_bind`` and the log file object, which is
        returned still open.

    Raises:
        Whatever ``spawn_gui_and_check_bind`` raises; the log file is closed
        before the exception propagates.
    """
    log_file = os.path.join(args.log_directory, "web_gui.log")

    utils.backup_if_exists(log_file)

    # Build the command before opening (and truncating) the log so that a
    # formatting error cannot leave a dangling file handle behind.
    cmd = ("%s --redis_port=%d --redis_db=%d --redis_host=%s --port=%d %s") % (
        os.path.join(os.path.dirname(__file__), "job_status.py"),
        args.redis_port, args.redis_db, args.redis_host,
        gui_port, args.log_directory)

    out_fp = open(log_file, "w")

    try:
        cmd_obj = spawn_gui_and_check_bind(
            cmd, gui_port, out_fp, "job status GUI")
    except BaseException:
        # Nobody else holds a reference to the log yet, so close it here
        # rather than leaking the descriptor.
        out_fp.close()
        raise

    return (cmd_obj, out_fp)
def start_job_status_gui(args, gui_port):
    """Launch the job status web GUI, logging its output to ``web_gui.log``.

    Args:
        args: parsed arguments providing ``log_directory``, ``redis_port``,
            ``redis_db`` and ``redis_host``.
        gui_port: port number passed to the GUI via ``--port``.

    Returns:
        A ``(cmd_obj, out_fp)`` tuple: the value returned by
        ``spawn_gui_and_check_bind`` and the log file object, which is
        returned still open.

    Raises:
        Whatever ``spawn_gui_and_check_bind`` raises; the log file is closed
        before the exception propagates.
    """
    log_file = os.path.join(args.log_directory, "web_gui.log")

    utils.backup_if_exists(log_file)

    # Build the command before opening (and truncating) the log so that a
    # formatting error cannot leave a dangling file handle behind.
    cmd = ("%s --redis_port=%d --redis_db=%d --redis_host=%s --port=%d %s") % (
        os.path.join(os.path.dirname(__file__),
                     "job_status.py"), args.redis_port, args.redis_db,
        args.redis_host, gui_port, args.log_directory)

    out_fp = open(log_file, "w")

    try:
        cmd_obj = spawn_gui_and_check_bind(
            cmd, gui_port, out_fp, "job status GUI")
    except BaseException:
        # Nobody else holds a reference to the log yet, so close it here
        # rather than leaking the descriptor.
        out_fp.close()
        raise

    return (cmd_obj, out_fp)
def start_resource_monitor_gui(args, gui_port):
    """Launch the resource monitor GUI, logging to ``resource_monitor_gui.log``.

    The per-node monitor port is read from the ``MONITOR_PORT`` key of the
    YAML file at ``args.config`` and passed to the GUI as its positional
    argument.

    Args:
        args: parsed arguments providing ``config``, ``log_directory``,
            ``redis_port``, ``redis_db`` and ``redis_host``.
        gui_port: port number passed to the GUI via ``--port``.

    Returns:
        A ``(cmd_obj, out_fp)`` tuple: the value returned by
        ``spawn_gui_and_check_bind`` and the log file object, which is
        returned still open.

    Raises:
        KeyError: if the config file has no ``MONITOR_PORT`` entry.
        Whatever ``spawn_gui_and_check_bind`` raises; the log file is closed
        before the exception propagates.
    """
    with open(args.config, 'r') as fp:
        # safe_load only builds plain YAML types. A bare yaml.load(fp) can
        # construct arbitrary Python objects and is rejected outright by
        # newer PyYAML releases that require an explicit Loader.
        app_config = yaml.safe_load(fp)

    node_resource_monitor_port = app_config["MONITOR_PORT"]

    cmd_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir,
                     "resource_monitor_gui", "resource_monitor_gui.py"))

    cmd = ("%s --redis_port=%d --redis_db=%d --redis_host=%s --port=%d %d") % (
        cmd_path, args.redis_port, args.redis_db, args.redis_host, gui_port,
        node_resource_monitor_port)

    log_file = os.path.join(args.log_directory, "resource_monitor_gui.log")

    utils.backup_if_exists(log_file)

    out_fp = open(log_file, "w")

    try:
        cmd_obj = spawn_gui_and_check_bind(
            cmd, gui_port, out_fp, "resource monitor GUI")
    except BaseException:
        # Nobody else holds a reference to the log yet, so close it here
        # rather than leaking the descriptor.
        out_fp.close()
        raise

    return (cmd_obj, out_fp)
def start_resource_monitor_gui(args, gui_port):
    """Launch the resource monitor GUI, logging to ``resource_monitor_gui.log``.

    The per-node monitor port is read from the ``MONITOR_PORT`` key of the
    YAML file at ``args.config`` and passed to the GUI as its positional
    argument.

    Args:
        args: parsed arguments providing ``config``, ``log_directory``,
            ``redis_port``, ``redis_db`` and ``redis_host``.
        gui_port: port number passed to the GUI via ``--port``.

    Returns:
        A ``(cmd_obj, out_fp)`` tuple: the value returned by
        ``spawn_gui_and_check_bind`` and the log file object, which is
        returned still open.

    Raises:
        KeyError: if the config file has no ``MONITOR_PORT`` entry.
        Whatever ``spawn_gui_and_check_bind`` raises; the log file is closed
        before the exception propagates.
    """
    with open(args.config, 'r') as fp:
        # safe_load only builds plain YAML types. A bare yaml.load(fp) can
        # construct arbitrary Python objects and is rejected outright by
        # newer PyYAML releases that require an explicit Loader.
        app_config = yaml.safe_load(fp)

    node_resource_monitor_port = app_config["MONITOR_PORT"]

    cmd_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir,
                     "resource_monitor_gui", "resource_monitor_gui.py"))

    cmd = ("%s --redis_port=%d --redis_db=%d --redis_host=%s --port=%d %d") % (
        cmd_path, args.redis_port, args.redis_db, args.redis_host, gui_port,
        node_resource_monitor_port)

    log_file = os.path.join(args.log_directory, "resource_monitor_gui.log")

    utils.backup_if_exists(log_file)

    out_fp = open(log_file, "w")

    try:
        cmd_obj = spawn_gui_and_check_bind(cmd, gui_port, out_fp,
                                           "resource monitor GUI")
    except BaseException:
        # Nobody else holds a reference to the log yet, so close it here
        # rather than leaking the descriptor.
        out_fp.close()
        raise

    return (cmd_obj, out_fp)
Example #6
0
def main():
    """Parse the command line and run a node coordinator on this host.

    Logging is directed to ``<log_directory>/node_coordinators/<fqdn>.log``
    (an existing log is backed up first). Handlers for SIGUSR1, SIGINT and
    SIGTERM are installed, then a ``NodeCoordinator`` is built from the
    parsed arguments and its ``run()`` method is called.

    Any exception raised while constructing or running the coordinator is
    logged and printed. Unless it is a ``SystemExit``, the current batch is
    marked as failed. The coordinator's keepalive is always stopped on the
    way out if the coordinator was created.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config",
                        help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("log_directory",
                        help="the base log directory where "
                        "the job runner stores its logs")
    parser.add_argument("batch_nonce",
                        help="the nonce for all batches "
                        "executed by this node coordinator",
                        type=int)
    parser.add_argument(
        "--keepalive_refresh",
        help="the interval, in seconds, "
        "between refreshes of the key that this node "
        "coordinator uses to tell the cluster coordinator that "
        "it's still alive",
        type=int)
    parser.add_argument("--keepalive_timeout",
                        help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "this node coordinator before the cluster coordinator "
                        "considers it to be dead (default: %(default)s "
                        "seconds)",
                        type=int,
                        default=10)
    # The trailing space after "profiling" matters: the adjacent literals are
    # concatenated, and without it the help text read "profilingtool".
    parser.add_argument("--profiler",
                        help="path to the binary of a profiling "
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options",
                        help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload",
                        help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    node_coordinator_log = os.path.join(args.log_directory,
                                        "node_coordinators",
                                        "%s.log" % (socket.getfqdn()))

    utils.backup_if_exists(node_coordinator_log)

    logging.basicConfig(
        format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s",
        datefmt="%m-%d %H:%M:%S",
        filename=node_coordinator_log)

    coordinator = None

    def signal_handler(signal_id, frame):
        log.error("Caught signal %s" % (str(signal_id)))
        # Process group 0 means the caller's own group, so this SIGKILLs
        # every process in it, this one included; the sys.exit below is
        # presumably only a fallback.
        os.killpg(0, signal.SIGKILL)

        sys.exit(1)

    signal.signal(signal.SIGUSR1, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        coordinator = NodeCoordinator(**vars(args))
        coordinator.run()
    except:
        # Deliberately a catch-all: this is the top-level boundary, and
        # SystemExit must be seen here so it can be told apart below.
        exception_info = sys.exc_info()

        exception = exception_info[1]

        log.exception(exception)

        traceback.print_exception(*exception_info)

        if (not isinstance(exception, SystemExit)) and coordinator is not None:
            log.error("Marking current batch as failed")
            coordinator.fail_current_batch("Node coordinator error: " +
                                           str(exception))

    finally:
        if coordinator is not None:
            coordinator.stop_keepalive()
Example #7
0
    def _run_themis(self, binary, command_params, log_dir):
        """Run the Themis binary on this host and report whether it succeeded.

        The command line is ``binary`` followed by one ``-KEY VALUE`` pair per
        entry of ``command_params``. It is optionally wrapped by the
        configured profiler and prefixed with ``LD_PRELOAD``. The command is
        written to ``<log_dir>/<fqdn>.cmd`` and run through the shell, with
        stdout/stderr captured in ``<log_dir>/stdout-<hostname>.log`` and
        ``stderr-<hostname>.log``. Resource monitors run for the duration and
        are stopped even if launching the command fails.

        Args:
            binary: command (path) of the Themis binary to execute.
            command_params: dict of parameters for the command line. It is
                modified in place: ``INTERMEDIATE_DISK_LIST`` is set to the
                comma-joined list of this host's local disks.
            log_dir: directory for the command and output logs; created if
                it does not exist.

        Returns:
            True if the command exited with status 0, False otherwise. On a
            non-zero exit the captured stderr is passed to
            ``self.fail_current_batch`` and logged. If core dumps are
            enabled in themisrc, any core file found is renamed to
            ``core.batch_<current_batch>``.

        Raises:
            OSError: if ``log_dir`` cannot be created.
            IndexError: if ``self.interfaces`` names no interface.
        """
        # Refresh the current set of local disks, which may have changed if
        # disks failed during a previous phase
        intermediate_disks = self.coordinator_db.local_disks(self.hostname)
        assert len(intermediate_disks) > 0

        command_params["INTERMEDIATE_DISK_LIST"] = ','.join(intermediate_disks)

        if not os.path.exists(log_dir):
            try:
                os.makedirs(log_dir)
            except OSError:
                # Losing a creation race with another process is fine; any
                # other failure (e.g. permissions) must not be hidden.
                if not os.path.isdir(log_dir):
                    raise

        # Start sar, iostat, and vnstat logging.
        # Only run vnstat on the first interface for simplicity.
        # A real list is built so that indexing also works on interpreters
        # where filter() returns a lazy iterator.
        interface_list = [
            name for name in self.interfaces.split(',') if len(name) > 0]
        monitors = monitor_utils.start_monitors(log_dir, self.hostname,
                                                interface_list[0])

        try:
            # Check core dump settings
            themisrc = utils.get_themisrc()

            dump_core = ("dump_core" in themisrc and themisrc["dump_core"])

            params_string = ' '.join(
                "-%s %s" % (key, str(value))
                for key, value in command_params.items())

            # If the user specified a profiling tool, run that instead and
            # pass the binary to its first argument.
            if self.profiler is not None:
                profiler_options = ""
                if self.profiler_options is not None:
                    profiler_options = self.profiler_options

                if self.profiler == "operf":
                    # Use the log directory as the operf session dir
                    session_dir = os.path.join(os.path.dirname(log_dir),
                                               "oprofile",
                                               os.path.basename(log_dir),
                                               self.hostname)
                    if not os.path.exists(session_dir):
                        os.makedirs(session_dir)
                    binary = "%s %s --session-dir=%s %s" % (
                        self.profiler, profiler_options, session_dir, binary)
                else:
                    # Some other profiler, just prepend it to the binary
                    binary = "%s %s %s" % (
                        self.profiler, profiler_options, binary)

            # If the user specified a library to LD_PRELOAD, set the
            # environment variable before running the binary.
            if self.ld_preload is not None:
                binary = "LD_PRELOAD=%s %s" % (self.ld_preload, binary)

            command = ' '.join((binary, params_string))

            log.error(command)

            # Create a file containing the command being run
            cmd_log_file = os.path.join(
                log_dir, "%s.cmd" % (socket.getfqdn()))
            with open(cmd_log_file, 'w') as fp:
                fp.write(command)
                fp.flush()

            core_path = None

            if dump_core:
                # Should be running in the context of one of this host's
                # local disks so that if we dump core, it gets dumped to
                # space that can hold it
                local_disks = self.coordinator_db.local_disks(self.hostname)

                if len(local_disks) > 0:
                    run_dir = local_disks[0]
                else:
                    run_dir = "/tmp"

                run_dir = os.path.join(run_dir, self.username)

                with open("/proc/sys/kernel/core_pattern", "r") as fp:
                    core_filename = fp.read().strip()

                core_path = os.path.abspath(
                    os.path.join(run_dir, core_filename))

                utils.backup_if_exists(core_path)

                if not os.path.exists(run_dir):
                    os.makedirs(run_dir)

                command = "cd %s; ulimit -c unlimited; %s" % (
                    run_dir, command)

            stdout_file = os.path.join(
                log_dir, "stdout-%s.log" % (self.hostname))
            stderr_file = os.path.join(
                log_dir, "stderr-%s.log" % (self.hostname))

            for filename in [stdout_file, stderr_file]:
                utils.backup_if_exists(filename)

            # The with-blocks close both logs even if Popen itself raises.
            with open(stdout_file, 'w') as out_fp:
                with open(stderr_file, 'w') as err_fp:
                    cmd_obj = subprocess.Popen(command,
                                               shell=True,
                                               stdout=out_fp,
                                               stderr=err_fp)
                    cmd_obj.communicate()
        finally:
            # Terminate sar, iostat, and vnstat, whether or not the run
            # (or its preparation) succeeded.
            monitor_utils.stop_monitors(*monitors)

        if cmd_obj.returncode != 0:
            log.error("Themis exited with status %d", cmd_obj.returncode)
            if dump_core:
                assert core_path is not None

                # Identify the core file by its batch number
                if os.path.exists(core_path):
                    core_path_with_batch = os.path.join(
                        os.path.dirname(core_path),
                        "core.batch_%d" % (self.current_batch))
                    shutil.move(core_path, core_path_with_batch)

            with open(stderr_file, 'r') as fp:
                error_msg = fp.read()

            self.fail_current_batch(error_msg)
            log.error(error_msg)

        return cmd_obj.returncode == 0
def main():
    """Parse the command line and run a node coordinator on this host.

    Logging is directed to ``<log_directory>/node_coordinators/<fqdn>.log``
    (an existing log is backed up first). Handlers for SIGUSR1, SIGINT and
    SIGTERM are installed, then a ``NodeCoordinator`` is built from the
    parsed arguments and its ``run()`` method is called.

    Any exception raised while constructing or running the coordinator is
    logged and printed. Unless it is a ``SystemExit``, the current batch is
    marked as failed. The coordinator's keepalive is always stopped on the
    way out if the coordinator was created.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("themis_binary", help="path to the Themis binary")
    parser.add_argument("config", help="a YAML file giving configuration "
                        "options for Themis")
    parser.add_argument("log_directory", help="the base log directory where "
                        "the job runner stores its logs")
    parser.add_argument("batch_nonce", help="the nonce for all batches "
                        "executed by this node coordinator", type=int)
    parser.add_argument("--keepalive_refresh", help="the interval, in seconds, "
                        "between refreshes of the key that this node "
                        "coordinator uses to tell the cluster coordinator that "
                        "it's still alive", type=int)
    parser.add_argument("--keepalive_timeout", help="the amount of time that "
                        "must pass without receiving a keepalive message from "
                        "this node coordinator before the cluster coordinator "
                        "considers it to be dead (default: %(default)s "
                        "seconds)", type=int, default=10)
    # The trailing space after "profiling" matters: the adjacent literals are
    # concatenated, and without it the help text read "profilingtool".
    parser.add_argument("--profiler", help="path to the binary of a profiling "
                        "tool to use, for example valgrind or operf")
    parser.add_argument("--profiler_options", help="options surrounded by "
                        "quotes to pass to the profiler")
    parser.add_argument("--ld_preload", help="Path to a library to be "
                        "preloaded using LD_PRELOAD.")

    utils.add_redis_params(parser)
    utils.add_interfaces_params(parser)

    args = parser.parse_args()

    args.log_directory = create_log_directory(args.log_directory)
    log.info("Logging to %s" % (args.log_directory))

    node_coordinator_log = os.path.join(
        args.log_directory, "node_coordinators",
        "%s.log" % (socket.getfqdn()))

    utils.backup_if_exists(node_coordinator_log)

    logging.basicConfig(
        format="%(levelname)-8s %(asctime)s %(name)-15s %(message)s",
        datefmt="%m-%d %H:%M:%S",
        filename=node_coordinator_log)

    coordinator = None

    def signal_handler(signal_id, frame):
        log.error("Caught signal %s" % (str(signal_id)))
        # Process group 0 means the caller's own group, so this SIGKILLs
        # every process in it, this one included; the sys.exit below is
        # presumably only a fallback.
        os.killpg(0, signal.SIGKILL)

        sys.exit(1)

    signal.signal(signal.SIGUSR1, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        coordinator = NodeCoordinator(**vars(args))
        coordinator.run()
    except:
        # Deliberately a catch-all: this is the top-level boundary, and
        # SystemExit must be seen here so it can be told apart below.
        exception_info = sys.exc_info()

        exception = exception_info[1]

        log.exception(exception)

        traceback.print_exception(*exception_info)

        if (not isinstance(exception, SystemExit)) and coordinator is not None:
            log.error("Marking current batch as failed")
            coordinator.fail_current_batch(
                "Node coordinator error: " + str(exception))

    finally:
        if coordinator is not None:
            coordinator.stop_keepalive()
    def _run_themis(self, binary, command_params, log_dir):
        """Run the Themis binary on this host and report whether it succeeded.

        The command line is ``binary`` followed by one ``-KEY VALUE`` pair per
        entry of ``command_params``. It is optionally wrapped by the
        configured profiler and prefixed with ``LD_PRELOAD``. The command is
        written to ``<log_dir>/<fqdn>.cmd`` and run through the shell, with
        stdout/stderr captured in ``<log_dir>/stdout-<hostname>.log`` and
        ``stderr-<hostname>.log``. Resource monitors run for the duration and
        are stopped even if launching the command fails.

        Args:
            binary: command (path) of the Themis binary to execute.
            command_params: dict of parameters for the command line. It is
                modified in place: ``INTERMEDIATE_DISK_LIST`` is set to the
                comma-joined list of this host's local disks.
            log_dir: directory for the command and output logs; created if
                it does not exist.

        Returns:
            True if the command exited with status 0, False otherwise. On a
            non-zero exit the captured stderr is passed to
            ``self.fail_current_batch`` and logged. If core dumps are
            enabled in themisrc, any core file found is renamed to
            ``core.batch_<current_batch>``.

        Raises:
            OSError: if ``log_dir`` cannot be created.
            IndexError: if ``self.interfaces`` names no interface.
        """
        # Refresh the current set of local disks, which may have changed if
        # disks failed during a previous phase
        intermediate_disks = self.coordinator_db.local_disks(self.hostname)
        assert len(intermediate_disks) > 0

        command_params["INTERMEDIATE_DISK_LIST"] = ','.join(
            intermediate_disks)

        if not os.path.exists(log_dir):
            try:
                os.makedirs(log_dir)
            except OSError:
                # Losing a creation race with another process is fine; any
                # other failure (e.g. permissions) must not be hidden.
                if not os.path.isdir(log_dir):
                    raise

        # Start sar, iostat, and vnstat logging.
        # Only run vnstat on the first interface for simplicity.
        # A real list is built so that indexing also works on interpreters
        # where filter() returns a lazy iterator.
        interface_list = [
            name for name in self.interfaces.split(',') if len(name) > 0]
        monitors = monitor_utils.start_monitors(
            log_dir, self.hostname, interface_list[0])

        try:
            # Check core dump settings
            themisrc = utils.get_themisrc()

            dump_core = ("dump_core" in themisrc and themisrc["dump_core"])

            params_string = ' '.join(
                "-%s %s" % (key, str(value))
                for key, value in command_params.items())

            # If the user specified a profiling tool, run that instead and
            # pass the binary to its first argument.
            if self.profiler is not None:
                profiler_options = ""
                if self.profiler_options is not None:
                    profiler_options = self.profiler_options

                if self.profiler == "operf":
                    # Use the log directory as the operf session dir
                    session_dir = os.path.join(
                        os.path.dirname(log_dir), "oprofile",
                        os.path.basename(log_dir), self.hostname)
                    if not os.path.exists(session_dir):
                        os.makedirs(session_dir)
                    binary = "%s %s --session-dir=%s %s" % (
                        self.profiler, profiler_options, session_dir, binary)
                else:
                    # Some other profiler, just prepend it to the binary
                    binary = "%s %s %s" % (
                        self.profiler, profiler_options, binary)

            # If the user specified a library to LD_PRELOAD, set the
            # environment variable before running the binary.
            if self.ld_preload is not None:
                binary = "LD_PRELOAD=%s %s" % (self.ld_preload, binary)

            command = ' '.join((binary, params_string))

            # Create a file containing the command being run
            cmd_log_file = os.path.join(
                log_dir, "%s.cmd" % (socket.getfqdn()))
            with open(cmd_log_file, 'w') as fp:
                fp.write(command)

            core_path = None

            if dump_core:
                # Should be running in the context of one of this host's
                # local disks so that if we dump core, it gets dumped to
                # space that can hold it
                local_disks = self.coordinator_db.local_disks(self.hostname)

                if len(local_disks) > 0:
                    run_dir = local_disks[0]
                else:
                    run_dir = "/tmp"

                run_dir = os.path.join(run_dir, self.username)

                with open("/proc/sys/kernel/core_pattern", "r") as fp:
                    core_filename = fp.read().strip()

                core_path = os.path.abspath(
                    os.path.join(run_dir, core_filename))

                utils.backup_if_exists(core_path)

                if not os.path.exists(run_dir):
                    os.makedirs(run_dir)

                command = "cd %s; ulimit -c unlimited; %s" % (
                    run_dir, command)

            stdout_file = os.path.join(
                log_dir, "stdout-%s.log" % (self.hostname))
            stderr_file = os.path.join(
                log_dir, "stderr-%s.log" % (self.hostname))

            for filename in [stdout_file, stderr_file]:
                utils.backup_if_exists(filename)

            # The with-blocks close both logs even if Popen itself raises.
            with open(stdout_file, 'w') as out_fp:
                with open(stderr_file, 'w') as err_fp:
                    cmd_obj = subprocess.Popen(
                        command, shell=True, stdout=out_fp, stderr=err_fp)
                    cmd_obj.communicate()
        finally:
            # Terminate sar, iostat, and vnstat, whether or not the run
            # (or its preparation) succeeded.
            monitor_utils.stop_monitors(*monitors)

        if cmd_obj.returncode != 0:
            if dump_core:
                assert core_path is not None

                # Identify the core file by its batch number
                if os.path.exists(core_path):
                    core_path_with_batch = os.path.join(
                        os.path.dirname(core_path),
                        "core.batch_%d" % (self.current_batch))
                    shutil.move(core_path, core_path_with_batch)

            with open(stderr_file, 'r') as fp:
                error_msg = fp.read()

            self.fail_current_batch(error_msg)
            log.error(error_msg)

        return cmd_obj.returncode == 0