Example #1
0
    def _cdqos_init(self, log):
        """
        Init the QoS for the file system

        Makes sure the workspace directory of the QoS thread exists, creates
        the child log of the thread inside it (saved to
        self.cdqos_thread_log) and then inits the QoS Lustre clients.

        :param log: log used to report errors; also the parent of the
            thread log.
        :return: 0 on success, -1 on failure.
        """
        lustrefs = self.cdqos_lustrefs
        fsname = lustrefs.lf_fsname

        name = "thread_qos_%s" % fsname
        thread_workspace = self.cdqos_global_workspace + "/" + name
        if not os.path.exists(thread_workspace):
            ret = utils.mkdir(thread_workspace)
            if ret:
                log.cl_error("failed to create directory [%s] on local host",
                             thread_workspace)
                return -1
        elif not os.path.isdir(thread_workspace):
            # Report through the log given by the caller, like every other
            # failure of this method does
            log.cl_error("[%s] is not a directory", thread_workspace)
            return -1
        self.cdqos_thread_log = log.cl_get_child(name,
                                                 resultsdir=thread_workspace)

        ret = self._cdqos_init_clients()
        if ret:
            log.cl_error(
                "failed to init QoS Lustre clients for file system [%s]",
                fsname)
            return -1
        return 0
Example #2
0
 def hr_thread_start(self, parent_log):
     """
     Start the thread

     Creates the workspace directory and then starts a thread running
     self.hr_thread_main with parent_log as its only argument. The started
     thread is saved to self.hr_thread.

     :param parent_log: log handed over to the thread; also used to report
         a failure of creating the workspace.
     :return: 0 on success, otherwise the non-zero value returned by
         utils.mkdir().
     """
     ret = utils.mkdir(self.hr_workspace)
     if ret:
         # Do not fail silently, the caller only gets a return code
         parent_log.cl_error("failed to create directory [%s] on local host",
                             self.hr_workspace)
         return ret
     self.hr_thread = utils.thread_start(self.hr_thread_main, (parent_log, ))
     return 0
Example #3
0
 def hc_thread_start(self, log):
     """
     Create the workspace directory and launch the thread

     When the directory cannot be created the error is logged, hc_status
     is set to -1 and no thread is started. Nothing is returned.
     """
     mkdir_failed = utils.mkdir(self.hc_workspace)
     if not mkdir_failed:
         # Workspace is ready, hand the log over to the thread routine
         self.hc_thread = utils.thread_start(self.hc_thread_main, (log, ))
         return

     log.cl_error("failed to create directory [%s] on local host, "
                  "exiting the thread",
                  self.hc_workspace)
     self.hc_status = -1
Example #4
0
    def hr_run(self, log):
        """
        Thread of running remover

        Runs self.hr_command on self.hr_host. The stdout and stderr of the
        command are passed as tee targets to files under a new
        sub-directory of self.hr_workspace that is named after the current
        time.

        :param log: log used for reporting; also handed to the output
            watchers through their args.
        :return: -1 if the sub-directory cannot be created, otherwise the
            exit status of the command.
        """
        # pylint: disable=too-many-locals,too-many-statements
        # pylint: disable=too-many-return-statements,too-many-arguments
        identity = time_util.local_strftime(time_util.utcnow(), "%Y-%m-%d-%H_%M_%S")
        workspace = self.hr_workspace + "/" + identity
        ret = utils.mkdir(workspace)
        if ret:
            log.cl_error("failed to create directory [%s] on local host",
                         workspace)
            return -1
        stdout_file = workspace + "/" + "remover_watching.stdout"
        stderr_file = workspace + "/" + "remover_watching.stderr"

        host = self.hr_host
        args = {}
        args["hostname"] = host.sh_hostname
        args["log"] = log
        stdout_fd = watched_io.watched_io_open(stdout_file,
                                               watched_io.log_watcher_debug, args)
        stderr_fd = None
        try:
            stderr_fd = watched_io.watched_io_open(stderr_file,
                                                   watched_io.log_watcher_error, args)
            log.cl_debug("start to run command [%s] on host [%s]",
                         self.hr_command, host.sh_hostname)
            retval = host.sh_run(log, self.hr_command, stdout_tee=stdout_fd,
                                 stderr_tee=stderr_fd, return_stdout=False,
                                 return_stderr=False, timeout=None)
        finally:
            # Close the watched files even if opening the second one or
            # running the command raised, so that they are never leaked
            stdout_fd.close()
            if stderr_fd is not None:
                stderr_fd.close()

        if retval.cr_exit_status:
            log.cl_error("failed to run command [%s] on host [%s], "
                         "ret = [%d], "
                         "stdout = [%s], stderr = [%s]",
                         self.hr_command,
                         host.sh_hostname,
                         retval.cr_exit_status,
                         retval.cr_stdout,
                         retval.cr_stderr)
        else:
            log.cl_debug("finished running command [%s] on host [%s], "
                         "ret = [%d], "
                         "stdout = [%s], stderr = [%s]",
                         self.hr_command,
                         host.sh_hostname,
                         retval.cr_exit_status,
                         retval.cr_stdout,
                         retval.cr_stderr)
        return retval.cr_exit_status
Example #5
0
    def pt_thread_start(self, parent_log):
        """
        Prepare the workspace and the log of this thread, then launch it

        Returns 0 when the thread has been started and -1 when the workspace
        directory cannot be created.
        """
        if utils.mkdir(self.pt_workspace):
            parent_log.cl_error("failed to create directory [%s]",
                                self.pt_workspace)
            return -1

        # Child log owned by this thread, reset before the thread runs
        executor_log = self.pt_parallel_execute.pe_log
        thread_log = executor_log.cl_get_child(self.pt_id,
                                               resultsdir=self.pt_workspace)
        self.pt_log = thread_log
        thread_log.cl_result.cr_clear()
        thread_log.cl_abort = False

        self.pt_status = ParallelThread.STATUS_RUNNING
        self.pt_thread = utils.thread_start(self.pt_main, ())
        return 0
 def __init__(self, parent_log, client_hash, sequence, instance):
     """
     Set up the state of a connection

     Creates the workspace directory of the connection under the workspace
     of the instance and a child log of parent_log that is bound to the
     condition of the connection. Raises Exception if the directory cannot
     be created.
     """
     connection_name = "connection_%s" % sequence
     workspace = instance.ci_workspace + "/" + connection_name

     self.cc_client_hash = client_hash
     self.cc_sequence = sequence
     self.cc_atime = time.time()
     self.cc_connection_name = connection_name
     self.cc_instance = instance
     self.cc_workspace = workspace

     if utils.mkdir(workspace):
         reason = ("failed to create directory [%s] on local host" %
                   workspace)
         parent_log.cl_error(reason)
         raise Exception(reason)

     condition = threading.Condition()
     self.cc_condition = condition
     self.cc_command_log = parent_log.cl_get_child(
         connection_name,
         resultsdir=workspace,
         record_consumer=True,
         condition=condition)
     self.cc_last_retval = None
     self.cc_quit = False
     # Prompt and answer exchanged when a running command asks the console
     # for input
     self.cc_input_prompt = None
     self.cc_input_result = None
def main():
    """
    Start clownfish

    Accepted command lines:
        clownfish_console
        clownfish_console -h|--help
        clownfish_console host[:port] [cmdline...]
        clownfish_console -P port host [cmdline...]

    Without a host, localhost is used. Without a port,
    constants.CLOWNFISH_DEFAULT_SERVER_PORT is used. All the cmdline
    arguments are joined with single spaces into one command line; without
    any of them cmdline is None.

    The logs are saved to a sub-directory of CLOWNFISH_CONSOLE_LOG_DIR that
    is named after the current time. The process always exits through
    sys.exit(): 0 on success, -1 on a usage or directory error, otherwise
    the non-zero return value of clownfish_console_loop().
    """
    # pylint: disable=unused-variable,too-many-statements,too-many-branches
    reload(sys)
    sys.setdefaultencoding("utf-8")

    argv = sys.argv
    argc = len(argv)
    if argc == 2 and argv[1] in ("-h", "--help"):
        usage()
        sys.exit(0)

    if argc >= 4 and argv[1] == "-P":
        # clownfish_console -P 3002 host [cmdline...]
        port_string = argv[2]
        host = argv[3]
        if ":" in host:
            # The port is given twice
            usage()
            sys.exit(-1)
        server_url = "tcp://%s:%s" % (host, port_string)
        cmdline_args = argv[4:]
    else:
        # clownfish_console [host[:port] [cmdline...]]
        if argc > 1:
            host = argv[1]
        else:
            host = "localhost"
        if ":" not in host:
            host += ":" + str(constants.CLOWNFISH_DEFAULT_SERVER_PORT)
        server_url = "tcp://" + host
        cmdline_args = argv[2:]

    if cmdline_args:
        cmdline = " ".join(cmdline_args)
    else:
        # No command given, run as an interactive console
        cmdline = None

    identity = time_util.local_strftime(time_util.utcnow(),
                                        "%Y-%m-%d-%H_%M_%S")
    workspace = CLOWNFISH_CONSOLE_LOG_DIR + "/" + identity

    # The parent directory needs to be checked before the workspace
    for directory in (CLOWNFISH_CONSOLE_LOG_DIR, workspace):
        if not os.path.exists(directory):
            ret = utils.mkdir(directory)
            if ret:
                sys.stderr.write("failed to create directory [%s]\n" %
                                 directory)
                sys.exit(-1)
        elif not os.path.isdir(directory):
            sys.stderr.write("[%s] is not a directory\n" % directory)
            sys.exit(-1)

    if cmdline is None:
        print("Starting Clownfish console to server [%s], "
              "please check [%s] for more log" % (server_url, workspace))

    log = clog.get_log(resultsdir=workspace, simple_console=True)

    ret = clownfish_console_loop(log, workspace, server_url, cmdline=cmdline)
    if ret:
        log.cl_error(
            "Clownfish console exited with failure, please check [%s] for "
            "more log", workspace)
        sys.exit(ret)
    if cmdline is None:
        log.cl_info("Clownfish console exited, please check [%s] for more log",
                    workspace)
    sys.exit(0)
    def cc_ping_thread(self):
        """
        Ping the server constantly

        Creates the workspace and the child log of the thread, then keeps
        sending ping requests to self.cc_server_url until self.cc_running
        becomes false or pinging fails. After each successful ping the
        thread waits on self.cc_condition for at most
        CLOWNFISH_CONSOLE_PING_INTERVAL. If no ping succeeds for
        CLOWNFISH_CONSOLE_PING_TIMEOUT, the thread gives up. On failure of
        pinging self.cc_running is set to False.

        :return: 0 if the loop ended because self.cc_running became false,
            a negative value otherwise.
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-branches
        name = "thread_ping"
        thread_workspace = self.cc_workspace + "/" + name
        if not os.path.exists(thread_workspace):
            ret = utils.mkdir(thread_workspace)
            if ret:
                self.cc_log.cl_error("failed to create directory [%s]",
                                     thread_workspace)
                # Return the error like the other failure paths do instead
                # of raising SystemExit from inside the thread routine
                return -1
        elif not os.path.isdir(thread_workspace):
            self.cc_log.cl_error("[%s] is not a directory", thread_workspace)
            return -1
        log = self.cc_log.cl_get_child(name, resultsdir=thread_workspace)

        log.cl_debug("starting ping thread")
        server_url = self.cc_server_url
        ret = 0
        time_start = time.time()
        poll = zmq.Poller()
        context = zmq.Context(1)
        client = None
        # NOTE(review): the inner loop only ends when self.cc_running is
        # false or ret is non-zero, so the reconnecting branch below looks
        # unreachable in normal flow - confirm whether the "no response"
        # case was meant to recreate the socket.
        while self.cc_running and ret == 0:
            if client is not None:
                # Unregister first so that the poller never keeps a closed
                # socket
                poll.unregister(client)
                client.setsockopt(zmq.LINGER, 0)
                client.close()
            client = context.socket(zmq.REQ)
            client.connect(server_url)
            poll.register(client, zmq.POLLIN)
            while self.cc_running and ret == 0:
                time_now = time.time()
                elapsed = time_now - time_start
                if elapsed >= CLOWNFISH_CONSOLE_PING_TIMEOUT:
                    log.cl_error("timeout when pinging [%s]", server_url)
                    ret = -1
                    break

                message = ClownfishConsoleMessage(
                    self.cc_uuid,
                    clownfish_pb2.ClownfishMessage.CMT_PING_REQUEST,
                    clownfish_pb2.ClownfishMessage.CMT_PING_REPLY)
                log.cl_debug("pinging [%s]", server_url)
                ret = message.ccm_communicate(log, poll, client,
                                              CLOWNFISH_CONSOLE_PING_TIMEOUT)
                if ret < 0:
                    log.cl_error("failed to ping server")
                    break
                elif ret > 0:
                    # Not fatal yet, time_start is kept so that the timeout
                    # check above ends the retries
                    log.cl_debug("no response from server, retrying")
                    ret = 0
                    continue

                log.cl_debug("server replied successfully")
                time_start = time.time()
                # Sleep until the next ping, a notify on the condition
                # wakes the thread up earlier
                with self.cc_condition:
                    self.cc_condition.wait(CLOWNFISH_CONSOLE_PING_INTERVAL)
        if client is not None:
            # Do not let pending messages block the context termination
            client.setsockopt(zmq.LINGER, 0)
            client.close()
        if ret:
            log.cl_debug("ping thread stoped because of connection error")
            self.cc_running = False
        else:
            assert not self.cc_running
            log.cl_debug("ping thread stoped because the console is exiting")

        log.cl_debug("terminating ZMQ context of pinging thread")
        context.term()
        log.cl_debug("terminated ZMQ context of pinging thread")
        return ret
Example #9
0
    def cd_worker_thread(self, worker_index):
        """
        Worker routine

        Creates the workspace and the child log of the worker, connects a
        REP socket to self.cd_url_worker and answers the requests received
        from it until self.cd_running becomes false or the ZMQ context is
        terminated. A start request runs start_copytool() on the local
        host; any other request type is answered with CE_NO_TYPE.

        :param worker_index: index of the worker, used in the names of the
            workspace and the log.
        :return: -1 if the workspace cannot be prepared, otherwise None.
        """
        # pylint: disable=too-many-nested-blocks,too-many-locals
        # pylint: disable=too-many-branches,too-many-statements
        # Socket to talk to dispatcher
        name = "thread_worker_%s" % worker_index
        thread_workspace = self.cd_workspace + "/" + name
        if not os.path.exists(thread_workspace):
            ret = utils.mkdir(thread_workspace)
            if ret:
                self.cd_log.cl_error(
                    "failed to create directory [%s] on local host",
                    thread_workspace)
                return -1
        elif not os.path.isdir(thread_workspace):
            self.cd_log.cl_error("[%s] is not a directory", thread_workspace)
            return -1
        log = self.cd_log.cl_get_child(name, resultsdir=thread_workspace)

        log.cl_info("starting worker thread [%s]", worker_index)
        dispatcher_socket = self.cd_context.socket(zmq.REP)
        dispatcher_socket.connect(self.cd_url_worker)
        hostname = socket.gethostname()
        host = ssh_host.SSHHost(hostname, local=True)

        while self.cd_running:
            try:
                request_message = dispatcher_socket.recv()
            except zmq.ContextTerminated:
                log.cl_info(
                    "worker thread [%s] exiting because context has "
                    "been terminated", worker_index)
                break

            cmessage = copytoold_pb2.CopytooldMessage
            request = cmessage()
            request.ParseFromString(request_message)
            log.cl_debug("received request with type [%s]", request.cm_type)
            reply = cmessage()
            reply.cm_protocol_version = cmessage.CPV_ZERO
            reply.cm_errno = cmessage.CE_NO_ERROR

            is_start_request = request.cm_type == cmessage.CMT_START_REQUEST
            if is_start_request:
                source = request.cm_start_request.csr_source
                dest = request.cm_start_request.csr_dest
                log.cl_info(
                    "received a start request of copytool from [%s] to [%s]",
                    source, dest)
                ret = start_copytool(log, host, source, dest)
                if ret:
                    reply.cm_errno = cmessage.CE_OPERATION_FAILED
                reply.cm_type = cmessage.CMT_START_REPLY
            else:
                reply.cm_type = cmessage.CMT_GENERAL
                reply.cm_errno = cmessage.CE_NO_TYPE
                log.cl_error(
                    "received a request with type [%s] that "
                    "is not supported", request.cm_type)

            reply_message = reply.SerializeToString()
            dispatcher_socket.send(reply_message)
            # source and dest only exist for start requests, so they must
            # not be used when logging the reply of any other request
            if is_start_request:
                log.cl_info(
                    "send reply to a start request of copytool from [%s] "
                    "to [%s]", source, dest)
            else:
                log.cl_info(
                    "send reply to an unsupported request with type [%s]",
                    request.cm_type)
        dispatcher_socket.close()
        log.cl_info("worker thread [%s] exited", worker_index)
    def cs_worker_thread(self, worker_index):
        """
        Serve the requests forwarded by the dispatcher

        Prepares the workspace and the child log of the worker, connects a
        REP socket to self.cs_url_worker and replies to every request
        received from it until self.cs_running becomes false or the ZMQ
        context is terminated. Returns -1 if the workspace cannot be
        prepared, otherwise nothing.
        """
        # pylint: disable=too-many-nested-blocks,too-many-locals
        # pylint: disable=too-many-branches,too-many-statements
        worker_name = "thread_worker_%s" % worker_index
        workspace = self.cs_instance.ci_workspace + "/" + worker_name
        if os.path.exists(workspace):
            if not os.path.isdir(workspace):
                self.cs_log.cl_error("[%s] is not a directory", workspace)
                return -1
        elif utils.mkdir(workspace):
            self.cs_log.cl_error(
                "failed to create directory [%s] on local host",
                workspace)
            return -1
        log = self.cs_log.cl_get_child(worker_name, resultsdir=workspace)

        log.cl_info("starting worker thread [%s]", worker_index)
        # Socket to talk to dispatcher
        sock = self.cs_context.socket(zmq.REP)
        sock.connect(self.cs_url_worker)

        while self.cs_running:
            try:
                raw_request = sock.recv()
            except zmq.ContextTerminated:
                log.cl_info(
                    "worker thread [%s] exiting because context has "
                    "been terminated", worker_index)
                break

            cmessage = clownfish_pb2.ClownfishMessage
            request = cmessage()
            request.ParseFromString(raw_request)
            request_type = request.cm_type
            log.cl_debug("received request with type [%s]", request_type)

            # Every reply starts as a success and is adjusted below
            reply = cmessage()
            reply.cm_protocol_version = cmessage.CPV_ZERO
            reply.cm_errno = cmessage.CE_NO_ERROR

            if request_type == cmessage.CMT_CONNECT_REQUEST:
                # New client, allocate a connection for it
                client_hash = request.cm_connect_request.ccrt_client_hash
                connection = self.cs_connection_allocate(client_hash)
                reply.cm_type = cmessage.CMT_CONNECT_REPLY
                reply.cm_connect_reply.ccry_client_hash = client_hash
                reply.cm_client_uuid = connection.cc_sequence
            else:
                # All the other requests belong to an existing connection
                client_uuid = request.cm_client_uuid
                reply.cm_client_uuid = client_uuid
                connection = self.cs_connection_find(client_uuid)
                if connection is None:
                    log.cl_error(
                        "received a request with UUID [%s] that "
                        "doesnot exist", client_uuid)
                    reply.cm_type = cmessage.CMT_GENERAL
                    reply.cm_errno = cmessage.CE_NO_UUID
                elif request_type == cmessage.CMT_PING_REQUEST:
                    reply.cm_type = cmessage.CMT_PING_REPLY
                elif request_type == cmessage.CMT_INTERACT_REQUEST:
                    reply.cm_type = cmessage.CMT_INTERACT_REPLY
                    connection.cc_interact(log, request.cm_interact_request,
                                           reply.cm_interact_reply)
                elif request_type == cmessage.CMT_COMMAND_REQUEST:
                    reply.cm_type = cmessage.CMT_COMMAND_REPLY
                    connection.cc_command(
                        log, request.cm_command_request.ccrt_cmd_line,
                        reply.cm_command_reply)
                elif request_type == cmessage.CMT_COMMAND_PARTWAY_QUERY:
                    reply.cm_type = cmessage.CMT_COMMAND_REPLY
                    if request.cm_command_partway_query.ccpq_abort:
                        connection.cc_abort()
                    connection.cc_consume_command_log(log,
                                                      reply.cm_command_reply)
                elif request_type == cmessage.CMT_COMMAND_INPT_REPLY:
                    reply.cm_type = cmessage.CMT_COMMAND_REPLY
                    answer = request.cm_command_input_reply
                    if answer.cciry_abort:
                        connection.cc_abort()
                    log.cl_debug("got input [%s] for command",
                                 answer.cciry_input)
                    # Publish the input and wake up the waiters of the
                    # condition
                    with connection.cc_condition:
                        connection.cc_input_result = answer.cciry_input
                        connection.cc_condition.notifyAll()
                    connection.cc_consume_command_log(log,
                                                      reply.cm_command_reply)
                else:
                    reply.cm_type = cmessage.CMT_GENERAL
                    reply.cm_errno = cmessage.CE_NO_TYPE
                    log.cl_error(
                        "recived a request with type [%s] that "
                        "is not supported", request_type)

                # A final command reply that asks to quit ends the
                # connection
                if reply.cm_type == cmessage.CMT_COMMAND_REPLY:
                    command_reply = reply.cm_command_reply
                    if (command_reply.ccry_type == cmessage.CCRYT_FINAL
                            and command_reply.ccry_final.ccfr_quit):
                        sequence = connection.cc_sequence
                        if self.cs_connection_delete(sequence):
                            log.cl_error("failed to delete connection [%s]",
                                         connection.cc_sequence)
                            reply.cm_errno = cmessage.CE_NO_UUID

            sock.send(reply.SerializeToString())
        sock.close()
        log.cl_info("worker thread [%s] exited", worker_index)