def _cdqos_init(self, log):
    """
    Init the QoS for the file system

    Makes sure the workspace directory of the QoS thread exists under
    self.cdqos_global_workspace, creates the child log of the thread
    (self.cdqos_thread_log) with that directory as its results directory
    and finally initializes the QoS Lustre clients.

    :param log: log used to report failures, also the parent of the
        thread log.
    :return: 0 on success, -1 on failure.
    """
    fsname = self.cdqos_lustrefs.lf_fsname
    name = "thread_qos_%s" % fsname
    thread_workspace = self.cdqos_global_workspace + "/" + name

    if not os.path.exists(thread_workspace):
        ret = utils.mkdir(thread_workspace)
        if ret:
            log.cl_error("failed to create directory [%s] on local host",
                         thread_workspace)
            return -1
    elif not os.path.isdir(thread_workspace):
        # Report through the log given by the caller, like all the other
        # failures of this function.
        log.cl_error("[%s] is not a directory", thread_workspace)
        return -1

    self.cdqos_thread_log = log.cl_get_child(name,
                                             resultsdir=thread_workspace)

    ret = self._cdqos_init_clients()
    if ret:
        log.cl_error(
            "failed to init QoS Lustre clients for file system [%s]",
            fsname)
        return -1
    return 0
def hr_thread_start(self, parent_log):
    """
    Start the thread

    Creates the workspace directory self.hr_workspace and then starts
    self.hr_thread_main in a new thread with parent_log as its argument.

    :param parent_log: log used to report a failure, also handed to the
        thread.
    :return: 0 if the thread has been started, otherwise the non-zero
        value returned by utils.mkdir.
    """
    ret = utils.mkdir(self.hr_workspace)
    if ret:
        # Do not fail silently, the caller only gets a return value
        parent_log.cl_error("failed to create directory [%s] on local host",
                            self.hr_workspace)
        return ret

    self.hr_thread = utils.thread_start(self.hr_thread_main,
                                        (parent_log, ))
    return 0
def hc_thread_start(self, log):
    """
    Create the workspace and launch the thread

    If the workspace directory cannot be created, the failure is logged,
    self.hc_status is set to -1 and no thread is started. Nothing is
    returned in either case.
    """
    workspace_failed = utils.mkdir(self.hc_workspace)
    if not workspace_failed:
        self.hc_thread = utils.thread_start(self.hc_thread_main, (log, ))
        return

    message = ("failed to create directory [%s] on local host, "
               "exiting the thread")
    log.cl_error(message, self.hc_workspace)
    self.hc_status = -1
def hr_run(self, log):
    """
    Thread of running remover

    Creates a time-stamped directory under self.hr_workspace, runs
    self.hr_command on self.hr_host with its stdout and stderr teed to
    watched files in that directory, and logs the outcome.

    :param log: log used for the messages of this run, also given to the
        watchers of the output files.
    :return: -1 if the directory cannot be created, otherwise the exit
        status of the command.
    """
    # pylint: disable=too-many-locals,too-many-statements
    # pylint: disable=too-many-return-statements,too-many-arguments
    identity = time_util.local_strftime(time_util.utcnow(),
                                        "%Y-%m-%d-%H_%M_%S")
    workspace = self.hr_workspace + "/" + identity
    ret = utils.mkdir(workspace)
    if ret:
        log.cl_error("failed to create directory [%s] on local host",
                     workspace)
        return -1

    stdout_file = workspace + "/" + "remover_watching.stdout"
    stderr_file = workspace + "/" + "remover_watching.stderr"
    host = self.hr_host
    # The same argument dictionary is shared by both watchers
    args = {}
    args["hostname"] = host.sh_hostname
    args["log"] = log

    stdout_fd = None
    stderr_fd = None
    try:
        stdout_fd = watched_io.watched_io_open(
            stdout_file, watched_io.log_watcher_debug, args)
        stderr_fd = watched_io.watched_io_open(
            stderr_file, watched_io.log_watcher_error, args)
        log.cl_debug("start to run command [%s] on host [%s]",
                     self.hr_command, host.sh_hostname)
        retval = host.sh_run(log, self.hr_command,
                             stdout_tee=stdout_fd,
                             stderr_tee=stderr_fd,
                             return_stdout=False,
                             return_stderr=False,
                             timeout=None)
    finally:
        # Close whatever has been opened even if opening the second file
        # or running the command raised, so that no handle is leaked.
        if stdout_fd is not None:
            stdout_fd.close()
        if stderr_fd is not None:
            stderr_fd.close()

    if retval.cr_exit_status:
        log.cl_error("failed to run command [%s] on host [%s], "
                     "ret = [%d], "
                     "stdout = [%s], stderr = [%s]",
                     self.hr_command, host.sh_hostname,
                     retval.cr_exit_status, retval.cr_stdout,
                     retval.cr_stderr)
    else:
        log.cl_debug("finished running command [%s] on host [%s], "
                     "ret = [%d], "
                     "stdout = [%s], stderr = [%s]",
                     self.hr_command, host.sh_hostname,
                     retval.cr_exit_status, retval.cr_stdout,
                     retval.cr_stderr)
    return retval.cr_exit_status
def pt_thread_start(self, parent_log):
    """
    Prepare the workspace and the log of this thread, then launch it

    Returns 0 when the thread has been started, -1 if the workspace
    directory could not be created.
    """
    workspace_failed = utils.mkdir(self.pt_workspace)
    if workspace_failed:
        parent_log.cl_error("failed to create directory [%s]",
                            self.pt_workspace)
        return -1

    # Child log dedicated to this thread, kept under its workspace
    executor_log = self.pt_parallel_execute.pe_log
    thread_log = executor_log.cl_get_child(self.pt_id,
                                           resultsdir=self.pt_workspace)
    self.pt_log = thread_log
    # Start from a clean result and a cleared abort flag
    thread_log.cl_result.cr_clear()
    thread_log.cl_abort = False

    self.pt_status = ParallelThread.STATUS_RUNNING
    self.pt_thread = utils.thread_start(self.pt_main, ())
    return 0
def __init__(self, parent_log, client_hash, sequence, instance):
    """
    Set up the connection: its workspace directory under the workspace
    of the instance, its condition and its command log.

    Raises Exception if the workspace directory cannot be created.
    """
    connection_name = "connection_%s" % sequence
    workspace = instance.ci_workspace + "/" + connection_name

    self.cc_client_hash = client_hash
    self.cc_sequence = sequence
    self.cc_atime = time.time()
    self.cc_connection_name = connection_name
    self.cc_instance = instance
    self.cc_workspace = workspace

    if utils.mkdir(workspace):
        reason = ("failed to create directory [%s] on local host" %
                  workspace)
        parent_log.cl_error(reason)
        raise Exception(reason)

    # The command log is created with the condition of this connection
    condition = threading.Condition()
    self.cc_condition = condition
    self.cc_command_log = parent_log.cl_get_child(connection_name,
                                                  resultsdir=workspace,
                                                  record_consumer=True,
                                                  condition=condition)
    self.cc_last_retval = None
    self.cc_quit = False
    # Used when a command running thread needs input from console
    self.cc_input_prompt = None
    self.cc_input_result = None
def _clownfish_console_server_url(address):
    """
    Return the tcp:// URL of a server address given on the command line,
    appending the default server port if the address contains no colon
    """
    if ":" not in address:
        address += ":" + str(constants.CLOWNFISH_DEFAULT_SERVER_PORT)
    return "tcp://" + address


def _clownfish_console_port_url(port_string, host):
    """
    Return the tcp:// URL for the "-P port host" form, print the usage
    and exit with -1 if the host contains a colon
    """
    if ":" in host:
        usage()
        sys.exit(-1)
    return "tcp://%s:%s" % (host, port_string)


def _clownfish_console_prepare_dir(path):
    """
    Make sure path is an existing directory, write an error to stderr
    and exit with -1 otherwise
    """
    if not os.path.exists(path):
        ret = utils.mkdir(path)
        if ret:
            sys.stderr.write("failed to create directory [%s]\n" % path)
            sys.exit(-1)
    elif not os.path.isdir(path):
        sys.stderr.write("[%s] is not a directory\n" % path)
        sys.exit(-1)


def main():
    """
    Start clownfish

    Supported command lines:
        clownfish_console
        clownfish_console -h|--help
        clownfish_console host[:port] [cmdline...]
        clownfish_console -P port host [cmdline...]

    Without a command line the console runs against the server and
    prints where its logs are kept; with a command line the words are
    joined with spaces and handed to clownfish_console_loop. The process
    always leaves through sys.exit(): 0 on success, -1 on argument or
    workspace errors, otherwise the value returned by
    clownfish_console_loop.
    """
    # pylint: disable=unused-variable,too-many-statements,too-many-branches
    reload(sys)
    sys.setdefaultencoding("utf-8")

    argv = sys.argv
    argc = len(argv)
    cmdline = None
    if argc == 1:
        # clownfish <localhost>
        server_url = ("tcp://localhost:%s" %
                      constants.CLOWNFISH_DEFAULT_SERVER_PORT)
    elif argc == 2:
        # clownfish_console host
        # clownfish_console -h
        # clownfish_console --help
        if argv[1] == "-h" or argv[1] == "--help":
            usage()
            sys.exit(0)
        server_url = _clownfish_console_server_url(argv[1])
    elif argc == 3:
        # clownfish_console host cmdline
        server_url = _clownfish_console_server_url(argv[1])
        cmdline = argv[2]
    elif argc == 4:
        # clownfish_console -P 3002 host
        # clownfish_console host cmdline1 cmdline2
        if argv[1] == "-P":
            server_url = _clownfish_console_port_url(argv[2], argv[3])
        else:
            server_url = _clownfish_console_server_url(argv[1])
            cmdline = argv[2] + " " + argv[3]
    elif argc >= 5:
        # clownfish_console -P 3002 host cmdline...
        # clownfish_console host cmdline1 cmdline2 cmdline3...
        if argv[1] == "-P":
            server_url = _clownfish_console_port_url(argv[2], argv[3])
            cmdline_start = 4
        else:
            server_url = _clownfish_console_server_url(argv[1])
            cmdline_start = 2
        cmdline = ""
        for argument in argv[cmdline_start:]:
            # No separator is added while the command line is still empty
            if cmdline != "":
                cmdline += " "
            cmdline += argument

    identity = time_util.local_strftime(time_util.utcnow(),
                                        "%Y-%m-%d-%H_%M_%S")
    workspace = CLOWNFISH_CONSOLE_LOG_DIR + "/" + identity

    _clownfish_console_prepare_dir(CLOWNFISH_CONSOLE_LOG_DIR)
    _clownfish_console_prepare_dir(workspace)

    if cmdline is None:
        print("Starting Clownfish console to server [%s], "
              "please check [%s] for more log" %
              (server_url, workspace))

    log = clog.get_log(resultsdir=workspace, simple_console=True)

    ret = clownfish_console_loop(log, workspace, server_url,
                                 cmdline=cmdline)
    if ret:
        log.cl_error(
            "Clownfish console exited with failure, please check [%s] for "
            "more log", workspace)
        sys.exit(ret)
    if cmdline is None:
        log.cl_info("Clownfish console exited, please check [%s] for more log",
                    workspace)
    sys.exit(0)
def cc_ping_thread(self):
    """
    Ping the server constantly

    Creates the "thread_ping" workspace and child log, then keeps sending
    ping requests to self.cc_server_url while self.cc_running is set.
    After every successful reply the thread waits on self.cc_condition
    with a timeout of CLOWNFISH_CONSOLE_PING_INTERVAL. When no reply has
    been received for CLOWNFISH_CONSOLE_PING_TIMEOUT, or the
    communication fails, the thread stops and clears self.cc_running.

    :return: 0 if the thread stopped because the console is exiting, a
        negative value on failure (-1 if the workspace is not usable).
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-branches
    name = "thread_ping"
    thread_workspace = self.cc_workspace + "/" + name
    if not os.path.exists(thread_workspace):
        ret = utils.mkdir(thread_workspace)
        if ret:
            self.cc_log.cl_error("failed to create directory [%s]",
                                 thread_workspace)
            # Return like the branch below instead of raising SystemExit
            # from inside a thread routine.
            return -1
    elif not os.path.isdir(thread_workspace):
        self.cc_log.cl_error("[%s] is not a directory",
                             thread_workspace)
        return -1
    log = self.cc_log.cl_get_child(name, resultsdir=thread_workspace)

    log.cl_debug("starting ping thread")
    server_url = self.cc_server_url
    ret = 0
    # Time of the last successful reply (or of the thread start)
    time_start = time.time()
    poll = zmq.Poller()
    context = zmq.Context(1)
    client = None
    while self.cc_running and ret == 0:
        # Every pass of the outer loop works on a fresh REQ socket
        if client is not None:
            client.setsockopt(zmq.LINGER, 0)
            client.close()
            poll.unregister(client)
        client = context.socket(zmq.REQ)
        client.connect(server_url)
        poll.register(client, zmq.POLLIN)
        while self.cc_running and ret == 0:
            time_now = time.time()
            elapsed = time_now - time_start
            if elapsed >= CLOWNFISH_CONSOLE_PING_TIMEOUT:
                log.cl_error("timeout when pinging [%s]", server_url)
                ret = -1
                break
            message = ClownfishConsoleMessage(
                self.cc_uuid,
                clownfish_pb2.ClownfishMessage.CMT_PING_REQUEST,
                clownfish_pb2.ClownfishMessage.CMT_PING_REPLY)
            log.cl_debug("pinging [%s]", server_url)
            ret = message.ccm_communicate(log, poll, client,
                                          CLOWNFISH_CONSOLE_PING_TIMEOUT)
            if ret < 0:
                log.cl_error("failed to ping server")
                break
            elif ret > 0:
                # Positive value: no reply yet, try again until the
                # timeout check above gives up.
                log.cl_debug("no response from server, retrying")
                ret = 0
                continue
            log.cl_debug("server replied successfully")
            time_start = time.time()
            # Sleep until the next ping; a notification on the condition
            # ends the wait early. Release the lock even if wait() raises.
            self.cc_condition.acquire()
            try:
                self.cc_condition.wait(CLOWNFISH_CONSOLE_PING_INTERVAL)
            finally:
                self.cc_condition.release()

    if client is not None:
        client.setsockopt(zmq.LINGER, 0)
        client.close()
    if ret:
        log.cl_debug("ping thread stoped because of connection error")
        self.cc_running = False
    else:
        assert not self.cc_running
        log.cl_debug("ping thread stoped because the console is exiting")
    log.cl_debug("terminating ZMQ context of pinging thread")
    context.term()
    log.cl_debug("terminated ZMQ context of pinging thread")
    return ret
def cd_worker_thread(self, worker_index):
    """
    Worker routine

    Creates the workspace and child log of the worker, connects a REP
    socket to self.cd_url_worker and serves requests while
    self.cd_running is set. A start request starts a copytool from the
    requested source to the requested destination; any other request
    type is answered with a general reply carrying CE_NO_TYPE. The loop
    also ends when the ZMQ context is terminated.

    :param worker_index: index of this worker, used in the workspace
        name and in the log messages.
    :return: -1 if the workspace is not usable, None after the loop has
        ended.
    """
    # pylint: disable=too-many-nested-blocks,too-many-locals
    # pylint: disable=too-many-branches,too-many-statements
    name = "thread_worker_%s" % worker_index
    thread_workspace = self.cd_workspace + "/" + name
    if not os.path.exists(thread_workspace):
        ret = utils.mkdir(thread_workspace)
        if ret:
            self.cd_log.cl_error(
                "failed to create directory [%s] on local host",
                thread_workspace)
            return -1
    elif not os.path.isdir(thread_workspace):
        self.cd_log.cl_error("[%s] is not a directory",
                             thread_workspace)
        return -1
    log = self.cd_log.cl_get_child(name, resultsdir=thread_workspace)

    log.cl_info("starting worker thread [%s]", worker_index)
    # Socket to talk to dispatcher
    dispatcher_socket = self.cd_context.socket(zmq.REP)
    dispatcher_socket.connect(self.cd_url_worker)
    hostname = socket.gethostname()
    host = ssh_host.SSHHost(hostname, local=True)

    while self.cd_running:
        try:
            request_message = dispatcher_socket.recv()
        except zmq.ContextTerminated:
            log.cl_info(
                "worker thread [%s] exiting because context has "
                "been terminated", worker_index)
            break

        cmessage = copytoold_pb2.CopytooldMessage
        request = cmessage()
        request.ParseFromString(request_message)
        log.cl_debug("received request with type [%s]", request.cm_type)
        reply = cmessage()
        reply.cm_protocol_version = cmessage.CPV_ZERO
        reply.cm_errno = cmessage.CE_NO_ERROR

        is_start_request = request.cm_type == cmessage.CMT_START_REQUEST
        if is_start_request:
            source = request.cm_start_request.csr_source
            dest = request.cm_start_request.csr_dest
            log.cl_info(
                "received a start request of copytool from [%s] to [%s]",
                source, dest)
            ret = start_copytool(log, host, source, dest)
            if ret:
                reply.cm_errno = cmessage.CE_OPERATION_FAILED
            reply.cm_type = cmessage.CMT_START_REPLY
        else:
            reply.cm_type = cmessage.CMT_GENERAL
            reply.cm_errno = cmessage.CE_NO_TYPE
            log.cl_error(
                "received a request with type [%s] that "
                "is not supported", request.cm_type)

        reply_message = reply.SerializeToString()
        dispatcher_socket.send(reply_message)
        # source and dest only exist for start requests; logging them for
        # any other request would fail or report stale values.
        if is_start_request:
            log.cl_info(
                "send reply to a start request of copytool from [%s] to [%s]",
                source, dest)

    dispatcher_socket.close()
    log.cl_info("worker thread [%s] exited", worker_index)
def cs_worker_thread(self, worker_index):
    """
    Worker routine

    Creates the workspace and child log of the worker under the workspace
    of self.cs_instance, connects a REP socket to self.cs_url_worker and
    answers one request per loop iteration while self.cs_running is set.
    The loop also ends when the ZMQ context is terminated.

    Returns -1 if the workspace is not usable; nothing is returned after
    the loop has ended.
    """
    # pylint: disable=too-many-nested-blocks,too-many-locals
    # pylint: disable=too-many-branches,too-many-statements
    # Socket to talk to dispatcher
    instance = self.cs_instance
    name = "thread_worker_%s" % worker_index
    thread_workspace = instance.ci_workspace + "/" + name
    if not os.path.exists(thread_workspace):
        ret = utils.mkdir(thread_workspace)
        if ret:
            self.cs_log.cl_error(
                "failed to create directory [%s] on local host",
                thread_workspace)
            return -1
    elif not os.path.isdir(thread_workspace):
        self.cs_log.cl_error("[%s] is not a directory",
                             thread_workspace)
        return -1
    log = self.cs_log.cl_get_child(name, resultsdir=thread_workspace)
    log.cl_info("starting worker thread [%s]", worker_index)
    dispatcher_socket = self.cs_context.socket(zmq.REP)
    dispatcher_socket.connect(self.cs_url_worker)
    while self.cs_running:
        try:
            request_message = dispatcher_socket.recv()
        except zmq.ContextTerminated:
            log.cl_info(
                "worker thread [%s] exiting because context has "
                "been terminated", worker_index)
            break
        cmessage = clownfish_pb2.ClownfishMessage
        request = cmessage()
        request.ParseFromString(request_message)
        log.cl_debug("received request with type [%s]", request.cm_type)
        # The reply starts out as "no error"; the branches below set its
        # type and overwrite the errno on failure.
        reply = cmessage()
        reply.cm_protocol_version = cmessage.CPV_ZERO
        reply.cm_errno = cmessage.CE_NO_ERROR
        if request.cm_type == cmessage.CMT_CONNECT_REQUEST:
            # A connect request allocates a connection; the sequence of
            # the connection is sent back as the client UUID.
            client_hash = request.cm_connect_request.ccrt_client_hash
            connection = self.cs_connection_allocate(client_hash)
            reply.cm_type = cmessage.CMT_CONNECT_REPLY
            reply.cm_connect_reply.ccry_client_hash = client_hash
            reply.cm_client_uuid = connection.cc_sequence
        else:
            # Every other request has to carry the UUID of an existing
            # connection.
            client_uuid = request.cm_client_uuid
            reply.cm_client_uuid = client_uuid
            connection = self.cs_connection_find(client_uuid)
            if connection is None:
                log.cl_error(
                    "received a request with UUID [%s] that "
                    "doesnot exist", client_uuid)
                reply.cm_type = cmessage.CMT_GENERAL
                reply.cm_errno = cmessage.CE_NO_UUID
            elif request.cm_type == cmessage.CMT_PING_REQUEST:
                reply.cm_type = cmessage.CMT_PING_REPLY
            elif request.cm_type == cmessage.CMT_INTERACT_REQUEST:
                reply.cm_type = cmessage.CMT_INTERACT_REPLY
                connection.cc_interact(log, request.cm_interact_request,
                                       reply.cm_interact_reply)
            elif request.cm_type == cmessage.CMT_COMMAND_REQUEST:
                reply.cm_type = cmessage.CMT_COMMAND_REPLY
                cmd_line = request.cm_command_request.ccrt_cmd_line
                connection.cc_command(log, cmd_line,
                                      reply.cm_command_reply)
            elif request.cm_type == cmessage.CMT_COMMAND_PARTWAY_QUERY:
                # Query sent while a command is under way: abort it if
                # asked to, then fill the reply from the command log.
                reply.cm_type = cmessage.CMT_COMMAND_REPLY
                query = request.cm_command_partway_query
                if query.ccpq_abort:
                    connection.cc_abort()
                connection.cc_consume_command_log(log,
                                                  reply.cm_command_reply)
            elif request.cm_type == cmessage.CMT_COMMAND_INPT_REPLY:
                reply.cm_type = cmessage.CMT_COMMAND_REPLY
                input_reply = request.cm_command_input_reply
                if input_reply.cciry_abort:
                    connection.cc_abort()
                log.cl_debug("got input [%s] for command",
                             input_reply.cciry_input)
                # Publish the input under the condition of the connection
                # and wake up its waiters (presumably the thread running
                # the command - confirm against cc_command).
                connection.cc_condition.acquire()
                connection.cc_input_result = input_reply.cciry_input
                connection.cc_condition.notifyAll()
                connection.cc_condition.release()
                connection.cc_consume_command_log(log,
                                                  reply.cm_command_reply)
            else:
                reply.cm_type = cmessage.CMT_GENERAL
                reply.cm_errno = cmessage.CE_NO_TYPE
                log.cl_error(
                    "recived a request with type [%s] that "
                    "is not supported", request.cm_type)

            # A final command reply with the quit flag set removes the
            # connection from the server.
            if (reply.cm_type == cmessage.CMT_COMMAND_REPLY and
                    reply.cm_command_reply.ccry_type == cmessage.CCRYT_FINAL and
                    reply.cm_command_reply.ccry_final.ccfr_quit):
                ret = self.cs_connection_delete(connection.cc_sequence)
                if ret:
                    log.cl_error("failed to delete connection [%s]",
                                 connection.cc_sequence)
                    reply.cm_errno = cmessage.CE_NO_UUID
        reply_message = reply.SerializeToString()
        dispatcher_socket.send(reply_message)
    dispatcher_socket.close()
    log.cl_info("worker thread [%s] exited", worker_index)