class RequestSender(object):
    """Client-side wrapper around ZTcpClient + ZLengthHeadCodec.

    Owns one outbound TCP connection to `server_addr` and decodes
    length-prefixed string messages, forwarding connection, error and
    message events to user-supplied callbacks.  Each callback receives
    this sender as its first argument so one handler can serve many
    senders.
    """

    def __init__(self, event_loop, sid, server_addr, client_data=None):
        """event_loop: loop passed through to ZTcpClient; sid: caller-chosen
        id returned by get_id(); server_addr: remote address; client_data:
        opaque payload returned by get_client_data()."""
        self._sid = sid
        self._client_data = client_data
        # BUG FIX: keep the server address so _on_error_connection can
        # report it (the original referenced an undefined `server_addr`).
        self._server_addr = server_addr
        self._codec = ZLengthHeadCodec(self._on_string_message)
        self._tcp_client = ZTcpClient(event_loop, server_addr)
        self._tcp_client.set_connection_callback(self._on_connection)
        self._tcp_client.set_connection_error_callback(self._on_error_connection)
        self._tcp_client.set_message_callback(self._codec.on_message)
        self._connection_callback = None
        self._connection_error_callback = None
        self._message_callback = None

    def connect(self):
        """Initiate the TCP connection."""
        self._tcp_client.connect()

    def disconnect(self):
        """Tear down the TCP connection."""
        self._tcp_client.disconnect()

    def set_connection_callback(self, cb):
        """cb(sender, tcp_conn) is invoked on connection state changes."""
        self._connection_callback = cb

    def set_connection_error_callback(self, cb):
        """cb(sender, errno) is invoked when the connect attempt fails."""
        self._connection_error_callback = cb

    def set_message_callback(self, cb):
        """cb(sender, tcp_conn, message, receive_time) per decoded message."""
        self._message_callback = cb

    def send(self, tcp_conn, msg):
        """Encode msg with the length-head codec and send it on tcp_conn."""
        self._codec.send(tcp_conn, msg)

    def get_id(self):
        return self._sid

    def get_client_data(self):
        return self._client_data

    def _on_connection(self, tcp_conn):
        # Forward to the user callback, if any was registered.
        if self._connection_callback:
            self._connection_callback(self, tcp_conn)

    def _on_error_connection(self, errno):
        if self._connection_error_callback:
            self._connection_error_callback(self, errno)
        else:
            # BUG FIX: the original read an undefined `server_addr` and
            # applied `%` to only the first format argument, which raised
            # instead of logging.  Format both placeholders explicitly.
            logging.error('failed to connect to %s, errno=(%d)'
                          % (str(self._server_addr), errno))

    def _on_string_message(self, tcp_conn, message, receive_time):
        # Forward decoded string messages to the user callback, if any.
        if self._message_callback:
            self._message_callback(self, tcp_conn, message, receive_time)
class FLRDaemon(object):
    """TCP daemon that runs the FLR (file-level reporting) pipeline.

    Each incoming XML request ("<job name='...' site='...'>payload</job>")
    is journaled into the `flr_request` table, executed through a fixed
    pipeline state machine (scan -> generate_report -> consolidate_duinfo
    -> load_partition_info -> generate_flr_report -> load_flr_report ->
    transfer_data) and removed from the journal once finished.  On startup
    a background thread replays any journaled request that did not
    complete before the previous daemon instance died.
    """

    # Pipeline component ids, in execution order.  A request that maps to
    # component N runs every stage whose id is >= N (see _handle_all).
    SCAN = 1
    GENERATE_REPORT = 2
    CONSOLIDATE_DUINFO = 3
    LOAD_PARTITION_INFO = 4
    GENERATE_FLR_REPORT = 5
    LOAD_FLR_REPORT = 6
    TRANSFER_DATA = 7

    # Wire-level job names corresponding to the component ids above.
    SCAN_STR = "scan"
    GENERATE_REPORT_STR = "generate_report"
    CONSOLIDATE_DUINFO_STR = "consolidate_duinfo"
    LOAD_PARTITION_INFO_STR = "load_partition_info"
    GENERATE_FLR_REPORT_STR = "generate_flr_report"
    LOAD_FLR_REPORT_STR = "load_flr_report"
    TRANSFER_DATA_STR = "transfer_data"

    def __init__(self, server_addr, cfg_file, scan_list_dir):
        """server_addr: listen address for ZTcpServer; cfg_file: ini file
        with an [FLRD] section; scan_list_dir: directory that receives
        temporary scan-list files."""
        self._cfg_parser = ConfigParser.ConfigParser()
        self._cfg_parser.read(cfg_file)
        self._site = self._cfg_parser.get("FLRD", "site")
        self._construct_scan_parameters()
        self._scan_list_dir = scan_list_dir
        self._tcp_server = ZTcpServer(server_addr)
        self._codec = ZLengthHeadCodec(self._on_string_message)
        self._tcp_server.set_connection_callback(self._on_connection)
        self._tcp_server.set_message_callback(self._codec.on_message)
        self._hostname = gethostname()
        # Guards _req_id so concurrently handled requests get unique ids.
        self._req_id_guard = Condition(Lock())
        self._setup_req_id()

    def start(self):
        """Start the server, launch crash recovery, then serve forever."""
        self._tcp_server.start()
        # Inject the recover process here, before accepting traffic blocks.
        self._recover()
        self._tcp_server.serve_forever()

    def set_thread_num(self, num):
        """Set the worker-thread count of the underlying TCP server."""
        self._tcp_server.set_thread_num(num)

    def _setup_req_id(self):
        """Recover the highest request id ever issued from the database so
        ids keep increasing across daemon restarts."""
        self._req_id = 0
        statement = (
            "SELECT max(req_id) FROM flr_app_status WHERE site = '%s' AND appname = 'flrd' GROUP BY site, appname"
            % self._site
        )
        db_conn = self._create_db_conn()
        db_cursor = db_conn.cursor()
        db_cursor.execute(statement)
        cur_req_id = db_cursor.fetchall()
        if cur_req_id:
            self._req_id = cur_req_id[0][0]
        # An outstanding (journaled) request may carry a higher id than any
        # completed one recorded in flr_app_status.
        statement = "SELECT max(req_id) FROM flr_request WHERE site = '%s' GROUP BY site" % self._site
        db_cursor.execute(statement)
        cur_req_id = db_cursor.fetchall()
        if cur_req_id and cur_req_id[0][0] > self._req_id:
            self._req_id = cur_req_id[0][0]
        db_cursor.close()
        db_conn.close()
        logging.info("Recovering request id = (%d) from db" % self._req_id)

    def _on_connection(self, tcp_conn):
        logging.info("FLRDaemon (%s) is %s" %
                     (tcp_conn.name(), (tcp_conn.connected() and "UP" or "DOWN")))

    def _on_string_message(self, tcp_conn, message, receive_time):
        """Handle one decoded request: ack, journal, execute, un-journal."""
        logging.info("receive request (%s)" % message)
        reply_msg = "%s received request" % self._hostname
        self._codec.send(tcp_conn, reply_msg)
        tcp_conn.shutdown_write()
        # Allocate the next request id under the guard.
        req_id = 0
        with self._req_id_guard:
            self._req_id += 1
            req_id = self._req_id
        # Commit this request into db for playback when flrd died unintentionly
        # NOTE(review): `message` is spliced into the SQL text verbatim; a
        # payload containing a single quote breaks (or injects into) the
        # statement.  Parameterized queries would be safer if ZDbConn
        # supports them — verify.
        statement = "INSERT INTO flr_request VALUES ('%s', %d, '%s', current_timestamp)" % (self._site, req_id, message)
        self._commit_db(statement)
        root_xml = ElementTree.fromstring(message)
        assert root_xml.tag == "job"
        task = root_xml.attrib.get("name", None)
        site = root_xml.attrib.get("site", "NA")
        assert site == self._site
        req_text = root_xml.text.strip()
        # Map the job name onto the first pipeline component to execute.
        statemachine_tbl = {
            self.SCAN_STR: self.SCAN,
            self.GENERATE_REPORT_STR: self.GENERATE_REPORT,
            self.CONSOLIDATE_DUINFO_STR: self.CONSOLIDATE_DUINFO,
            self.LOAD_PARTITION_INFO_STR: self.LOAD_PARTITION_INFO,
            self.GENERATE_FLR_REPORT_STR: self.GENERATE_FLR_REPORT,
            self.LOAD_FLR_REPORT_STR: self.LOAD_FLR_REPORT,
            self.TRANSFER_DATA_STR: self.TRANSFER_DATA,
        }
        comp = statemachine_tbl.get(task, None)
        if comp:
            self._handle_all(req_text, req_id, site, comp)
        else:
            logging.error("FLRDaemon unknown job (%s)" % task)
        # When done with this request, delete the request from the db.
        statement = "DELETE FROM flr_request WHERE req_id = %d" % req_id
        self._commit_db(statement)

    def _handle_scan(self, scan_list, req_id, site):
        """Write scan_list to a temp file and run smController over it.

        Returns the status code from _do_job (0 ok, -1 failure)."""
        # Remove the previous scan data.
        dtldir = self._cfg_parser.get("FLRD", "dtldir")
        rmtree(dtldir, ignore_errors=True)
        # Build a unique scan-list filename from the worker thread id.
        tid = current_thread().ident
        if not tid:
            # BUG FIX: the original called `randome.random(time.time())`
            # (NameError typo); the intent was to seed the RNG before
            # drawing a fallback id.
            random.seed(time.time())
            tid = random.randint(1, 1000000000)
        root_list_file = "scan_list_%d_%s" % (tid, strftime("%Y%m%d-%H%M%S"))
        root_list_file = join(self._scan_list_dir, root_list_file)
        with open(root_list_file, "w") as scan_file:
            scan_file.write(scan_list)
        scan_cmd = ["smController", "-rlist", root_list_file] + self._scan_parameters

        def do_handle_scan(dummy):
            # Run the external scanner and relay its stdout/stderr to logs.
            output = subprocess.Popen(scan_cmd, stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE).communicate()
            result_msg = ""
            if output[0]:
                logging.info(output[0])
                result_msg = output[0]
            if output[1]:
                logging.error(output[1])
                result_msg += output[1]

        res = self._do_job(self.SCAN_STR, req_id, site, "site", do_handle_scan)
        os.remove(root_list_file)
        return res

    def _handle_generate_report(self, request, req_id, site, comp):
        """Run the report-generation sub-stages starting at component
        `comp`; stop and return on the first negative result."""
        res = 0
        if comp <= self.CONSOLIDATE_DUINFO:
            # 1) Consolidate all duinfo csv
            res = self._consolidate_duinfo_csv(request, req_id, site)
            if res < 0:
                return res
        if comp <= self.LOAD_PARTITION_INFO:
            # 2) Load partition info of this site to partition_info table
            res = self._load_partition_info(request, req_id, site)
            if res < 0:
                return res
        if comp <= self.GENERATE_FLR_REPORT:
            # 3) Generate reports in partition/file system level
            res = self._generate_flr_report(request, req_id, site)
        return res

    def _handle_load_flr_report(self, request, req_id, site):
        """Load the generated FLR report csv into the database."""
        import lp_loader
        cfg_file = self._cfg_parser.get("FLRD", "csv_loader_flr_report_cfg")
        csv_loader = lp_loader.LPLoader(cfg_file)

        def load_csv(dummy):
            csv_loader.load_data()

        res = self._do_job(self.LOAD_FLR_REPORT_STR, req_id, site,
                           "csv_loader_flr_report_cfg", load_csv)
        return res

    def _handle_transfer_data(self, request, req_id, site):
        # Not implemented yet; _handle_all treats the None result as success.
        pass

    def _handle_all(self, request, req_id, site, comp):
        """Run every pipeline stage whose component id is >= comp.

        Returns 0/None on success, negative on the first failing stage."""
        res = 0
        if comp <= self.SCAN:
            res = self._handle_scan(request, req_id, site)
            if res < 0:
                return res
        if comp <= self.GENERATE_FLR_REPORT:
            # _handle_generate_report internally dispatches components
            # GENERATE_REPORT..GENERATE_FLR_REPORT based on comp.
            res = self._handle_generate_report(request, req_id, site, comp)
            if res < 0:
                return res
        if comp <= self.LOAD_FLR_REPORT:
            res = self._handle_load_flr_report(request, req_id, site)
            if res < 0:
                return res
        if comp <= self.TRANSFER_DATA:
            res = self._handle_transfer_data(request, req_id, site)
        return res

    def _recover(self):
        # Spawn a new thread which replays the requests that have not been
        # completed by a previous daemon instance.
        self._recover_thr = Thread(target=self._do_recover)
        self._recover_thr.start()

    def _do_recover(self, **kwargs):
        """Replay every journaled request for this site, resuming each one
        at the stage after its last recorded status."""
        db_conn = self._create_db_conn()
        db_cursor = db_conn.cursor()
        # Replay outstanding requests if there are any.
        statement = "SELECT req_id, request, req_time FROM flr_request WHERE site = '%s'" % self._site
        db_cursor.execute(statement)
        requests = db_cursor.fetchall()
        request_sender = FLRRequestSender()
        request_sender.set_message_callback(self._on_response_from_flrd)
        request_sender.set_request_done_callback(self._on_done)
        for req in requests:
            # Fetch the last app-component status recorded for this request.
            statement = (
                "SELECT app_component, status, message FROM flr_app_status a, (SELECT site, appname, max(msg_time) msg_time FROM flr_app_status GROUP BY site, appname) b WHERE a.appname = 'flrd' AND a.appname = b.appname AND a.site = '%s' AND a.site = b.site AND a.req_id = %d and a.msg_time = b.msg_time"
                % (self._site, req[0])
            )
            db_cursor.execute(statement)
            last_app_status = db_cursor.fetchall()
            if last_app_status:
                self._do_recover_for(last_app_status[0][0], last_app_status[0][1], req, request_sender)
            else:
                # No status yet: replay the request from the beginning.
                self._do_recover_for("", 0, req, request_sender)
        db_cursor.close()
        db_conn.close()

    def _do_recover_for(self, app_component, status, req, request_sender):
        """Replay one journaled request `req` (req_id, request_xml, time).

        If the last component failed (status < 0) it is re-run; otherwise
        the request resumes at the next component.  A request whose final
        component already completed is only deleted from the journal."""
        logging.info("replaying request: (%s)" % req[1])
        root_xml = ElementTree.fromstring(req[1])
        site = root_xml.attrib.get("site", None)
        if app_component:
            if status < 0:
                # Last stage failed: retry the same component.
                root_xml.attrib.update({"name": app_component})
            else:
                # Last stage succeeded: advance to its successor.
                statemachine_tbl = {
                    self.SCAN_STR: self.GENERATE_REPORT_STR,
                    self.GENERATE_REPORT_STR: self.CONSOLIDATE_DUINFO_STR,
                    self.CONSOLIDATE_DUINFO_STR: self.LOAD_PARTITION_INFO_STR,
                    self.LOAD_PARTITION_INFO_STR: self.GENERATE_FLR_REPORT_STR,
                    self.GENERATE_FLR_REPORT_STR: self.LOAD_FLR_REPORT_STR,
                    self.LOAD_FLR_REPORT_STR: self.TRANSFER_DATA_STR,
                    self.TRANSFER_DATA_STR: None,
                }
                next_comp = statemachine_tbl.get(app_component, None)
                # BUG FIX: replaced a leftover Python-2 debug statement
                # (`print next_comp`) with a proper log call.
                logging.debug("next component in recover state machine: (%s)" % next_comp)
                if next_comp:
                    root_xml.attrib.update({"name": next_comp})
                else:
                    # Pipeline already finished: nothing to resend, just
                    # drop the journal entry.
                    root_xml = None
                    self._delete_replayed_request(req[0])
        if root_xml is not None:
            req_str = '<?xml version="1.0"?>\n%s' % ElementTree.tostring(root_xml)
            logging.info("sending new request: (%s)" % req_str)
            # Re-submit to the local daemon; the journal entry is deleted
            # once the response arrives (_on_response_from_flrd).
            request_sender.send_requests((("localhost", req_str, req[0]),))

    def _on_response_from_flrd(self, request_sender, req_id, tcp_conn, msg, receive_time):
        # The replayed request was accepted (and re-journaled under a new
        # id), so the old journal entry can go.
        self._delete_replayed_request(req_id)

    def _on_done(self, request_sender):
        """All replays finished: shut the sender down and drop the thread."""
        logging.info("Done with the recover process")
        request_sender.quit()
        del self._recover_thr

    def _delete_replayed_request(self, req_id):
        logging.info("delete replayed request: id = (%d) from db" % req_id)
        statement = "DELETE FROM flr_request WHERE site = '%s' AND req_id = %d" % (self._site, req_id)
        self._commit_db(statement)

    def _do_job(self, component, req_id, site, cfg_file_tag, callback):
        """Run `callback(cfg_file)` for one pipeline component and record
        its status in flr_app_status.

        component: component name string; cfg_file_tag: [FLRD] option whose
        value is passed to the callback.  Returns 0 on success, -1 if the
        callback raised."""
        logging.info("begin (%s)" % component)
        res = 0
        cfg_file = self._cfg_parser.get("FLRD", cfg_file_tag)
        try:
            callback(cfg_file)
        except Exception as e:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logging.error(
                "FLRDaemon._do_job encounting error (%s)"
                % "".join(format_exception(exc_type, exc_value, exc_traceback))
            )
            res = -1
        statement = "INSERT INTO flr_app_status VALUES ('%s', 'flrd', '%s', %d, '', %d, current_timestamp)" % (
            site,
            component,
            res,
            req_id,
        )
        self._commit_db(statement)
        logging.info("end (%s)" % component)
        return res

    def _consolidate_duinfo_csv(self, request, req_id, site):
        """Stage 2/3: merge per-host duinfo csv files into one."""
        import consolidate_duinfo_csv
        res = self._do_job(
            self.CONSOLIDATE_DUINFO_STR,
            req_id,
            site,
            "consolidate_duinfo_cfg",
            consolidate_duinfo_csv.consolidate_duinfo_csv,
        )
        return res

    def _load_partition_info(self, request, req_id, site):
        """Stage 4: reload this site's rows in the partition_info table."""
        import lp_loader
        # Replace (not append to) the existing rows for this site.
        statement = "DELETE FROM partition_info WHERE site = '%s'" % site
        self._commit_db(statement)
        cfg_file = self._cfg_parser.get("FLRD", "csv_loader_partition_info_cfg")
        csv_loader = lp_loader.LPLoader(cfg_file)

        def load_csv(dummy):
            csv_loader.load_data()

        res = self._do_job(self.LOAD_PARTITION_INFO_STR, req_id, site,
                           "csv_loader_partition_info_cfg", load_csv)
        return res

    def _generate_flr_report(self, request, req_id, site):
        """Stage 5: process dtl files into partition/filesystem reports."""
        import dtl_processor
        res = self._do_job(self.GENERATE_FLR_REPORT_STR, req_id, site,
                           "dtl_processor_cfg", dtl_processor.process_dtls)
        return res

    def _construct_scan_parameters(self):
        """Build the smController argument list from the [FLRD] section.

        Options with empty values are omitted; thread_num defaults to
        4 x cpu_count when unset."""
        self._scan_parameters = []
        flrd_section = "FLRD"
        thrd_num = self._cfg_parser.get(flrd_section, "thread_num")
        if thrd_num:
            self._scan_parameters.extend(["-thrds", thrd_num])
        else:
            self._scan_parameters.extend(["-thrds", str(cpu_count() * 4)])
        scan_cfg = self._cfg_parser.get(flrd_section, "scan_cfg")
        if scan_cfg:
            self._scan_parameters.extend(["-scancfg", scan_cfg])
        prescan = self._cfg_parser.get(flrd_section, "prescan")
        if prescan:
            self._scan_parameters.extend(["-prescan", prescan])
        postscan = self._cfg_parser.get(flrd_section, "postscan")
        if postscan:
            self._scan_parameters.extend(["-postscan", postscan])
        dtldir = self._cfg_parser.get(flrd_section, "dtldir")
        if dtldir:
            self._scan_parameters.extend(["-dtldir", dtldir])
        logdir = self._cfg_parser.get(flrd_section, "logdir")
        if logdir:
            self._scan_parameters.extend(["-logdir", logdir])
        errdir = self._cfg_parser.get(flrd_section, "errdir")
        if errdir:
            self._scan_parameters.extend(["-errdir", errdir])
        tmpdir = self._cfg_parser.get(flrd_section, "tmpdir")
        if tmpdir:
            self._scan_parameters.extend(["-tmpdir", tmpdir])

    def _create_db_conn(self):
        """Open a new database connection from the [FLRD] db_* options."""
        db_section = "FLRD"
        db_conn = ZDbConn.create_db_connection(
            self._cfg_parser.get(db_section, "db_type"),
            self._cfg_parser.get(db_section, "db_host"),
            self._cfg_parser.get(db_section, "db_user"),
            self._cfg_parser.get(db_section, "db_password"),
            self._cfg_parser.get(db_section, "db_instance"),
        )
        return db_conn

    def _commit_db(self, statement, db_conn=None, db_cursor=None):
        """Execute and commit one SQL statement, retrying up to 3 times.

        When db_conn/db_cursor are not supplied, a fresh connection is
        opened per attempt and closed on success.  Returns 0 on success,
        -1 after all attempts fail."""
        close_conn = False
        for attempt in range(3):
            try:
                if not db_conn:
                    close_conn = True
                    db_conn = self._create_db_conn()
                    db_cursor = db_conn.cursor()
                # flr_statu: appname, site, status, message, timestamp
                db_cursor.execute(statement)
                db_conn.commit()
                if close_conn:
                    db_cursor.close()
                    db_conn.close()
                return 0
            except Exception as e:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                logging.error(
                    "FLRDaemon._commit_db (%s) encounting error (%s)"
                    % (statement, "".join(format_exception(exc_type, exc_value, exc_traceback)))
                )
                # BUG FIX: the original returned -1 inside this except
                # block, so the 3-attempt retry loop never retried.  Drop
                # the connection we created so the next attempt starts
                # from a fresh one.
                if close_conn and db_conn is not None:
                    try:
                        db_conn.close()
                    except Exception:
                        pass
                    db_conn = None
                    db_cursor = None
        return -1