# Example #1
# 0
class RequestSender(object):
    """Sends length-prefixed string requests to a server over a ZTcpClient.

    Wraps a ZTcpClient with a ZLengthHeadCodec and forwards connection,
    connection-error and decoded-message events to user-supplied callbacks,
    passing this sender as the first argument so a single callback can serve
    many senders.
    """

    def __init__(self, event_loop, sid, server_addr, client_data=None):
        """event_loop and server_addr are handed to ZTcpClient; sid is an
        opaque sender id; client_data is arbitrary caller context."""
        # Remember the peer address so connect failures can be reported.
        self._server_addr = server_addr
        self._sid = sid
        self._client_data = client_data
        self._codec = ZLengthHeadCodec(self._on_string_message)
        self._tcp_client = ZTcpClient(event_loop, server_addr)
        self._tcp_client.set_connection_callback(self._on_connection)
        self._tcp_client.set_connection_error_callback(self._on_error_connection)
        self._tcp_client.set_message_callback(self._codec.on_message)
        self._connection_callback = None
        self._connection_error_callback = None
        self._message_callback = None

    def connect(self):
        """Initiate the TCP connection."""
        self._tcp_client.connect()

    def disconnect(self):
        """Tear the TCP connection down."""
        self._tcp_client.disconnect()

    def set_connection_callback(self, cb):
        # cb is invoked as cb(sender, tcp_conn)
        self._connection_callback = cb

    def set_connection_error_callback(self, cb):
        # cb is invoked as cb(sender, errno)
        self._connection_error_callback = cb

    def set_message_callback(self, cb):
        # cb is invoked as cb(sender, tcp_conn, message, receive_time)
        self._message_callback = cb

    def send(self, tcp_conn, msg):
        """Encode msg with the length-head codec and send it on tcp_conn."""
        self._codec.send(tcp_conn, msg)

    def get_id(self):
        return self._sid

    def get_client_data(self):
        return self._client_data

    def _on_connection(self, tcp_conn):
        if self._connection_callback:
            self._connection_callback(self, tcp_conn)

    def _on_error_connection(self, errno):
        if self._connection_error_callback:
            self._connection_error_callback(self, errno)
        else:
            # BUG FIX: this used to reference the undefined name `server_addr`
            # (the constructor parameter was never stored) and passed `errno`
            # as a spurious extra positional argument with a malformed
            # %-format, so the fallback log line itself raised.
            logging.error('failed to connect to %s, errno=(%d)' % (str(self._server_addr), errno))

    def _on_string_message(self, tcp_conn, message, receive_time):
        if self._message_callback:
            self._message_callback(self, tcp_conn, message, receive_time)
# Example #2
# 0
class FLRDaemon(object):
    """TCP daemon driving the FLR (file-level reporting) pipeline.

    Receives XML job requests over a length-prefixed TCP protocol and runs
    the pipeline stages at or after the requested stage: scan, report
    generation (duinfo consolidation, partition-info load, FLR report
    generation), FLR-report load, and data transfer.  Each request is
    journaled in the flr_request table so it can be replayed after a crash.
    """

    # Ordered stage codes.  Ordering matters: _handle_all runs every stage
    # whose code is >= the requested one (the checks read `comp <= STAGE`).
    SCAN = 1
    GENERATE_REPORT = 2
    CONSOLIDATE_DUINFO = 3
    LOAD_PARTITION_INFO = 4
    GENERATE_FLR_REPORT = 5
    LOAD_FLR_REPORT = 6
    TRANSFER_DATA = 7

    # Stage names as they appear in the request XML "name" attribute and in
    # the flr_app_status table.
    SCAN_STR = "scan"
    GENERATE_REPORT_STR = "generate_report"
    CONSOLIDATE_DUINFO_STR = "consolidate_duinfo"
    LOAD_PARTITION_INFO_STR = "load_partition_info"
    GENERATE_FLR_REPORT_STR = "generate_flr_report"
    LOAD_FLR_REPORT_STR = "load_flr_report"
    TRANSFER_DATA_STR = "transfer_data"

    def __init__(self, server_addr, cfg_file, scan_list_dir):
        """server_addr: address the ZTcpServer listens on; cfg_file: INI file
        with an [FLRD] section; scan_list_dir: directory for temporary
        scan-list files."""
        self._cfg_parser = ConfigParser.ConfigParser()
        self._cfg_parser.read(cfg_file)
        self._site = self._cfg_parser.get("FLRD", "site")
        self._construct_scan_parameters()
        self._scan_list_dir = scan_list_dir
        self._tcp_server = ZTcpServer(server_addr)
        self._codec = ZLengthHeadCodec(self._on_string_message)
        self._tcp_server.set_connection_callback(self._on_connection)
        self._tcp_server.set_message_callback(self._codec.on_message)
        self._hostname = gethostname()
        # Guards self._req_id, which is bumped from server worker threads.
        self._req_id_guard = Condition(Lock())
        self._setup_req_id()

    def start(self):
        """Start the server, replay any journaled requests, serve forever."""
        self._tcp_server.start()
        # inject the recover process here
        self._recover()
        self._tcp_server.serve_forever()

    def set_thread_num(self, num):
        """Set the worker-thread count of the underlying TCP server."""
        self._tcp_server.set_thread_num(num)

    def _setup_req_id(self):
        """Recover the last issued request id from the database.

        Takes the maximum req_id seen in either flr_app_status or
        flr_request for this site; starts from 0 when neither has rows.
        """
        self._req_id = 0
        # NOTE: %-interpolation is acceptable in these two statements because
        # self._site comes from our own config file, not from the network.
        db_conn = self._create_db_conn()
        try:
            db_cursor = db_conn.cursor()
            try:
                statement = (
                    "SELECT max(req_id) FROM flr_app_status WHERE site = '%s' AND appname = 'flrd' GROUP BY site, appname"
                    % self._site
                )
                db_cursor.execute(statement)
                rows = db_cursor.fetchall()
                # Guard against drivers returning a row containing NULL, which
                # previously left _req_id = None and crashed the %d log below.
                if rows and rows[0][0] is not None:
                    self._req_id = rows[0][0]

                statement = "SELECT max(req_id) FROM flr_request WHERE site = '%s' GROUP BY site" % self._site
                db_cursor.execute(statement)
                rows = db_cursor.fetchall()
                if rows and rows[0][0] is not None and rows[0][0] > self._req_id:
                    self._req_id = rows[0][0]
            finally:
                # BUG FIX: cursor/connection used to leak when a query raised.
                db_cursor.close()
        finally:
            db_conn.close()
        logging.info("Recovering request id = (%d) from db" % self._req_id)

    def _on_connection(self, tcp_conn):
        """Log connection state changes of incoming connections."""
        state = "UP" if tcp_conn.connected() else "DOWN"
        logging.info("FLRDaemon (%s) is %s" % (tcp_conn.name(), state))

    def _on_string_message(self, tcp_conn, message, receive_time):
        """Handle one XML job request: ack it, journal it, run the pipeline.

        Expected payload: <job name="<stage>" site="<site>">request text</job>.
        """
        logging.info("receive request (%s)" % message)
        reply_msg = "%s received request" % self._hostname
        self._codec.send(tcp_conn, reply_msg)
        tcp_conn.shutdown_write()

        # next req id
        req_id = 0
        with self._req_id_guard:
            self._req_id += 1
            req_id = self._req_id

        # Commit this request into db for playback when flrd died unintentionally.
        # SECURITY NOTE(review): `message` comes off the wire and is
        # %-interpolated into SQL; a quote in the payload breaks or injects
        # the statement.  Should become a parameterized query -- flagged
        # rather than silently rewritten since the DB layer API is not
        # visible here.
        statement = "INSERT INTO flr_request VALUES ('%s', %d, '%s', current_timestamp)" % (self._site, req_id, message)
        self._commit_db(statement)

        root_xml = ElementTree.fromstring(message)
        # NOTE(review): asserts are stripped under -O; kept as-is for behavior
        # compatibility with existing callers.
        assert root_xml.tag == "job"
        task = root_xml.attrib.get("name", None)
        site = root_xml.attrib.get("site", "NA")
        assert site == self._site
        req_text = root_xml.text.strip()
        # Map the job name onto its numeric stage code.
        statemachine_tbl = {
            self.SCAN_STR: self.SCAN,
            self.GENERATE_REPORT_STR: self.GENERATE_REPORT,
            self.CONSOLIDATE_DUINFO_STR: self.CONSOLIDATE_DUINFO,
            self.LOAD_PARTITION_INFO_STR: self.LOAD_PARTITION_INFO,
            self.GENERATE_FLR_REPORT_STR: self.GENERATE_FLR_REPORT,
            self.LOAD_FLR_REPORT_STR: self.LOAD_FLR_REPORT,
            self.TRANSFER_DATA_STR: self.TRANSFER_DATA,
        }

        comp = statemachine_tbl.get(task, None)
        if comp:
            self._handle_all(req_text, req_id, site, comp)
        else:
            logging.error("FLRDaemon unknown job (%s)" % task)

        # When done with this request, delete the request from the db
        statement = "DELETE FROM flr_request WHERE req_id = %d" % req_id
        self._commit_db(statement)

    def _handle_scan(self, scan_list, req_id, site):
        """Write scan_list to a temp file and run the smController scan."""
        # remove the previous scan data
        dtldir = self._cfg_parser.get("FLRD", "dtldir")
        rmtree(dtldir, ignore_errors=True)

        # Build a unique scan-list file name from the thread id, falling back
        # to a random id when the ident is unavailable.
        tid = current_thread().ident
        if not tid:
            # BUG FIX: was `randome.random(time.time())` -- a NameError at
            # runtime; the intent was to seed the RNG before drawing an id.
            random.seed(time.time())
            tid = random.randint(1, 1000000000)
        root_list_file = "scan_list_%d_%s" % (tid, strftime("%Y%m%d-%H%M%S"))
        root_list_file = join(self._scan_list_dir, root_list_file)
        with open(root_list_file, "w") as scan_file:
            scan_file.write(scan_list)

        scan_cmd = ["smController", "-rlist", root_list_file] + self._scan_parameters

        def do_handle_scan(dummy):
            # Run the scanner and log whatever it wrote to stdout/stderr.
            output = subprocess.Popen(scan_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
            result_msg = ""
            if output[0]:
                logging.info(output[0])
                result_msg = output[0]

            if output[1]:
                logging.error(output[1])
                result_msg += output[1]

        res = self._do_job(self.SCAN_STR, req_id, site, "site", do_handle_scan)
        os.remove(root_list_file)
        return res

    def _handle_generate_report(self, request, req_id, site, comp):
        """Run the report-generation sub-stages at or after `comp`.

        Stops at the first sub-stage returning a negative status and
        propagates that status; otherwise returns the last stage's status.
        """
        res = 0
        if comp <= self.CONSOLIDATE_DUINFO:
            # 1) Consolidate all duinfo csv
            res = self._consolidate_duinfo_csv(request, req_id, site)
            if res < 0:
                return res

        if comp <= self.LOAD_PARTITION_INFO:
            # 2) Load partition info of this site to partition_info table
            res = self._load_partition_info(request, req_id, site)
            if res < 0:
                return res

        if comp <= self.GENERATE_FLR_REPORT:
            # 3) Generate reports in partition/file system level
            res = self._generate_flr_report(request, req_id, site)
        return res

    def _handle_load_flr_report(self, request, req_id, site):
        """Load the generated FLR report CSVs into the database."""
        import lp_loader

        cfg_file = self._cfg_parser.get("FLRD", "csv_loader_flr_report_cfg")
        csv_loader = lp_loader.LPLoader(cfg_file)

        def load_csv(dummy):
            csv_loader.load_data()

        res = self._do_job(self.LOAD_FLR_REPORT_STR, req_id, site, "csv_loader_flr_report_cfg", load_csv)
        return res

    def _handle_transfer_data(self, request, req_id, site):
        """Placeholder for the data-transfer stage (not implemented yet).

        BUG FIX: returns 0 (success) instead of implicitly returning None so
        callers that test `res < 0` cannot misread the placeholder as a
        failure (None < 0 is true in Python 2).
        """
        return 0

    def _handle_all(self, request, req_id, site, comp):
        """Run every pipeline stage whose code is >= comp.

        Stops at the first stage returning a negative status and propagates
        it; otherwise returns the last stage's status.
        """
        res = 0
        if comp <= self.SCAN:
            res = self._handle_scan(request, req_id, site)
            if res < 0:
                return res

        if comp <= self.GENERATE_FLR_REPORT:
            res = self._handle_generate_report(request, req_id, site, comp)
            if res < 0:
                return res

        if comp <= self.LOAD_FLR_REPORT:
            res = self._handle_load_flr_report(request, req_id, site)
            if res < 0:
                return res

        if comp <= self.TRANSFER_DATA:
            res = self._handle_transfer_data(request, req_id, site)
        return res

    def _recover(self):
        # spawn a new thread which replays the last request which has not been done
        self._recover_thr = Thread(target=self._do_recover)
        self._recover_thr.start()

    def _do_recover(self, **kwargs):
        """Replay every journaled request for this site (runs in its own thread)."""
        db_conn = self._create_db_conn()
        db_cursor = db_conn.cursor()

        # replay outstanding request if there are any
        statement = "SELECT req_id, request, req_time FROM flr_request WHERE site = '%s'" % self._site
        db_cursor.execute(statement)
        requests = db_cursor.fetchall()
        request_sender = FLRRequestSender()
        request_sender.set_message_callback(self._on_response_from_flrd)
        request_sender.set_request_done_callback(self._on_done)
        for req in requests:
            # the last app component status for this request
            statement = (
                "SELECT app_component, status, message FROM flr_app_status a, (SELECT site, appname, max(msg_time) msg_time FROM flr_app_status GROUP BY site, appname) b WHERE a.appname = 'flrd' AND a.appname = b.appname AND a.site = '%s' AND a.site = b.site AND a.req_id = %d and a.msg_time = b.msg_time"
                % (self._site, req[0])
            )
            db_cursor.execute(statement)
            last_app_status = db_cursor.fetchall()
            if last_app_status:
                self._do_recover_for(last_app_status[0][0], last_app_status[0][1], req, request_sender)
            else:
                # No status recorded yet: replay the request from the start.
                self._do_recover_for("", 0, req, request_sender)

        db_cursor.close()
        db_conn.close()

    def _do_recover_for(self, app_component, status, req, request_sender):
        """Replay one journaled request, resuming at the proper stage.

        req is a (req_id, request_xml, req_time) row.  If the last recorded
        stage failed (status < 0) it is retried; if it succeeded, the replay
        starts at its successor; with no recorded stage at all, the request
        is replayed as-is.
        """
        logging.info("replaying request: (%s)" % req[1])
        root_xml = ElementTree.fromstring(req[1])
        if app_component:
            if status < 0:
                # Last stage failed: run that stage again.
                root_xml.attrib.update({"name": app_component})
            else:
                # Last stage succeeded: advance to its successor.
                statemachine_tbl = {
                    self.SCAN_STR: self.GENERATE_REPORT_STR,
                    self.GENERATE_REPORT_STR: self.CONSOLIDATE_DUINFO_STR,
                    self.CONSOLIDATE_DUINFO_STR: self.LOAD_PARTITION_INFO_STR,
                    self.LOAD_PARTITION_INFO_STR: self.GENERATE_FLR_REPORT_STR,
                    self.GENERATE_FLR_REPORT_STR: self.LOAD_FLR_REPORT_STR,
                    self.LOAD_FLR_REPORT_STR: self.TRANSFER_DATA_STR,
                    self.TRANSFER_DATA_STR: None,
                }
                next_comp = statemachine_tbl.get(app_component, None)
                # BUG FIX: replaced a leftover debug `print` statement
                # (Python-2-only syntax, wrote to stdout) with a logging call.
                logging.debug("next component after (%s) is (%s)" % (app_component, next_comp))
                if next_comp:
                    root_xml.attrib.update({"name": next_comp})
                else:
                    # Pipeline already completed; drop the journal entry.
                    root_xml = None
                    self._delete_replayed_request(req[0])

        if root_xml is not None:
            req_str = '<?xml version="1.0"?>\n%s' % ElementTree.tostring(root_xml)
            logging.info("sending new request: (%s)" % req_str)
            request_sender.send_requests((("localhost", req_str, req[0]),))

    def _on_response_from_flrd(self, request_sender, req_id, tcp_conn, msg, receive_time):
        """A replayed request was acknowledged; remove it from the journal."""
        self._delete_replayed_request(req_id)

    def _on_done(self, request_sender):
        """All replays finished; stop the sender and drop the thread ref.

        NOTE(review): runs on the recover thread itself, so `del` only drops
        this daemon's reference to that thread object.
        """
        logging.info("Done with the recover process")
        request_sender.quit()
        del self._recover_thr

    def _delete_replayed_request(self, req_id):
        """Remove one journaled request for this site from flr_request."""
        logging.info("delete replayed request: id = (%d) from db" % req_id)
        statement = "DELETE FROM flr_request WHERE site = '%s' AND req_id = %d" % (self._site, req_id)
        self._commit_db(statement)

    def _do_job(self, component, req_id, site, cfg_file_tag, callback):
        """Run one pipeline stage and record its status in flr_app_status.

        Looks up cfg_file_tag in the [FLRD] section, passes the value to
        callback, and inserts a status row (0 = ok, -1 = callback raised).
        Returns that status.
        """
        logging.info("begin (%s)" % component)
        res = 0
        cfg_file = self._cfg_parser.get("FLRD", cfg_file_tag)
        try:
            callback(cfg_file)
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logging.error(
                "FLRDaemon._do_job encounting error (%s)"
                % "".join(format_exception(exc_type, exc_value, exc_traceback))
            )
            res = -1

        statement = "INSERT INTO flr_app_status VALUES ('%s', 'flrd', '%s', %d, '', %d, current_timestamp)" % (
            site,
            component,
            res,
            req_id,
        )
        self._commit_db(statement)
        logging.info("end (%s)" % component)
        return res

    def _consolidate_duinfo_csv(self, request, req_id, site):
        """Stage 3: consolidate per-scan duinfo CSVs into one data set."""
        import consolidate_duinfo_csv

        res = self._do_job(
            self.CONSOLIDATE_DUINFO_STR,
            req_id,
            site,
            "consolidate_duinfo_cfg",
            consolidate_duinfo_csv.consolidate_duinfo_csv,
        )
        return res

    def _load_partition_info(self, request, req_id, site):
        """Stage 4: reload this site's rows in the partition_info table."""
        import lp_loader

        statement = "DELETE FROM partition_info WHERE site = '%s'" % site
        self._commit_db(statement)

        cfg_file = self._cfg_parser.get("FLRD", "csv_loader_partition_info_cfg")
        csv_loader = lp_loader.LPLoader(cfg_file)

        def load_csv(dummy):
            csv_loader.load_data()

        res = self._do_job(self.LOAD_PARTITION_INFO_STR, req_id, site, "csv_loader_partition_info_cfg", load_csv)
        return res

    def _generate_flr_report(self, request, req_id, site):
        """Stage 5: process DTL files into partition/file-system reports."""
        import dtl_processor

        res = self._do_job(self.GENERATE_FLR_REPORT_STR, req_id, site, "dtl_processor_cfg", dtl_processor.process_dtls)
        return res

    def _construct_scan_parameters(self):
        """Build the smController argument list from the [FLRD] section.

        thread_num defaults to 4x the CPU count; every other option is added
        only when non-empty in the config, in a fixed flag order.
        """
        flrd_section = "FLRD"
        thrd_num = self._cfg_parser.get(flrd_section, "thread_num")
        if not thrd_num:
            thrd_num = str(cpu_count() * 4)
        self._scan_parameters = ["-thrds", thrd_num]

        # (config option, command-line flag), in the order the flags appear.
        optional_flags = (
            ("scan_cfg", "-scancfg"),
            ("prescan", "-prescan"),
            ("postscan", "-postscan"),
            ("dtldir", "-dtldir"),
            ("logdir", "-logdir"),
            ("errdir", "-errdir"),
            ("tmpdir", "-tmpdir"),
        )
        for option, flag in optional_flags:
            value = self._cfg_parser.get(flrd_section, option)
            if value:
                self._scan_parameters.extend([flag, value])

    def _create_db_conn(self):
        """Open a new database connection using the [FLRD] db_* settings."""
        db_section = "FLRD"
        db_conn = ZDbConn.create_db_connection(
            self._cfg_parser.get(db_section, "db_type"),
            self._cfg_parser.get(db_section, "db_host"),
            self._cfg_parser.get(db_section, "db_user"),
            self._cfg_parser.get(db_section, "db_password"),
            self._cfg_parser.get(db_section, "db_instance"),
        )
        return db_conn

    def _commit_db(self, statement, db_conn=None, db_cursor=None):
        """Execute and commit `statement`, retrying up to 3 times.

        Uses the caller-supplied connection/cursor when given (and leaves
        them open); otherwise opens a private connection per attempt and
        closes it when done.  Returns 0 on success, -1 after 3 failures.
        """
        own_conn = db_conn is None
        for attempt in range(3):
            conn = db_conn
            cursor = db_cursor
            try:
                if own_conn:
                    conn = self._create_db_conn()
                    cursor = conn.cursor()
                cursor.execute(statement)
                conn.commit()
                if own_conn:
                    cursor.close()
                    conn.close()
                return 0
            except Exception:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                logging.error(
                    "FLRDaemon._commit_db (%s) encounting error (%s)"
                    % (statement, "".join(format_exception(exc_type, exc_value, exc_traceback)))
                )
                # BUG FIX: a failed private connection used to be kept and
                # silently reused on the next retry (the `if not db_conn`
                # guard never fired again); close it so the retry reconnects.
                if own_conn and conn is not None:
                    try:
                        conn.close()
                    except Exception:
                        pass
        return -1