Example #1
0
class TableauStatusMonitor(threading.Thread):
    """Thread that repeatedly checks the status of Tableau.

    run() enters tableau_status_loop(), which waits for the status
    request interval (or for the agent manager's check-status event)
    and then calls check_status().
    """
    # pylint: disable=too-many-instance-attributes

    # Note: If there is a value in the system table, it is
    # used instead of these defaults.
    # Default interval for checking tableau status (in seconds)
    STATUS_REQUEST_INTERVAL_DEFAULT = 30

    # Minimum amount of time that must elapse while DEGRADED
    # before sending out the DEGRADED event.
    EVENT_DEGRADED_MIN_DEFAULT = 120    # in seconds

    # Possible return values when attempting to get systeminfo:
    # Only considered a failure if the http get of the URL
    # can't accept a connection.  Otherwise, Tableau could be up.
    SYSTEMINFO_SUCCESS = 1
    SYSTEMINFO_FAIL = 2

    # Remember the url that worked with systeminfo to help determine
    # if tableau is really stopped when systeminfo fails.
    # (Class-level default; check_status_with_connection() assigns the
    # per-instance value after a successful systeminfo GET.)
    systeminfo_url_worked = None

    # Maps each TableauProcess status to the corresponding
    # StateManager state.
    statemap = {
        TableauProcess.STATUS_RUNNING: StateManager.STATE_STARTED,
        TableauProcess.STATUS_STOPPED: StateManager.STATE_STOPPED,
        TableauProcess.STATUS_DEGRADED: StateManager.STATE_DEGRADED,
        TableauProcess.STATUS_UNKNOWN: StateManager.STATE_DISCONNECTED
    }

    def __init__(self, server, manager):
        """Attach the monitor to the controller and reset the status table.

        server  -- controller object; its system table, event control,
                   upgrade read/write lock and environment id are cached
                   on the instance.
        manager -- AgentManager instance.
        """
        super(TableauStatusMonitor, self).__init__()

        self.server = server
        self.manager = manager # AgentManager instance

        # Shortcuts to the pieces of the server used by the monitor.
        self.system = server.system
        self.event = server.event_control
        self.rwlock = server.upgrade_rwlock
        self.envid = server.environment.envid

        # Bookkeeping for delaying the DEGRADED event (see _send_events).
        self.first_degraded_time = None
        self.sent_degraded_event = False

        logger.setLevel(self.system[SystemKeys.DEBUG_LEVEL])

        # Start fresh: empty the status table and commit right away.
        db_session = meta.Session()
        self.remove_all_status()
        db_session.commit()

        self.stateman = StateManager(self.server)

    # Remove all entries to get ready for new status info.
    def remove_all_status(self):
        """Delete every TableauProcess row, without committing.

        The delete is issued through meta.Session and is deliberately
        left uncommitted: the existing rows should remain available
        until the new rows have been inserted and the caller commits
        everything together.
        """

        # FIXME: Need to figure out how to do this in session.query:
        #        DELETE FROM status USING agent
        #          WHERE status.agentid = agent.agentid
        #            AND agent.domainid = self.domainid;
        #
        # This may do it:
        #
        # subq = session.query(TableauProcess).\
        #   join(Agent).\
        #   filter(Agent.domainid == self.domainid).\
        #   subquery()
        #
        # session.query(TableauProcess).\
        #   filter(TableauProcess.agentid.in_(subq)).\
        #   delete()

        all_status_rows = meta.Session.query(TableauProcess)
        all_status_rows.delete()

        # No commit here on purpose -- see the docstring.

    def _add(self, agentid, name, pid, status):
        """Merge one TableauProcess row into the current session.

        Nothing is committed here.  When updating the status table we
        do remove_all_status, then add the new status rows one by one
        before the commit, so the table is never seen empty or
        half-built by somebody checking it.
        """

        # merge() rather than add(): 'tabadmin status -v' sometimes
        # returns duplicate lines.
        meta.Session().merge(
            TableauProcess(agentid=agentid, name=name,
                           pid=pid, status=status))

    def get_tableau_status(self):
        """Return the status stored in the primary agent's 'Status' row.

        Looks up the TableauProcess row named 'Status' belonging to the
        'primary' agent of this environment.  Returns
        TableauProcess.STATUS_UNKNOWN when no such row exists.
        """
        try:
            query = meta.Session().query(TableauProcess).join(Agent)
            query = query.filter(Agent.envid == self.envid)
            query = query.filter(Agent.agent_type == 'primary')
            query = query.filter(TableauProcess.name == 'Status')
            status_row = query.one()
            return status_row.status
        except NoResultFound:
            return TableauProcess.STATUS_UNKNOWN

    def _set_main_state(self, prev_tableau_status, tableau_status, agent, body):
        """Move the main state according to the newly reported status.

        Looks up TRANSITIONS[previous state][tableau_status], updates
        the state manager when the state changes, sends the associated
        events and then stops/starts the maintenance server when the
        transition entry asks for it.

        prev_tableau_status -- previously stored status (only logged).
        tableau_status      -- status just reported for Tableau.
        agent, body         -- passed through to _send_events().
        """
        prev_state = self.stateman.get_state()

        handled_statuses = (TableauProcess.STATUS_RUNNING,
                            TableauProcess.STATUS_STOPPED,
                            TableauProcess.STATUS_DEGRADED)
        if tableau_status not in handled_statuses:
            logger.error("status-check: Unknown reported tableau_status "
                         "from tableau: %s.  prev_state: %s",
                         tableau_status, prev_state)
            return  # fixme: do something more drastic than return?

        if prev_state not in TRANSITIONS:
            logger.error("status-check: prev state unexpected: %s",
                         prev_state)
            return  # fixme: do something more drastic than return?

        # The transition entry holds the new state and the events to
        # send for this (previous state, tableau status) pair.
        new_state_info = TRANSITIONS[prev_state][tableau_status]

        # Special case (fixme)
        if prev_state == StateManager.STATE_STARTING_RESTORE:
            tversion = YmlEntry.get(self.envid, 'version.external', default='8')
            if tversion[0:1] == '8' and tversion[1:4] in ('.0.', '.1.'):
                # For pre 8.2, 'tabadmin restore' ends in RUNNING/STARTED
                # After 8.2 and v9, 'tabadmin restore' ends in STOPPED
                new_state_info = START_DICT

        logger.debug("status-check: prev_state: %s, new state info: %s, "
                     "prev_tableau_status %s, tableau_status: %s",
                     prev_state, str(new_state_info),
                     prev_tableau_status, tableau_status)

        if 'state' in new_state_info:
            if new_state_info['state'] != prev_state:
                self.stateman.update(new_state_info['state'])

        events = new_state_info['events'] if 'events' in new_state_info else []
        # A single event key is wrapped into a list.
        if type(events) == type(EventControl.INIT_STATE_STARTED):
            events = [events]

        self._send_events(events, agent, body)

        if 'maint-stop' in new_state_info:
            # Make sure the maint server(s) are stopped if tableau
            # is not stopped.  For example, the user stopped
            # tableau via 'tabadmin stop' and then restarted it with
            # 'tabadmin start' without going through the Palette UI.
            logger.debug("status-check: May stop maint server. "
                         "prev_state: %s, new state info: %s, "
                         "prev_tableau_status %s, tableau_status: %s, "
                         "maint_started: %s",
                         prev_state, str(new_state_info),
                         prev_tableau_status, tableau_status,
                         str(self.server.maint_started))

            if not self.server.maint_started:
                logger.debug("state-check: maint server not running")
                return

            self.server.maint("stop")

        if 'maint-start' in new_state_info:
            # Make sure the maint server(s) are started if tableau
            # is stopped.  For example, the user stopped
            # tableau via 'tabadmin stop' and then the controller
            # started at that point (small chance, but possible).
            # We want to set the maint_started status.
            # We assume the user wanted the maint server started,
            # but can't be sure.
            logger.debug("status-check: Will start maint server. "
                         "prev_state: %s, new state info: %s, "
                         "prev_tableau_status %s, tableau_status: %s, "
                         "maint_started: %s",
                         prev_state, str(new_state_info),
                         prev_tableau_status, tableau_status,
                         str(self.server.maint_started))

            if self.server.maint_started:
                logger.debug("state-check: maint server already running")
                return

            self.server.maint("start")

    def _send_events(self, events, agent, body):
        """Send the events according to the old and new states.

        DEGRADED-related events are held back until Tableau has had a
        chance to recover, since that's what it does: the plain
        STATE_DEGRADED event is only generated once the DEGRADED status
        has lasted for the configured minimum time
        (SystemKeys.EVENT_DEGRADED_MIN, falling back to
        EVENT_DEGRADED_MIN_DEFAULT).

        events -- list of event keys to consider (may be empty).
        agent  -- agent whose todict() supplies the event data.
        body   -- optional dict (may be None) merged into the data of
                  a STATE_DEGRADED event; agent data wins on key clashes.
        """
        if events:
            data = agent.todict()

        new_degraded_event = False
        for event in events:
            if event == EventControl.STATE_STARTED_AFTER_DEGRADED and \
                                        not self.sent_degraded_event:
                # We were degraded, and are running now, but never
                # sent out a degraded event, so don't send a
                # "no more degraded" event either.
                continue

            if event != EventControl.STATE_DEGRADED:
                self.event.gen(event, data)

                if event == EventControl.INIT_STATE_DEGRADED:
                    # If this is an "INIT_STATE_*" event, we send it,
                    # even if it is degraded.
                    # NOTE(review): the flag is cleared (not set) after
                    # sending this degraded event -- confirm intended.
                    self.sent_degraded_event = False
                continue

            new_degraded_event = True

            # Don't send the DEGRADED event until a minimum period of time
            # has elapsed with the state still DEGRADED.
            if not self.first_degraded_time:
                # Remember the time of the first DEGRADED state.
                self.first_degraded_time = time.time()
                continue

            if self.sent_degraded_event:
                # Already sent the degraded event
                continue

            try:
                event_degraded_min = self.system[SystemKeys.EVENT_DEGRADED_MIN]
            except ValueError:
                event_degraded_min = self.EVENT_DEGRADED_MIN_DEFAULT

            now = time.time()
            logger.debug("status-check: now %d, first %d, min %d, diff %d",
                         now, self.first_degraded_time,
                         event_degraded_min,
                         now - self.first_degraded_time)
            if now - self.first_degraded_time >= event_degraded_min:
                logger.debug("status-check: Sending degraded")

                # Merge body (which may be None) with the agent data.
                # Building the dict and then update()-ing it keeps the
                # original precedence (agent data overrides body) and
                # does not depend on .items() returning a list, which
                # is only true on Python 2.
                event_data = {} if body is None else dict(body)
                event_data.update(data)
                self.event.gen(event, event_data)
                self.sent_degraded_event = True

        if not new_degraded_event:
            # No DEGRADED event this round: reset the bookkeeping.
            self.first_degraded_time = None
            self.sent_degraded_event = False

    def run(self):
        """Thread entry point: run the status loop and die loudly on failure.

        SystemExit, KeyboardInterrupt and GeneratorExit propagate
        unchanged.  Any other exception is reported through a
        SYSTEM_EXCEPTION event, logged, and the whole process is then
        terminated with exit status 93.
        """
        try:
            self.tableau_status_loop()
        except (SystemExit, KeyboardInterrupt, GeneratorExit):
            raise
        except BaseException:
            edata = {
                'error': traceback_string(all_on_one_line=False),
                'version': self.server.version,
            }
            self.event.gen(EventControl.SYSTEM_EXCEPTION, edata)

            logger.error("status-check: Fatal: "
                         "Exiting tableau_status_loop on exception.")
            # pylint: disable=protected-access
            os._exit(93)

    def tableau_status_loop(self):
        """Loop forever, checking the Tableau status once per interval.

        Each iteration waits on the manager's check_status_event for up
        to the configured request interval
        (SystemKeys.STATUS_REQUEST_INTERVAL, falling back to
        STATUS_REQUEST_INTERVAL_DEFAULT), then calls check_status()
        unless the upgrade lock cannot be read-acquired without
        blocking.  The session is always rolled back and removed at the
        end of an iteration.
        """
        while True:
            logger.debug("status-check: About to timeout or "
                         "wait for a new primary to connect")
            try:
                system_key = SystemKeys.STATUS_REQUEST_INTERVAL
                request_interval = self.system[system_key]
            except ValueError:
                request_interval = self.STATUS_REQUEST_INTERVAL_DEFAULT

            new_primary = self.manager.check_status_event.wait(request_interval)

            logger.debug("status-check: new_primary: %s", new_primary)
            if new_primary:
                self.manager.clear_check_status_event()

            session = meta.Session()
            # Must be bound before the try: if read_acquire() raises,
            # the finally clause would otherwise see an unbound name
            # (first iteration) or the previous iteration's value and
            # release a lock that is not held.
            acquired = False
            try:
                # Don't do a 'tabadmin status -v' if upgrading
                acquired = self.rwlock.read_acquire(blocking=False)
                if not acquired:
                    logger.debug("status-check: Upgrading.  Won't run.")
                    continue
                self.check_status()
            finally:
                if acquired:
                    self.rwlock.read_release()
                session.rollback()
                meta.Session.remove()

    def check_status(self):
        """Run one status check against the primary agent, if possible.

        Returns without checking when there is no primary agent or when
        the agent is locked for a user action.  When the primary agent
        has no connection, the status table is emptied and committed.
        """
        logger.setLevel(self.system[SystemKeys.DEBUG_LEVEL])

        # FIXME: Tie agent to domain.
        agent = self.manager.agent_by_type(AgentManager.AGENT_TYPE_PRIMARY)
        if not agent:
            logger.debug("status-check: The primary agent is either "
                         "not connected or not enabled.")
            return

        aconn = agent.connection
        if not aconn:
            db_session = meta.Session()
            logger.debug("status-check: No primary agent currently connected.")
            self.remove_all_status()
            db_session.commit()
            return

        # Don't do a 'tabadmin status -v' if the user is doing an action.
        if not aconn.user_action_lock(blocking=False):
            logger.debug("status-check: Primary agent locked for user "
                         "action. Skipping status check.")
            return

        # The lock was only probed: release it immediately so the user
        # is not forced to delay a request until the
        # 'tabadmin status -v' is finished.
        aconn.user_action_unlock()

        self.check_status_with_connection(agent)

    def _systeminfo_parse(self, agent, systeminfo_xml):
        # pylint: disable=too-many-return-statements
        # pylint: disable=too-many-branches
        # pylint: disable=too-many-locals
        # pylint: disable=too-many-statements
        """Parse the systeminfo XML and rebuild the status rows from it.

        Every service found under <machines> is added as a status row
        for the agent matching the machine's host name; the <service>
        element supplies the overall Tableau status, stored as the
        "Status" row of `agent`.  This method does not commit; it ends
        by calling _finish_status().

        agent          -- the agent that receives the "Status" row.
        systeminfo_xml -- XML text returned by the systeminfo URL.

        Returns:
            SYSTEMINFO_SUCCESS if getting status via systeminfo is fine.

        Raises SysteminfoException (PARSE_FAILURE) if there is a
        problem with the systeminfo XML.
        """
        logger.debug("_systeminfo_parse: Received: %s", systeminfo_xml)
        try:
            root = ET.fromstring(systeminfo_xml)
        except ET.ParseError as ex:
            logger.error("_systeminfo_parse: xml parse error: '%s' from '%s':",
                         str(ex), systeminfo_xml)
            raise SysteminfoException(SysteminfoError.PARSE_FAILURE,
                                      "xml parse error: '%s'" % (str(ex)))

        if root.tag != 'systeminfo':
            logger.error("_systeminfo_parse: wrong root tag: %s", root.tag)
            raise SysteminfoException(SysteminfoError.PARSE_FAILURE,
                                      "wrong root tag: %s" % root.tag)

        session = meta.Session()
        prev_tableau_status = self.get_tableau_status()

        self.remove_all_status()

        tableau_status = None

        failed_proc_str = ""
        for child in root:
            if child.tag == 'machines':
                for machine in child:
                    if 'name' not in machine.attrib:
                        logger.error("_systeminfo_parse: missing "
                                     "'name' in machine attribute: %s",
                                     str(machine.attrib))
                        raise SysteminfoException(
                                    SysteminfoError.PARSE_FAILURE,
                                    "missing 'name' in machine attribute: %s" %
                                    str(machine.attrib))

                    host = machine.attrib['name']
                    agentid = Agent.get_agentid_from_host(self.envid, host)

                    if not agentid:
                        logger.error("_systeminfo_parse: No such"
                                     " agent host known (yet/any more?): %s",
                                     host)
                        continue

                    machine_agent = Agent.get_by_id(agentid)
                    if machine_agent:
                        machine_displayname = machine_agent.displayname
                    else:
                        machine_displayname = "Unknown"

                    for info in machine:
                        service_name = info.tag
                        if 'status' not in info.attrib:
                            logger.error("_systeminfo_parse: missing "
                                         "'status' in machine %s attrib: %s",
                                         host, str(info.attrib))
                            raise SysteminfoException(
                                    SysteminfoError.PARSE_FAILURE,
                                    "missing 'status' in machine %s "
                                    "attrib: %s" % (host, str(info.attrib)))

                        # Reset for every service.  Without this, a
                        # service lacking the 'worker' attribute would
                        # either hit an unbound name or silently reuse
                        # the previous service's port.
                        # NOTE(review): -1 is an assumed "no port
                        # reported" sentinel -- confirm against readers
                        # of the status table.
                        port = -1
                        if 'worker' in info.attrib:
                            worker_info = info.attrib['worker']
                            parts = worker_info.split(':')
                            if len(parts) == 1 or not parts[1].isdigit():
                                logger.error("_systeminfo_parse: missing "
                                             "':' or not an integer in "
                                             "machine %s for "
                                             "worker: %s", host,
                                             str(worker_info))

                                raise SysteminfoException(
                                        SysteminfoError.PARSE_FAILURE,
                                        "Missing ':' or not an integer in "
                                        "machine %s for worker: %s" %
                                        (host, str(worker_info)))
                            port = int(parts[1])

                        service_status = info.attrib['status']
                        if service_status not in ('Active', 'Passive', 'Busy',
                                                  'ReadOnly', 'ActiveSyncing'):
                            # Keep track of failed tableau processes
                            failed_proc_str += ("Machine %s: Process %s is "
                                                "%s\n") % \
                                                (machine_displayname,
                                                 service_name, service_status)

                        self._add(agentid, service_name, port, service_status)
                        logger.debug("system_info_parse: logged: "
                                     "%d, %s, %d, %s",
                                     agentid, service_name, port,
                                     service_status)
            elif child.tag == 'service':
                info = child.attrib
                if 'status' not in info:
                    logger.error("_systeminfo_parse: Missing 'status': %s",
                                 str(info))
                    raise SysteminfoException(SysteminfoError.PARSE_FAILURE,
                                              "Missing 'status': %s" % \
                                              str(info))

                # Map the reported service status onto our own constants.
                tableau_status = info['status']
                if tableau_status in ('Down', 'DecommisionedReadOnly',
                    'DecomisioningReadOnly', 'DecommissionFailedReadOnly'):
                    tableau_status = TableauProcess.STATUS_DEGRADED
                elif tableau_status in ('Active', 'Passive',
                                        'Busy', 'ReadOnly', 'ActiveSyncing'):
                    tableau_status = TableauProcess.STATUS_RUNNING
                elif tableau_status in ('StatusNotAvailable',
                                        'StatusNotAvailableSyncing'):
                    tableau_status = TableauProcess.STATUS_UNKNOWN
                else:
                    logger.error("_systeminfo_parse: Unexpected status: '%s'",
                                 tableau_status)
                    tableau_status = TableauProcess.STATUS_UNKNOWN

                # Note: The status can never be STOPPED since if Tableau
                # is stopped, then it won't respond to the systeminfo
                # GET URL.
                self._add(agent.agentid, "Status", 0, tableau_status)
            else:
                logger.error("_systeminfo_parse: Unexpected child.tag: '%s'",
                             child.tag)

        if tableau_status is None:
            # No <service> element was seen: undo the pending changes.
            logger.error("_systeminfo_parse: Tableau status not valid: %s",
                         str(systeminfo_xml))
            session.rollback()
            raise SysteminfoException(SysteminfoError.PARSE_FAILURE,
                                      "Tableau status not valid")

        if failed_proc_str:
            # Failed process(es) for the event
            body = {'info': failed_proc_str}
        else:
            body = None

        self._finish_status(agent, tableau_status, prev_tableau_status, body)
        return self.SYSTEMINFO_SUCCESS

    def _systeminfo_url(self):
        """Build the systeminfo URL from the configured server URL.

        The internal server URL is preferred, then the tableau server
        URL.  Only the scheme and port of the configured URL are kept;
        the host is always replaced with 127.0.0.1.  Eventually we need
        to look at the yml, gateway.hosts, ssl.enabled, gateway.ports,
        and decide both the URL and which host should request
        systeminfo.  (The gateway/web server may not be on the primary.)

        Returns None if no valid url is available.
        """

        configured_url = self.system[SystemKeys.TABLEAU_INTERNAL_SERVER_URL] \
                         or self.system[SystemKeys.TABLEAU_SERVER_URL]
        if not configured_url:
            logger.error("_systeminfo_get: no url configured.")
            return None

        parsed = urlparse(configured_url)
        if not parsed.scheme:
            logger.error("_systeminfo_get: Bad url: %s", configured_url)
            return None

        if not parsed.port:
            return "%s://127.0.0.1/admin/systeminfo.xml" % (parsed.scheme)

        return "%s://127.0.0.1:%d/admin/systeminfo.xml" % \
                                        (parsed.scheme, parsed.port)

    def _systeminfo_get(self, agent):
        # pylint: disable=too-many-return-statements
        """Fetch the systeminfo page through the agent connection.

        Returns:
            - The response body (the xml) on success.
        Raises SysteminfoException on error: COMM_FAILURE when the GET
        itself fails, and JSON_PARSE_FAILURE, CONNECT_FAILURE,
        COMM_TIMEDOUT, NOT_FOUND or UNEXPECTED_RESPONSE when the
        response has content type 'application/x-json'.
        """

        url = self._systeminfo_url()
        timeout_ms = self.system[SystemKeys.STATUS_SYSTEMINFO_TIMEOUT_MS]

        try:
            res = agent.connection.http_send_get(url, timeout=timeout_ms)
        except (socket.error, IOError, exc.HTTPException,
                httplib.HTTPException) as ex:
            logger.info("_systeminfo_get %s failed: %s",
                        url, str(ex))
            raise SysteminfoException(SysteminfoError.COMM_FAILURE,
                                      "_systeminfo_get %s failed: %s" %
                                      (url, str(ex)))

        content_type = res.getheader('Content-Type', '').lower()

        logger.info("GET %s, Headers: '%s'", url, str(res.getheaders()))

        if content_type != 'application/x-json':
            return res.body

        # This extended type indicates the agent generated the JSON,
        # i.e. there was an error.
        try:
            data = json.loads(res.body)
        except ValueError:
            logger.error("_systeminfo_get: Bad json returned for %s: %s",
                         url, res.body)
            raise SysteminfoException(SysteminfoError.JSON_PARSE_FAILURE,
                                      "Invalid json returned for %s: %s" %
                                      (url, res.body))

        logger.info("_systeminfo_get: get %s reported failed: %s",
                    url, data)
        if 'error' in data:
            refused = "Unable to connect to the remote server"
            if data['error'].find(refused) != -1:
                # We had the tableau URL and it wasn't answering.
                # Tableau is probably down, though could be the
                # wrong URL/port.
                raise SysteminfoException(SysteminfoError.CONNECT_FAILURE,
                                          "HTTP GET %s reported failed: %s" %
                                          (url, data['error']))

            if data['error'].find('The operation has timed out') != -1:
                msgfmt = "HTTP GET Timed out after %.1f seconds on %s"
                raise SysteminfoException(SysteminfoError.COMM_TIMEDOUT,
                                          msgfmt % (timeout_ms/1000., url))

            if 'status-code' in data and data['status-code'] == 404:
                raise SysteminfoException(SysteminfoError.NOT_FOUND,
                                          "Page not found: %s" % (url))

        # Any other agent-generated response is unexpected.
        msg = "Unexpected response error: %s" % str(data)
        raise SysteminfoException(SysteminfoError.UNEXPECTED_RESPONSE, msg)

    def _set_status_stopped(self, agent):
        """Replace all status rows with a single STOPPED "Status" row.

        Used when systeminfo is enabled in tableau but is failing now,
        in which case tableau is assumed to be stopped.
        """

        prev_tableau_status = self.get_tableau_status()
        self.remove_all_status()

        name, pid = "Status", 0
        tableau_status = TableauProcess.STATUS_STOPPED
        self._add(agent.agentid, name, pid, tableau_status)
        logger.debug("_set_status_stopped: logged: %s, %d, %s",
                     name, pid, tableau_status)

        body = {'stdout': 'systeminfo failed.  Assuming Tableau is stopped.'}
        self._finish_status(agent, tableau_status, prev_tableau_status, body)

    def _tableau_systeminfo_enabled(self):
        """Returns:
            True:   The Tableau configuration has systeminfo enabled.
            False:  The Tableau configuration for systeminfo is disabled."""

        yml_val = self.server.yml.get(
                    'wgserver.systeminfo.allow_referrer_ips',
                    default='')

        if yml_val.find('127.0.0.1') != -1 or yml_val.find('localhost') != -1:
            return True

        return False

    def check_status_with_connection(self, agent):
        """Get the Tableau status, preferring systeminfo over tabadmin.

        systeminfo is used only when a URL is available, the Tableau
        version supports it, it is enabled in the system table and the
        Tableau configuration allows it.  When systeminfo is not used,
        or fails without the failure being treated as "Tableau stopped"
        (and tabadmin is allowed), the status is obtained with
        'tabadmin status -v' instead.
        """
        tableau_systeminfo_enabled = self._tableau_systeminfo_enabled()

        data = {}

        systeminfo_url = self._systeminfo_url()

        tableau_version = YmlEntry.get(self.envid,
                                       'version.external',
                                       default='8')

        systeminfo_api_capable = self.is_systeminfo_api_capable(tableau_version)

        use_systeminfo = systeminfo_url and systeminfo_api_capable and \
                self.system[SystemKeys.STATUS_SYSTEMINFO] and \
                tableau_systeminfo_enabled
        if not use_systeminfo:
            # Get tableau status via 'tabadmin status -v' instead.
            self._get_status_tabadmin(agent)
            return

        try:
            # Returns the xml on success
            xml_result = self._systeminfo_get(agent)

            # Remember this URL worked
            self.systeminfo_url_worked = systeminfo_url

            self._systeminfo_parse(agent, xml_result)    # parse the xml

            # Send an event if appropriate
            self._systeminfo_eventit(agent, data, systeminfo_url)

            return

        except SysteminfoException as ex:
            prev_state = self.stateman.get_state()
            stopped_states = (StateManager.STATE_STOPPED,
                              StateManager.STATE_STOPPING)

            if ex.errnum == SysteminfoError.NOT_FOUND and \
                        self.systeminfo_url_worked == systeminfo_url and \
                        prev_state in stopped_states:
                # Could be the maintenance web server responding with
                # "Not found" since tableau is stopped.
                self._set_status_stopped(agent)
                logger.info("_system_info: Page not found and "
                            "it previously worked, but the state was %s "
                            "so not sending systeminfo event.", prev_state)
                return
            elif ex.errnum == SysteminfoError.CONNECT_FAILURE:
                logger.info("_system_info: failed to connect")

                # Be as confident as possible that tableau really is stopped
                # and the user didn't configure the tableau-server-url
                # wrong.  The tableau-server-url derived url has to work
                # at least once with systeminfo before the failure to
                # get systeminfo should mean that tableau is really stopped.
                if self.systeminfo_url_worked == systeminfo_url:
                    logger.error("status-check: systeminfo failed while "
                                 "enabled in tableau: assuming tableu is "
                                 "stopped.")
                    self._set_status_stopped(agent)
                    data['info'] = "Systeminfo failed to connect, but " \
                                   "connect previously worked.  " \
                                   "Assuming Tableau is stopped."

                    self._systeminfo_eventit(agent, data, systeminfo_url)
                    return

            # systeminfo didn't work, but we don't know if tableau
            # being down is the cause.  The exception could be due to
            # a parse error, bad url, timeout, etc.
            data['error'] = ex.message
            if ex.errnum == SysteminfoError.PARSE_FAILURE:
                # Add the raw XML to the error.
                data['error'] += ' XML: ' + str(xml_result)
            self._systeminfo_eventit(agent, data, systeminfo_url)

            if self.system[SystemKeys.STATUS_SYSTEMINFO_ONLY]:
                logger.info("systeminfo failed but not allowed to use "
                            "tabadmin status -v")
                self._set_status_unknown(agent,
                            "systeminfo failed but configured to not "
                            "allow use of tabadmin status -v")
                return

        # systeminfo failed and tabadmin is allowed: fall back to
        # 'tabadmin status -v'.
        self._get_status_tabadmin(agent)
        return

    def is_systeminfo_api_capable(self, tableau_version):
        """Tell whether /admin/systeminfo.xml can be used for this version.

        The API is considered available when the major version (the
        text before the first '.') is 9 or higher.  Returns False, and
        logs an error, when the major version is not an integer.
        """
        major_text = tableau_version.split(".")[0]
        try:
            major_version = int(major_text)
        except ValueError:
            logger.error("Failed to parse major version number "
                         "from Tableau version: '%s'", tableau_version)
            return False

        return major_version >= 9

    def _set_status_unknown(self, agent, body):
        """Replace all status rows with a single UNKNOWN "Status" row.

        `body` is handed on unchanged to _finish_status().
        """

        previous_status = self.get_tableau_status()
        self.remove_all_status()

        unknown = TableauProcess.STATUS_UNKNOWN
        self._add(agent.agentid, "Status", 0, unknown)
        self._finish_status(agent, unknown, previous_status, body)

    def _systeminfo_eventit(self, agent, data, systeminfo_url):
        """Send the systeminfo okay/failed event when the outcome changed.

        The "systeminfo" notification remembers the last outcome
        (color 'green' or 'red') and the URL it applied to.  An OKAY
        event is considered only when the previous outcome was 'red'; a
        FAILED event only when the previous outcome was not 'red' or
        the URL has changed.  Events are generated only if
        SystemKeys.STATUS_SYSTEMINFO_SEND_ALERTS is set; the
        notification is updated and committed either way.
        """

        notification = self.server.notifications.get("systeminfo")

        if success(data):
            if notification.color != 'red':
                # Nothing to recover from.
                return
            adata = agent.todict()
            if 'info' in data:
                adata['info'] = data['info']
            event_key = EventControl.SYSTEMINFO_OKAY
            new_color = 'green'
        else:
            # Failed
            if notification.color == 'red' and \
                                notification.description == systeminfo_url:
                # Already reported a failure for this very URL.
                return
            # If the systeminfo_url has changed, then tell them this
            # one didn't work (either).  We can potentially send
            # multiple of these events if they keep entering bad
            # URLs.
            adata = agent.todict()
            adata['error'] = data['error']
            event_key = EventControl.SYSTEMINFO_FAILED
            new_color = 'red'

        if self.system[SystemKeys.STATUS_SYSTEMINFO_SEND_ALERTS]:
            self.event.gen(event_key, adata)

        notification.modification_time = func.now()
        notification.color = new_color
        notification.description = systeminfo_url
        meta.Session.commit()

    def _get_status_tabadmin(self, agent):
        # pylint: disable=too-many-locals
        # pylint: disable=too-many-branches
        # pylint: disable=too-many-statements

        """Try to get tableau status the old-fashioned way via
            'tabadmin status -v'.

        Runs the status command through self.server.status_cmd(agent),
        removes the existing status entries, adds one entry per parsed
        output line and passes the overall status to _finish_status().

        Returns without touching the status table when the command result
        has an 'error', a missing or non-zero 'exit-status', or no
        'stdout'.  If no overall status was found in the output, the
        session is rolled back instead.
        """

        agentid = agent.agentid

        body = self.server.status_cmd(agent)
        if 'error' in body:
            logger.error("status-check: Error from tabadmin status command: %s",
                         str(body))
            return

        if 'exit-status' not in body:
            logger.error("status-check: Missing exit-status from "
                         "tabadmin status command: %s", str(body))
            return

        if body['exit-status']:
            logger.error("status-check: Failed exit status: %d for "
                         "tabadmin status command: %s",
                         body['exit-status'], str(body))
            return

        if 'stdout' not in body:
            # fixme: Probably update the status table to say
            # something's wrong.
            logger.error("status-check: No output received for "
                         "status monitor. body: %s", str(body))
            return

        lines = body['stdout'].split('\n')

        session = meta.Session()

        prev_tableau_status = self.get_tableau_status()

        self.remove_all_status()
        # Do not commit until after the table is added to.
        # Otherwise, the table could be empty temporarily.

        # Built once, outside the loop.  Matches for example:
        #   'Tableau Server Repository Database' (1764) is running.
        service_re = re.compile(r"'Tableau Server (?P<service>.*)'"
                                r"\s(\((?P<pid>[0-9]*)\))?(status)?\s?"
                                r"is\s(?P<status>.*)\.")

        def displayname_of(machine_agentid):
            """Return the agent's displayname, or "Unknown" if not found."""
            machine_agent = Agent.get_by_id(machine_agentid)
            if machine_agent:
                return machine_agent.displayname
            return "Unknown"

        tableau_status = None
        failed_procs = []
        machine_displayname = displayname_of(agentid)

        for line in lines:
            line = line.strip()
            parts = line.split(' ')

            # Slicing (rather than indexing parts[1]) keeps a line with
            # fewer than two words from raising IndexError.
            if parts[:2] == ["'Tableau", 'Server']:
                if not agentid:
                    # FIXME: log error
                    continue

                match = service_re.search(line)
                if not match:
                    logger.debug("status-check: unmatched line: %s", line)
                    continue

                service = match.group('service')        # "Repository"
                if not service:
                    logger.debug("status-check: empty service in "
                                 "line: %s", line)
                    continue

                pid_str = match.group('pid')   # "1764"
                if pid_str:
                    try:
                        pid = int(pid_str)
                    except ValueError:
                        logger.error("status-check: Bad PID: %s", pid_str)
                        continue
                else:
                    # No pid in the line.
                    pid = -2

                status = match.group('status') # "running" or "running..."
                if not status:
                    logger.debug("status-check: empty 'status' "
                                 "in line: %s", line)
                    continue

                self._add(agentid, service, pid, status)
                logger.debug("status-check: logged: %s, %d, %s", service,
                             pid, status)

                if 'running' not in status:
                    # Keep track of failed tableau processes
                    failed_procs.append("Machine %s: Process %s is %s\n" %
                                        (machine_displayname, service,
                                         status))
            elif parts[0] == 'Status:':
                if len(parts) < 2:
                    logger.error("status-check: Status line without a "
                                 "value: %s", line)
                    continue

                server_status = parts[1].strip()
                if agentid:
                    self._add(agentid, "Status", 0, server_status)
                    if tableau_status is None or server_status == 'DEGRADED':
                        tableau_status = server_status
                else:
                    # FIXME: log error
                    pass
            elif line.endswith(':'):
                # A hostname or IP address is specified: new section
                host = parts[0].strip().replace(':', '')
                agentid = Agent.get_agentid_from_host(self.envid, host)
                machine_displayname = displayname_of(agentid)
            elif not agentid:
                # Example:
                #   "Connection error contacting worker 1"
                logger.debug("status-check: Can't log due to "
                             "unknown or disabled agent: %s, %d, %s",
                             line, -1, 'error')
            else:
                self._add(agentid, line, -1, 'error')
                logger.debug("status-check: logged: %s, %d, %s",
                             line, -1, 'error')

        if tableau_status is None:
            logger.error("status-check: Tableau status not valid: %s",
                         str(lines))
            session.rollback()
            return

        if failed_procs:
            # Failed process(es) for the event
            body['info'] = "".join(failed_procs)

        self._finish_status(agent, tableau_status, prev_tableau_status, body)

    def _finish_status(self, agent, tableau_status, prev_tableau_status, body):
        """Set the main state from the new tableau status and commit.

        The update happens only if the agent connection's user action lock
        can be taken without blocking.  Otherwise the session is rolled
        back and nothing is updated.

        Once taken, the lock is released even if setting the state or
        committing raises; the exception still propagates to the caller.
        """

        aconn = agent.connection
        if not aconn.user_action_lock(blocking=False):
            # If the user_action_lock is taken, that thread should
            # control the state.  We don't update the tableau process
            # table since state should be consistent with tableau process
            # status.
            logger.debug(
                "status-check: Primary agent locked for user action "
                "after tabadmin status finished.  "
                "Will not update state or tableau status.")
            meta.Session.rollback()
            return

        try:
            logger.debug("status-check: Logging main status: %s",
                         tableau_status)
            self._set_main_state(prev_tableau_status, tableau_status,
                                 agent, body)
            meta.Session.commit()
        finally:
            # Without this, an exception above would leave the lock held.
            aconn.user_action_unlock()
Example #2
0
 def set_state(self, state="index", meta=None):
     """Fetch the state object into self.state and record the user state.

     state -- name of the state; defaults to "index".
     meta  -- dict passed along to both StateManager calls; a new empty
              dict is used when omitted.
     """
     # A `{}` default would be one dict shared by every call.
     if meta is None:
         meta = {}
     self.state = StateManager.get_state(self.from_user_name, self.to_user_name, state, meta)
     StateManager.set_user_state(self.to_user_name, state, meta)
Example #3
0
 def set_state(self, state="index", meta=None):
     """Fetch the state object into self.state and record the user state.

     state -- name of the state; defaults to "index".
     meta  -- dict passed along to both StateManager calls; a new empty
              dict is used when omitted.
     """
     # A `{}` default would be one dict shared by every call.
     if meta is None:
         meta = {}
     self.state = StateManager.get_state(self.from_user_name,
                                         self.to_user_name, state, meta)
     StateManager.set_user_state(self.to_user_name, state, meta)