Beispiel #1
0
 def rmBindMount(self, sid):
     '''
     Lazily unmounts and deletes every bind mount point referring to the
     server with the given SID, across all Fcombine users' home directories.
     '''
     log(3, "performing rmbindMount(sid) against all BMP's for sid '%s'" % sid)
     # Resolve the SID to its server name; bail out if the DB has no such server.
     try:
         serverName = xsftp.webui.models.Server.objects.get(id=sid).server_name
     except xsftp.webui.models.Server.DoesNotExist:
         log(3, "Found an unrecognised SID (%s) in the SLAMMount directory, so can't unBindMount it - skipping." % sid)
         return
     # Walk each user's live bind mount points looking for this server's name.
     liveBindMountPointsDict = self.getLiveBindMountPoints()
     for user in liveBindMountPointsDict.keys():
         if serverName not in liveBindMountPointsDict[user]:
             continue
         bmpPath = "/home/%s/xsftp/%s" % (user, serverName)
         # Lazy-unmount repeatedly until umount fails: stacked bind mounts
         # need one umount call each.
         unmountCmd = "umount -l %s > /dev/null 2>&1" % bmpPath
         while os.system(unmountCmd) == 0:
             pass
         # Remove the placeholder file; it may legitimately be absent already.
         try:
             os.remove("%s/where_are_my_files.txt" % bmpPath)
         except OSError:
             pass
         # Finally remove the (now empty) mount point directory itself.
         os.system("rmdir %s > /dev/null 2>&1" % bmpPath)
     return
Beispiel #2
0
    def initSLAMMountPoints(self):
        '''
        Creates and removes (cleans up) the SLAM mount point directories in
        xsftp.common.constants.SERVERDIR based on data from the DB.
        '''
        # Expected SMPs come from the DB; live SMPs come from the filesystem.
        expectedSLAMMountPoints = self.getExpectedSLAMMountPoints()
        log(4, "Expected SLAM Mount Points are ... %s" % expectedSLAMMountPoints)
        liveSLAMMountPoints = self.getLiveSLAMMountPoints()
        log(4, "Live SMPs are ... %s" % [int(i) for i in liveSLAMMountPoints])
        # Live SIDs are strings, expected ones may not be - normalise once.
        expectedSids = [str(x) for x in expectedSLAMMountPoints]
        # Create a directory for every expected SID with no live mount point.
        for sid in expectedSids:
            if sid in liveSLAMMountPoints:
                continue
            newDir = xsftp.common.constants.SERVERDIR + sid
            os.system("mkdir " + newDir + " > /dev/null 2>&1")
            # Drop in the placeholder explaining where the user's files went.
            shutil.copy("%setc/xsftp/where_are_my_files.txt" % xsftp.common.constants.APPDIR, newDir)
        # Tear down any live mount point whose SID is no longer expected.
        for sid in liveSLAMMountPoints:
            if sid in expectedSids:
                continue
            # First remove this server's bind mount points (lazy unmounts).
            self.bmp_manager.rmBindMount(sid)
            smpDir = xsftp.common.constants.SERVERDIR + sid
            # Lazily unmount the SMP until umount fails (handles stacked mounts).
            unmountCmd = "umount -l " + smpDir + " > /dev/null 2>&1"
            while os.system(unmountCmd) == 0:
                pass
            # Delete the placeholder file and the now-empty mount point dir.
            os.system("rm -f " + smpDir + "/where_are_my_files.txt; rmdir " + smpDir + " > /dev/null 2>&1")
        return
Beispiel #3
0
 def doBindMount(self, bmpAbsPath):
     '''
     Attempts to bind-mount the server referenced by the specified bmpAbsPath.
     If the server is already bind mounted for that user, returns successfully.
     Mount parameters come from the server's record in the Django DB.
     '''
     # bmpAbsPath looks like /home/<user>/xsftp/<server_name>
     pathParts = bmpAbsPath.split("/")
     name = pathParts[-1]
     user = pathParts[2]
     # Nothing to do if this user already has this server bind mounted.
     liveBindMounts = self.getLiveBindMounts()
     if user in liveBindMounts and name in liveBindMounts[user]:
         log (4, "BMP %s is already bind mounted, skipping doBindMount." % bmpAbsPath)
         return
     # Resolve the server name to its SID and derive the SMP source path.
     sid = xsftp.webui.models.Server.objects.get(server_name=name).id
     smbAbsPath = "%s%s" % (xsftp.common.constants.SERVERDIR, sid)
     log(4, "Bind mounting %s to %s" % (smbAbsPath, bmpAbsPath))
     # Perform the bind mount, discarding any shell output.
     os.system("mount --bind %s %s > /dev/null 2>&1" % (smbAbsPath, bmpAbsPath))
     return
    def sendEmailAlert(self, sid, state, time_first_seen_in_new_state):
        '''
        Sends an email to everyone in  the global serverlink_alert_groups about the specified server-link's health problem.
        sid = sid of server which is unhealthy (int)
        state = int
        time = time first seen in this state (secs since epoc)
        '''
        recipients = []
        recipient_groups = xsftp.webui.models.Configuration.objects.all()[0].serverlink_alert_groups.all()
        for group in recipient_groups:
            for user in group.users.all():
                if user not in recipients:
                    recipients.append(user)
        email_addresses = [user.email for user in recipients if user.email]
        server_link = xsftp.webui.models.Server.objects.get(id=sid)
        server_link_name = server_link.server_name
        device_name = xsftp.webui.models.Configuration.objects.all()[0].device_name
        if not email_addresses:
            log(1, "Could not send Server Link Health warning email for Server '%s': No 'Server Link Health Global Alert Groups' have been specified." % server_link_name)
        # instantiate a new Server object, set its state, then extract its html details for that state.
        if server_link.status != state:
            server_link.status = state
        # generate text details, by converting the html healthstrings to text for email rendering.
        myWriter = formatter.DumbWriter()
        myFormatter = formatter.AbstractFormatter(myWriter)
        p = EmailTextParser(myFormatter)
        p.feed(server_link.healthStrings())
        # remove tab characters
        details = p.data.replace('\t','')
        # remove blank lines
        details = "\n".join([line for line in details.split("\n") if line != ''])
        p.close()
        #details = server_link.healthStrings()
        # generate time string
        total_seconds = int(time.time() - time_first_seen_in_new_state)
        days = total_seconds / 86400
        hours = total_seconds % 86400 / 3600
        minutes = total_seconds % 86400 % 3600 / 60
        seconds = total_seconds % 86400 % 3600 % 60
        time_string = "%s days, %s hours, %s minutes, %s seconds" % (days, hours, minutes, seconds)
        message = '''
    This is an automatic message from the Fcombine Device: %(device_name)s

    The Server Link '%(server_link_name)s' has been in unhealthy state %(state)s for %(time_string)s.

    arning - Jobs and Users may not be able to utilise this Server Link until it is repaired. See details below for help on remediating this issue.

    Details are:

    %(details)s
    ''' % {"device_name":device_name, "server_link_name":server_link_name, "state":state, "time_string":time_string, "details":details}
        try:
            email.send_email(subject="Fcombine Server Link Health warning for Server '%s'" % server_link_name, body=message, to=email_addresses)
        except Email_Error, e:
            log(1, "Could not send Server Link Health warning email for Server '%s': %s" % (server_link_name, e))
    def setStatus(self, bmpAbsPath, state):
        """
        Sets status values of a specified BMP in the serverStatusDict dictionary, which is of the form:
        {BMP : ('sid', state, timeSinceEpocFirstSeenInCurrentState, timeSinceEpocLastSeenHealthy)}

        Takes in two arguments:
            bmpAbsPath (string) is the BMP's absolute path as a string
            state (int) is the state value to assign to the specified BMP

        * If the BMP has never been tracked before, its timeSinceEpocLastSeenHealthy
          value is initialised to the current time (the moment it is first recorded).
        """
        # Step 1: under the lock, read the previous timeSinceLastHealthy, or
        # default it to "now" for a BMP that has no entry yet.
        self.shared_vars.serverStatusDictLock.acquire()
        # if BMP has no entry in the serverStatusDict, set timeSinceLastHealthy value to the current time.
        if not self.shared_vars.serverStatusDict.has_key(bmpAbsPath):
            timeSinceLastHealthy = int(time.time())
        else:
            # otherwise carry forward the previously recorded value.
            timeSinceLastHealthy = self.shared_vars.serverStatusDict[bmpAbsPath][3]
        self.shared_vars.serverStatusDictLock.release()
        # NOTE(review): the lock is dropped here and re-acquired below, so a
        # concurrent writer could update this BMP's entry in between -
        # presumably tolerable for this daemon, but confirm.
        # A healthy state (0) always refreshes timeSinceLastHealthy to now.
        if state == 0:
            # set the timeSinceLastHealthy to now:
            timeSinceLastHealthy = int(time.time())
        # Step 2: under the lock again, write the new status tuple back.
        self.shared_vars.serverStatusDictLock.acquire()
        # if dict does not yet contain status for this bmp, or status has changed
        if (
            not self.shared_vars.serverStatusDict.has_key(bmpAbsPath)
            or self.shared_vars.serverStatusDict[bmpAbsPath][1] != state
        ):
            # we have detected a status change, set timeFirstSeenInCurrentState
            timeFirstSeenInCurrentState = int(time.time())
            log(3, "State Change: Found " + bmpAbsPath + " in state " + str(state))
            # (state == 0) indexes the two-element list: False -> "unhealthy", True -> "healthy".
            log(
                2,
                "State Change: Found Server Link '"
                + os.path.basename(bmpAbsPath)
                + "' in %s state " % ["unhealthy", "healthy"][state == 0]
                + str(state),
            )
        else:
            # preserve timeFirstSeenInCurrentState
            timeFirstSeenInCurrentState = self.shared_vars.serverStatusDict[bmpAbsPath][2]
        # save the values; the BMP path's basename is the server's name in the DB.
        # NOTE(review): this Django query runs while serverStatusDictLock is
        # held - a slow DB would stall every thread contending on that lock.
        sid = xsftp.webui.models.Server.objects.get(server_name=bmpAbsPath.split("/")[-1]).id
        self.shared_vars.serverStatusDict[bmpAbsPath] = (sid, state, timeFirstSeenInCurrentState, timeSinceLastHealthy)
        self.shared_vars.serverStatusDictLock.release()
        # if state is healthy, remove this bmp's entry in the global alertTracker dict
        self.shared_vars.alertTrackerLock.acquire()
        if state == 0 and sid in self.shared_vars.alertTracker:
            self.shared_vars.alertTracker.pop(sid)
        self.shared_vars.alertTrackerLock.release()
Beispiel #6
0
 def unBindMount(self, bmpAbsPath):
     '''
     Performs a lazy unmount on specified bmpAbsPath
     '''
     log(4,"Doing unBindMount('%s')..." % bmpAbsPath)
     # The BMP path's last component is the server name; skip if that server
     # is not currently bind mounted at all.
     serverName = bmpAbsPath.split("/")[-1]
     if serverName not in self.getLiveBindMountList():
         log (5, "BMP %s is not bind mounted, skipping unBindMount." % bmpAbsPath)
         return
     # Lazy unmount so a busy filesystem detaches once no longer in use.
     os.system("umount -l %s > /dev/null 2>&1" % bmpAbsPath)
     return
 def get_key_fingerprint(self, address, port, write_log=True):
     '''
     Returns the hex-encoded ssh-rsa fingerprint recorded for the given
     address/port in the known_hosts file, or None if it cannot be found.
     '''
     # known_hosts records non-standard ports under the key "[address]:port".
     if str(port) == '22':
         kh_key = address
     else:
         kh_key = "[%s]:%s" % (address, port)
     fingerPrint = None
     try:
         host_keys = paramiko.util.load_host_keys(os.path.expanduser(xsftp.common.constants.KNOWN_HOSTS_FILE))
         fingerPrint = hexlify(host_keys[kh_key]['ssh-rsa'].get_fingerprint())
     except IOError:
         # the known_hosts file is missing entirely.
         if write_log:
             log(4, "fingerprint check for endpoint server at address '%s' failed: Known hosts file does not exist!" % address)
     except KeyError:
         # the host (or its ssh-rsa key) is not recorded yet.
         if write_log:
             log(4, "fingerprint check for endpoint server at address '%s' failed: Address not found in known hosts file." % address)
     return fingerPrint
Beispiel #8
0
 def initAllBindMounts(self):
     '''
     Bind mounts every BMP which is not already bind mounted.
     Assumes all required BMP directories already exist.
     '''
     liveBindMountPointDict = self.getLiveBindMountPoints()
     # Walk every user's bind mount points and (re)mount each one;
     # doBindMount() is a no-op for BMPs that are already mounted.
     for user in liveBindMountPointDict.keys():
         for bindMountPoint in liveBindMountPointDict[user]:
             bmpAbsPath = "/home/%s/xsftp/%s" % (user, bindMountPoint)
             log(4, "Bindmounting %s" % (bmpAbsPath))
             self.doBindMount(bmpAbsPath)
     return
    def run(self):
        '''
        Perpetual connector-worker loop. Each pass inspects every tracked BMP's
        status tuple and:
          (a) spawns a RemediatorWorkerThread for any BMP unhealthy longer than
              config.REPAIR_DELAY (unless a check/repair is already in flight);
          (b) fires the email alert subsystem for any BMP unhealthy longer than
              config.ALERT_DELAY.
        '''
        while True:
            # snapshot the shared status dict under its lock, then work from
            # the copy so the lock is held only briefly.
            self.shared_vars.serverStatusDictLock.acquire()
            currentServerStatusDict = self.shared_vars.serverStatusDict.copy()
            # get current bmp's
            self.shared_vars.serverStatusDictLock.release()
            bmpList = currentServerStatusDict.keys()
            # status tuple layout (see setStatus):
            # (sid, state, timeFirstSeenInCurrentState, timeSinceLastHealthy)
            for bmp in bmpList:
                # log its state if not healthy (state 0 means healthy)
                if currentServerStatusDict[bmp][1]:
                    log(5, "connectorWorkerThread %s reports: - BMP %s has been in STATE %s for %s seconds." % (self.getName(), bmp, currentServerStatusDict[bmp][1], (int(time.time()) - currentServerStatusDict[bmp][2])))
                    log(3, "Server Link '%s' has been in unhealthy state %s for %s seconds" %( os.path.basename(bmp), currentServerStatusDict[bmp][1], (int(time.time()) - currentServerStatusDict[bmp][2])))
                # if server is unhealthy for over config.REPAIR_DELAY seconds,
                if currentServerStatusDict[bmp][1] != 0 and (int(time.time()) - currentServerStatusDict[bmp][3]) > config.REPAIR_DELAY:
                    # first, log how long this bmp has been in its current unhealthy state, and how long it has been since it was last healthy.
                    log(3, "BMP Requires repair: BMP %s has been in current unhealthy state %s for over %s seconds, and has been unhealthy for %s seconds." % (bmp, currentServerStatusDict[bmp][1], (int(time.time()) - currentServerStatusDict[bmp][2]), (int(time.time()) - currentServerStatusDict[bmp][3])) )
                    # don't repair a BMP whose status is currently being determined
                    self.shared_vars.statChecksInProgressLock.acquire()
                    if bmp in self.shared_vars.statChecksInProgress:
                        self.shared_vars.statChecksInProgressLock.release()
                        log(3, "Not spawning a repair for BMP %s - Reason: BMP is currently having its status checked" % bmp)
                        continue
                    self.shared_vars.statChecksInProgressLock.release()
                    # Check if this allegedly damaged BMP is being or has been worked on:
                    self.shared_vars.serverRepairInProgressLock.acquire()
                    if self.shared_vars.serverRepairInProgress.count((bmp, False)):
                        # this BMP is currently being repaired, so do not spawn another remediator thread for it.
                        log(3, "Not spawning a repair for BMP %s - Reason: Repair job already underway..." % bmp)
                    elif self.shared_vars.serverRepairInProgress.count((bmp, True)):
                        # an attempt at repairing this BMP has been completed, waiting for the statusWorkerThread to check its status
                        log(3, "Not spawning a repair for BMP %s - Reason: Repaired but check still pending..." % bmp)
                    else:
                        # add this bmp to the remediation job queue and spawn a remediator thread for it
                        # The job queue contains a tuple, the 1st value is the bmp to be repaired, and the second is a bool which is False if not yet repaired.
                        # Once repaired, the remediator thread will change this value to true, then a statWorkerThread will remove jobs from job queue which have 2nd value == True.
                        self.shared_vars.serverRepairInProgress.append((bmp, False))
                        remediatorThread = RemediatorWorkerThread(self.shared_vars, self.slam_manager, self.bmp_manager, bmp)
                        remediatorThread.start()
                        log(3, "BMP %s added to repair queue." % bmp)
                        log(3, "Attempting repair of Server Link: %s" % os.path.basename(bmp))
                    self.shared_vars.serverRepairInProgressLock.release()
                # if server has been unhealthy for config.ALERT_DELAY, then pass to alert subsystem
                # NOTE(review): the comparison is in seconds while the log text
                # below reports minutes - confirm config.ALERT_DELAY's units.
                if currentServerStatusDict[bmp][1] != 0 and (int(time.time()) - currentServerStatusDict[bmp][3]) > config.ALERT_DELAY:
                    # alert the specified people of a problem
                    log(3, "BMP %s has been unhealthy for over %s mins. Activating alert subsystem" % ( bmp, ((int(time.time()) - currentServerStatusDict[bmp][3]))/60 ) )
                    log(2, "Server link %s has been unhealthy for over %s mins. Activating alert subsystem" % ( os.path.basename(bmp), ((int(time.time()) - currentServerStatusDict[bmp][3]))/60 ) )
                    # fire up the alerting subsystem
                    self.raiseEmailAlert(bmp)
            time.sleep(self.CWT_SLEEP)
 def run(self):
     status = -999  # catch all, but it will get overwritten.
     # get the server object referred to in the specified bmpAbsPath
     server = xsftp.webui.models.Server.objects.get(server_name=self.bmpAbsPath.split("/")[-1])
     # derive the smpAbsPath of the specified bmpAbsPath
     smpAbsPath = xsftp.common.constants.SERVERDIR + str(server.id)
     try:
         # stat the bmp to see how it is
         bmpStat = os.stat(self.bmpAbsPath)  # returns a stat object
         # if we get here without invoking an exception ...
         # stat the associated smpAbsPath to see how it is
         smpStat = os.stat(smpAbsPath)
         # stat the root partition (this should never fail, but catch it in any case
         rootStat = os.stat(xsftp.common.constants.SERVERDIR)
         # If we get here, then all stat commands have completed SUCCESSFULLY
         # Now do some comparisons
         # if the bmp device is the same as the smp device, and different to the root device:
         if bmpStat.st_dev == smpStat.st_dev != rootStat.st_dev:
             # everything is OK
             status = self.MPSTATE_OK
         # elif the bmp device is the same as the smp device and the same as the root device:
         elif bmpStat.st_dev == smpStat.st_dev == rootStat.st_dev:
             # the SLAM mount does not exist. Try find out why.
             s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
             s.settimeout(self.STWT_SOCKET_TIMEOUT)
             if server.type == "sftp":
                 log(4, "Testing SFTP-specific health for BMP: %s" % self.bmpAbsPath)
                 s.connect(
                     (server.address, server.port)
                 )  # this can throw several errors that we handle below, which divulge status info
                 t = paramiko.Transport(s)
                 t.connect()  # this will throw if something other than an SSH daemon is listening on the remote port (banner error) because it happens pre-auth.
                 remoteFingerPrint = t.get_remote_server_key().get_fingerprint()
                 try:
                     localFingerPrint = paramiko.util.load_host_keys(
                         os.path.expanduser(xsftp.common.constants.KNOWN_HOSTS_FILE)
                     )[server.address][
                         "ssh-rsa"
                     ].get_fingerprint()  # raises IOError if file doesnt exist or KeyError if server not a known_host yet
                 except:
                     localFingerPrint = None
                 # if keys dont match...
                 if localFingerPrint and not localFingerPrint == remoteFingerPrint:
                     # The remote fingerprint has changed! could be man-in-the-middle, etc.
                     status = self.MPSTATE_KEY_MISMATCH
                 else:
                     # the fingerprints are fine, continue the checks
                     key = paramiko.DSSKey.from_private_key_file(
                         str(config.KEYFILE)
                     )  # could raise "IOError: [Errno 2] No such file or directory:" in event of missing key
                     t.auth_publickey(server.remote_user, key)  # can throw several errors which divulge status info
                     # establish a client session to the endpoint to stat the specified remote_dir and ensure it exists
                     c = t.open_sftp_client()
                     st = c.stat(server.remote_path)  # this raises IOError: [Errno 2] No such file on error
                     # if the specified remote path isnt a directory
                     if not stat.S_ISDIR(st.st_mode):
                         # then the specified remote path is invalid
                         status = self.MPSTATE_BAD_REMOTE_PATH
                     else:
                         # if we get this far, then there is nothing wrong with the SSH layer and below (physical, etc), so to remediate we can initialise the SMP.
                         status = self.MPSTATE_SM_BROKEN
             elif server.type == "cifs":
                 log(4, "Testing CIFS-specific health for BMP: %s" % self.bmpAbsPath)
                 s.connect((server.address, server.cifs_port))
                 # if we get here, the target is listening and allowing connections on the specified port.
                 # test for CIFS related errors
                 s = SMBClient.SMBClient(
                     server.address,
                     server.cifs_port,
                     server.cifs_share,
                     username=server.remote_user,
                     password=server.cifs_password,
                 )
                 if server.remote_path and not s.is_dir(str(server.remote_path)):
                     status = self.MPSTATE_BAD_REMOTE_PATH
                 else:
                     # if we get here, the cifs stuff looks good, set status to the CIFS catch all
                     status = self.MPSTATE_CIFS_ERROR
                 s.close()
             elif server.type == "ftp":
                 log(4, "Testing FTP-specific health for BMP: %s" % self.bmpAbsPath)
                 f = FTPClient.FTP(
                     server.address,
                     port=server.ftp_port,
                     passive=server.ftp_passive,
                     user=str(server.remote_user),
                     passwd=str(server.ftp_password),
                     ssl=server.ftp_ssl,
                     ssl_implicit=server.ftp_ssl_implicit,
                 )
                 f.login()
                 f.retrlines("LIST", callback=lambda msg: None)
                 f.cwd(str(server.remote_path))
         # elif the bmp device is different to the smp device which is in turn different to the root device
         elif bmpStat.st_dev != smpStat.st_dev != rootStat.st_dev:
             # The SSHFS mount is correct, and the bindmount isn't
             status = self.MPSTATE_BM_BROKEN
         # elif the bmp device is different to the smp device which is in turn the same as the root device
         elif bmpStat.st_dev != smpStat.st_dev == rootStat.st_dev:
             # both the BM and SM are broken
             status = self.MPSTATE_BM_AND_SM_BROKEN
         else:
             # The catch-the-rest
             log(1, "Unexpected Server Link error, resetting Server Link %s:'%s'." % (server.server_name, server.id))
             status = self.MPSTATE_ERROR1
     # Catch exceptions for the above checks
     except FTPClient.error_wrong_service, e:
         status = self.MPSTATE_WRONG_SERVICE
     status = self.MPSTATE_FTP_DATA_CHANNEL_ERROR
 except FTPClient.error_bad_credentials, e:
     status = self.MPSTATE_AUTH_FAILED
 except FTPClient.error_ftps_not_supported, e:
     status = self.MPSTATE_FTP_FTPS_NOT_SUPPORTED
 except FTPClient.error_ftpes_not_supported, e:
     status = self.MPSTATE_FTP_FTPES_NOT_SUPPORTED
 except FTPClient.error_bad_remote_path, e:
     status = self.MPSTATE_BAD_REMOTE_PATH
 except FTPClient.error_ftpes_required, e:
     status = self.MPSTATE_FTP_FTPES_REQUIRED
 except FTPClient.Error, FTPClientExceptionText:
     status = self.MPSTATE_FTP_ERROR
     log(
         2,
         "Server Link '%s' (type FTP) in unhealthy state MPSTATE_FTP_ERROR, error message is: %s"
         % (server.server_name, FTPClientExceptionText),
     )
 except SMBClient.SMBClientException, e:
     e = str(e)
     if e == "bad share name":
         status = self.MPSTATE_CIFS_BAD_SHARE_NAME
     elif e == "bad credentials":
         status = self.MPSTATE_AUTH_FAILED
     elif e == "wrong service":
         status = self.MPSTATE_WRONG_SERVICE
     else:
         log(1, "CIFS health error: %s" % e)
         status = self.MPSTATE_CIFS_ERROR
 except socket.gaierror, e:
     if e[0] == -2:
    def run(self):
        '''
        Remediator-worker entry point: looks up the current state of
        self.bmpAbsPath in the shared status dict and applies the repair
        strategy appropriate to that state, then marks the repair job complete.
        '''
        # get the current state of this BMP (read the shared dict under its lock)
        self.shared_vars.serverStatusDictLock.acquire()
        currentServerStatus = self.shared_vars.serverStatusDict[self.bmpAbsPath]
        self.shared_vars.serverStatusDictLock.release()
        # tuple layout is (sid, state, timeFirstSeenInCurrentState, timeSinceLastHealthy)
        state = currentServerStatus[1]
        # setup a shorthand reference to the statWorkerThread class for state name references below
        swt = StatWorkerThread
        # attempt to remediate this server based on its state
        # NOTE(review): returning here skips the job-queue cleanup at the
        # bottom of this method, leaving the (bmp, False) entry in
        # serverRepairInProgress - confirm something else purges it.
        if state == swt.MPSTATE_OK:
            # this bmp is in state 0: Healthy. Do nothing, purge this job from the self.shared_vars.serverRepairInProgress global job queue and return
            # this condition should never happen, but process it just incase it does - maybe a bmp transitions from unhealthy to healthy in the split second it takes for this thread to fire up for example...
            log(3, "Finished repair attempt: BMP=%s State=0:MPSTATE_OK (nothing to do, as it was healthy on arrival)" % self.bmpAbsPath)
            return
        elif state in [ swt.MPSTATE_BM_BROKEN,
                        swt.MPSTATE_SM_BROKEN,
                        swt.MPSTATE_BM_AND_SM_BROKEN,
                        swt.MPSTATE_BM_UNREATTACHED,
                        swt.MPSTATE_NO_ROUTE_TO_HOST,
                        swt.MPSTATE_CONNECTION_REFUSED,
                        swt.MPSTATE_CONNECTION_TIMEOUT,
                        swt.MPSTATE_KEY_MISMATCH,
                        swt.MPSTATE_KEYFILE_MISSING,
                        swt.MPSTATE_WRONG_SERVICE,
                        swt.MPSTATE_PUBLIC_KEY_NOT_ALLOWED,
                        swt.MPSTATE_AUTH_FAILED,
                        swt.MPSTATE_KEY_REQUIRES_PASSPHRASE,
                        swt.MPSTATE_BAD_REMOTE_PATH,
                        swt.MPSTATE_SOCKET_ERROR,
                        swt.MPSTATE_CIFS_BAD_SHARE_NAME,
                        swt.MPSTATE_CIFS_ERROR,
                        swt.MPSTATE_FTP_DATA_CHANNEL_ERROR,
                        swt.MPSTATE_FTP_FTPS_NOT_SUPPORTED,
                        swt.MPSTATE_FTP_FTPES_NOT_SUPPORTED,
                        swt.MPSTATE_FTP_FTPES_REQUIRED,
                      ]:
            # FIX:(1) BRING UP THE SMP (IF NECESSARY), RIP DOWN THE BINDMOUNT (IF IT EXISTS), AND BRING UP THE BIND MOUNT
            self.slam_manager.doSLAMMount(self.sid)
            self.bmp_manager.unBindMount(self.bmpAbsPath)
            self.bmp_manager.doBindMount(self.bmpAbsPath)
            log(3, "Finished repair attempt: BMP=%s State=%s" % (self.bmpAbsPath, state))
        elif state in [ swt.MPSTATE_SM_DISCONNECTED_AND_BM_BROKEN,
                        swt.MPSTATE_SM_DISCONNECTED,]:
            # FIX:(2) (WAIT FOR THE CONNECTION TO BE RE-ESTABLISHED NATURALLY - SSHFS.C / mount.cifs / curlftpfs WILL FIX IT)
            log(3, "BMP %s has been in state %s (DISCONNECTED) for %s seconds - awaiting self-heal" % (self.bmpAbsPath, state, (int(time.time()) - currentServerStatus[2]) ) )
        elif state == swt.MPSTATE_BMP_DOESNT_EXIST:
            # FIX:(3) REINIT ALL BMP'S, AND BRING UP THE BIND MOUNT
            self.bmp_manager.initBindMountPoints()
            self.bmp_manager.doBindMount(self.bmpAbsPath)
            log(3, "Finished repair attempt: BMP=%s State=%s:MPSTATE_BMP_DOESNT_EXIST" % (self.bmpAbsPath, state) )
        elif state == swt.MPSTATE_SMP_DOESNT_EXIST:
            # FIX:(5) INIT ALL SMP'S, THEN RUN FIX(1)
            self.slam_manager.initSLAMMountPoints()
            self.slam_manager.doSLAMMount(self.sid)
            self.bmp_manager.unBindMount(self.bmpAbsPath)
            self.bmp_manager.doBindMount(self.bmpAbsPath)
            log(3, "Finished repair attempt: BMP=%s State=%s:MPSTATE_SMP_DOESNT_EXIST" % (self.bmpAbsPath, state) )
        else: # state will be swt.MPSTATE_ERROR1, swt.MPSTATE_ERROR2, swt.MPSTATE_ERROR3, swt.MPSTATE_ERROR4, swt.MPSTATE_FTP_ERROR, or -10 (server link unused), etc...
            # FIX:(4) CATCHALL. PULL DOWN ALL BM'S FOR THIS SM, PULL DOWN SM, BRING UP SM, BRING UP ALL BM'S FOR THIS SM
            self.slam_manager.unSLAMMount(self.sid)
            self.slam_manager.doSLAMMount(self.sid)
            # NOTE(review): this lock is read off self, not self.shared_vars -
            # if each thread gets its own instance, it serialises nothing; verify
            # the lock object is shared between threads.
            self.initBindMountsLock.acquire()
            self.bmp_manager.initAllBindMounts()
            self.initBindMountsLock.release()
            # Log repair attempt.
            log(3, "Finished repair attempt: BMP=%s State=%s:MPSTATE_ERROR%s" % (self.bmpAbsPath, state, state) )

        # Mark BMP's entry in the repairInProgress job queue as completed
        # (flip (bmp, False) to (bmp, True); a statWorkerThread purges True entries).
        self.shared_vars.serverRepairInProgressLock.acquire()
        self.shared_vars.serverRepairInProgress.remove((self.bmpAbsPath, False))
        self.shared_vars.serverRepairInProgress.append((self.bmpAbsPath, True))
        self.shared_vars.serverRepairInProgressLock.release()
        # remove BMP's entry in the self.shared_vars.pendingRepair queue
        self.shared_vars.pendingRepairLock.acquire()
        self.shared_vars.pendingRepair.pop(self.bmpAbsPath)
        self.shared_vars.pendingRepairLock.release()
        log(6, "Repair worker removed bmp %s from self.shared_vars.pendingRepair queue" % self.bmpAbsPath)
        log(3, "Completed repair attempt for Server Link: %s" % os.path.basename(self.bmpAbsPath))
        return
 def run(self):
     '''
     Main perpetual loop of this monitoring thread.

     Each iteration:
       1. Calls initAllMountPoints(), then prunes stale entries from
          self.shared_vars.serverStatusDict (failsafe against leaks).
       2. Spawns a StatWorkerThread per expected BMP (bind mount point) to
          profile its health, skipping BMPs already being profiled or
          currently pending repair.
       3. Consolidates per-BMP statuses into one status per server and
          writes the result back to the Django Server objects.
       4. Reconciles each Server's key_fingerprint against the known_hosts
          file (clearing stale ones, backfilling missing ones for sftp
          servers).
       5. Cleans up Job records whose recorded pid no longer maps to a
          live process.

     Never returns; sleeps self.SWT_SLEEP seconds between iterations.

     NOTE(review): locks below are acquired/released manually with no
     try/finally — an unexpected exception mid-iteration would leave a
     lock held and kill this thread. Presumably acceptable for this
     daemon's design; confirm.
     '''
     while True:
         #log(6, "Number of objects = %s" % len(gc.get_objects()))
         self.initAllMountPoints()
         # cleanup self.shared_vars.serverStatusDict of any entries that should not be in there (failsafe against memory leaks on self.shared_vars.serverStatusDict)
         exptectdBMPList = self.bmp_manager.getExpectedBindMountPoints(bmpabspath=True)
         self.shared_vars.serverStatusDictLock.acquire()
         # iterate over a snapshot so we can safely pop from the live dict
         currentServerStatusDict = self.shared_vars.serverStatusDict.copy()
         for key in currentServerStatusDict.keys():
             if key not in exptectdBMPList:
                 log(6, "removing bmpAbsPath: '%s' from the self.shared_vars.serverStatusDict as it no longer requires health checking." % key)
                 self.shared_vars.serverStatusDict.pop(key)
         self.shared_vars.serverStatusDictLock.release()
         # profile each BMP for its status.
         # get all expected BMP's, keyed by username:
         bindMountPointsDict = self.bmp_manager.getExpectedBindMountPoints()
         # for each user's BMP
         for user in bindMountPointsDict.keys():
             for bindMountPoint in bindMountPointsDict[user]:
                 # set bmpAbsPath
                 bmpAbsPath = "/home/%s/xsftp/%s" % (user, bindMountPoint)
                 # if this bmp's status is still being worked on by a statWorkerThread (spawned by a previous iteration of this loop in this thread)
                 self.shared_vars.statChecksInProgressLock.acquire()
                 if self.shared_vars.statChecksInProgress.__contains__(bmpAbsPath):
                     self.shared_vars.statChecksInProgressLock.release()
                     log(5, "%s's status is still being profiled, skipping this status update iteration." % bmpAbsPath)
                     # then skip attempt to update status again
                     continue
                 else:
                     # if it is pending repair
                     self.shared_vars.pendingRepairLock.acquire()
                     if bmpAbsPath in self.shared_vars.pendingRepair.keys():
                         log(5, "%s's is marked as repair pending, skipping this status update iteration." % bmpAbsPath)
                         self.shared_vars.pendingRepairLock.release()
                         self.shared_vars.statChecksInProgressLock.release()
                         # then skip attempt to update status until repair is done
                         continue
                     # otherwise, mark it in-progress and check its status
                     self.shared_vars.pendingRepairLock.release()
                     self.shared_vars.statChecksInProgress.append(bmpAbsPath)
                     # spawn a statWorkerThread for this BMP to determine status
                     statThread = StatWorkerThread(self.shared_vars, bmpAbsPath)
                     statThread.start()
                 self.shared_vars.statChecksInProgressLock.release()
         # write to the logs a table of statuses for each bmp.
         self.shared_vars.serverStatusDictLock.acquire()
         for key in self.shared_vars.serverStatusDict.keys():
             log(6, "*** self.shared_vars.serverStatusDict entry: %s : %s" % (key, str(self.shared_vars.serverStatusDict[key])))
         # Now update the Django database with the latest info from the ServerStatusDict
         # ConsolidatedServerStatusDict - a dictionary of sid:(state, timeFirstSeenInCurrentState, timeLastSeenHealthy)
         consolidatedServerStatusDict = dict()
         # For each bind_mount in the ServerStatusDict
         for bmp in self.shared_vars.serverStatusDict.keys():
             # Get the associated server and other details
             (sid, currentState, timeFirstSeenInCurrentState, timeLastSeenHealthy) = self.shared_vars.serverStatusDict[bmp]
             # If the Server doesn't exist in ConsolidatedServerStatusDict
             if sid not in consolidatedServerStatusDict.keys():
                 # Then add all the details in to ConsolidatedServerStatusDict from the current bind_mount
                 consolidatedServerStatusDict[sid] = (currentState, timeFirstSeenInCurrentState, timeLastSeenHealthy)
             # Elif this bind_mount is healthy (state 0) and existing entry is healthy:
             elif currentState == 0 and consolidatedServerStatusDict[sid][0] == 0:
                 # use the values from the one with the oldest (lowest) timeFirstSeenInCurrentState
                 if timeFirstSeenInCurrentState < consolidatedServerStatusDict[sid][1]:
                     consolidatedServerStatusDict[sid] = (currentState, timeFirstSeenInCurrentState, timeLastSeenHealthy)
             # Elif this bind mount is unhealthy and the existing entry is healthy:
             elif currentState != 0 and consolidatedServerStatusDict[sid][0] == 0:
                 # Then add all the details in to ConsolidatedServerStatusDict from the current bind_mount
                 consolidatedServerStatusDict[sid] = (currentState, timeFirstSeenInCurrentState, timeLastSeenHealthy)
             # Elif this bind mount is unhealthy (and so is the existing entry):
             elif currentState != 0:
                 # NOTE(review): the code compares this entry's timeLastSeenHealthy
                 # against the stored timeFirstSeenInCurrentState (index [1]) — the
                 # original comment said "newest timeFirstSeenInCurrentState".
                 # Looks like a mismatch between intent and code; confirm which
                 # field was meant before changing.
                 if timeLastSeenHealthy > consolidatedServerStatusDict[sid][1]:
                     consolidatedServerStatusDict[sid] = (currentState, timeFirstSeenInCurrentState, timeLastSeenHealthy)
         self.shared_vars.serverStatusDictLock.release()
         # For each server in ConsolidatedServerStatusDict:
         for sid in consolidatedServerStatusDict.keys():
             # print sid, consolidatedServerStatusDict[sid]
             # Grab the server from the database
             server = xsftp.webui.models.Server.objects.get(id=sid)
             # Update its details (epoch floats -> datetime for the model fields)
             server.status = consolidatedServerStatusDict[sid][0]
             server.timeFirstSeenInCurrentState =  datetime.datetime.fromtimestamp(consolidatedServerStatusDict[sid][1])
             server.timeLastSeenHealthy =  datetime.datetime.fromtimestamp(consolidatedServerStatusDict[sid][2])
             server.time_last_checked = datetime.datetime.now()
             # Save the server (synchronise=False: avoid triggering sync side effects)
             server.save(synchronise=False)
         # Check that all servers which have a key_fingerprint value have an equivalent entry in KNOWN_HOSTS, if not, nullify their key_fingerprint in the DB.
         # NOTE(review): the open/close pair below appears to be an existence
         # probe of KNOWN_HOSTS_FILE; it raises an uncaught IOError if the
         # file is missing, which would kill this thread — confirm intended.
         f = file(xsftp.common.constants.KNOWN_HOSTS_FILE, 'r')
         f.close()
         # known_hosts keys look like "host" or "[host]:port" — strip brackets, drop port
         knownHostAddresses = [host.split(':')[0].replace("[","").replace("]","") for host in paramiko.util.load_host_keys(os.path.expanduser(xsftp.common.constants.KNOWN_HOSTS_FILE)).keys()]
         for server in [serverObj for serverObj in xsftp.webui.models.Server.objects.all() if serverObj.key_fingerprint]:
             if server.address not in knownHostAddresses:
                 server.key_fingerprint = None
                 server.save(synchronise=False)
         #add fingerprints to sftp-type server objs that dont have one in django but do have one in known_hosts
         for server in [serverObj for serverObj in xsftp.webui.models.Server.objects.all() if serverObj.type == "sftp" and not serverObj.key_fingerprint]:
             # get the key fingerprint from the known_hosts file
             fingerPrint = self.get_key_fingerprint(server.address, server.port, write_log=False)
             if fingerPrint:
                 server.key_fingerprint = fingerPrint
                 server.save(synchronise=False)
         # Check for jobs that look like they are running (ie pid != None), and check that there is the associated process for it. If not, clean up the job's attributes
         jobs = xsftp.webui.models.Job.objects.all()
         running_jobs = list()
         for job in jobs:
             if job.pid:
                 # append running job
                 running_jobs.append(job)
             else:
                 # job isn't running, ensure its running_now value is sane (must not be None (terminating..) or True (running now))
                 if job.running_now != False:
                     job.running_now = False
                     job.save()
         for job in running_jobs:
             try:
                 # we call getpgid, and if the process doesn't exist, an OSError is raised
                 os.getpgid(job.pid)
             except OSError:
                 # The process doesn't exist, so clean up the job
                 job.running_now = False
                 job.pid = None
                 job.save()
         # now sleep for set time
         time.sleep(self.SWT_SLEEP)
    def doSLAMMount(self, sid): 
        '''
        Atempts to mount a server referenced by the specified sid.
        If server is already sshfs mounted, we just return successfully.
        '''
        # check if sshfs mount is already being worked on
        self.SLAMMountsInProgressLock.acquire()
        log(6, "Acquired self.SLAMMountsInProgressLock")
        if sid not in self.SLAMMountsInProgress.keys():
            log(5, "No other threads are working on this Server Link %s, I will assume responsiblity." % sid)
            self.SLAMMountsInProgress[sid] = threading.Condition(self.SLAMMountsInProgressLock)
            log(6, "About to release the self.SLAMMountsInProgressLock")
            self.SLAMMountsInProgressLock.release()
        else:
            log(5, "Server Link %s is already being worked on - waiting for it to be fixed" % sid)
            self.SLAMMountsInProgress[sid].wait()
            log(5, "This thread got woken up - Server Link %s has been marked as fixed (and waiting for check)" % sid)
            self.SLAMMountsInProgressLock.release()
            return
        # This next bit checks whether the sshfs mount is already mounted, which can happen if some other thread fixed just before we did.
        # Additionally, while we were getting here, a few threads on our tail may have already come in and joined the wait queue,
        # so we need to wake them up and then we can all bail out of here.
        if str(sid) in self.slam_manager.getLiveSLAMMounts():
            log (5, "Server Link %s is already mounted, skipping." % sid)
            self.SLAMMountsInProgressLock.acquire()
            condition = self.SLAMMountsInProgress.pop(sid)
            condition.notifyAll()
            self.SLAMMountsInProgressLock.release()
            return

        ###################################
        ### SERVER LINK MOUNTING BEGINS ###

        # get server object referenced by sid in argument
        serverObj = xsftp.webui.models.Server.objects.get(id=sid)

        # ================
        # SSHFS MOUNT CODE
        # ================

        if serverObj.type == 'sftp':
            # get server_name
            server_name = serverObj.server_name
            # get GID for this server's linux write group
            gid = str(grp.getgrnam("x_%s" % sid)[2])
            # get server's address
            address = serverObj.address
            # get port number
            port = int(serverObj.port)
            # get keyfile location
            key = serverObj.key_file
            # get remoteuser
            remoteuser = serverObj.remote_user
            # get remote path
            remotepath = serverObj.remote_path
            log(4, "Mounting %s (type: sftp): SID=%s  ADDRESS=%s PORT=%s writeGroupName=x_%s GID=%s KEY=%s USERNAME=%s REMOTE_PATH=%s" % (server_name, sid, address, port, sid, gid, key, remoteuser, remotepath))
            # if this server's address is NOT in the known_hosts file, or the file doesnt exist, add the StrictHostKeyChecking=no option to suppress interactive yes/no ssh confirmation
            doStrictKeyCheck = False
            for host in paramiko.util.load_host_keys(xsftp.common.constants.KNOWN_HOSTS_FILE).keys():
                components = host.split(':')
                host_name = components[0].replace("[","").replace("]","")
                if len(components) == 1:
                    port = 22
                elif len(components) == 2:
                    port = int(components[1])

                if server.address == host_name and server.port == port:
                    log(4, "Performing strict key check for server link %s since I found its matching hostname '%s:%s' in known_hosts" % (server_name, address, port))
                    doStrictKeyCheck = True

            if doStrictKeyCheck == True:
                mountCmd = "sshfs -o UserKnownHostsFile=%s,StrictHostKeyChecking=yes,compression=yes,cache=no,default_permissions,uid=0,gid=%s,umask=002,nonempty,reconnect,allow_other,IdentityFile=%s,ServerAliveInterval=3,port=%s %s@%s:'%s' %s%s > /dev/null 2>&1" % (xsftp.common.constants.KNOWN_HOSTS_FILE, gid, key, port, remoteuser, address, remotepath, xsftp.common.constants.SERVERDIR, sid)
                log(6, "sshfsmount command is: %s" % mountCmd)
            else:
                log (4, "omitting strict key check for server link %s since I could not find a matching hostname '%s' in known_hosts" % (server_name, address))
                mountCmd = "sshfs -o UserKnownHostsFile=%s,StrictHostKeyChecking=no,compression=yes,cache=no,default_permissions,uid=0,gid=%s,umask=002,nonempty,reconnect,allow_other,IdentityFile=%s,ServerAliveInterval=3,port=%s %s@%s:'%s' %s%s > /dev/null 2>&1" % (xsftp.common.constants.KNOWN_HOSTS_FILE, gid, key, port, remoteuser, address, remotepath, xsftp.common.constants.SERVERDIR, sid)
                log(6, "sshfsmount command is: %s" % mountCmd)
            result = os.system(mountCmd)
            if result:
                # log failed sshfs mount attempt
                log(2, "Server Link (type: sftp) establishment attempt for server '%s' failed. Return code was %s" % (server_name, result))
            else:
                # log successful sshfs mount attempt
                log(4, "Success: sshfs mount to %s:%s established." % (address, port))
                log(1, "Server Link '%s' successfully established." % server_name)
                # get the key fingerprint from the known_hosts file
                fingerPrint = self.get_key_fingerprint(address, port)
                # save fingerprint to django models.Server
                if serverObj.key_fingerprint != fingerPrint:
                    log(4, "Got new/different fingerprint %s for Server Link '%s'" % (fingerPrint, server_name))
                    serverObj.key_fingerprint = fingerPrint
                    serverObj.save(synchronise=False)

        # ===============
        # CIFS MOUNT CODE
        # ===============

        elif serverObj.type == 'cifs':
            argDict = { 'sid':serverObj.id,
                        'name':serverObj.server_name,
                        'address': serverObj.address,
                        'cifs_port':serverObj.cifs_port,
                        'cifs_share': serverObj.cifs_share,
                        'remote_path': serverObj.remote_path,
                        'mount_point': "%s%s" % (xsftp.common.constants.SERVERDIR, sid),
                        'remote_user': serverObj.remote_user,
                        'cifs_password': serverObj.cifs_password,
                        'gid': str(grp.getgrnam("x_%s" % serverObj.id)[2])
                        }
            mountCmd = "/sbin/mount.cifs //%(address)s/'%(cifs_share)s'/'%(remote_path)s' %(mount_point)s -o user='******',pass='******',uid=0,gid=%(gid)s,rw,dir_mode=0775,file_mode=0775,port=%(cifs_port)s > /dev/null 2>&1" % argDict
            # If the specified remote path points to a file instead of a dir, the mount command will still work and the mount point will appear as that file. Ensure this does not happen.
            remote_path_ok = True
            SMBClientExceptionText = "unknown"
            try:
                s = SMBClient.SMBClient(serverObj.address, serverObj.cifs_port, serverObj.cifs_share, username=serverObj.remote_user, password=serverObj.cifs_password)
                if serverObj.remote_path:
                    remote_path_ok = s.is_dir(str(serverObj.remote_path))
                    SMBClientExceptionText = "bad remote path"
            except Exception, SMBClientExceptionText:
                remote_path_ok = False
            try:
               s.close()
            except:
                pass
            if remote_path_ok:
                log(4, "Mounting %(name)s (type: cifs): SID=%(sid)s  ADDRESS=%(address)s PORT=%(cifs_port)s GID=%(gid)s USERNAME=%(remote_user)s SHARE_NAME=%(cifs_share)s REMOTE_PATH=%(remote_path)s" % argDict)
                log(6, "cifs mount command is: %s" % mountCmd.replace("pass='******'" % serverObj.cifs_password, "pass=<HIDDEN>"))
                p = subprocess.Popen(mountCmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
                rc = os.waitpid(p.pid, 0)[1]
                if rc:
                    log(2, "Server Link (type: cifs) establishment attempt for server '%s' failed. Return code was %s" % (serverObj.server_name, rc))
                else:
                    log(2, "Server Link '%s':'%s' (type: cifs) successfully established." % (serverObj.address, serverObj.server_name))
            else:
                log(2, 'Server Link (type: cifs) "%s" failed pre-checks (Error: %s), skipping establishment.' % (serverObj.server_name, SMBClientExceptionText))
 id = serverObj.id
 # perform pre-checks
 do_ftp_mount = True
 log(6,"performing FTP pre-checks for server link %s" % server_name)
 try:
     f = FTPClient.FTP(address, port=ftp_port, passive=ftp_passive, user=remote_user, passwd=ftp_password, ssl=ftp_ssl, ssl_implicit=ftp_ssl_implicit)
     f.login()
     f.retrlines('LIST', callback=lambda msg: None)
     f.cwd(remote_path)
 except Exception, FTPClientExceptionText:
     do_ftp_mount = False
 try: f.quit()
 except: pass
 # perform actual ftp mount
 if do_ftp_mount:
     log(6,"performing FTP mount for server link %s" % server_name)
     # XXX note the use of the -f switch to curlftpfs below, which forces it not to daemonize and instead run in the foreground. If we don't do this, then for some reason some FTPES mounts (to Win2k8 IIS servers) won't work (the mount appears to work and the underlying FTP session is successfully established but trying to open the mountpoint for reading produces an IOError). Investigate this, nothing that we use our own slighly customized curlftpfs - check our RPM build dir for the source and patches.
     mountCmd = "curlftpfs -f -o transform_symlinks,connect_timeout=5,allow_other,default_permissions,uid=0,umask=002,nonempty,cache=no,ftp_timeout=10"
     if ftp_ssl:
         mountCmd += ",ssl,no_verify_peer,no_verify_hostname"
     if not ftp_passive:
         mountCmd += ",ftp_port=-,disable_epsv"
     ftp_credentials = ",user='******'" % (remote_user, ftp_password) #.replace(":",r"\:"))
     mountCmd += ftp_credentials
     mountCmd += ",gid=%s" % str(grp.getgrnam("x_%s" % id)[2])
     if ftp_ssl and ftp_ssl_implicit:
         mountCmd += " ftps://"
     else:
         mountCmd += " ftp://"
     mountCmd += "%s:%s" % (address, ftp_port)
     if serverObj.remote_path: