def restart_instance(self, server_instance):
    """Restart a server instance.
    For example, instance = 'ent1:7882'

    Return False if the server instance is not in the SERVERS list, or if the
    server_instance could not be parsed into host:port format.
    """
    # Parse server_instance
    try:
      host, port = server_instance.split(':')
      port = int(port)
    except ValueError:
      logging.warn("Could not parse %s into host:port format" %
                   server_instance)
      return False

    # Check the server is in SERVERS
    if host not in self.cfg.getGlobalParam('SERVERS').get(port, []):
      logging.warn("Could not find %s:%s in SERVERS map, "
                   "ignoring restart_instance request" % (host, port))
      return False

    # Restart it
    logging.info("Restarting server %s:%d" % (host, port))
    self.cfg.globalParams.WriteConfigManagerServerRestartRequest(host, port)
    return True
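# For reference, a minimal sketch of the SERVERS shape the check above
# assumes: a map from port to the list of hosts serving it (sample values
# made up):
SERVERS = {7882: ['ent1', 'ent2'], 9400: ['ent1']}
host, port = 'ent1:7882'.split(':')
print(host in SERVERS.get(int(port), []))  # True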
  def _get_current_crawlsummary(self):
    """Current crawl summary from Borgmon.

    Returns:
      {'global-overall-urls-crawl-error': 0,
       'global-overall-urls-crawled': 199620.0}
    """

    # We need to map from the identifiers ('global-overall-urls-crawled')
    # to borgmon exprs ('num_urls_crawled_today')
    NAME_TO_BORGMON = {
        'global-overall-urls-crawled': 'num_urls_crawled_now',
        'global-overall-urls-crawl-error': 'num_urls_error_now'}

    summary = {}
    uservars = self._getuservars()[1]
    for param in self.cfg.getGlobalParam('ENT_CRAWL_SUMMARY').keys():
      bname = '?'
      try:
        bname = NAME_TO_BORGMON[param]
        summary[param] = uservars[bname]
      except KeyError:
        logging.warn('problem finding value for ' + param + ' aka ' + bname)
        summary[param] = 0.0
    logging.info("return: " + str(summary))
    return summary
Example #3
  def PollRunningCommands(self):

    # see if anyone finished
    anyone_finished = 0

    #
    # See if any commands have written stuff back to us, and if so,
    # write it to the log file.
    #
    io_happened = self.ReadFromCommands()

    for (running_command, popen3object) in self._running_commands.items():
      status = popen3object.poll()

      if status != -1:
        anyone_finished = 1
        status = status >> 8                         # keep only actual status
        running_request = running_command.GetRequest()

        # cleanup
        del self._running_commands[running_command]   # The command is gone.
        del self._iopipes[popen3object.fromchild]     # And so are its io pipes.
        del self._iopipes[popen3object.childerr]

        if status == 0:
          logging.info('+++ Finished: %s' % (running_request.GetFilename()))
          self._num_processed_success = self._num_processed_success + 1
          self.AddRecentRequest(running_request, 0)
          self._request_mgr.MoveRequestToSuccess(running_request)
          self._success_handler(running_request)
        else:
          logging.error('+++ Failed (rc=%s): %s' %
                        (status, running_command.GetCmd()))
          retries = self.GetRetries(running_request.GetType())
          attempts = running_request.GetAttempts()
          if attempts <= retries:
            self._retry_handler(running_request)
            # cmd failed and user requested retries and we did not reach
            # the retry limit. So we dispatch it again.
            logging.warn('%s retrying %s more times.' %
              (running_request.GetFilename(), retries - attempts + 1))
            running_request.AddAttempt() # Increment count
            self._num_retries = self._num_retries + 1
            running_request.AddStatusz(">>>>> Failure. One more attempt "\
                                       "granted [crt attempt %s]" % attempts)
            self.DispatchCommand(running_command)
          else:
            running_request.AddStatusz(">>>>> Failure. No more attempts")
            self._num_processed_failure = self._num_processed_failure + 1
            self.AddRecentRequest(running_request, 2)
            self._request_mgr.MoveRequestToFailure(running_request)
            self._failure_handler(running_request)

    return anyone_finished or io_happened
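# A quick illustration of the `status >> 8` decoding above (assuming Unix
# wait() semantics: the child's exit code lives in the high byte of the raw
# status word):
import os
raw = os.system('exit 3')  # raw wait status, e.g. 768 on Unix
print(raw >> 8)            # 3 -- the child's actual exit code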
def RemoveFile(file):
  """Delete a file with logging.

  Arguments:
    file: string: file path
  """
  if os.path.exists(file):
    logging.info('Removing %s' % file)
    try:
      os.remove(file)
    except OSError:
      logging.warn('Error removing %s' % file)
 def control_directory(self, dir, max_files, excluded_patterns=None):
   """Enforces a directory-level file count limit (used as a last resort).

   If a directory has more than max_files files, the oldest ones are
   removed, except those matching excluded_patterns.
   """
   (passed, failed) = self.control_files([(".", max_files, 1)],
                                         dir, excluded_patterns)
   if passed > 0:
     logging.warn("Removed %d files from directory %s because it had "
                  "more than %d files.  Did you miss a pattern?",
                  passed, dir, max_files)
  def _distributeFiles(self, dir):
    """Distributes the files in directory named dir.  Returns 0 on success,
    the return code from E.distribute() otherwise."""

    files = os.listdir(dir)
    try:
      files.remove('temp')
    except ValueError:
      logging.warn("expected to find file 'temp'")

    files = map(lambda x: os.path.join(dir, x), files)
    files_str = ' '.join(files)
    return E.distribute(self.cfg.getGlobalParam('MACHINES'), files_str, 60)
Example #7
    def Parse(self, message_class):
        """Upgrades the Item to a parsed one, returning true if successful."""

        if self.message_class is not None:
            return 1

        try:
            self.message = message_class(self.message)
            self.message_class = message_class
            return 1
        except ProtocolBuffer.ProtocolBufferDecodeError:
            logging.warn("Parse error in message inside MessageSet.  Tried " "to parse as: " + message_class.__name__)
            return 0
Example #8
 def ValidateRequest(self, request):
   reqtype = request.GetType()
   if reqtype not in self._command_info:
     logging.warn('Skipping unregistered %s request from file %s' %
                  (reqtype, request.GetFilename()))
     return 0
   validator = self.GetValidator(reqtype)
   if validator:
     (validator_rc, msg) = validator(request)
     if not validator_rc:
       logging.error('%s failed validation: %s' % (request.GetFilename(), msg))
     return validator_rc
   return 1 # Validation passed, or no validator found
def GetActiveVersion():
  """ Return the current version of the gsa.
      Returns None on failure.
  """
  # parse config.google.enterprise to determine the version
  cmd = 'find /export/hda3/ -name STATE -maxdepth 2 | xargs grep ACTIVE -l'
  try:
    f = os.popen(cmd, 'r')
    data = f.read()
    f.close()
    pat = re.compile(r'/export/hda3/([0-9]+\.[0-9]+\.[0-9]+)/STATE')
    match = pat.search(data)
    if match:
      return match.group(1)
  except IOError, e:
    logging.warn('IOError in GetActiveVersion: %s' % e)
  except OSError, e:
    logging.warn('OSError in GetActiveVersion: %s' % e)
  return None
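# A quick sanity check of the version-extracting pattern above, against a
# made-up STATE path:
import re
pat = re.compile(r'/export/hda3/([0-9]+\.[0-9]+\.[0-9]+)/STATE')
print(pat.search('/export/hda3/7.2.0/STATE').group(1))  # prints '7.2.0'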
Example #10
  def ParseDesc(self, cnt=0):
    """Parse the initial description.

    This could be Python, C++, or Java.

    Returns:
      (line_count, lang_type)
        line_count  Line to start parsing flags on (int)
        lang_type   One of 'python', 'c', or 'java'
      (-1, '') if the flags start could not be found
    """
    exec_mod_start = self.executable + ':'

    after_blank = False
    for cnt in range(cnt, len(self.output)): # collect top description
      line = self.output[cnt].rstrip()
      # Python flags start with 'flags:\n'
      if ('flags:' == line
          and len(self.output) > cnt+1 and '' == self.output[cnt+1].rstrip()):
        cnt += 2
        logging.debug('Flags start (python): %s' % line)
        return (cnt, 'python')
      # SWIG flags just have the module name followed by colon.
      if exec_mod_start == line:
        logging.debug('Flags start (swig): %s' % line)
        return (cnt, 'python')
      # C++ flags begin after a blank line and with a constant string
      if after_blank and line.startswith('  Flags from '):
        logging.debug('Flags start (c): %s' % line)
        return (cnt, 'c')
      # java flags begin with a constant string
      if line == 'where flags are':
        logging.debug('Flags start (java): %s' % line)
        cnt += 2                        # skip "Standard flags:"
        return (cnt, 'java')

      logging.debug('Desc: %s' % line)
      self.desc.append(line)
      after_blank = (line == '')
    else:
      logging.warn('Never found the start of the flags section for "%s"!'
                   % self.long_name)
      return (-1, '')
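# ParseDesc leans on Python's for/else above: the else branch runs only when
# the loop exhausts without a break (the returns exit the function before
# either). A minimal standalone illustration:
for line in ['usage: foo', 'some description']:
  if line == 'flags:':
    print('flags section found')
    break
else:
  print('never found the start of the flags section')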
  def _GetTimeseriesSize(self):
    """Return the amount of memory (in MB) to allocate to storing timeseries.

    Returns:
      Int (MB)
    """
    try:
      config_type = core_utils.GetEntConfigVar('ENT_CONFIG_TYPE')
    except AssertionError:
      # "File not Found" Assertion Error is normal during unit-testing
      logging.warn("Could not find ENT_CONFIG_TYPE, setting Borgmon timeseries"
                   " size to the oneway default")
      config_type = 'ONEBOX'
    if config_type == 'MINI':
      return 32   # 32 MB on Mini
    elif config_type == 'LITE' or config_type == 'FULL':
      return 16   # 16 MB on Lite and Full
    else:
      return 256  # 256 MB on all other platforms
def AddDefaultNTPServer(ntp_server_list):
  """ check to see if a default ntp server needs to be added to the list

  Args:
    ntp_server_list: ['time1.corp.google.com', 'time2.corp.google.com']
  """

  any_good_ntp = 0
  for ntp_server in ntp_server_list:
    (stat, out) = network_diag.check_ntpdate_output(ntp_server)
    if stat == 0 and 'stratum' in out and int(out['stratum']) < 15:
      any_good_ntp = 1
      break
    else:
      logging.warn('Bad NTP server: %s (stat=%d, %s)' % (ntp_server, stat, out))
  if not any_good_ntp:
    default_ntp_server = FindDefaultNTPServer()
    if default_ntp_server not in ntp_server_list:
      ntp_server_list.append(default_ntp_server)
Example #13
def TestNode(node, logging, retry=1):
    """Currently we define success a node being sshable. We can add more stuff,
  like checking for same software and os verions etc.
  A retry mechanism is used to take care of transient errors.
  """
    max_wait = 30
    ret = 0
    while max_wait > 0:
        logging.info('Testing node %s' % node)
        cmd = 'ssh %s echo \$HOSTNAME\: I am alive.' % node
        #logging.info('Executing %s' % cmd)
        ret, _ = commands.getstatusoutput(cmd)
        if ret == 0 or not retry:
            break
        logging.warn(
            'Node %s is down. Retrying after 5 seconds. %s seconds left.' %
            (node, max_wait))
        max_wait = max_wait - 5
        time.sleep(5)
    return ret
Example #15
def GetLiveNodes(logging, retry=1, active_only=1):
    """Get list of machines from ENT_CONFIG_FILE and checks which machines are up.
  MACHINES paramter in google_config can be wrong as it takes a while before a
  node can be removed from the list. It is not going to be very efficient for
  very large clusters.

  Arguments:
    logging - module for logging
    retry   - 1 - retry when testing if a node is active. 0 - otherwise.
    active_only - 1. only check active machines. 0 -otherwise.
  Returns:
    ['ent1', 'ent2', 'ent3', 'ent4']
  """
    nodelist = GetNodes(active_only)
    if not active_only:
        nodecount = len(nodelist)
        failures = GetNodeFailures(nodecount)
        logging.info('Total nodes: %s' % nodecount)
        logging.info('Allowed failures: %s' % failures)
    logging.info('Checking node status.')
    deadlist = []
    alivelist = []
    for node in nodelist:
        ret = TestNode(node, logging, retry)
        if ret:
            logging.warn('Node %s is inaccessible.' % node)
            deadlist.append(node)
        else:
            logging.info('Node %s is accessible.' % node)
            alivelist.append(node)
    logging.info('Inaccessible nodes: %s' % deadlist)
    # checking svs
    nodes_without_svs = []
    for node in alivelist:
        if not CheckSVSRunning(node):
            nodes_without_svs.append(node)
    if len(nodes_without_svs) > 0:
        logging.info('SVS not running on nodes: %s' % nodes_without_svs)
    alivelist = [node for node in alivelist if node not in nodes_without_svs]
    return alivelist
CACHEDIR = '/var/cache/ent-snmp'

def ReadDataFromCache(cacheKey, expiry=15, cachedir=CACHEDIR):
  """ Gets the cached reply for cacheKey.
  This cache prevents SNMP requests from being too slow.
  Input:
    cacheKey is a string and must not contain any characters that
      would be "bad" for a filename.
    expiry: Cached data will expire after expiry seconds.
    cachedir: directory in which to hold data.
  Result: cached string or None (no value or expired).
  """
  cachefile = '%s/snmpcache-%s' % (cachedir, cacheKey)
  try:
    age = time.time() - os.stat(cachefile)[stat.ST_MTIME]
    if age > expiry:
      return None
    f = open(cachefile, 'r')
    data = f.read()
    f.close()
    if len(data) > 1: # sanity check for empty file
      return data
  except (IOError, OSError), e:
    logging.warn('Error in ReadDataFromCache: %s' % e)
  return None
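# ReadDataFromCache implies a writer that drops SNMP replies into the same
# snmpcache-<key> files. A minimal sketch of such a companion (the function
# name is hypothetical; only the naming scheme comes from above):
def WriteDataToCache(cacheKey, data, cachedir=CACHEDIR):
  cachefile = '%s/snmpcache-%s' % (cachedir, cacheKey)
  try:
    f = open(cachefile, 'w')
    f.write(data)
    f.close()
  except IOError, e:
    logging.warn('IOError in WriteDataToCache: %s' % e)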
Example #17
    def run(self):
        i = self.n
        while i < len(self.machines):
            machine = self.machines[i]
            i = i + self.num_threads

            cmds = []
            if self.command:
                cmds.append("ssh %s -n %s %s" %
                            (BATCHMODE, machine, commands.mkarg(self.command)))
            if self.files:
                cmds.extend(
                    map(lambda f, m=machine:
                        "rsync -u -e \"ssh %s\" -aH %s %s:%s" %
                        (BATCHMODE, f, m, f),
                        string.split(self.files, " ")))

            for cmd in cmds:
                cmd = "%s%s" % (ALARM, cmd)
                # Run and get the error
                if not QUIET:
                    logging.info("%s: Executing [%s]." % (self.n, cmd))
                this_err = os.system(python_exec_wrapper(cmd))
                if this_err:
                    # Divide by 256 to get error code from the exit status.
                    this_err = this_err >> 8
                    if self.files:  # we were doing an rsync
                        if this_err == RSYNC_PARTIAL_TRANSFER_ERROR:
                            # If the file went missing then we didn't need the transfer
                            # anyway, so we just continue.
                            logging.warn('%s: File does not exist.' % self.n)
                            continue
                    logging.error("%s: Error %d." % (self.n, this_err))
                    self.err = this_err
                    break

            if DELAY:
                time.sleep(DELAY)
Example #18
 def UserInOwnersFile(cls, username, owners_file):
   """Check that 'username' is in the OWNERS file 'owners_file'."""
   f = None
   try:
     try:
       if owners_file.startswith('google3/production/mpmroot'):
         filename = os.path.join('/google/src/files/p5/head/depot',
                                 owners_file)
       else:
         filename = os.path.join('/home/build', owners_file)
       f = open(filename, 'r')
       for line in f:
         m = cls._NON_COMMENT_REGEX.match(line)
         if m and m.group(0).strip() == username:
           return True
       return False
     except IOError, e:
       logging.warn('Failed to read Perforce OWNERS file %s: %s',
                    owners_file, e)
       return False
   finally:
     if f:
       f.close()
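# UserInOwnersFile assumes a class-level _NON_COMMENT_REGEX. A plausible,
# hypothetical definition: match everything before a '#' comment, so that
# m.group(0).strip() yields the bare username on an OWNERS line:
import re
_NON_COMMENT_REGEX = re.compile(r'[^#]*')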
def ExecuteWrapper(machines, commandLine, out, alarm, verbose = 1,
                   forceRemote = 0, enthome=None, num_tries=1):
  """Thin wrapper over E.execute as we need process's return code (parameter to
  exit()) and E.execute returns exit status. Too late to modify E.execute()
  method implementation as there is a lot of code that already calls this method.
  Can't confirm whether any code has come to rely on the fact that E.execute
  returns exit status code instead of process's return code. Refer E.execute()
  for it's documentation"""
  ret = 0
  for trial in range(num_tries):
    ret = E.execute(machines, commandLine, out,
                    alarm, verbose, forceRemote, enthome)
    if os.WIFEXITED(ret):
      # child process exit codes are multiplied by 256, so undo this
      ret = os.WEXITSTATUS(ret)
    if ret == 0 or (trial + 1) == num_tries:
      # either we succeed or this was the last try (there is no point in
      # sleeping after the last try)
      break
    logging.warn('%d: Execution of %s failed. Sleeping for 5 seconds...' %
                 (trial, commandLine))
    time.sleep(5)
  return ret
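# The same decoding, spelled with the portable os helpers ExecuteWrapper
# uses: os.WIFEXITED checks for a normal exit, os.WEXITSTATUS undoes the
# multiply-by-256 encoding of the raw status:
import os
status = os.system('exit 7')
if os.WIFEXITED(status):
  print(os.WEXITSTATUS(status))  # 7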
def check_ntpdate_output(name):
  """ run "ntpdate -q" command on a server, and return the result.

  Args:
    name - 'time1.corp.google.com'
  Returns:
    (0, {'delay': '0.02591', 'stratum': '2', 'offset': '-0.030579',
         'server': '172.24.0.11'})
  """

  cmd = '/usr/sbin/ntpdate -q  %s' % commands.mkarg(name)
  (stat, out) = commands.getstatusoutput(cmd)
  parsed_out = {}
  if stat == 0:
    # only interested in the attributes in the first line
    lines = out.split('\n')
    attrs = lines[0].split(',')
    for attr in attrs:
      parts = attr.split()
      parsed_out[parts[0].strip()] = parts[1].strip()
  else:
    logging.warn('Command "%s" failed with exit status %d: %s' %
                 (cmd, stat, out))
  return (stat, parsed_out)
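# Example use, mirroring the stratum check that AddDefaultNTPServer applies
# to this function's output:
stat, out = check_ntpdate_output('time1.corp.google.com')
if stat == 0 and 'stratum' in out and int(out['stratum']) < 15:
  print('%s is a usable NTP server' % out.get('server'))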
Example #23
def GetAttr(name, pid=None, fallback_to_ps=1):
  """Retrieves an attribute using /proc/stat file. If the kernel version
  mismatches or the attribute name is not supported then it can fallback
  to using ps command based on the value of fallback_to_ps argument.

  Returns None in case of failure. Otherwise returned attribute value is
  always a string.
  """
  if pid is None:
    pid = os.getpid()
  val = None
  id = GetColumnId(name, KERNEL_VERSION)
  if id is not None:
    try:
      data = open('/proc/%d/stat' % pid, 'r').read()
    except:
      logging.error('Error getting stats for pid %d.' % pid)
    else:
      val = data.split()[id]
  if val is None and fallback_to_ps:
    # Fallback to using 'ps'
    logging.warn('Error retrieving value. Using \'ps\'.')
    val = GetAttrUsingPS(name, pid)
  return val
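# GetColumnId is defined elsewhere; conceptually it maps an attribute name to
# a column index in /proc/<pid>/stat for the running kernel. A hypothetical
# sketch (real column numbers vary with the kernel version):
_STAT_COLUMNS = {'ppid': 3, 'utime': 13, 'stime': 14}

def GetColumnId(name, kernel_version):
  return _STAT_COLUMNS.get(name)  # None -> caller falls back to 'ps'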
Example #24
  def GetExpectedArgv(self, host, port, type):
    """Get the expected argv for a server as calculated by Babysitter.

    The expected argv is adapted to suit the argv as exported by the server.
    For example, C++ binaries will include the binary name in argv, but Java
    servers will not.

    Arguments:
      host: string, "ent1"
      port: int, 7882
      type: string, "authzchecker"
    Return:
      string: "--foo --bar", or None if Babysitter does not know the server
    """

    babysitter_cmd = servertype.GetRestartCmd(type, self.cfg,
                                              host, port)
    if babysitter_cmd is None:
      logging.warn("No babysitter command found for %s:%s (%s), not able to "
                   "export this server on babysitter-argv-sum" %
                   (host, port, type))
      return None
    binary_name = servertype.GetBinaryName(type)

    cmd = ExtractBinaryArgs(babysitter_cmd, binary_name)

    if cmd is None:
      logging.warn("Could not extract binary arguments for %s:%s, not"
                   " able to export this server on babysitter-argv-sum" %
                   (host, port))
      logging.warn("Binary name was %s, babysitter command was:" % binary_name)
      logging.warn(babysitter_cmd)
      return None

    # If it's not Java then we need to prepend the full binary path
    if not IsJavaCommand(babysitter_cmd):
      cmd = "%s %s" % (ExtractBinaryPath(babysitter_cmd, binary_name), cmd)

    return cmd
Example #25
 def KillLBService(self, service, prefix, lb_reload=0):
   """Kills all traces of a local babysitter service.
   First removes the configuration file, then the pid file, and then kills
   the process group.
   Note: This is a best effort method. It never returns failure unless
   there is a problem in executing a command.
   """
   (conffile, pidfile, binary) = self.GetMiscFiles(prefix)
   logging.info('Killing %s', service)
   if not os.path.exists(conffile):
     logging.warn("Configuration file %s doesn't exist. Ignoring." % conffile)
   else:
     RemoveFile(conffile)
     if lb_reload:
       self.ForceLocalBabysitterConfigReload()
   pgid = ''
   if not os.path.exists(pidfile):
     # try to guess the process id
     logging.warn('PID file %s not found. Trying to find running processes.'
                  % pidfile)
     pgids = GetProcessGroupIDs(binary, self.__ver)
     # there should be at most one process group id
     if len(pgids) > 1:
       # TODO(zsyed): we may want to kill all group IDs? I don't even know
       # if this can happen.
       raise core_utils.GenericError, \
             "More than one instance of %s found" % binary
     if pgids:
       pgid = pgids[0]
   else:
     pid_file = open(pidfile, 'r')
     pgid = pid_file.read().strip()
     pid_file.close()
     RemoveFile(pidfile)
   if not pgid:
     logging.warn('No running processes found for %s, ver=%s. Assuming dead.' %
                  (binary, self.__ver))
   else:
     logging.info('Killing all processes in group %s.' % pgid)
     core_utils.ExecCmd('kill -9 -%s' % pgid,
                        'Killing process group %s' % pgid,
                        ignore_errors=1)
   logging.info('%s stopped.' % service)
Example #27
def main(argv):
  """Fetches args, reads the config file, and starts the checking.

  Args:
    argv: Arguments to the script, not used other than for checking for
          improper usage.
  """
  # argv should only contain the script that was executed
  if len(argv) > 1:
    del argv[0]
    _PrintUsageAndExit(error='This script takes no positional arguments: %s'
                       % str(argv))

  if not FLAGS.config:
    if FLAGS.users:
      if FLAGS.users.find(',') >= 0:
        # More than 1 user given, make user specify configuration
        _PrintUsageAndExit(error='--config is a required flag.')
      else:
        user = FLAGS.users
    else:
      user = os.environ.get('USER')

    email_regex = re.compile(_EMAIL_PAT % re.escape(user), re.MULTILINE)

    config_root = ('/google/src/head/depot/google3/production/tools/'
                   'check_groups/conf')
    if _CheckForEmail(email_regex, ['sre-team'], max_depth=0):
      logging.info('No --config specified, defaulting to sre.cfg')
      FLAGS.config = os.path.join(config_root, 'sre.cfg')
    elif _CheckForEmail(email_regex, ['quantitative-team'], max_depth=0):
      logging.info('No --config specified, defaulting to quant/quant.cfg')
      FLAGS.config = os.path.join(config_root, 'quant/quant.cfg')
    else:
      _PrintUsageAndExit(error='--config is a required flag.')

  if os.path.exists(FLAGS.config):
    pass
  elif os.path.exists(_DEFAULT_CONFIG_PATH % FLAGS.config):
    FLAGS.config = _DEFAULT_CONFIG_PATH % FLAGS.config
  else:
    _PrintUsageAndExit(error=('Config file %s does not exist or '
                              'I can\'t read it!') % FLAGS.config)

  config = ConfigParser.ConfigParser()
  config.optionxform = str  # Owner names are case sensitive.
  config.read(FLAGS.config)

  if FLAGS.list_classes:
    _ListClasses(config)
    sys.exit(0)

  # complain if they don't give us --classes
  if not FLAGS.classes:
    _PrintUsageAndExit(error='One of --classes or --list_classes is required.')

  # Complain if they're running out of /home/build, since they won't get
  # perforce checks, but don't die since they still get some use of it.
  if os.getcwd().startswith('/home/build'):
    logging.warn('**** Running out of /home/build: '
                 'Don\'t expect perforce checks to work. ****')

  # Default to $USER, but use --user if they give it
  if FLAGS.users:
    test_users = FLAGS.users.split(',')
  else:
    test_users = [os.environ.get('USER')]

  for each_user in test_users:
    logging.info('Starting analysis for user %s', each_user)
    cc = ConfigurationChecker(config, each_user)
    cc.CheckMemberships(FLAGS.classes)
  def Execute(self):

    logging.info('Autorunning dispatcher starting')

    draining = 0
    done = 0
    self._sleep_time = self._min_sleep_time

    signal.signal(signal.SIGTERM, SigtermHandler)
    # before starting the loop move any pending requests to
    # the in dir
    self._request_mgr.MoveAllPendingRequests()
    try:
      while not done:
        do_continue = 0
        try:
          request = None

          if not draining:

            #
            # Update the request_list, count waiting and running requests, and see
            # if we can get a new request to start.
            #
            self._request_mgr.LookForNewRequests()
            waiting_counts = self._request_mgr.GetWaitingCounts()
            running_counts = self.GetRunningCounts()
            request = self._request_mgr.GetNextRequest(waiting_counts,
                                                       running_counts)
            #
            # See if there is a command to dispatch
            #
            if request:
              request.InitStatuszFile(self._request_mgr.GetStatuszDir())
              reqtype = request.GetType()
              logging.info('=== "%s" request found' % reqtype)
              self._request_mgr.MoveRequestToPending(request)
              request.AddStatusz("Starting process")

              if reqtype == TERMINATE:
                #
                # Terminate requests make us stop doing new commands.
                #
                draining = 1
                logging.info('=== Draining running commands.')
                request.AddStatusz("Request drained")
                self._num_processed_success = self._num_processed_success + 1
                self.AddRecentRequest(request, 0)
                self._request_mgr.MoveRequestToSuccess(request)
              else:
                #
                # Validate the command and dispatch it if it looks good, continue
                # if this request was not valid.
                #
                if not self.ValidateRequest(request):
                  request.AddStatusz("Request has invalid parameters")
                  self._request_mgr.MoveRequestToFailure(request)
                  self._failure_handler(request)
                  self._num_processed_failure = self._num_processed_failure + 1
                  self.AddRecentRequest(request, 1)
                  # Note: cannot add a continue here in while: try:
                  do_continue = 1  # Go get another request, this one sucked.
                else:
                  command = RunnableCommand(request, self.GetCmdInfo(reqtype))
                  request.AddStatusz("Running request cmd [%s]" %
                                     command.GetCmd())
                  self.DispatchCommand(command)

          if not do_continue:
            #
            # See if any commands have completed or any IO is waiting.
            #
            command_did_something = self.PollRunningCommands()

            if not request and not command_did_something:
              #
              # No request started, nothing read, and nothing finished,
              # so sleep briefly, and increase length of next sleep.
              #
              time.sleep(self._sleep_time)
              self._sleep_time = min(self._sleep_time*SLEEP_TIME_FACTOR,
                                     self._max_sleep_time)
            else:
              # Something happened, so reduce sleep time to minimum
              self._sleep_time = self._min_sleep_time

            #
            # If we're draining and there are no running commands, we are done
            #
            if draining and self._running_commands == {}:
              logging.info('=== No more requests, exiting')
              done = 1

          #
          # Sync the request manager directories
          #
          self._request_mgr.SyncRequestDirs()

        except SIGTERMInterrupt:
          # Catch SIGTERM
          logging.warn('SIGTERMInterrupt caught, shutting down')
          draining = 1

    except KeyboardInterrupt:
      # Catch control-C
      logging.warn('KeyboardInterrupt, exiting immediately')
      raise
def SigtermHandler(dummy, _):
  logging.warn('SIGTERM received, shutting down')
  raise SIGTERMInterrupt