Example #1
def SyncOpLogs(all_machines, log_dir):
  """ This will sync the AdminRunner.OPERATOR.* logs to all machines """

  # We have to run this only on the master.
  master = find_master.FindMaster(2100, all_machines)

  # The name of this machine
  crt_machine = E.getCrtHostName()
  if len(master) == 1 and master[0] == crt_machine:
    for machine in all_machines:
      if machine != crt_machine:
        src_dir = '%s/AdminRunner.OPERATOR.*' % (log_dir)
        dest_dir = '%s:/%s' % (machine, log_dir)

        logging.info('Collecting operator logs from %s into %s' % (
          src_dir, dest_dir))

        rsync_cmd = 'rsync --timeout=20 --size-only -vau ' \
                     ' -e ssh %s %s/' % (src_dir, dest_dir)

        # rsync the logs
        lockfile = '%s/syncops_lock' % log_dir
        lock = E.acquire_lock(lockfile, 1, breakLockAfterGracePeriod=0)
        if lock is None:
          logging.info('Cannot grab the lock. Return!')
          return

        try:
          (status, output) = liblog.DoCommand(rsync_cmd)
          if status != 0:
            logging.error('Failed to collect logs from %s: %s' % (
              machine, output))
        finally:
          lock.close()
          os.unlink(lockfile)
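
The acquire/try/finally/unlink shape in this example recurs throughout the page. A minimal sketch of factoring it into a context manager, assuming only what these examples show about E.acquire_lock (it comes from the same E module the examples use, returns None on failure, and the returned lock exposes close()); the names locked and LockNotAcquired are hypothetical:

import contextlib
import os

class LockNotAcquired(Exception):
  """Raised when E.acquire_lock returns None (hypothetical helper)."""

@contextlib.contextmanager
def locked(lockfile, tries, break_after_grace=0):
  # Same call pattern as the examples on this page.
  lock = E.acquire_lock(lockfile, tries,
                        breakLockAfterGracePeriod=break_after_grace)
  if lock is None:
    raise LockNotAcquired(lockfile)
  try:
    yield lock
  finally:
    lock.close()
    os.unlink(lockfile)

With such a helper, SyncOpLogs could run each rsync inside a single with-block instead of repeating the cleanup code.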
Example #2
def CollectLogs(all_machines, gws_log_dir, log_collect_dir):
  # We only run this on oneway or master node of cluster.
  master = find_master.FindMaster(2100, all_machines)
  crt_machine = E.getCrtHostName()
  if len(all_machines) != 1 and (len(master) != 1 or master[0] != crt_machine):
    logging.info('Not a oneway or cluster master node. Return!')
    return

  lockfile = '%s/lock' % log_collect_dir
  # Wait up to 5 minutes for the lock.
  lock = E.acquire_lock(lockfile, 30, breakLockAfterGracePeriod=0)
  if lock is None:
    logging.info('Cannot grab the lock. Return!')
    return

  try:
    for machine in all_machines:
      src_pattern = '%s/partnerlog.*' % gws_log_dir
      dest_dir = '%s/%s' % (log_collect_dir, machine)

      # If it's a oneway or master node, we make a symlink to gws_log_dir
      # instead of rsyncing into the log_collect directory.
      if machine == crt_machine:
        # For backward compatibility, remove the old dest_dir if it is a
        # real directory left over from a previous version: older versions
        # created a dir and rsynced files even on master nodes and one-ways.
        if os.path.exists(dest_dir) and not os.path.islink(dest_dir):
          if not E.rm(master, '%s/*' % dest_dir) or not E.rmdir(master, dest_dir):
            logging.error('Directory %s exists and cannot be cleaned.', dest_dir)
            continue
          logging.info('Cleaned existing directory %s.', dest_dir)

        if E.ln(master, gws_log_dir, dest_dir):
          logging.info('Symlink %s to directory %s:%s for logs' %
                       (dest_dir, machine, gws_log_dir))
        else:
          logging.error('Cannot make a symlink from %s to %s' %
                        (dest_dir, gws_log_dir))
        continue

      # For non-master nodes on cluster, we need to rsync those files to master node
      logging.info('Collecting logs from %s:%s into %s' % (
        machine, src_pattern, dest_dir))

      # make log directories if needed
      liblog.MakeDir(dest_dir)

      # rsync all files from one remote machine in one command.
      rsync_cmd = 'rsync --timeout=60 --size-only -vau ' \
                  ' -e ssh %s:%s %s/' % (machine, src_pattern, dest_dir)

      # rsync the logs
      (status, output) = liblog.DoCommand(rsync_cmd)
      if status != 0:
        logging.error('Failed to collect logs from %s: %s' % (
          machine, output))
  finally:
    lock.close()
    os.unlink(lockfile)
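
Both functions above gate on the same condition: proceed only on a oneway (a single-machine install) or on the cluster master. A sketch of that check factored into a helper, reusing the find_master and E calls from the examples; the function name is hypothetical:

def is_oneway_or_master(all_machines, port=2100):
  # A oneway has exactly one machine; otherwise this host must be the
  # unique master that FindMaster reports on the given port.
  if len(all_machines) == 1:
    return True
  master = find_master.FindMaster(port, all_machines)
  return len(master) == 1 and master[0] == E.getCrtHostName()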
Example #3
def main(argv):
    if len(argv) != 1:
        sys.exit(__doc__)

    config = entconfig.EntConfig(argv[0])
    if not config.Load(): sys.exit(__doc__)

    # Collect syslogs only if active or serve
    state = install_utilities.install_state(config.var('VERSION'))
    if state not in ['ACTIVE', 'SERVE']:
        sys.exit(0)

    # Collect syslogs only from master node.
    if not isMaster(config):
        logging.fatal('Not a oneway or cluster master node. Return!')

    pywrapbase.InitGoogleScript('', [
        'foo',
        '--gfs_aliases=%s' % config.var("GFS_ALIASES"),
        '--bnsresolver_use_svelte=false', '--logtostderr'
    ], 0)
    gfile.Init()

    first_date, last_date, printable_date, file_date = \
                liblog.ParseDateRange('all', [])

    apache_main_dir = liblog.get_apache_dir(config)
    checkpoint_dir = liblog.get_syslog_checkpoint_dir(config)
    liblog.MakeGoogleDir(config, checkpoint_dir)

    if (config.var('SYSLOG_SERVER') is None
            or config.var('ENT_SYSLOG_GWS_FAC') is None):
        logging.fatal('SYSLOG logging is disabled')

    lockfile = '%s/syslog_lock' % config.var('LOGDIR')
    lock = E.acquire_lock(lockfile, 1, breakLockAfterGracePeriod=0)
    if not lock:
        return

    try:
        logger = syslog_client.SyslogClient(config.var('SYSLOG_SERVER'),
                                            config.var('EXTERNAL_WEB_IP'))
        logging.info("syslog-server = %s" % config.var('SYSLOG_SERVER'))
        for collection in os.listdir(apache_main_dir):
            apache_dir = '%s/%s' % (apache_main_dir, collection)
            checkpoint_file = "%s/%s" % (checkpoint_dir, collection)
            apache_logs = liblog.FindLogFiles(apache_dir, first_date,
                                              last_date)
            logging.info('syslog handles collection %s' % collection)
            if not SyslogLogs(apache_logs, apache_dir, checkpoint_file, logger,
                              config):
                sys.exit('Updating failed')

        logger.close()
    finally:
        lock.close()
        os.unlink(lockfile)

    sys.exit(0)
Example #4
  def saveToDisk(self):
    """ Save it back to disk. This is needed if it has changed."""
    if not self.hasChanged():
      return

    lockfile = '%s.lock' % self.filename
    lock = E.acquire_lock(lockfile, 5, breakLockAfterGracePeriod=True)

    try:
      f = open(self.filename, 'w')
      colls = self.collection_dir_map.keys()
      colls.sort()
      for collection in colls:
        directories = " ".join(self.collection_dir_map[collection])
        f.write('%s %s\n' % (collection, directories))
      f.close()
      # Reset the flag to be safe. Normally, this is called at
      # the end of a script's execution.
      self.changed = False
    finally:
      lock.close()
      os.unlink(lockfile)
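
A design note on saveToDisk: it writes straight to self.filename, so a crash mid-write can leave a truncated file even though the lock serializes writers. A common hardening, sketched here with a hypothetical temp path and helper name, is to write a temporary file and rename it into place, since rename is atomic on POSIX filesystems:

import os

def save_atomically(filename, collection_dir_map):
  tmpname = '%s.tmp' % filename  # hypothetical temp path
  f = open(tmpname, 'w')
  try:
    for collection in sorted(collection_dir_map):
      f.write('%s %s\n' % (collection,
                           " ".join(collection_dir_map[collection])))
  finally:
    f.close()
  os.rename(tmpname, filename)  # atomic replace on POSIX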
Example #5
  def prereq_check(self, send_email, collections):
    """
    This checks the prerequisites for all collection, updates the epochs to
    serve from and (optionally) sends a mail.
    """
    if collections is not None:
      collections = string.strip(collections)
    if not collections:
      collections = ent_collection.ListCollections(self.cfg.globalParams)
    else:
      collections = map(lambda c, p = self.cfg.globalParams:
                        ent_collection.EntCollection(c, p),
                        map(string.strip, string.split(collections, ","))
                        )
    # No collections -- exit quickly
    if not collections:
      return {}

    send_email = string.atoi(send_email)
    epochs = self.cfg.getGlobalParam('ENTERPRISE_EPOCHS')
    gwssers = self.cfg.globalParams.GetServerHostPorts("web")
    jobs = []
    for c in collections:
      collection = ent_collection.EntCollection(c, self.cfg.globalParams)
      # Write the testwords in a copy file
      filename = collection.get_var('TESTWORDS')
      filename_copy = "%s_" % filename
      open(filename_copy, "w").write(open(filename, "r").read())
      num = collection.get_var("TESTWORDS_IN_FIRST")
      jobs.append((self.cfg, gwssers, c, filename_copy, epochs, num))

    # Lock a file so we test once at a time
    lock_file = "%s/prerequisites_lock" % self.cfg.getGlobalParam("TMPDIR")
    flock = E.acquire_lock(lock_file, 12)

    try:
      # Run the tests -- one per thread ...
      # see how many threads to spawn
      if len(jobs) >= NUM_THREADS:
        num_threads = NUM_THREADS
      else:
        num_threads = len(jobs)

      # create the threads - workers
      threads = []
      for n in range(0, num_threads):
        threads.append(Runner(n, jobs))

      # start the threads
      for thread in threads[:-1]:
        thread.start()
      # the current thread runs the last one
      threads[-1].run()

      # wait to collect the errors at the end
      errors = threads[-1].errors
      max_epochs = threads[-1].max_epochs
      for thread in threads[:-1]:
        thread.join()
        for k, v in thread.max_epochs.items():
          max_epochs[k] = v
        for k, v in thread.errors.items():
          errors[k] = v

      # prepare and send a nice :) message
      if errors and send_email:
        last_msg_time = self.cfg.getGlobalParam('LAST_PREREQUISITES_EMAIL_TIME')
        email_interval = self.cfg.getGlobalParam('ENTERPRISE_INTER_EMAIL_TIME')
        now = int(time.time())
        if now - last_msg_time > email_interval:
          msg = [M.MSG_PREREQ_FAIL]
          msg.extend(map(
            lambda (c, e): "Collection %s generated a wrong answer for %s" %
            (c, string.join(e, ",")), errors.items()))
          SendMail.send(self.cfg, None, 1, M.MSG_PREREQ_FAIL_SUBJECT,
                        string.join(msg, "\n"), 1)
          self.cfg.globalParams.set_var('LAST_PREREQUISITES_EMAIL_TIME', now)

      self.cfg.globalParams.set_var('LAST_PREREQUISITES_CHECK',
                                    time.strftime("%Y/%m/%d %H:%M:%S"))

      epochs.sort()
      cur_epoch = epochs[-1]
      for c in collections:
        collection = ent_collection.EntCollection(c, self.cfg.globalParams)
        collection.set_var('LAST_PREREQUISITES_ERRORS', errors.get(c, []))
        # EPOCHS_SERVING has two values in the form "es[0] es[1]"
        # es[0]: the epoch prereq_check asks us to serve, where
        #        -1 means no epoch answers OK,
        #        -2 means the current index answers OK
        # es[1]: the epoch the user set from the UI; -2 means use the
        #        most recent valid epoch
        # The serving logic is as follows:
        # -- if the user set a specific epoch (es[1] >= 0), serve es[1]
        # -- if the user set the most recent valid epoch (es[1] == -2),
        #      then serve the current index if no/all epochs answer OK
        #      (es[0] == -1 or es[0] == -2);
        #      otherwise (es[0] >= 0) serve from es[0]
        #
        es = string.split(string.strip(
          open(collection.get_var('EPOCHS_SERVING'), "r").read()), " ")

        # The epoch prereq_check asks us to serve from: -2 means the
        # current index is OK, -1 means no epoch answers OK (as returned
        # by the checker).
        epoch = max_epochs.get(c, -2)
        if not errors.has_key(c): epoch = -2
        # initialize EPOCHS_SERVING
        if not es or len(es) == 1:
          es = [epoch, -2]
        else:
          es = map(string.atoi, es)
          # If this change causes an automatic rollback, which means
          # - the user chose the most recent valid epoch, and
          # - the new epoch differs from the previous epoch, and
          # - the change is not from -1 -> -2 or -2 -> -1,
          # we log it in the AdminRunner operations log.
          if es[1] == -2 and epoch != es[0] and (es[0] + epoch != -3):
            epochs_to_time = self.cfg.getGlobalParam('ENTERPRISE_EPOCHS_ENDTIME')
            epoch_time = epochs_to_time.get(epoch, M.MSG_EPOCH_CURRENT_TIME)
            self.writeAdminRunnerOpMsg(M.MSG_UI_LOG_INDEX_ROLLBACK % epoch_time)
          es[0] = epoch

        collection.set_file_var_content('EPOCHS_SERVING',
                                        string.join(map(str, es)), 0)

        # also check if the current serving epoch for the collection
        # is the most recent one, if not, send a warning email
        if send_email and ((es[1] == -2 and es[0] >= 0) or
                           (es[1] >= 0 and es[1] != cur_epoch)):
          last_msg_time = self.cfg.getGlobalParam(
            'LAST_SERVING_EPOCH_WARNING_EMAIL_TIME')
          email_interval = self.cfg.getGlobalParam(
            'ENTERPRISE_INTER_EMAIL_TIME')
          now = int(time.time())
          if now - last_msg_time > email_interval:
            SendMail.send(self.cfg, None, 0,
                          M.MSG_SERVING_EPOCH_NOT_CURRENT % c, "", 0)
            self.cfg.globalParams.set_var(
              'LAST_SERVING_EPOCH_WARNING_EMAIL_TIME', now)

      self.cfg.saveParams()
    finally:
      flock.close()

    return errors
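
The comment block above amounts to a small decision rule for which epoch to serve. A standalone sketch of just that rule, with hypothetical names and stripped of the surrounding file I/O:

def epoch_to_serve(es, current_epoch):
  # es[0]: prereq_check's verdict (-1: no epoch answers OK,
  #        -2: the current index answers OK, >= 0: a validated epoch).
  # es[1]: the user's choice (-2: use the most recent valid epoch).
  if es[1] >= 0:
    return es[1]            # the user pinned a specific epoch
  if es[0] in (-1, -2):
    return current_epoch    # serve the current index
  return es[0]              # serve the epoch prereq_check validated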