Example #1
    def add(self, machine, apc_outlet):
        """
    This adds a machine to the configuration
    """
        # We can add a machine only when we are in active state
        if install_utilities.install_state(
                self.cfg.getGlobalParam('VERSION')) != "ACTIVE":
            logging.error("Can add a machine only when we are in active state")
            return 1

        # First test for accessibility of the machine.
        if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
            logging.error("Could not ssh into the machine %s" % machine)
            return 1

        # start the svs on the remote machine
        restart_svs_cmd = "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
            self.cfg.getGlobalParam('ENTERPRISE_HOME'),
            self.cfg.getGlobalParam('ENTERPRISE_HOME'), machine)
        if E.execute([E.getCrtHostName()],
                         SECURE_WRAPPER_COMMAND % ( \
                              self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                              "-p2",
                              restart_svs_cmd),
                         None, 0) != E.ERR_OK:
            logging.error("Could not start svs on machine %s" % machine)
            return 1

        # wait for some time for svs to come up
        time.sleep(5)
        # check to see if the svs is up and is the right version
        if not svs_utilities.PingAndCheckSvsVersion(
                self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                self.cfg.getGlobalParam('ENTERPRISE_HOME'), machine):
            logging.error("Svs not running correctly on machine %s" % machine)
            return 1
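        # Cache the version, install home and test-version flag for later use.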
        ver = self.cfg.getGlobalParam('VERSION')
        home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
        testver = install_utilities.is_test(ver)

        # update MACHINES
        machines = self.cfg.getGlobalParam('MACHINES')
        if machine not in machines:
            machines.append(machine)
        self.cfg.setGlobalParam('MACHINES', machines)

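        # Remove any stale dead-node record for this machine from the lockserver.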
        ret = core_utils.RemDeadNode(ver, testver, machine)
        if ret:
            logging.error('Cannot remove dead node from lockserver.')
            # we ignore this error for now

        # We just added a new machine into the config
        # this will lead to a change in concentrator config
        # so we need to re-run serve service which will
        # write the new config and restart the concentrator
        serve_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
                          "./serve_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'))
        E.exe("%s %s" % (serve_cmd, "babysit"))

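        # Poll the machine parameter cache for the new node's disk facts (up to ~5 minutes).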
        num_tries = 5
        cur_try = 0
        while cur_try < num_tries:
            cur_try = cur_try + 1
            all_disks = self.cfg.mach_param_cache.GetFact(
                "mounted-drives", machine)
            bad_disks = self.cfg.mach_param_cache.GetFact(
                "var_log_badhds", machine)
            if bad_disks and all_disks:
                break
            time.sleep(60)
        if all_disks is None or bad_disks is None:
            logging.error("Could not get machine information about %s" %
                          machine)
            return 1

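        # Keep only disks not reported bad and map each to its data partition (e.g. sda -> hda3).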
        bad_disks = string.split(bad_disks, ' ')
        all_disks = string.split(all_disks, ' ')
        good_disks = filter(lambda x, y=bad_disks: x not in y, all_disks)
        good_disks = map(lambda x: "%s3" % x, good_disks)
        # change sda3 to hda3 etc.
        good_disks = map(lambda x: re.sub(r'^s', 'h', x), good_disks)

        # Preprocess disks before adding to remove duplicates.
        unique_good_disks = []
        for disk in good_disks:
            if disk not in unique_good_disks:
                unique_good_disks.append(disk)

        # Add disks
        self.updatedisk(machine, unique_good_disks, true)

        # apc map update
        apc_map = self.cfg.globalParams.var_copy('APC_MAP')
        apc_map[machine] = apc_util.PortMap(apc_outlet)
        if not self.cfg.setGlobalParam('APC_MAP', apc_map):
            logging.error("ERROR setting apc map to %s" % repr(apc_map))
            return 1

        # create appropriate datadirs on that machine
        if not self.cfg.createDataDirs([machine], node_replacement=1):
            logging.error("ERROR could not create datadirs on machine %s" %
                          machine)
            return 1

        # Replicate the config
        self.cfg.replicateConfigOnMachine(machine)

        # Reconfigure net on the target machine
        if not reconfigurenet_util.doReconfigureNet(
                self.cfg.globalParams, [machine], i_am_master=0):
            logging.error('reconfigurenet failed for %s' % machine)
            return 1

        # Start core services on the new node
        if not install_utilities.start_core(ver, home, [machine], ignore=0):
            logging.error("ERROR could not start core services on %s" %
                          machine)
            return 1
        # Add the chunkserver back
        gfs_utils.AddGFSChunkservers(ver, testver, [machine])

        # first we need to do Machine allocation.
        # this will assign things that will satisfy the constraints
        if not self.cfg.DoMachineAllocation(serversets=['workqueue-slave']):
            logging.error("ERROR doing machine allocation")
            return 1

        # now try to reallocate some servers from existing machines to the new machine
        replaced = self.cfg.AllocateServersToNewMachine(machine)
        if not replaced:
            logging.error("ERROR allocating services to the new machine")
            return 1

        # first we need to restart the babysitter
        E.exe("%s %s" % (serve_cmd, "babysit"))
        time.sleep(60)

        # Now we need to stop all the replaced services
        for server_string in replaced:
            server = serverlib.Server()
            server.InitFromName(server_string)
            replaced_type = server.servertype()
            kill_cmd = servertype.GetKillCmd(replaced_type, server.port())
            if E.execute([server.host()], kill_cmd, None, 1) != E.ERR_OK:
                logging.error("ERROR killing %s running on port %d on %s" % \
                                     (replaced_type, server.port(), server.host()))

        # we should make it active
        if not install_utilities.set_install_state(
                machine, self.cfg.getGlobalParam('ENTERPRISE_HOME'), "ACTIVE"):
            logging.error("ERROR changing state on machine %s. "
                          "Please make it active and activate and "
                          "start crawl service on it" % machine)
            return 1

        crawl_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
                          "./crawl_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'))
        if E.execute([machine], "%s %s" %
                     (crawl_cmd, "start"), None, 1) != E.ERR_OK:
            logging.error("Could not start crawl service on %s" % machine)
            return 1

        # save all the params
        self.cfg.saveParams()

        # for faster crawl recovery, let's restart all crawl processes
        self.restart_crawl_processes(serve_cmd)

        # activate the crawl and logcontrol service on the remote machine
        crawl_activate_cmd = "/etc/rc.d/init.d/crawl_%s activate >&/dev/null" \
                               "</dev/null" % self.cfg.getGlobalParam('VERSION')
        if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                                "-e",
                                crawl_activate_cmd),
                         None, 0) != E.ERR_OK:
            logging.error("Could not activate crawl service on machine %s" %
                          machine)
            logging.error("Please activate by hand")
            return 1
        log_activate_cmd = "/etc/rc.d/init.d/logcontrol_%s activate >&/dev/null" \
                               "</dev/null" % self.cfg.getGlobalParam('VERSION')
        if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                                "-e",
                               log_activate_cmd),
                         None, 0) != E.ERR_OK:
            logging.error(
                "Could not activate logcontrol service on machine %s" %
                machine)
            logging.error("Please activate by hand")
            return 1

        serve_activate_cmd = "/etc/rc.d/init.d/serve_%s activate >&/dev/null" \
                               "</dev/null" % self.cfg.getGlobalParam('VERSION')
        if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                                "-e",
                               serve_activate_cmd),
                         None, 0) != E.ERR_OK:
            logging.error("Could not activate serve service on machine %s" %
                          machine)
            logging.error("Please activate by hand")
            return 1

        logging.info("Machine %s successfully added into the system" % machine)

        if not mail_already_sent(M.MSG_MACHINEADDED % machine):
            SendMail.send(self.cfg, None, false, M.MSG_MACHINEADDED % machine,
                          "", true)
        return 0
  def change_install_state(self):
    """
    Tries to change the state of the present version to target_state.
    Returns true in case of success.
    Here is a summary of what it does:
      1. Gets the list of active nodes
      2. Gets the lists of services to start and stop
      3. In case there is something to start
          a. reconfigures the net on all nodes after verifying quorum
          b. starts core services
      4. Verifies there is a master elected.
      5. Starts a thread for each node to start and stop the needed services
      6. Waits for output from each thread
      7. Calculates success or failure based on thread results
      8. Asks each thread to print its status regarding what services
         it actually started or stopped and what was the return code and
         error message if any.
    """
    if not install_utilities.safe_transition(self.version_,
                                             self.target_state_):
      return 0

    current_state = install_utilities.install_state(self.version_)

    start = time.time()
    # make sure svs is running
    svs_utilities.CheckSvsAlive(self.machines_)

    # get list of active nodes
    active_nodes = core_utils.GetLiveNodes(logging, self.retry_)
    ignore = core_utils.GetNodeFailures(core_utils.GetTotalNodes())
    # account for already inactive nodes
    ignore = ignore - (core_utils.GetTotalNodes() - len(active_nodes))
    ver = self.version_
    home = self.enthome_

    # See what we have to start / stop
    services_to_start = install_utilities.state_services_to_start(
      self.target_state_, self.machines_)
    services_to_stop = install_utilities.state_services_to_stop(
      install_utilities.install_state(self.version_), self.machines_)

    # Make some decisions
    total_nodes = len(self.cp_.var('ENT_ALL_MACHINES'))
    onebox = (total_nodes == 1)
    startcore = services_to_start and not onebox and not self.nonecore_only_
    checkquorum = startcore
    stopcore =  (services_to_stop and not onebox and not self.nonecore_only_
                 and self.target_state_ == 'INACTIVE')
    doservices = (not self.core_only_
                  and (services_to_start or services_to_stop))
    if self.target_state_ in ['INACTIVE']:
      # ent_core does not really know the state. install_manager
      # has to tell ent_core when "makeinactive"
      testver = install_utilities.install_state(self.version_)
    else:
      testver = self.target_state_ in ['TEST', 'INSTALL']
    # If it is onebox and target state is INSTALL, do not run reconfigure_net
    # This is to support pre 4.4 version migration code.
    reconfigurenet_enabled = not (onebox and (self.target_state_ == 'INSTALL'))

    # if stopping only core services, check whether non-core components are running
    if (install_utilities.install_state(self.version_) == 'ACTIVE' and
        self.target_state_ == 'INACTIVE' and self.core_only_):
      logging.fatal("cannot stop core services while non-core services "\
                    "are running")

    # Execute the decisions
    if checkquorum:
      # We check quorum only when services are to be started.
      # We mainly need quorum for core services. For non-core services like
      # crawl, logcontrol etc. we use the user-specified machines.
      core_utils.VerifyQuorum(active_nodes)

    # check if syslogd.conf and klogd.conf exist
    install_utilities.check_klogd_syslogd_conf(active_nodes, home)

    # Kill any spurious adminrunner/adminconsole processes if we are entering
    # TEST or ACTIVE mode.
    if self.target_state_ in ['TEST', 'ACTIVE']:
      install_utilities.kill_service(['adminrunner', 'adminconsole'],
                                     core_utils.GetNodes(1))

    # reconfigure without restarting gems
    success = 1
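    # success tracks the overall outcome; later phases run only while it stays set.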
    if reconfigurenet_enabled and services_to_start:
      # check whether we need to force an NTP reconfig when upgrading from 4.4
      force_ntp_reconfig = 0
      if self.target_state_ in ['TEST', 'ACTIVE']:
        last_version = install_utilities.get_latest_version(except_for=1)
        if (last_version is None or
            version_utilities.CmpVersions(last_version,
                                          NEW_NTP_OPTION_GSA_VERSION) > 0):
          force_ntp_reconfig = 1
      success = reconfigurenet_util.doReconfigureNet(self.cp_,
                  active_nodes, force_ntp_reconfig=force_ntp_reconfig)
      if not success:
        logging.error('reconfigurenet failed.')

    # if starting only non-core services, check that core services are running
    if (not onebox and self.nonecore_only_ and
        self.target_state_ in ['TEST', 'ACTIVE']):
      core_running = install_utilities.is_core_running(ver, home,
                         active_nodes, ignore=ignore, testver=testver)
      if not core_running:
        logging.fatal("cannot start none core services "\
                      "when core services are not running")

    # start core services if needed
    if startcore and success:
      # Retry 3 times for master verification failures
      num_retry = 3
      # it is always OK to reinit core services if the version is in
      # INSTALLED state
      self.reinitok_ = install_utilities.reinit_core_ok(ver, home,
                         active_nodes, ignore=ignore, testver=testver)
      i = 1
      while i <= num_retry:
        # stop core services when retrying
        if i > 1:
          time.sleep(15)
          install_utilities.stop_core(ver, home, active_nodes, testver=testver)
          time.sleep(15)
        i = i + 1
        # Run ent_core --ver=<ver> --activate --gfs=0 through install_utilities.py
        success = install_utilities.start_core(ver, home, active_nodes,
                                               ignore=ignore,
                                               testver=testver,
                                               gfs=0)
        if not success:
          if i <= num_retry:
            logging.error('Error activating core services. Retrying...')
          elif self.reinitok_:
            # it is OK to ignore errors when trying to re-init core services
            install_utilities.reinit_core(ver, home, active_nodes, ignore=1,
                                          testver=testver)
            i = 1
            self.reinitok_ = None
          else:
            logging.error('Error activating core services.')
        else:
          # Make sure a master has been elected. If we go ahead without
          # verifying the master then it will take very long time for
          # services to be started. Making sure master is elected by now
          # results in very quick adminrunner startup.
          success = verify_master(ver, testver)
          if success:
            if core_utils.InitDeadNodes(ver, testver, logging) != 0:
              logging.fatal('Error updating dead nodes to the lockserver.')
            break
          if i <= num_retry:
            logging.error('Error verifying the master. Retrying...')
          elif self.reinitok_:
            # it is OK to ignore errors when trying to re-init core services
            install_utilities.reinit_core(ver, home, active_nodes, ignore=1,
                                          testver=testver)
            i = 1
            self.reinitok_ = None
          else:
            raise core_utils.EntMasterError, ('Error getting current GSA master'
                                              ' from chubby.')
      # force gsa master on the desired node
      desired_gsa_master_node = core_utils.DesiredMasterNode()
      if desired_gsa_master_node is None:
        logging.fatal('No suitable node to run GSA master')
      logging.info('Forcing %s to become GSA master' % desired_gsa_master_node)
      find_master.ForceMaster(desired_gsa_master_node, testver)

      # make sure the transaction logs are in sync and start gfs
      success = install_utilities.start_gfs(ver, home, active_nodes,
                                            ignore=ignore,
                                            testver=testver)

      # make sure gfs master is not the GSA master node
      logging.info('Ensuring %s not to become GFS master' %
                   desired_gsa_master_node)
      gfs_utils.AvoidGFSMasterOnNode(ver, testver, desired_gsa_master_node)

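    # Fan out one thread per node to start and stop the required services on that node.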
    if doservices and success:
      node_threads = {}
      for n in self.machines_:
        node_threads[n] = NodeInstallManager(n, self.target_state_,
                                             self.version_,
                                             services_to_start,
                                             services_to_stop)

      # start node threads
      for (n, t) in node_threads.items():
        logging.info('STATUS: Starting thread for %s' % n)
        t.start()

      # wait for threads
      for (n,t) in node_threads.items():
        t.join()
        success = success and (t.err_ == 0)

      for (n,t) in node_threads.items():
        t.print_status()

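    # When going INACTIVE, stop the core services, retrying on failure.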
    if stopcore and success:
      func = lambda: install_utilities.stop_core(ver, home, active_nodes,
                                          testver=testver)
      success = try_repeatedly(func, success=1)
      if not success:
        logging.error('Error inactivating core services.')

    # Start/Stop Borgmon and Reactor
    if self.cp_.var('ENT_ENABLE_EXTERNAL_BORGMON'):
      enable_external_borgmon = '--enable_external'
    else:
      enable_external_borgmon = '--noenable_external'
    borgmon_cmd = (
        "/export/hda3/%s/local/google3/enterprise/util/borgmon_util.py "
        "--ver %s --logtostderr %s" %
        (self.version_, self.version_, enable_external_borgmon))
    if success and current_state != self.target_state_:
      # 1) Stop Borgmon and Reactor if required
      if current_state in ['SERVE', 'TEST', 'ACTIVE']:
        E.execute(self.machines_,
                  "%s --mode %s --stop" % (borgmon_cmd, current_state),
                  None, 0)
      # 2) Start Borgmon and Reactor if required
      logging.info("target_state: %s" % self.target_state_)
      if self.target_state_ in ['SERVE', 'TEST', 'ACTIVE']:
        E.execute(self.machines_,
                  "%s --mode %s --start" % (borgmon_cmd, self.target_state_),
                  None, 0)

    # Start/Stop Session Manager only for oneways
    if core_utils.GetTotalNodes() == 1:
      if self.target_state_ in ['SERVE', 'TEST', 'ACTIVE']:
        sessionmanager_util.ActivateSessionManager(ver, testver)
      if self.target_state_ == 'INACTIVE' and success:
        sessionmanager_util.DeactivateSessionManager(ver, testver)

    # Kill any spurious adminrunner/adminconsole processes if we are entering
    # INACTIVE or SERVE mode.
    if self.target_state_ in ['SERVE', 'INACTIVE']:
      install_utilities.kill_service(['adminrunner', 'adminconsole'],
                                     core_utils.GetNodes(1))

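    # Run the inactivation cleanup across the active nodes.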
    if self.target_state_ == 'INACTIVE' and success and not self.nonecore_only_:
      install_utilities.InactivateCleanup(ver, home, active_nodes)

    end = time.time()
    diff = (end - start)/60
    logging.info("STAT: change_install_state took %.2f minutes." % diff)
    return success
Example #5
    def reconfigurenet(self):
        """ Call reconfigurenet script on all machines """

        return not reconfigurenet_util.doReconfigureNet(self.cfg.globalParams)