def run(self):
    """Stop services, switch the install STATE, then start services.

    The three phases run in order and the method stops at the first
    failure:

      1. Stop every service in self.services_to_stop_; each service is
         appended to self.stopped_ once its stop succeeded.
      2. Set the install state of self.machine_ to self.target_state_.
      3. Start every service in self.services_to_start_; each service is
         appended to self.started_ once its start succeeded.

    Nothing is returned.  On failure self.msg_ is set to a description and
    self.err_ to a truthy error code; neither attribute is touched on
    success.
    """
    # Phase 1: stop the requested services.
    for service in self.services_to_stop_:
        logging.info("STATUS: STOP %s service %s on %s" %
                     (service, self.version_, self.machine_))
        # Bind the loop variable as a default so the callable keeps this
        # iteration's service no matter when it is invoked.
        ret = try_repeatedly(lambda svc=service: self.stop_service(svc),
                             success=0)
        if ret:
            # With success=0, any truthy result is treated as a failure.
            self.msg_ = "Error stopping %s on %s." % (service,
                                                      self.machine_)
            logging.error("STATUS: %s" % self.msg_)
            self.err_ = ret
            return
        self.stopped_.append(service)

    # Phase 2: change the STATE file.
    # TODO(zsyed): Move STATE file to chubby.
    logging.info("STATE: version %s on %s: %s" %
                 (self.version_, self.machine_, self.target_state_))
    if not install_utilities.set_install_state(
            self.machine_, E.getEnterpriseHome(), self.target_state_):
        # Well here is not clear that we want to exit half way ..
        logging.error("ERROR changing state on machine %s. " \
                      "Please rollback and try again" % self.machine_ )
        # The last try_repeatedly result is always falsy at this point
        # (a truthy one returned above), so storing it here would record
        # this failure as "no error".  Use an explicit non-zero code.
        self.err_ = 1
        self.msg_ = "Cannot change STATE file."
        return

    # Phase 3: start the requested services.
    for service in self.services_to_start_:
        logging.info("STATUS: START %s service %s on %s" %
                     (service, self.version_, self.machine_))
        ret = try_repeatedly(lambda svc=service: self.start_service(svc),
                             success=0)
        if ret:
            self.msg_ = "Error starting %s on %s" % (service,
                                                     self.machine_)
            logging.error("STATUS: %s" % self.msg_)
            self.err_ = ret
            return
        self.started_.append(service)
 def run(self):
   """Stop services, switch the install STATE, then start services.

   The three phases run in order and the method stops at the first failure:

     1. Stop every service in self.services_to_stop_; each service is
        appended to self.stopped_ once its stop succeeded.
     2. Set the install state of self.machine_ to self.target_state_.
     3. Start every service in self.services_to_start_; each service is
        appended to self.started_ once its start succeeded.

   Nothing is returned.  On failure self.msg_ is set to a description and
   self.err_ to a truthy error code; neither attribute is touched on success.
   """
   # Phase 1: stop the requested services.
   for service in self.services_to_stop_:
     logging.info("STATUS: STOP %s service %s on %s" % (
       service, self.version_, self.machine_))
     # Bind the loop variable as a default so the callable keeps this
     # iteration's service no matter when it is invoked.
     ret = try_repeatedly(lambda svc=service: self.stop_service(svc),
                          success=0)
     if ret:
       # With success=0, any truthy result is treated as a failure.
       self.msg_ = "Error stopping %s on %s." % (service, self.machine_)
       logging.error("STATUS: %s" % self.msg_)
       self.err_ = ret
       return
     self.stopped_.append(service)

   # Phase 2: change the STATE file.
   # TODO(zsyed): Move STATE file to chubby.
   logging.info("STATE: version %s on %s: %s" % (
     self.version_, self.machine_, self.target_state_))
   if not install_utilities.set_install_state(
     self.machine_, E.getEnterpriseHome(), self.target_state_):
     # Well here is not clear that we want to exit half way ..
     logging.error("ERROR changing state on machine %s. " \
                   "Please rollback and try again" % self.machine_ )
     # The last try_repeatedly result is always falsy at this point (a truthy
     # one returned above), so storing it here would record this failure as
     # "no error".  Use an explicit non-zero code instead.
     self.err_ = 1
     self.msg_ = "Cannot change STATE file."
     return

   # Phase 3: start the requested services.
   for service in self.services_to_start_:
     logging.info("STATUS: START %s service %s on %s" % (
         service, self.version_, self.machine_))
     ret = try_repeatedly(lambda svc=service: self.start_service(svc),
                          success=0)
     if ret:
       self.msg_ = "Error starting %s on %s" % (service, self.machine_)
       logging.error("STATUS: %s" % self.msg_)
       self.err_ = ret
       return
     self.started_.append(service)
Example #3
0
    def add(self, machine, apc_outlet):
        """Add a machine to the configuration and bring its services up.

        The steps run strictly in order, and the method returns as soon as a
        mandatory step fails:

          1. Require the current version to be in the "ACTIVE" install state
             and the machine to answer a remote 'echo 1'.
          2. Restart svs for the machine and verify it with
             PingAndCheckSvsVersion.
          3. Record the machine in MACHINES, run serve_service "babysit" and
             register the machine's usable disks.
          4. Update APC_MAP, create data dirs, replicate the config,
             reconfigure the network and start core services.
          5. Allocate servers to the machine and kill the replaced servers.
          6. Mark the machine "ACTIVE", start and activate the crawl,
             logcontrol and serve services, and send a notification mail.

        Args:
          machine: name of the host to add.
          apc_outlet: value handed to apc_util.PortMap() to build this
            machine's APC_MAP entry.

        Returns:
          0 on success, 1 if any mandatory step fails.
        """
        # We can add a machine only when we are in active state
        state = install_utilities.install_state(
            self.cfg.getGlobalParam('VERSION'))
        if state != "ACTIVE":
            logging.error("Can add a machine only when we are in active state")
            return 1

        # First test for accessibility of the machine.
        if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
            logging.error("Could not ssh into the machine %s" % machine)
            return 1

        # start the svs on the remote machine
        restart_svs_cmd = (
            "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                machine))
        local_hosts = [E.getCrtHostName()]
        wrapped_svs_cmd = SECURE_WRAPPER_COMMAND % (
            self.cfg.getGlobalParam('ENTERPRISE_HOME'),
            "-p2",
            restart_svs_cmd)
        if E.execute(local_hosts, wrapped_svs_cmd, None, 0) != E.ERR_OK:
            logging.error("Could not start svs on machine %s" % machine)
            return 1

        # wait for some time for svs to come up
        time.sleep(5)
        # check to see if the svs is up and is the right version
        if not svs_utilities.PingAndCheckSvsVersion(
                self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                machine):
            logging.error("Svs not running correctly on machine %s" % machine)
            return 1
        ver = self.cfg.getGlobalParam('VERSION')
        home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
        testver = install_utilities.is_test(ver)

        # update MACHINES
        machines = self.cfg.getGlobalParam('MACHINES')
        if machine not in machines:
            machines.append(machine)
        self.cfg.setGlobalParam('MACHINES', machines)

        if core_utils.RemDeadNode(ver, testver, machine):
            # we ignore this error for now
            logging.error('Cannot remove dead node from lockserver.')

        # We just added a new machine into the config
        # this will lead to a change in concentrator config
        # so we need to re-run serve service which will
        # write the new config and restart the concentrator
        serve_cmd = (
            ". %s && cd %s/local/google3/enterprise/legacy/scripts && "
            "./serve_service.py %s" % (
                self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                self.cfg.getGlobalParam('ENTERPRISE_HOME')))
        E.exe("%s %s" % (serve_cmd, "babysit"))

        # Poll the machine facts until both are non-empty, at most 5 times.
        # NOTE(review): the wait also happens after the last attempt; kept so
        # the timing matches the previous implementation.
        for _ in range(5):
            all_disks = self.cfg.mach_param_cache.GetFact(
                "mounted-drives", machine)
            bad_disks = self.cfg.mach_param_cache.GetFact(
                "var_log_badhds", machine)
            if bad_disks and all_disks:
                break
            time.sleep(60)
        if all_disks is None or bad_disks is None:
            logging.error("Could not get machine information about %s" %
                          machine)
            return 1

        bad_disks = bad_disks.split(' ')
        all_disks = all_disks.split(' ')
        # Keep the disks not reported bad, address partition 3 of each, and
        # change sda3 to hda3 etc.
        good_disks = [re.sub(r'^s', 'h', "%s3" % disk)
                      for disk in all_disks if disk not in bad_disks]

        # Preprocess disks before adding to remove duplicates (keeps order).
        unique_good_disks = []
        for disk in good_disks:
            if disk not in unique_good_disks:
                unique_good_disks.append(disk)

        # Add disks
        self.updatedisk(machine, unique_good_disks, true)

        # apc map update
        apc_map = self.cfg.globalParams.var_copy('APC_MAP')
        apc_map[machine] = apc_util.PortMap(apc_outlet)
        if not self.cfg.setGlobalParam('APC_MAP', apc_map):
            logging.error("ERROR setting apc map to %s" % repr(apc_map))
            return 1

        # create appropriate datadirs on that machine
        if not self.cfg.createDataDirs([machine], node_replacement=1):
            logging.error("ERROR could not create datadirs on machine %s" %
                          machine)
            return 1

        # Replicate the config
        self.cfg.replicateConfigOnMachine(machine)

        # Reconfigure net on the target machine
        if not reconfigurenet_util.doReconfigureNet(
                self.cfg.globalParams, [machine], i_am_master=0):
            logging.error('reconfigurenet failed for %s' % machine)
            return 1

        # Start core services on the new node
        if not install_utilities.start_core(ver, home, [machine], ignore=0):
            logging.error("ERROR could not start core services on %s" %
                          machine)
            return 1
        # Add the chunkserver back
        gfs_utils.AddGFSChunkservers(ver, testver, [machine])

        # first we need to do Machine allocation.
        # this will assign things that will satisfy the constraints
        if not self.cfg.DoMachineAllocation(serversets=['workqueue-slave']):
            logging.error("ERROR doing machine allocation")
            return 1

        # now try to reallocate some servers from existing machines to the
        # new machine
        replaced = self.cfg.AllocateServersToNewMachine(machine)
        if not replaced:
            logging.error("ERROR allocating services to the new machine")
            return 1

        # first we need to restart the babysitter
        E.exe("%s %s" % (serve_cmd, "babysit"))
        time.sleep(60)

        # Now we need to stop all the replaced services.  A failed kill is
        # only logged; the remaining servers are still processed.
        for server_string in replaced:
            server = serverlib.Server()
            server.InitFromName(server_string)
            replaced_type = server.servertype()
            kill_cmd = servertype.GetKillCmd(replaced_type, server.port())
            if E.execute([server.host()], kill_cmd, None, 1) != E.ERR_OK:
                logging.error(
                    "ERROR killing %s running on port %d on %s" %
                    (replaced_type, server.port(), server.host()))

        # we should make it active
        if not install_utilities.set_install_state(
                machine, self.cfg.getGlobalParam('ENTERPRISE_HOME'), "ACTIVE"):
            logging.error("ERROR changing state on machine %s. "
                          "Please make it active and activate and "
                          "start crawl service on it" % machine)
            return 1

        crawl_cmd = (
            ". %s && cd %s/local/google3/enterprise/legacy/scripts && "
            "./crawl_service.py %s" % (
                self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                self.cfg.getGlobalParam('ENTERPRISE_HOME')))
        if E.execute([machine], "%s %s" % (crawl_cmd, "start"),
                     None, 1) != E.ERR_OK:
            logging.error("Could not start crawl service on %s" % machine)
            return 1

        # save all the params
        self.cfg.saveParams()

        # for faster crawl recovery, lets restart all crawl processes
        self.restart_crawl_processes(serve_cmd)

        # activate the crawl, logcontrol and serve services on the remote
        # machine, in that order; stop at the first one that fails
        for service in ("crawl", "logcontrol", "serve"):
            activate_cmd = (
                "/etc/rc.d/init.d/%s_%s activate >&/dev/null</dev/null" %
                (service, self.cfg.getGlobalParam('VERSION')))
            wrapped_cmd = SECURE_WRAPPER_COMMAND % (
                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                "-e",
                activate_cmd)
            if E.execute([machine], wrapped_cmd, None, 0) != E.ERR_OK:
                logging.error("Could not activate %s service on machine %s" %
                              (service, machine))
                logging.error("Please activate by hand")
                return 1

        logging.info("Machine %s successfully added into the system" % machine)

        subject = M.MSG_MACHINEADDED % machine
        if not mail_already_sent(subject):
            SendMail.send(self.cfg, None, false, subject, "", true)
        return 0
  def add(self, machine, apc_outlet):
    """Add a machine to the configuration and bring its services up.

    The steps run strictly in order, and the method returns as soon as a
    mandatory step fails:

      1. Require the current version to be in the "ACTIVE" install state and
         the machine to answer a remote 'echo 1'.
      2. Restart svs for the machine and verify it with
         PingAndCheckSvsVersion.
      3. Record the machine in MACHINES, run serve_service "babysit" and
         register the machine's usable disks.
      4. Update APC_MAP, create data dirs, replicate the config, reconfigure
         the network and start core services.
      5. Allocate servers to the machine and kill the replaced servers.
      6. Mark the machine "ACTIVE", start and activate the crawl, logcontrol
         and serve services, and send a notification mail.

    Args:
      machine: name of the host to add.
      apc_outlet: value handed to apc_util.PortMap() to build this machine's
        APC_MAP entry.

    Returns:
      0 on success, 1 if any mandatory step fails.
    """
    # We can add a machine only when we are in active state
    state = install_utilities.install_state(self.cfg.getGlobalParam('VERSION'))
    if state != "ACTIVE":
      logging.error("Can add a machine only when we are in active state")
      return 1

    # First test for accessibility of the machine.
    if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
      logging.error("Could not ssh into the machine %s" % machine)
      return 1

    # start the svs on the remote machine
    restart_svs_cmd = (
      "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
        self.cfg.getGlobalParam('ENTERPRISE_HOME'),
        self.cfg.getGlobalParam('ENTERPRISE_HOME'),
        machine))
    local_hosts = [E.getCrtHostName()]
    wrapped_svs_cmd = SECURE_WRAPPER_COMMAND % (
      self.cfg.getGlobalParam('ENTERPRISE_HOME'),
      "-p2",
      restart_svs_cmd)
    if E.execute(local_hosts, wrapped_svs_cmd, None, 0) != E.ERR_OK:
      logging.error("Could not start svs on machine %s" % machine)
      return 1

    # wait for some time for svs to come up
    time.sleep(5)
    # check to see if the svs is up and is the right version
    if not svs_utilities.PingAndCheckSvsVersion(
        self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
        self.cfg.getGlobalParam('ENTERPRISE_HOME'),
        machine):
      logging.error("Svs not running correctly on machine %s" % machine)
      return 1
    ver = self.cfg.getGlobalParam('VERSION')
    home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
    testver = install_utilities.is_test(ver)

    # update MACHINES
    machines = self.cfg.getGlobalParam('MACHINES')
    if machine not in machines:
      machines.append(machine)
    self.cfg.setGlobalParam('MACHINES', machines)

    if core_utils.RemDeadNode(ver, testver, machine):
      # we ignore this error for now
      logging.error('Cannot remove dead node from lockserver.')

    # We just added a new machine into the config
    # this will lead to a change in concentrator config
    # so we need to re-run serve service which will
    # write the new config and restart the concentrator
    serve_cmd = (
      ". %s && cd %s/local/google3/enterprise/legacy/scripts && "
      "./serve_service.py %s" % (
        self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
        self.cfg.getGlobalParam('ENTERPRISE_HOME'),
        self.cfg.getGlobalParam('ENTERPRISE_HOME')))
    E.exe("%s %s" % (serve_cmd, "babysit"))

    # Poll the machine facts until both are non-empty, at most 5 times.
    # NOTE(review): the wait also happens after the last attempt; kept so the
    # timing matches the previous implementation.
    for _ in range(5):
      all_disks = self.cfg.mach_param_cache.GetFact("mounted-drives", machine)
      bad_disks = self.cfg.mach_param_cache.GetFact("var_log_badhds", machine)
      if bad_disks and all_disks:
        break
      time.sleep(60)
    if all_disks is None or bad_disks is None:
      logging.error("Could not get machine information about %s" % machine)
      return 1

    bad_disks = bad_disks.split(' ')
    all_disks = all_disks.split(' ')
    # Keep the disks not reported bad, address partition 3 of each, and
    # change sda3 to hda3 etc.
    good_disks = [re.sub(r'^s', 'h', "%s3" % disk)
                  for disk in all_disks if disk not in bad_disks]

    # Preprocess disks before adding to remove duplicates (keeps order).
    unique_good_disks = []
    for disk in good_disks:
      if disk not in unique_good_disks:
        unique_good_disks.append(disk)

    # Add disks
    self.updatedisk(machine, unique_good_disks, true)

    # apc map update
    apc_map = self.cfg.globalParams.var_copy('APC_MAP')
    apc_map[machine] = apc_util.PortMap(apc_outlet)
    if not self.cfg.setGlobalParam('APC_MAP', apc_map):
      logging.error("ERROR setting apc map to %s" % repr(apc_map))
      return 1

    # create appropriate datadirs on that machine
    if not self.cfg.createDataDirs([machine], node_replacement = 1):
      logging.error("ERROR could not create datadirs on machine %s" % machine)
      return 1

    # Replicate the config
    self.cfg.replicateConfigOnMachine(machine)

    # Reconfigure net on the target machine
    if not reconfigurenet_util.doReconfigureNet(self.cfg.globalParams,
                                                [machine], i_am_master=0):
      logging.error('reconfigurenet failed for %s' % machine)
      return 1

    # Start core services on the new node
    if not install_utilities.start_core(ver, home, [machine], ignore=0):
      logging.error("ERROR could not start core services on %s" % machine)
      return 1
    # Add the chunkserver back
    gfs_utils.AddGFSChunkservers(ver, testver, [machine])

    # first we need to do Machine allocation.
    # this will assign things that will satisfy the constraints
    if not self.cfg.DoMachineAllocation(serversets=['workqueue-slave']):
      logging.error("ERROR doing machine allocation")
      return 1

    # now try to reallocate some servers from existing machines to the new
    # machine
    replaced = self.cfg.AllocateServersToNewMachine(machine)
    if not replaced:
      logging.error("ERROR allocating services to the new machine")
      return 1

    # first we need to restart the babysitter
    E.exe("%s %s" % (serve_cmd, "babysit"))
    time.sleep(60)

    # Now we need to stop all the replaced services.  A failed kill is only
    # logged; the remaining servers are still processed.
    for server_string in replaced:
      server = serverlib.Server()
      server.InitFromName(server_string)
      replaced_type = server.servertype()
      kill_cmd = servertype.GetKillCmd(replaced_type, server.port())
      if E.execute([server.host()], kill_cmd, None, 1) != E.ERR_OK:
        logging.error("ERROR killing %s running on port %d on %s" % \
                             (replaced_type, server.port(), server.host()))

    # we should make it active
    if not install_utilities.set_install_state(
        machine, self.cfg.getGlobalParam('ENTERPRISE_HOME'), "ACTIVE"):
      logging.error("ERROR changing state on machine %s. "
                    "Please make it active and activate and "
                    "start crawl service on it" % machine)
      return 1

    crawl_cmd = (
      ". %s && cd %s/local/google3/enterprise/legacy/scripts && "
      "./crawl_service.py %s" % (
        self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
        self.cfg.getGlobalParam('ENTERPRISE_HOME'),
        self.cfg.getGlobalParam('ENTERPRISE_HOME')))
    if E.execute([machine], "%s %s" % (crawl_cmd, "start"), None, 1) != E.ERR_OK:
      logging.error("Could not start crawl service on %s" % machine)
      return 1

    # save all the params
    self.cfg.saveParams()

    # for faster crawl recovery, lets restart all crawl processes
    self.restart_crawl_processes(serve_cmd)

    # activate the crawl, logcontrol and serve services on the remote
    # machine, in that order; stop at the first one that fails
    for service in ("crawl", "logcontrol", "serve"):
      activate_cmd = (
        "/etc/rc.d/init.d/%s_%s activate >&/dev/null</dev/null" %
        (service, self.cfg.getGlobalParam('VERSION')))
      wrapped_cmd = SECURE_WRAPPER_COMMAND % (
        self.cfg.getGlobalParam('ENTERPRISE_HOME'),
        "-e",
        activate_cmd)
      if E.execute([machine], wrapped_cmd, None, 0) != E.ERR_OK:
        logging.error("Could not activate %s service on machine %s" %
                      (service, machine))
        logging.error("Please activate by hand")
        return 1

    logging.info("Machine %s successfully added into the system" % machine)

    subject = M.MSG_MACHINEADDED % machine
    if not mail_already_sent(subject):
      SendMail.send(self.cfg, None, false, subject, "", true)
    return 0