def _RunServeCmd(cfg, version, cmd, allnodes=0):
  """Runs serve_service.py with the given sub-command.

  Args:
    cfg: configuration object queried through getGlobalParam().
    version: version string used to build the /export/hda3/<version> path.
    cmd: 'stop', 'start', 'activate', 'deactivate'
    allnodes: 1 to run command on all nodes; otherwise only the current host.

  Returns:
    0 when E.execute reports E.ERR_OK, 1 otherwise.
  """
  serve_service_cmd = (
      '/export/hda3/%s/local/google3/enterprise/legacy/scripts/'
      'serve_service.py %s %s' % (
          version, cfg.getGlobalParam('ENTERPRISE_HOME'), cmd))
  logging.info('Running: %s' % serve_service_cmd)

  # Either every configured machine or just this one.
  if allnodes:
    targets = cfg.getGlobalParam(C.MACHINES)
  else:
    targets = [E.getCrtHostName()]

  wrapped_cmd = SECURE_WRAPPER_COMMAND % (
      cfg.getGlobalParam('ENTERPRISE_HOME'), '-p2', serve_service_cmd)
  status = E.execute(targets, wrapped_cmd, None, 0)
  if status != E.ERR_OK:
    logging.error('%s: failed' % serve_service_cmd)
    return 1

  logging.info('%s: completed' % serve_service_cmd)
  return 0
def SyncOneboxLog(config):
  """Syncs the local Onebox log file with the GFS Onebox log file.

  Does nothing unless ENT_CONFIG_TYPE is 'CLUSTER' and the current host is
  the first machine listed for the 'oneboxenterprise' port.  A fileutil
  'equalize' is tried first; if it reports an error the whole file is copied
  with 'cp -f'.

  As of 4.6.4, this is called from scripts/periodic_script.py and from
  onebox_handler.py, when the user does View Log AND the machine is a cluster.

  Args:
    config: configuration object exposing SERVERS and var().
  """
  port = servertype.GetPortBase('oneboxenterprise')
  onebox_machines = config.SERVERS[port]
  this_host = E.getCrtHostName()
  config_type = config.var('ENT_CONFIG_TYPE')

  # If onebox server is not running here there is no need to sync.
  if not (config_type == 'CLUSTER' and this_host == onebox_machines[0]):
    return

  tmp_dir = config.var('TMPDIR')
  gfs_cell = config.var('GFS_CELL')
  local_log = os.path.join(tmp_dir, config.var('ENTERPRISE_ONEBOX_LOG'))
  gfs_log = os.path.join(os.sep, 'gfs', gfs_cell,
                         config.var('ENTERPRISE_ONEBOX_LOG'))

  # fileutil equalize copies only the difference of the log files.
  err, _ = E.run_fileutil_command(
      config, 'equalize %s %s' % (local_log, gfs_log))
  if not err:
    return

  # The files did not match from the beginning (possibly a new log file was
  # created); copy the whole log file in that case.
  err, _ = E.run_fileutil_command(
      config, 'cp -f %s %s' % (local_log, gfs_log))
  if err:
    logging.error('Error while syncing onebox logs.')
def replicateConfig(self, machines):
  """Replicates the config to a list of machines.

  Args:
    machines: machines on which replicate_config.py is executed.

  Returns:
    Whatever E.execute returns for the replication command.
  """
  cmd_template = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
                  "./replicate_config.py %s %s")
  cmd_args = (self.getGlobalParam("ENTERPRISE_BASHRC"),
              self.entHome,
              E.getCrtHostName(),
              self.globalParams.GetConfigFileName())
  return E.execute(machines, cmd_template % cmd_args, None, true)
def replicateConfig(self, machines):
  """Replicates the config to a list of machines.

  The command sources ENTERPRISE_BASHRC, changes into the legacy scripts
  directory under self.entHome and runs replicate_config.py with the current
  host name and the config file name.

  Args:
    machines: machines on which the command is executed.

  Returns:
    The result of E.execute.
  """
  bashrc = self.getGlobalParam("ENTERPRISE_BASHRC")
  scripts_home = self.entHome
  source_host = E.getCrtHostName()
  config_file = self.globalParams.GetConfigFileName()
  replicate_cmd = (
      ". %s && cd %s/local/google3/enterprise/legacy/scripts && "
      "./replicate_config.py %s %s" % (
          bashrc, scripts_home, source_host, config_file))
  return E.execute(machines, replicate_cmd, None, true)
def CollectLogs(all_machines, gws_log_dir, log_collect_dir):
  """Gathers the partnerlog.* files of every machine under log_collect_dir.

  Runs only on a one-way or on the cluster master (as reported by
  find_master.FindMaster on port 2100); otherwise it returns immediately.
  For the current machine a symlink <log_collect_dir>/<machine> pointing at
  gws_log_dir is created; for every other machine the files are rsync'ed
  over ssh into <log_collect_dir>/<machine>.

  Args:
    all_machines: list of machine names.
    gws_log_dir: directory holding the partnerlog.* files.
    log_collect_dir: directory receiving the per-machine logs; the 'lock'
      file guarding this routine also lives here.
  """
  # We only run this on oneway or master node of cluster.
  master = find_master.FindMaster(2100, all_machines)
  crt_machine = E.getCrtHostName()
  if len(all_machines) != 1 and (len(master) != 1 or master[0] != crt_machine):
    logging.info('Not a oneway or cluster master node. Return!')
    return

  lockfile = '%s/lock' % log_collect_dir
  # Original note: waiting up to 5 minutes for the lock (the units of the
  # value 30 are defined by E.acquire_lock, not visible here).
  lock = E.acquire_lock(lockfile, 30, breakLockAfterGracePeriod = 0)
  if lock is None:
    logging.info('Cannot grab the lock. Return!')
    return

  # The source glob does not depend on the machine.
  src_pattern = '%s/partnerlog.*' % gws_log_dir
  try:
    for machine in all_machines:
      dest_dir = '%s/%s' % (log_collect_dir, machine)

      # If it's a oneway or master node, we make a symlink to gws_log_dir
      # instead of rsync to log_collect directory
      if machine == crt_machine:
        # To make it backward compatible, we need to remove old dest_dir if
        # it's already an existing directory from previous version because in
        # previous versions we created a dir and rsynced files even on the
        # master node and one-ways.
        if os.path.exists(dest_dir) and not os.path.islink(dest_dir):
          if (not E.rm(master, '%s/*' % dest_dir) or
              not E.rmdir(master, dest_dir)):
            logging.error('Directory %s exists and cannot be cleaned.',
                          dest_dir)
            continue
          logging.info('Cleaned existing directory %s.', dest_dir)

        if E.ln(master, gws_log_dir, dest_dir):
          logging.info('Symlink %s to directory %s:%s for logs' %
                       (dest_dir, machine, gws_log_dir))
        else:
          logging.error('Cannot make a symlink from %s to %s' %
                        (dest_dir, gws_log_dir))
        # Never rsync from ourselves.
        continue

      # For non-master nodes on cluster, we need to rsync those files to
      # master node
      logging.info('Collecting logs from %s:%s into %s' % (
          machine, src_pattern, dest_dir))

      # make log directories if needed
      liblog.MakeDir(dest_dir)

      # rsync all files from one remote machine in one command.
      rsync_cmd = 'rsync --timeout=60 --size-only -vau ' \
                  ' -e ssh %s:%s %s/' % (machine, src_pattern, dest_dir)

      # rsync the logs
      (status, output) = liblog.DoCommand(rsync_cmd)
      if status != 0:
        logging.error('Failed to collect logs from %s: %s' % (
            machine, output))
  finally:
    # Always release the lock, even if collecting from a machine raised.
    lock.close()
    os.unlink(lockfile)
def browse(self, clientName, virtualFile, fromLine, toLine, grepString,
           fileArgs):
  """Returns the requested line range of a virtual file, filtered by grep.

  Args:
    clientName: passed through to makePhysicalFile.
    virtualFile: virtual file name, resolved by makePhysicalFile.
    fromLine: first line wanted, as a decimal string.
    toLine: last line wanted, as a decimal string.
    grepString: fixed string to filter by; it is stripped and URL-quoted
      before use.
    fileArgs: passed through to makePhysicalFile.

  Returns:
    "0" if the shell pipeline fails; otherwise "<length>\\n<output>" built
    from the first element that E.execute stored in the result list.

  Raises:
    ar_exception.ARException: makePhysicalFile returned no file.
  """
  quoted_grep = urllib.quote_plus(grepString.strip())
  first = string.atoi(fromLine)
  last = string.atoi(toLine)

  fileLocation = self.makePhysicalFile(virtualFile, clientName, quoted_grep,
                                       fileArgs)
  if not fileLocation:
    raise ar_exception.ARException(
        (ar_exception.LOADPARAMS, M.ERR_INVALIDFILESPECIFICATION))

  # Get valid start line and end line: an inverted range becomes empty.
  numLines = max(last - first + 1, 0)
  last = first + numLines - 1

  # Get the lines
  output = []
  pipeline = "head -n %s %s | tail -n %s | grep -i -F -- %s" % (
      last, fileLocation, numLines, commands.mkarg(quoted_grep))
  status = E.execute([E.getCrtHostName()], pipeline, output, false)
  if E.ERR_OK != status:
    return "0"
  return "%s\n%s" % (len(output[0]), output[0])
def SyncOpLogs(all_machines, log_dir):
  """Syncs the AdminRunner.OPERATOR.* logs to all other machines.

  Only does work when find_master.FindMaster (port 2100) reports exactly one
  master and it is the current host.  For every other machine the operator
  logs are rsync'ed over ssh into the same log_dir on that machine, each
  transfer guarded by the '<log_dir>/syncops_lock' lock file.  If the lock
  cannot be acquired the function returns without processing the remaining
  machines.

  Args:
    all_machines: list of machine names.
    log_dir: directory holding the operator logs.
  """
  # We have to run this only on master
  master = find_master.FindMaster(2100, all_machines)
  # The name of this machine
  crt_machine = E.getCrtHostName()
  if len(master) != 1 or master[0] != crt_machine:
    return

  # These do not depend on the target machine.
  src_dir = '%s/AdminRunner.OPERATOR.*' % (log_dir)
  lockfile = '%s/syncops_lock' % log_dir

  for machine in all_machines:
    if machine == crt_machine:
      continue

    dest_dir = '%s:/%s' % (machine, log_dir)
    logging.info('Collecting operator logs from %s into %s' % (
        src_dir, dest_dir))
    rsync_cmd = 'rsync --timeout=20 --size-only -vau ' \
                ' -e ssh %s %s/' % (src_dir, dest_dir)

    # rsync the logs
    lock = E.acquire_lock(lockfile, 1, breakLockAfterGracePeriod = 0)
    if lock is None:
      logging.info('Cannot grab the lock. Return!')
      return
    try:
      (status, output) = liblog.DoCommand(rsync_cmd)
      if status != 0:
        # The logs are pushed *to* the machine, so say so.
        logging.error('Failed to sync operator logs to %s: %s' % (
            machine, output))
    finally:
      lock.close()
      os.unlink(lockfile)
def browse(self, clientName, virtualFile, fromLine, toLine, grepString,
           fileArgs):
  """Greps a line window out of the physical file behind virtualFile.

  fromLine and toLine arrive as decimal strings.  The grep string is
  stripped and URL-quoted, then used both to build the physical file and as
  the fixed-string, case-insensitive filter of the head/tail pipeline.

  Returns:
    "0" when E.execute does not return E.ERR_OK, else the string
    "<len(result[0])>\\n<result[0]>".

  Raises:
    ar_exception.ARException: no physical file could be produced.
  """
  grepString = urllib.quote_plus(grepString.strip())
  start_line = string.atoi(fromLine)
  end_line = string.atoi(toLine)

  fileLocation = self.makePhysicalFile(
      virtualFile, clientName, grepString, fileArgs)
  if not fileLocation:
    error_info = (ar_exception.LOADPARAMS, M.ERR_INVALIDFILESPECIFICATION)
    raise ar_exception.ARException(error_info)

  # Get valid start line and end line
  line_count = end_line - start_line + 1
  if line_count < 0:
    line_count = 0
  end_line = start_line + line_count - 1

  # Get the lines
  result = []
  parsedGrepString = commands.mkarg(grepString)
  shell_cmd = "head -n %s %s | tail -n %s | grep -i -F -- %s" % (
      end_line, fileLocation, line_count, parsedGrepString)
  if E.execute([E.getCrtHostName()], shell_cmd, result, false) != E.ERR_OK:
    return "0"

  matched = result[0]
  return "%s\n%s" % (len(matched), matched)
def drain_urlmanagers(self):
  """Asks every urlmanager to dump its status table and checks the output.

  We need to do this before advancing the epoch -- we can do it multiple
  times.

  Returns:
    0 if every urlmanager accepted the command and its status table file
    could be listed; 1 at the first failure.
  """
  params = self.cfg.globalParams
  urlmanagers = params.GetServerHostPorts("urlmanager")
  num_shards = params.GetNumShards('urlmanager')
  epoch = self.cfg.getGlobalParam('RT_EPOCH')

  for (host, port) in urlmanagers:
    # We don't do it here directly because of the timeout
    drain_cmd = (
        ". %s; cd %s/local/google3/enterprise/legacy/util && "
        "./port_talker.py %s %d 'd DumpingStatusTable' %d" % (
            self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
            self.cfg.entHome,
            host, port, 300))  # 5 min timeout
    err = E.execute([E.getCrtHostName()], drain_cmd, None, 0)
    if E.ERR_OK != err:
      logging.error("Error draining urlmanagers [%s]" % err)
      return 1

    # Make sure that the file is out
    shard_num = servertype.GetPortShard(port)
    table_file = "%surlmanager_out_table_%02d_of_%02d_epoch%010d" % (
        self.cfg.getGlobalParam('NAMESPACE_PREFIX'),
        shard_num, num_shards, epoch)
    err, _ = E.run_fileutil_command(self.cfg.globalParams,
                                    "ls %s" % table_file)
    if E.ERR_OK != err:
      logging.error("The status table file [%s] is not there" % table_file)
      return 1

  return 0
def PeriodicScript(config):
  """Periodic master-election housekeeping for this machine.

  Detects the master, restarts sshd if it is not running, and then either
  babysits / becomes the master (when this machine is the master) or kills
  master-only processes and resyncs with the master (when it is not).
  Finally kills services that are not supposed to run on this machine.

  Args:
    config: configuration object (MACHINES, var(), GetConfigFileName(),
      ENTERPRISE_HOME, ENTERPRISE_USER are read).

  Raises:
    Error: no master could be detected within the configured tries.
  """
  global apacheRecheck
  machines = config.MACHINES

  # The two determinants : if we run sshd and the list of active AdminRunners
  sshd_alive = sshd_alive_check(config)

  tries = 6
  sleep_time = 10
  master = DetectMaster(machines, tries, sleep_time, config.var('VERSION'))
  if master is None:
    raise Error('Could not find master in %d tries.' % tries)
  logging.info('%s is the master' % master)

  if not sshd_alive:
    # No ssh running. I restart sshd
    logging.error("sshd is not running on this machine. "
                  "restarting sshd.")
    os.system("/etc/rc.d/init.d/sshd restart >&/dev/null </dev/null")

  # The name of this machine
  crt_machine = E.getCrtHostName()

  if master == crt_machine:
    # I am the master
    # make sure gfs master is not running on the same node on clusters
    if len(machines) > 1:
      AvoidGFSMasterOnNode(config, crt_machine)
    if IsAdminRunnerAlive(6, 10):  # max_tries, sleep_time
      # License stuff -- increase the serving time count
      IncreaseMasterCounter()
      # babysit loop_AdminRunner and loop_webserve_config
      StartAdminLoop(config, op='babysit')
    else:
      # I am the master but adminrunner does not seem to be running.
      # (The trailing space keeps the two literals from fusing into
      # "berunning.")
      logging.info(
          'I (%s) am the master but adminrunner does not seem to be '
          'running.' % crt_machine)
      BecomeMaster(config)
  else:
    # I am not the master
    resync_with_master = 1
    if core_utils.CanRunGSAMaster(crt_machine):
      if IsAdminRunnerAlive(1, 0):  # max_tries, sleep_time
        # Kills adminrunner related processes, webconfig.py
        # calls KillProcessIfNotMaster
        MasterKillMyself(config)
        resync_with_master = 0
      else:
        # NOTE(review): nesting of this else was reconstructed from
        # flattened source -- confirm it pairs with IsAdminRunnerAlive.
        KillProcessIfNotMaster(config)
    if resync_with_master:
      ResyncWithMaster(config.GetConfigFileName(), config.ENTERPRISE_HOME,
                       config.ENTERPRISE_USER, master)

  # check all services and kill services I am not supposed to run
  KillRedundantServices(config)
def isMaster(config):
  """Return true if is running on master node.

  A configuration with a single machine is always the master.  Otherwise the
  current host is asked 'v is_master' on core_utils.GSA_MASTER_PORT and the
  answer counts as master when the talk succeeded and the first character of
  the response is '1'.
  """
  machine_count = len(config.var('MACHINES'))
  if machine_count == 1:
    return 1

  (status, response) = port_talker.Talk(
      E.getCrtHostName(), core_utils.GSA_MASTER_PORT, 'v is_master', 5)
  if not status:
    # Hand back the falsy status itself, as `status and ...` would.
    return status
  return response[0] == '1'
def remove(self, machine):
  """Removes a machine from the configuration.

  The machine is dropped from SERVERS and MACHINES, registered as a dead
  node, its GFS chunkserver and data disks are removed, machine allocation
  is redone, the babysitter and crawl processes are restarted and a
  notification mail is sent (unless already sent).

  Args:
    machine: name of the machine to remove; must be in MACHINES and must not
      be the current host.

  Returns:
    1 if the machine is unknown, is the current host, or the configuration
    could not be updated; otherwise the result of self.halt(machine).
  """
  if machine not in self.cfg.getGlobalParam('MACHINES'):
    logging.error("%s doesn't exist" % machine)
    return 1

  # Refuse to remove ourselves *before* touching any service; previously the
  # core services were stopped on this machine and only then rejected.
  if machine == E.getCrtHostName():
    logging.error("Cannot remove self")
    return 1

  ver = self.cfg.getGlobalParam('VERSION')
  home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
  testver = install_utilities.is_test(ver)

  # if possible stop the core services, ignore return code
  install_utilities.stop_core(ver, home, [machine])

  # Halt the machine if APC is used.
  error = self.halt(machine)

  self.cfg.globalParams.ReplaceVarInParam("SERVERS", None, machine)
  self.cfg.globalParams.ReplaceVarInParam("MACHINES", None, machine)
  ret = core_utils.AddDeadNode(ver, testver, machine)
  # remove the chunkserver running on the node
  gfs_utils.DeleteGFSChunkservers(ver, testver, [machine])
  if ret:
    # we ignore this error for now
    logging.error('Cannot add dead node to the lockserver.')

  # now we need to remove the data disks that were on this machine
  data_disks = self.cfg.globalParams.var_copy('DATACHUNKDISKS')
  if machine in data_disks:
    del data_disks[machine]
    if not self.cfg.setGlobalParam('DATACHUNKDISKS', data_disks):
      return 1

  # This also saves the config file
  if not self.cfg.DoMachineAllocation():
    return 1

  # Now we need to restart babysitter because the old one
  # is out of sync after this
  serve_service_cmd = (
      ". %s && "
      "cd %s/local/google3/enterprise/legacy/scripts && "
      "./serve_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME')))
  E.exe("%s %s" % (serve_service_cmd, "babysit"))

  self.restart_crawl_processes(serve_service_cmd)

  if not mail_already_sent(M.MSG_MACHINEREMOVED % machine):
    SendMail.send(self.cfg, None, false,
                  M.MSG_MACHINEREMOVED % machine, "", true)

  return error
def remove(self, machine):
  """Removes a machine from the configuration.

  Args:
    machine: name of the machine to remove.  It has to be listed in
      MACHINES and must differ from the current host.

  Returns:
    1 when the machine is not configured, when it is the current host, or
    when saving the updated configuration fails; otherwise whatever
    self.halt(machine) returned.
  """
  if machine not in self.cfg.getGlobalParam('MACHINES'):
    logging.error("%s doesn't exist" % machine)
    return 1

  # Check for self-removal first so that we never stop the core services of
  # the machine we are running on only to bail out afterwards.
  if machine == E.getCrtHostName():
    logging.error("Cannot remove self")
    return 1

  ver = self.cfg.getGlobalParam('VERSION')
  home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
  testver = install_utilities.is_test(ver)

  # if possible stop the core services, ignore return code
  install_utilities.stop_core(ver, home, [machine])

  # Halt the machine if APC is used.
  error = self.halt(machine)

  global_params = self.cfg.globalParams
  global_params.ReplaceVarInParam("SERVERS", None, machine)
  global_params.ReplaceVarInParam("MACHINES", None, machine)

  ret = core_utils.AddDeadNode(ver, testver, machine)
  # remove the chunkserver running on the node
  gfs_utils.DeleteGFSChunkservers(ver, testver, [machine])
  if ret:
    logging.error('Cannot add dead node to the lockserver.')
    # we ignore this error for now

  # now we need to remove the data disks that were on this machine
  data_disks = global_params.var_copy('DATACHUNKDISKS')
  if machine in data_disks:
    del data_disks[machine]
    if not self.cfg.setGlobalParam('DATACHUNKDISKS', data_disks):
      return 1

  # This also saves the config file
  if not self.cfg.DoMachineAllocation():
    return 1

  # Now we need to restart babysitter because the old one
  # is out of sync after this
  serve_service_cmd = (". %s && "
                       "cd %s/local/google3/enterprise/legacy/scripts && "
                       "./serve_service.py %s" % (
                           self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                           self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                           self.cfg.getGlobalParam('ENTERPRISE_HOME')))
  E.exe("%s %s" % (serve_service_cmd, "babysit"))
  self.restart_crawl_processes(serve_service_cmd)

  removed_msg = M.MSG_MACHINEREMOVED % machine
  if not mail_already_sent(removed_msg):
    SendMail.send(self.cfg, None, false, removed_msg, "", true)

  return error
def PeriodicScript(config):
  """Runs the periodic master-election checks on this machine.

  Steps, in order: detect the master (raising if none is found), restart
  sshd when it is down, then act according to whether this machine is the
  master, and finally kill services that should not run here.

  Args:
    config: configuration object; MACHINES, var('VERSION'),
      GetConfigFileName(), ENTERPRISE_HOME and ENTERPRISE_USER are used.

  Raises:
    Error: DetectMaster returned None.
  """
  global apacheRecheck
  machines = config.MACHINES

  # The two determinants : if we run sshd and the list of active AdminRunners
  sshd_alive = sshd_alive_check(config)

  tries = 6
  sleep_time = 10
  master = DetectMaster(machines, tries, sleep_time, config.var('VERSION'))
  if master is None:
    raise Error('Could not find master in %d tries.' % tries)
  logging.info('%s is the master' % master)

  if not sshd_alive:
    # sshd is not running!? ..too bad; restart it.
    logging.error("sshd is not running on this machine. "
                  "restarting sshd.")
    os.system("/etc/rc.d/init.d/sshd restart >&/dev/null </dev/null")

  # The name of this machine
  crt_machine = E.getCrtHostName()
  i_am_master = (master == crt_machine)

  if i_am_master:
    # make sure gfs master is not running on the same node on clusters
    if len(machines) > 1:
      AvoidGFSMasterOnNode(config, crt_machine)

    if IsAdminRunnerAlive(6, 10):  # max_tries, sleep_time
      # License stuff -- increase the serving time count
      IncreaseMasterCounter()
      # babysit loop_AdminRunner and loop_webserve_config
      StartAdminLoop(config, op='babysit')
    else:
      # I am the master but adminrunner does not seem to be running.
      # Note the space before the closing quote of the first literal: without
      # it the message read "...seem to berunning."
      logging.info('I (%s) am the master but adminrunner does not seem to be '
                   'running.' % crt_machine)
      BecomeMaster(config)
  else:
    resync_with_master = 1
    if core_utils.CanRunGSAMaster(crt_machine):
      if IsAdminRunnerAlive(1, 0):  # max_tries, sleep_time
        # Kills adminrunner related processes, webconfig.py
        # calls KillProcessIfNotMaster
        MasterKillMyself(config)
        resync_with_master = 0
      else:
        # NOTE(review): the source was flattened; confirm this else belongs
        # to the IsAdminRunnerAlive check rather than to CanRunGSAMaster.
        KillProcessIfNotMaster(config)
    if resync_with_master:
      ResyncWithMaster(config.GetConfigFileName(), config.ENTERPRISE_HOME,
                       config.ENTERPRISE_USER, master)

  # check all services and kill services I am not supposed to run
  KillRedundantServices(config)
def export(self, clientName, virtualFile, convert, fileArgs):
  """Materializes a virtual file and reports where it lives.

  Args:
    clientName: passed through to makePhysicalFile.
    virtualFile: virtual file name to export.
    convert: not used by this method.
    fileArgs: passed through to makePhysicalFile.

  Returns:
    "<current host name> <physical file path>".

  Raises:
    ar_exception.ARException: makePhysicalFile returned no file.
  """
  physical_path = self.makePhysicalFile(virtualFile, clientName, "", fileArgs)
  if physical_path:
    return "%s %s" % (E.getCrtHostName(), physical_path)
  raise ar_exception.ARException(
      (ar_exception.LOADPARAMS, M.ERR_INVALIDFILESPECIFICATION))
def export(self, clientName, virtualFile, convert, fileArgs):
  """Returns "<host> <path>" for the physical copy of virtualFile.

  The physical file is produced by makePhysicalFile with an empty grep
  string.  The `convert` argument is accepted but not used here.

  Raises:
    ar_exception.ARException: with (LOADPARAMS,
      ERR_INVALIDFILESPECIFICATION) when no physical file is available.
  """
  fileLocation = self.makePhysicalFile(
      virtualFile, clientName, "", fileArgs)
  if not fileLocation:
    failure = (ar_exception.LOADPARAMS, M.ERR_INVALIDFILESPECIFICATION)
    raise ar_exception.ARException(failure)

  host = E.getCrtHostName()
  return "%s %s" % (host, fileLocation)
def GetMachineParams(self, fact_name):
  """Looks up a fact about the current host in the machine parameter cache.

  Arguments:
    fact_name: name of the fact, e.g. 'memory-total'
  Returns:
    The cached value for this host, e.g. 12430580
  """
  this_host = E.getCrtHostName()
  return self.mach_param_cache.GetFact(fact_name, this_host)
def haltcluster(self):
  """Halts every other machine, then (after a delay) this one.

  Writes the MSG_LOGSHUTDOWN operator message, asks rebooter.HaltMachines to
  halt all machines except the current host and, if that call returns a true
  value, mails MSG_MACHINENEEDSHALT (unless that mail was already sent).
  Finally sleeps 120 seconds and halts the current host.

  Returns:
    0, always.
  """
  crt_machine = E.getCrtHostName()
  # Build a new list instead of calling remove() on the list handed out by
  # the config: that mutated the configuration's own MACHINES value and
  # raised ValueError when this host was not listed.
  machines = [m for m in self.cfg.getGlobalParam("MACHINES")
              if m != crt_machine]

  self.writeAdminRunnerOpMsg(M.MSG_LOGSHUTDOWN)

  if machines:
    # Halt all other machines now
    if rebooter.HaltMachines(self.cfg.globalParams.GetEntHome(), machines):
      # just send an email
      halt_msg = M.MSG_MACHINENEEDSHALT % string.join(machines, " ")
      if not mail_already_sent(halt_msg):
        SendMail.send(self.cfg, None, false, halt_msg, "", true)

  # Halt this machine after a delay
  time.sleep(120)
  rebooter.HaltMachines(self.cfg.globalParams.GetEntHome(), [crt_machine])
  return 0
def send_urlscheduler_command(self, command):
  """Sends 'GET /run?Flags=<command>' to every urlscheduler.

  The request is issued through port_talker.py, run on the current host with
  a 60 second timeout.

  Args:
    command: value placed after 'Flags=' in the request.

  Returns:
    0 if every urlscheduler could be reached, 1 at the first failure.
  """
  timeout = 60  # 1 minute is good enough
  local_host_list_needed = True  # placeholder removed below
  del local_host_list_needed
  for (machine, port) in self.cfg.globalParams.GetServerHostPorts(
      "urlscheduler"):
    port_talker_cmd = (
        ". %s; cd %s/local/google3/enterprise/legacy/util && "
        "./port_talker.py %s %d 'GET /run?Flags=%s\r\n\r\n' %d" % (
            self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
            self.cfg.entHome,
            machine, port, command, timeout))
    status = E.execute([E.getCrtHostName()], port_talker_cmd, None, 0)
    if status != E.ERR_OK:
      logging.error("Error talking to urlscheduler %s:%d" % (machine, port))
      return 1
  return 0
def send_cmd(self, server_name, cmd):
  """Sends cmd through port_talker.py to every server of a server set.

  A failure for one server is logged and the loop moves on to the next one.

  Args:
    server_name: name of the server set to address.
    cmd: command string, placed between single quotes on the port_talker.py
      command line.

  Returns:
    true, regardless of individual failures.
  """
  server_set = self.cfg.globalParams.GetServerManager().Set(server_name)
  for server in server_set.Servers():
    actual_cmd = (
        ". %s; cd %s/local/google3/enterprise/legacy/util && "
        "./port_talker.py %s %d '%s' %d" % (
            self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
            self.cfg.entHome,
            server.host(), server.port(), cmd, 60))  # 1 min timeout
    err = E.execute([E.getCrtHostName()], actual_cmd, None, 0)
    if err == E.ERR_OK:
      continue
    logging.error("Error talking to server at %s:%d" % (server.host(),
                                                         server.port()))
  return true
def haltcluster(self):
  """Shuts the whole cluster down, this machine last.

  Returns:
    0, always.
  """
  this_host = E.getCrtHostName()
  # Filter into a fresh list: the original removed this host from the list
  # object returned by getGlobalParam (changing the config's MACHINES value
  # as a side effect) and raised ValueError if the host was missing from it.
  others = [name for name in self.cfg.getGlobalParam("MACHINES")
            if name != this_host]

  msg = M.MSG_LOGSHUTDOWN
  self.writeAdminRunnerOpMsg(msg)

  if len(others) > 0:
    # Halt all other machines now
    if rebooter.HaltMachines(self.cfg.globalParams.GetEntHome(), others):
      # just send an email
      needs_halt_msg = M.MSG_MACHINENEEDSHALT % string.join(others, " ")
      if not mail_already_sent(needs_halt_msg):
        SendMail.send(
            self.cfg, None, false, needs_halt_msg, "", true)

  # Halt this machine after a delay
  time.sleep(120)
  rebooter.HaltMachines(self.cfg.globalParams.GetEntHome(), [this_host])
  return 0
def main(argv):
  """Collects, partitions and redistributes logs on a config replica.

  Exits early (status 0) when the install state is neither ACTIVE nor SERVE,
  when SITESEARCH_INTERFACE is set, or when this host is not listed in
  CONFIG_REPLICAS.

  Args:
    argv: argv[0] is handed to entconfig.EntConfig.
  """
  pywrapfile.File.Init()
  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  # Collect logs only if active
  state = install_utilities.install_state(config.var('VERSION'))
  if state not in ['ACTIVE', 'SERVE']:
    sys.exit(0)

  # NO collection for sitesearches:
  if config.var('SITESEARCH_INTERFACE'):
    sys.exit(0)

  # If I'm not a config replica I don't collect the logs..
  replicas = config.var('CONFIG_REPLICAS')
  this_host = E.getCrtHostName()
  if this_host not in replicas:
    logging.error('Not a replica')
    sys.exit(0)

  gws_log_dir = liblog.get_gws_log_dir(config)
  collect_dir = liblog.get_collect_dir(config)
  partition_dir = liblog.get_partition_dir(config)
  apache_dir = liblog.get_apache_dir(config)
  click_dir = liblog.get_click_dir(config)
  directory_map_file = liblog.get_directory_map_file(config)

  # in case cron job starts before adminrunner
  for needed_dir in (collect_dir, partition_dir, apache_dir, click_dir):
    liblog.MakeDir(needed_dir)

  # Collect Logs from machines
  all_machines = config.var('MACHINES')
  CollectLogs(all_machines, gws_log_dir, collect_dir)

  # Partition gwslogs by collections and convert to apache logs
  preprocess_logs.PartitionLogs(config)

  # Sanitize collection directory map
  directory_map = liblog.CollectionDirectoryMap(directory_map_file)
  map_changed = directory_map.sanitizeMap(partition_dir, apache_dir,
                                          click_dir)
  if map_changed:
    directory_map.saveToDisk()

  # Send the OP logs to all machines
  SyncOpLogs(all_machines, config.var('LOGDIR'))

  logging.info('Done')
def RebootMachine(enthome, machine):
  """Reboots a machine.

  First asks the machine to shut down ('shutdown -r now' for the current
  host, 'shutdown -h now &' for any other), waits 60 seconds and then sends
  the APC reboot command.

  Args:
    enthome: passed through to SendAPCCommand.
    machine: name of the machine to reboot.

  Returns:
    The result of SendAPCCommand.
  """
  rebooting_self = (machine == E.getCrtHostName())
  if not rebooting_self:
    # Try shutting down cleanly first
    logging.info('Shutting down %s' % machine)
    E.execute([machine], '/sbin/shutdown -h now &', None, 1)
  else:
    # Rebooting ourself
    logging.info('Rebooting %s' % machine)
    E.execute([E.LOCALHOST], '/sbin/shutdown -r now', None, 1)
    # If we're still alive after a minute, the APC will kick in

  time.sleep(60)
  logging.info('Rebooting %s via APC' % machine)
  return SendAPCCommand(enthome, machine, APC_REBOOT)
def send_cmd(self, server_name, cmd):
  """Runs port_talker.py with cmd against each server named server_name.

  Errors are logged per server; they neither abort the loop nor affect the
  return value.

  Returns:
    true.
  """
  srvrs = self.cfg.globalParams.GetServerManager().Set(
      server_name).Servers()
  cmd_template = (". %s; cd %s/local/google3/enterprise/legacy/util && "
                  "./port_talker.py %s %d '%s' %d")
  for srvr in srvrs:
    bashrc = self.cfg.getGlobalParam('ENTERPRISE_BASHRC')
    actual_cmd = cmd_template % (
        bashrc, self.cfg.entHome, srvr.host(), srvr.port(),
        cmd, 60)  # 1 min timeout
    if E.ERR_OK != E.execute([E.getCrtHostName()], actual_cmd, None, 0):
      logging.error("Error talking to server at %s:%d" % (srvr.host(),
                                                           srvr.port()))
  return true
def send_urlmanager_command(self, command):
  """Sends a command to every urlmanager through port_talker.py.

  All urlmanagers are contacted even if an earlier one fails.

  Args:
    command: command string; shell-quoted with commands.mkarg.

  Returns:
    0 if every call returned E.ERR_OK, 1 if at least one did not.
  """
  failed = 0
  for (host, port) in self.cfg.globalParams.GetServerHostPorts("urlmanager"):
    # We don't do it here directly because of the timeout
    talker_cmd = (
        ". %s; cd %s/local/google3/enterprise/legacy/util && "
        "./port_talker.py %s %d %s %d" % (
            self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
            self.cfg.entHome,
            host, port, commands.mkarg(command), 300))
    if E.execute([E.getCrtHostName()], talker_cmd, None, 0) != E.ERR_OK:
      failed = 1
  return failed
def main():
  """Restarts SVS on the machine named on the command line.

  sys.argv[1] is the enterprise home; sys.argv[2], when present, is the
  machine (the current host is used otherwise).

  Returns:
    0 on success or on a LITE configuration, 1 if the config fails to load.
  """
  # on the GSA-Lite, there is no SVS, so just return
  if core_utils.GetEntConfigVar('ENT_CONFIG_TYPE') == 'LITE':
    return 0

  enthome = sys.argv[1]
  if len(sys.argv) <= 2:
    machine = E.getCrtHostName()
  else:
    machine = sys.argv[2]

  cfg = entconfig.EntConfig(enthome)
  if cfg.Load():
    restart_svs(machine)
    return 0
  return 1
def KillRedundantServices(config):
  """Kills servers answering locally that are not assigned to this machine.

  For each port in config.SERVERS whose machine list does not contain the
  current host, localhost is probed on that port; if the probe reports
  success the corresponding server type is killed.

  Args:
    config: configuration object exposing SERVERS.
  """
  this_host = E.getCrtHostName()
  for port, assigned_machines in config.SERVERS.items():
    if this_host in assigned_machines:
      continue
    probe = port_talker.Talk("localhost", port, "v", 40)
    # kill the server if it is not supposed to run on the localhost
    if not probe[0]:
      continue
    server_type = servertype.GetPortType(port)
    logging.info("kill service: %s" % server_type)
    servertype.KillServer(server_type, config, this_host, port)
def init_service(self, ent_home):
  """Does the actual initialization.

  Reads the config file into the cp (EntConfig) member and copies commonly
  used parameters into attributes for easy access.  Exits the process if the
  config file cannot be loaded.

  Args:
    ent_home: enterprise home handed to entconfig.EntConfig.
  """
  self.cp = entconfig.EntConfig(ent_home)
  if not self.cp.Load():
    sys.exit("Cannot load the config file %s" % self.cp.GetConfigFileName())

  # Get some params for easy access
  var = self.cp.var
  self.configfile = self.cp.GetConfigFileName()
  self.version = str(var("VERSION"))
  self.entid_tag = "ENT_ID=%s_%s" % (self.version, self.service_name)
  self.ent_user = var("ENTERPRISE_USER")
  self.ent_group = var("ENTERPRISE_GROUP")
  self.ent_bashrc = var("ENTERPRISE_BASHRC")
  self.ent_home = var("ENTERPRISE_HOME")
  self.googlebot_dir = var("GOOGLEBOT_DIR")
  self.version_tmpdir = "%s/tmp" % var("ENTERPRISE_HOME")
  self.tmpdir = var("TMPDIR")
  self.logdir = var("LOGDIR")
  self.datadir = var("DATADIR")
  self.scripts_dir = ("%s/local/google3/enterprise/legacy/scripts" %
                      self.ent_home)
  self.util_dir = ("%s/local/google3/enterprise/legacy/util" %
                   self.ent_home)
  self.machines = var("MACHINES")

  # The master depends on the install state : for active / test / install
  # we have the adminrunner on the master, else we get it from MASTER
  # parameter
  self.install_state = install_utilities.install_state(
      self.version, rootdir = var('ENT_DISK_ROOT'))
  self.local_machine = E.getCrtHostName()
  # The result is not used in the code shown here; the call is kept as is.
  testver = install_utilities.is_test(self.version)

  if self.install_state in ["ACTIVE", "TEST", "INSTALL"]:
    try:
      self.master_machine = find_master.FindMasterUsingChubby(var('VERSION'))
    except core_utils.EntMasterError:
      # Something is seriously wrong.
      logging.error("ERROR: Couldn't determine master")
      # Assume we aren't master, so we can at least do inactivate
      self.master_machine = None
def linecount(self, clientName, virtualFile, grepString, fileArgs):
  """Counts the lines of a virtual file that match grepString.

  The grep string is stripped and URL-quoted; matching is fixed-string and
  case-insensitive (grep -i -F).

  Returns:
    Whatever self.getCount returns for the counting pipeline run on the
    current host.

  Raises:
    ar_exception.ARException: makePhysicalFile returned no file.
  """
  quoted_grep = urllib.quote_plus(grepString.strip())
  fileLocation = self.makePhysicalFile(virtualFile, clientName, quoted_grep,
                                       fileArgs)
  if not fileLocation:
    raise ar_exception.ARException(
        (ar_exception.LOADPARAMS, M.ERR_INVALIDFILESPECIFICATION))

  # find physical file size (line count)
  count_cmd = "grep -i -F -- %s %s | wc -l | awk '{print $1}'" % (
      commands.mkarg(quoted_grep), fileLocation)
  return self.getCount(count_cmd, E.getCrtHostName())
def linecount(self, clientName, virtualFile, grepString, fileArgs):
  """Returns the number of matching lines in the physical file.

  Builds the physical file for virtualFile and hands a
  `grep | wc -l | awk` pipeline to self.getCount together with the current
  host name.

  Raises:
    ar_exception.ARException: with (LOADPARAMS,
      ERR_INVALIDFILESPECIFICATION) when no physical file is available.
  """
  grepString = urllib.quote_plus(grepString.strip())
  fileLocation = self.makePhysicalFile(
      virtualFile, clientName, grepString, fileArgs)
  if fileLocation:
    # find physical file size (line count)
    parsedGrepString = commands.mkarg(grepString)
    pipeline = "grep -i -F -- %s %s | wc -l | awk '{print $1}'" % (
        parsedGrepString, fileLocation)
    lineCount = self.getCount(pipeline, E.getCrtHostName())
    return lineCount

  failure = (ar_exception.LOADPARAMS, M.ERR_INVALIDFILESPECIFICATION)
  raise ar_exception.ARException(failure)
def get_log_per_module(self, filename, module_name):
  '''Get the onebox module specific logging information.

  Greps `filename` for lines containing "Query [" or "[<module_name>]",
  writing the matches to self.TMP_LOG, then returns that file's contents and
  deletes it.  On a CLUSTER configuration the file is read through
  `fileutil ... cat` using GFS_ALIASES; otherwise grep reads it directly.

  Args:
    filename: log file to search.
    module_name: onebox module name inserted into the grep pattern.

  Returns:
    The contents of the temporary log as a string.
  '''
  ent_config_type = self.cfg.getGlobalParam('ENT_CONFIG_TYPE')
  grep_string = '\"Query \\[\\|\\[%s\\]\"' % module_name
  if ent_config_type == 'CLUSTER':
    gfs_aliases = self.cfg.getGlobalParam('GFS_ALIASES')
    grep_command = 'fileutil --bnsresolver_use_svelte=false '\
                   '--gfs_aliases=%s cat %s | grep %s > %s' % (
                       gfs_aliases, filename, grep_string, self.TMP_LOG)
  else:
    grep_command = 'grep %s %s > %s' % (grep_string, filename, self.TMP_LOG)

  machine = E.getCrtHostName()
  E.execute([machine], grep_command, None, false)

  tmp_file = open(self.TMP_LOG, 'r')
  # Nested try/finally so that the handle is closed and the temporary file
  # is removed even if reading fails.
  try:
    try:
      contents = tmp_file.read()
    finally:
      tmp_file.close()
  finally:
    os.remove(self.TMP_LOG)
  return contents
def main(argv):
  """Starts the daily log report and apache log update for one crawl.

  Only does work on the master (as reported by find_master.FindMaster); on
  any other machine it logs a message and exits with status 0.

  Args:
    argv: [config argument for EntConfig, crawl name, adminrunner port].
      Missing or non-numeric arguments make the script exit with the module
      docstring.
  """
  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  try:
    crawl = argv[1]
    port = string.atoi(argv[2])
  except (IndexError, ValueError):
    # Only argument problems end up here; a bare except would also swallow
    # SystemExit and KeyboardInterrupt.
    sys.exit(__doc__)

  machines = config.var('MACHINES')
  master = find_master.FindMaster(port, machines)

  # The name of this machine
  crt_machine = E.getCrtHostName()

  if len(master) != 1 or master[0] != crt_machine:
    logging.info("I am not the master")
    sys.exit(0)  # not a problem

  # find out the date 24 hours ago
  year, month, day = time.localtime(time.time() - 60*60*24)[0:3]
  date1 = "date_%d_%d_%d" % (month, day, year)
  date2 = "date %d %d %d" % (month, day, year)

  logging.info("Running %s for %s" % (sys.argv[0], date2))

  # 1. run log_report.py for each crawl
  ar = adminrunner_client.AdminRunnerClient("localhost", port)
  if not ar.LogReportStart(crawl, date1):
    sys.exit("Error starting crawl for " + crawl)
  logging.info("Started log_report for %s/%s" % (crawl, date1))

  # 2. run apache_log.py for each crawl
  apache_cmd = "./apache_log.py %s update %s %s" % (argv[0], crawl, date2)
  (status, output) = commands.getstatusoutput(apache_cmd)
  if status != 0:
    logging.fatal("apache_log failed: %s" % output)
  logging.info("Finished apache_log: %s" % output)

  logging.info("Done")
def main(argv):
  """Runs log_report and apache_log for yesterday's date on the master.

  Args:
    argv: argv[0] is passed to entconfig.EntConfig, argv[1] is the crawl
      name and argv[2] the adminrunner port (decimal string).  If argv[1] or
      argv[2] is missing, or the port is not a number, the script exits with
      the module docstring.
  """
  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  # Catch only what bad arguments can raise (too few of them, or a port that
  # is not a number) instead of using a bare except.
  try:
    crawl = argv[1]
    port = string.atoi(argv[2])
  except (IndexError, ValueError):
    sys.exit(__doc__)

  machines = config.var('MACHINES')
  master = find_master.FindMaster(port, machines)

  # The name of this machine
  crt_machine = E.getCrtHostName()
  on_master = (len(master) == 1 and master[0] == crt_machine)
  if not on_master:
    logging.info("I am not the master")
    sys.exit(0)  # not a problem

  # find out the date 24 hours ago
  one_day = 60 * 60 * 24
  year, month, day = time.localtime(time.time() - one_day)[0:3]
  date1 = "date_%d_%d_%d" % (month, day, year)
  date2 = "date %d %d %d" % (month, day, year)
  logging.info("Running %s for %s" % (sys.argv[0], date2))

  # 1. run log_report.py for each crawl
  ar = adminrunner_client.AdminRunnerClient("localhost", port)
  if not ar.LogReportStart(crawl, date1):
    sys.exit("Error starting crawl for " + crawl)
  logging.info("Started log_report for %s/%s" % (crawl, date1))

  # 2. run apache_log.py for each crawl
  apache_cmd = "./apache_log.py %s update %s %s" % (argv[0], crawl, date2)
  (status, output) = commands.getstatusoutput(apache_cmd)
  if status != 0:
    logging.fatal("apache_log failed: %s" % output)
  logging.info("Finished apache_log: %s" % output)
  logging.info("Done")
def StartupWork(cfg, state):
  """Performs the start-up initialization sequence.

  In order: checks machine memory, initializes and distributes the global
  default files when the init state is C.FRESH, stops early when `state` is
  "INSTALL", resumes an interrupted index reset, sets MASTER to the current
  host, does machine allocation, and then either finishes system
  initialization (init state C.CONFIG_FILES_INITIALIZED) and starts the
  serve service, or babysits it.  Finally the params are saved and
  cfg.startupDone() is called.

  Any exception raised on the way is logged with its traceback and reported
  through logging.fatal.

  Args:
    cfg: configuration object driving all of the above.
    state: install state string; "INSTALL" stops after the one-time
      initialization.
  """
  # NOTE(review): indentation was reconstructed from flattened source; the
  # exact nesting of the logging.flush() calls should be confirmed.
  try:
    #############################################################################
    # check memory-total
    logging.info("check memory-total")
    if not cfg.CheckMachineMemory():
      logging.fatal("failed to check memory-total")
      return

    # One-time Initialization Work:
    # In fresh mode we first initialize the global default
    if install_utilities.GetInitState(cfg.globalParams) == C.FRESH:
      logging.info("initializing global default files")
      if not cfg.globalParams.InitGlobalDefaultFiles(overwrite=false):
        logging.fatal("failed to initialize global default files")
        return
      # Call to saveParams will also distribute the newly created global
      # default files.
      logging.info("saving params and distributing global default files.")
      logging.flush()
      # This is a best effort mechanism. We try up to 6 times to update the
      # files. We assume that we'll get the files to all alive machines. If
      # a machine can't get the files then we assume that it is dead and won't
      # become a master with missing default files.
      cfg.saveParams(retry=6)
      logging.flush()
      install_utilities.SetInitState(cfg, C.CONFIG_FILES_INITIALIZED)
      logging.info('system initialization for %s state successful' %
                   C.CONFIG_FILES_INITIALIZED)
      logging.flush()

    # In install mode -- bail out now
    if state == "INSTALL":
      cfg.startupDone()
      return

    # Perform reset index if we got interrupted in the middle of it
    reset_index.ResetIndex(cfg, only_if_in_progress=1)

    try:
      localhost = E.getCrtHostName()
      if not cfg.setGlobalParam('MASTER', localhost):
        logging.fatal("Error setting the MASTER to %s" % localhost)
    except IOError:
      logging.fatal("Couldn't set MASTER -> exiting")

    # Allocate machines for the empty slots
    logging.info("doing machine allocation")
    if not cfg.DoMachineAllocation():
      logging.info("failed to do machine allocation")

    # In the next install stage (INSTALL) mode we go and finish up the work
    if (install_utilities.GetInitState(cfg.globalParams) ==
        C.CONFIG_FILES_INITIALIZED):
      logging.info("doing system initialization")

      # datadirs are initially made (by the base rpm) to not have any data disks
      # because it doesn't know which disks to use, but the datadir is needed
      # to get things running.
      logging.info("creating datadirs")
      if not cfg.createDataDirs(cfg.getGlobalParam(C.MACHINES)):
        logging.fatal("failed to make datadirs on machines")
        return

      # Create GFS subdirectories
      if cfg.getGlobalParam(C.GFS_CELL):
        logging.info("creating gfs subdirectories")
        if not cfg.createGFSChunkSubDirs():
          logging.fatal("failed creating the gfs subdirectories")
          return

      # ensure initial spelling data is present
      cfg.EnsureSpellingData()

      # create a default collection w/content of mainCrawl / frontend
      logging.info("Creating default collection ...")
      cfg.CreateDefaultCollection()
      logging.info("Creating default frontend ...")
      cfg.CreateDefaultFrontend()
      logging.info("Assiging default collection and frontend ...")
      cfg.AssignDefaultCollAndFront()

      # create some initial files needed by various backend
      logging.info("Creating default backend files ...")
      cfg.CreateDefaultBackendFiles()

      # create built in query expansion entry
      logging.info("Creating default query expansion entry ...")
      cfg.CreateDefaultQueryExpEntry()

      install_utilities.SetInitState(cfg, C.INITIALIZED)
      logging.info("system initialization successful")
      # we start the serve_service as this is fresh install
      logging.info('Starting serve service...')
      RunServeService(cfg.globalParams, 'start')
    else:
      # we restart the babysitter because the servers map may have changed
      logging.info('Babysitting serve service after allocation...')
      RunServeService(cfg.globalParams, 'babysit')

    logging.info("Saving params")
    cfg.saveParams()

    # now we're ready to run; end startup mode
    logging.info("Done with startup")
    cfg.startupDone()

  except Exception, e:
    # Log the full traceback first, then report the failure as fatal.
    (t, v, tb) = sys.exc_info()
    exc_msg = string.join(traceback.format_exception(t, v, tb))
    logging.error(exc_msg)
    logging.fatal("StartupWork failed with exception: %s; %s" % (
      e, traceback.format_tb(tb)))
def makePhysicalFile(self, virtualFile, clientName, grepString, fileArg):
  """Makes a physical file from a virtual one.

  Looks virtualFile up in the global FILE_TABLE, resolves its input and
  output paths and, when some processing is required, runs a shell pipeline
  (tail/tac/cat, optional grep and cut) that writes the result into the
  output path.

  Args:
    virtualFile: key into FILE_TABLE naming the file to export.
    clientName: passed to the FILE_TABLE entry's path builders and to the
      log manager when a web log has to be fetched.
    grepString: optional fixed string; when set, only the lines containing
      it (case-insensitive) are kept.
    fileArg: file argument; every character other than ASCII letters,
      digits, '_', '-' and '%' is dropped before use.

  Returns:
    The path of the file holding the result: the output path when a command
    had to be run, the input path when the file can be used as is.
    None on error or when virtualFile is not in FILE_TABLE.
  """
  # Sanitize fileArg.
  allowed_chars = string.ascii_letters + string.digits + '_-%'
  fileArg = ''.join([ch for ch in fileArg if ch in allowed_chars])

  # For each file that we can export we have to have an entry in the
  # global FILE_TABLE.
  if not virtualFile or virtualFile not in FILE_TABLE:
    return None
  fe = FILE_TABLE[virtualFile]

  pathIn = fe.getPathIn(self.cfg.globalParams, clientName, fileArg,
                        grepString)
  pathOut = fe.getPathOut(self.cfg.globalParams, clientName, fileArg,
                          grepString)
  machine = E.getCrtHostName()
  tmpPath = None

  if (virtualFile == 'WEB_LOG' and self.cfg.getGlobalParam('GFS_ALIASES') and
      not os.path.exists(pathIn)):
    # Copy web log from GFS to the log directory if necessary.
    ok = self.cfg.logmanager.CopyRawReportFromGfsToLocal(fileArg, clientName)
    if not ok or not os.path.exists(pathIn):
      logging.error('Failed on CopyRawReportFromGfsToLocal()')
      return None
  elif (virtualFile == 'FEED_LOG' and
        self.cfg.getGlobalParam('GFS_ALIASES') and
        not os.path.exists(pathIn)):
    # Copy the feed log from GFS to a temporary file next to the output.
    tmpPath = pathOut + "_fromGFS"
    status, _ = E.run_fileutil_command(
        self.cfg.globalParams, "cat %s > %s" % (pathIn, tmpPath), 5)
    if E.ERR_OK != status:
      logging.error("Failed to copy %s to %s" % (pathIn, tmpPath))
      return None
    pathIn = tmpPath

  # Count the files that we can get.
  files = E.ls([machine], pathIn)
  numFiles = len(files or [])

  if numFiles == 0:
    # No files available: the result is just an empty line.  There is
    # nothing to grep or cut, so no filter (and no second redirection into
    # the same file) is chained after the echo.
    E.execute([machine], "echo -e '' > %s" % pathOut, None, false)
    if tmpPath:
      E.rm([machine], tmpPath)
    return pathOut

  # If we need only one file, and there are more than one, use the last one.
  if not fe.multi_files:
    pathIn = files[-1]

  # Pick the command producing the raw lines, if a specific one is needed.
  command = None
  if virtualFile == 'FEED_LOG':
    command = "tail -n +2 %s " % pathIn
    if fe.do_tac:
      command = command + " | tac "
  elif fe.do_tac:
    # We reverse the files before displaying.
    command = "tac `ls -r %s` " % pathIn

  # Collect the filters to apply, in order: aux grep, user grep, cut.
  filters = []
  if fe.aux_grep:
    filters.append("grep -- %s" % commands.mkarg(fe.aux_grep))
  if grepString:
    filters.append("grep -i -F -- %s" % commands.mkarg(grepString))
  if fe.aux_cut:
    filters.append("cut %s" % fe.aux_cut)

  # Filters need something to read from; default to cat'ing the files.
  if filters and not command:
    command = "cat `ls -r %s`" % pathIn
  for one_filter in filters:
    command = command + " | " + one_filter

  # If no command is needed just use the path we have.
  if not command:
    return pathIn

  # Execute the command "file+operation > temp_filename".
  E.execute([machine], command + " > " + pathOut, None, false)
  if tmpPath:
    E.rm([machine], tmpPath)
  return pathOut
#!/usr/bin/python2.4
#
# This handy command reads global config file on the current machine and
# writes the value of CONFIGVERSION parameter together with the machine name.
# (Used by periodic_script.py)
#
###############################################################################
"""
Usage :
    get_config_version.py <enthome>
"""

import sys

from google3.enterprise.legacy.util import E
from google3.enterprise.legacy.adminrunner import entconfig

if __name__ == '__main__':
  # Without <enthome> there is no config to read: print the usage and exit
  # with an error instead of dying on an IndexError.
  if len(sys.argv) < 2:
    sys.exit(__doc__)

  config = entconfig.EntConfig(sys.argv[1])
  # A config that cannot be loaded is reported as an empty version.
  if config.Load():
    version = config.var("CONFIGVERSION")
  else:
    version = ""
  sys.stdout.write("%s-%s\n" % (version, E.getCrtHostName()))

###############################################################################
def main(argv):
  """Replicates the configuration of another machine onto the local one.

  Copies the individual config files from the given machine (CopyFile /
  CopyFileAsRoot), replicates the conf and gws directories
  (ReplicateDirectory) and, unless this is a PILE configuration, re-runs
  the NTP and DNS reconfiguration.

  Exits the process when too few arguments are given, when the given
  machine is the current host, or when the config file cannot be loaded.

  Args:
    argv: argv[0] is the machine name, argv[1] the google_config file.
  """
  # Local import: only needed to quote the DNS arguments below.
  import commands

  if len(argv) < 2:
    sys.exit(__doc__)

  machine = argv[0]
  global_file = argv[1]
  global_dir = os.path.dirname(global_file)

  if E.getCrtHostName() == machine:
    logging.info("I do not try to replicate to myself")
    sys.exit(0)

  config = config_factory.ConfigFactory().CreateConfig(global_file)
  if not config.Load():
    sys.exit("ERROR: Cannot read file %s " % global_file)

  file_to_copy = [
      C.ETC_SYSCONFIG,
      global_file,
      "%s/lic_counter" % global_dir,
      "%s/lic_counter.bck" % global_dir,
      "%s/passwd" % global_dir,
      "/export/hda3/versionmanager/vmanager_passwd",
      "%s/server.p12" % global_dir,
  ]

  # Sync all the files
  for f in file_to_copy:
    CopyFile(machine, f, production_tmp)

  # sync apache certificate, then the private key
  for root_owned in (ssl_cert.CERT_FILENAME, ssl_cert.KEY_FILENAME):
    CopyFileAsRoot(machine, root_owned % config.var('ENTERPRISE_HOME'),
                   production_tmp, config)

  # sync the support request repository directory
  # GRR: ReplicateDirectory(machine, SUPPORTREQUEST_RECORDS_DIR)

  # replicate config including per collection/frontend stuff
  # some dirs in conf don't need to be rsync'ed
  if core_utils.CanRunGSAMaster(E.getCrtHostName()):
    exclude_patterns = ['cmr_working/failure/', 'cmr_working/success/',
                        'cmr_working/statusz/']
  else:
    exclude_patterns = ['cmr_working/', 'cmr/', 'fixer/', 'fixer_cmr/',
                        'gems']
  ReplicateDirectory(machine,
                     "%s/local/conf/" % config.var('ENTERPRISE_HOME'),
                     exclude_patterns)

  # to replicate per frontend gws stuff (keymatches, gws bad urls)
  ReplicateDirectory(machine, "%s/gws/" % config.var('GOOGLEDATA'))

  # need to adjust NTP if master has changed
  # assumption : this machine is not the master
  if 'PILE' != config.var('ENT_CONFIG_TYPE'):
    reconfigure_cmd = C.RECONFIGURE_COMMAND % E.getEnterpriseHome()
    os.system("%s NTP %s" % (reconfigure_cmd, machine))

    # also make sure DNS is up to date.  An unset value is passed as the
    # "" placeholder rather than the string "None", and both values are
    # shell-quoted so that each one reaches the command as one argument.
    dns_server = config.var('BOT_DNS_SERVERS')
    if dns_server is None:
      dns_server = "\"\""
    dns_searchpath = config.var('DNS_SEARCH_PATH')
    if dns_searchpath is None:
      dns_searchpath = "\"\""
    os.system("%s DNS %s %s" % (reconfigure_cmd,
                                commands.mkarg(dns_server),
                                commands.mkarg(dns_searchpath)))
def doReconfigureNet(config, machines=None, i_am_master=1,
                     force_ntp_reconfig=0):
  """Reconfigures serveriron, DNS, NTPs, iptables, timezone as needed.

  Args:
    config: configuration object; all values are read through config.var().
    machines: machines to run the reconfigure commands on. Defaults to the
      MACHINES value of the config.
    i_am_master: when true, the NTP servers are reconfigured as well.
    force_ntp_reconfig: passed to the NTP reconfigure command; 1 forces the
      NTP server reconfiguration.

  Returns:
    True when every reconfigure call returned 0 (no change) or 1 (changed),
    False when at least one of them returned 2 or more (error).

  Raises:
    Exception: when EXTERNAL_WEB_IP is empty or missing, or when USE_DHCP or
      DNS_DHCP is None in the config.
  """
  ret1 = ret2 = ret4 = ret5 = ret6 = ret8 = 0
  configType = config.var('ENT_CONFIG_TYPE')
  ent_home = E.getEnterpriseHome()
  if not machines:
    machines = config.var('MACHINES')

  # Sanity check: refuse to touch the network with a corrupt config.
  missing = None
  if not config.var('EXTERNAL_WEB_IP'):
    missing = 'EXTERNAL_WEB_IP'
  elif config.var('USE_DHCP') is None:
    missing = 'USE_DHCP'
  elif config.var('DNS_DHCP') is None:
    missing = 'DNS_DHCP'
  if missing:
    message = 'config file is missing %s, probably corrupt' % missing
    logging.error(message)
    raise Exception(message)

  one_node_types = ["SUPER", "ONEBOX", "MINI", "LITE", "FULL"]
  if configType in one_node_types:
    tries = 1
  else:
    tries = 3  # default for CLUSTER and PILE configuration

  reconfigure_cmd = C.RECONFIGURE_COMMAND % ent_home

  def _reconfigure(arguments):
    # Runs one reconfigure sub-command on the target machines.
    return ExecuteWrapper(machines, "%s %s" % (reconfigure_cmd, arguments),
                          None, 600, num_tries=tries)

  # first we need to reconfigure our external IP address
  if configType in one_node_types:
    if config.var('USE_DHCP') == 1:
      # reconfigure eth0 to dhcp mode
      ret1 = _reconfigure("LOCALNET DHCP %s" % (
          config.var('ONEBOX_NETCARD_SETTINGS'),))
      logging.info("reconfigure eth0 IP to dhcp mode: %s" % ret1)
    else:
      ret1 = _reconfigure("LOCALNET %s %s %s %s" % (
          config.var('EXTERNAL_WEB_IP'),
          config.var('EXTERNAL_NETMASK'),
          config.var('EXTERNAL_DEFAULT_ROUTE'),
          config.var('ONEBOX_NETCARD_SETTINGS'),
          ))
      logging.info("reconfigure eth0 IP address: %s" % ret1)
  elif "CLUSTER" == configType:
    # reconfigure serveriron IP address; the command is started in a
    # separate thread and its result is not waited for.
    cmd = ("(sleep 5; %s SERVERIRON %s %s %s %s %s %s %s %s %s >&/dev/null "
           "</dev/null)" % (
               reconfigure_cmd,
               "%s/local/conf/defaults/" % ent_home,
               commands.mkarg(config.var("EXTERNAL_SWITCH_IP")),
               commands.mkarg(config.var("EXTERNAL_WEB_IP")),
               commands.mkarg(config.var("EXTERNAL_CRAWL_IP")),
               commands.mkarg(config.var("EXTERNAL_NETMASK")),
               commands.mkarg(config.var("EXTERNAL_DEFAULT_ROUTE")),
               commands.mkarg(str(config.var("EXTERNAL_WEB_PORT"))),
               commands.mkarg(str(config.var("SERVERIRON_AUTONEGOTIATION"))),
               commands.mkarg(str(config.var("ENT_ENABLE_EXTERNAL_SSH"))),
               ))
    t = threading.Thread(target=E.execute,
                         args=([E.getCrtHostName()], cmd, None, 60))
    t.start()
    logging.info("serveriron IP address reconfigured")

  # we don't want to change ANYTHING on PILE clusters
  if "PILE" != configType:
    # DNS needs to be set everywhere
    dns_server = config.var('BOT_DNS_SERVERS')
    if dns_server is None:
      dns_server = "\"\""
    dns_searchpath = config.var('DNS_SEARCH_PATH')
    if dns_searchpath is None:
      dns_searchpath = "\"\""

    if config.var('DNS_DHCP') != 1:
      ret8 = _reconfigure("DNSMODE STATIC")
      logging.info("setting DNS mode to STATIC: %s" % ret8)
      ret2 = _reconfigure("DNS %s %s" % (commands.mkarg(dns_server),
                                         commands.mkarg(dns_searchpath)))
      logging.info("reconfigure DNS: %s" % ret2)
    else:
      ret8 = _reconfigure("DNSMODE DHCP")
      logging.info("setting DNS mode to DHCP: %s" % ret8)

    # NTP is special: all machines but the master must set their
    # NTP server to the master, the master to the external one.
    # However, it can take 3 minutes for the stratum level of the master
    # node to be set. Before that, non-master nodes using the master
    # node to do "ntpdate" may return "no server suitable for synchronization
    # found" error.
    # To fix the problem, we just use the external ntp servers for all nodes.
    # Later, periodic script will set the non-master nodes to use the master
    # node. (periodic script will only set it once as long as master does
    # not switch)
    ntpServers = "\"\""
    if config.var('NTP_SERVERS') is not None:
      # Work on a copy so the configured list itself is left untouched.
      ntp_server_list = copy.copy(config.var('NTP_SERVERS'))
      AddDefaultNTPServer(ntp_server_list)
      ntpServers = ",".join(ntp_server_list)

    if i_am_master:
      ret4 = _reconfigure("NTP %d %s" % (force_ntp_reconfig,
                                         commands.mkarg(ntpServers)))
      logging.info("reconfigure NTP: %s" % ret4)

    # whenever we print dates, we want to include the timezone. Since
    # the timezone may change on the machine (through config interface),
    # and java VM does not pick this change up automatically,
    # we keep track of the timezone here
    ret5 = _reconfigure("TIMEZONE %s" % (
        commands.mkarg(config.var('TIMEZONE')),))
    logging.info("reconfigure TIMEZONE: %s" % ret5)

    # iptables
    if configType in ["SUPER", "ONEBOX", "LITE", "FULL"]:
      iptables_file = "%s/local/conf/defaults/iptables.onebox" % ent_home
    elif "MINI" == configType:
      iptables_file = "%s/local/conf/defaults/iptables.mini" % ent_home
    elif "CLUSTER" == configType:
      iptables_file = "%s/local/conf/defaults/iptables.cluster" % ent_home
    else:
      iptables_file = "%s/local/conf/defaults/iptables.open" % ent_home
    ret6 = _reconfigure("IPTABLES %s %s %s" % (
        iptables_file,
        commands.mkarg(str(config.var("ENT_ENABLE_EXTERNAL_SSH"))),
        commands.mkarg(str(config.var("ENT_ENABLE_EXTERNAL_BORGMON"))),
        ))
    logging.info("reconfigure IPTABLES: %s" % ret6)

  # return value from each reconfigurenet call is:
  #   0 : no change
  #   1 : changed
  #   2+: error
  # _our_ return value is true only if none of them failed
  ret = (ret1 <= 1 and ret2 <= 1 and ret4 <= 1 and
         ret5 <= 1 and ret6 <= 1 and ret8 <= 1)
  return ret
def add(self, machine, apc_outlet):
  """Adds a machine to the configuration and brings its services up.

  The machine can only be added while the current version is in ACTIVE
  state. Every step that fails is logged; most failures abort the
  operation.

  Args:
    machine: name of the machine to add.
    apc_outlet: APC outlet of the machine; recorded in APC_MAP through
      apc_util.PortMap().

  Returns:
    0 on success, 1 on failure.
  """
  cfg = self.cfg
  ver = cfg.getGlobalParam('VERSION')
  home = cfg.getGlobalParam('ENTERPRISE_HOME')
  bashrc = cfg.getGlobalParam('ENTERPRISE_BASHRC')

  # We can add a machine only when we are in active state
  if install_utilities.install_state(ver) != "ACTIVE":
    logging.error("Can add a machine only when we are in active state")
    return 1

  # First test for accessibility of the machine.
  if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
    logging.error("Could not ssh into the machine %s" % machine)
    return 1

  # start the svs on the remote machine
  restart_svs_cmd = (
      "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
          home, home, machine))
  if E.execute([E.getCrtHostName()],
               SECURE_WRAPPER_COMMAND % (home, "-p2", restart_svs_cmd),
               None, 0) != E.ERR_OK:
    logging.error("Could not start svs on machine %s" % machine)
    return 1

  # wait for some time for svs to come up
  time.sleep(5)
  # check to see if the svs is up and is the right version
  if not svs_utilities.PingAndCheckSvsVersion(bashrc, home, machine):
    logging.error("Svs not running correctly on machine %s" % machine)
    return 1

  testver = install_utilities.is_test(ver)

  # update MACHINES
  machines = cfg.getGlobalParam('MACHINES')
  if machine not in machines:
    machines.append(machine)
  cfg.setGlobalParam('MACHINES', machines)

  if core_utils.RemDeadNode(ver, testver, machine):
    logging.error('Cannot remove dead node from lockserver.')
    # we ignore this error for now

  # We just added a new machine into the config
  # this will lead to a change in concentrator config
  # so we need to re-run serve service which will
  # write the new config and restart the concentrator
  serve_cmd = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
               "./serve_service.py %s" % (bashrc, home, home))
  E.exe("%s %s" % (serve_cmd, "babysit"))

  # Poll for the disk facts of the new machine, one minute apart. There is
  # no point in sleeping once the last attempt has been made.
  num_tries = 5
  all_disks = bad_disks = None
  for attempt in range(num_tries):
    all_disks = cfg.mach_param_cache.GetFact("mounted-drives", machine)
    bad_disks = cfg.mach_param_cache.GetFact("var_log_badhds", machine)
    if bad_disks and all_disks:
      break
    if attempt < num_tries - 1:
      time.sleep(60)
  if all_disks is None or bad_disks is None:
    logging.error("Could not get machine information about %s" % machine)
    return 1

  # Keep the disks that are not reported bad, in order and without
  # duplicates.
  bad_disks = bad_disks.split(' ')
  unique_good_disks = []
  for disk in all_disks.split(' '):
    if disk in bad_disks:
      continue
    # append the partition number and change sda3 to hda3 etc.
    disk = re.sub(r'^s', 'h', "%s3" % disk)
    if disk not in unique_good_disks:
      unique_good_disks.append(disk)

  # Add disks
  self.updatedisk(machine, unique_good_disks, true)

  # apc map update
  apc_map = cfg.globalParams.var_copy('APC_MAP')
  apc_map[machine] = apc_util.PortMap(apc_outlet)
  if not cfg.setGlobalParam('APC_MAP', apc_map):
    logging.error("ERROR setting apc map to %s" % repr(apc_map))
    return 1

  # create appropriate datadirs on that machine
  if not cfg.createDataDirs([machine], node_replacement=1):
    logging.error("ERROR could not create datadirs on machine %s" % machine)
    return 1

  # Replicate the config
  cfg.replicateConfigOnMachine(machine)

  # Reconfigure net on the target machine
  if not reconfigurenet_util.doReconfigureNet(cfg.globalParams, [machine],
                                              i_am_master=0):
    logging.error('reconfigurenet failed for %s' % machine)
    return 1

  # Start core services on the new node
  if not install_utilities.start_core(ver, home, [machine], ignore=0):
    logging.error("ERROR could not start core services on %s" % machine)
    return 1
  # Add the chunkserver back
  gfs_utils.AddGFSChunkservers(ver, testver, [machine])

  # first we need to do Machine allocation.
  # this will assign things that will satisfy the constraints
  if not cfg.DoMachineAllocation(serversets=['workqueue-slave']):
    logging.error("ERROR doing machine allocation")
    return 1

  # now try to reallocate some servers from existing machines to the new
  # machine
  replaced = cfg.AllocateServersToNewMachine(machine)
  if not replaced:
    logging.error("ERROR allocating services to the new machine")
    return 1

  # first we need to restart the babysitter
  E.exe("%s %s" % (serve_cmd, "babysit"))
  time.sleep(60)

  # Now we need to stop all the replaced services; a failed kill is only
  # logged.
  for server_string in replaced:
    server = serverlib.Server()
    server.InitFromName(server_string)
    replaced_type = server.servertype()
    kill_cmd = servertype.GetKillCmd(replaced_type, server.port())
    if E.execute([server.host()], kill_cmd, None, 1) != E.ERR_OK:
      logging.error("ERROR killing %s running on port %d on %s" % (
          replaced_type, server.port(), server.host()))

  # we should make it active
  if not install_utilities.set_install_state(machine, home, "ACTIVE"):
    logging.error("ERROR changing state on machine %s. "
                  "Please make it active and activate and "
                  "start crawl service on it" % machine)
    return 1

  crawl_cmd = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
               "./crawl_service.py %s" % (bashrc, home, home))
  if E.execute([machine], "%s %s" % (crawl_cmd, "start"),
               None, 1) != E.ERR_OK:
    logging.error("Could not start crawl service on %s" % machine)
    return 1

  # save all the params
  cfg.saveParams()

  # for faster crawl recovery, lets restart all crawl processes
  self.restart_crawl_processes(serve_cmd)

  # activate the crawl, logcontrol and serve services on the remote machine
  for service in ("crawl", "logcontrol", "serve"):
    activate_cmd = ("/etc/rc.d/init.d/%s_%s activate >&/dev/null"
                    "</dev/null" % (service, ver))
    if E.execute([machine],
                 SECURE_WRAPPER_COMMAND % (home, "-e", activate_cmd),
                 None, 0) != E.ERR_OK:
      logging.error("Could not activate %s service on machine %s" % (
          service, machine))
      logging.error("Please activate by hand")
      return 1

  logging.info("Machine %s successfully added into the system" % machine)

  if not mail_already_sent(M.MSG_MACHINEADDED % machine):
    SendMail.send(cfg, None, false,
                  M.MSG_MACHINEADDED % machine, "", true)
  return 0
def add(self, machine, apc_outlet):
  """Adds a machine to the configuration and brings its services up.

  The machine can only be added while the current version is in ACTIVE
  state. Every step that fails is logged; most failures abort the
  operation.

  Args:
    machine: name of the machine to add.
    apc_outlet: APC outlet of the machine; recorded in APC_MAP through
      apc_util.PortMap().

  Returns:
    0 on success, 1 on failure.
  """
  cfg = self.cfg
  ver = cfg.getGlobalParam('VERSION')
  home = cfg.getGlobalParam('ENTERPRISE_HOME')
  bashrc = cfg.getGlobalParam('ENTERPRISE_BASHRC')

  # We can add a machine only when we are in active state
  if install_utilities.install_state(ver) != "ACTIVE":
    logging.error("Can add a machine only when we are in active state")
    return 1

  # First test for accessibility of the machine.
  if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
    logging.error("Could not ssh into the machine %s" % machine)
    return 1

  # start the svs on the remote machine
  restart_svs_cmd = (
      "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
          home, home, machine))
  if E.execute([E.getCrtHostName()],
               SECURE_WRAPPER_COMMAND % (home, "-p2", restart_svs_cmd),
               None, 0) != E.ERR_OK:
    logging.error("Could not start svs on machine %s" % machine)
    return 1

  # wait for some time for svs to come up
  time.sleep(5)
  # check to see if the svs is up and is the right version
  if not svs_utilities.PingAndCheckSvsVersion(bashrc, home, machine):
    logging.error("Svs not running correctly on machine %s" % machine)
    return 1

  testver = install_utilities.is_test(ver)

  # update MACHINES
  machines = cfg.getGlobalParam('MACHINES')
  if machine not in machines:
    machines.append(machine)
  cfg.setGlobalParam('MACHINES', machines)

  if core_utils.RemDeadNode(ver, testver, machine):
    logging.error('Cannot remove dead node from lockserver.')
    # we ignore this error for now

  # We just added a new machine into the config
  # this will lead to a change in concentrator config
  # so we need to re-run serve service which will
  # write the new config and restart the concentrator
  serve_cmd = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
               "./serve_service.py %s" % (bashrc, home, home))
  E.exe("%s %s" % (serve_cmd, "babysit"))

  # Poll for the disk facts of the new machine, one minute apart. There is
  # no point in sleeping once the last attempt has been made.
  num_tries = 5
  all_disks = bad_disks = None
  for attempt in range(num_tries):
    all_disks = cfg.mach_param_cache.GetFact("mounted-drives", machine)
    bad_disks = cfg.mach_param_cache.GetFact("var_log_badhds", machine)
    if bad_disks and all_disks:
      break
    if attempt < num_tries - 1:
      time.sleep(60)
  if all_disks is None or bad_disks is None:
    logging.error("Could not get machine information about %s" % machine)
    return 1

  # Keep the disks that are not reported bad, in order and without
  # duplicates.
  bad_disks = bad_disks.split(' ')
  unique_good_disks = []
  for disk in all_disks.split(' '):
    if disk in bad_disks:
      continue
    # append the partition number and change sda3 to hda3 etc.
    disk = re.sub(r'^s', 'h', "%s3" % disk)
    if disk not in unique_good_disks:
      unique_good_disks.append(disk)

  # Add disks
  self.updatedisk(machine, unique_good_disks, true)

  # apc map update
  apc_map = cfg.globalParams.var_copy('APC_MAP')
  apc_map[machine] = apc_util.PortMap(apc_outlet)
  if not cfg.setGlobalParam('APC_MAP', apc_map):
    logging.error("ERROR setting apc map to %s" % repr(apc_map))
    return 1

  # create appropriate datadirs on that machine
  if not cfg.createDataDirs([machine], node_replacement=1):
    logging.error("ERROR could not create datadirs on machine %s" % machine)
    return 1

  # Replicate the config
  cfg.replicateConfigOnMachine(machine)

  # Reconfigure net on the target machine
  if not reconfigurenet_util.doReconfigureNet(cfg.globalParams, [machine],
                                              i_am_master=0):
    logging.error('reconfigurenet failed for %s' % machine)
    return 1

  # Start core services on the new node
  if not install_utilities.start_core(ver, home, [machine], ignore=0):
    logging.error("ERROR could not start core services on %s" % machine)
    return 1
  # Add the chunkserver back
  gfs_utils.AddGFSChunkservers(ver, testver, [machine])

  # first we need to do Machine allocation.
  # this will assign things that will satisfy the constraints
  if not cfg.DoMachineAllocation(serversets=['workqueue-slave']):
    logging.error("ERROR doing machine allocation")
    return 1

  # now try to reallocate some servers from existing machines to the new
  # machine
  replaced = cfg.AllocateServersToNewMachine(machine)
  if not replaced:
    logging.error("ERROR allocating services to the new machine")
    return 1

  # first we need to restart the babysitter
  E.exe("%s %s" % (serve_cmd, "babysit"))
  time.sleep(60)

  # Now we need to stop all the replaced services; a failed kill is only
  # logged.
  for server_string in replaced:
    server = serverlib.Server()
    server.InitFromName(server_string)
    replaced_type = server.servertype()
    kill_cmd = servertype.GetKillCmd(replaced_type, server.port())
    if E.execute([server.host()], kill_cmd, None, 1) != E.ERR_OK:
      logging.error("ERROR killing %s running on port %d on %s" % (
          replaced_type, server.port(), server.host()))

  # we should make it active
  if not install_utilities.set_install_state(machine, home, "ACTIVE"):
    logging.error("ERROR changing state on machine %s. "
                  "Please make it active and activate and "
                  "start crawl service on it" % machine)
    return 1

  crawl_cmd = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
               "./crawl_service.py %s" % (bashrc, home, home))
  if E.execute([machine], "%s %s" % (crawl_cmd, "start"),
               None, 1) != E.ERR_OK:
    logging.error("Could not start crawl service on %s" % machine)
    return 1

  # save all the params
  cfg.saveParams()

  # for faster crawl recovery, lets restart all crawl processes
  self.restart_crawl_processes(serve_cmd)

  # activate the crawl, logcontrol and serve services on the remote machine
  for service in ("crawl", "logcontrol", "serve"):
    activate_cmd = ("/etc/rc.d/init.d/%s_%s activate >&/dev/null"
                    "</dev/null" % (service, ver))
    if E.execute([machine],
                 SECURE_WRAPPER_COMMAND % (home, "-e", activate_cmd),
                 None, 0) != E.ERR_OK:
      logging.error("Could not activate %s service on machine %s" % (
          service, machine))
      logging.error("Please activate by hand")
      return 1

  logging.info("Machine %s successfully added into the system" % machine)

  if not mail_already_sent(M.MSG_MACHINEADDED % machine):
    SendMail.send(cfg, None, false,
                  M.MSG_MACHINEADDED % machine, "", true)
  return 0
python2_invoke_string = "" if FLAGS.use_python2: # we want to invoke using python2 python2_invoke_string = "python2 " action = "--start=all" if FLAGS.kill_only: # we only want to kill and not restart action = "--kill=all" cmd = ('. %s; cd %s/enterprise/legacy/production/babysitter; ' ' %s./babysitter.py --batch %s %s --ports=%s --babyalias=localhost ' '--lockdir=%s/tmp --mailto= --mach=%s %s' % ( cp.var("BASHRC_FILE"), cp.var("MAIN_GOOGLE3_DIR"), python2_invoke_string, action, use_invalid_config, FLAGS.port, cp.var("ENTERPRISE_HOME"), FLAGS.machine, cp.GetConfigFileName() )) logging.info("Executing [%s]" % cmd) if E.ERR_OK != E.execute([E.getCrtHostName()], cmd, None, true): sys.exit("Error restarting server at %s:%s" % (FLAGS.machine, FLAGS.port)) if __name__ == '__main__': main(sys.argv)
sys.exit("Unable to load config file %s" % (cp.GetConfigFileName())) use_invalid_config = "" if FLAGS.useinvalidconfig: use_invalid_config = "--useinvalidconfig" python2_invoke_string = "" if FLAGS.use_python2: # we want to invoke using python2 python2_invoke_string = "python2 " action = "--start=all" if FLAGS.kill_only: # we only want to kill and not restart action = "--kill=all" cmd = ('. %s; cd %s/enterprise/legacy/production/babysitter; ' ' %s./babysitter.py --batch %s %s --ports=%s --babyalias=localhost ' '--lockdir=%s/tmp --mailto= --mach=%s %s' % (cp.var("BASHRC_FILE"), cp.var("MAIN_GOOGLE3_DIR"), python2_invoke_string, action, use_invalid_config, FLAGS.port, cp.var("ENTERPRISE_HOME"), FLAGS.machine, cp.GetConfigFileName())) logging.info("Executing [%s]" % cmd) if E.ERR_OK != E.execute([E.getCrtHostName()], cmd, None, true): sys.exit("Error restarting server at %s:%s" % (FLAGS.machine, FLAGS.port)) if __name__ == '__main__': main(sys.argv)
def main(argv):
  """Replicates the configuration of another machine onto the local one.

  Copies the individual config files from the given machine (CopyFile /
  CopyFileAsRoot), replicates the conf and gws directories
  (ReplicateDirectory) and, unless this is a PILE configuration, re-runs
  the NTP and DNS reconfiguration.

  Exits the process when too few arguments are given, when the given
  machine is the current host, or when the config file cannot be loaded.

  Args:
    argv: argv[0] is the machine name, argv[1] the google_config file.
  """
  # Local import: only needed to quote the DNS arguments below.
  import commands

  if len(argv) < 2:
    sys.exit(__doc__)

  machine = argv[0]
  global_file = argv[1]
  global_dir = os.path.dirname(global_file)

  if E.getCrtHostName() == machine:
    logging.info("I do not try to replicate to myself")
    sys.exit(0)

  config = config_factory.ConfigFactory().CreateConfig(global_file)
  if not config.Load():
    sys.exit("ERROR: Cannot read file %s " % global_file)

  file_to_copy = [
      C.ETC_SYSCONFIG,
      global_file,
      "%s/lic_counter" % global_dir,
      "%s/lic_counter.bck" % global_dir,
      "%s/passwd" % global_dir,
      "/export/hda3/versionmanager/vmanager_passwd",
      "%s/server.p12" % global_dir,
  ]

  # Sync all the files
  for f in file_to_copy:
    CopyFile(machine, f, production_tmp)

  # sync apache certificate, then the private key
  for root_owned in (ssl_cert.CERT_FILENAME, ssl_cert.KEY_FILENAME):
    CopyFileAsRoot(machine, root_owned % config.var('ENTERPRISE_HOME'),
                   production_tmp, config)

  # sync the support request repository directory
  # GRR: ReplicateDirectory(machine, SUPPORTREQUEST_RECORDS_DIR)

  # replicate config including per collection/frontend stuff
  # some dirs in conf don't need to be rsync'ed
  if core_utils.CanRunGSAMaster(E.getCrtHostName()):
    exclude_patterns = ['cmr_working/failure/', 'cmr_working/success/',
                        'cmr_working/statusz/']
  else:
    exclude_patterns = ['cmr_working/', 'cmr/', 'fixer/', 'fixer_cmr/',
                        'gems']
  ReplicateDirectory(machine,
                     "%s/local/conf/" % config.var('ENTERPRISE_HOME'),
                     exclude_patterns)

  # to replicate per frontend gws stuff (keymatches, gws bad urls)
  ReplicateDirectory(machine, "%s/gws/" % config.var('GOOGLEDATA'))

  # need to adjust NTP if master has changed
  # assumption : this machine is not the master
  if 'PILE' != config.var('ENT_CONFIG_TYPE'):
    reconfigure_cmd = C.RECONFIGURE_COMMAND % E.getEnterpriseHome()
    os.system("%s NTP %s" % (reconfigure_cmd, machine))

    # also make sure DNS is up to date.  An unset value is passed as the
    # "" placeholder rather than the string "None", and both values are
    # shell-quoted so that each one reaches the command as one argument.
    dns_server = config.var('BOT_DNS_SERVERS')
    if dns_server is None:
      dns_server = "\"\""
    dns_searchpath = config.var('DNS_SEARCH_PATH')
    if dns_searchpath is None:
      dns_searchpath = "\"\""
    os.system("%s DNS %s %s" % (reconfigure_cmd,
                                commands.mkarg(dns_server),
                                commands.mkarg(dns_searchpath)))