def kill_service(services, machines): """Kill all processes associated with specified services on the specified machines. E.execute() sends the commands concurrently when there is more than one node. Args: services: list of services to kill. 'adminconsole' and 'adminrunner' are currently supported. machines: list of hostnames """ # Map of services to the command that kills the service find_service_pid_cmd = { 'adminconsole': ("ps -e -o pid,args --width 100 | " "grep loop_AdminConsole.py | grep -v grep | " "awk '{print $1}' ; " "%s" % python_kill.GetServicesListeningOn(['8000'])), 'adminrunner': ("ps -e -o pid,args --width 100 | " "grep loop_AdminRunner.py | grep -v grep | " "awk '{print $1}' ; " "%s" % python_kill.GetServicesListeningOn(['2100'])), } for service in services: if service not in find_service_pid_cmd: logging.error('kill_service: Unrecognised service "%s"' % service) else: logging.info('kill_service: Killing service "%s" on %d nodes...' % (service, len(machines))) kill_cmd = ('sh -c "(kill `%s`; sleep 3; kill -9 `%s`; true)" ' '> /dev/null 2>&1' % (find_service_pid_cmd[service], find_service_pid_cmd[service])) E.execute(machines, kill_cmd, [], alarm=1, verbose=0) logging.info('kill_service: Done killing service "%s"' % service)
def SyncLogsWithCanonical(ver, canonical):
  """ sync logs with the canonical

  Arguments:
    ver: '4.6.5'
    canonical: 'ent1'
  """
  gfs_master_nodes, _ = core_utils.GFSMasterNodes()
  # The canonical node already holds the authoritative log; sync the others.
  gfs_master_nodes.remove(canonical)
  gfs_master_dir = '/export/hda3/%s/data/gfs_master' % ver
  log_dir = '%s/ent.gfsmaster' % gfs_master_dir
  # Timestamped backup name so repeated syncs don't collide.
  backup_log_dir = '%s.backup.%d' % (log_dir, int(time.time()))
  vars = {'gfs_master_dir': gfs_master_dir,
          'log_dir': log_dir,
          'backup_log_dir': backup_log_dir,
          'canonical': canonical
         }
  # Drop old backups, move the local log aside, recreate the dir, then pull a
  # checksum-verified (-c) copy of the log from the canonical node via rsync.
  cmd = ('rm -rf %(log_dir)s.backup*; '
         'mv %(log_dir)s %(backup_log_dir)s; '
         'mkdir -p %(log_dir)s; chown nobody:nobody %(log_dir)s; '
         'rsync -c -e ssh -aHpogt %(canonical)s:%(log_dir)s %(gfs_master_dir)s'
         % vars
        )
  out = []
  enthome = '/export/hda3/%s' % ver
  E.execute(gfs_master_nodes, cmd, out, 1200, 1, 0, enthome)
def SyncLogsWithCanonical(ver, canonical):
  """ sync logs with the canonical

  Arguments:
    ver: '4.6.5'
    canonical: 'ent1'
  """
  gfs_master_nodes, _ = core_utils.GFSMasterNodes()
  # The canonical node already holds the authoritative log; sync the others.
  gfs_master_nodes.remove(canonical)
  gfs_master_dir = '/export/hda3/%s/data/gfs_master' % ver
  log_dir = '%s/ent.gfsmaster' % gfs_master_dir
  # Timestamped backup name so repeated syncs don't collide.
  backup_log_dir = '%s.backup.%d' % (log_dir, int(time.time()))
  vars = {
      'gfs_master_dir': gfs_master_dir,
      'log_dir': log_dir,
      'backup_log_dir': backup_log_dir,
      'canonical': canonical
  }
  # Drop old backups, move the local log aside, recreate the dir, then pull a
  # checksum-verified (-c) copy of the log from the canonical node via rsync.
  cmd = (
      'rm -rf %(log_dir)s.backup*; '
      'mv %(log_dir)s %(backup_log_dir)s; '
      'mkdir -p %(log_dir)s; chown nobody:nobody %(log_dir)s; '
      'rsync -c -e ssh -aHpogt %(canonical)s:%(log_dir)s %(gfs_master_dir)s'
      % vars)
  out = []
  enthome = '/export/hda3/%s' % ver
  E.execute(gfs_master_nodes, cmd, out, 1200, 1, 0, enthome)
def installkey(self):
    """ installs the staging key as the currently installed private key

    returns:
      0 on success,
      1 on empty install key (not an error)
      2 when the private key is invalid
      3 when the private key could not be distributed
    """
    self.updatelock.acquire()
    try:
        # first verify if the staging key is empty (not an error)
        if (
            not os.path.exists(ssl_cert.STAGINGKEY_FILENAME % self.cfg.getGlobalParam("ENTERPRISE_HOME"))
        ) or 0 == len(open(ssl_cert.STAGINGKEY_FILENAME % self.cfg.getGlobalParam("ENTERPRISE_HOME"), "r").read()):
            return "1"

        # next verify that the staging key is a valid file
        verifycmd = "secure_script_wrapper -p2 %s verifystagingkey %s" % (
            self.sslWrapperPath,
            self.cfg.getGlobalParam("ENTERPRISE_HOME"),
        )
        outputList = []
        verifycode = E.execute(["localhost"], verifycmd, outputList, 60)

        if verifycode != 0:
            # remove the invalid staging key so it cannot linger around
            E.rm(["localhost"], ssl_cert.STAGINGKEY_FILENAME % self.cfg.getGlobalParam("ENTERPRISE_HOME"))
            logging.error("Verify failed for key [%s]; error code: %d" % (str(outputList), verifycode))
            return "2"

        # distribute the staging key
        retcode = E.distribute(
            self.cfg.getGlobalParam("MACHINES"),
            ssl_cert.STAGINGKEY_FILENAME % self.cfg.getGlobalParam("ENTERPRISE_HOME"),
            60,
        )
        if retcode != 0:
            logging.error("Couldn't distribute private key, error %d" % retcode)
            return "3"

        # next, copy the key on all machines
        cmd = "secure_script_wrapper -p2 %s installkey %s" % (
            self.sslWrapperPath,
            self.cfg.getGlobalParam("ENTERPRISE_HOME"),
        )
        outputList = []
        retcode = E.execute(self.cfg.getGlobalParam("MACHINES"), cmd, outputList, 60)
        if retcode != 0:
            logging.error("Couldn't install cert: %s" % str(outputList))
            return "3"

        self.writeAdminRunnerOpMsg(M.MSG_LOG_SSL_KEY_INSTALLED)
    finally:
        self.updatelock.release()
    return "0"
def installkey(self):
  """ installs the staging key as the currently installed private key

  returns:
    0 on success,
    1 on empty install key (not an error)
    2 when the private key is invalid
    3 when the private key could not be distributed
  """
  self.updatelock.acquire()
  try:
    # first verify if the staging key is empty (not an error)
    if (not os.path.exists(ssl_cert.STAGINGKEY_FILENAME % \
                           self.cfg.getGlobalParam("ENTERPRISE_HOME"))) or \
       0 == len(open(ssl_cert.STAGINGKEY_FILENAME % \
                     self.cfg.getGlobalParam("ENTERPRISE_HOME"), "r").read()):
      return "1"

    # next verify that the staging key is a valid file
    verifycmd = "secure_script_wrapper -p2 %s verifystagingkey %s" % (
        self.sslWrapperPath,
        self.cfg.getGlobalParam("ENTERPRISE_HOME") )

    outputList = []
    verifycode = E.execute(['localhost'], verifycmd, outputList, 60)

    if verifycode != 0:
      # remove the invalid staging key so it cannot linger around
      E.rm(['localhost'], ssl_cert.STAGINGKEY_FILENAME %
           self.cfg.getGlobalParam("ENTERPRISE_HOME"))
      logging.error("Verify failed for key [%s]; error code: %d" %
                    (str(outputList), verifycode) )
      return "2"

    # distribute the staging key
    retcode = E.distribute(self.cfg.getGlobalParam("MACHINES"),
                           ssl_cert.STAGINGKEY_FILENAME %
                           self.cfg.getGlobalParam("ENTERPRISE_HOME"),
                           60)
    if retcode != 0:
      logging.error("Couldn't distribute private key, error %d" % retcode)
      return "3"

    # next, copy the key on all machines
    cmd = "secure_script_wrapper -p2 %s installkey %s" % (
        self.sslWrapperPath,
        self.cfg.getGlobalParam("ENTERPRISE_HOME"))

    outputList = []
    retcode = E.execute(self.cfg.getGlobalParam("MACHINES"),
                        cmd, outputList, 60)

    if retcode != 0:
      logging.error("Couldn't install cert: %s" % str(outputList))
      return "3"

    self.writeAdminRunnerOpMsg(M.MSG_LOG_SSL_KEY_INSTALLED)
  finally:
    self.updatelock.release()
  return "0"
def HaltMachines(enthome, machines):
  'Stops and powers down machines.'
  logging.info("Halting machines: %s" % string.join(machines, ","))
  # Backgrounded shutdown so E.execute() returns without waiting for halt.
  E.execute(machines, '/sbin/shutdown -h now &', None, 1)
  # Give the nodes a minute to halt cleanly before cutting power via the APC.
  time.sleep(60)
  for machine in machines:
    SendAPCCommand(enthome, machine, APC_OFF)
  return 0
def RebootMachine(enthome, machine):
  'Reboots a machine.'
  if machine == E.getCrtHostName():
    # Rebooting ourself
    logging.info('Rebooting %s' % machine)
    E.execute([E.LOCALHOST], '/sbin/shutdown -r now', None, 1)
    # If we're still alive after a minute , the APC will kick in
  else:
    # Try shutting down cleanly first
    logging.info('Shutting down %s' % machine)
    E.execute([machine], '/sbin/shutdown -h now &', None, 1)
    time.sleep(60)
  # Either way, finish (or back up the clean path) with an APC power-cycle.
  logging.info('Rebooting %s via APC' % machine)
  return SendAPCCommand(enthome, machine, APC_REBOOT)
def installcert(self):
    """ installs the staging certificate as the currently installed
    certificate

    returns:
      0 on success, and 1 on failure
    """
    self.updatelock.acquire()
    try:
        # first verify that the staging certificate is a valid file
        verifycmd = "secure_script_wrapper -p2 %s verifystagingcert %s" % (
            self.sslWrapperPath,
            self.cfg.getGlobalParam("ENTERPRISE_HOME"),
        )
        outputList = []
        verifycode = E.execute(["localhost"], verifycmd, outputList, 60)

        if verifycode != 0:
            # remove the invalid staging cert so it cannot linger around
            E.rm(["localhost"], ssl_cert.STAGINGCERT_FILENAME % self.cfg.getGlobalParam("ENTERPRISE_HOME"))
            logging.error("Verify failed for certificate [%s]; error code: %d" % (str(outputList), verifycode))
            return "1"

        # distribute the staging certificate
        retcode = E.distribute(
            self.cfg.getGlobalParam("MACHINES"),
            ssl_cert.STAGINGCERT_FILENAME % self.cfg.getGlobalParam("ENTERPRISE_HOME"),
            60,
        )
        if retcode != 0:
            # NOTE(review): unlike installkey, a distribute failure here is
            # only logged and installation continues -- confirm intentional.
            logging.error("Couldn't distribute apache cert, error %d" % retcode)

        # next, generate the certificate on all machines
        cmd = "secure_script_wrapper -p2 %s installcert %s" % (
            self.sslWrapperPath,
            self.cfg.getGlobalParam("ENTERPRISE_HOME"),
        )
        outputList = []
        retcode = E.execute(self.cfg.getGlobalParam("MACHINES"), cmd, outputList, 60)
        if retcode != 0:
            logging.error("Couldn't install cert: %s" % str(outputList))
            return "1"

        self.writeAdminRunnerOpMsg(M.MSG_LOG_SSL_CERT_INSTALLED)
    finally:
        self.updatelock.release()
    return "0"
def check_klogd_syslogd_conf(machines, enthome, unittestdir=None):
  """ babysit klogd.conf and syslogd.conf file

  Recreate klogd.conf and syslogd.conf if they are not in the dir.

  Args:
    machines: ['ent1', 'ent2', 'ent3', 'ent4', 'ent5']
    enthome: '/export/hda3/4.6.0.G.27/'
    unittestdir: '/tmp/etc/localbabysitter.d/' -- used for unittest only
  """
  # localbabysitter config bodies for the two syslog daemons.
  KLOGD_CONF_DATA = (
      "klogd = {\n"
      " restart_command : '/sbin/service syslog restart',\n"
      " timeout : 30,\n"
      " interval : 30,\n"
      " use_service_wrapper : 0,\n"
      " pidfile : '/var/run/klogd.pid',\n"
      "}\n"
  )
  SYSLOGD_CONF_DATA = (
      "syslogd = {\n"
      " restart_command : '/sbin/service syslog restart',\n"
      " timeout : 30,\n"
      " interval : 30,\n"
      " use_service_wrapper : 0,\n"
      " pidfile : '/var/run/syslogd.pid',\n"
      "}\n"
  )
  # Shell snippet: create the file only if it does not exist (idempotent).
  CHECK_CREATE_CMD = (
      'if [ ! -e "%(file)s" ]\n'
      'then\n'
      ' echo "%(data)s" > %(file)s\n'
      ' chmod 644 %(file)s\n'
      'fi\n'
  )
  # Renamed from 'dir'/'file' to avoid shadowing the builtins.
  if unittestdir is None:
    conf_dir = '/etc/localbabysitter.d/'
  else:
    conf_dir = unittestdir
  file_info = {'klogd.conf': KLOGD_CONF_DATA,
               'syslogd.conf': SYSLOGD_CONF_DATA}
  for fname, data in file_info.items():
    conf_file = os.path.join(conf_dir, fname)
    cmd = CHECK_CREATE_CMD % {'data': data, 'file': conf_file}
    if unittestdir is None:
      # Production path: run the check-and-create snippet on every node.
      E.execute(machines, cmd, [], 0, 0, 0, enthome)
    else:
      # Unit-test path: run the same snippet locally.
      os.system(cmd)
def installcert(self):
  """ installs the staging certificate as the currently installed
  certificate

  returns:
    0 on success, and 1 on failure
  """
  self.updatelock.acquire()
  try:
    # first verify that the staging certificate is a valid file
    verifycmd = "secure_script_wrapper -p2 %s verifystagingcert %s" % (
        self.sslWrapperPath,
        self.cfg.getGlobalParam("ENTERPRISE_HOME") )

    outputList = []
    verifycode = E.execute(['localhost'], verifycmd, outputList, 60)

    if verifycode != 0:
      # remove the invalid staging cert so it cannot linger around
      E.rm(['localhost'], ssl_cert.STAGINGCERT_FILENAME %
           self.cfg.getGlobalParam("ENTERPRISE_HOME"))
      logging.error("Verify failed for certificate [%s]; error code: %d" %
                    (str(outputList), verifycode) )
      return "1"

    # distribute the staging certificate
    retcode = E.distribute(self.cfg.getGlobalParam("MACHINES"),
                           ssl_cert.STAGINGCERT_FILENAME %
                           self.cfg.getGlobalParam("ENTERPRISE_HOME"),
                           60)
    if retcode != 0:
      # NOTE(review): unlike installkey, a distribute failure here is only
      # logged and installation continues -- confirm intentional.
      logging.error("Couldn't distribute apache cert, error %d" % retcode)

    # next, generate the certificate on all machines
    cmd = "secure_script_wrapper -p2 %s installcert %s" % (
        self.sslWrapperPath,
        self.cfg.getGlobalParam("ENTERPRISE_HOME"))

    outputList = []
    retcode = E.execute(self.cfg.getGlobalParam("MACHINES"),
                        cmd, outputList, 60)

    if retcode != 0:
      logging.error("Couldn't install cert: %s" % str(outputList))
      return "1"

    self.writeAdminRunnerOpMsg(M.MSG_LOG_SSL_CERT_INSTALLED)
  finally:
    self.updatelock.release()
  return "0"
def setcert(self, certBody):
  """ Takes a cert file body as the input, and saves it as the staging
  certificate

  returns 0 on success, or 1 on failure
  """
  retval = 0
  self.updatelock.acquire()
  try:
    try:
      open(ssl_cert.STAGINGCERT_FILENAME %
           self.cfg.getGlobalParam("ENTERPRISE_HOME"), "w").write(certBody)
    except IOError:
      retval = 1
      logging.error(
          "Couldn't save certificate to [%s]" %
          (ssl_cert.STAGINGCERT_FILENAME %
           self.cfg.getGlobalParam("ENTERPRISE_HOME")))

    if retval == 0:
      # verify the newly saved staging cert via the ssl wrapper script
      verifycmd = "secure_script_wrapper -p2 %s verifystagingcert %s" % (
          self.sslWrapperPath,
          self.cfg.getGlobalParam("ENTERPRISE_HOME"),
      )
      outputList = []
      verifycode = E.execute(["localhost"], verifycmd, outputList, 60)

      if verifycode != 0:
        retval = 1
        # remove the unverifiable cert so it cannot be installed later
        E.rm(["localhost"], ssl_cert.STAGINGCERT_FILENAME %
             self.cfg.getGlobalParam("ENTERPRISE_HOME"))
        logging.error("Couldn't verify certificate [%s]; error code: %d" %
                      (str(outputList), verifycode))
  finally:
    self.updatelock.release()

  return "%d" % retval
def core_op_out(ver, home, op, nodes, ignore=0, testver=None, gfs=1):
  """Executes ent_core command and returns the result.

  ent_core is running at the same time on all the nodes in different threads.

  Arguments:
    ver: '4.6.5'
    home: '/export/hda3/4.6.5'
    op: 'info'
    nodes: ['ent1', 'ent2']
    ignore: 1 - ignore errors; 0 - otherwise.
    testver: 1 - if this is a test version; 0 - otherwise.
    gfs: 1 - start gfs during activation; 0 - otherwise.
  Returns:
    error code and output from all nodes.
    (0, 'state=RUNNING\nnodes=5\nfailures=1\nstate=RUNNING\nnodes=5\nfailures=1')
  """
  if testver == None:
    # default to whatever is_test() says about this version string
    testver = is_test(ver)
  out = []
  core_base = ('/export/hda3/%s/local/google/bin/secure_script_wrapper '
               '-e /export/hda3/%s/bin/ent_core --ver=%s' % (ver, ver, ver))
  if testver:
    core_base = '%s --testver' % core_base
  if gfs == 0:
    core_base = '%s --gfs=0' % core_base
  cmd = '%s --%s' % (core_base, op)
  ret = E.execute(nodes, cmd, out, 0, 0, 0, home, ignore=ignore)
  if ret:
    logging.error(''.join(out))
  return ret, ''.join(out)
def _RunServeCmd(cfg, version, cmd, allnodes=0):
  """Run serve_service command.

  cmd: 'stop', 'start', 'activate', 'deactivate'
  allnodes: 1 to run command on all nodes

  Returns 0 on success, 1 on failure.
  """
  serve_service_cmd = (
      '/export/hda3/%s/local/google3/enterprise/legacy/scripts/'
      'serve_service.py %s %s' % (version,
                                  cfg.getGlobalParam('ENTERPRISE_HOME'),
                                  cmd))
  logging.info('Running: %s' % serve_service_cmd)
  if allnodes:
    machines = cfg.getGlobalParam(C.MACHINES)
  else:
    machines = [E.getCrtHostName()]

  # run under the secure wrapper so serve_service.py gets the needed perms
  if E.execute(machines,
               SECURE_WRAPPER_COMMAND % (
                   cfg.getGlobalParam('ENTERPRISE_HOME'),
                   '-p2',
                   serve_service_cmd),
               None, 0) != E.ERR_OK:
    logging.error('%s: failed' % serve_service_cmd)
    return 1
  logging.info('%s: completed' % serve_service_cmd)
  return 0
def renice_svs(machine, new_priority=SVS_NICE_VALUE):
  """Renices svs on 'machine' with new_priority.

  Args:
    machine: hostname to run the renice command on.
    new_priority: target nice value for the SVS process group.

  Returns:
    0 on failure, 1 on success.
  """
  pid = None
  ret = 0
  # Get pid of svs.
  try:
    pid = open(SVS_PID_FILE, 'r').read()
  except:
    err = str(sys.exc_info()[:2])
    logging.error('Error reading SVS pid from %s: %s' % (SVS_PID_FILE, err))
  if pid is not None:
    # Get nice value for pid
    old_priority = os_utils.GetAttr('nice', int(pid))
    # Only renice when the current priority differs from the target.
    if old_priority is not None and int(old_priority) != new_priority:
      logging.info('Renicing SVS to %d.' % new_priority)
      # Get group id from pid.
      pgid = os_utils.GetAttr('pgid', int(pid))
      # Renice the whole group.
      cmd = 'renice %d -g %d' % (new_priority, int(pgid))
      rc = E.execute([machine], cmd, None, 1)
      if rc == 0:
        ret = 1
      else:
        # Bug fix: log the command's return code (rc); the old code logged
        # 'ret', which is always 0 on this path.
        logging.error('Error renicing SVS: %d' % rc)
  return ret
def replicateConfig(self, machines):
  """ Replicates the config to a list of machines """
  # Source the enterprise bashrc, change into the scripts directory, and
  # push this host's config file to each target via replicate_config.py.
  bashrc = self.getGlobalParam("ENTERPRISE_BASHRC")
  replicate_cmd = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
                   "./replicate_config.py %s %s" % (
                       bashrc,
                       self.entHome,
                       E.getCrtHostName(),
                       self.globalParams.GetConfigFileName()))
  return E.execute(machines, replicate_cmd, None, true)
def get_core_states(ver, home, nodes, ignore=0, testver=None):
  """ get the ENT_CORE_STATE from all nodes

  Running "ent_core --ver --info" on all the nodes sequentially

  Arguments:
    ver: '4.6.5'
    home: '/export/hda3/4.6.5'
    nodes: ['ent1', 'ent2']
    ignore: 1 - ignore errors; 0 - otherwise.
    testver: 1 - if this is a test version; 0 - otherwise.

  Returns:
    {'ent1': 'state=RUNNING\nnodes=5\nfailures=1',
     'ent2': 'state=RUNNING\nnodes=5\nfailures=1'}
  """
  if testver == None:
    # default to whatever is_test() says about this version string
    testver = is_test(ver)
  states = {}
  core_base = ('/export/hda3/%s/local/google/bin/secure_script_wrapper '
               '-e /export/hda3/%s/bin/ent_core --ver=%s' % (ver, ver, ver))
  if testver:
    core_base = '%s --testver' % core_base
  cmd = '%s --info' % core_base
  # one node at a time: nodes that fail are simply left out of the result
  for node in nodes:
    out = []
    ret = E.execute([node], cmd, out, 0, 0, 0, home, ignore=ignore)
    if not ret:
      states[node] = ''.join(out)
  return states
def replicateConfig(self, machines):
  """ Replicates the config to a list of machines """
  # Sources the bashrc, then runs replicate_config.py against each target,
  # pushing this host's current config file.
  cmd = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
         "./replicate_config.py %s %s" %
         (self.getGlobalParam("ENTERPRISE_BASHRC"), self.entHome,
          E.getCrtHostName(), self.globalParams.GetConfigFileName()))
  return E.execute(machines, cmd, None, true)
def renice_svs(machine, new_priority=SVS_NICE_VALUE):
  """Renices svs on 'machine' with new_priority.

  Args:
    machine: hostname to run the renice command on.
    new_priority: target nice value for the SVS process group.

  Returns:
    0 on failure, 1 on success.
  """
  pid = None
  ret = 0
  # Get pid of svs.
  try:
    pid = open(SVS_PID_FILE, 'r').read()
  except:
    err = str(sys.exc_info()[:2])
    logging.error('Error reading SVS pid from %s: %s' % (SVS_PID_FILE, err))
  if pid is not None:
    # Get nice value for pid
    old_priority = os_utils.GetAttr('nice', int(pid))
    # Only renice when the current priority differs from the target.
    if old_priority is not None and int(old_priority) != new_priority:
      logging.info('Renicing SVS to %d.' % new_priority)
      # Get group id from pid.
      pgid = os_utils.GetAttr('pgid', int(pid))
      # Renice the whole group.
      cmd = 'renice %d -g %d' % (new_priority, int(pgid))
      rc = E.execute([machine], cmd, None, 1)
      if rc == 0:
        ret = 1
      else:
        # Bug fix: log the command's return code (rc); the old code logged
        # 'ret', which is always 0 on this path.
        logging.error('Error renicing SVS: %d' % rc)
  return ret
def drain_urlmanagers(self):
  """
  We need to do this before advancing the epoch -- we can do it
  multiple times
  """
  urlmanagers = self.cfg.globalParams.GetServerHostPorts("urlmanager")
  num_shards = self.cfg.globalParams.GetNumShards('urlmanager')
  epoch = self.cfg.getGlobalParam('RT_EPOCH')
  for (host, port) in urlmanagers:
    # We don't do it here directly because of the timeout
    cmd = ". %s; cd %s/local/google3/enterprise/legacy/util && "\
          "./port_talker.py %s %d 'd DumpingStatusTable' %d" % (
              self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
              self.cfg.entHome,
              host, port, 300)  # 5 min timeout
    err = E.execute([E.getCrtHostName()], cmd, None, 0)
    if E.ERR_OK != err:
      logging.error("Error draining urlmanagers [%s]" % err)
      return 1

    # Make sure that the file is out
    shard_num = servertype.GetPortShard(port)
    file = "%surlmanager_out_table_%02d_of_%02d_epoch%010d" % (
        self.cfg.getGlobalParam('NAMESPACE_PREFIX'),
        shard_num, num_shards, epoch)
    err, out = E.run_fileutil_command(self.cfg.globalParams, "ls %s" % file)
    if E.ERR_OK != err:
      logging.error("The status table file [%s] is not there" % file)
      return 1
  return 0
def browse(self, clientName, virtualFile, fromLine, toLine,
           grepString, fileArgs):
  # quote the grep string so it survives embedding in the shell command below
  grepString = urllib.quote_plus(grepString.strip())
  fromLine = string.atoi(fromLine)
  toLine = string.atoi(toLine)

  fileLocation = self.makePhysicalFile(virtualFile, clientName, grepString,
                                       fileArgs)
  if not fileLocation:
    raise ar_exception.ARException((
        ar_exception.LOADPARAMS, M.ERR_INVALIDFILESPECIFICATION))

  # Get valid start line and end line
  numLines = toLine - fromLine + 1;
  if numLines < 0: numLines = 0
  toLine = fromLine + numLines - 1

  # Get the lines
  result = []
  # mkarg shell-quotes the (already URL-quoted) grep pattern
  parsedGrepString = commands.mkarg(grepString)
  if ( E.ERR_OK != E.execute(
      [E.getCrtHostName()],
      "head -n %s %s | tail -n %s | grep -i -F -- %s" % (
          toLine, fileLocation, numLines, parsedGrepString),
      result, false) ):
    return "0"

  return "%s\n%s" % (len(result[0]), result[0])
def SendAPCCommand(enthome, machine, cmd):
  'Sends commands to the APC.'
  # NOTE(review): returns the comparison result of E.execute() against
  # E.ERR_OK -- i.e. apparently true when the command FAILED; confirm that
  # callers expect this polarity.
  return E.ERR_OK != E.execute([E.LOCALHOST],
                               EXEC_REBOOT % (enthome, enthome, machine,
                                              REBOOT_PARAM[cmd]),
                               None, 1)
def _ExecuteCommand(cmd, machines=None, out=None, timeout=15*60, num_tries=2,
                    error_filter=None):
  """ Helper to execute a command multiple times until it succeeds.

  Input:
    cmd: command to run
    timeout: seconds to allow command to run
    machines: machine list on which to execute (defaults to ['localhost'])
    out: list of output lines
    num_tries: number of times to retry command
    error_filter: A function that is called if the cmd execution failed.
      It takes a list of output lines as input, can filter the errors,
      and decides if the execution counts as successful.

  Returns 0 if execution succeeded, 1 if the execution failed.

  Output:
    0 for success, 1 for failure
  """
  # Fix: avoid mutable default arguments; build fresh objects per call.
  if machines is None:
    machines = ['localhost']
  if out is None:
    out = []
  for i in range(0, num_tries):
    if E.execute(machines, cmd, out, timeout) == 0:
      # Command was successful
      return 0
    # Execution failed
    if error_filter and error_filter(out) == 0:
      # Error filter says error is ok, execution counts as success
      logging.error('Cmd %s ignoring error: %s' % (cmd, ''.join(out)))
      return 0
    if i < num_tries-1:
      logging.error('Cmd %s error: %s, retrying.' % (cmd, ''.join(out)))
    else:
      logging.error('Cmd %s error: %s, failing.' % (cmd, ''.join(out)))
  return 1
def browse(self, clientName, virtualFile, fromLine, toLine,
           grepString, fileArgs):
  # quote the grep string so it survives embedding in the shell command below
  grepString = urllib.quote_plus(grepString.strip())
  fromLine = string.atoi(fromLine)
  toLine = string.atoi(toLine)

  fileLocation = self.makePhysicalFile(virtualFile, clientName, grepString,
                                       fileArgs)
  if not fileLocation:
    raise ar_exception.ARException(
        (ar_exception.LOADPARAMS, M.ERR_INVALIDFILESPECIFICATION))

  # Get valid start line and end line
  numLines = toLine - fromLine + 1
  if numLines < 0: numLines = 0
  toLine = fromLine + numLines - 1

  # Get the lines
  result = []
  # mkarg shell-quotes the (already URL-quoted) grep pattern
  parsedGrepString = commands.mkarg(grepString)
  if (E.ERR_OK != E.execute([E.getCrtHostName()],
                            "head -n %s %s | tail -n %s | grep -i -F -- %s" %
                            (toLine, fileLocation, numLines,
                             parsedGrepString),
                            result, false)):
    return "0"

  return "%s\n%s" % (len(result[0]), result[0])
def InactivateCleanup(ver, home, active_nodes):
  """Cleanup things that inactivate may have failed to handle.

  Remove /etc/google/ent4-X-X.chubby_cell and /etc/localbabysitter.d/*.
  Kills nobody processes containing \<4.x.x\>.
  """
  cmd = ('/bin/rm -f /etc/google/%s.chubby_cell '
         '/etc/localbabysitter.d/*-%s.conf' %
         (core_utils.GetCellName(ver), ver))
  # Ignore any errors
  E.execute(active_nodes, cmd, [], 0, 0, 0, home)
  # escape the dots so pkill -f matches the literal version string
  cmd = "/usr/bin/pkill -KILL -u nobody -f '/%s/'" % ver.replace('.', '\\.')
  # Kill a few times just to make sure
  for _ in range(0, 3):
    # Ignore any errors
    E.execute(active_nodes, cmd, [], 0, 0, 0, home)
def start_gfs(ver, home, nodes, ignore=0, testver=None, sync_log=1):
  """ Starts GFS.

  By default, gfs replica transaction log will be sync'ed before starting
  GFS. This operation should not take more than a few seconds. It uses
  rsysc internally.

  Arguments:
    ver: '4.6.5'
    home: '/export/hda3/4.6.5'
    nodes: ['ent1', 'ent2']
    ignore: 1 - ignore errors; 0 - otherwise.
    testver: 1 - if this is a test version; 0 - otherwise.
    sync_log: 1 - sync the gfs replica transaction log with the canonical;
              0 - otherwise
  Returns:
    1 - successful. 0 - otherwise.
  """
  if sync_log:
    # make sure the transaction logs are in sync before starting gfs
    start = time.time()
    logging.info("START GFS: Sync'ing GFS replica transaction logs")
    try:
      ent_home = "/export/hda3/%s" % ver
      vm_dir = "/export/hda3/versionmanager"
      bashrc_file = "%s/local/conf/ent_bashrc" % ent_home
      entcore_dir = "%s/google3/enterprise/core" % vm_dir
      # delegate to handle_gfs_no_master.py with the "force_sync" action
      handle_gfs_no_master_cmd = (". %s && cd %s && ./handle_gfs_no_master.py "
                                  "%s %s %s") % (bashrc_file, entcore_dir,
                                                 ver, testver, "force_sync")
      output = []
      E.execute([E.LOCALHOST], handle_gfs_no_master_cmd, output, None,
                enthome=ent_home)
    except Exception, e:
      (t, v, tb) = sys.exc_info()
      exc_msg = string.join(traceback.format_exception(t, v, tb))
      # the "sync log" operation is not a necessary step for starting
      # gfs. So ignore any error and go ahead to start gfs.
      logging.info("The following errors are ignored: [%s]" % exc_msg)
    end = time.time()
    diff = end - start
    logging.info("START GFS: STAT: Sync'ing GFS replica transaction logs"
                 " took %s seconds" % diff)
def execCmd(self, cmd, machines, logMsg=None):
  """Runs cmd on machines and returns its collected output.

  Returns '1' on failure, or '0\n<len>\n<output>' on success; logMsg (if
  given) is written to the admin runner op log after a successful run.
  """
  output_lines = []
  retcode = E.execute(machines, cmd, output_lines, 60)
  if retcode != 0:
    return '1'
  result = str(output_lines)
  if logMsg:
    self.writeAdminRunnerOpMsg(logMsg)
  return '0\n%d\n%s' % (len(result), result)
def execCmd(self, cmd, machines, logMsg=None):
  ''' Takes a command string and returns its output

  Returns '1' on failure, or '0\n<len>\n<output>' on success; logMsg (if
  given) is written to the admin runner op log after a successful run.
  '''
  outputList = []
  retcode = E.execute(machines, cmd, outputList, 60)
  result = str(outputList)
  if retcode != 0:
    return '1'
  if logMsg:
    self.writeAdminRunnerOpMsg(logMsg)
  return '0\n%d\n%s' % (len(result), result)
def set_install_state(machine, enterprise_home, to_status):
  """ This function sets the status of the installed version to to_status """
  state_file_name = enterprise_home + "/STATE"
  # Write the state file, make it world-readable, and hand ownership to
  # 'nobody' in a single remote shell invocation.
  state_cmd = ("echo %(state)s > %(file)s && chmod 755 %(file)s && "
               "chown nobody.nobody %(file)s" % {'state': to_status,
                                                 'file': state_file_name})
  return E.ERR_OK == E.execute([machine], state_cmd, None, E.true)
def get_log_per_module(self, filename, module_name):
  '''Get the onebox module specific logging information.'''
  ent_config_type = self.cfg.getGlobalParam('ENT_CONFIG_TYPE')
  # match either query lines or lines tagged with this module's name
  grep_string = '\"Query \\[\\|\\[%s\\]\"' % module_name
  if ent_config_type == 'CLUSTER':
    # on a cluster the log lives in GFS; cat it out through fileutil first
    gfs_aliases = self.cfg.getGlobalParam('GFS_ALIASES')
    grep_command = 'fileutil --bnsresolver_use_svelte=false '\
                   '--gfs_aliases=%s cat %s | grep %s > %s' % (
                       gfs_aliases, filename, grep_string, self.TMP_LOG)
  else:
    grep_command = 'grep %s %s > %s' % (grep_string, filename, self.TMP_LOG)
  machine = E.getCrtHostName()
  E.execute([machine], grep_command, None, false)
  # read back the filtered lines, then remove the temp file
  tmp_file = open(self.TMP_LOG, 'r')
  contents = tmp_file.read()
  tmp_file.close()
  os.remove(self.TMP_LOG)
  return contents
def getCount(self, command, machine):
  """ used for wc to return # of lines

  Returns the integer parsed from the command's first output line, or -1
  if the command or the parse fails.
  """
  result = []
  if E.ERR_OK != E.execute([machine], command, result, false):
    return -1
  # parse out the return code from the executed command
  try:
    return string.atoi(result[0])
  except:
    # bare except: empty output or non-numeric text both yield -1
    logging.error("Problem getting the count on %s" % command)
    return -1
def compile_synonyms(self, source, destination): """Execute the synonyms compiler and return error code.""" # In the command line, we set the maximum errors to 1 so we can halt as # quickly as possible. The assumption is that the input files have already # been validated. commandline = \ 'make_custom_synonym_map --synonym_file=%s --output_syn_map=%s \ --max_errors=1' % (source, destination) return E.execute( ['localhost'], commandline, None, QueryExpansionBase.COMPILER_TIMEOUT)
def compile_synonyms(self, source, destination): """Execute the synonyms compiler and return error code.""" # In the command line, we set the maximum errors to 1 so we can halt as # quickly as possible. The assumption is that the input files have already # been validated. commandline = \ 'make_custom_synonym_map --synonym_file=%s --output_syn_map=%s \ --max_errors=1' % (source, destination) return E.execute(['localhost'], commandline, None, QueryExpansionBase.COMPILER_TIMEOUT)
def getCount(self, command, machine):
  """ used for wc to return # of lines

  Returns the integer parsed from the command's first output line, or -1
  if the command or the parse fails.
  """
  result = []
  if E.ERR_OK != E.execute([machine], command, result, false):
    return -1;
  # parse out the return code from the executed command
  try:
    return string.atoi(result[0])
  except:
    # bare except: empty output or non-numeric text both yield -1
    logging.error("Problem getting the count on %s" % command );
    return -1
def rebootwholeappliance(self):
  # reboot after a minute. On a cluster, reboot all other nodes first.
  cmd = E.nonblocking_cmd('shutdown -r +1 &')
  machines = self.cfg.getGlobalParam("MACHINES")
  logging.info("List of machines : %s" % machines)
  my_hostname = []
  out = []
  # resolve our own hostname so we can reboot ourselves last
  E.execute(['localhost'], 'hostname', my_hostname, E.true, 1, 0,
            self.cfg.getGlobalParam('ENTERPRISE_HOME'))
  logging.info('My hostname is : ' + str(my_hostname[0]))
  # shallow copy is okay because list is not nested
  machines_other = machines[:]
  machines_other.remove(my_hostname[0])
  logging.info(
      'Removed myself from the list of nodes because I will reboot myself at the end. List now is : '
      + str(machines_other))
  if len(machines_other) > 0:
    E.execute(machines_other, cmd, out, E.true, 1, 0,
              self.cfg.getGlobalParam('ENTERPRISE_HOME'))
    logging.info('Output of command is : ' + str(out))
    out = []
  else:
    logging.info('There are no other nodes. This must be a oneway')
  # finally schedule our own reboot
  logging.info('Running the command on myself : ' + str(my_hostname))
  E.execute(my_hostname, cmd, out, E.true, 1, 0,
            self.cfg.getGlobalParam('ENTERPRISE_HOME'))
  logging.info('Output of command is : ' + str(out))
  return 0
def rebootwholeappliance(self):
  # reboot after a minute. On a cluster, reboot all other nodes first.
  cmd = E.nonblocking_cmd('shutdown -r +1 &')
  machines = self.cfg.getGlobalParam("MACHINES")
  logging.info("List of machines : %s" % machines)
  my_hostname=[]
  out = []
  # resolve our own hostname so we can reboot ourselves last
  E.execute(['localhost'],'hostname',my_hostname,E.true,1,0,self.cfg.getGlobalParam('ENTERPRISE_HOME'))
  logging.info('My hostname is : '+ str(my_hostname[0]) )
  # shallow copy is okay because list is not nested
  machines_other=machines[:]
  machines_other.remove(my_hostname[0])
  logging.info('Removed myself from the list of nodes because I will reboot myself at the end. List now is : '+ str(machines_other) )
  if len(machines_other) > 0 :
    E.execute(machines_other, cmd, out, E.true, 1, 0,self.cfg.getGlobalParam('ENTERPRISE_HOME'))
    logging.info('Output of command is : '+ str(out) )
    out = []
  else:
    logging.info('There are no other nodes. This must be a oneway')
  # finally schedule our own reboot
  logging.info('Running the command on myself : '+ str(my_hostname) )
  E.execute(my_hostname, cmd, out, E.true, 1, 0,self.cfg.getGlobalParam('ENTERPRISE_HOME'))
  logging.info('Output of command is : '+ str(out) )
  return 0
def send_urlscheduler_command(self, command):
  """Sends 'command' to every urlscheduler via port_talker.

  Returns 0 on success, 1 as soon as any scheduler fails to respond.
  """
  schedulers = self.cfg.globalParams.GetServerHostPorts("urlscheduler")
  timeout = 60  # 1 minute is good enough
  for (machine, port) in schedulers:
    port_talker_cmd = ". %s; cd %s/local/google3/enterprise/legacy/util && "\
        "./port_talker.py %s %d 'GET /run?Flags=%s\r\n\r\n' %d" % (
            self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
            self.cfg.entHome,
            machine, port, command, timeout)
    if E.ERR_OK != E.execute([E.getCrtHostName()], port_talker_cmd, None, 0):
      logging.error("Error talking to urlscheduler %s:%d" % (machine, port))
      return 1
  return 0
def send_cmd(self, server_name, cmd):
  """Sends 'cmd' to every server in the 'server_name' set via port_talker.

  Failures are logged but do not stop the loop; always returns true.
  """
  srvrs = self.cfg.globalParams.GetServerManager().Set(server_name).Servers()
  for srvr in srvrs:
    actual_cmd = ". %s; cd %s/local/google3/enterprise/legacy/util && "\
        "./port_talker.py %s %d '%s' %d" % (
            self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
            self.cfg.entHome,
            srvr.host(), srvr.port(), cmd, 60)  # 1 min timeout
    err = E.execute([E.getCrtHostName()], actual_cmd, None, 0)
    if E.ERR_OK != err:
      logging.error("Error talking to server at %s:%d" %
                    (srvr.host(), srvr.port()))
  return true
def send_cmd(self, server_name, cmd):
  """Sends 'cmd' to every server in the 'server_name' set via port_talker.

  Failures are logged but do not stop the loop; always returns true.
  """
  srvrs = self.cfg.globalParams.GetServerManager().Set(
      server_name).Servers()
  for srvr in srvrs:
    actual_cmd = ". %s; cd %s/local/google3/enterprise/legacy/util && "\
        "./port_talker.py %s %d '%s' %d" % (
            self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
            self.cfg.entHome,
            srvr.host(), srvr.port(), cmd, 60)  # 1 min timeout
    err = E.execute([E.getCrtHostName()], actual_cmd, None, 0)
    if E.ERR_OK != err:
      logging.error("Error talking to server at %s:%d" %
                    (srvr.host(), srvr.port()))
  return true
def send_urlmanager_command(self, command):
  """Sends 'command' to every urlmanager via port_talker.

  Returns 0 if all urlmanagers responded, 1 if any failed (the loop still
  visits every one).
  """
  ret = 0
  urlmanagers = self.cfg.globalParams.GetServerHostPorts("urlmanager")
  for (host, port) in urlmanagers:
    # We don't do it here directly because of the timeout
    cmd = ". %s; cd %s/local/google3/enterprise/legacy/util && "\
          "./port_talker.py %s %d %s %d" % (
              self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
              self.cfg.entHome,
              host, port, commands.mkarg(command), 300)
    err = E.execute([E.getCrtHostName()], cmd, None, 0)
    if E.ERR_OK != err:
      ret = 1
  return ret
def SVSErrorsStatus(self, lockserv_cmd_out=None): """ Check SVS errors recorded by gsa-master Args: lockserv_cmd_out: {'ent1': 'machine problem Unknown\n'} (for unit test only) Return: status, desc (e.g. 0, []). status is 1 if there are SVS erros. Otherwise, status is 0. """ # Add any SVS errors (from gsa-master) to the problem list all_machs_status = 0 desc = [] if self._ent_config == 'CLUSTER': if lockserv_cmd_out is None: version = self._cfg.getGlobalParam('VERSION') lockserv_cmd_prefix = core_utils.GetLSClientCmd(version, install_utilities.is_test(version)) for machine in self._live_machines: if lockserv_cmd_out is None: chubby_file = '/ls/%s/svs_%s' % (core_utils.GetCellName(version), machine) lockserv_cmd = '%s cat %s' % (lockserv_cmd_prefix, chubby_file) out = [] lockserv_status = E.execute(['localhost'], lockserv_cmd, out, 60) else: lockserv_status = 0 if machine in lockserv_cmd_out: out = [lockserv_cmd_out[machine]] else: out = [] if lockserv_status == 0 and len(out) > 0 and out[0] != '': errors = out[0].splitlines() status = 0 for i in range(0, len(errors)): if (errors[i].find('unrecoverable error') >= 0 or errors[i].find('file system error') >= 0): errors[i] = '' # Ignore this error else: status = 1 # Show an error if status: # A svs error has been recorded all_machs_status = max(all_machs_status, status) errors = [e for e in errors if e != ''] # add machine name desc.append('%s: %s' % (machine, ' '.join(errors))) return all_machs_status, desc
def IsGFSRunning(ver, testver):
  """Report whether a gfs_master process is running on any master node.

  Arguments:
    ver: '4.6.5'
    testver: 0 - not a test version. 1 - test version.
  Returns:
    True - gfs_master processes listening on some node. False - otherwise.
  """
  master_nodes, _ = core_utils.GFSMasterNodes()
  probe_cmd = "lsof -i :%s " % core_utils.GetGFSMasterPort(testver)
  output = []
  # The exit status of lsof is deliberately ignored; only the captured
  # output matters.
  E.execute(master_nodes, probe_cmd, output, 300, 1, 0,
            '/export/hda3/%s' % ver)
  return 'LISTEN' in ''.join(output)
def importkey(self, keyBody):
  """Save a private key file body as the staging key and verify it.

  Args:
    keyBody: contents of a private key file (string).  An empty body is
      written as-is and skips verification.
  Returns:
    "0" on success, "1" on failure (string, as callers expect).
  """
  retval = 0
  self.updatelock.acquire()
  try:
    staging_key = (ssl_cert.STAGINGKEY_FILENAME %
                   self.cfg.getGlobalParam("ENTERPRISE_HOME"))
    try:
      # Close the file explicitly so the key is fully flushed to disk
      # before the external verification command reads it (the original
      # open(...).write(...) left the handle dangling).
      f = open(staging_key, 'w')
      try:
        f.write(keyBody)
      finally:
        f.close()
    except IOError:
      retval = 1
      logging.error("Couldn't save key to [%s]" % staging_key)
    if retval == 0:
      # check the key
      if keyBody:
        verifycmd = "secure_script_wrapper -p2 %s verifystagingkey %s" % (
            self.sslWrapperPath, self.cfg.getGlobalParam("ENTERPRISE_HOME"))
        outputList = []
        verifycode = E.execute(['localhost'], verifycmd, outputList, 60)
      else:
        # Empty key bodies are accepted without running the verifier.
        verifycode = 0
      if verifycode != 0:
        retval = 1
        E.rm(['localhost'], staging_key)
        logging.error("Couldn't verify key [%s]; error code: %d" %
                      (str(outputList), verifycode))
  finally:
    self.updatelock.release()
  return "%d" % retval
def run(self):
  """Execute the database sync command and report the outcome.

  Always removes this source from the shared group dict when done,
  whether or not the sync succeeded.
  """
  handler = self.db_handler
  # TODO(dcz): i18n
  handler.writeAdminRunnerOpMsg('database sync started on source %s'
                                % self.source)
  output = []
  try:
    status = E.execute(['localhost'], self.cmd, output, 0)
    if status != 0:
      # TODO(dcz): i18n
      handler.writeAdminRunnerOpMsg(
        'database sync on source %s failed with status %d'
        % (self.source, status))
    else:
      # TODO(dcz): i18n
      handler.writeAdminRunnerOpMsg(
        'database sync succeeded on source %s' % self.source)
      handler.distributedbfiles(self.log)
      handler.distributedbfiles(self.last)
  finally:
    # clean-up global dict
    del self.group[self.source]
def generatekey(self):
  """Create a randomly generated staging key.

  Returns:
    "0" on success, "1" on failure (string, as callers expect).
  """
  self.updatelock.acquire()
  try:
    ent_home = self.cfg.getGlobalParam("ENTERPRISE_HOME")
    gen_cmd = "secure_script_wrapper -p2 %s generatekey %s" % (
        self.sslWrapperPath, ent_home)
    output = []
    status = E.execute(['localhost'], gen_cmd, output, 60)
    if status != 0:
      # A failed run may leave a malformed key behind; remove it so we
      # never serve a half-written key.
      E.rm(['localhost'], ssl_cert.STAGINGKEY_FILENAME % ent_home)
      logging.error("Couldn't generate private key: %s" % str(output))
  finally:
    self.updatelock.release()
  return "%d" % status
def ExecuteWrapper(machines, commandLine, out, alarm,
                   verbose=1, forceRemote=0, enthome=None, num_tries=1):
  """Run E.execute and translate its exit status into a return code.

  E.execute() returns a raw wait()-style exit status; too much code
  already depends on that to change it there, so this wrapper decodes
  the status with os.WEXITSTATUS to recover the child's actual return
  code.  The command is retried up to num_tries times with a 5 second
  pause between failed attempts.  Refer to E.execute() for the meaning
  of the other parameters.
  """
  ret = 0
  trial = 0
  while trial < num_tries:
    ret = E.execute(machines, commandLine, out, alarm, verbose,
                    forceRemote, enthome)
    if os.WIFEXITED(ret):
      # wait() statuses carry the exit code in the high byte; undo that.
      ret = os.WEXITSTATUS(ret)
    trial += 1
    if ret == 0 or trial == num_tries:
      # Either we succeeded or this was the last try (no point in
      # sleeping after the last try).
      break
    logging.warn('%d: Execution of %s failed. Sleeping for 5 seconds...' %
                 (trial - 1, commandLine))
    time.sleep(5)
  return ret
def setcert(self, certBody):
  """Save a cert file body as the staging certificate and verify it.

  Args:
    certBody: contents of a certificate file (string).
  Returns:
    "0" on success, "1" on failure (string, as callers expect).
  """
  retval = 0
  self.updatelock.acquire()
  try:
    staging_cert = (ssl_cert.STAGINGCERT_FILENAME %
                    self.cfg.getGlobalParam("ENTERPRISE_HOME"))
    try:
      # Close the file explicitly so the certificate is fully flushed to
      # disk before the external verification command reads it (the
      # original open(...).write(...) left the handle dangling).
      f = open(staging_cert, 'w')
      try:
        f.write(certBody)
      finally:
        f.close()
    except IOError:
      retval = 1
      logging.error("Couldn't save certificate to [%s]" % staging_cert)
    if retval == 0:
      verifycmd = "secure_script_wrapper -p2 %s verifystagingcert %s" % (
          self.sslWrapperPath, self.cfg.getGlobalParam("ENTERPRISE_HOME"))
      outputList = []
      verifycode = E.execute(['localhost'], verifycmd, outputList, 60)
      if verifycode != 0:
        retval = 1
        E.rm(['localhost'], staging_cert)
        logging.error("Couldn't verify certificate [%s]; error code: %d" %
                      (str(outputList), verifycode))
  finally:
    self.updatelock.release()
  return "%d" % retval
def EnsureGFSMasterRunning(ver, testver):
  """Make sure a primary GFS master exists, repairing it if necessary.

  Checks the borgmon alert and the election status; if either indicates
  there is no primary master, runs handle_gfs_no_master.py on the node
  that is supposed to hold the master lock, following the instructions
  described in http://www.corp.google.com/~bhmarks/redStone.html

  Arguments:
    ver: '4.6.5'
    testver: 0 - not a test version. 1 - test version.
  Returns:
    error messages - if there is an error. None - otherwise
  """
  no_master = (CheckNoMasterBorgmonAlert(ver, testver) or
               CheckNoMasterUsingElectionStatus(ver, testver))
  if not no_master:
    return None
  # Find out the node that is supposed to be the primary master.
  lock_holder = GFSMasterLockHolder(ver, testver)
  repair_cmd = ("/export/hda3/%s/local/google3/enterprise/"
                "core/handle_gfs_no_master.py %s %s" % (ver, ver, testver))
  output = []
  if E.execute([lock_holder], repair_cmd, output, 1200):
    return ''.join(output)
  return None
def _ExecuteCommand(cmd, machines=None, out=None, timeout=15 * 60,
                    num_tries=2, error_filter=None):
  """Helper to execute a command multiple times until it succeeds.

  Args:
    cmd: command to run
    machines: machine list on which to execute (default: ['localhost'])
    out: list that receives output lines (a fresh list if None)
    timeout: seconds to allow the command to run
    num_tries: number of times to attempt the command
    error_filter: optional function called when the cmd execution failed.
      It takes the list of output lines as input, can filter the errors,
      and returns 0 if the execution should count as successful anyway.
  Returns:
    0 for success, 1 for failure.
  """
  # Avoid mutable default arguments: build fresh objects per call so one
  # invocation's output can never leak into the next.
  if machines is None:
    machines = ['localhost']
  if out is None:
    out = []
  for attempt in range(num_tries):
    if E.execute(machines, cmd, out, timeout) == 0:
      # Command was successful
      return 0
    # Execution failed
    if error_filter and error_filter(out) == 0:
      # Error filter says error is ok, execution counts as success
      logging.error('Cmd %s ignoring error: %s' % (cmd, ''.join(out)))
      return 0
    if attempt < num_tries - 1:
      logging.error('Cmd %s error: %s, retrying.' % (cmd, ''.join(out)))
    else:
      logging.error('Cmd %s error: %s, failing.' % (cmd, ''.join(out)))
  return 1
sys.exit("Unable to load config file %s" % (cp.GetConfigFileName())) use_invalid_config = "" if FLAGS.useinvalidconfig: use_invalid_config = "--useinvalidconfig" python2_invoke_string = "" if FLAGS.use_python2: # we want to invoke using python2 python2_invoke_string = "python2 " action = "--start=all" if FLAGS.kill_only: # we only want to kill and not restart action = "--kill=all" cmd = ('. %s; cd %s/enterprise/legacy/production/babysitter; ' ' %s./babysitter.py --batch %s %s --ports=%s --babyalias=localhost ' '--lockdir=%s/tmp --mailto= --mach=%s %s' % (cp.var("BASHRC_FILE"), cp.var("MAIN_GOOGLE3_DIR"), python2_invoke_string, action, use_invalid_config, FLAGS.port, cp.var("ENTERPRISE_HOME"), FLAGS.machine, cp.GetConfigFileName())) logging.info("Executing [%s]" % cmd) if E.ERR_OK != E.execute([E.getCrtHostName()], cmd, None, true): sys.exit("Error restarting server at %s:%s" % (FLAGS.machine, FLAGS.port)) if __name__ == '__main__': main(sys.argv)