def drain_urlmanagers(self):
    """Ask every urlmanager to dump its status table for the current epoch.

    This needs to be done before advancing the epoch; it can safely be done
    multiple times.

    Returns:
        0 when every urlmanager accepted the command and its status table
        file could be listed afterwards; 1 as soon as one of the two steps
        fails for any urlmanager.
    """
    host_ports = self.cfg.globalParams.GetServerHostPorts("urlmanager")
    shard_count = self.cfg.globalParams.GetNumShards('urlmanager')
    rt_epoch = self.cfg.getGlobalParam('RT_EPOCH')

    for (host, port) in host_ports:
        # The command is sent through port_talker.py in a shell instead of
        # directly from here because of the timeout.
        talker_args = (self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                       self.cfg.entHome,
                       host,
                       port,
                       300)  # 5 min timeout
        drain_cmd = (". %s; cd %s/local/google3/enterprise/legacy/util && "
                     "./port_talker.py %s %d 'd DumpingStatusTable' %d"
                     % talker_args)
        status = E.execute([E.getCrtHostName()], drain_cmd, None, 0)
        if status != E.ERR_OK:
            logging.error("Error draining urlmanagers [%s]" % status)
            return 1

        # Make sure that the file is out
        shard = servertype.GetPortShard(port)
        table_file = "%surlmanager_out_table_%02d_of_%02d_epoch%010d" % (
            self.cfg.getGlobalParam('NAMESPACE_PREFIX'),
            shard, shard_count, rt_epoch)
        status, _ = E.run_fileutil_command(self.cfg.globalParams,
                                           "ls %s" % table_file)
        if status != E.ERR_OK:
            logging.error(
                "The status table file [%s] is not there" % table_file)
            return 1

    return 0
def browse(self, clientName, virtualFile, fromLine, toLine,
           grepString, fileArgs):
    """Return the lines of a file window that contain a given string.

    The window is the last (toLine - fromLine + 1) lines of the first
    toLine lines of the physical file, i.e. lines fromLine..toLine as
    selected by `head | tail`.  Those lines are filtered with a
    case-insensitive fixed-string grep.

    Args:
        clientName: passed to self.makePhysicalFile().
        virtualFile: passed to self.makePhysicalFile().
        fromLine: first line of the window; anything int() accepts.
        toLine: last line of the window; anything int() accepts.
        grepString: text to look for.  It is stripped and URL-quoted
            (quote_plus) before being used.
        fileArgs: passed to self.makePhysicalFile().

    Returns:
        "0" when the shell pipeline reports an error or produces no output
        entry, otherwise "<length of the output>\\n<output>".

    Raises:
        ar_exception.ARException: if makePhysicalFile() yields no location.
        ValueError: if fromLine or toLine is not a valid integer.
    """
    grepString = urllib.quote_plus(grepString.strip())
    # int() replaces string.atoi(), which has long been deprecated.
    fromLine = int(fromLine)
    toLine = int(toLine)

    fileLocation = self.makePhysicalFile(virtualFile, clientName,
                                         grepString, fileArgs)
    if not fileLocation:
        raise ar_exception.ARException((
            ar_exception.LOADPARAMS, M.ERR_INVALIDFILESPECIFICATION))

    # Get valid start line and end line: an inverted range becomes an empty
    # window that ends just before fromLine.
    numLines = max(toLine - fromLine + 1, 0)
    toLine = fromLine + numLines - 1

    # Get the lines
    result = []
    parsedGrepString = commands.mkarg(grepString)
    pipeline = "head -n %s %s | tail -n %s | grep -i -F -- %s" % (
        toLine, fileLocation, numLines, parsedGrepString)
    if E.ERR_OK != E.execute([E.getCrtHostName()], pipeline, result, false):
        return "0"
    # Guard against a successful run that recorded no output entry instead
    # of failing with IndexError below.
    if not result:
        return "0"
    return "%s\n%s" % (len(result[0]), result[0])
def SyncLogsWithCanonical(ver, canonical):
    """Sync the GFS master logs of the other masters with the canonical.

    On every GFS master node except the canonical one this removes old log
    backups, moves the current log directory to a timestamped backup,
    recreates the log directory and rsyncs the canonical's copy into place.

    Arguments:
        ver: '4.6.5'
        canonical: 'ent1'
    """
    other_masters, _ = core_utils.GFSMasterNodes()
    other_masters.remove(canonical)

    master_dir = '/export/hda3/%s/data/gfs_master' % ver
    logs = '%s/ent.gfsmaster' % master_dir
    substitutions = {
        'gfs_master_dir': master_dir,
        'log_dir': logs,
        'backup_log_dir': '%s.backup.%d' % (logs, int(time.time())),
        'canonical': canonical,
    }
    sync_cmd = (
        'rm -rf %(log_dir)s.backup*; '
        'mv %(log_dir)s %(backup_log_dir)s; '
        'mkdir -p %(log_dir)s; chown nobody:nobody %(log_dir)s; '
        'rsync -c -e ssh -aHpogt %(canonical)s:%(log_dir)s %(gfs_master_dir)s'
        % substitutions)

    output = []
    E.execute(other_masters, sync_cmd, output, 1200, 1, 0,
              '/export/hda3/%s' % ver)
def KillProcessIfNotMaster(config):
    """Kill the babysitter and disable snmpd.

    No master check is made here; presumably the caller only invokes this
    on nodes that are not the master -- verify against the call sites.

    Args:
        config: object providing ENTERPRISE_HOME, VERSION and
            GetConfigFileName().
    """
    # kill babysitter
    E.killBabysitter(
        "%s/local/google3/enterprise/legacy/util" % config.ENTERPRISE_HOME,
        config.GetConfigFileName(),
        config.VERSION)

    # stop snmpd when not a master
    disableSnmp()
def deleteCollection(self, collection):
    """Delete all reports and logs for a particular collection.

    For raw and summary reports alike: stops the job of any report that is
    still being (re)generated, removes the report data files, decrements
    self.reportCount and removes the report list file.  Finally the
    collection's report directory is removed.  Failures of the two fileutil
    removals are logged, not raised.  The whole operation runs while
    holding self.logreplock.

    Args:
        collection: name of the collection whose reports are deleted.
    """
    self.logreplock.acquire()
    try:
        for report_type in (liblog.RAW_REPORT, liblog.SUMMARY_REPORT):
            reports = self.getLogReports(collection, report_type)
            for report in reports:
                # stop running job if report is being (re)generated.
                if report.completeState != COMPLETE:
                    self.stopRunningJob(self.jobName(report))

                # delete data files if any.
                html_file, valid_file = liblog.get_report_filenames(
                    self.entConfig, report_type, report.reportName,
                    collection)
                self.RemoveReportFiles(html_file, valid_file)

            self.reportCount[report_type] -= len(reports)
            logging.info(
                'Delete total %d reports of type %s for collection %s.' % (
                    len(reports), report_type, collection))

            list_file = liblog.get_report_list_filename(
                self.entConfig, report_type, collection)
            failed, _ = E.run_fileutil_command(self.entConfig,
                                               'rm -f %s' % list_file)
            if failed:
                logging.error('Cannot remove list file %s.' % list_file)

        collection_dir = liblog.get_report_collection_dir(self.entConfig,
                                                          collection)
        failed, _ = E.run_fileutil_command(self.entConfig,
                                           'rmdir %s' % collection_dir)
        if failed:
            logging.error(
                'Cannot delete unused directory %s' % collection_dir)
    finally:
        self.logreplock.release()
def replicateConfig(self, machines):
    """Replicate the config to a list of machines.

    Runs replicate_config.py on each target machine, passing it the name
    of the current host and the config file name.

    Args:
        machines: list of machines to run the replication command on.

    Returns:
        Whatever E.execute() returns for the command.
    """
    replicate_args = (self.getGlobalParam("ENTERPRISE_BASHRC"),
                      self.entHome,
                      E.getCrtHostName(),
                      self.globalParams.GetConfigFileName())
    replicate_cmd = (
        ". %s && cd %s/local/google3/enterprise/legacy/scripts && "
        "./replicate_config.py %s %s" % replicate_args)
    return E.execute(machines, replicate_cmd, None, true)
def SyncOpLogs(all_machines, log_dir):
    """Sync the AdminRunner.OPERATOR.* logs to all machines.

    Only does work when the current host is the single master found by
    find_master.FindMaster().  For every other machine the operator logs
    in log_dir are rsynced to the same directory on that machine.  The
    lock file <log_dir>/syncops_lock is taken and released around each
    transfer.

    Args:
        all_machines: list of machine names in the cluster.
        log_dir: directory holding the operator logs; also used as the
            destination directory on the remote machines.

    Returns:
        None.  Returns early, skipping the remaining machines, if the lock
        cannot be acquired.
    """
    # We have to run this only on master
    master = find_master.FindMaster(2100, all_machines)
    # The name of this machine
    crt_machine = E.getCrtHostName()
    if len(master) != 1 or master[0] != crt_machine:
        return

    # Neither of these depends on the target machine.
    src_dir = '%s/AdminRunner.OPERATOR.*' % log_dir
    lockfile = '%s/syncops_lock' % log_dir

    for machine in all_machines:
        if machine == crt_machine:
            continue

        dest_dir = '%s:/%s' % (machine, log_dir)
        logging.info('Collecting operator logs from %s into %s' % (
            src_dir, dest_dir))
        rsync_cmd = ('rsync --timeout=20 --size-only -vau '
                     ' -e ssh %s %s/' % (src_dir, dest_dir))

        # rsync the logs
        lock = E.acquire_lock(lockfile, 1, breakLockAfterGracePeriod=0)
        if lock is None:
            logging.info('Cannot grab the lock. Return!')
            return
        try:
            (status, output) = liblog.DoCommand(rsync_cmd)
            if status != 0:
                logging.error('Failed to collect logs from %s: %s' % (
                    machine, output))
        finally:
            lock.close()
            os.unlink(lockfile)
def CollectLogs(all_machines, gws_log_dir, log_collect_dir):
    """Gather the partnerlog.* files of all machines under log_collect_dir.

    Runs only on a oneway (single machine) or on the cluster master.  For
    the current machine, <log_collect_dir>/<machine> becomes a symlink to
    gws_log_dir; for every other machine the logs are rsynced into
    <log_collect_dir>/<machine>.  The lock file <log_collect_dir>/lock is
    held for the whole run and removed at the end.

    Args:
        all_machines: list of machine names.
        gws_log_dir: directory containing the partnerlog.* files on each
            machine.
        log_collect_dir: local directory the logs are collected into.

    Returns:
        None.  Per-machine failures are logged and the loop moves on.
    """
    # We only run this on oneway or master node of cluster.
    master = find_master.FindMaster(2100, all_machines)
    crt_machine = E.getCrtHostName()
    if len(all_machines) != 1 and (len(master) != 1 or
                                   master[0] != crt_machine):
        logging.info('Not a oneway or cluster master node. Return!')
        return

    lockfile = '%s/lock' % log_collect_dir
    # waiting up to 5 minutes for the lock.
    lock = E.acquire_lock(lockfile, 30, breakLockAfterGracePeriod=0)
    if lock is None:
        logging.info('Cannot grab the lock. Return!')
        return

    # The same on every machine, so it is built once.
    src_pattern = '%s/partnerlog.*' % gws_log_dir
    try:
        for machine in all_machines:
            dest_dir = '%s/%s' % (log_collect_dir, machine)

            # If it's a oneway or master node, we make a symlink to
            # gws_log_dir instead of rsync to log_collect directory
            if machine == crt_machine:
                # To make it backward compatible, we need to remove old
                # dest_dir if it's already an existing directory from a
                # previous version, because previous versions created a dir
                # and rsynced files even on the master node and one-ways.
                if os.path.exists(dest_dir) and not os.path.islink(dest_dir):
                    if (not E.rm(master, '%s/*' % dest_dir) or
                            not E.rmdir(master, dest_dir)):
                        logging.error(
                            'Directory %s exists and cannot be cleaned.',
                            dest_dir)
                        continue
                    logging.info('Cleaned existing directory %s.', dest_dir)

                if E.ln(master, gws_log_dir, dest_dir):
                    logging.info('Symlink %s to directory %s:%s for logs' %
                                 (dest_dir, machine, gws_log_dir))
                else:
                    logging.error('Cannot make a symlink from %s to %s' %
                                  (dest_dir, gws_log_dir))
                continue

            # For non-master nodes on cluster, we need to rsync those files
            # to master node
            logging.info('Collecting logs from %s:%s into %s' % (
                machine, src_pattern, dest_dir))

            # make log directories if needed
            liblog.MakeDir(dest_dir)

            # rsync all files from one remote machine in one command.
            rsync_cmd = ('rsync --timeout=60 --size-only -vau '
                         ' -e ssh %s:%s %s/' % (machine, src_pattern,
                                                dest_dir))

            # rsync the logs
            (status, output) = liblog.DoCommand(rsync_cmd)
            if status != 0:
                logging.error('Failed to collect logs from %s: %s' % (
                    machine, output))
    finally:
        lock.close()
        os.unlink(lockfile)
def SyncOneboxLog(config):
    """Sync the local Onebox log file with the GFS Onebox log file.

    Does something ONLY on clusters, and only on the machine that is the
    first node configured for the 'oneboxenterprise' port.  As of 4.6.4
    this is called from scripts/periodic_script.py and from
    onebox_handler.py, when the user does View Log AND the machine is a
    cluster.

    A fileutil `equalize` is tried first; if it reports an error the whole
    log file is copied over instead.

    Args:
        config: object providing SERVERS and var().
    """
    onebox_port = servertype.GetPortBase('oneboxenterprise')
    onebox_nodes = config.SERVERS[onebox_port]
    this_machine = E.getCrtHostName()
    config_type = config.var('ENT_CONFIG_TYPE')

    # If onebox server is not running no need to sync.
    is_onebox_host = (config_type == 'CLUSTER' and
                      this_machine == onebox_nodes[0])
    if not is_onebox_host:
        return

    tmp_dir = config.var('TMPDIR')
    gfs_cell = config.var('GFS_CELL')
    local_log = os.path.join(tmp_dir, config.var('ENTERPRISE_ONEBOX_LOG'))
    gfs_log = os.path.join(os.sep, 'gfs', gfs_cell,
                           config.var('ENTERPRISE_ONEBOX_LOG'))

    # fileutil equalize copies only the difference of the log files.
    failed, _ = E.run_fileutil_command(
        config, 'equalize %s %s' % (local_log, gfs_log))
    if not failed:
        return

    # The files didn't match in the beginning, possibly because a new log
    # file was created; copy the whole log file in that case.
    failed, _ = E.run_fileutil_command(
        config, 'cp -f %s %s' % (local_log, gfs_log))
    if failed:
        logging.error('Error while syncing onebox logs.')
def kill_service(services, machines):
    """Kill all processes of the specified services on the given machines.

    For each known service a shell command is run on all machines that
    sends a plain kill to the matching pids, sleeps 3 seconds and then
    sends kill -9.  E.execute() sends the commands concurrently when there
    is more than one node.  Unknown service names are logged and skipped.

    Args:
        services: list of services to kill. 'adminconsole' and
            'adminrunner' are currently supported.
        machines: list of hostnames
    """
    # Map of services to the command that prints the pids of the service
    pid_finders = {
        'adminconsole': ("ps -e -o pid,args --width 100 | "
                         "grep loop_AdminConsole.py | grep -v grep | "
                         "awk '{print $1}' ; "
                         "%s" % python_kill.GetServicesListeningOn(['8000'])),
        'adminrunner': ("ps -e -o pid,args --width 100 | "
                        "grep loop_AdminRunner.py | grep -v grep | "
                        "awk '{print $1}' ; "
                        "%s" % python_kill.GetServicesListeningOn(['2100'])),
    }

    for service in services:
        if service not in pid_finders:
            logging.error('kill_service: Unrecognised service "%s"' % service)
            continue

        logging.info('kill_service: Killing service "%s" on %d nodes...'
                     % (service, len(machines)))
        find_pids = pid_finders[service]
        kill_cmd = ('sh -c "(kill `%s`; sleep 3; kill -9 `%s`; true)" '
                    '> /dev/null 2>&1' % (find_pids, find_pids))
        E.execute(machines, kill_cmd, [], alarm=1, verbose=0)
        logging.info('kill_service: Done killing service "%s"' % service)
def main(unused_args):
    """Validate the command-line flags and create the AdminRunner server.

    Exits through sys.exit() with a message when enthome, box_keys_dir,
    license_keys_dir or installstate is invalid.  After the server object
    is created, a missing valid config outside of the INSTALL state is
    reported with logging.fatal().

    Args:
        unused_args: ignored; all input comes from FLAGS.
    """
    if not E.access([E.LOCALHOST], FLAGS.enthome, 'rd'):
        sys.exit("Invalid enthome %s" % FLAGS.enthome)

    if (not FLAGS.box_keys_dir or
            not E.access([E.LOCALHOST], FLAGS.box_keys_dir, 'rwd')):
        sys.exit("Invalid box_keys_dir %s" % FLAGS.box_keys_dir)

    if (not FLAGS.license_keys_dir or
            not E.access([E.LOCALHOST], FLAGS.license_keys_dir, 'rwd')):
        sys.exit("Invalid license_keys_dir %s" % FLAGS.license_keys_dir)

    if FLAGS.installstate not in ["INSTALL", "ACTIVE", "SERVE", "TEST"]:
        sys.exit("Invalid --installstate %s" % FLAGS.installstate)

    reset_index.SetResetStatusCacheTimeout(FLAGS.reset_status_cache_timeout)

    # The local used to be called `as`, which is a reserved word since
    # Python 2.6 and makes the module fail to compile.
    ar_server = adminrunner_server.AdminRunnerServer(FLAGS.enthome,
                                                     FLAGS.installstate,
                                                     FLAGS.port,
                                                     FLAGS.box_keys_dir,
                                                     FLAGS.license_keys_dir)

    # make sure we have been given a config file
    if (not ar_server.cfg.getGlobalParam(C.ENT_SYSTEM_HAS_VALID_CONFIG) and
            ar_server.cfg.getInstallState() != 'INSTALL'):
        logging.fatal("adminrunner doesn't have a config file; you must "
                      "install a conf rpm or run it with "
                      "--installstate=INSTALL (for migration)")
        return  # just in case fatal doesn't exit
def setcert(self, certBody):
    """Save a certificate body as the staging certificate and verify it.

    The certificate is written to the staging certificate file and then
    checked with the SSL wrapper's `verifystagingcert` command.  If the
    verification fails the staging file is removed again.  Runs while
    holding self.updatelock.

    Args:
        certBody: contents of the certificate file.

    Returns:
        "0" on success, or "1" on failure (as a string).
    """
    retval = 0
    self.updatelock.acquire()
    try:
        ent_home = self.cfg.getGlobalParam("ENTERPRISE_HOME")
        staging_cert = ssl_cert.STAGINGCERT_FILENAME % ent_home

        try:
            cert_file = open(staging_cert, "w")
            # Close the handle explicitly so the data is on disk before the
            # verification command reads the file.
            try:
                cert_file.write(certBody)
            finally:
                cert_file.close()
        except IOError:
            retval = 1
            logging.error("Couldn't save certificate to [%s]" % staging_cert)

        if retval == 0:
            verifycmd = "secure_script_wrapper -p2 %s verifystagingcert %s" % (
                self.sslWrapperPath, ent_home)
            outputList = []
            verifycode = E.execute(["localhost"], verifycmd, outputList, 60)
            if verifycode != 0:
                retval = 1
                # Do not leave an unverifiable certificate in staging.
                E.rm(["localhost"], staging_cert)
                logging.error(
                    "Couldn't verify certificate [%s]; error code: %d" %
                    (str(outputList), verifycode))
    finally:
        self.updatelock.release()

    return "%d" % retval
def SetInitState(cfg, state):
    """Sets system's initialization state.

    For oneway, it stores it in C.ENT_SYSTEM_INIT_STATE.  For clusters, it
    stores it in chubby file /ls/ent<version>/ENT_SYSTEM_INIT_STATE: the
    state is written to a temporary file which is then copied with the
    lockserver client command.  The temporary file is removed afterwards,
    also when the write or the copy fails.

    @param cfg - of type configurator.
    @param state - string
    """
    # oneway?
    if 1 == len(core_utils.GetNodes()):
        cfg.setGlobalParam(C.ENT_SYSTEM_INIT_STATE, state)
        return

    tmpfile = E.mktemp('/export/hda3/tmp')
    try:
        f = open(tmpfile, 'w')
        # Make sure the handle is closed even when the write fails.
        try:
            f.write(state)
        finally:
            f.close()
    except IOError:
        logging.fatal('Cannot write to temp file %s' % tmpfile)
        E.exe('rm -rf %s' % tmpfile)
        return

    try:
        version = cfg.getGlobalParam('VERSION')
        lockserv_cmd_prefix = core_utils.GetLSClientCmd(version,
                                                        is_test(version))
        chubby_root_dir = '/ls/%s' % core_utils.GetCellName(version)
        write_cmd = '%s cp %s %s/%s' % (lockserv_cmd_prefix, tmpfile,
                                        chubby_root_dir,
                                        'ENT_SYSTEM_INIT_STATE')
        logging.info('setting system init state to: %s', state)
        E.exe_or_fail(write_cmd)
    finally:
        # Clean up even when publishing the state fails.
        E.exe('rm -rf %s' % tmpfile)
def browse(self, clientName, virtualFile, fromLine, toLine,
           grepString, fileArgs):
    """Return the matching lines from a line range of a virtual file.

    The physical file is resolved with self.makePhysicalFile().  Lines
    fromLine..toLine are selected with `head | tail` and then filtered
    with a case-insensitive, fixed-string grep for grepString.

    Args:
        clientName: passed to self.makePhysicalFile().
        virtualFile: passed to self.makePhysicalFile().
        fromLine: first line of the range; anything int() accepts.
        toLine: last line of the range; anything int() accepts.
        grepString: text to search for; stripped and URL-quoted
            (quote_plus) before use.
        fileArgs: passed to self.makePhysicalFile().

    Returns:
        "0" if the command fails or no output entry was recorded, otherwise
        the string "<length of the output>\\n<output>".

    Raises:
        ar_exception.ARException: if no physical file could be determined.
        ValueError: if fromLine or toLine cannot be parsed as an integer.
    """
    grepString = urllib.quote_plus(grepString.strip())
    # string.atoi() is deprecated; int() parses the same decimal strings.
    fromLine = int(fromLine)
    toLine = int(toLine)

    fileLocation = self.makePhysicalFile(virtualFile, clientName,
                                         grepString, fileArgs)
    if not fileLocation:
        raise ar_exception.ARException(
            (ar_exception.LOADPARAMS, M.ERR_INVALIDFILESPECIFICATION))

    # Get valid start line and end line; a negative line count is clamped
    # to zero and the end line adjusted accordingly.
    numLines = toLine - fromLine + 1
    if numLines < 0:
        numLines = 0
    toLine = fromLine + numLines - 1

    # Get the lines
    result = []
    parsedGrepString = commands.mkarg(grepString)
    status = E.execute(
        [E.getCrtHostName()],
        "head -n %s %s | tail -n %s | grep -i -F -- %s" % (
            toLine, fileLocation, numLines, parsedGrepString),
        result, false)
    # An empty result list would otherwise raise IndexError below.
    if status != E.ERR_OK or not result:
        return "0"

    matched = result[0]
    return "%s\n%s" % (len(matched), matched)
def _RunServeCmd(cfg, version, cmd, allnodes=0):
    """Run a serve_service.py command through the secure wrapper.

    Args:
        cfg: configurator used to read ENTERPRISE_HOME and the machines.
        version: version whose serve_service.py script is run.
        cmd: 'stop', 'start', 'activate', 'deactivate'
        allnodes: 1 to run command on all nodes; otherwise only the
            current host is used.

    Returns:
        0 if the command completed, 1 if E.execute() reported an error.
    """
    serve_service_cmd = (
        '/export/hda3/%s/local/google3/enterprise/legacy/scripts/'
        'serve_service.py %s %s' % (version,
                                    cfg.getGlobalParam('ENTERPRISE_HOME'),
                                    cmd))
    logging.info('Running: %s' % serve_service_cmd)

    if allnodes:
        targets = cfg.getGlobalParam(C.MACHINES)
    else:
        targets = [E.getCrtHostName()]

    wrapped_cmd = SECURE_WRAPPER_COMMAND % (
        cfg.getGlobalParam('ENTERPRISE_HOME'), '-p2', serve_service_cmd)
    status = E.execute(targets, wrapped_cmd, None, 0)
    if status != E.ERR_OK:
        logging.error('%s: failed' % serve_service_cmd)
        return 1

    logging.info('%s: completed' % serve_service_cmd)
    return 0
def SyncLogsWithCanonical(ver, canonical):
    """Make the GFS master logs of all other masters match the canonical.

    The command executed on each non-canonical GFS master deletes earlier
    backups of the log directory, keeps the current one as a timestamped
    backup, creates a fresh log directory owned by nobody:nobody and pulls
    the log directory from the canonical machine with rsync over ssh.

    Arguments:
        ver: '4.6.5'
        canonical: 'ent1'
    """
    targets, _ = core_utils.GFSMasterNodes()
    targets.remove(canonical)

    ent_home = '/export/hda3/%s' % ver
    gfs_dir = '/export/hda3/%s/data/gfs_master' % ver
    current_logs = '%s/ent.gfsmaster' % gfs_dir
    now = int(time.time())

    fields = {}
    fields['gfs_master_dir'] = gfs_dir
    fields['log_dir'] = current_logs
    fields['backup_log_dir'] = '%s.backup.%d' % (current_logs, now)
    fields['canonical'] = canonical

    template = (
        'rm -rf %(log_dir)s.backup*; '
        'mv %(log_dir)s %(backup_log_dir)s; '
        'mkdir -p %(log_dir)s; chown nobody:nobody %(log_dir)s; '
        'rsync -c -e ssh -aHpogt %(canonical)s:%(log_dir)s %(gfs_master_dir)s')

    collected = []
    E.execute(targets, template % fields, collected, 1200, 1, 0, ent_home)
def replicateConfig(self, machines):
    """Replicate the config to a list of machines.

    Each machine sources the enterprise bashrc and runs
    replicate_config.py with the current host name and the config file
    name as arguments.

    Args:
        machines: the machines to replicate the config to.

    Returns:
        The result of E.execute() for the replication command.
    """
    bashrc = self.getGlobalParam("ENTERPRISE_BASHRC")
    scripts_home = self.entHome
    source_host = E.getCrtHostName()
    config_file = self.globalParams.GetConfigFileName()

    command = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
               "./replicate_config.py %s %s"
               % (bashrc, scripts_home, source_host, config_file))
    return E.execute(machines, command, None, true)
def installkey(self):
    """Install the staging key as the currently installed private key.

    The staging key is verified, distributed to all machines and then
    installed on each of them through the SSL wrapper.  Runs while holding
    self.updatelock.

    Returns (as strings):
        "0" on success,
        "1" on empty (or missing) staging key (not an error),
        "2" when the private key is invalid,
        "3" when the private key could not be distributed or installed.
    """
    self.updatelock.acquire()
    try:
        ent_home = self.cfg.getGlobalParam("ENTERPRISE_HOME")
        staging_key = ssl_cert.STAGINGKEY_FILENAME % ent_home

        # first verify if the staging key is empty (not an error)
        if not os.path.exists(staging_key):
            return "1"
        key_file = open(staging_key, "r")
        # Close the handle explicitly instead of leaking it.
        try:
            key_is_empty = (0 == len(key_file.read()))
        finally:
            key_file.close()
        if key_is_empty:
            return "1"

        # next verify that the staging key is a valid file
        verifycmd = "secure_script_wrapper -p2 %s verifystagingkey %s" % (
            self.sslWrapperPath, ent_home)
        outputList = []
        verifycode = E.execute(["localhost"], verifycmd, outputList, 60)
        if verifycode != 0:
            E.rm(["localhost"], staging_key)
            logging.error("Verify failed for key [%s]; error code: %d" %
                          (str(outputList), verifycode))
            return "2"

        # distribute the staging key
        retcode = E.distribute(self.cfg.getGlobalParam("MACHINES"),
                               staging_key, 60)
        if retcode != 0:
            logging.error(
                "Couldn't distribute private key, error %d" % retcode)
            return "3"

        # next, copy the key on all machines
        cmd = "secure_script_wrapper -p2 %s installkey %s" % (
            self.sslWrapperPath, ent_home)
        outputList = []
        retcode = E.execute(self.cfg.getGlobalParam("MACHINES"), cmd,
                            outputList, 60)
        if retcode != 0:
            # This installs the key, not the certificate.
            logging.error("Couldn't install key: %s" % str(outputList))
            return "3"

        self.writeAdminRunnerOpMsg(M.MSG_LOG_SSL_KEY_INSTALLED)
    finally:
        self.updatelock.release()

    return "0"
def GetDirList(self):
    """Return the directories named after self.name.

    Returns:
        A list of three paths built with E.joinpaths():
        <GOOGLEDATA>/gws/clients/<name>,
        <GOOGLEDATA>/gws/p4clientinfo/<name> and
        <CONFIGDIR>/frontends/<name>.
    """
    layout = (
        ('GOOGLEDATA', 'gws/clients'),
        ('GOOGLEDATA', 'gws/p4clientinfo'),
        ('CONFIGDIR', 'frontends'),
    )
    return [E.joinpaths([self.config.var(root_var), subdir, self.name])
            for (root_var, subdir) in layout]
def deactivate(self):
    """Deactivate this service and make the cron script unexecutable.

    Removes the <service_name>_<version> entry with chkconfig and sets the
    mode of the service's cron script to 644.  Both commands are run with
    E.exe_or_fail().
    """
    unregister_cmd = "/sbin/chkconfig --del %s_%s" % (
        self.service_name, self.version)
    E.exe_or_fail(unregister_cmd)

    chmod_cmd = "chmod 644 /etc/cron.%s/cron_%s_%s" % (
        self.service_cron_time, self.service_name, self.version)
    E.exe_or_fail(chmod_cmd)
def activate(self):
    """Activate this service and make its periodic cron script executable.

    Adds the <service_name>_<version> entry with chkconfig and sets the
    mode of the service's cron script to 755.  Both commands are run with
    E.exe_or_fail().
    """
    register_cmd = "/sbin/chkconfig --add %s_%s" % (
        self.service_name, self.version)
    E.exe_or_fail(register_cmd)

    chmod_cmd = "chmod 755 /etc/cron.%s/cron_%s_%s" % (
        self.service_cron_time, self.service_name, self.version)
    E.exe_or_fail(chmod_cmd)
def installkey(self):
    """Promote the staging private key to the installed private key.

    Steps, all performed while holding self.updatelock: check that the
    staging key exists and is not empty, verify it with the SSL wrapper,
    distribute the file to all machines and run the wrapper's installkey
    command on all of them.

    Returns (as strings):
        "0" on success,
        "1" on empty (or missing) staging key (not an error),
        "2" when the private key is invalid,
        "3" when the private key could not be distributed or installed.
    """
    self.updatelock.acquire()
    try:
        home = self.cfg.getGlobalParam("ENTERPRISE_HOME")
        key_path = ssl_cert.STAGINGKEY_FILENAME % home

        # first verify if the staging key is empty (not an error)
        if not os.path.exists(key_path):
            return "1"
        handle = open(key_path, "r")
        # The handle used to be left for the garbage collector to close.
        try:
            contents = handle.read()
        finally:
            handle.close()
        if 0 == len(contents):
            return "1"

        # next verify that the staging key is a valid file
        verifycmd = "secure_script_wrapper -p2 %s verifystagingkey %s" % (
            self.sslWrapperPath, home)
        outputList = []
        verifycode = E.execute(['localhost'], verifycmd, outputList, 60)
        if verifycode != 0:
            E.rm(['localhost'], key_path)
            logging.error("Verify failed for key [%s]; error code: %d" %
                          (str(outputList), verifycode))
            return "2"

        # distribute the staging key
        retcode = E.distribute(self.cfg.getGlobalParam("MACHINES"),
                               key_path, 60)
        if retcode != 0:
            logging.error(
                "Couldn't distribute private key, error %d" % retcode)
            return "3"

        # next, copy the key on all machines
        cmd = "secure_script_wrapper -p2 %s installkey %s" % (
            self.sslWrapperPath, home)
        outputList = []
        retcode = E.execute(self.cfg.getGlobalParam("MACHINES"), cmd,
                            outputList, 60)
        if retcode != 0:
            # The message used to say "cert", copied from the certificate
            # code path.
            logging.error("Couldn't install key: %s" % str(outputList))
            return "3"

        self.writeAdminRunnerOpMsg(M.MSG_LOG_SSL_KEY_INSTALLED)
    finally:
        self.updatelock.release()

    return "0"
def remove(self, machine):
    """Remove a machine from the configuration.

    Stops the core services on the machine, halts it, drops it from the
    SERVERS and MACHINES parameters, registers it as a dead node, deletes
    its GFS chunkserver and data disks, redoes the machine allocation and
    restarts the babysitter and the crawl processes.  A notification mail
    is sent unless one was already sent.

    Args:
        machine: name of the machine to remove.  It must be in MACHINES and
            must not be the current host.

    Returns:
        1 if the machine is unknown, is the current host, or the
        configuration could not be updated; otherwise the result of
        self.halt(machine).
    """
    if machine not in self.cfg.getGlobalParam('MACHINES'):
        logging.error("%s doesn't exist" % machine)
        return 1

    # Reject the request before anything is stopped: this check used to
    # come after stop_core(), so a refused self-removal still took the
    # core services down on this machine.
    if machine == E.getCrtHostName():
        logging.error("Cannot remove self")
        return 1

    ver = self.cfg.getGlobalParam('VERSION')
    home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
    testver = install_utilities.is_test(ver)

    # if possible stop the core services, ignore return code
    install_utilities.stop_core(ver, home, [machine])

    # Halt the machine if APC is used.
    error = self.halt(machine)

    self.cfg.globalParams.ReplaceVarInParam("SERVERS", None, machine)
    self.cfg.globalParams.ReplaceVarInParam("MACHINES", None, machine)
    ret = core_utils.AddDeadNode(ver, testver, machine)
    # remove the chunkserver running on the node
    gfs_utils.DeleteGFSChunkservers(ver, testver, [machine])
    if ret:
        logging.error('Cannot add dead node to the lockserver.')
        # we ignore this error for now

    # now we need to remove the data disks that were on this machine
    data_disks = self.cfg.globalParams.var_copy('DATACHUNKDISKS')
    if machine in data_disks:
        del data_disks[machine]
        if not self.cfg.setGlobalParam('DATACHUNKDISKS', data_disks):
            return 1

    # This also saves the config file
    if not self.cfg.DoMachineAllocation():
        return 1

    # Now we need to restart babysitter because the old one
    # is out of sync after this
    serve_service_cmd = (
        ". %s && "
        "cd %s/local/google3/enterprise/legacy/scripts && "
        "./serve_service.py %s" % (
            self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
            self.cfg.getGlobalParam('ENTERPRISE_HOME'),
            self.cfg.getGlobalParam('ENTERPRISE_HOME')))
    E.exe("%s %s" % (serve_service_cmd, "babysit"))

    self.restart_crawl_processes(serve_service_cmd)

    if not mail_already_sent(M.MSG_MACHINEREMOVED % machine):
        SendMail.send(self.cfg, None, false,
                      M.MSG_MACHINEREMOVED % machine, "", true)

    return error
def StartAdminLoop(config, op='start'):
    """Start AdminRunner Loop as well as Loop for initial_config_Server.

    With op == 'start' both loops are (re)started unconditionally, except
    that loop_webserver_config is never started in the INSTALL state.
    With op == 'babysit' a loop is restarted only when the pid from its
    pid file is not found by os_utils.GetAttr().  Before a loop is
    started, matching processes are killed with kill -9.

    Args:
        config: object providing VERSION, ENTERPRISE_HOME, ENTERPRISE_USER,
            ENTERPRISE_BASHRC, LOGDIR, MAIN_GOOGLE3_DIR and
            GetConfigFileName().
        op: 'start' or 'babysit'.
    """
    install_state = install_utilities.install_state(config.VERSION)

    logging.info("Starting the AdminRunner")
    ar_args = []
    ar_args.append("--port=2100")
    ar_args.append("--enthome=%s" % config.ENTERPRISE_HOME)
    ar_args.append("--installstate=%s" % install_state)
    ar_args.append("--reset_status_cache_timeout=60")

    restart_loop_AdminRunner = 0
    if op == 'babysit':
        pidfile = E.GetPidFileName('loop_AdminRunner')
        pid = E.ReadPidFile(pidfile)
        if os_utils.GetAttr('pid', pid=pid, fallback_to_ps=0) is None:
            restart_loop_AdminRunner = 1

    if op == 'start' or restart_loop_AdminRunner:
        E.su_exe_or_fail(
            config.ENTERPRISE_USER,
            """
      ps axwwwww | fgrep AdminRunner | fgrep -v fgrep | \
        colrm 7 | xargs kill -9 2> /dev/null; \
      . %(eb)s; \
      cd %(eh)s/local/google3/enterprise/legacy/scripts/ && \
      ENT_ID=%(v)s_crawl ./loop_AdminRunner.py \
      %(eh)s %(args)s >> \
      /%(ld)s/loop_AdminOut_`whoami` 2>&1 &""" % {
                'eh': config.ENTERPRISE_HOME,
                'eb': config.ENTERPRISE_BASHRC,
                'v': config.VERSION,
                'ld': config.LOGDIR,
                # ' '.join replaces the deprecated string.join(), whose
                # default separator is a single space.
                'args': ' '.join(map(commands.mkarg, ar_args)),
            })

    restart_loop_webserver_config = 0
    if op == 'babysit':
        pidfile = E.GetPidFileName('loop_webserver_config')
        pid = E.ReadPidFile(pidfile)
        if os_utils.GetAttr('pid', pid=pid, fallback_to_ps=0) is None:
            restart_loop_webserver_config = 1

    if (install_state != "INSTALL" and
            (op == 'start' or restart_loop_webserver_config)):
        logging.info("Starting webserver_config")
        E.su_exe_or_fail(
            config.ENTERPRISE_USER,
            """
      ps axwwwww | fgrep webserver_config.py | fgrep -v fgrep \
        | colrm 7 | xargs kill -9 2> /dev/null; \
      . %s; \
      cd %s/enterprise/legacy/scripts/ && \
      ENT_ID=%s_crawl ./loop_webserver_config.py %s \
      >> /%s/loop_WebserverConfig_`whoami` 2>&1 &""" % (
                config.ENTERPRISE_BASHRC,
                config.MAIN_GOOGLE3_DIR,
                config.VERSION,
                config.GetConfigFileName(),
                config.LOGDIR))
def HaltMachines(enthome, machines):
    """Stops and powers down machines.

    A shutdown command is sent to all machines; after a 60 second sleep
    the APC_OFF command is sent for each machine.

    Args:
        enthome: passed to SendAPCCommand().
        machines: list of machine names to halt.

    Returns:
        0, always.
    """
    logging.info("Halting machines: %s" % ",".join(machines))
    E.execute(machines, '/sbin/shutdown -h now &', None, 1)
    time.sleep(60)
    for target in machines:
        SendAPCCommand(enthome, target, APC_OFF)
    return 0
def remove(self, machine):
    """Take a machine out of the configuration.

    The machine has its core services stopped and is halted, then it is
    removed from SERVERS and MACHINES, added as a dead node, and its GFS
    chunkserver and data disks are dropped.  Afterwards the machine
    allocation is redone, the babysitter and crawl processes are restarted
    and a notification mail is sent if none was sent before.

    Args:
        machine: the machine to remove; has to be listed in MACHINES and
            cannot be the host this code runs on.

    Returns:
        1 when the machine is not configured, when it is the current host
        or when updating the configuration fails; otherwise whatever
        self.halt(machine) returned.
    """
    if machine not in self.cfg.getGlobalParam('MACHINES'):
        logging.error("%s doesn't exist" % machine)
        return 1

    # Validate before acting.  Previously this test ran after stop_core(),
    # so asking to remove the current host stopped its core services and
    # only then reported the error.
    if machine == E.getCrtHostName():
        logging.error("Cannot remove self")
        return 1

    ver = self.cfg.getGlobalParam('VERSION')
    home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
    testver = install_utilities.is_test(ver)

    # if possible stop the core services, ignore return code
    install_utilities.stop_core(ver, home, [machine])

    # Halt the machine if APC is used.
    error = self.halt(machine)

    for param in ("SERVERS", "MACHINES"):
        self.cfg.globalParams.ReplaceVarInParam(param, None, machine)

    dead_node_failed = core_utils.AddDeadNode(ver, testver, machine)
    # remove the chunkserver running on the node
    gfs_utils.DeleteGFSChunkservers(ver, testver, [machine])
    if dead_node_failed:
        logging.error('Cannot add dead node to the lockserver.')
        # we ignore this error for now

    # now we need to remove the data disks that were on this machine
    data_disks = self.cfg.globalParams.var_copy('DATACHUNKDISKS')
    # `in` replaces the deprecated dict.has_key().
    if machine in data_disks:
        del data_disks[machine]
        if not self.cfg.setGlobalParam('DATACHUNKDISKS', data_disks):
            return 1

    # This also saves the config file
    if not self.cfg.DoMachineAllocation():
        return 1

    # Now we need to restart babysitter because the old one
    # is out of sync after this
    serve_service_cmd = (". %s && "
                         "cd %s/local/google3/enterprise/legacy/scripts && "
                         "./serve_service.py %s" % (
                             self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                             self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                             self.cfg.getGlobalParam('ENTERPRISE_HOME')))
    E.exe("%s %s" % (serve_service_cmd, "babysit"))

    self.restart_crawl_processes(serve_service_cmd)

    if not mail_already_sent(M.MSG_MACHINEREMOVED % machine):
        SendMail.send(self.cfg, None, false,
                      M.MSG_MACHINEREMOVED % machine, "", true)

    return error
def ResyncWithMaster(google_config_file, enterprise_home, enterprise_user,
                     master):
    """Replicate the config from the master and point NTP at the master.

    Runs replicate_config.py (under `alarm 600`) as enterprise_user and
    then calls SetNonMasterNTP().

    Args:
        google_config_file: config file name passed to replicate_config.py.
        enterprise_home: install root; the scripts directory and the
            ent_bashrc file are located below it.
        enterprise_user: user the replication command is run as.
        master: name of the master machine.
    """
    bashrc = "%s/local/conf/ent_bashrc" % enterprise_home
    scripts = "%s/local/google3/enterprise/legacy/scripts" % enterprise_home
    replicate_cmd = (
        ". %s && cd %s && alarm 600 ./replicate_config.py %s %s"
        % (bashrc, scripts, master, google_config_file))
    E.su_exe_or_fail(enterprise_user, replicate_cmd)
    SetNonMasterNTP(master)
def remove_unused_from(dirname, fileutil, grace_seconds):
    '''Delete the files in a directory that are not open and not recently used.

    The set of open files is taken from `lsof +D <dirname> -Fn`.  Every
    other file matched by <dirname>/* whose access time is more than
    grace_seconds ago is removed with fileutil; if it still exists
    afterwards it is removed directly, together with its
    .attr.plain.<name> attribute file.

    Args:
        dirname: directory to clean up.  Nothing is done if it is empty.
        fileutil: full path of fileutil.
        grace_seconds: even if a file isn't currently opened we consider it
            being in-use if it has been accessed recently (less than this
            many seconds ago).

    Returns:
        None.
    '''
    if not dirname:
        logging.error("Not given a directory to cleanup")
        return

    # lsof doesn't return 0 even on success, so its status is ignored.
    (_, output) = E.getstatusoutput("lsof +D %s -Fn" % dirname)

    # lsof returns several lines for each file because multiple threads in
    # a process could have it open, so collect unique names.  File names
    # are on the lines starting with 'n'.  The output is split per line,
    # not per word, so that a name containing spaces stays in one piece and
    # the open file is not mistaken for an unused one.
    open_files = set(line[1:] for line in output.splitlines()
                     if line.startswith('n'))

    # Get a list of all files in the directory - not starting with .
    # and delete the unused ones.
    for path in glob.glob("%s/*" % dirname):
        if path in open_files:
            continue
        try:
            age = int(time.time()) - os.stat(path)[stat.ST_ATIME]
            if age <= grace_seconds:
                logging.info('Ignoring unused file %s of age %s seconds' %
                             (path, age))
                continue

            logging.info('Removing unused file %s' % path)
            E.getstatusoutput("%s rm -f %s" % (fileutil, path))
            # If fileutil can't delete it for any reason, nuke it directly
            # And its attribute file.
            if os.path.exists(path):
                os.remove(path)
                os.remove('%s/.attr.plain.%s' % (os.path.dirname(path),
                                                 os.path.basename(path)))
        except OSError:
            # File got deleted since we ran glob? Ignore away.
            continue
def TouchFile(global_params, filename):
    """Check whether filename exists and create it if it does not.

    Existence is checked with a fileutil `ls`; a missing file is created
    with a fileutil `truncate <filename> 0`.  A failure to create the file
    is reported with logging.fatal().

    Args:
        global_params: passed to E.run_fileutil_command().
        filename: name of the file to check or create.
    """
    # This is a plain function: it used to reference self.globalParams,
    # which does not exist here, instead of its own global_params argument.

    # first check if file exists
    err, _ = E.run_fileutil_command(global_params, "ls %s" % filename)
    if err == E.ERR_OK:
        return

    # create if not exists
    err, _ = E.run_fileutil_command(global_params,
                                    "truncate %s 0" % filename)
    if err != E.ERR_OK:
        logging.fatal("Could not create file: %s" % filename)
def deactivate(self):
    """Deactivate the service and remove its entries from the crontab.

    Calls ent_service.ent_service.deactivate() and then, if the user has a
    crontab, rewrites it without the lines matching self.entid_tag, so
    only the cronjobs relating to this install version are removed.

    Returns:
        1, always.
    """
    ent_service.ent_service.deactivate(self)

    # Remove cronjobs relating to this install version ONLY...
    tmp_cron_file = "%s/tmp/nobody_cron_crawl_" % self.ent_home
    cron_cleanup = (
        "if $(/usr/bin/crontab -lu %s &>/dev/null); then "
        "crontab -lu %s | grep -v %s > %s; "
        "crontab -u %s %s; "
        "fi")
    E.exe_or_fail(cron_cleanup % (self.ent_user,
                                  self.ent_user, self.entid_tag,
                                  tmp_cron_file,
                                  self.ent_user, tmp_cron_file))
    return 1
def main(argv):
    """Keep adminrunner.py running: health-check it and restart it forever.

    Writes the pid file for loop_AdminRunner, then loops: when the health
    check on port 2100 fails, an adminrunner process that still exists is
    given a limited number of extra check intervals (a larger limit while
    a reset index is in progress) before it is killed and restarted.

    Args:
        argv: argv[0] is the enterprise home directory, e.g.
            /export/hda3/4.6.0.G.35; the remaining items are passed on to
            adminrunner.py.  Exits with the module docstring when empty.
    """
    if len(argv) < 1:
        sys.exit(__doc__)

    pidfile = E.GetPidFileName('loop_AdminRunner')
    E.WritePidFile(pidfile)

    # The path used to be shell-quoted with commands.mkarg() and then
    # unquoted again by splitting on "'".  That returns the argument itself
    # for ordinary paths and garbage for a path containing a quote, so the
    # argument is used directly.
    ent_home = argv[0]
    adminrunner_dir = ent_home + '/local/google3/enterprise/legacy/adminrunner'
    keys_dir = os.path.expanduser('~/.gnupg')

    restart_command = ['./adminrunner.py']
    restart_command.extend(argv[1:])
    restart_command.extend(['--log_dir=/export/hda3/logs',
                            '--box_keys_dir=' + keys_dir,
                            '--license_keys_dir=' + keys_dir])

    consecutive_kills_skipped = 0
    while True:
        if not check_healthz.CheckHealthz(2100):
            # If AdminRunner process exists, give it some time
            if reset_index.IsResetIndexInProgress(ent_home):
                # Everything may be slower during reset index.
                limit = MAX_RESET_INDEX_CONSECUTIVE_KILLS_TO_SKIP
            else:
                limit = MAX_CONSECUTIVE_KILLS_TO_SKIP
            if consecutive_kills_skipped < limit:
                # Check if adminrunner process exists
                (err, pidstring) = python_kill.Kill('adminrunner.py', '2100',
                                                    print_only=1)
                # if there is no err and pidstring is '', adminrunner is
                # gone.  otherwise, we assume it is alive and wait for a
                # while
                if err != 0 or pidstring != '':
                    consecutive_kills_skipped += 1
                    time.sleep(CHECK_RESTART_INTERVAL)
                    continue

            # Start/restart adminrunner, clear ResetIndexInProgress status
            consecutive_kills_skipped = 0
            reset_index.ClearResetIndexStatus(ent_home)
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2 and is also valid on Python 3.
            print("Killing adminrunner...")
            python_kill.Kill('adminrunner.py', '2100')
            print("Restarting adminrunner...")
            subprocess.Popen(restart_command, cwd=adminrunner_dir)
            # Give it some extra time after startup. Loading and validating
            # large number of collections and frontends with all large
            # number of keymatches etc can take a long time and adminrunner
            # doesn't respond to health checks during this time.
            time.sleep(SLEEP_AFTER_RESTART)
        else:
            consecutive_kills_skipped = 0
        time.sleep(CHECK_RESTART_INTERVAL)
def send_cmd(self, server_name, cmd):
    """Send a command to every server of a server set via port_talker.py.

    A failure for one server is logged and does not stop the remaining
    servers from being contacted.

    Args:
        server_name: name of the server set to look up in the server
            manager.
        cmd: command string handed to port_talker.py.

    Returns:
        true, always.
    """
    server_set = self.cfg.globalParams.GetServerManager().Set(server_name)
    for server in server_set.Servers():
        talker_args = (self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                       self.cfg.entHome,
                       server.host(),
                       server.port(),
                       cmd,
                       60)  # 1 min timeout
        talker_cmd = (". %s; cd %s/local/google3/enterprise/legacy/util && "
                      "./port_talker.py %s %d '%s' %d" % talker_args)
        status = E.execute([E.getCrtHostName()], talker_cmd, None, 0)
        if status != E.ERR_OK:
            logging.error("Error talking to server at %s:%d" % (
                server.host(), server.port()))
    return true
def ResyncWithMaster(google_config_file, enterprise_home, enterprise_user,
                     master):
    """Pull the configuration from the master, then set up non-master NTP.

    The replication is done by replicate_config.py, started as
    enterprise_user under `alarm 600`; SetNonMasterNTP() is called
    afterwards.

    Args:
        google_config_file: config file name given to replicate_config.py.
        enterprise_home: install root used to find the scripts directory
            and the ent_bashrc file.
        enterprise_user: user to run the replication command as.
        master: name of the master machine.
    """
    locations = {
        'bashrc': "%s/local/conf/ent_bashrc" % enterprise_home,
        'scripts': ("%s/local/google3/enterprise/legacy/scripts"
                    % enterprise_home),
    }
    E.su_exe_or_fail(
        enterprise_user,
        ". %s && cd %s && alarm 600 ./replicate_config.py %s %s" % (
            locations['bashrc'], locations['scripts'],
            master, google_config_file))
    SetNonMasterNTP(master)
def send_urlscheduler_command(self, command):
    """Send a /run?Flags=<command> request to every urlscheduler.

    The request is issued through port_talker.py, one urlscheduler at a time.

    Returns:
      0 when every urlscheduler was contacted successfully, 1 as soon as one
      of them fails (remaining urlschedulers are not contacted).
    """
    timeout = 60  # 1 minute is good enough
    for (machine, port) in self.cfg.globalParams.GetServerHostPorts(
            "urlscheduler"):
        cmd_args = (
            self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
            self.cfg.entHome,
            machine,
            port,
            command,
            timeout,
        )
        port_talker_cmd = (
            ". %s; cd %s/local/google3/enterprise/legacy/util && "
            "./port_talker.py %s %d 'GET /run?Flags=%s\r\n\r\n' %d" % cmd_args)
        status = E.execute([E.getCrtHostName()], port_talker_cmd, None, 0)
        if status != E.ERR_OK:
            logging.error("Error talking to urlscheduler %s:%d" %
                          (machine, port))
            return 1
    return 0
def remove_unused_from(dirname, fileutil, grace_seconds):
    """Delete files in `dirname` that are neither open nor recently accessed.

    Args:
      dirname: directory to clean up. Only entries matched by "<dirname>/*"
        are considered, so names starting with '.' are skipped.
      fileutil: full path of fileutil, used first to remove each file.
      grace_seconds: even if a file isn't currently opened we consider it
        in use if it has been accessed less than this many seconds ago.

    Returns:
      None. Logs an error and does nothing when `dirname` is empty.
    """
    if not dirname:
        logging.error("Not given a directory to cleanup")
        return

    # lsof doesn't return 0 even on success, so its exit status is ignored.
    _, output = E.getstatusoutput("lsof +D %s -Fn" % dirname)

    # With -Fn each file name is reported on its own line prefixed by 'n'.
    # The same file can appear several times (multiple threads/processes),
    # hence a set. Split on line boundaries only: splitting on arbitrary
    # whitespace would truncate names containing spaces and make such open
    # files look unused.
    open_files = set()
    for line in output.splitlines():
        if line.startswith('n'):
            open_files.add(line[1:])

    # All files in the directory - not starting with '.'
    for path in glob.glob("%s/*" % dirname):
        if path in open_files:
            continue
        try:
            age = int(time.time()) - os.stat(path)[stat.ST_ATIME]
            if age <= grace_seconds:
                logging.info('Ignoring unused file %s of age %s seconds' %
                             (path, age))
                continue

            logging.info('Removing unused file %s' % path)
            E.getstatusoutput("%s rm -f %s" % (fileutil, path))
            # If fileutil can't delete it for any reason, nuke it directly.
            # And its attribute file.
            if os.path.exists(path):
                os.remove(path)
                os.remove('%s/.attr.plain.%s' % (os.path.dirname(path),
                                                 os.path.basename(path)))
        except OSError:
            # File got deleted since we ran glob? Ignore away.
            continue
def send_cmd(self, server_name, cmd):
    """Relay `cmd` to each server of the `server_name` set using port_talker.py.

    A failure for one server is logged and the remaining servers are still
    contacted.

    Returns:
      Always `true`.
    """
    # Shell template: source the enterprise bashrc, change into the util
    # directory, then run port_talker.py <host> <port> '<cmd>' <timeout>.
    template = (". %s; cd %s/local/google3/enterprise/legacy/util && "
                "./port_talker.py %s %d '%s' %d")
    servers = self.cfg.globalParams.GetServerManager().Set(
        server_name).Servers()
    for target in servers:
        actual_cmd = template % (
            self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
            self.cfg.entHome,
            target.host(), target.port(),
            cmd, 60)  # 1 min timeout
        failed = (E.ERR_OK != E.execute([E.getCrtHostName()], actual_cmd,
                                        None, 0))
        if failed:
            logging.error("Error talking to server at %s:%d" %
                          (target.host(), target.port()))
    # NOTE(review): `true` is presumably a module-level alias (it is not a
    # Python builtin) -- confirm it is defined in this module.
    return true
def installcert(self):
    """Install the staging certificate as the currently installed certificate.

    While holding self.updatelock:
      1. verify the staging certificate on localhost; on failure the staging
         file is removed and "1" is returned,
      2. distribute the staging certificate to MACHINES (a failure here is
         only logged),
      3. run the installcert wrapper command on MACHINES.

    Returns:
      "0" on success and "1" on failure (both as strings).
    """
    self.updatelock.acquire()
    try:
        # First verify that the staging certificate is a valid file.
        verify_cmd = "secure_script_wrapper -p2 %s verifystagingcert %s" % (
            self.sslWrapperPath,
            self.cfg.getGlobalParam("ENTERPRISE_HOME"),
        )
        verify_output = []
        verify_status = E.execute(["localhost"], verify_cmd, verify_output, 60)
        if verify_status != 0:
            staging_cert = (ssl_cert.STAGINGCERT_FILENAME
                            % self.cfg.getGlobalParam("ENTERPRISE_HOME"))
            E.rm(["localhost"], staging_cert)
            logging.error(
                "Verify failed for certificate [%s]; error code: %d"
                % (str(verify_output), verify_status))
            return "1"

        # Distribute the staging certificate.
        machines = self.cfg.getGlobalParam("MACHINES")
        staging_cert = (ssl_cert.STAGINGCERT_FILENAME
                        % self.cfg.getGlobalParam("ENTERPRISE_HOME"))
        distribute_status = E.distribute(machines, staging_cert, 60)
        if distribute_status != 0:
            logging.error("Couldn't distribute apache cert, error %d"
                          % distribute_status)

        # Next, generate the certificate on all machines.
        install_cmd = "secure_script_wrapper -p2 %s installcert %s" % (
            self.sslWrapperPath,
            self.cfg.getGlobalParam("ENTERPRISE_HOME"),
        )
        install_output = []
        install_status = E.execute(self.cfg.getGlobalParam("MACHINES"),
                                   install_cmd, install_output, 60)
        if install_status != 0:
            logging.error("Couldn't install cert: %s" % str(install_output))
            return "1"

        self.writeAdminRunnerOpMsg(M.MSG_LOG_SSL_CERT_INSTALLED)
        return "0"
    finally:
        self.updatelock.release()
def RebootMachine(enthome, machine):
    """Reboots a machine.

    Issues a shutdown command first (a reboot when `machine` is the current
    host, a halt otherwise), waits 60 seconds, then sends the APC reboot
    command.

    Returns:
      Whatever SendAPCCommand(enthome, machine, APC_REBOOT) returns.
    """
    rebooting_self = (machine == E.getCrtHostName())
    if not rebooting_self:
        # Try shutting down cleanly first.
        logging.info('Shutting down %s' % machine)
        E.execute([machine], '/sbin/shutdown -h now &', None, 1)
    else:
        # Rebooting ourself. If we're still alive after a minute, the APC
        # will kick in.
        logging.info('Rebooting %s' % machine)
        E.execute([E.LOCALHOST], '/sbin/shutdown -r now', None, 1)

    time.sleep(60)
    logging.info('Rebooting %s via APC' % machine)
    return SendAPCCommand(enthome, machine, APC_REBOOT)
def check_klogd_syslogd_conf(machines, enthome, unittestdir=None):
    """Babysit the klogd.conf and syslogd.conf localbabysitter files.

    Recreates klogd.conf and syslogd.conf (mode 644) if they are not in the
    directory. Existing files are left untouched.

    Args:
      machines: ['ent1', 'ent2', 'ent3', 'ent4', 'ent5']
      enthome: '/export/hda3/4.6.0.G.27/'
      unittestdir: '/tmp/etc/localbabysitter.d/' -- used for unittest only.
        When given, the shell snippet runs locally through os.system instead
        of on `machines` through E.execute.
    """
    KLOGD_CONF_DATA = (
        "klogd = {\n"
        " restart_command : '/sbin/service syslog restart',\n"
        " timeout : 30,\n"
        " interval : 30,\n"
        " use_service_wrapper : 0,\n"
        " pidfile : '/var/run/klogd.pid',\n"
        "}\n"
    )
    SYSLOGD_CONF_DATA = (
        "syslogd = {\n"
        " restart_command : '/sbin/service syslog restart',\n"
        " timeout : 30,\n"
        " interval : 30,\n"
        " use_service_wrapper : 0,\n"
        " pidfile : '/var/run/syslogd.pid',\n"
        "}\n"
    )
    # The target path is double-quoted everywhere (existence test, redirect
    # and chmod) so that a directory containing whitespace works too.
    CHECK_CREATE_CMD = (
        'if [ ! -e "%(file)s" ]\n'
        'then\n'
        ' echo "%(data)s" > "%(file)s"\n'
        ' chmod 644 "%(file)s"\n'
        'fi\n'
    )

    if unittestdir is None:
        conf_dir = '/etc/localbabysitter.d/'
    else:
        conf_dir = unittestdir

    file_info = {
        'klogd.conf': KLOGD_CONF_DATA,
        'syslogd.conf': SYSLOGD_CONF_DATA,
    }
    for fname, data in file_info.items():
        conf_path = os.path.join(conf_dir, fname)
        cmd = CHECK_CREATE_CMD % {'data': data, 'file': conf_path}
        if unittestdir is None:
            E.execute(machines, cmd, [], 0, 0, 0, enthome)
        else:
            os.system(cmd)
def MasterKillMyself(config):
    """Kill the AdminRunner and webserver_config scripts, then call
    KillProcessIfNotMaster.

    Each script is killed by process group through python_kill.py, executed
    as config.ENTERPRISE_USER via E.su_exe_or_fail.

    Args:
      config: object providing ENTERPRISE_HOME, ENTERPRISE_USER and
        ENTERPRISE_BASHRC.
    """
    util_dir = "%s/local/google3/enterprise/legacy/util" % config.ENTERPRISE_HOME
    kill_template = (". %s && cd %s && ./python_kill.py --binname=%s "
                     "--kill_by_group")

    # Stop AdminRunner
    scripts_to_kill = (
        "loop_AdminRunner.py",
        "adminrunner.py",
        "loop_webserver_config.py",
        "webserver_config.py",
    )
    for script in scripts_to_kill:
        E.su_exe_or_fail(
            config.ENTERPRISE_USER,
            kill_template % (config.ENTERPRISE_BASHRC, util_dir, script))

    KillProcessIfNotMaster(config)
def installcert(self):
    """Make the staging certificate the currently installed certificate.

    The whole operation runs with self.updatelock held. The staging
    certificate is verified on localhost, copied to every machine in
    MACHINES, and then installed there through secure_script_wrapper.

    Returns:
      "0" on success, "1" when verification or installation fails. A failed
      distribution is logged but does not abort the installation.
    """
    local = ['localhost']
    self.updatelock.acquire()
    try:
        # Step 1: the staging certificate must be a valid file.
        verifycmd = "secure_script_wrapper -p2 %s verifystagingcert %s" % (
            self.sslWrapperPath, self.cfg.getGlobalParam("ENTERPRISE_HOME"))
        verify_out = []
        verifycode = E.execute(local, verifycmd, verify_out, 60)
        if verifycode != 0:
            # An invalid staging certificate is discarded.
            E.rm(local,
                 ssl_cert.STAGINGCERT_FILENAME %
                 self.cfg.getGlobalParam("ENTERPRISE_HOME"))
            logging.error("Verify failed for certificate [%s]; error code: %d" %
                          (str(verify_out), verifycode))
            return "1"

        # Step 2: distribute the staging certificate.
        retcode = E.distribute(
            self.cfg.getGlobalParam("MACHINES"),
            ssl_cert.STAGINGCERT_FILENAME %
            self.cfg.getGlobalParam("ENTERPRISE_HOME"),
            60)
        if retcode != 0:
            logging.error("Couldn't distribute apache cert, error %d" % retcode)

        # Step 3: generate the certificate on all machines.
        cmd = "secure_script_wrapper -p2 %s installcert %s" % (
            self.sslWrapperPath, self.cfg.getGlobalParam("ENTERPRISE_HOME"))
        install_out = []
        retcode = E.execute(self.cfg.getGlobalParam("MACHINES"), cmd,
                            install_out, 60)
        if retcode == 0:
            self.writeAdminRunnerOpMsg(M.MSG_LOG_SSL_CERT_INSTALLED)
        else:
            logging.error("Couldn't install cert: %s" % str(install_out))
            return "1"
    finally:
        self.updatelock.release()
    return "0"
def GetLanguageCatalog(self, language):
    """Returns a dictionary from 'MSG_...' to translated string.

    The messages are read from FrontendMessages_<language>.xlb under the
    enterprise home ('-' in the language code is replaced by '_'). If that
    file cannot be read, the 'en' catalog is returned instead; if 'en' itself
    is missing, the catalog starts out empty.

    Besides the messages, the result holds 'DIR' ('rtl' or 'ltr') and
    'INSERT_LANGUAGE_LIST_IN_ADV_SEARCH'. All values read from the file, and
    'DIR', are encoded as UTF-8.
    """
    enthome = E.getEnterpriseHome()
    xlb_path = '%s/local/google3/enterprise/i18n/FrontendMessages_%s.xlb' % (
        enthome, language.replace('-', '_'))

    try:
        catalog = readxlb.Read(xlb_path)
    except IOError:
        logging.error('Missing language file %s' % xlb_path)
        if language != 'en':
            # Try en language so we can at least replace the tags.
            return self.GetLanguageCatalog('en')
        # Sorry, no language files
        catalog = {}

    # Handle right-to-left languages
    text_direction = 'ltr'
    if language in ('ar', 'fa', 'iw', 'ku', 'sd', 'ur', 'yi'):
        text_direction = 'rtl'
    catalog['DIR'] = text_direction

    # Encode Unicode messages in UTF-8. (Adminrunner expects UTF-8 messages.)
    for msg_id in catalog:
        catalog[msg_id] = catalog[msg_id].encode('utf-8')

    # Stick in the language list inside the advanced search page in
    # sorted order.
    language_list = self.GenerateLanguageList(catalog)
    catalog["INSERT_LANGUAGE_LIST_IN_ADV_SEARCH"] = language_list
    return catalog
def renice_svs(machine, new_priority=SVS_NICE_VALUE):
    """Renices svs on 'machine' with new_priority.

    The pid is read from SVS_PID_FILE; the whole process group of that pid
    is reniced through E.execute on `machine`.

    Returns 0 on failure, 1 on success. 0 is also returned when nothing was
    reniced because the current nice value is unknown or already equals
    new_priority.
    """
    # Get pid of svs. Parsing happens inside the try block so that an empty
    # or malformed pid file is reported like an unreadable one rather than
    # raising ValueError to the caller.
    try:
        pid_file = open(SVS_PID_FILE, 'r')
        try:
            pid = int(pid_file.read())
        finally:
            pid_file.close()
    except Exception:  # best effort: any failure here means "no pid"
        err = str(sys.exc_info()[:2])
        logging.error('Error reading SVS pid from %s: %s' % (SVS_PID_FILE, err))
        return 0

    # Get nice value for pid
    old_priority = os_utils.GetAttr('nice', pid)
    if old_priority is None or int(old_priority) == new_priority:
        return 0

    logging.info('Renicing SVS to %d.' % new_priority)
    # Get group id from pid.
    pgid = os_utils.GetAttr('pgid', pid)
    if pgid is None:
        # Presumably GetAttr yields None for an unavailable attribute, as
        # the 'nice' lookup above is already guarded the same way.
        logging.error('Error renicing SVS: no pgid for pid %d' % pid)
        return 0

    # Renice the whole group.
    cmd = 'renice %d -g %d' % (new_priority, int(pgid))
    rc = E.execute([machine], cmd, None, 1)
    if rc == 0:
        return 1
    # Report the actual return code of the renice command.
    logging.error('Error renicing SVS: %d' % rc)
    return 0
def distributedeletedbfile(self, filename):
    """Remove local database file `filename` from all the nodes (MACHINES).

    Returns:
      "0" when E.rm reports a true value, otherwise "1" after logging an
      error.
    """
    machines = self.cfg.getGlobalParam("MACHINES")
    removed = E.rm(machines, filename)
    if removed:
        return "0"
    logging.error("Failed to delete %s on %s" % (filename, machines))
    return "1"
def get_core_states(ver, home, nodes, ignore=0, testver=None):
    r"""Get the ENT_CORE_STATE from all nodes.

    Runs "ent_core --ver=<ver> --info" on all the nodes sequentially.

    Arguments:
      ver: '4.6.5'
      home: '/export/hda3/4.6.5'
      nodes: ['ent1', 'ent2']
      ignore: 1 - ignore errors; 0 - otherwise.
      testver: 1 - if this is a test version; 0 - otherwise.
               None - determine it with is_test(ver).

    Returns:
      A dict from node name to the command output. Nodes whose command
      returned a non-zero code are left out.
      {'ent1': 'state=RUNNING\nnodes=5\nfailures=1',
       'ent2': 'state=RUNNING\nnodes=5\nfailures=1'}
    """
    # Identity test: only a genuinely omitted testver triggers autodetection.
    if testver is None:
        testver = is_test(ver)

    core_base = ('/export/hda3/%s/local/google/bin/secure_script_wrapper '
                 '-e /export/hda3/%s/bin/ent_core --ver=%s' % (ver, ver, ver))
    if testver:
        core_base = '%s --testver' % core_base
    cmd = '%s --info' % core_base

    states = {}
    for node in nodes:
        out = []
        ret = E.execute([node], cmd, out, 0, 0, 0, home, ignore=ignore)
        if not ret:
            states[node] = ''.join(out)
    return states
def core_op_out(ver, home, op, nodes, ignore=0, testver=None, gfs=1):
    r"""Executes ent_core command and returns the result.

    A single E.execute call is issued for the whole node list; ent_core is
    running at the same time on all the nodes in different threads.

    Arguments:
      ver: '4.6.5'
      home: '/export/hda3/4.6.5'
      op: 'info'
      nodes: ['ent1', 'ent2']
      ignore: 1 - ignore errors; 0 - otherwise.
      testver: 1 - if this is a test version; 0 - otherwise.
               None - determine it with is_test(ver).
      gfs: 1 - start gfs during activation; 0 - otherwise.

    Returns:
      error code and output from all nodes. The output is also logged as an
      error when the error code is non-zero.
      (0,
       'state=RUNNING\nnodes=5\nfailures=1\nstate=RUNNING\nnodes=5\nfailures=1')
    """
    # Identity test: only a genuinely omitted testver triggers autodetection.
    if testver is None:
        testver = is_test(ver)

    core_base = ('/export/hda3/%s/local/google/bin/secure_script_wrapper '
                 '-e /export/hda3/%s/bin/ent_core --ver=%s' % (ver, ver, ver))
    if testver:
        core_base = '%s --testver' % core_base
    if gfs == 0:
        core_base = '%s --gfs=0' % core_base
    cmd = '%s --%s' % (core_base, op)

    out = []
    ret = E.execute(nodes, cmd, out, 0, 0, 0, home, ignore=ignore)
    combined = ''.join(out)
    if ret:
        logging.error(combined)
    return ret, combined