def SetInitState(cfg, state):
  """Sets system's initialization state.

  For oneway, it stores it in C.ENT_SYSTEM_INIT_STATE. For Clusters, it stores
  it in chubby file /ls/ent<version>/ENT_SYSTEM_INIT_STATE by writing the
  state to a local temp file and copying that file with the lockserver client.

  @param cfg - of type configurator.
  @param state - string
  """
  # oneway?
  if 1 == len(core_utils.GetNodes()):
    cfg.setGlobalParam(C.ENT_SYSTEM_INIT_STATE, state)
    return

  tmpfile = E.mktemp('/export/hda3/tmp')
  try:
    f = open(tmpfile, 'w')
    try:
      f.write(state)
    finally:
      # Always release the handle, even when the write itself fails.
      f.close()
  except IOError:
    logging.fatal('Cannot write to temp file %s' % tmpfile)
    return

  version = cfg.getGlobalParam('VERSION')
  lockserv_cmd_prefix = core_utils.GetLSClientCmd(version, is_test(version))
  chubby_root_dir = '/ls/%s' % core_utils.GetCellName(version)
  write_cmd = '%s cp %s %s/%s' % (lockserv_cmd_prefix, tmpfile,
                                  chubby_root_dir, 'ENT_SYSTEM_INIT_STATE')
  logging.info('setting system init state to: %s', state)
  E.exe_or_fail(write_cmd)
  # The temp file is only a staging copy; remove it once it has been copied.
  E.exe('rm -rf %s' % tmpfile)
def main(unused_args):
  """Validates the command line flags and creates the AdminRunner server.

  Exits through sys.exit() with a message when enthome, box_keys_dir,
  license_keys_dir or installstate are invalid. Logs fatal when the server
  configuration reports no valid config and the install state is not INSTALL.

  @param unused_args - ignored; all input comes from FLAGS.
  """
  if not E.access([E.LOCALHOST], FLAGS.enthome, 'rd'):
    sys.exit("Invalid enthome %s" % FLAGS.enthome)

  if (not FLAGS.box_keys_dir or
      not E.access([E.LOCALHOST], FLAGS.box_keys_dir, 'rwd')):
    sys.exit("Invalid box_keys_dir %s" % FLAGS.box_keys_dir)

  if (not FLAGS.license_keys_dir or
      not E.access([E.LOCALHOST], FLAGS.license_keys_dir, 'rwd')):
    sys.exit("Invalid license_keys_dir %s" % FLAGS.license_keys_dir)

  if FLAGS.installstate not in ["INSTALL", "ACTIVE", "SERVE", "TEST"]:
    sys.exit("Invalid --installstate %s" % FLAGS.installstate)

  reset_index.SetResetStatusCacheTimeout(FLAGS.reset_status_cache_timeout)

  # NOTE: this local used to be called "as", which is a reserved word from
  # Python 2.6 on and makes the module fail to compile.
  server = adminrunner_server.AdminRunnerServer(FLAGS.enthome,
                                                FLAGS.installstate,
                                                FLAGS.port,
                                                FLAGS.box_keys_dir,
                                                FLAGS.license_keys_dir)

  # make sure we have been given a config file
  if (not server.cfg.getGlobalParam(C.ENT_SYSTEM_HAS_VALID_CONFIG) and
      server.cfg.getInstallState() != 'INSTALL'):
    logging.fatal("adminrunner doesn't have a config file; you must "\
                  "install a conf rpm or run it with "\
                  "--installstate=INSTALL (for migration)")
    return  # just in case fatal doesn't exit
def parse_args(self):
  """Parse the command in args - the first line of the request.

  Validates self.command and self.params against self.accepted_commands and,
  when a configuration is attached, against the startup mode and install
  state. On any failure self.error is set to a message; on success self.error
  is left untouched. Always returns None.
  """
  # make sure we have a valid command
  if self.command not in self.accepted_commands:
    self.error = "Invalid command"
    return

  # make sure we have the right number of params
  if len(self.params) != self.accepted_commands[self.command].num_params:
    self.error = "Invalid number of arguments."
    return

  if self.cfg is not None:
    if self.cfg.isInStartupMode():
      diff = int(time.time()) - self.cfg.startup_time
      if diff > FLAGS.max_startup_time_seconds:
        # Adminrunner does respond to status requests at this time. So
        # if it gets stuck in the startup process, then it remains there
        # forever. We need this fall back.
        # NOTE(review): "does respond" is the original wording; "does not
        # respond" may have been intended - confirm.
        logging.fatal("Startup took too long (%d seconds)" % diff)
      self.error = "It appears that the system has just been started. " \
                   "Please wait for a few minutes and try again."
      return

    # make sure this command is allowed in this state
    if self.cfg.getInstallState() not in install_utilities.INSTALL_STATES:
      self.error = ("Command not allowed in state %s" %
                    self.cfg.getInstallState())
      return
  return
def main(argv):
  """Forwards the apache logs of every collection to the syslog server.

  Exits with the module usage text when argv does not hold exactly one
  config file name or the config cannot be loaded. Does nothing (exit 0)
  unless the install state is ACTIVE or SERVE. Work is serialized through a
  lock file in LOGDIR; if the lock cannot be acquired the function returns.

  @param argv - list holding the path of the enterprise config file.
  """
  if len(argv) != 1:
    sys.exit(__doc__)

  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  # Collect syslogs only if active or serve
  state = install_utilities.install_state(config.var('VERSION'))
  if state not in ['ACTIVE', 'SERVE']:
    sys.exit(0)

  # Collect syslogs only from master node.
  if not isMaster(config):
    logging.fatal('Not a oneway or cluster master node. Return!')

  pywrapbase.InitGoogleScript('', ['foo',
                                   '--gfs_aliases=%s' %
                                   config.var("GFS_ALIASES"),
                                   '--bnsresolver_use_svelte=false',
                                   '--logtostderr'], 0)
  gfile.Init()

  # Only the first/last dates are used below.
  first_date, last_date, _printable_date, _file_date = \
      liblog.ParseDateRange('all', [])

  apache_main_dir = liblog.get_apache_dir(config)
  checkpoint_dir = liblog.get_syslog_checkpoint_dir(config)
  liblog.MakeGoogleDir(config, checkpoint_dir)

  if (config.var('SYSLOG_SERVER') is None or
      config.var('ENT_SYSLOG_GWS_FAC') is None):
    logging.fatal('SYSLOG logging is disabled')

  lockfile = '%s/syslog_lock' % config.var('LOGDIR')
  lock = E.acquire_lock(lockfile, 1, breakLockAfterGracePeriod=0)
  if not lock:
    return

  try:
    logger = syslog_client.SyslogClient(config.var('SYSLOG_SERVER'),
                                        config.var('EXTERNAL_WEB_IP'))
    try:
      logging.info("syslog-server = %s" % config.var('SYSLOG_SERVER'))
      for collection in os.listdir(apache_main_dir):
        apache_dir = '%s/%s' % (apache_main_dir, collection)
        checkpoint_file = "%s/%s" % (checkpoint_dir, collection)
        apache_logs = liblog.FindLogFiles(apache_dir, first_date, last_date)
        logging.info('syslog handles collection %s' % collection)
        if not SyslogLogs(apache_logs, apache_dir, checkpoint_file, logger,
                          config):
          sys.exit('Updating failed')
    finally:
      # Close the syslog client on every exit path, including sys.exit().
      logger.close()
  finally:
    lock.close()
    os.unlink(lockfile)

  sys.exit(0)
def main(argv):
  """Forwards the apache logs of every collection to the syslog server.

  Exits with the module usage text when argv does not hold exactly one
  config file name or the config cannot be loaded. Does nothing (exit 0)
  unless the install state is ACTIVE or SERVE. Work is serialized through a
  lock file in LOGDIR; if the lock cannot be acquired the function returns.

  @param argv - list holding the path of the enterprise config file.
  """
  if len(argv) != 1:
    sys.exit(__doc__)

  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  # Collect syslogs only if active or serve
  state = install_utilities.install_state(config.var('VERSION'))
  if state not in ['ACTIVE', 'SERVE']:
    sys.exit(0)

  # Collect syslogs only from master node.
  if not isMaster(config):
    logging.fatal('Not a oneway or cluster master node. Return!')

  pywrapbase.InitGoogleScript('', ['foo',
                                   '--gfs_aliases=%s' %
                                   config.var("GFS_ALIASES"),
                                   '--bnsresolver_use_svelte=false',
                                   '--logtostderr'], 0)
  gfile.Init()

  # Only the first/last dates are used below.
  first_date, last_date, _printable_date, _file_date = \
      liblog.ParseDateRange('all', [])

  apache_main_dir = liblog.get_apache_dir(config)
  checkpoint_dir = liblog.get_syslog_checkpoint_dir(config)
  liblog.MakeGoogleDir(config, checkpoint_dir)

  if (config.var('SYSLOG_SERVER') is None or
      config.var('ENT_SYSLOG_GWS_FAC') is None):
    logging.fatal('SYSLOG logging is disabled')

  lockfile = '%s/syslog_lock' % config.var('LOGDIR')
  lock = E.acquire_lock(lockfile, 1, breakLockAfterGracePeriod=0)
  if not lock:
    return

  try:
    logger = syslog_client.SyslogClient(config.var('SYSLOG_SERVER'),
                                        config.var('EXTERNAL_WEB_IP'))
    try:
      logging.info("syslog-server = %s" % config.var('SYSLOG_SERVER'))
      for collection in os.listdir(apache_main_dir):
        apache_dir = '%s/%s' % (apache_main_dir, collection)
        checkpoint_file = "%s/%s" % (checkpoint_dir, collection)
        apache_logs = liblog.FindLogFiles(apache_dir, first_date, last_date)
        logging.info('syslog handles collection %s' % collection)
        if not SyslogLogs(apache_logs, apache_dir, checkpoint_file, logger,
                          config):
          sys.exit('Updating failed')
    finally:
      # Close the syslog client on every exit path, including sys.exit().
      logger.close()
  finally:
    lock.close()
    os.unlink(lockfile)

  sys.exit(0)
def main(unused_argv):
  """Converts the input file given by the flags into the output file.

  Logs fatal when --input_file_path or --output_file_path is missing, then
  runs Sport(...).converts() with the soccer final file path.

  @param unused_argv - ignored; all input comes from FLAGS.
  """
  if FLAGS.input_file_path is None:
    logging.fatal('No input_file_path specified argument --input_file_path.')
  if FLAGS.output_file_path is None:
    # The two adjacent literals used to be joined without a separating
    # space, yielding "specifiedargument" in the message.
    logging.fatal('No output_file_path specified '
                  'argument --output_file_path.')
  final_file = FLAGS.soccer_final_file_path
  Sport(FLAGS.input_file_path, FLAGS.output_file_path, final_file).converts()
def TouchFile(global_params, filename):
  """Checks whether filename exists and creates it if it does not.

  Existence is probed with a fileutil "ls"; a missing file is created with a
  fileutil "truncate <filename> 0". Logs fatal when creation fails.

  @param global_params - parameters handed to E.run_fileutil_command.
  @param filename - name of the file to check / create.
  """
  # first check if file exists
  # (This is a module-level function: the parameter is global_params; the
  # previous code referenced an undefined self.globalParams.)
  ls_cmd = "ls %s" % filename
  err, out = E.run_fileutil_command(global_params, ls_cmd)
  if err != E.ERR_OK:
    # create if not exists
    create_cmd = "truncate %s 0" % filename
    err, out = E.run_fileutil_command(global_params, create_cmd)
    if err != E.ERR_OK:
      logging.fatal("Could not create file: %s" % filename)
def main(unused_argv):
  """Converts the input file given by the flags into the output file.

  Logs fatal when --input_file_path or --output_file_path is missing, then
  runs Sport(...).converts() with the soccer final file path.

  @param unused_argv - ignored; all input comes from FLAGS.
  """
  if FLAGS.input_file_path is None:
    logging.fatal(
        'No input_file_path specified argument --input_file_path.')
  if FLAGS.output_file_path is None:
    # The two adjacent literals used to be joined without a separating
    # space, yielding "specifiedargument" in the message.
    logging.fatal('No output_file_path specified '
                  'argument --output_file_path.')
  final_file = FLAGS.soccer_final_file_path
  Sport(FLAGS.input_file_path, FLAGS.output_file_path, final_file).converts()
def main(unused_args): logging.info('quote main()') # Loggging/exit for scary case if 'religion' in FLAGS.quote_text: logging.fatal('religion in quote!') # Create quote object based on the flags quote = quote_lib.Quote(FLAGS.quote_text, FLAGS.quote_author) # Write or read the quote object with the filesystem if flags indicate. if FLAGS.write_file: quote.WriteShirt(FLAGS.write_file) if FLAGS.read_file: quote.ReadShirt(FLAGS.read_file) # In any case, print the quote to stdout print quote.GetFormatted(),
def main(argv):
  """Runs yesterday's log report and apache log update for one crawl.

  Only does work on the machine that is the single master reported by
  find_master.FindMaster; any other machine logs a message and exits 0.

  @param argv - [config file, crawl name, adminrunner port]. A missing or
                non-numeric argument exits with the module usage text.
  """
  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  try:
    crawl = argv[1]
    port = int(argv[2])
  except (IndexError, ValueError):
    # Only argument errors mean "print usage"; a bare except would also
    # swallow KeyboardInterrupt and SystemExit.
    sys.exit(__doc__)

  machines = config.var('MACHINES')
  master = find_master.FindMaster(port, machines)

  # The name of this machine
  crt_machine = E.getCrtHostName()

  if len(master) != 1 or master[0] != crt_machine:
    logging.info("I am not the master")
    sys.exit(0)  # not a problem

  # find out the date 24 hours ago
  year, month, day = time.localtime(time.time() - 60 * 60 * 24)[0:3]
  date1 = "date_%d_%d_%d" % (month, day, year)
  date2 = "date %d %d %d" % (month, day, year)

  logging.info("Running %s for %s" % (sys.argv[0], date2))

  # 1. run log_report.py for each crawl
  ar = adminrunner_client.AdminRunnerClient("localhost", port)
  if not ar.LogReportStart(crawl, date1):
    sys.exit("Error starting crawl for " + crawl)
  logging.info("Started log_report for %s/%s" % (crawl, date1))

  # 2. run apache_log.py for each crawl
  apache_cmd = "./apache_log.py %s update %s %s" % (argv[0], crawl, date2)
  (status, output) = commands.getstatusoutput(apache_cmd)
  if status != 0:
    logging.fatal("apache_log failed: %s" % output)
  logging.info("Finished apache_log: %s" % output)

  logging.info("Done")
def main(argv):
  """Runs yesterday's log report and apache log update for one crawl.

  Only does work on the machine that is the single master reported by
  find_master.FindMaster; any other machine logs a message and exits 0.

  @param argv - [config file, crawl name, adminrunner port]. A missing or
                non-numeric argument exits with the module usage text.
  """
  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  try:
    crawl = argv[1]
    port = int(argv[2])
  except (IndexError, ValueError):
    # Only argument errors mean "print usage"; a bare except would also
    # swallow KeyboardInterrupt and SystemExit.
    sys.exit(__doc__)

  machines = config.var('MACHINES')
  master = find_master.FindMaster(port, machines)

  # The name of this machine
  crt_machine = E.getCrtHostName()

  if len(master) != 1 or master[0] != crt_machine:
    logging.info("I am not the master")
    sys.exit(0)  # not a problem

  # find out the date 24 hours ago
  year, month, day = time.localtime(time.time() - 60 * 60 * 24)[0:3]
  date1 = "date_%d_%d_%d" % (month, day, year)
  date2 = "date %d %d %d" % (month, day, year)

  logging.info("Running %s for %s" % (sys.argv[0], date2))

  # 1. run log_report.py for each crawl
  ar = adminrunner_client.AdminRunnerClient("localhost", port)
  if not ar.LogReportStart(crawl, date1):
    sys.exit("Error starting crawl for " + crawl)
  logging.info("Started log_report for %s/%s" % (crawl, date1))

  # 2. run apache_log.py for each crawl
  apache_cmd = "./apache_log.py %s update %s %s" % (argv[0], crawl, date2)
  (status, output) = commands.getstatusoutput(apache_cmd)
  if status != 0:
    logging.fatal("apache_log failed: %s" % output)
  logging.info("Finished apache_log: %s" % output)

  logging.info("Done")
def delete_pagerank_barriers(self):
  """Removes the barrier files that pr_main leaves behind.

  Logs fatal if the configured PAGERANKER_PROG is not pr_main, then runs a
  fileutil "rm -f" over the barrier file pattern under NAMESPACE_PREFIX.
  """
  # Sanity check to see we are indeed running pr_main
  if self.cp.var('PAGERANKER_PROG') != 'pr_main':
    logging.fatal('Not using pr_main anymore')

  # Everything the command needs comes from entconfig.
  prefix = '%s/barriers' % self.cp.var('NAMESPACE_PREFIX')
  data_dir = self.cp.var('DATADIR')
  aliases = self.cp.var('GFS_ALIASES')

  # Nuke'em. When there is only a single pr_main running (shards=1), it
  # does this during its startup.
  template = ('%s/fileutil --datadir=%s --gfs_aliases=%s '
              '--bnsresolver_use_svelte=false '
              ' rm -f %s.barrier_progpr_*_of_*_op*_iter*')
  rm_cmd = template % (self.bin_dir, data_dir, aliases, prefix)
  logging.info('Deleting barriers - %s' % rm_cmd)
  E.exe(rm_cmd)
def GetInitState(entcfg):
  """Returns the system's initialization state.

  On a oneway the state is the value of C.ENT_SYSTEM_INIT_STATE in entcfg.
  On a cluster it is read from the chubby file
  /ls/ent<version>/ENT_SYSTEM_INIT_STATE; when that file cannot be read the
  state is taken to be C.FRESH.

  @param entcfg - of type googleconfig.
  @return - state; None after logging fatal if chubby cannot be listed.
  """
  if len(core_utils.GetNodes()) == 1:
    # oneway: the state lives directly in the config.
    return entcfg.var(C.ENT_SYSTEM_INIT_STATE)

  # Cluster: the state lives in chubby.
  ent_version = entcfg.var('VERSION')
  ls_client = core_utils.GetLSClientCmd(ent_version, is_test(ent_version))
  cell_dir = '/ls/%s' % core_utils.GetCellName(ent_version)

  # Verify that chubby is functional. We do not want to accidentally return
  # FRESH state that can result in total wipe out of data.
  (ls_status, unused_listing) = E.getstatusoutput(
      '%s ls %s' % (ls_client, cell_dir))
  if ls_status != E.ERR_OK:
    logging.fatal('GetInitState: Could not talk to chubby.')
    return None

  (cat_status, state) = E.getstatusoutput(
      '%s cat %s/%s' % (ls_client, cell_dir, 'ENT_SYSTEM_INIT_STATE'))
  if cat_status != E.ERR_OK:
    # On a fresh install the init state file does not exist in chubby yet,
    # so a failed read is treated as the FRESH state.
    state = C.FRESH
  logging.info('current system init state: %s', state)
  return state
def StartupWork(cfg, state):
  """Performs the one-time and per-start initialization work.

  Steps, in order: check machine memory; in FRESH init state create and
  distribute the global default files; stop early in INSTALL state; resume
  an interrupted index reset; set MASTER to this host; allocate machines;
  in CONFIG_FILES_INITIALIZED init state create datadirs, GFS subdirs and
  the default collection/frontend/backend files and start the serve
  service (otherwise babysit it); save params and end startup mode.

  Any exception is logged with its traceback and then reported as fatal.

  @param cfg - configurator object (provides globalParams, saveParams, ...).
  @param state - install state string; "INSTALL" stops after the first stage.
  """
  try:
    #############################################################################
    # check memory-total
    logging.info("check memory-total")
    if not cfg.CheckMachineMemory():
      logging.fatal("failed to check memory-total")
      return

    # One-time Initialization Work:
    # In fresh mode we first initialize the global default
    if install_utilities.GetInitState(cfg.globalParams) == C.FRESH:
      logging.info("initializing global default files")
      # NOTE(review): lowercase `false` is not a Python builtin; presumably
      # it is defined at module level - confirm.
      if not cfg.globalParams.InitGlobalDefaultFiles(overwrite=false):
        logging.fatal("failed to initialize global default files")
        return
      # Call to saveParams will also distribute the newly created global
      # default files.
      logging.info("saving params and distributing global default files.")
      logging.flush()
      # This is a best effort mechanism. We try upto 6 times to update the
      # files. We assume that we'll get the files to all alive machines. If
      # a machine can't get the files then we assume that it is dead and won't
      # become a master with missing default files.
      cfg.saveParams(retry=6)
      logging.flush()
      install_utilities.SetInitState(cfg, C.CONFIG_FILES_INITIALIZED)
      logging.info('system initialization for %s state successful' %
                   C.CONFIG_FILES_INITIALIZED)

    logging.flush()
    # In install mode -- bail out now
    if state == "INSTALL":
      cfg.startupDone()
      return

    # Perform reset index if we got interrupted in the middle of it
    reset_index.ResetIndex(cfg, only_if_in_progress=1)

    # Record this host as MASTER in the global params.
    try:
      localhost = E.getCrtHostName()
      if not cfg.setGlobalParam('MASTER', localhost):
        logging.fatal("Error setting the MASTER to %s" % localhost)
    except IOError:
      logging.fatal("Couldn't set MASTER -> exiting")

    # Allocate machines for the empty slots
    logging.info("doing machine allocation")
    if not cfg.DoMachineAllocation():
      # Allocation failure is only logged at info level; startup continues.
      logging.info("failed to do machine allocation")

    # In the next install stage (INSTALL) mode we go and finish up the work
    if (install_utilities.GetInitState(cfg.globalParams) ==
        C.CONFIG_FILES_INITIALIZED):
      logging.info("doing system initialization")

      # datadirs are initially made (by the base rpm) to not have any data disks
      # because it doesn't know which disks to use, but the datadir is needed
      # to get things running.
      logging.info("creating datadirs")
      if not cfg.createDataDirs(cfg.getGlobalParam(C.MACHINES)):
        logging.fatal("failed to make datadirs on machines")
        return

      # Create GFS subdirectories
      if cfg.getGlobalParam(C.GFS_CELL):
        logging.info("creating gfs subdirectories")
        if not cfg.createGFSChunkSubDirs():
          logging.fatal("failed creating the gfs subdirectories")
          return

      # ensure initial spelling data is present
      cfg.EnsureSpellingData()

      # create a default collection w/content of mainCrawl / frontend
      logging.info("Creating default collection ...")
      cfg.CreateDefaultCollection()
      logging.info("Creating default frontend ...")
      cfg.CreateDefaultFrontend()
      logging.info("Assiging default collection and frontend ...")
      cfg.AssignDefaultCollAndFront()

      # create some initial files needed by various backend
      logging.info("Creating default backend files ...")
      cfg.CreateDefaultBackendFiles()

      # create built in query expansion entry
      logging.info("Creating default query expansion entry ...")
      cfg.CreateDefaultQueryExpEntry()

      install_utilities.SetInitState(cfg, C.INITIALIZED)
      logging.info("system initialization successful")
      # we start the serve_service as this is fresh install
      logging.info('Starting serve service...')
      RunServeService(cfg.globalParams, 'start')
    else:
      # we restart the babysitter because the servers map may have changed
      logging.info('Babysitting serve service after allocation...')
      RunServeService(cfg.globalParams, 'babysit')

    logging.info("Saving params")
    cfg.saveParams()

    # now we're ready to run; end startup mode
    logging.info("Done with startup")
    cfg.startupDone()

  except Exception, e:
    # Log the full traceback first, then report the failure as fatal.
    (t, v, tb) = sys.exc_info()
    exc_msg = string.join(traceback.format_exception(t, v, tb))
    logging.error(exc_msg)
    logging.fatal("StartupWork failed with exception: %s; %s" % (
      e, traceback.format_tb(tb)))
def change_install_state(self):
  """
  Tries to change the state of the present version to target_state. Returns
  true in case of success. Here is a summary of what it does:
    1. Get list of active nodes
    2. Get list of services to start and stop
    3. In case there is something to start
       a. reconfigure's net on all nodes after verifying quorum
       b. starts core services
    4. Verifies there is a master elected.
    5. Starts thread for each node to start and stop the needed services
    6. Waits for output from each thread
    7. Calculates success or failure based on thread results
    8. Asks each thread to print its status regarding what services it
       actually started or stopped and what was the return code and error
       message if any.

  Returns 0 immediately when the transition is not a safe one; otherwise
  returns the accumulated success value of the steps above.
  """
  if not install_utilities.safe_transition(self.version_,
                                           self.target_state_):
    return 0

  # Remembered up front so the Borgmon section below can compare the state
  # before the transition with the target state.
  current_state = install_utilities.install_state(self.version_)

  start = time.time()
  # make sure svs is running
  svs_utilities.CheckSvsAlive(self.machines_)

  # get list of active nodes
  active_nodes = core_utils.GetLiveNodes(logging, self.retry_)
  ignore = core_utils.GetNodeFailures(core_utils.GetTotalNodes())
  # account for already inactive nodes
  ignore = ignore - (core_utils.GetTotalNodes() - len(active_nodes))
  ver = self.version_
  home = self.enthome_

  # See what we have to start / stop
  services_to_start = install_utilities.state_services_to_start(
    self.target_state_, self.machines_)
  services_to_stop = install_utilities.state_services_to_stop(
    install_utilities.install_state(self.version_), self.machines_)

  # Make some decisions
  total_nodes = len(self.cp_.var('ENT_ALL_MACHINES'))
  onebox = (total_nodes == 1)
  # Core services are only started/stopped on multi-node setups and only
  # when the caller did not restrict the operation to non-core services.
  startcore = services_to_start and not onebox and not self.nonecore_only_
  checkquorum = startcore
  stopcore = (services_to_stop and not onebox and not self.nonecore_only_
              and self.target_state_ == 'INACTIVE')
  doservices = (not self.core_only_
                and (services_to_start or services_to_stop))
  if self.target_state_ in ['INACTIVE']:
    # ent_core does not really know the state. install_manager
    # has to tell ent_core when "makeinactive"
    # NOTE(review): here testver holds an install state string, while the
    # else branch stores a boolean - confirm callees accept both.
    testver = install_utilities.install_state(self.version_)
  else:
    testver = self.target_state_ in ['TEST', 'INSTALL']
  # If it is onebox and target state is INSTALL, do not run reconfigure_net
  # This is to support pre 4.4 version migration code.
  reconfigurenet_enabled = not (onebox and (self.target_state_ == 'INSTALL'))

  # if stop coreonly services, check if none-core components are running
  if (install_utilities.install_state(self.version_) == 'ACTIVE' and
      self.target_state_ == 'INACTIVE' and self.core_only_):
    logging.fatal("cannot stop core services while none core services "\
                  "are running")

  # Execute the decisions
  if checkquorum:
    # We check quorum only when services are to be started.
    # We mainly need quorum for core services. For non core services like
    # crawl, logcontrol etc. we use users specified machines.
    core_utils.VerifyQuorum(active_nodes)

  # check if syslogd.conf and klogd.conf exist
  install_utilities.check_klogd_syslogd_conf(active_nodes, home)

  # Kill any spurious adminrunner/adminconsole processes if we are entering
  # TEST or ACTIVE mode.
  if self.target_state_ in ['TEST', 'ACTIVE']:
    install_utilities.kill_service(['adminrunner', 'adminconsole'],
                                   core_utils.GetNodes(1))

  # reconfigure without restarting gems
  success = 1
  if reconfigurenet_enabled and services_to_start:
    # check if we need to force NTP reconfig if this is to upgrade from 4.4
    force_ntp_reconfig = 0
    if self.target_state_ in ['TEST', 'ACTIVE']:
      last_version = install_utilities.get_latest_version(
        except_for=1)
      if (last_version is None or
          version_utilities.CmpVersions(
            last_version, NEW_NTP_OPTION_GSA_VERSION) > 0):
        force_ntp_reconfig = 1
    success = reconfigurenet_util.doReconfigureNet(
      self.cp_, active_nodes, force_ntp_reconfig=force_ntp_reconfig)
    if not success:
      logging.error('reconfigurenet failed.')

  # if start nonecore services, check if core services are running
  if (not onebox and self.nonecore_only_ and
      self.target_state_ in ['TEST', 'ACTIVE']):
    core_running = install_utilities.is_core_running(ver, home,
                                                     active_nodes,
                                                     ignore=ignore,
                                                     testver=testver)
    if not core_running:
      logging.fatal("cannot start none core services "\
                    "when core services are not running")

  # start core services if needed
  if startcore and success:
    # Retry 3 times for master verification failures
    num_retry = 3
    # it is always OK to reinit core services if the version is in
    # INSTALLED state
    self.reinitok_ = install_utilities.reinit_core_ok(ver, home,
                                                      active_nodes,
                                                      ignore=ignore,
                                                      testver=testver)
    i = 1
    while i <= num_retry:
      # stop core services when retrying
      if i > 1:
        time.sleep(15)
        install_utilities.stop_core(ver, home, active_nodes,
                                    testver=testver)
        time.sleep(15)
      # i is advanced before the attempt, so "i <= num_retry" below means
      # "another attempt is still available".
      i = i + 1
      # Run ent_core --ver=<ver> --activate --gfs=0 through install_utilities.py
      success = install_utilities.start_core(ver, home, active_nodes,
                                             ignore=ignore,
                                             testver=testver,
                                             gfs=0)
      if not success:
        if i <= num_retry:
          logging.error(
            'Error activating core services. Retrying...')
        elif self.reinitok_:
          # it is OK to ignore errors when trying to re-init core services
          install_utilities.reinit_core(ver, home, active_nodes, ignore=1,
                                        testver=testver)
          # Restart the retry budget after the reinit; clearing reinitok_
          # ensures the reinit is attempted at most once.
          i = 1
          self.reinitok_ = None
        else:
          logging.error('Error activating core services.')
      else:
        # Make sure a master has been elected. If we go ahead without
        # verifying the master then it will take very long time for
        # services to be started. Making sure master is elected by now
        # results in very quick adminrunner startup.
        success = verify_master(ver, testver)
        if success:
          if not core_utils.InitDeadNodes(ver, testver, logging) == 0:
            logging.fatal(
              'Error updating dead nodes to the lockserver.')
          break
        if i <= num_retry:
          logging.error(
            'Error verifying the master. Retrying...')
        elif self.reinitok_:
          # it is OK to ignore errors when trying to re-init core services
          install_utilities.reinit_core(ver, home, active_nodes, ignore=1,
                                        testver=testver)
          # Same reset as above: one more full round after the reinit.
          i = 1
          self.reinitok_ = None
        else:
          raise core_utils.EntMasterError, (
            'Error getting current GSA master'
            ' from chubby.')
    # force gsa master on the desired node
    desired_gsa_master_node = core_utils.DesiredMasterNode()
    if desired_gsa_master_node is None:
      logging.fatal('No suitable node to run GSA master')
    logging.info('Forcing %s to become GSA master' %
                 desired_gsa_master_node)
    find_master.ForceMaster(desired_gsa_master_node, testver)

    # make sure the transaction logs are in sync and start gfs
    # (the result replaces the success value left by the retry loop)
    success = install_utilities.start_gfs(ver, home, active_nodes,
                                          ignore=ignore,
                                          testver=testver)

    # make sure gfs master is not the GSA master node
    logging.info('Ensuring %s not to become GFS master' %
                 desired_gsa_master_node)
    gfs_utils.AvoidGFSMasterOnNode(ver, testver, desired_gsa_master_node)

  if doservices and success:
    # One NodeInstallManager thread per machine does the start/stop work.
    node_threads = {}
    for n in self.machines_:
      node_threads[n] = NodeInstallManager(n, self.target_state_,
                                           self.version_,
                                           services_to_start,
                                           services_to_stop)

    # start node threads
    for (n, t) in node_threads.items():
      logging.info('STATUS: Starting thread for %s' % n)
      t.start()

    # wait for threads
    for (n, t) in node_threads.items():
      t.join()
      # A non-zero err_ on any thread makes the whole step fail.
      success = success and (t.err_ == 0)

    for (n, t) in node_threads.items():
      t.print_status()

  if stopcore and success:
    func = lambda: install_utilities.stop_core(
      ver, home, active_nodes, testver=testver)
    success = try_repeatedly(func, success=1)
    if not success:
      logging.error('Error inactivating core services.')

  # Start/Stop Borgmon and Reactor
  if self.cp_.var('ENT_ENABLE_EXTERNAL_BORGMON'):
    enable_external_borgmon = '--enable_external'
  else:
    enable_external_borgmon = '--noenable_external'
  borgmon_cmd = (
    "/export/hda3/%s/local/google3/enterprise/util/borgmon_util.py "
    "--ver %s --logtostderr %s" %
    (self.version_, self.version_, enable_external_borgmon))
  if success and current_state != self.target_state_:
    # 1) Stop Borgmon and Reactor if required
    if current_state in ['SERVE', 'TEST', 'ACTIVE']:
      E.execute(self.machines_,
                "%s --mode %s --stop" % (borgmon_cmd, current_state),
                None, 0)
    # 2) Start Borgmon and Reactor if required
    logging.info("target_state: %s" % self.target_state_)
    if self.target_state_ in ['SERVE', 'TEST', 'ACTIVE']:
      E.execute(
        self.machines_,
        "%s --mode %s --start" % (borgmon_cmd, self.target_state_),
        None, 0)

  # Start/Stop Session Manager only for oneways
  if core_utils.GetTotalNodes() == 1:
    if self.target_state_ in ['SERVE', 'TEST', 'ACTIVE']:
      sessionmanager_util.ActivateSessionManager(ver, testver)
    if self.target_state_ == 'INACTIVE' and success:
      sessionmanager_util.DeactivateSessionManager(ver, testver)

  # Kill any spurious adminrunner/adminconsole processes if we are entering
  # INACTIVE or SERVE mode.
  if self.target_state_ in ['SERVE', 'INACTIVE']:
    install_utilities.kill_service(['adminrunner', 'adminconsole'],
                                   core_utils.GetNodes(1))

  if self.target_state_ == 'INACTIVE' and success and not self.nonecore_only_:
    install_utilities.InactivateCleanup(ver, home, active_nodes)

  end = time.time()
  diff = (end - start) / 60
  logging.info("STAT: change_install_state took %.2f minutes." % diff)
  return success
def CheckPywrapfile():
  """Verifies pywrapfile works by probing /etc/hostname for readability.

  Logs fatal when the probe reports the file as not readable.
  """
  if pywrapfile.File.Readable('/etc/hostname'):
    return
  logging.fatal('Pywrapfile not working. '
                'Make sure BUILD dep //file/localfile is present.')
def EnsureSpellingData(self, reset = 0):
  """
  This ensures that initial spelling data is present.
  If reset is set we clear ENT_SPELL_SERVING_ID and revert
  files to initial state.

  The copy is only performed when ENT_SPELL_SERVING_ID exists and is 0;
  otherwise the check is skipped with an info message. Logs fatal when
  ENTERPRISE_HOME is missing or when the number of spelling files in the
  source and target directories differ after the copy.
  """
  logging.info("ensuring presence of initial spelling data")
  serving_id_cfg_name = 'ENT_SPELL_SERVING_ID'

  # if reset is set - blow away runtime dictionary version. (this is
  # useful after index has been reset).
  if self.hasGlobalParam(serving_id_cfg_name) and (reset == 1):
    self.setGlobalParam(serving_id_cfg_name, 0);

  if (self.hasGlobalParam(serving_id_cfg_name)) and \
     (self.getGlobalParam(serving_id_cfg_name) == 0):
    # Build the common fileutil flags from whichever params are configured.
    fileutil_args = ""
    if self.hasGlobalParam('GFS_ALIASES'):
      fileutil_args = "--gfs_aliases='%s'" % \
                      self.getGlobalParam('GFS_ALIASES')
      fileutil_args += " --bnsresolver_use_svelte=false"
    if self.hasGlobalParam('DATADIR'):
      fileutil_args = "%s --datadir=%s" % \
                      (fileutil_args, self.getGlobalParam('DATADIR'))
    # note: assumes that the parent of spell_root exists
    spell_root = self.getGlobalParam('ENT_SPELL_ROOT_DIR')
    # Strip a single trailing slash from spell_root.
    end = len(spell_root) - 1
    if spell_root[end] == '/':
      spell_root = spell_root[0:end]
    # NOTE(review): target_path is built from the raw config value, not the
    # stripped spell_root, so a trailing slash yields "//spell-0" - confirm
    # this is intended.
    target_path = "%s/spell-0" % \
                  self.getGlobalParam('ENT_SPELL_ROOT_DIR')
    # NOTE(review): the first two calls pass the same directory; the second
    # looks redundant - confirm before removing.
    self.EnsureDirectory(fileutil_args, spell_root)
    self.EnsureDirectory(fileutil_args, "%s" % spell_root)
    self.EnsureDirectory(fileutil_args, "%s" % target_path)
    logging.info("ensuring files")
    if not self.hasGlobalParam('ENTERPRISE_HOME'):
      logging.fatal("No ENTERPRISE_HOME config parameter")
      return
    src_path = "%s/../spelling-data/runtime" % \
               self.getGlobalParam('ENTERPRISE_HOME')
    # Shell loop: copy every *.spelling.* file from src_path to target_path
    # with "fileutil -f cp".
    cmnd = "(cd %s ; " % src_path
    cmnd = cmnd + "for f in *.spelling.* ; "
    cmnd = cmnd + "do fileutil %s -f cp %s/$f %s/$f; done)" % \
           (fileutil_args, src_path, target_path)
    res = E.exe(cmnd)
    logging.info("Result of command %s is %d" % (cmnd, res) )
    # ensure spelling data is present
    # (success is judged by comparing file counts, not by the exit code)
    num_src_files = self.CountSpellingFiles(fileutil_args, src_path)
    logging.info("There are %d spelling files in the source directory" % \
                 num_src_files)
    num_target_files = \
                     self.CountSpellingFiles(fileutil_args, target_path)
    logging.info("There are %d spelling files in the target directory"% \
                 num_target_files)
    if num_src_files == num_target_files:
      logging.info("spelling data present")
    else:
      logging.fatal("failed to ensure presence of spelling data")
      return
  else:
    logging.info("no config param %s, or it's not 0" % serving_id_cfg_name)
    logging.info("skipping spelling data check")
def exe_or_fail(cmd, verbose = 0):
  """Runs cmd through exe() and logs fatal if it does not return ERR_OK.

  @param cmd - the command to execute.
  @param verbose - passed through to exe().
  """
  status = exe(cmd, verbose = verbose)
  if status == ERR_OK:
    return
  logging.fatal("Error code [%s] while executing [%s]" % (status, cmd))
def change_install_state(self):
    """Tries to change the state of the present version to self.target_state_.

    Returns a true value in case of success and a false value otherwise
    (0 right away when install_utilities.safe_transition() rejects the
    transition).

    Here is a summary of what it does:
      1. Gets the list of active nodes.
      2. Gets the lists of services to start and to stop.
      3. In case there is something to start:
         a. reconfigures net on all nodes after verifying quorum
         b. starts core services
      4. Verifies there is a master elected.
      5. Starts a thread for each node to start and stop the needed services.
      6. Waits for each thread to finish.
      7. Calculates success or failure based on the thread results.
      8. Asks each thread to print its status regarding what services it
         actually started or stopped and what was the return code and error
         message, if any.
    """
    if not install_utilities.safe_transition(self.version_,
                                             self.target_state_):
        return 0

    # Remember the state we start from; used at the end to decide whether
    # Borgmon/Reactor need to be stopped for the old mode.
    current_state = install_utilities.install_state(self.version_)

    # Wall-clock start, only used for the timing log line at the end.
    start = time.time()
    # make sure svs is running
    svs_utilities.CheckSvsAlive(self.machines_)

    # get list of active nodes
    active_nodes = core_utils.GetLiveNodes(logging, self.retry_)
    ignore = core_utils.GetNodeFailures(core_utils.GetTotalNodes())
    # account for already inactive nodes
    ignore = ignore - (core_utils.GetTotalNodes() - len(active_nodes))

    # Short aliases used in the many install_utilities calls below.
    ver = self.version_
    home = self.enthome_

    # See what we have to start / stop
    services_to_start = install_utilities.state_services_to_start(
        self.target_state_, self.machines_)
    services_to_stop = install_utilities.state_services_to_stop(
        install_utilities.install_state(self.version_), self.machines_)

    # Make some decisions
    total_nodes = len(self.cp_.var('ENT_ALL_MACHINES'))
    onebox = (total_nodes == 1)
    startcore = services_to_start and not onebox and not self.nonecore_only_
    checkquorum = startcore
    stopcore = (services_to_stop and not onebox and not self.nonecore_only_
                and self.target_state_ == 'INACTIVE')
    doservices = (not self.core_only_
                  and (services_to_start or services_to_stop))
    if self.target_state_ in ['INACTIVE']:
        # ent_core does not really know the state. install_manager
        # has to tell ent_core when "makeinactive"
        # NOTE(review): this branch stores the install state value itself in
        # testver, while the else branch stores a boolean - confirm that the
        # consumers of testver expect this.
        testver = install_utilities.install_state(self.version_)
    else:
        testver = self.target_state_ in ['TEST', 'INSTALL']
    # If it is onebox and target state is INSTALL, do not run reconfigure_net
    # This is to support pre 4.4 version migration code.
    reconfigurenet_enabled = not (onebox and
                                  (self.target_state_ == 'INSTALL'))

    # if stop coreonly services, check if none-core components are running
    if (install_utilities.install_state(self.version_) == 'ACTIVE' and
            self.target_state_ == 'INACTIVE' and self.core_only_):
        logging.fatal("cannot stop core services while none core services "\
                      "are running")

    # Execute the decisions
    if checkquorum:
        # We check quorum only when services are to be started.
        # We mainly need quorum for core services. For non core services like
        # crawl, logcontrol etc. we use users specified machines.
        core_utils.VerifyQuorum(active_nodes)

    # check if syslogd.conf and klogd.conf exist
    install_utilities.check_klogd_syslogd_conf(active_nodes, home)

    # Kill any spurious adminrunner/adminconsole processes if we are entering
    # TEST or ACTIVE mode.
    if self.target_state_ in ['TEST', 'ACTIVE']:
        install_utilities.kill_service(['adminrunner', 'adminconsole'],
                                       core_utils.GetNodes(1))

    # reconfigure without restarting gems
    success = 1
    if reconfigurenet_enabled and services_to_start:
        # check if we need to force NTP reconfig if this is to upgrade from 4.4
        force_ntp_reconfig = 0
        if self.target_state_ in ['TEST', 'ACTIVE']:
            last_version = install_utilities.get_latest_version(except_for=1)
            if (last_version is None or
                    version_utilities.CmpVersions(last_version,
                        NEW_NTP_OPTION_GSA_VERSION) > 0):
                force_ntp_reconfig = 1
        success = reconfigurenet_util.doReconfigureNet(self.cp_,
                      active_nodes,
                      force_ntp_reconfig=force_ntp_reconfig)
        if not success:
            logging.error('reconfigurenet failed.')

    # if start nonecore services, check if core services are running
    if (not onebox and self.nonecore_only_ and
            self.target_state_ in ['TEST', 'ACTIVE']):
        core_running = install_utilities.is_core_running(ver, home,
                           active_nodes, ignore=ignore, testver=testver)
        if not core_running:
            logging.fatal("cannot start none core services "\
                          "when core services are not running")

    # start core services if needed
    if startcore and success:
        # Retry 3 times for master verification failures
        num_retry = 3
        # it is always OK to reinit core services if the version is in
        # INSTALLED state
        self.reinitok_ = install_utilities.reinit_core_ok(ver, home,
                             active_nodes, ignore=ignore, testver=testver)
        # i is the 1-based attempt counter. It is bumped before the attempt
        # runs, so "i <= num_retry" below means "another attempt is left".
        i = 1
        while i <= num_retry:
            # stop core services when retrying
            if i > 1:
                time.sleep(15)
                install_utilities.stop_core(ver, home, active_nodes,
                                            testver=testver)
                time.sleep(15)
            i = i + 1
            # Run ent_core --ver=<ver> --activate --gfs=0 through
            # install_utilities.py
            success = install_utilities.start_core(ver, home, active_nodes,
                                                   ignore=ignore,
                                                   testver=testver,
                                                   gfs=0)
            if not success:
                if i <= num_retry:
                    logging.error('Error activating core services. Retrying...')
                elif self.reinitok_:
                    # it is OK to ignore errors when trying to re-init core
                    # services
                    install_utilities.reinit_core(ver, home, active_nodes,
                                                  ignore=1, testver=testver)
                    # Restart the attempt counter after the reinit and clear
                    # the flag so the reinit is done at most once.
                    i = 1
                    self.reinitok_ = None
                else:
                    logging.error('Error activating core services.')
            else:
                # Make sure a master has been elected. If we go ahead without
                # verifying the master then it will take very long time for
                # services to be started. Making sure master is elected by now
                # results in very quick adminrunner startup.
                success = verify_master(ver, testver)
                if success:
                    if not core_utils.InitDeadNodes(ver, testver,
                                                    logging) == 0:
                        logging.fatal('Error updating dead nodes to the lockserver.')
                    break
                if i <= num_retry:
                    logging.error('Error verifying the master. Retrying...')
                elif self.reinitok_:
                    # it is OK to ignore errors when trying to re-init core
                    # services
                    install_utilities.reinit_core(ver, home, active_nodes,
                                                  ignore=1, testver=testver)
                    i = 1
                    self.reinitok_ = None
                else:
                    raise core_utils.EntMasterError, ('Error getting current GSA master'
                                                      ' from chubby.')
        # force gsa master on the desired node
        desired_gsa_master_node = core_utils.DesiredMasterNode()
        if desired_gsa_master_node is None:
            logging.fatal('No suitable node to run GSA master')
        logging.info('Forcing %s to become GSA master' %
                     desired_gsa_master_node)
        find_master.ForceMaster(desired_gsa_master_node, testver)

        # make sure the transaction logs are in sync and start gfs
        success = install_utilities.start_gfs(ver, home, active_nodes,
                                              ignore=ignore,
                                              testver=testver)

        # make sure gfs master is not the GSA master node
        logging.info('Ensuring %s not to become GFS master' %
                     desired_gsa_master_node)
        gfs_utils.AvoidGFSMasterOnNode(ver, testver, desired_gsa_master_node)

    if doservices and success:
        # One NodeInstallManager per machine, keyed by machine name.
        node_threads = {}
        for n in self.machines_:
            node_threads[n] = NodeInstallManager(n, self.target_state_,
                                                 self.version_,
                                                 services_to_start,
                                                 services_to_stop)

        # start node threads
        for (n, t) in node_threads.items():
            logging.info('STATUS: Starting thread for %s' % n)
            t.start()

        # wait for threads
        for (n,t) in node_threads.items():
            t.join()
            # Overall success requires every node to finish with err_ == 0.
            success = success and (t.err_ == 0)

        for (n,t) in node_threads.items():
            t.print_status()

    if stopcore and success:
        func = lambda: install_utilities.stop_core(ver, home, active_nodes,
                                                   testver=testver)
        success = try_repeatedly(func, success=1)
        if not success:
            logging.error('Error inactivating core services.')

    # Start/Stop Borgmon and Reactor
    if self.cp_.var('ENT_ENABLE_EXTERNAL_BORGMON'):
        enable_external_borgmon = '--enable_external'
    else:
        enable_external_borgmon = '--noenable_external'
    borgmon_cmd = (
        "/export/hda3/%s/local/google3/enterprise/util/borgmon_util.py "
        "--ver %s --logtostderr %s" %
        (self.version_, self.version_, enable_external_borgmon))
    # Only touch Borgmon/Reactor when the state actually changes.
    if success and current_state != self.target_state_:
        # 1) Stop Borgmon and Reactor if required
        if current_state in ['SERVE', 'TEST', 'ACTIVE']:
            E.execute(self.machines_,
                      "%s --mode %s --stop" % (borgmon_cmd, current_state),
                      None, 0)
        # 2) Start Borgmon and Reactor if required
        logging.info("target_state: %s" % self.target_state_)
        if self.target_state_ in ['SERVE', 'TEST', 'ACTIVE']:
            E.execute(self.machines_,
                      "%s --mode %s --start" % (borgmon_cmd,
                                                self.target_state_),
                      None, 0)

    # Start/Stop Session Manager only for oneways
    if core_utils.GetTotalNodes() == 1:
        if self.target_state_ in ['SERVE', 'TEST', 'ACTIVE']:
            sessionmanager_util.ActivateSessionManager(ver, testver)
        if self.target_state_ == 'INACTIVE' and success:
            sessionmanager_util.DeactivateSessionManager(ver, testver)

    # Kill any spurious adminrunner/adminconsole processes if we are entering
    # INACTIVE or SERVE mode.
    if self.target_state_ in ['SERVE', 'INACTIVE']:
        install_utilities.kill_service(['adminrunner', 'adminconsole'],
                                       core_utils.GetNodes(1))

    if self.target_state_ == 'INACTIVE' and success and not self.nonecore_only_:
        install_utilities.InactivateCleanup(ver, home, active_nodes)

    # Log the elapsed time in minutes.
    end = time.time()
    diff = (end - start)/60
    logging.info("STAT: change_install_state took %.2f minutes." % diff)
    return success
def main(argv):
    """Collects the gws logs and creates the log report for one client.

    Exits with the module usage text when fewer than ten arguments are given,
    when the configuration cannot be loaded, or when the date range cannot be
    parsed. Otherwise the process exits with the CreateLogReport result.

    @param argv - positional arguments, in order: enthome/config, client,
                  date range fields joined by '_', withResults, topCount,
                  diagnosticTerms, html_file, valid_file, new_html_file,
                  new_valid_file.
    """
    if len(argv) < 10:
        sys.exit(__doc__)

    config = entconfig.EntConfig(argv[0])
    if not config.Load():
        sys.exit(__doc__)

    script_flags = [
        'foo',
        '--gfs_aliases=%s' % config.var("GFS_ALIASES"),
        '--bnsresolver_use_svelte=false',
        '--logtostderr',
    ]
    pywrapbase.InitGoogleScript('', script_flags, 0)
    gfile.Init()

    # The length check above guarantees that this slice has nine entries.
    (client, date_spec, withResults, topCount, diagnosticTerms,
     html_file, valid_file, new_html_file, new_valid_file) = argv[1:10]

    date_fields = date_spec.split('_')
    date_range = liblog.ParseDateRange(date_fields[0], date_fields[1:])
    if not date_range:
        sys.exit(__doc__)

    first_date, last_date, printable_date, file_date = date_range
    if last_date.as_int() < first_date.as_int():
        logging.fatal('invalid date range')

    gws_log_dir = liblog.get_gws_log_dir(config)
    collect_dir = liblog.get_collect_dir(config)
    partition_dir = liblog.get_partition_dir(config)
    directory_map_file = liblog.get_directory_map_file(config)

    # Collect logs first from all gws nodes and preprocess
    # logs to make sure logs are up to date.
    collect_logs.CollectLogs(config.var('MACHINES'), gws_log_dir, collect_dir)
    preprocess_logs.PartitionLogs(config)

    # note that collection (client) has been factored into gwslog_dir.
    gws_logs = liblog.FindClientLogFiles(partition_dir, directory_map_file,
                                         client, first_date, last_date)

    result = CreateLogReport(config, printable_date, gws_logs,
                             config.var('MAIN_GOOGLE3_DIR'),
                             withResults, topCount, diagnosticTerms,
                             html_file, valid_file,
                             new_html_file, new_valid_file)
    if result == liblog.FAILURE:
        logging.error('CreateLogReport Failed')

    sys.exit(result)
def exe_or_fail(cmd, verbose=0):
    """Executes cmd via exe(); any status other than ERR_OK is logged as fatal.

    @param cmd - the command to execute.
    @param verbose - passed through to exe() as-is.
    """
    error_code = exe(cmd, verbose=verbose)
    if error_code == ERR_OK:
        return
    logging.fatal("Error code [%s] while executing [%s]" % (error_code, cmd))
def EnsureSpellingData(self, reset=0):
    """Ensures that the initial spelling data is present.

    If reset is 1 and ENT_SPELL_SERVING_ID exists, that parameter is first set
    back to 0. The data is then copied only while ENT_SPELL_SERVING_ID exists
    and equals 0: every *.spelling.* file found under
    <ENTERPRISE_HOME>/../spelling-data/runtime is copied with fileutil into
    <ENT_SPELL_ROOT_DIR>/spell-0, and the number of spelling files in the
    source and target directories is compared. Failures are reported through
    logging.fatal.

    @param reset - 1 to clear ENT_SPELL_SERVING_ID and revert the files to
                   their initial state; any other value leaves it untouched.
    """
    logging.info("ensuring presence of initial spelling data")
    serving_id_cfg_name = 'ENT_SPELL_SERVING_ID'

    # if reset is set - blow away runtime dictionary version. (this is
    # useful after index has been reset).
    if self.hasGlobalParam(serving_id_cfg_name) and (reset == 1):
        self.setGlobalParam(serving_id_cfg_name, 0)

    if not (self.hasGlobalParam(serving_id_cfg_name) and
            self.getGlobalParam(serving_id_cfg_name) == 0):
        logging.info("no config param %s, or it's not 0" % serving_id_cfg_name)
        logging.info("skipping spelling data check")
        return

    fileutil_args = ""
    if self.hasGlobalParam('GFS_ALIASES'):
        fileutil_args = ("--gfs_aliases='%s'" %
                         self.getGlobalParam('GFS_ALIASES'))
    # NOTE(review): this flag is appended whether or not GFS_ALIASES is set;
    # confirm that matches the intended nesting.
    fileutil_args += " --bnsresolver_use_svelte=false"
    if self.hasGlobalParam('DATADIR'):
        fileutil_args = "%s --datadir=%s" % (fileutil_args,
                                             self.getGlobalParam('DATADIR'))

    # note: assumes that the parent of spell_root exists
    spell_root = self.getGlobalParam('ENT_SPELL_ROOT_DIR')
    # endswith() also copes with an empty value, where indexing the last
    # character would raise IndexError.
    if spell_root.endswith('/'):
        spell_root = spell_root[:-1]
    # Derive the target from the normalized root so that a trailing '/' in
    # ENT_SPELL_ROOT_DIR does not produce '<root>//spell-0'.
    target_path = "%s/spell-0" % spell_root
    self.EnsureDirectory(fileutil_args, spell_root)
    self.EnsureDirectory(fileutil_args, target_path)

    logging.info("ensuring files")
    if not self.hasGlobalParam('ENTERPRISE_HOME'):
        logging.fatal("No ENTERPRISE_HOME config parameter")
        return
    src_path = ("%s/../spelling-data/runtime" %
                self.getGlobalParam('ENTERPRISE_HOME'))
    # Copy each spelling file individually; -f overwrites existing targets.
    cmnd = ("(cd %s ; for f in *.spelling.* ; "
            "do fileutil %s -f cp %s/$f %s/$f; done)" %
            (src_path, fileutil_args, src_path, target_path))
    res = E.exe(cmnd)
    logging.info("Result of command %s is %d" % (cmnd, res))

    # ensure spelling data is present
    num_src_files = self.CountSpellingFiles(fileutil_args, src_path)
    logging.info("There are %d spelling files in the source directory" %
                 num_src_files)
    num_target_files = self.CountSpellingFiles(fileutil_args, target_path)
    logging.info("There are %d spelling files in the target directory" %
                 num_target_files)
    if num_src_files == num_target_files:
        logging.info("spelling data present")
    else:
        logging.fatal("failed to ensure presence of spelling data")
sys.exit('Cannot load configuration.') pywrapbase.InitGoogleScript('', [ 'foo', '--gfs_aliases=%s' % ent_config.var('GFS_ALIASES'), '--bnsresolver_use_svelte=false', '--logtostderr', '--minloglevel=1' ], 0) gfile.Init() begin = time.time() cfg = configurator.configurator(FLAGS.enthome) if FLAGS.do_import: logging.info("initializing global default files") if not cfg.globalParams.InitGlobalDefaultFiles(overwrite=0): logging.fatal("failed to initialize global default files") config_xml_serialization.ImportConfiguration(cfg, open(FLAGS.xml_path)) if FLAGS.do_export: f = open(FLAGS.xml_path, 'w') f.write(config_xml_serialization.ExportConfiguration(cfg)) f.close() diff = (time.time() - begin) / 60 logging.info('Migration operation took %.2f minutes.' % diff) if __name__ == '__main__': main(sys.argv)
def main(argv):
    """Collects the gws logs and creates the log report for one client.

    Exits with the module usage text when fewer than ten arguments are given,
    when the configuration cannot be loaded, or when the date range cannot be
    parsed. Otherwise the process exits with the CreateLogReport result.

    @param argv - positional arguments, in order: enthome/config, client,
                  date range fields joined by '_', withResults, topCount,
                  diagnosticTerms, html_file, valid_file, new_html_file,
                  new_valid_file.
    """
    if len(argv) < 10:
        sys.exit(__doc__)

    config = entconfig.EntConfig(argv[0])
    if not config.Load():
        sys.exit(__doc__)

    script_flags = [
        'foo',
        '--gfs_aliases=%s' % config.var("GFS_ALIASES"),
        '--bnsresolver_use_svelte=false',
        '--logtostderr',
    ]
    pywrapbase.InitGoogleScript('', script_flags, 0)
    gfile.Init()

    # The length check above guarantees that this slice has nine entries.
    (client, date_spec, withResults, topCount, diagnosticTerms,
     html_file, valid_file, new_html_file, new_valid_file) = argv[1:10]

    date_fields = date_spec.split('_')
    date_range = liblog.ParseDateRange(date_fields[0], date_fields[1:])
    if not date_range:
        sys.exit(__doc__)

    first_date, last_date, printable_date, file_date = date_range
    if last_date.as_int() < first_date.as_int():
        logging.fatal('invalid date range')

    gws_log_dir = liblog.get_gws_log_dir(config)
    collect_dir = liblog.get_collect_dir(config)
    partition_dir = liblog.get_partition_dir(config)
    directory_map_file = liblog.get_directory_map_file(config)

    # Collect logs first from all gws nodes and preprocess
    # logs to make sure logs are up to date.
    collect_logs.CollectLogs(config.var('MACHINES'), gws_log_dir, collect_dir)
    preprocess_logs.PartitionLogs(config)

    # note that collection (client) has been factored into gwslog_dir.
    gws_logs = liblog.FindClientLogFiles(partition_dir, directory_map_file,
                                         client, first_date, last_date)

    result = CreateLogReport(config, printable_date, gws_logs,
                             config.var('MAIN_GOOGLE3_DIR'),
                             withResults, topCount, diagnosticTerms,
                             html_file, valid_file,
                             new_html_file, new_valid_file)
    if result == liblog.FAILURE:
        logging.error('CreateLogReport Failed')

    sys.exit(result)
def RegisterTypeHandler(self, reqtype, command_info):
    """Registers the handler description for a request type.

    @param reqtype - request type; used as the key in self._command_info.
    @param command_info - mapping describing the handler; it must contain
                          the COMMAND key.
    @raise RuntimeError - if command_info has no COMMAND entry.
    """
    # The `in` operator replaces the deprecated dict.has_key(), which no
    # longer exists in Python 3.
    if COMMAND not in command_info:
        logging.fatal('%s invalid registration, no command specified' % reqtype)
        raise RuntimeError('%s registration failed, missing command' % reqtype)
    self._command_info[reqtype] = command_info
ent_config = entconfig.EntConfig(FLAGS.enthome) if not ent_config.Load(): sys.exit('Cannot load configuration.') pywrapbase.InitGoogleScript('', ['foo', '--gfs_aliases=%s' % ent_config.var('GFS_ALIASES'), '--bnsresolver_use_svelte=false', '--logtostderr', '--minloglevel=1'], 0) gfile.Init() begin = time.time() cfg = configurator.configurator(FLAGS.enthome) if FLAGS.do_import: logging.info("initializing global default files") if not cfg.globalParams.InitGlobalDefaultFiles(overwrite=0): logging.fatal("failed to initialize global default files") config_xml_serialization.ImportConfiguration(cfg, open(FLAGS.xml_path)) if FLAGS.do_export: f = open(FLAGS.xml_path, 'w') f.write(config_xml_serialization.ExportConfiguration(cfg)) f.close() diff = (time.time() - begin ) / 60 logging.info('Migration operation took %.2f minutes.' % diff) if __name__ == '__main__': main(sys.argv)