def _RemoveTopLevelDirs(cfg, dir_root, gfs_aliases=None, datadir=None,
                        unrecoverable_dirs=None):
  """Remove top level dirs/files under dir_root one by one in a loop.

  Parameters:
    cfg: entconfig.EntConfig(entHome, 1)
    dir_root: '/gfs/ent/'
    gfs_aliases: 'ent=master.ent.gfs.ent4-6-0-G-27.ls.google.com:3830'
      -- for gfs only
    datadir: '/export/hda3/4.6.0.G.27/data/enterprise-data'
      -- for bigfile only
    unrecoverable_dirs: None or [] -- introduced for unittests.
  Returns:
    Dirs under dir_root remain to be removed
  """
  # Bug fix: the default used to be a mutable [] literal.  The list is
  # forwarded to _TopLevelDirsToRemove, which mutates it, so the shared
  # default object accumulated entries across calls.  Use a None sentinel.
  if unrecoverable_dirs is None:
    unrecoverable_dirs = []

  def _RemoveDir(dir, cmd_args):
    """Remove one dir with fileutil.

    First do "rm" without the "-a" flag. If it does not work, try
    the "-a" flag.
    Args:
      dir: '/gfs/ent/base-indexer000-of-003/global-anchor000-of-003'
      cmd_args: '--gfs_aliases=ent=master.ent.gfs.ent4-6-0-G-27.ls.'
                'google.com:3830'
    Return:
      1 - removed successfully
      0 - otherwise
    """
    fileutil_args = ['', '-a']
    for fileutil_arg in fileutil_args:
      cmd = ('fileutil %s rm %s -R -f %s' % (cmd_args, fileutil_arg, dir))
      if _ExecuteCommand(cmd, error_filter=_NoMatchOk) == 0:
        return 1
    return 0

  top_level_dirs = _TopLevelDirsToRemove(cfg, dir_root,
                                         gfs_aliases=gfs_aliases,
                                         datadir=datadir,
                                         unrecoverable_dirs=unrecoverable_dirs)
  # 'fileutil ls' failed, sacrifice unrecoverable_dirs, try removing all
  if len(top_level_dirs) == 0:
    top_level_dirs = ['%s*' % dir_root]
  max_tries = 3
  count = 0
  cmd_args = _ComposeFileutilArgs(gfs_aliases, datadir)
  # retry until no dir not removed or hit max_tries
  while len(top_level_dirs) > 0 and count < max_tries:
    count += 1
    dirs_not_removed = []
    for dir in top_level_dirs:
      if _RemoveDir(dir, cmd_args) == 0:
        dirs_not_removed.append(dir)
    top_level_dirs = dirs_not_removed
  logging.flush()
  return top_level_dirs
def _Reactivate(cfg, version):
  """Bring the serve service back up after a reset.

  Starts serve_service, then activates it on every node -- a master
  switch may have left more than one node deactivated.

  Returns '' or error string.
  """
  logging.info('Reset Index: Reactivate')
  logging.flush()
  error = ''
  if _RunServeCmd(cfg, version, 'start'):
    error = 'Startup failed.'
  # Activate on all nodes regardless of whether startup succeeded.
  if _RunServeCmd(cfg, version, 'activate', allnodes=1):
    error = 'Activate failed.'
  return error
def RunCommand(command): """Simple command wrapper that checks exit status and logs/raises error if the command fails. Logs and times commands that succeed. Args: command - string holding a command to run Returns: None """ start_time = time.time() if os.system(command): logging.error('Command failure: %s' % command) raise CommandError, 'cannot complete command: %s' % command logging.info('Command successful (took %f sec): %s' % ((time.time()-start_time), command)) logging.flush()
def RunCommand(command): """Simple command wrapper that checks exit status and logs/raises error if the command fails. Logs and times commands that succeed. Args: command - string holding a command to run Returns: None """ start_time = time.time() if os.system(command): logging.error('Command failure: %s' % command) raise CommandError, 'cannot complete command: %s' % command logging.info('Command successful (took %f sec): %s' % ((time.time() - start_time), command)) logging.flush()
def _Inactivate(cfg, version):
  """Take serve_service down on all nodes ahead of an index reset.

  Return: '' for success, error for failure
  """
  logging.info('Reset Index: Inactivate')
  logging.flush()
  if _RunServeCmd(cfg, version, 'deactivate', allnodes=1):
    return 'Deactivate failure.'
  if _RunServeCmd(cfg, version, 'stop', allnodes=1):
    return 'Stop failure.'
  # Give the servers a moment to wind down before continuing.
  time.sleep(60)
  return ''
def _ClearIndex(cfg, version):
  """Clear the index directories in GFS/bigfiles.

  Deletes, in order: local urltracker data, recoverable GFS top-level
  dirs, bigfile top-level dirs, and local (non-gfs) spelling data,
  stopping at the first failure.

  Return: '' for success, error for failure
  """
  logging.info('Reset Index: ClearIndex')
  logging.flush()

  # Delete (local) urltracker data on oneway.
  urltracker_dir = cfg.getGlobalParam('URLTRACKER_DIRECTORY')
  if not os.access(urltracker_dir, os.R_OK):
    logging.info('No local urltracker data to delete')
  else:
    logging.info('Deleting local urltracker directory: %s' % (urltracker_dir))
    if _ExecuteCommand('rm -R -f %s' % (urltracker_dir),
                       machines=cfg.getGlobalParam(C.MACHINES)):
      return 'File removal failed.'

  if cfg.getGlobalParam(C.GFS_CELL):
    logging.info('Deleting GFS files')
    aliases = core_utils.GetGFSAliases(version,
                                       install_utilities.is_test(version))
    leftovers = _RemoveTopLevelDirs(cfg, '/gfs/ent/', gfs_aliases=aliases)
    if len(leftovers) > 0:
      return 'Shared file removal failed.'

  logging.info('Deleting bigfiles')
  datadir = '%s/data/enterprise-data' % cfg.getGlobalParam('ENTERPRISE_HOME')
  leftovers = _RemoveTopLevelDirs(cfg, '/bigfile/', datadir=datadir)
  if len(leftovers) > 0:
    return 'File removal failed.'

  # delete spelling data on oneway:
  spell_root = cfg.getGlobalParam('ENT_SPELL_ROOT_DIR')
  if spell_root[-1] == '/':
    spell_root = spell_root[:-1]
  if not os.access(spell_root, os.R_OK):
    logging.info('No local (non-gfs) spelling data to delete')
  else:
    logging.info('Deleting local (non-gfs) spelling data')
    if _ExecuteCommand('rm -R -f %s' % spell_root,
                       machines=cfg.getGlobalParam(C.MACHINES)):
      return 'File removal failed.'
  return ''
def _ClearIndex(cfg, version):
  """Clear the index directories in GFS/bigfiles.

  Works through four removal phases -- local urltracker data, GFS
  top-level dirs, bigfile top-level dirs, local spelling data -- and
  bails out with an error string as soon as one phase fails.

  Return: '' for success, error for failure
  """
  logging.info('Reset Index: ClearIndex')
  logging.flush()

  # Delete (local) urltracker data on oneway.
  urltracker_dir = cfg.getGlobalParam('URLTRACKER_DIRECTORY')
  if os.access(urltracker_dir, os.R_OK):
    logging.info('Deleting local urltracker directory: %s' % (urltracker_dir))
    if _ExecuteCommand('rm -R -f %s' % (urltracker_dir),
                       machines=cfg.getGlobalParam(C.MACHINES)):
      return 'File removal failed.'
  else:
    logging.info('No local urltracker data to delete')

  if cfg.getGlobalParam(C.GFS_CELL):
    logging.info('Deleting GFS files')
    gfs_aliases = core_utils.GetGFSAliases(version,
                                           install_utilities.is_test(version))
    remaining = _RemoveTopLevelDirs(cfg, '/gfs/ent/', gfs_aliases=gfs_aliases)
    if len(remaining) > 0:
      return 'Shared file removal failed.'

  logging.info('Deleting bigfiles')
  datadir = '%s/data/enterprise-data' % cfg.getGlobalParam('ENTERPRISE_HOME')
  remaining = _RemoveTopLevelDirs(cfg, '/bigfile/', datadir=datadir)
  if len(remaining) > 0:
    return 'File removal failed.'

  # delete spelling data on oneway:
  spell_root = cfg.getGlobalParam('ENT_SPELL_ROOT_DIR')
  if spell_root[-1] == '/':
    spell_root = spell_root[:-1]
  if os.access(spell_root, os.R_OK):
    logging.info('Deleting local (non-gfs) spelling data')
    if _ExecuteCommand('rm -R -f %s' % spell_root,
                       machines=cfg.getGlobalParam(C.MACHINES)):
      return 'File removal failed.'
  else:
    logging.info('No local (non-gfs) spelling data to delete')
  return ''
def _ReinitializeIndex(cfg, _):
  """Recreate the baseline index state after a wipe.

  Rebuilds the GFS chunk subdirectories, the default backend files,
  and the initial spelling data.

  Returns '' or error string.
  """
  logging.info('Reset Index: ReinitializeIndex')
  logging.flush()
  error = ''
  # Create GFS subdirectories
  if cfg.getGlobalParam(C.GFS_CELL):
    logging.info('creating gfs subdirectories')
    if not cfg.createGFSChunkSubDirs(reset_index=1):
      error = 'Failed creating new shared files'
  # create some initial files needed by various backend
  logging.info('Creating default backend files ...')
  cfg.CreateDefaultBackendFiles()
  # ensure initial spelling data is present
  cfg.EnsureSpellingData(reset=1)
  return error
def _ReinitializeIndex(cfg, _):
  """Reinitialize the index directories in GFS/bigfiles.

  The second argument is unused (kept for the reset-step call
  signature).

  Returns '' or error string.
  """
  logging.info('Reset Index: ReinitializeIndex')
  logging.flush()
  status = ''
  # Rebuild the per-cell GFS chunk subdirectories first.
  if cfg.getGlobalParam(C.GFS_CELL):
    logging.info('creating gfs subdirectories')
    if not cfg.createGFSChunkSubDirs(reset_index=1):
      status = 'Failed creating new shared files'
  # Seed the files the various backends expect to exist.
  logging.info('Creating default backend files ...')
  cfg.CreateDefaultBackendFiles()
  # Make sure the initial spelling data is present again.
  cfg.EnsureSpellingData(reset=1)
  return status
def _TopLevelDirsToRemove(cfg, dir_root, gfs_aliases=None, datadir=None,
                          unrecoverable_dirs=None):
  """Top level dirs/files under dir_root to be removed.

  Trying to find out all the dirs/files under dir_root to be removed,
  given unrecoverable_dirs.
  Note: fileutil has different behavior for gfs, bigfile, and local
  files. Sometimes, the "fileutil ls" will give errors without the "-a"
  flag. However, with the "-a" flag, some unwanted directories are also
  returned. So the idea here is to first try without the "-a" flag. If
  it fails, try the "-a" flag. For every dir it returns, make sure it
  is under dir_root.

  Parameters:
    cfg: entconfig.EntConfig(entHome, 1)
    dir_root: '/gfs/ent/'
    gfs_aliases: 'ent=master.ent.gfs.ent4-6-0-G-27.ls.google.com:3830'
      -- for gfs only
    datadir: '/export/hda3/4.6.0.G.27/data/enterprise-data'
      -- for bigfile only
    unrecoverable_dirs: None or [] -- introduced for unittests.
  Returns:
    ['/bigfile/pr_stats_shard000', '/bigfile/pr_test_00_00', ...]
  """
  # Bug fix: the default used to be a mutable [] literal, yet the list is
  # mutated below (extend/append), so unrecoverable dirs accumulated in the
  # shared default object across successive calls.  Use a None sentinel.
  if unrecoverable_dirs is None:
    unrecoverable_dirs = []
  # get all the top level GFS dirs
  if gfs_aliases is not None:
    unrecoverable_dirs.extend(_UnrecoverableGFSDirs(cfg))
  if dir_root not in unrecoverable_dirs:
    unrecoverable_dirs.append(dir_root)
  # first try without the "-a" option. If it does not work, try the "-a" option
  cmd_args = _ComposeFileutilArgs(gfs_aliases, datadir)
  fileutil_args = ['', '-a']
  # NOTE(review): 'timeout' is computed and doubled below but never passed to
  # _ExecuteCommand (which uses a fixed 15*60) -- looks like a latent bug;
  # confirm intent before wiring it through.
  timeout = 20 * 60
  top_level_dirs_to_remove = []
  for fileutil_arg in fileutil_args:
    # unfortunately, fileutil in 4.6.4 works differently on gfs and bigfile.
    # it has to use "/gfs/ent/" and "/bigfile/*".
    if gfs_aliases is not None:
      cmd = ('fileutil %s ls %s %s' % (cmd_args, fileutil_arg, dir_root))
    else:
      cmd = ('fileutil %s ls %s %s*' % (cmd_args, fileutil_arg, dir_root))
    out = []
    status = _ExecuteCommand(cmd, timeout=15 * 60, out=out,
                             error_filter=_NoMatchOk)
    if status == 0 and len(out) >= 1:
      top_level_dirs = out[0].splitlines()
      # don't remove unrecoverable dirs. Also noise may exist in the result.
      for dir in top_level_dirs:
        if not dir.endswith('..'):
          dir = E.normpath(dir)
          if (dir.startswith(dir_root) and
              _IsRecoverable(dir, unrecoverable_dirs)):
            top_level_dirs_to_remove.append(dir)
    if len(top_level_dirs_to_remove) > 0:
      break
    else:
      timeout *= 2
  logging.info('Reset Index: Top Level Dirs to Remove:')
  logging.info(' %s' % top_level_dirs_to_remove)
  logging.flush()
  return top_level_dirs_to_remove
def _RemoveTopLevelDirs(cfg, dir_root, gfs_aliases=None, datadir=None,
                        unrecoverable_dirs=None):
  """Remove top level dirs/files under dir_root one by one in a loop.

  Parameters:
    cfg: entconfig.EntConfig(entHome, 1)
    dir_root: '/gfs/ent/'
    gfs_aliases: 'ent=master.ent.gfs.ent4-6-0-G-27.ls.google.com:3830'
      -- for gfs only
    datadir: '/export/hda3/4.6.0.G.27/data/enterprise-data'
      -- for bigfile only
    unrecoverable_dirs: None or [] -- introduced for unittests.
  Returns:
    Dirs under dir_root remain to be removed
  """
  # Bug fix: was a mutable default ([]).  _TopLevelDirsToRemove mutates the
  # list it receives, so the shared default leaked state between calls.
  if unrecoverable_dirs is None:
    unrecoverable_dirs = []

  def _RemoveDir(dir, cmd_args):
    """Remove one dir with fileutil.

    First do "rm" without the "-a" flag. If it does not work, try
    the "-a" flag.
    Args:
      dir: '/gfs/ent/base-indexer000-of-003/global-anchor000-of-003'
      cmd_args: '--gfs_aliases=ent=master.ent.gfs.ent4-6-0-G-27.ls.'
                'google.com:3830'
    Return:
      1 - removed successfully
      0 - otherwise
    """
    for fileutil_arg in ['', '-a']:
      cmd = ('fileutil %s rm %s -R -f %s' % (cmd_args, fileutil_arg, dir))
      if _ExecuteCommand(cmd, error_filter=_NoMatchOk) == 0:
        return 1
    return 0

  top_level_dirs = _TopLevelDirsToRemove(
      cfg, dir_root, gfs_aliases=gfs_aliases, datadir=datadir,
      unrecoverable_dirs=unrecoverable_dirs)
  # 'fileutil ls' failed, sacrifice unrecoverable_dirs, try removing all
  if len(top_level_dirs) == 0:
    top_level_dirs = ['%s*' % dir_root]
  max_tries = 3
  count = 0
  cmd_args = _ComposeFileutilArgs(gfs_aliases, datadir)
  # retry until no dir not removed or hit max_tries
  while len(top_level_dirs) > 0 and count < max_tries:
    count += 1
    dirs_not_removed = []
    for dir in top_level_dirs:
      if _RemoveDir(dir, cmd_args) == 0:
        dirs_not_removed.append(dir)
    top_level_dirs = dirs_not_removed
  logging.flush()
  return top_level_dirs
def _TopLevelDirsToRemove(cfg, dir_root, gfs_aliases=None, datadir=None,
                          unrecoverable_dirs=None):
  """Top level dirs/files under dir_root to be removed.

  Trying to find out all the dirs/files under dir_root to be removed,
  given unrecoverable_dirs.
  Note: fileutil has different behavior for gfs, bigfile, and local
  files. Sometimes, the "fileutil ls" will give errors without the "-a"
  flag. However, with the "-a" flag, some unwanted directories are also
  returned. So the idea here is to first try without the "-a" flag. If
  it fails, try the "-a" flag. For every dir it returns, make sure it
  is under dir_root.

  Parameters:
    cfg: entconfig.EntConfig(entHome, 1)
    dir_root: '/gfs/ent/'
    gfs_aliases: 'ent=master.ent.gfs.ent4-6-0-G-27.ls.google.com:3830'
      -- for gfs only
    datadir: '/export/hda3/4.6.0.G.27/data/enterprise-data'
      -- for bigfile only
    unrecoverable_dirs: None or [] -- introduced for unittests.
  Returns:
    ['/bigfile/pr_stats_shard000', '/bigfile/pr_test_00_00', ...]
  """
  # Bug fix: the old mutable default (unrecoverable_dirs=[]) was mutated by
  # the extend()/append() calls below, so entries piled up in the shared
  # default list across calls.  A None sentinel gives each call a fresh list.
  if unrecoverable_dirs is None:
    unrecoverable_dirs = []
  # get all the top level GFS dirs
  if gfs_aliases is not None:
    unrecoverable_dirs.extend(_UnrecoverableGFSDirs(cfg))
  if dir_root not in unrecoverable_dirs:
    unrecoverable_dirs.append(dir_root)
  # first try without the "-a" option. If it does not work, try the "-a" option
  cmd_args = _ComposeFileutilArgs(gfs_aliases, datadir)
  fileutil_args = ['', '-a']
  # NOTE(review): 'timeout' is doubled on retry but never handed to
  # _ExecuteCommand (fixed 15*60 below) -- presumably a latent bug; confirm
  # before changing behavior.
  timeout = 20 * 60
  top_level_dirs_to_remove = []
  for fileutil_arg in fileutil_args:
    # unfortunately, fileutil in 4.6.4 works differently on gfs and bigfile.
    # it has to use "/gfs/ent/" and "/bigfile/*".
    if gfs_aliases is not None:
      cmd = ('fileutil %s ls %s %s' % (cmd_args, fileutil_arg, dir_root))
    else:
      cmd = ('fileutil %s ls %s %s*' % (cmd_args, fileutil_arg, dir_root))
    out = []
    status = _ExecuteCommand(cmd, timeout=15 * 60, out=out,
                             error_filter=_NoMatchOk)
    if status == 0 and len(out) >= 1:
      top_level_dirs = out[0].splitlines()
      # don't remove unrecoverable dirs. Also noise may exist in the result.
      for dir in top_level_dirs:
        if not dir.endswith('..'):
          dir = E.normpath(dir)
          if (dir.startswith(dir_root) and
              _IsRecoverable(dir, unrecoverable_dirs)):
            top_level_dirs_to_remove.append(dir)
    if len(top_level_dirs_to_remove) > 0:
      break
    else:
      timeout *= 2
  logging.info('Reset Index: Top Level Dirs to Remove:')
  logging.info(' %s' % top_level_dirs_to_remove)
  logging.flush()
  return top_level_dirs_to_remove
def FlushLogHandler(self):
  """Force buffered log output to be flushed; always answers ''."""
  logging.flush()
  return ''
def StartupWork(cfg, state):
  """One-time startup/initialization sequence for the admin runner.

  Depending on the install state recorded in cfg.globalParams, this
  initializes global default files (FRESH), finishes system setup
  (CONFIG_FILES_INITIALIZED), or just re-babysits the serve service.
  Any exception is caught, logged, and escalated via logging.fatal.

  Args:
    cfg: configurator object holding globalParams (see callers)
    state: install-stage string; "INSTALL" makes this bail out early
  """
  try:
    #############################################################################
    # check memory-total
    logging.info("check memory-total")
    if not cfg.CheckMachineMemory():
      logging.fatal("failed to check memory-total")
      return

    # One-time Initialization Work:
    # In fresh mode we first initialize the global default
    if install_utilities.GetInitState(cfg.globalParams) == C.FRESH:
      logging.info("initializing global default files")
      # NOTE(review): 'false' (lowercase) is a module-level alias defined
      # elsewhere in this file, not the builtin False -- confirm.
      if not cfg.globalParams.InitGlobalDefaultFiles(overwrite=false):
        logging.fatal("failed to initialize global default files")
        return
      # Call to saveParams will also distribute the newly created global
      # default files.
      logging.info("saving params and distributing global default files.")
      logging.flush()
      # This is a best effort mechanism. We try upto 6 times to update the
      # files. We assume that we'll get the files to all alive machines. If
      # a machine can't get the files then we assume that it is dead and won't
      # become a master with missing default files.
      cfg.saveParams(retry=6)
      logging.flush()
      install_utilities.SetInitState(cfg, C.CONFIG_FILES_INITIALIZED)
      logging.info('system initialization for %s state successful' %
                   C.CONFIG_FILES_INITIALIZED)
      logging.flush()

    # In install mode -- bail out now
    if state == "INSTALL":
      cfg.startupDone()
      return

    # Perform reset index if we got interrupted in the middle of it
    reset_index.ResetIndex(cfg, only_if_in_progress=1)

    try:
      localhost = E.getCrtHostName()
      if not cfg.setGlobalParam('MASTER', localhost):
        logging.fatal("Error setting the MASTER to %s" % localhost)
    except IOError:
      logging.fatal("Couldn't set MASTER -> exiting")

    # Allocate machines for the empty slots
    logging.info("doing machine allocation")
    if not cfg.DoMachineAllocation():
      # Allocation failure is only logged at info level; startup continues.
      logging.info("failed to do machine allocation")

    # In the next install stage (INSTALL) mode we go and finish up the work
    if (install_utilities.GetInitState(cfg.globalParams) ==
        C.CONFIG_FILES_INITIALIZED):
      logging.info("doing system initialization")

      # datadirs are initially made (by the base rpm) to not have any data disks
      # because it doesn't know which disks to use, but the datadir is needed
      # to get things running.
      logging.info("creating datadirs")
      if not cfg.createDataDirs(cfg.getGlobalParam(C.MACHINES)):
        logging.fatal("failed to make datadirs on machines")
        return

      # Create GFS subdirectories
      if cfg.getGlobalParam(C.GFS_CELL):
        logging.info("creating gfs subdirectories")
        if not cfg.createGFSChunkSubDirs():
          logging.fatal("failed creating the gfs subdirectories")
          return

      # ensure initial spelling data is present
      cfg.EnsureSpellingData()

      # create a default collection w/content of mainCrawl / frontend
      logging.info("Creating default collection ...")
      cfg.CreateDefaultCollection()
      logging.info("Creating default frontend ...")
      cfg.CreateDefaultFrontend()
      logging.info("Assiging default collection and frontend ...")
      cfg.AssignDefaultCollAndFront()

      # create some initial files needed by various backend
      logging.info("Creating default backend files ...")
      cfg.CreateDefaultBackendFiles()

      # create built in query expansion entry
      logging.info("Creating default query expansion entry ...")
      cfg.CreateDefaultQueryExpEntry()

      install_utilities.SetInitState(cfg, C.INITIALIZED)
      logging.info("system initialization successful")
      # we start the serve_service as this is fresh install
      logging.info('Starting serve service...')
      RunServeService(cfg.globalParams, 'start')
    else:
      # we restart the babysitter because the servers map may have changed
      logging.info('Babysitting serve service after allocation...')
      RunServeService(cfg.globalParams, 'babysit')

    logging.info("Saving params")
    cfg.saveParams()

    # now we're ready to run; end startup mode
    logging.info("Done with startup")
    cfg.startupDone()

  except Exception, e:
    # Log the full traceback, then escalate with a fatal summary.
    (t, v, tb) = sys.exc_info()
    exc_msg = string.join(traceback.format_exception(t, v, tb))
    logging.error(exc_msg)
    logging.fatal("StartupWork failed with exception: %s; %s" % (
        e, traceback.format_tb(tb)))