def createGFSChunkSubDirs(self, reset_index=0):
  """Creates GFS subdirectories for base indexer and anchor processing.

  Args:
    reset_index: 1, if this is for resetting index (some dirs may already
                 exist); 0, otherwise.
  Return:
    true  - successful
    false - otherwise

  NOTE(review): lowercase 'true'/'false' are project constants, not the
  Python builtins -- presumably defined in a shared module; confirm.
  """
  ##
  def execMultiTimes(self, to_exec, max_tries=10):
    """Exec a gfs fileutil command, retrying up to max_tries times.

    Sleeps 10 seconds between attempts. Returns true on the first
    success; logs and returns false once all tries are exhausted.
    """
    try_num = 0
    while try_num < max_tries:
      err, out = E.run_fileutil_command(self.globalParams, to_exec)
      if E.ERR_OK == err:
        return true
      time.sleep(10)
      try_num = try_num + 1
    logging.error("Error executing %s" % to_exec)
    return false
  ##
  def createGFSDir(self, dir_name, reset_index):
    """Create a GFS dir.

    If this is for reset index, the dir may already exist, so just try it
    once and ignore any error.

    Args:
      dir_name: '/gfs/ent/feeds'
      reset_index: 1 - during reset index; 0 - otherwise
    Return:
      true  - successful
      false - otherwise
    """
    if reset_index:
      # Dir may already exist: single attempt, result deliberately ignored.
      execMultiTimes(self, "mkdir -p %s" % dir_name, max_tries=1)
      return true
    else:
      return execMultiTimes(self, "mkdir -p %s" % dir_name)

  num_shards = self.globalParams.GetEntNumShards()
  namespace_prefix = self.getGlobalParam('NAMESPACE_PREFIX')
  pr_namespace_prefix = self.getGlobalParam(
      'OUTPUT_NAMESPACE_PREFIX')['pr_main']

  # This also waits for the gfs master to come up (30 tries, 10s apart).
  if not execMultiTimes(
      self, "mkdir -p %s" % E.normpath("%s/tmp" % namespace_prefix), 30):
    return false

  # Create the pr_main subdir for pr_main to write to.
  if not execMultiTimes(
      self, "mkdir -p %s" % E.normpath("%s/pr_main" % pr_namespace_prefix),
      30):
    return false

  # Create FEEDS_DIR (feeds), a directory where uploaded feeds get stored.
  if not createGFSDir(self, self.getGlobalParam('FEEDS_DIR'), reset_index):
    return false

  # Create FEED_STATUS_DIR (feedstatus), a directory to store feed status.
  if not createGFSDir(self, self.getGlobalParam('FEED_STATUS_DIR'),
                      reset_index):
    return false

  # Create SSO_LOG_DIR, a directory to store sso logging output files.
  if not createGFSDir(self, self.getGlobalParam('SSO_LOG_DIR'), reset_index):
    return false

  # Create LOG_REPORT_DIR (log_report), a directory to
  # store raw and summary reports.
  if not createGFSDir(self, self.getGlobalParam('LOG_REPORT_DIR'),
                      reset_index):
    return false

  # Create CRAWLQUEUE_DIR (crawlqueue), a directory to store crawlqueue data.
  if not createGFSDir(self, self.getGlobalParam('CRAWLQUEUE_DIR'),
                      reset_index):
    return false

  # Create SYSLOG_CHECKPOINTS_DIR (syslog_checkpoints),
  # a directory to store syslog checkpoint files.
  if not createGFSDir(self, self.getGlobalParam('SYSLOG_CHECKPOINTS_DIR'),
                      reset_index):
    return false

  # Create per-shard directories for base-indexer, global-anchor,
  # global-link and urlhistory-processor.
  dirs = [ 'base-indexer', 'global-anchor', 'global-link',
           'urlhistory-processor' ]
  for i in range(num_shards):
    for dir in dirs:
      if not execMultiTimes(
          self, "mkdir -p %s%s%03d-of-%03d" % (namespace_prefix, dir, i,
                                               num_shards)):
        return false
    # Inside the base-indexer directories we create directories for
    # global anchor command logs.
    for j in range(num_shards):
      if not execMultiTimes(
          self,
          "mkdir -p %sbase-indexer%03d-of-%03d/global-anchor%03d-of-%03d" %
          (namespace_prefix, i, num_shards, j, num_shards)):
        return false

  num_tracker_shards = self.globalParams.GetNumShards(
      'urltracker_server')
  for i in range(num_tracker_shards):
    if not execMultiTimes(
        self, "mkdir -p %surltracker%03d-of-%03d" % (namespace_prefix, i,
                                                     num_tracker_shards)):
      return false
  return true
def get_top_dir(rootdir):
  """Return the normalized top-level dir ('<rootdir>/export/hda3')."""
  normalized_root = os.path.normpath(rootdir)
  return E.normpath('%s/export/hda3' % normalized_root)
def _TopLevelDirsToRemove(cfg, dir_root, gfs_aliases=None, datadir=None,
                          unrecoverable_dirs=None):
  """Find the top level dirs/files under dir_root to be removed.

  Trying to find out all the dirs/files under dir_root to be removed,
  given unrecoverable_dirs.

  Note: fileutil has different behavior for gfs, bigfile, and local files.
  Sometimes "fileutil ls" gives errors without the "-a" flag. However,
  with the "-a" flag, some unwanted directories are also returned. So the
  idea here is to first try without the "-a" flag. If that fails, try the
  "-a" flag. For every dir it returns, make sure it is under dir_root.

  Parameters:
    cfg: entconfig.EntConfig(entHome, 1)
    dir_root: '/gfs/ent/'
    gfs_aliases: 'ent=master.ent.gfs.ent4-6-0-G-27.ls.google.com:3830'
                 -- for gfs only
    datadir: '/export/hda3/4.6.0.G.27/data/enterprise-data'
             -- for bigfile only
    unrecoverable_dirs: optional list of dirs to preserve (not mutated)
                        -- introduced for unittests.
  Returns:
    ['/bigfile/pr_stats_shard000', '/bigfile/pr_test_00_00', ...]
  """
  # Fix: the original signature used a mutable default
  # (unrecoverable_dirs=[]) and then extend()ed/append()ed it, so entries
  # accumulated across calls and caller-supplied lists were mutated.
  # Default to None and work on a private copy instead.
  if unrecoverable_dirs is None:
    unrecoverable_dirs = []
  else:
    unrecoverable_dirs = list(unrecoverable_dirs)

  # get all the top level GFS dirs
  if gfs_aliases is not None:
    unrecoverable_dirs.extend(_UnrecoverableGFSDirs(cfg))
  if dir_root not in unrecoverable_dirs:
    unrecoverable_dirs.append(dir_root)

  # first try without the "-a" option. If it does not work, try the "-a"
  # option.
  cmd_args = _ComposeFileutilArgs(gfs_aliases, datadir)
  fileutil_args = ['', '-a']
  # NOTE(review): 'timeout' is doubled on failure below but never passed to
  # _ExecuteCommand (which hardcodes 15*60) -- looks like a latent bug;
  # left as-is to preserve behavior. Confirm intent before wiring through.
  timeout = 20 * 60
  top_level_dirs_to_remove = []
  for fileutil_arg in fileutil_args:
    # unfortunately, fileutil in 4.6.4 works differently on gfs and bigfile.
    # it has to use "/gfs/ent/" and "/bigfile/*".
    if gfs_aliases is not None:
      cmd = ('fileutil %s ls %s %s' % (cmd_args, fileutil_arg, dir_root))
    else:
      cmd = ('fileutil %s ls %s %s*' % (cmd_args, fileutil_arg, dir_root))
    out = []
    status = _ExecuteCommand(cmd, timeout=15 * 60, out=out,
                             error_filter=_NoMatchOk)
    if status == 0 and len(out) >= 1:
      top_level_dirs = out[0].splitlines()
      # don't remove unrecoverable dirs. Also noise may exist in the result.
      # ('entry' renamed from 'dir' to avoid shadowing the builtin.)
      for entry in top_level_dirs:
        if not entry.endswith('..'):
          entry = E.normpath(entry)
          if (entry.startswith(dir_root) and
              _IsRecoverable(entry, unrecoverable_dirs)):
            top_level_dirs_to_remove.append(entry)
    if len(top_level_dirs_to_remove) > 0:
      break
    else:
      timeout *= 2
  logging.info('Reset Index: Top Level Dirs to Remove:')
  logging.info(' %s' % top_level_dirs_to_remove)
  logging.flush()
  return top_level_dirs_to_remove
def _TopLevelDirsToRemove(cfg, dir_root, gfs_aliases=None, datadir=None,
                          unrecoverable_dirs=None):
  """Find the top level dirs/files under dir_root to be removed.

  Trying to find out all the dirs/files under dir_root to be removed,
  given unrecoverable_dirs.

  Note: fileutil has different behavior for gfs, bigfile, and local files.
  Sometimes "fileutil ls" gives errors without the "-a" flag. However,
  with the "-a" flag, some unwanted directories are also returned. So the
  idea here is to first try without the "-a" flag. If that fails, try the
  "-a" flag. For every dir it returns, make sure it is under dir_root.

  Parameters:
    cfg: entconfig.EntConfig(entHome, 1)
    dir_root: '/gfs/ent/'
    gfs_aliases: 'ent=master.ent.gfs.ent4-6-0-G-27.ls.google.com:3830'
                 -- for gfs only
    datadir: '/export/hda3/4.6.0.G.27/data/enterprise-data'
             -- for bigfile only
    unrecoverable_dirs: optional list of dirs to preserve (not mutated)
                        -- introduced for unittests.
  Returns:
    ['/bigfile/pr_stats_shard000', '/bigfile/pr_test_00_00', ...]
  """
  # Fix: the original signature used a mutable default
  # (unrecoverable_dirs=[]) and then extend()ed/append()ed it, so entries
  # accumulated across calls and caller-supplied lists were mutated.
  # Default to None and work on a private copy instead.
  if unrecoverable_dirs is None:
    unrecoverable_dirs = []
  else:
    unrecoverable_dirs = list(unrecoverable_dirs)

  # get all the top level GFS dirs
  if gfs_aliases is not None:
    unrecoverable_dirs.extend(_UnrecoverableGFSDirs(cfg))
  if dir_root not in unrecoverable_dirs:
    unrecoverable_dirs.append(dir_root)

  # first try without the "-a" option. If it does not work, try the "-a"
  # option.
  cmd_args = _ComposeFileutilArgs(gfs_aliases, datadir)
  fileutil_args = ['', '-a']
  # NOTE(review): 'timeout' is doubled on failure below but never passed to
  # _ExecuteCommand (which hardcodes 15*60) -- looks like a latent bug;
  # left as-is to preserve behavior. Confirm intent before wiring through.
  timeout = 20 * 60
  top_level_dirs_to_remove = []
  for fileutil_arg in fileutil_args:
    # unfortunately, fileutil in 4.6.4 works differently on gfs and bigfile.
    # it has to use "/gfs/ent/" and "/bigfile/*".
    if gfs_aliases is not None:
      cmd = ('fileutil %s ls %s %s' % (cmd_args, fileutil_arg, dir_root))
    else:
      cmd = ('fileutil %s ls %s %s*' % (cmd_args, fileutil_arg, dir_root))
    out = []
    status = _ExecuteCommand(cmd, timeout=15 * 60, out=out,
                             error_filter=_NoMatchOk)
    if status == 0 and len(out) >= 1:
      top_level_dirs = out[0].splitlines()
      # don't remove unrecoverable dirs. Also noise may exist in the result.
      # ('entry' renamed from 'dir' to avoid shadowing the builtin.)
      for entry in top_level_dirs:
        if not entry.endswith('..'):
          entry = E.normpath(entry)
          if (entry.startswith(dir_root) and
              _IsRecoverable(entry, unrecoverable_dirs)):
            top_level_dirs_to_remove.append(entry)
    if len(top_level_dirs_to_remove) > 0:
      break
    else:
      timeout *= 2
  logging.info('Reset Index: Top Level Dirs to Remove:')
  logging.info(' %s' % top_level_dirs_to_remove)
  logging.flush()
  return top_level_dirs_to_remove
def createGFSChunkSubDirs(self, reset_index=0):
  """Creates GFS subdirectories for base indexer and anchor processing.

  Args:
    reset_index: 1, if this is for resetting index (some dirs may already
                 exist); 0, otherwise.
  Return:
    true  - successful
    false - otherwise

  NOTE(review): lowercase 'true'/'false' are project constants, not the
  Python builtins -- presumably defined in a shared module; confirm.
  """
  ##
  def execMultiTimes(self, to_exec, max_tries=10):
    """Exec a gfs fileutil command, retrying up to max_tries times.

    Sleeps 10 seconds between attempts. Returns true on the first
    success; logs and returns false once all tries are exhausted.
    """
    try_num = 0
    while try_num < max_tries:
      err, out = E.run_fileutil_command(self.globalParams, to_exec)
      if E.ERR_OK == err:
        return true
      time.sleep(10)
      try_num = try_num + 1
    logging.error("Error executing %s" % to_exec)
    return false
  ##
  def createGFSDir(self, dir_name, reset_index):
    """Create a GFS dir.

    If this is for reset index, the dir may already exist, so just try it
    once and ignore any error.

    Args:
      dir_name: '/gfs/ent/feeds'
      reset_index: 1 - during reset index; 0 - otherwise
    Return:
      true  - successful
      false - otherwise
    """
    if reset_index:
      # Dir may already exist: single attempt, result deliberately ignored.
      execMultiTimes(self, "mkdir -p %s" % dir_name, max_tries=1)
      return true
    else:
      return execMultiTimes(self, "mkdir -p %s" % dir_name)

  num_shards = self.globalParams.GetEntNumShards()
  namespace_prefix = self.getGlobalParam('NAMESPACE_PREFIX')
  pr_namespace_prefix = self.getGlobalParam('OUTPUT_NAMESPACE_PREFIX')['pr_main']

  # This also waits for the gfs master to come up (30 tries, 10s apart).
  if not execMultiTimes(self, "mkdir -p %s" % E.normpath(
      "%s/tmp" % namespace_prefix), 30):
    return false

  # Create the pr_main subdir for pr_main to write to.
  if not execMultiTimes(self, "mkdir -p %s" % E.normpath(
      "%s/pr_main" % pr_namespace_prefix), 30):
    return false

  # Create FEEDS_DIR (feeds), a directory where uploaded feeds get stored.
  if not createGFSDir(self, self.getGlobalParam('FEEDS_DIR'), reset_index):
    return false

  # Create FEED_STATUS_DIR (feedstatus), a directory to store feed status.
  if not createGFSDir(self, self.getGlobalParam('FEED_STATUS_DIR'),
                      reset_index):
    return false

  # Create SSO_LOG_DIR, a directory to store sso logging output files.
  if not createGFSDir(self, self.getGlobalParam('SSO_LOG_DIR'), reset_index):
    return false

  # Create LOG_REPORT_DIR (log_report), a directory to
  # store raw and summary reports.
  if not createGFSDir(self, self.getGlobalParam('LOG_REPORT_DIR'),
                      reset_index):
    return false

  # Create CRAWLQUEUE_DIR (crawlqueue), a directory to store crawlqueue data.
  if not createGFSDir(self, self.getGlobalParam('CRAWLQUEUE_DIR'),
                      reset_index):
    return false

  # Create SYSLOG_CHECKPOINTS_DIR (syslog_checkpoints),
  # a directory to store syslog checkpoint files.
  if not createGFSDir(self, self.getGlobalParam('SYSLOG_CHECKPOINTS_DIR'),
                      reset_index):
    return false

  # Create per-shard directories for base-indexer, global-anchor,
  # global-link and urlhistory-processor.
  dirs = ['base-indexer', 'global-anchor', 'global-link',
          'urlhistory-processor']
  for i in range(num_shards):
    for dir in dirs:
      if not execMultiTimes(self, "mkdir -p %s%s%03d-of-%03d" % (
          namespace_prefix, dir, i, num_shards)):
        return false
    # Inside the base-indexer directories we create directories for
    # global anchor command logs.
    for j in range(num_shards):
      if not execMultiTimes(
          self,
          "mkdir -p %sbase-indexer%03d-of-%03d/global-anchor%03d-of-%03d" % (
          namespace_prefix, i, num_shards, j, num_shards)):
        return false

  num_tracker_shards = self.globalParams.GetNumShards('urltracker_server')
  for i in range(num_tracker_shards):
    if not execMultiTimes(self, "mkdir -p %surltracker%03d-of-%03d" % (
        namespace_prefix, i, num_tracker_shards)):
      return false
  return true