def testListDirFail(self):
    """Listing a path that does not exist must raise the appropriate error."""
    # Non-existent HDFS paths surface the hdfs-specific exception...
    with self.assertRaises(NoSuchPathException):
        hdfs.listdir(self.nonExistingHDFSdir, return_objs=False)
    # ...while non-existent local (home) paths surface a plain OSError.
    with self.assertRaises(OSError):
        hdfs.listdir(self.nonExistingHomeDir, return_objs=False)
def find_hadd_stage_files(input_path, regions, find_hadd_stage1):
    """Descend from *input_path* through the histograms directory tree and
    return the hadd stage files found at the bottom level.

    The path depth (number of non-empty components) determines where in the
    hierarchy the walk starts; each `if depth == N` step advances one level,
    so a shallow path falls through every subsequent step.

    :param input_path: starting directory (6 to 10 components deep)
    :param regions: region keys used to filter subdirectories and metadata
    :param find_hadd_stage1: select stage-1 files (outside 'hadd' dirs) when
        truthy, stage files inside 'hadd' dirs otherwise
    :raises ValueError: if the path depth is outside the supported range
    """
    components = [part for part in input_path.split(os.path.sep) if part != '']
    depth = len(components)
    if depth <= 5 or depth >= 11:
        raise ValueError("Invalid path: %s" % input_path)
    paths = [input_path]
    if depth == 6:
        assert (len(paths) == 1)
        histograms_dir = os.path.join(paths.pop(), 'histograms')
        if not hdfs.isdir(histograms_dir):
            return []
        paths = [histograms_dir]
        depth += 1
    if depth == 7:
        assert (len(paths) == 1)
        paths = hdfs.listdir(paths.pop())
        depth += 1
    if depth == 8:
        # Keep only subdirectories whose basename matches one of the
        # requested analysis-region prefixes (tuple hoisted out of the loop).
        region_prefixes = tuple(ANALYSIS_REGIONS[region] for region in regions)
        selected = []
        for path in paths:
            for region_path in hdfs.listdir(path):
                if os.path.basename(region_path).startswith(region_prefixes):
                    selected.append(region_path)
        paths = selected
        depth += 1
    if depth == 9:
        # Stage-1 files live outside 'hadd' directories, stage-2 inside them;
        # `A == (not B)` is the original `not (A != B)` spelled directly.
        selected = []
        for path in paths:
            for child in hdfs.listdir(path):
                child_is_hadd = os.path.basename(child) == 'hadd'
                if find_hadd_stage1 == (not child_is_hadd):
                    selected.append(child)
        paths = selected
        depth += 1
    if depth == 10:
        selected = []
        for path in paths:
            matches = []
            metadata = extract_metadata(path)
            if metadata['region_key'] not in regions:
                continue
            for candidate in hdfs.listdir(path):
                if not hdfs.isfile(candidate):
                    continue
                if is_hadd_stage_file(candidate, find_hadd_stage1, metadata):
                    matches.append(candidate)
            if matches:
                # Exactly one hadd stage file is expected per directory.
                assert (len(matches) == 1)
                selected.append(matches[0])
        paths = selected
    return paths
def testListDir(self):
    """hdfs.listdir must agree with POSIX os.listdir (via FUSE) on both dirs."""
    # Full paths via the hdfs module (does not use FUSE).
    hdfs_listing = set(hdfs.listdir(self.userHDFSdir, return_objs=False))
    home_listing = set(hdfs.listdir(self.userHomeDir, return_objs=False))
    # Full paths via the plain file system (NB! uses FUSE).
    posix_hdfs_listing = {
        os.path.join(self.userHDFSdir, name)
        for name in os.listdir(self.userHDFSdir)
    }
    posix_home_listing = {
        os.path.join(self.userHomeDir, name)
        for name in os.listdir(self.userHomeDir)
    }
    # Both views of each directory must coincide.
    self.assertEqual(hdfs_listing, posix_hdfs_listing)
    self.assertEqual(home_listing, posix_home_listing)
def testListDirObjects(self):
    """listdir(return_objs=True) must yield objects whose names match FUSE."""
    # Names extracted from the _hdfs.info objects.
    names_from_objs = {
        obj.name for obj in hdfs.listdir(self.userHDFSdir, return_objs=True)
    }
    # Names listed explicitly via the file system (NB! uses FUSE).
    names_from_posix = {
        os.path.join(self.userHDFSdir, name)
        for name in os.listdir(self.userHDFSdir)
    }
    self.assertEqual(names_from_objs, names_from_posix)
    # Object listings are only defined for HDFS paths, not local ones.
    with self.assertRaises(hdfsException):
        hdfs.listdir(self.userHomeDir, return_objs=True)
def get_file_list(chunk):
    """Return the .root files of the single task under *chunk*, sorted by index.

    :raises RuntimeError: if the chunk holds more than one task directory, or
        the task directory has no subdirectories at all
    """
    task_dirs = hdfs.listdir(chunk)
    if len(task_dirs) != 1:
        raise RuntimeError("Found multiple tasks in %s" % chunk)
    task_dir = task_dirs[0]
    sub_dirs = hdfs.listdir(task_dir)
    if not sub_dirs:
        raise RuntimeError("Unable to find any subdirs in %s" % task_dir)
    collected = []
    for sub_dir in sub_dirs:
        collected.extend(
            path for path in hdfs.listdir(sub_dir) if path.endswith('.root')
        )
    return sorted(collected, key=get_file_idx)
def get_paths(input_paths, whitelist, blacklist):
    """Map channel name -> channel directory for the given input paths.

    A 6-component path is scanned for channels under its OUTPUT_RLE subdir
    (white/blacklist filtering only logs the exclusion); an 8-component path
    must point directly at one channel (filter conflicts are fatal).

    :raises ValueError: on an unsupported depth, a missing OUTPUT_RLE dir,
        a white/blacklist conflict for a direct path, or duplicate channels
    """
    valid_paths = {}
    for input_path in input_paths:
        components = [
            part for part in input_path.split(os.path.sep) if part != ''
        ]
        depth = len(components)
        if depth == 6:
            rle_dir = os.path.join(input_path, OUTPUT_RLE)
            if not hdfs.isdir(rle_dir):
                raise ValueError("No such directory: %s" % rle_dir)
            for channel_dir in sorted(hdfs.listdir(rle_dir)):
                channel_name = os.path.basename(channel_dir)
                not_whitelisted = whitelist and channel_name not in whitelist
                if not_whitelisted or channel_name in blacklist:
                    logging.info("Excluding channel: {}".format(channel_name))
                    continue
                if channel_name in valid_paths:
                    raise ValueError(
                        "Found duplicate paths for the same channel: %s and %s"
                        % (valid_paths[channel_name], input_path))
                logging.debug('Found channel {} at path {}'.format(
                    channel_name, channel_dir))
                valid_paths[channel_name] = channel_dir
        elif depth == 8:
            if components[-2] != OUTPUT_RLE:
                raise ValueError("Invalid path: %s" % input_path)
            channel_name = components[-1]
            if whitelist and channel_name not in whitelist:
                raise ValueError("Path %s conflicting with whitelist: %s" %
                                 (input_path, ', '.join(whitelist)))
            if channel_name in blacklist:
                raise ValueError("Path %s conflicting with blacklist: %s" %
                                 (input_path, ', '.join(blacklist)))
            if channel_name in valid_paths:
                raise ValueError(
                    "Found duplicate paths for the same channel: %s and %s" %
                    (valid_paths[channel_name], input_path))
            logging.debug('Found channel {} at path {}'.format(
                channel_name, input_path))
            valid_paths[channel_name] = input_path
        else:
            raise ValueError("Invalid path: %s" % input_path)
    # Sanity check: the channel -> path mapping must be one-to-one.
    assert (len(set(valid_paths.values())) == len(valid_paths))
    return valid_paths
def get_filelist(basedir):
    """Return the list of files under *basedir*.

    For /eos paths, try `eos ls` first and fall back to XRD (`xrdfs ... ls`,
    prefixing results with the redirector URL); any other path is listed via
    hdfs. Results are full paths/URLs.

    :raises ValueError: when both EOS access methods fail; the message carries
        the last underlying error
    """
    if basedir.startswith('/eos'):
        # BUG FIX: in Python 3 the name bound by `except ... as err` is
        # deleted when the except block ends, so the original final
        # `raise ValueError(... % (basedir, err))` raised NameError instead.
        # Keep the last failure explicitly.
        last_err = None
        try:
            logging.debug('Trying eos on %s' % basedir)
            filelist = cmd_execute('eos ls %s' % basedir)
            return [os.path.join(basedir, filename)
                    for filename in filelist.split('\n')]
        except Exception as err:
            last_err = err
        try:
            logging.debug('Trying XRD on %s' % basedir)
            filelist = cmd_execute('xrdfs root://eoscms.cern.ch ls %s' % basedir)
            return ['root://eoscms.cern.ch/%s' % filename
                    for filename in filelist.split('\n')]
        except Exception as err:
            last_err = err
        raise ValueError('Cannot access files on %s because: %s' % (basedir, last_err))
    else:
        logging.debug('Trying local file system on %s' % basedir)
        return hdfs.listdir(basedir)
} rles = {} for channel in cfg_options: logging.info('Inspecting channel {}'.format(channel)) base_path = os.path.join(cfg_options[channel], 'output_rle', channel) if not hdfs.isdir(base_path): raise ValueError('No such directory: %s' % base_path) rles[channel] = {} for region_name, region in CHANNEL_OPTIONS[channel].items(): region_path = os.path.join(base_path, region) if not hdfs.isdir(region_path): continue logging.info('Inspecting region {}'.format(region_name)) rles[channel][region_name] = {} for sample_path in hdfs.listdir(region_path): sample_name = os.path.basename(sample_path) if sample_name != 'ttHJetToNonbb_M125_amcatnlo': continue logging.info('Inspecting sample {}'.format(sample_name)) rles[channel][region_name][sample_name] = {} for rle_file_path in hdfs.listdir(sample_path): rle_file = os.path.basename(rle_file_path) sys_option = '' if 'central' in rle_file: sys_option = 'central' elif 'CMS' in rle_file: sys_option = rle_file[rle_file.find('CMS') : rle_file.find(rle_file.split('_')[-1]) - 1] else: raise RuntimeError('Unrecognizable file: %s' % rle_file_path) assert(sys_option)
rle=rle, sample_key=sample_key, )) continue file_basename = os.path.basename(grep_stdout) file_idx = int(file_basename[:file_basename.rfind('.')]) grep_result = os.path.join( sample_path, '000%d' % (file_idx / 1000), 'tree_{i}.root'.format(i=file_idx)) rles[rle].append(grep_result) else: # instead of forming a list of files let's loop over the subfolders and the files therein instead logging.debug('Looping over the files in {sample_path}'.format( sample_path=sample_path)) for subdir in hdfs.listdir(sample_path): logging.debug( 'Found subdirectory {subdir}'.format(subdir=subdir)) for rootfile in hdfs.listdir(subdir): logging.debug("Processing file '{rootfile}'".format( rootfile=rootfile, )) # open the file ch_root = ROOT.TChain("Events") ch_root.AddFile(rootfile) run_a = array.array('I', [0]) lumi_a = array.array('I', [0]) evt_a = array.array('L', [0]) ch_root.SetBranchAddress("run", run_a)
def get_rles(input_paths, whitelist, blacklist, read_all_systematics):
    """Collect run:lumi:event (RLE) strings per channel/region/sample/systematics.

    Walks channel -> region -> sample -> systematics directories, reads every
    .txt file found at the bottom, validates each line against REGEX_RLE and
    reports duplicates via logging.error (setting the error flag instead of
    aborting).

    :returns: (nested OrderedDict of RLE lists, has_errors flag)
    :raises RuntimeError: on a non-.txt file or a malformed RLE line
    """
    has_errors = False
    rles = collections.OrderedDict()
    valid_paths = get_paths(input_paths, whitelist, blacklist)
    for channel_name, channel_dir in valid_paths.items():
        rles[channel_name] = collections.OrderedDict()
        for region_dir in sorted(hdfs.listdir(channel_dir)):
            region_name = os.path.basename(region_dir)
            # BUG FIX: the format arguments were swapped (channel printed in
            # the region slot and vice versa).
            logging.debug('Found region {} in channel {}'.format(
                region_name, channel_name))
            rles[channel_name][region_name] = collections.OrderedDict()
            for sample_dir in sorted(hdfs.listdir(region_dir)):
                sample_name = os.path.basename(sample_dir)
                if sample_name in SAMPLES_EXCLUDE:
                    continue
                logging.debug(
                    'Found sample {} in region {} and channel {}'.format(
                        sample_name, region_name, channel_name))
                rles[channel_name][region_name][
                    sample_name] = collections.OrderedDict()
                for rle_dir in sorted(hdfs.listdir(sample_dir)):
                    central_or_shift = os.path.basename(rle_dir)
                    if central_or_shift in SYSTEMATICS_EXCLUDE:
                        continue
                    if not read_all_systematics and central_or_shift != SYSTEMATICS_CENTRAL:
                        continue
                    logging.debug(
                        'Found systematics {} for sample {} in region {} and channel {}'
                        .format(central_or_shift, sample_name, region_name,
                                channel_name))
                    rles[channel_name][region_name][sample_name][
                        central_or_shift] = []
                    rle_filenames = sorted(hdfs.listdir(rle_dir))
                    if not rle_filenames:
                        logging.warning(
                            'Directory {} is empty'.format(rle_dir))
                        continue
                    rle_arr = []
                    # PERF: membership used to be `rle in rle_arr` (O(n) per
                    # event); a parallel set keeps the check O(1) while the
                    # list preserves insertion order for callers.
                    rle_seen = set()
                    for rle_filename in rle_filenames:
                        if not rle_filename.endswith('.txt'):
                            raise RuntimeError(
                                "Unexpected extension in file: %s" %
                                rle_filename)
                        with open(rle_filename, 'r') as rle_file:
                            for line in rle_file:
                                line_stripped = line.rstrip('\n')
                                if not REGEX_RLE.match(line_stripped):
                                    raise RuntimeError(
                                        "Unexpected line found in %s: %s" %
                                        (rle_filename, line_stripped))
                                rle = line_stripped
                                if rle in rle_seen:
                                    logging.error(
                                        "Duplicate event %s found in channel %s, region %s, sample %s, systematics %s" % \
                                        (rle, channel_name, region_name, sample_name, central_or_shift)
                                    )
                                    has_errors = True
                                    continue
                                rle_arr.append(rle)
                                rle_seen.add(rle)
                    logging.debug(
                        'Found {} events in sample {}, region {}, systematics {}, channel {}'
                        .format(len(rle_arr), sample_name, region_name,
                                central_or_shift, channel_name))
                    rles[channel_name][region_name][sample_name][
                        central_or_shift].extend(rle_arr)
    return rles, has_errors
if nof_jobs < 0: logging.error("Unable to parse total number of jobs from file: %s" % crab_logfile) continue version = os.path.basename(output_dir) version_date = version.split('_')[1] prefix = '{}_{}'.format(version, chunk_str) if chunk_str else version userName = os.path.basename(os.path.dirname(output_dir)) dataset_requestName = '%s__%s' % (dataset_match.group(1), dataset_match.group(2)) requestName = '%s_%s' % (prefix, dataset_requestName) max_requestname_len = 160 - len(userName) if len(requestName) > max_requestname_len: requestName = requestName[:max_requestname_len] crab_path = os.path.join('/hdfs', 'cms', output_dir[1:], dataset_match.group(1), requestName) if hdfs.isdir(crab_path): logging.debug("Found directory: {}".format(crab_path)) subdirs = hdfs.listdir(crab_path) if len(subdirs) != 1: logging.error("Expected exactly one subdir in {} but found {}: {}".format( crab_path, len(subdirs), ', '.join(subdirs) )) continue subdir = subdirs[0] root_files = [ root_file for subsubdir in hdfs.listdir(subdir) for root_file in hdfs.listdir(subsubdir) if root_file.endswith('.root') ] root_idxs = set(map(lambda fn: int(TREE_REGEX.match(os.path.basename(fn)).group('idx')), root_files)) assert(not (root_idxs & expected_fails)) root_idxs = root_idxs | expected_fails nof_completed = len(root_idxs) * 100. / nof_jobs expected_idxs = set(range(1, nof_jobs + 1)) assert(not (root_idxs - expected_idxs))
def get_hadd_stage2(input_paths):
    """Locate hadd stage2 files for each input path and return them sorted
    by their channel_region key.

    Each path's depth (number of non-empty components) decides where the
    descent starts; every `if depth == N` step advances one level down to the
    stage2 files inside 'hadd' directories.

    :raises ValueError: if a path depth is outside the supported 6..10 range
    :raises RuntimeError: if two paths resolve to the same channel and region
    """
    results = {}
    for input_path in input_paths:
        components = [
            part for part in input_path.split(os.path.sep) if part != ''
        ]
        depth = len(components)
        if depth <= 5 or depth >= 11:
            raise ValueError("Invalid path: %s" % input_path)
        paths = [input_path]
        if depth == 6:
            assert (len(paths) == 1)
            histograms_dir = os.path.join(paths.pop(), 'histograms')
            # NOTE(review): this returns from the whole function, discarding
            # results already collected for earlier input_paths — looks like a
            # copy of the single-path variant; confirm it is intentional.
            if not hdfs.isdir(histograms_dir):
                return []
            paths = [histograms_dir]
            depth += 1
        if depth == 7:
            assert (len(paths) == 1)
            paths = hdfs.listdir(paths.pop())
            depth += 1
        if depth == 8:
            # Unlike the region-filtered variant, every region dir is kept.
            expanded = []
            for path in paths:
                expanded.extend(hdfs.listdir(path))
            paths = expanded
            depth += 1
        if depth == 9:
            hadd_dirs = []
            for path in paths:
                for child in hdfs.listdir(path):
                    if os.path.basename(child) == 'hadd':
                        hadd_dirs.append(child)
            paths = hadd_dirs
            depth += 1
        if depth == 10:
            stage2_files = []
            for path in paths:
                matches = []
                for candidate in hdfs.listdir(path):
                    if not hdfs.isfile(candidate):
                        continue
                    candidate_basename = os.path.basename(candidate)
                    if candidate_basename.startswith('hadd_stage2') and \
                       not HADD_STAGE2_RE.match(candidate_basename.split('.')[0]):
                        matches.append(candidate)
                if matches:
                    # Exactly one stage2 file is expected per hadd directory.
                    assert (len(matches) == 1)
                    stage2_files.append(matches[0])
            paths = stage2_files
        for final_path in paths:
            final_components = [
                part for part in final_path.split(os.path.sep) if part != ''
            ]
            # Components 7 and 8 of the resolved file path hold the channel
            # and region names respectively.
            channel = final_components[7]
            region = final_components[8]
            channel_region = '{}_{}'.format(channel, region)
            if channel_region in results:
                raise RuntimeError(
                    "Found two paths corresponding to the same channel (%s) and region (%s): %s and %s" % \
                    (channel, region, final_path, results[channel_region])
                )
            results[channel_region] = final_path
            logging.debug(
                'Found hadd stage2 file corresponding to channel {} and region {}: {}'
                .format(channel, region, final_path))
    return [results[k] for k in sorted(results.keys())]