Code example #1
 def testListDirFail(self):
     # The function should raise an exception if a given path does not exist on the file system
     self.assertRaises(
         NoSuchPathException,
         lambda: hdfs.listdir(self.nonExistingHDFSdir, return_objs=False))
     self.assertRaises(
         OSError,
         lambda: hdfs.listdir(self.nonExistingHomeDir, return_objs=False))
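These two methods are excerpted from a larger test class; the directory attributes they use are defined elsewhere. A minimal scaffold they could plug into might look like the sketch below (the attribute values and the hdfs import path are assumptions, not taken from the original):

import os
import unittest

from tthAnalysis.HiggsToTauTau import hdfs  # assumed import path

class HDFSTestCase(unittest.TestCase):
    def setUp(self):
        # Assumed locations; adjust to the actual cluster layout.
        self.userHDFSdir = os.path.join('/hdfs/local', os.environ['USER'])
        self.userHomeDir = os.path.expanduser('~')
        self.nonExistingHDFSdir = os.path.join(self.userHDFSdir, 'DOES_NOT_EXIST')
        self.nonExistingHomeDir = os.path.join(self.userHomeDir, 'DOES_NOT_EXIST')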
Code example #2
def find_hadd_stage_files(input_path, regions, find_hadd_stage1):
    path_split = [
        subpath for subpath in input_path.split(os.path.sep) if subpath != ''
    ]
    nof_levels = len(path_split)
    if not (5 < nof_levels < 11):
        raise ValueError("Invalid path: %s" % input_path)

    current_paths = [input_path]
    if nof_levels == 6:
        assert (len(current_paths) == 1)
        current_path = os.path.join(current_paths.pop(), 'histograms')
        if not hdfs.isdir(current_path):
            return []
        current_paths = [current_path]
        nof_levels += 1
    if nof_levels == 7:
        assert (len(current_paths) == 1)
        current_path = current_paths.pop()
        current_paths = hdfs.listdir(current_path)
        nof_levels += 1
    if nof_levels == 8:
        next_paths = []
        for current_path in current_paths:
            region_paths = hdfs.listdir(current_path)
            for region_path in region_paths:
                if os.path.basename(region_path).startswith(
                        tuple(ANALYSIS_REGIONS[region] for region in regions)):
                    next_paths.append(region_path)
        current_paths = next_paths
        nof_levels += 1
    if nof_levels == 9:
        next_paths = []
        for current_path in current_paths:
            for next_path in hdfs.listdir(current_path):
                next_path_basename = os.path.basename(next_path)
                # keep non-'hadd' dirs when find_hadd_stage1, else only the 'hadd' dir
                if find_hadd_stage1 == (next_path_basename != 'hadd'):
                    next_paths.append(next_path)
        current_paths = next_paths
        nof_levels += 1
    if nof_levels == 10:
        next_paths = []
        for current_path in current_paths:
            candidate_files = []
            metadata = extract_metadata(current_path)
            if metadata['region_key'] not in regions:
                continue
            for candidate_file in hdfs.listdir(current_path):
                if not hdfs.isfile(candidate_file):
                    continue
                if is_hadd_stage_file(candidate_file, find_hadd_stage1,
                                      metadata):
                    candidate_files.append(candidate_file)
            if candidate_files:
                assert (len(candidate_files) == 1)
                next_paths.append(candidate_files[0])
        current_paths = next_paths
    return current_paths
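A hypothetical invocation, matching the six-level /hdfs analysis path that the level checks above expect (the path and region key are made up for illustration):

# Hypothetical 6-level input path; 'SR' is assumed to be a key of ANALYSIS_REGIONS.
stage_files = find_hadd_stage_files(
    '/hdfs/local/username/ttHAnalysis/2017/2021Jan01',
    regions=['SR'],
    find_hadd_stage1=False,
)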
Code example #3
 def testListDir(self):
     # Obtain the list of full paths of the files in each directory with the hdfs module (that does not use FUSE)
     dirListHDFS = set(hdfs.listdir(self.userHDFSdir, return_objs=False))
     dirListHome = set(hdfs.listdir(self.userHomeDir, return_objs=False))
     # Obtain the list of full paths of the files in each directory (NB! uses FUSE)
     dirListHDFSposix = set(
         map(lambda path: os.path.join(self.userHDFSdir, path),
             os.listdir(self.userHDFSdir)))
     dirListHomePosix = set(
         map(lambda path: os.path.join(self.userHomeDir, path),
             os.listdir(self.userHomeDir)))
     # Make sure that both results coincide
     self.assertEqual(dirListHDFS, dirListHDFSposix)
     self.assertEqual(dirListHome, dirListHomePosix)
Code example #4
 def testListDirObjects(self):
     # Obtain the list of _hdfs.info objects and extract only the name field from each object
     dirListHDFSobjs = set(
         map(lambda obj: obj.name,
             hdfs.listdir(self.userHDFSdir, return_objs=True)))
     # Obtain the list of file names explicitly (NB! uses FUSE)
     dirListHDFSposix = set(
         map(lambda path: os.path.join(self.userHDFSdir, path),
             os.listdir(self.userHDFSdir)))
     # Make sure that both results coincide
     self.assertEqual(dirListHDFSobjs, dirListHDFSposix)
     # It's not possible to obtain the list of _hdfs.info objects from a non-HDFS path
     self.assertRaises(
         hdfsException,
         lambda: hdfs.listdir(self.userHomeDir, return_objs=True))
Code example #5
def get_file_list(chunk):
  files_to_copy_chunk = []
  taskdirs = hdfs.listdir(chunk)
  if len(taskdirs) != 1:
    raise RuntimeError("Expected exactly one task in %s but found %d" % (chunk, len(taskdirs)))

  taskdir = taskdirs[0]
  subdirs = hdfs.listdir(taskdir)
  if not subdirs:
    raise RuntimeError("Unable to find any subdirs in %s" % taskdir)

  for subdir in subdirs:
    files_to_copy_chunk.extend(filter(lambda path: path.endswith('.root'), hdfs.listdir(subdir)))

  files_to_copy_chunk = sorted(files_to_copy_chunk, key=get_file_idx)
  return files_to_copy_chunk
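get_file_idx is not shown in the excerpt; given that the copied files are tree_<N>.root (compare example #9), a plausible sketch of the sort key is:

import os
import re

# Hypothetical helper: pull the numeric index out of a tree_<N>.root path
# so the files sort numerically instead of lexicographically.
def get_file_idx(path):
    match = re.match(r'^tree_(\d+)\.root$', os.path.basename(path))
    if not match:
        raise ValueError("Not a tree file: %s" % path)
    return int(match.group(1))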
Code example #6
def get_paths(input_paths, whitelist, blacklist):
    valid_paths = {}
    for input_path in input_paths:
        input_path_split = [
            subpath for subpath in input_path.split(os.path.sep)
            if subpath != ''
        ]
        nof_levels = len(input_path_split)
        if nof_levels == 6:
            input_path_subdir = os.path.join(input_path, OUTPUT_RLE)
            if not hdfs.isdir(input_path_subdir):
                raise ValueError("No such directory: %s" % input_path_subdir)
            for channel_dir in sorted(hdfs.listdir(input_path_subdir)):
                channel_name = os.path.basename(channel_dir)
                if whitelist and channel_name not in whitelist:
                    logging.info("Excluding channel {} (not in whitelist)".format(channel_name))
                    continue
                if channel_name in blacklist:
                    logging.info("Excluding channel {} (blacklisted)".format(channel_name))
                    continue
                if channel_name in valid_paths:
                    raise ValueError(
                        "Found duplicate paths for the same channel: %s and %s"
                        % (valid_paths[channel_name], input_path))
                logging.debug('Found channel {} at path {}'.format(
                    channel_name, channel_dir))
                valid_paths[channel_name] = channel_dir
        elif nof_levels == 8:
            if input_path_split[-2] != OUTPUT_RLE:
                raise ValueError("Invalid path: %s" % input_path)
            channel_name = input_path_split[-1]
            if whitelist and channel_name not in whitelist:
                raise ValueError("Path %s conflicting with whitelist: %s" %
                                 (input_path, ', '.join(whitelist)))
            if channel_name in blacklist:
                raise ValueError("Path %s conflicting with blacklist: %s" %
                                 (input_path, ', '.join(blacklist)))
            if channel_name in valid_paths:
                raise ValueError(
                    "Found duplicate paths for the same channel: %s and %s" %
                    (valid_paths[channel_name], input_path))
            logging.debug('Found channel {} at path {}'.format(
                channel_name, input_path))
            valid_paths[channel_name] = input_path
        else:
            raise ValueError("Invalid path: %s" % input_path)
    assert (len(set(valid_paths.values())) == len(valid_paths))
    return valid_paths
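OUTPUT_RLE is defined outside the excerpt; example #8 below joins the literal 'output_rle' into the same kind of path, so the constant is presumably:

OUTPUT_RLE = 'output_rle'  # inferred from example #8, not confirmed against the source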
Code example #7
def get_filelist(basedir):
    if basedir.startswith('/eos'):
        errs = []
        try:
            logging.debug('Trying eos on %s' % basedir)
            filelist = cmd_execute('eos ls %s' % basedir)
            return map(lambda filename: os.path.join(basedir, filename),
                       filelist.split('\n'))
        except Exception as err:
            errs.append(str(err))
        try:
            logging.debug('Trying XRD on %s' % basedir)
            filelist = cmd_execute('xrdfs root://eoscms.cern.ch ls %s' %
                                   basedir)
            return map(lambda filename: 'root://eoscms.cern.ch/%s' % filename,
                       filelist.split('\n'))
        except Exception as err:
            errs.append(str(err))
        # collect both error messages: 'err' would be out of scope here in Python 3
        raise ValueError('Cannot access files on %s because: %s' %
                         (basedir, '; '.join(errs)))
    else:
        logging.debug('Trying local file system on %s' % basedir)
        return hdfs.listdir(basedir)
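cmd_execute is not part of the snippet; a minimal stand-in consistent with how it is used here (run a shell command, return its stdout, raise on failure) could be:

import subprocess

# Hypothetical stand-in for the project's cmd_execute helper.
def cmd_execute(command):
    proc = subprocess.Popen(
        command, shell=True,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
    )
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError("Command '%s' failed: %s" % (command, stderr))
    return stdout.decode() if isinstance(stdout, bytes) else stdout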
Code example #8
}

rles = {}
for channel in cfg_options:
  logging.info('Inspecting channel {}'.format(channel))
  base_path = os.path.join(cfg_options[channel], 'output_rle', channel)
  if not hdfs.isdir(base_path):
    raise ValueError('No such directory: %s' % base_path)
  rles[channel] = {}
  for region_name, region in CHANNEL_OPTIONS[channel].items():
    region_path = os.path.join(base_path, region)
    if not hdfs.isdir(region_path):
      continue
    logging.info('Inspecting region {}'.format(region_name))
    rles[channel][region_name] = {}
    for sample_path in hdfs.listdir(region_path):
      sample_name = os.path.basename(sample_path)
      if sample_name != 'ttHJetToNonbb_M125_amcatnlo': continue
      logging.info('Inspecting sample {}'.format(sample_name))
      rles[channel][region_name][sample_name] = {}
      for rle_file_path in hdfs.listdir(sample_path):
        rle_file = os.path.basename(rle_file_path)
        sys_option = ''
        if 'central' in rle_file:
          sys_option = 'central'
        elif 'CMS' in rle_file:
          sys_option = rle_file[rle_file.find('CMS') : rle_file.find(rle_file.split('_')[-1]) - 1]
        else:
          raise RuntimeError('Unrecognizable file: %s' % rle_file_path)
        assert(sys_option)
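The slicing on the 'CMS' branch is easiest to see on a concrete name. With a hypothetical file such as the one below (made up for illustration), the slice keeps everything from 'CMS' up to, but not including, the trailing index:

# Hypothetical filename, for illustration only.
rle_file = 'sample_CMS_ttHl_thu_shape_ttH_x1Up_12.txt'
sys_option = rle_file[rle_file.find('CMS') : rle_file.find(rle_file.split('_')[-1]) - 1]
# rle_file.split('_')[-1] == '12.txt'; the slice stops one character before it,
# so sys_option == 'CMS_ttHl_thu_shape_ttH_x1Up'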
Code example #9
File: rle_matcher.py  Project: saswatinandan/tth-htt
                                rle=rle,
                                sample_key=sample_key,
                            ))
                        continue

                    file_basename = os.path.basename(grep_stdout)
                    file_idx = int(file_basename[:file_basename.rfind('.')])
                    grep_result = os.path.join(
                        sample_path, '000%d' % (file_idx // 1000),
                        'tree_{i}.root'.format(i=file_idx))
                    rles[rle].append(grep_result)
        else:
            # instead of forming a list of files, loop over the subfolders and the files therein
            logging.debug('Looping over the files in {sample_path}'.format(
                sample_path=sample_path))
            for subdir in hdfs.listdir(sample_path):
                logging.debug(
                    'Found subdirectory {subdir}'.format(subdir=subdir))
                for rootfile in hdfs.listdir(subdir):
                    logging.debug("Processing file '{rootfile}'".format(
                        rootfile=rootfile, ))

                    # open the file
                    ch_root = ROOT.TChain("Events")
                    ch_root.AddFile(rootfile)

                    run_a = array.array('I', [0])
                    lumi_a = array.array('I', [0])
                    evt_a = array.array('L', [0])

                    ch_root.SetBranchAddress("run", run_a)
Code example #10
def get_rles(input_paths, whitelist, blacklist, read_all_systematics):
    has_errors = False
    rles = collections.OrderedDict()
    valid_paths = get_paths(input_paths, whitelist, blacklist)
    for channel_name, channel_dir in valid_paths.items():
        rles[channel_name] = collections.OrderedDict()
        for region_dir in sorted(hdfs.listdir(channel_dir)):
            region_name = os.path.basename(region_dir)
            logging.debug('Found region {} in channel {}'.format(
                region_name, channel_name))
            rles[channel_name][region_name] = collections.OrderedDict()
            for sample_dir in sorted(hdfs.listdir(region_dir)):
                sample_name = os.path.basename(sample_dir)
                if sample_name in SAMPLES_EXCLUDE:
                    continue
                logging.debug(
                    'Found sample {} in region {} and channel {}'.format(
                        sample_name, region_name, channel_name))
                rles[channel_name][region_name][
                    sample_name] = collections.OrderedDict()
                for rle_dir in sorted(hdfs.listdir(sample_dir)):
                    central_or_shift = os.path.basename(rle_dir)
                    if central_or_shift in SYSTEMATICS_EXCLUDE:
                        continue
                    if not read_all_systematics and central_or_shift != SYSTEMATICS_CENTRAL:
                        continue
                    logging.debug(
                        'Found systematics {} for sample {} in region {} and channel {}'
                        .format(central_or_shift, sample_name, region_name,
                                channel_name))
                    rles[channel_name][region_name][sample_name][
                        central_or_shift] = []
                    rle_filenames = sorted(hdfs.listdir(rle_dir))
                    if not rle_filenames:
                        logging.warning(
                            'Directory {} is empty'.format(rle_dir))
                        continue
                    rle_arr = []
                    for rle_filename in rle_filenames:
                        if not rle_filename.endswith('.txt'):
                            raise RuntimeError(
                                "Unexpected extension in file: %s" %
                                rle_filename)
                        with open(rle_filename, 'r') as rle_file:
                            for line in rle_file:
                                line_stripped = line.rstrip('\n')
                                if not REGEX_RLE.match(line_stripped):
                                    raise RuntimeError(
                                        "Unexpected line found in %s: %s" %
                                        (rle_filename, line_stripped))
                                rle = line_stripped
                                if rle in rle_arr:
                                    logging.error(
                                      "Duplicate event %s found in channel %s, region %s, sample %s, systematics %s" % \
                                      (rle, channel_name, region_name, sample_name, central_or_shift)
                                    )
                                    has_errors = True
                                    continue
                                rle_arr.append(rle)
                    logging.debug(
                        'Found {} events in sample {}, region {}, systematics {}, channel {}'
                        .format(len(rle_arr), sample_name, region_name,
                                central_or_shift, channel_name))
                    rles[channel_name][region_name][sample_name][
                        central_or_shift].extend(rle_arr)
    return rles, has_errors
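REGEX_RLE is defined outside the excerpt; since each accepted line is a run:lumi:event triplet (which is what 'rle' abbreviates throughout these examples), a plausible definition is:

import re

# Assumed pattern: one run:lumi:event triplet per line, e.g. '1:2345:6789'.
REGEX_RLE = re.compile(r'^\d+:\d+:\d+$')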
Code example #11
 if nof_jobs < 0:
   logging.error("Unable to parse total number of jobs from file: %s" % crab_logfile)
   continue
 version = os.path.basename(output_dir)
 version_date = version.split('_')[1]
 prefix = '{}_{}'.format(version, chunk_str) if chunk_str else version
 userName = os.path.basename(os.path.dirname(output_dir))
 dataset_requestName = '%s__%s' % (dataset_match.group(1), dataset_match.group(2))
 requestName = '%s_%s' % (prefix, dataset_requestName)
 max_requestname_len = 160 - len(userName)
 if len(requestName) > max_requestname_len:
   requestName = requestName[:max_requestname_len]
 crab_path = os.path.join('/hdfs', 'cms', output_dir[1:], dataset_match.group(1), requestName)
 if hdfs.isdir(crab_path):
   logging.debug("Found directory: {}".format(crab_path))
   subdirs = hdfs.listdir(crab_path)
   if len(subdirs) != 1:
     logging.error("Expected exactly one subdir in {} but found {}: {}".format(
       crab_path, len(subdirs), ', '.join(subdirs)
     ))
     continue
   subdir = subdirs[0]
   root_files = [
     root_file for subsubdir in hdfs.listdir(subdir) for root_file in hdfs.listdir(subsubdir) if root_file.endswith('.root')
   ]
   root_idxs = set(map(lambda fn: int(TREE_REGEX.match(os.path.basename(fn)).group('idx')), root_files))
   assert(not (root_idxs & expected_fails))
   root_idxs = root_idxs | expected_fails
   nof_completed = len(root_idxs) * 100. / nof_jobs
   expected_idxs = set(range(1, nof_jobs + 1))
   assert(not (root_idxs - expected_idxs))
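TREE_REGEX is not shown, but the .group('idx') call above and the tree_<N>.root naming seen in example #9 pin down its rough shape:

import re

# Inferred from the .group('idx') call; verify against the source.
TREE_REGEX = re.compile(r'^tree_(?P<idx>\d+)\.root$')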
Code example #12
File: extract_htxs.py  Project: huiling110/tth-htt
def get_hadd_stage2(input_paths):
    results = {}
    for input_path in input_paths:
        path_split = [
            subpath for subpath in input_path.split(os.path.sep)
            if subpath != ''
        ]
        nof_levels = len(path_split)
        if not (5 < nof_levels < 11):
            raise ValueError("Invalid path: %s" % input_path)

        current_paths = [input_path]
        if nof_levels == 6:
            assert (len(current_paths) == 1)
            current_path = os.path.join(current_paths.pop(), 'histograms')
            if not hdfs.isdir(current_path):
                continue  # skip this input path rather than discarding results gathered so far
            current_paths = [current_path]
            nof_levels += 1
        if nof_levels == 7:
            assert (len(current_paths) == 1)
            current_path = current_paths.pop()
            current_paths = hdfs.listdir(current_path)
            nof_levels += 1
        if nof_levels == 8:
            next_paths = []
            for current_path in current_paths:
                region_paths = hdfs.listdir(current_path)
                for region_path in region_paths:
                    next_paths.append(region_path)
            current_paths = next_paths
            nof_levels += 1
        if nof_levels == 9:
            next_paths = []
            for current_path in current_paths:
                for next_path in hdfs.listdir(current_path):
                    next_path_basename = os.path.basename(next_path)
                    if next_path_basename == 'hadd':
                        next_paths.append(next_path)
            current_paths = next_paths
            nof_levels += 1
        if nof_levels == 10:
            next_paths = []
            for current_path in current_paths:
                candidate_files = []
                for candidate_file in hdfs.listdir(current_path):
                    if not hdfs.isfile(candidate_file):
                        continue
                    candidate_file_basename = os.path.basename(candidate_file)
                    if candidate_file_basename.startswith('hadd_stage2') and \
                       not HADD_STAGE2_RE.match(candidate_file_basename.split('.')[0]):
                        candidate_files.append(candidate_file)
                if candidate_files:
                    assert (len(candidate_files) == 1)
                    next_paths.append(candidate_files[0])
            current_paths = next_paths
        for current_path in current_paths:
            current_path_split = [
                subpath for subpath in current_path.split(os.path.sep)
                if subpath != ''
            ]
            channel = current_path_split[7]
            region = current_path_split[8]
            channel_region = '{}_{}'.format(channel, region)
            if channel_region in results:
                raise RuntimeError(
                  "Found two paths corresponding to the same channel (%s) and region (%s): %s and %s" % \
                  (channel, region, current_path, results[channel_region])
                )
            results[channel_region] = current_path
            logging.debug(
                'Found hadd stage2 file corresponding to channel {} and region {}: {}'
                .format(channel, region, current_path))

    return [results[k] for k in sorted(results.keys())]
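A hypothetical invocation, mirroring the path depth checked at the top of the function (the path is made up for illustration):

# Hypothetical 6-level input path, as in example #2.
hadd_stage2_files = get_hadd_stage2(
    ['/hdfs/local/username/ttHAnalysis/2017/2021Jan01'])
for path in hadd_stage2_files:
    print(path)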