Example 1
def do_cuts(args):
    from root_optimize.timing import secondsToStr

    # before doing anything, let's ensure the directory we make is ok
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    else:
        raise IOError("Output directory already exists: {0:s}".format(
            args.output_directory))

    # first step is to group by the sample DID
    dids = defaultdict(list)
    for fname in args.files:
        dids[utils.get_did(fname)].append(fname)

    # load in the supercuts file
    supercuts = utils.read_supercuts_file(args.supercuts)

    # load up the weights file
    if not os.path.isfile(args.weightsFile):
        raise ValueError(
            'The supplied weights file `{0}` does not exist or I cannot find it.'
            .format(args.weightsFile))
    else:
        # file() was Python 2 only; open() keeps this runnable under Python 3
        with open(args.weightsFile) as weights_file:
            weights = json.load(weights_file)

    # parallelize
    num_cores = min(multiprocessing.cpu_count(), args.num_cores)
    logger.log(25, "Using {0} cores".format(num_cores))
    # dict.iteritems() was Python 2 only; items() works on both
    results = Parallel(n_jobs=num_cores)(delayed(utils.do_cut)(
        did, files, supercuts, weights, args.tree_name, args.output_directory,
        args.eventWeightBranch, args.numpy) for did, files in dids.items())

    for did, result in zip(dids, results):
        logger.log(
            25, 'DID {0:s}: {1:s}'.format(did,
                                          'ok' if result[0] else 'not ok'))

    logger.log(
        25, "Total CPU elapsed time: {0}".format(
            secondsToStr(sum(result[1] for result in results))))

    return True
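
These snippets assume the module-level context of root_optimize's scripts: os, json, multiprocessing, collections.defaultdict, joblib's Parallel and delayed, plus the package's own utils and logger. A minimal sketch of driving the function directly, bypassing the CLI; every value below is a placeholder, and 'oTree' and 'event_weight' are hypothetical names, not ones the package prescribes:

from types import SimpleNamespace

args = SimpleNamespace(
    files=['sample1.root', 'sample2.root'],  # placeholder input files
    output_directory='cuts_output',          # must not exist yet (no overwrite flag here)
    supercuts='supercuts.json',
    weightsFile='weights.json',
    tree_name='oTree',                       # hypothetical tree name
    eventWeightBranch='event_weight',        # hypothetical branch name
    num_cores=4,
    numpy=False,
)
do_cuts(args)
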
Example 2
def do_cuts(args):
    from root_optimize.timing import secondsToStr

    # before doing anything, let's ensure the directory we make is ok
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    elif args.overwrite:
        import shutil
        shutil.rmtree(args.output_directory)
    else:
        raise IOError("Output directory already exists: {0:s}".format(
            args.output_directory))

    # first step is to group by the sample DID
    dids = defaultdict(list)
    for fname in args.files:
        dids[utils.get_did(fname)].append(fname)

    # load in the supercuts file
    supercuts = utils.read_supercuts_file(args.supercuts)

    # load up the weights file
    if not os.path.isfile(args.weightsFile):
        raise ValueError(
            'The supplied weights file `{0}` does not exist or I cannot find it.'
            .format(args.weightsFile))
    else:
        # file() was Python 2 only; open() keeps this runnable under Python 3
        with open(args.weightsFile) as weights_file:
            weights = json.load(weights_file)

    # parallelize
    num_cores = min(multiprocessing.cpu_count(), args.num_cores)
    logger.log(25, "Using {0} cores".format(num_cores))

    pids = None
    # if pids is None, do_cut() will disable the progress
    if not args.hide_subtasks:
        from numpy import memmap, uint64
        pids = memmap(os.path.join(tempfile.mkdtemp(), 'pids'),
                      dtype=uint64,
                      shape=num_cores,
                      mode='w+')

    overall_progress = tqdm.tqdm(total=len(dids),
                                 desc='Num. files',
                                 position=0,
                                 leave=True,
                                 unit='file',
                                 dynamic_ncols=True)

    class CallBack(object):
        completed = defaultdict(int)

        def __init__(self, index, parallel):
            self.index = index
            self.parallel = parallel

        def __call__(self, index):
            CallBack.completed[self.parallel] += 1
            overall_progress.update()
            overall_progress.refresh()
            if self.parallel._original_iterable:
                self.parallel.dispatch_next()

    # Monkey-patch joblib's completion callback so every finished task
    # advances the overall bar (targets joblib < 0.10, where the hook was
    # named CallBack; newer releases use BatchCompletionCallBack instead)
    import joblib.parallel
    joblib.parallel.CallBack = CallBack

    results = Parallel(n_jobs=num_cores)(delayed(utils.do_cut)(
        did, files, supercuts, weights, args.tree_name, args.output_directory,
        args.eventWeightBranch, args.numpy, pids)
                                         for did, files in dids.items())

    overall_progress.close()

    for did, result in zip(dids, results):
        logger.log(
            25, 'DID {0:s}: {1:s}'.format(did,
                                          'ok' if result[0] else 'not ok'))

    logger.log(
        25, "Total CPU elapsed time: {0}".format(
            secondsToStr(sum(result[1] for result in results))))

    return True
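
Against modern joblib (>= 0.10) the same progress-reporting trick is usually written as a context manager that temporarily replaces BatchCompletionCallBack and restores it afterwards. A hedged sketch: it leans on joblib internals that can change between releases, so pin your joblib version if you rely on it.

import contextlib

import joblib.parallel
from tqdm import tqdm

@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    # Subclass the internal callback so each completed batch advances the bar
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    old_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        # Always restore the original callback, even if the body raises
        joblib.parallel.BatchCompletionCallBack = old_callback
        tqdm_object.close()

Usage, with dummy tasks standing in for utils.do_cut:

import math
from joblib import Parallel, delayed

with tqdm_joblib(tqdm(total=100, desc='tasks', unit='task')):
    Parallel(n_jobs=4)(delayed(math.sqrt)(i) for i in range(100))
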
Example 3
def do_cuts(args):
    from root_optimize.timing import secondsToStr

    # before doing anything, let's ensure the directory we make is ok
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    elif args.overwrite:
        import shutil

        shutil.rmtree(args.output_directory)
    else:
        raise IOError("Output directory already exists: {0:s}".format(
            args.output_directory))

    tree_patterns = [
        re.compile(str.encode(fnmatch.translate(tree_pattern)))
        for tree_pattern in args.tree_patterns
    ]

    # first step is to group by the tree name
    trees = defaultdict(list)
    for fname in args.files:
        with uproot.open(fname) as f:
            # the sort must wrap the set, otherwise its result is discarded;
            # splitting on b';' drops the ROOT cycle suffix (e.g. ';1')
            tree_names = sorted(
                set(
                    tname.split(b';')[0]
                    for tname in f.allkeys(filterclass=lambda cls: issubclass(
                        cls, uproot.tree.TTreeMethods))))
            logger.log(25,
                       "{0:s} has {1:d} trees".format(fname, len(tree_names)))
            for tree_name in tree_names:
                matched = any(
                    tree_pattern.search(tree_name)
                    for tree_pattern in tree_patterns)
                if matched:
                    trees[tree_name].append(fname)
                logger.log(
                    25,
                    "  - [{1:s}] {0:s}".format(tree_name.decode('utf-8'),
                                               "x" if matched else " "),
                )

    # load in the supercuts file
    supercuts = utils.read_supercuts_file(args.supercuts)

    branchesSpecified = utils.supercuts_to_branches(supercuts)
    eventWeightBranchesSpecified = utils.extract_branch_names(
        args.eventWeightBranch)
    proposedBranches = set(
        map(str.encode,
            itertools.chain(branchesSpecified, eventWeightBranchesSpecified)))

    # parallelize
    num_cores = min(multiprocessing.cpu_count(), args.num_cores)
    logger.log(25, "Using {0} cores".format(num_cores))

    pids = None
    # if pids is None, do_cut() will disable the progress
    if not args.hide_subtasks:
        from numpy import memmap, uint64

        pids = memmap(
            os.path.join(tempfile.mkdtemp(), "pids"),
            dtype=uint64,
            shape=num_cores,
            mode="w+",
        )

    overall_progress = tqdm.tqdm(
        total=len(trees),
        desc="Num. trees",
        position=0,
        leave=True,
        unit="tree",
        ncols=120,
        miniters=1,
    )

    class BatchCompletionCallBack(object):
        completed = defaultdict(int)

        def __init__(self, time, index, parallel):
            self.index = index
            self.parallel = parallel

        def __call__(self, index):
            BatchCompletionCallBack.completed[self.parallel] += 1
            overall_progress.update()
            # overall_progress.refresh()
            if self.parallel._original_iterator is not None:
                self.parallel.dispatch_next()

    # Swap in the custom callback; joblib >= 0.10 dispatches through
    # BatchCompletionCallBack(dispatch_timestamp, batch_size, parallel)
    import joblib.parallel

    joblib.parallel.BatchCompletionCallBack = BatchCompletionCallBack

    with utils.std_out_err_redirect_tqdm():
        results = Parallel(n_jobs=num_cores)(delayed(utils.do_cut)(
            tree_name,
            files,
            supercuts,
            proposedBranches,
            args.output_directory,
            args.eventWeightBranch,
            pids,
        ) for tree_name, files in trees.items())

    overall_progress.close()

    for tree_name, result in zip(trees, results):
        logger.log(
            25,
            "Tree {0:s}: {1:s}".format(tree_name.decode('utf-8'),
                                       "ok" if result[0] else "not ok"),
        )

    logger.log(
        25,
        "Total CPU elapsed time: {0}".format(
            secondsToStr(sum(result[1] for result in results))),
    )

    return True
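
This version still targets the uproot 3 API (allkeys, uproot.tree.TTreeMethods, bytes-valued keys). Under uproot 4 and later, where keys() accepts filter_classname and cycle arguments and returns plain strings, the tree-matching step can be sketched as follows; matching_trees is an illustrative helper, not part of root_optimize:

import fnmatch
import re

import uproot  # uproot >= 4 assumed here

def matching_trees(fname, patterns):
    """Return the TTree names in fname that match any glob pattern."""
    regexes = [re.compile(fnmatch.translate(p)) for p in patterns]
    with uproot.open(fname) as f:
        # filter_classname keeps only TTrees; cycle=False drops the ';1' suffix
        names = set(f.keys(filter_classname='TTree', cycle=False))
    return sorted(name for name in names if any(r.search(name) for r in regexes))
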
Example 4
def do_cuts(args):
  from root_optimize.timing import secondsToStr

  # before doing anything, let's ensure the directory we make is ok
  if not os.path.exists(args.output_directory):
    os.makedirs(args.output_directory)
  elif args.overwrite:
    import shutil
    shutil.rmtree(args.output_directory)
  else:
    raise IOError("Output directory already exists: {0:s}".format(args.output_directory))

  # first step is to group by the sample DID
  dids = defaultdict(list)
  for fname in args.files:
    dids[utils.get_did(fname)].append(fname)

  # load in the supercuts file
  supercuts = utils.read_supercuts_file(args.supercuts)

  # load up the weights file
  if not os.path.isfile(args.weightsFile):
    raise ValueError('The supplied weights file `{0}` does not exist or I cannot find it.'.format(args.weightsFile))
  else:
    # file() was Python 2 only; open() keeps this runnable under Python 3
    with open(args.weightsFile) as weights_file:
      weights = json.load(weights_file)

  # parallelize
  num_cores = min(multiprocessing.cpu_count(), args.num_cores)
  logger.log(25, "Using {0} cores".format(num_cores) )

  pids = None
  # if pids is None, do_cut() will disable the progress
  if not args.hide_subtasks:
    from numpy import memmap, uint64
    pids = memmap(os.path.join(tempfile.mkdtemp(), 'pids'), dtype=uint64, shape=num_cores, mode='w+')

  overall_progress = tqdm.tqdm(total=len(dids), desc='Num. files', position=0, leave=True, unit='file', dynamic_ncols=True)

  class CallBack(object):
    completed = defaultdict(int)

    def __init__(self, index, parallel):
      self.index = index
      self.parallel = parallel

    def __call__(self, index):
      CallBack.completed[self.parallel] += 1
      overall_progress.update()
      overall_progress.refresh()
      if self.parallel._original_iterable:
        self.parallel.dispatch_next()

  import joblib.parallel
  joblib.parallel.CallBack = CallBack

  # dict.iteritems() was Python 2 only; items() works on both
  results = Parallel(n_jobs=num_cores)(delayed(utils.do_cut)(did, files, supercuts, weights, args.tree_name, args.output_directory, args.eventWeightBranch, args.numpy, pids) for did, files in dids.items())

  overall_progress.close()

  for did, result in zip(dids, results):
    logger.log(25, 'DID {0:s}: {1:s}'.format(did, 'ok' if result[0] else 'not ok'))

  logger.log(25, "Total CPU elapsed time: {0}".format(secondsToStr(sum(result[1] for result in results))))

  return True
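
The pids memmap is shared with the worker processes so each one can claim a stable slot, and hence a stable tqdm position, for its per-task progress bar. A sketch of that slot-claiming idea; claim_slot is a hypothetical helper, not the actual root_optimize code, and a fully race-free version would guard the scan with a lock:

import os
import tempfile

import numpy as np

def claim_slot(pids):
    """Register this process's PID in the shared array and return its slot index."""
    pid = np.uint64(os.getpid())
    taken = np.flatnonzero(pids == pid)
    if taken.size:  # this worker already owns a slot
        return int(taken[0])
    free = np.flatnonzero(pids == 0)  # claim the first empty slot
    pids[free[0]] = pid
    pids.flush()
    return int(free[0])

# the parent allocates one slot per worker, exactly as in the examples above
num_cores = 4
pids = np.memmap(os.path.join(tempfile.mkdtemp(), 'pids'),
                 dtype=np.uint64, shape=num_cores, mode='w+')
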