Example #1
0
 def evaluate(self, ast_node):
     """Evaluate ``ast_node`` by generating and running a throw-away script.

     ``ast_node`` is first walked with ``ast_visitor`` (presumably this is
     what fills in the ``rep`` attributes used below -- confirm against the
     visitor).  A script ``temp.py`` is then written that loads the
     dataset, evaluates ``ast_node.rep`` and saves the result to
     ``output.awkd``.  The script is run with ``python`` and the saved
     result is loaded back and returned.

     All scratch files (``temp.py``, ``temp.awkd``, ``output.awkd``) live in
     the current working directory.

     :param ast_node: query node exposing ``source`` and ``rep`` attributes
     :return: the array loaded from ``output.awkd``
     :raises BaseException: if ``self.dataset_source`` is a path that ends
         in neither ``.awkd`` nor ``.root``
     """
     qv = ast_visitor()
     qv.visit(ast_node)

     # A string dataset source is used directly as a path; anything else is
     # first dumped to a scratch .awkd file that this method owns.
     source_is_path = isinstance(self.dataset_source, str)
     if source_is_path:
         data_pathname = self.dataset_source
     else:
         data_pathname = 'temp.awkd'
         awkward.save(data_pathname, self.dataset_source)

     try:
         # Follow the chain of ``source`` attributes down to the innermost one.
         source = ast_node.source
         while hasattr(source, 'source'):
             source = source.source

         # ``with`` guarantees the script handle is closed even when the
         # unsupported-file-type branch raises.
         with open('temp.py', 'w') as f:
             f.write('import awkward\n')
             if data_pathname.endswith('.awkd'):
                 f.write(source.rep + " = awkward.load('" + data_pathname + "')\n")
             elif data_pathname.endswith('.root'):
                 f.write('import uproot\n')
                 f.write("input_file = uproot.open('" + data_pathname + "')\n")
                 f.write(
                     source.rep +
                     " = input_file[input_file.keys()[0]].lazyarrays(namedecode='utf-8')\n"
                 )
             else:
                 # Exception type kept as-is for callers that catch it.
                 raise BaseException('unimplemented file type: ' + data_pathname)
             f.write('output_array = awkward.fromiter(' + ast_node.rep + ')\n')
             f.write("awkward.save('output.awkd', output_array)\n")
         os.system('python temp.py')
     finally:
         # Remove scratch files on both the success and the failure path.
         if not source_is_path:
             os.remove(data_pathname)
         if os.path.exists('temp.py'):
             os.remove('temp.py')

     output = awkward.load('output.awkd')
     os.remove('output.awkd')
     return output
Example #2
0
def save_awk(scores, labels, observers):
    """
    Write the scores, labels and observers to an ``.awkd`` file.

    The arrays are collected into one dict (``'scores'`` plus every entry of
    ``labels`` and ``observers``).  Whenever one array name is a prefix of
    another, the shorter name is renamed to ``<name>_<n>`` before saving.
    The destination is taken from the module-level ``args.predict_output``.

    :param scores: array stored under the key ``'scores'``
    :param labels: dict of arrays merged into the output
    :param observers: dict of arrays merged into the output
    :return: None
    """
    import awkward

    arrays = {'scores': scores}
    arrays.update(labels)
    arrays.update(observers)

    # Compare every unordered pair of names once; the name that is a prefix
    # of the other one gets a numbered suffix.
    renamed = {}
    names = list(arrays)
    for pos, first in enumerate(names):
        for second in names[pos + 1:]:
            if first.startswith(second):
                renamed[second] = '%s_%d' % (second, len(renamed))
            if second.startswith(first):
                renamed[first] = '%s_%d' % (first, len(renamed))
    _logger.info('Renamed the following variables in the output file: %s',
                 str(renamed))

    arrays = {renamed.get(key, key): value for key, value in arrays.items()}

    awkward.save(args.predict_output, arrays, mode='w')
Example #3
0
def merge_npzs_to_ak(rawdir, outfile=None, nmax=None):
    """
    Loops over all .npz files in rawdir, stacks all events into an ak array,
    and dumps it to a file.

    :param rawdir: directory handed to ``_iter_npzs``
    :param outfile: destination file; defaults to ``merged.awkd`` placed in
        the parent directory of ``rawdir``
    :param nmax: forwarded to ``_iter_npzs`` (presumably a cap on how much
        is read -- confirm against that helper)
    """
    if outfile is None:
        # Use join rather than string concatenation: for a bare relative
        # rawdir such as 'raw' the dirname is '', and appending
        # '/merged.awkd' would point at the filesystem root.
        outfile = osp.join(osp.dirname(rawdir), 'merged.awkd')
    bbefp.logger.info(f'Merging {rawdir} --> {outfile}')
    merged = ak.fromiter(_iter_npzs(rawdir, nmax))
    ak.save(outfile, ak_transpose(merged))
Example #4
0
def main():
    """Loop over all combinations of mass and width.

    Builds evenly spaced mass and width grids from the parsed arguments,
    runs ``run(nevents, mass, width)`` for every grid point, collects all
    yielded events into one awkward array and saves it to ``events.awkd``.
    """
    args = parse_input()

    def grid(lower, upper, step):
        # np.linspace needs an integer number of samples; np.ceil returns a
        # float, which current NumPy rejects with a TypeError.
        npoints = int(np.ceil((upper - lower) / step)) + 1
        return np.linspace(lower, upper, npoints)

    masses = grid(args.mass_min, args.mass_max, args.mass_step)
    widths = grid(args.width_min, args.width_max, args.width_step)

    def generator():
        # Stream events from every (mass, width) point, advancing the
        # progress bar by one batch of events per grid point.
        with tqdm.tqdm(unit='event',
                       total=masses.size * widths.size * args.nevents,
                       desc='Generating') as pbar:
            for mass in masses:
                for width in widths:
                    yield from run(args.nevents, mass, width)
                    pbar.update(args.nevents)

    events = ak.fromiter(generator())
    ak.save('events.awkd', events, mode='w')
Example #5
0
def convert(source, destdir, basename, step=None, limit=None):
    """Split the ``'table'`` dataset of an HDF file into ``.awkd`` chunks.

    The table is read with pandas, optionally truncated to the first
    ``limit`` rows, and processed in consecutive windows of ``step`` rows
    (a single window covering everything when ``step`` is None).  Each
    window is passed through ``_transform`` and saved as
    ``<destdir>/<basename>_<index>.awkd``.  Chunks whose output file already
    exists are skipped.
    """
    df = pd.read_hdf(source, key='table')
    logging.info('Total events: %s' % df.shape[0])
    if limit is not None:
        df = df.iloc[0:limit]
        logging.info('Restricting to the first %s events:' % df.shape[0])

    chunk_size = df.shape[0] if step is None else step

    chunk = 0
    while chunk * chunk_size < df.shape[0]:
        first_row = chunk * chunk_size
        if not os.path.exists(destdir):
            os.makedirs(destdir)
        output = os.path.join(destdir, '%s_%d.awkd' % (basename, chunk))
        # Advance now so the skip branch below moves on to the next chunk.
        chunk += 1
        logging.info(output)
        if os.path.exists(output):
            logging.warning('... file already exist: continue ...')
            continue
        v = _transform(df, start=first_row, stop=first_row + chunk_size)
        # mode='x' is passed through to awkward.save unchanged.
        awkward.save(output, v, mode='x')
Example #6
0
def main():
    """Command-line entry point: train, evaluate or export a network.

    Depending on the parsed options this function does one of:

    * ``--export-onnx PATH``: load the weights at ``--model-prefix`` and
      export the model to ONNX, then return;
    * ``--lr-finder start,end,iters``: run the learning-rate range test,
      write ``lr_finder.png`` and return;
    * default (training): train for ``--num-epochs`` epochs, saving the model
      and optimizer state after every epoch and tracking the best
      validation accuracy;
    * ``--predict``: evaluate the model on ``--data-test`` and, if
      ``--predict-output`` is given, write scores/labels/observers to a
      ``.root`` file or (any other extension) an awkward file.

    Raises:
        NotImplementedError: if ``--use-amp`` is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--data-config',
                        type=str,
                        default='data/ak15_points_pf_sv_v0.yaml',
                        help='data config YAML file')
    parser.add_argument('-i',
                        '--data-train',
                        nargs='*',
                        default=[],
                        help='training files')
    parser.add_argument('-t',
                        '--data-test',
                        nargs='*',
                        default=[],
                        help='testing files')
    parser.add_argument(
        '--data-fraction',
        type=float,
        default=1,
        help=
        'fraction of events to load from each file; for training, the events are randomly selected for each epoch'
    )
    parser.add_argument(
        '--data-dilation',
        type=int,
        default=1,
        help=
        'reduce number of file by a factor of `d` for training. NOT recommended in general - use `--data-fraction` instead.'
    )
    parser.add_argument(
        '--files-per-fetch',
        type=int,
        default=20,
        help=
        'number of files to load each time; shuffling is done within these events, so choose a number large enough to get events from all classes'
    )
    parser.add_argument('--train-val-split',
                        type=float,
                        default=0.8,
                        help='training/validation split fraction')
    parser.add_argument(
        '--demo',
        action='store_true',
        default=False,
        help=
        'quickly test the setup by running over only a small number of events')
    parser.add_argument(
        '--lr-finder',
        type=str,
        default=None,
        help=
        'run learning rate finder instead of the actual training; format: ``start_lr, end_lr, num_iters``'
    )
    parser.add_argument(
        '-n',
        '--network-config',
        type=str,
        default='networks/particle_net_pfcand_sv.py',
        help=
        'network architecture configuration file; the path must be relative to the current dir'
    )
    parser.add_argument(
        '--network-option',
        nargs=2,
        action='append',
        default=[],
        help=
        'options to pass to the model class constructor, e.g., `--network-option use_counts False`'
    )
    parser.add_argument(
        '-m',
        '--model-prefix',
        type=str,
        default='test_output/model_name',
        help=
        'path to save or load the model; for training, this will be used as a prefix; for testing, this should be the full path including extension'
    )
    parser.add_argument('--num-epochs',
                        type=int,
                        default=20,
                        help='number of epochs')
    parser.add_argument(
        '--optimizer',
        type=str,
        default='ranger',
        choices=['adam', 'ranger'],  # TODO: add more
        help='optimizer for the training')
    # argparse %-formats help strings, so a literal percent sign has to be
    # written as `%%`; a bare `%d` makes `--help` crash.
    parser.add_argument(
        '--load-epoch',
        type=int,
        default=None,
        help=
        'used to resume interrupted training, load model and optimizer state saved in the `epoch-%%d_state.pt` and `epoch-%%d_optimizer.pt` files'
    )
    parser.add_argument('--start-lr',
                        type=float,
                        default=5e-3,
                        help='start learning rate')
    parser.add_argument(
        '--lr-steps',
        type=str,
        default='10,20',
        help=
        'steps to reduce the lr; currently only used when setting `--optimizer` to adam'
    )
    parser.add_argument('--batch-size',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument(
        '--use-amp',
        action='store_true',
        default=False,
        help='use mixed precision training (fp16); NOT WORKING YET')
    parser.add_argument(
        '--gpus',
        type=str,
        default='0',
        help=
        "device for the training/testing; to use CPU, set to empty string (''); to use multiple gpu, set it as a comma separated list, e.g., `1,2,3,4`"
    )
    parser.add_argument(
        '--num-workers',
        type=int,
        default=2,
        help=
        'number of threads to load the dataset; memory consuption and disk access load increases (~linearly) with this numbers'
    )
    parser.add_argument('--predict',
                        action='store_true',
                        default=False,
                        help='run prediction instead of training')
    parser.add_argument(
        '--predict-output',
        type=str,
        help=
        'path to save the prediction output, support `.root` and `.awkd` format'
    )
    parser.add_argument(
        '--export-onnx',
        type=str,
        default=None,
        help=
        'export the PyTorch model to ONNX model and save it at the given path (path must ends w/ .onnx); '
        'needs to set `--data-config`, `--network-config`, and `--model-prefix` (requires the full model path)'
    )

    args = parser.parse_args()
    _logger.info(args)

    # Mixed precision is not supported yet; bail out before any setup.
    if args.use_amp:
        raise NotImplementedError


#         from apex import amp

    if args.data_dilation > 1:
        _logger.warning(
            'Use of `data-dilation` is not recomended in general -- consider using `data-fraction` instead.'
        )

    # training/testing mode
    training_mode = not args.predict

    # device: the first listed GPU is the primary device, an empty string
    # selects the CPU
    if args.gpus:
        gpus = [int(i) for i in args.gpus.split(',')]
        dev = torch.device(gpus[0])
    else:
        gpus = None
        dev = torch.device('cpu')

    # load data
    if training_mode:
        filelist = sorted(sum([glob.glob(f) for f in args.data_train], []))
        # np.random.seed(1)
        np.random.shuffle(filelist)
        if args.demo:
            filelist = filelist[:20]
            _logger.info(filelist)
            args.data_fraction = 0.1
            args.files_per_fetch = 5
        # Training and validation read the same files but different
        # partial_load ranges: (0, split) versus (split, 1).
        train_data = SimpleIterDataset(filelist,
                                       args.data_config,
                                       for_training=True,
                                       partial_load=((0, args.train_val_split),
                                                     args.data_fraction),
                                       dilation=args.data_dilation,
                                       files_per_fetch=args.files_per_fetch)
        val_data = SimpleIterDataset(filelist,
                                     args.data_config,
                                     for_training=True,
                                     partial_load=((args.train_val_split, 1),
                                                   args.data_fraction),
                                     dilation=args.data_dilation,
                                     files_per_fetch=args.files_per_fetch)
        train_loader = DataLoader(train_data,
                                  num_workers=args.num_workers,
                                  batch_size=args.batch_size,
                                  drop_last=True,
                                  pin_memory=True)
        val_loader = DataLoader(val_data,
                                num_workers=args.num_workers,
                                batch_size=args.batch_size,
                                drop_last=True,
                                pin_memory=True)
        data_config = train_data.config
    else:
        filelist = sorted(sum([glob.glob(f) for f in args.data_test], []))
        test_data = SimpleIterDataset(filelist,
                                      args.data_config,
                                      for_training=False,
                                      files_per_fetch=1)
        test_loader = DataLoader(test_data,
                                 num_workers=args.num_workers,
                                 batch_size=args.batch_size,
                                 drop_last=False,
                                 pin_memory=True)
        data_config = test_data.config

    # model: the network config path is turned into a dotted module name
    network_module = import_module(
        args.network_config.replace('.py', '').replace('/', '.'))
    network_options = {k: ast.literal_eval(v) for k, v in args.network_option}
    if args.export_onnx:
        network_options['for_inference'] = True
    model, model_info = network_module.get_model(data_config,
                                                 **network_options)
    _logger.info(model)

    # export to ONNX
    if args.export_onnx:
        assert (args.export_onnx.endswith('.onnx'))
        model_path = args.model_prefix
        _logger.info('Exporting model %s to ONNX' % model_path)
        model.load_state_dict(torch.load(model_path, map_location='cpu'))
        model = model.cpu()
        model.eval()

        # os.makedirs('') raises FileNotFoundError, so only create the
        # directory when the path actually has a directory component.
        onnx_dir = os.path.dirname(args.export_onnx)
        if onnx_dir:
            os.makedirs(onnx_dir, exist_ok=True)
        inputs = tuple(
            torch.ones(model_info['input_shapes'][k], dtype=torch.float32)
            for k in model_info['input_names'])
        torch.onnx.export(model,
                          inputs,
                          args.export_onnx,
                          input_names=model_info['input_names'],
                          output_names=model_info['output_names'],
                          dynamic_axes=model_info.get('dynamic_axes', None),
                          opset_version=11)
        _logger.info('ONNX model saved to %s', args.export_onnx)
        return

    # note: we should always save/load the state_dict of the original model, not the one wrapped by nn.DataParallel
    # so we do not convert it to nn.DataParallel now
    model = model.to(dev)

    # loss function: fall back to cross entropy when the network module
    # does not define `get_loss`
    try:
        loss_func = network_module.get_loss(data_config, **network_options)
        _logger.info(loss_func)
    except AttributeError:
        loss_func = torch.nn.CrossEntropyLoss()
        _logger.warning(
            'Loss function not defined in %s. Will use `torch.nn.CrossEntropyLoss()` by default.',
            args.network_config)

    if training_mode:
        # optimizer & learning rate (no scheduler is built for the lr finder,
        # which returns before the training loop)
        if args.optimizer == 'adam':
            opt = torch.optim.Adam(model.parameters(), lr=args.start_lr)
            if args.lr_finder is None:
                lr_steps = [int(x) for x in args.lr_steps.split(',')]
                scheduler = torch.optim.lr_scheduler.MultiStepLR(
                    opt, milestones=lr_steps, gamma=0.1)
        else:
            from utils.nn.optimizer.ranger import Ranger
            opt = Ranger(model.parameters(), lr=args.start_lr)
            if args.lr_finder is None:
                # Decay over the last 30% of the epochs, with a per-epoch
                # factor chosen so the total reduction is 0.01.
                lr_decay_epochs = max(1, int(args.num_epochs * 0.3))
                lr_decay_rate = 0.01**(1. / lr_decay_epochs)
                scheduler = torch.optim.lr_scheduler.MultiStepLR(
                    opt,
                    milestones=list(
                        range(args.num_epochs - lr_decay_epochs,
                              args.num_epochs)),
                    gamma=lr_decay_rate)

        # TODO: mixed precision training
        # NOTE: currently unreachable -- `--use-amp` raises
        # NotImplementedError right after argument parsing, and `amp` is not
        # imported in this function.
        if args.use_amp:
            #             model, opt = amp.initialize(
            #                model, opt, opt_level="O2",
            #                keep_batchnorm_fp32=True, loss_scale="dynamic"
            #             )
            model, opt = amp.initialize(model,
                                        opt,
                                        opt_level="O1",
                                        keep_batchnorm_fp32=None,
                                        loss_scale="dynamic")

        # load previous training and resume if `--load-epoch` is set
        if args.load_epoch is not None:
            _logger.info('Resume training from epoch %d' % args.load_epoch)
            model_state = torch.load(args.model_prefix +
                                     '_epoch-%d_state.pt' % args.load_epoch,
                                     map_location=dev)
            model.load_state_dict(model_state)
            opt_state = torch.load(args.model_prefix +
                                   '_epoch-%d_optimizer.pt' % args.load_epoch,
                                   map_location=dev)
            opt.load_state_dict(opt_state)

        # multi-gpu
        if gpus is not None and len(gpus) > 1:
            model = torch.nn.DataParallel(
                model, device_ids=gpus
            )  # model becomes `torch.nn.DataParallel` w/ model.module being the original `torch.nn.Module`
        model = model.to(dev)

        # lr finder: keep it after all other setups
        if args.lr_finder is not None:
            start_lr, end_lr, num_iter = args.lr_finder.replace(' ',
                                                                '').split(',')
            from utils.lr_finder import LRFinder
            lr_finder = LRFinder(model,
                                 opt,
                                 loss_func,
                                 device=dev,
                                 input_names=train_data.config.input_names,
                                 label_names=train_data.config.label_names)
            lr_finder.range_test(train_loader,
                                 start_lr=float(start_lr),
                                 end_lr=float(end_lr),
                                 num_iter=int(num_iter))
            lr_finder.plot(output='lr_finder.png'
                           )  # to inspect the loss-learning rate graph
            return

        # training loop
        best_valid_acc = 0
        for epoch in range(args.num_epochs):
            # when resuming, skip the epochs up to and including the loaded one
            if args.load_epoch is not None:
                if epoch <= args.load_epoch:
                    continue
            print('-' * 50)
            _logger.info('Epoch #%d training' % epoch)
            train(model, loss_func, opt, scheduler, train_loader, dev)
            if args.model_prefix:
                dirname = os.path.dirname(args.model_prefix)
                if dirname and not os.path.exists(dirname):
                    os.makedirs(dirname)
                # always save the state of the unwrapped model
                state_dict = model.module.state_dict() if isinstance(
                    model, torch.nn.DataParallel) else model.state_dict()
                torch.save(state_dict,
                           args.model_prefix + '_epoch-%d_state.pt' % epoch)
                torch.save(
                    opt.state_dict(),
                    args.model_prefix + '_epoch-%d_optimizer.pt' % epoch)

            _logger.info('Epoch #%d validating' % epoch)
            valid_acc = evaluate(model, val_loader, dev, loss_func=loss_func)
            if valid_acc > best_valid_acc:
                best_valid_acc = valid_acc
                if args.model_prefix:
                    shutil.copy2(
                        args.model_prefix + '_epoch-%d_state.pt' % epoch,
                        args.model_prefix + '_best_acc_state.pt')
                    torch.save(model, args.model_prefix + '_best_acc_full.pt')
            _logger.info(
                'Epoch #%d: Current validation acc: %.5f (best: %.5f)' %
                (epoch, valid_acc, best_valid_acc))
    else:
        # run prediction
        if args.model_prefix.endswith('.onnx'):
            _logger.info('Loading model %s for eval' % args.model_prefix)
            from utils.nn.tools import evaluate_onnx
            test_acc, scores, labels, observers = evaluate_onnx(
                args.model_prefix, test_loader)
        else:
            # a prefix without `.pt` refers to the best-accuracy checkpoint
            model_path = args.model_prefix if args.model_prefix.endswith(
                '.pt') else args.model_prefix + '_best_acc_state.pt'
            _logger.info('Loading model %s for eval' % model_path)
            model.load_state_dict(torch.load(model_path, map_location=dev))
            if gpus is not None and len(gpus) > 1:
                model = torch.nn.DataParallel(model, device_ids=gpus)
            model = model.to(dev)
            test_acc, scores, labels, observers = evaluate(model,
                                                           test_loader,
                                                           dev,
                                                           for_training=False)
        _logger.info('Test acc %.5f' % test_acc)

        if args.predict_output:
            # A bare filename has an empty dirname, which os.makedirs rejects.
            output_dir = os.path.dirname(args.predict_output)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            if args.predict_output.endswith('.root'):
                from utils.data.fileio import _write_root
                output = {}
                # one truth flag and one score column per label value
                for idx, label_name in enumerate(data_config.label_value):
                    output[label_name] = (
                        labels[data_config.label_names[0]] == idx)
                    output['score_' + label_name] = scores[:, idx]
                for k, v in labels.items():
                    if k == data_config.label_names[0]:
                        continue
                    if v.ndim > 1:
                        _logger.warning('Ignoring %s, not a 1d array.', k)
                        continue
                    output[k] = v
                for k, v in observers.items():
                    if v.ndim > 1:
                        _logger.warning('Ignoring %s, not a 1d array.', k)
                        continue
                    output[k] = v
                _write_root(args.predict_output, output)
            else:
                import awkward
                output = {'scores': scores}
                output.update(labels)
                output.update(observers)
                awkward.save(args.predict_output, output, mode='w')

            _logger.info('Written output to %s' % args.predict_output)
Example #7
0
def flatten_ntuple_write(input_filename, tree_name, output_filename, class_json_filename=None, verbose=False):
    """Convert ntuple to flattened file with awkward array table.
    All data for a given method are output as one long list, ignoring event splitting.

    Tried to output as ROOT TTree, but very difficult in PyROOT with
    variable size branches.

    Parameters
    ----------
    input_filename : str
        Input Ntuple filename
    tree_name : str
        Name of TTree inside input file
    output_filename : str
        Output filename. Written as HDF5 if its extension contains "hdf5",
        otherwise as an awkward array file
    class_json_filename : None, optional
        If a str, output class info dicts to this file in JSON format
    verbose : bool, optional
        If True, print class info

    Raises
    ------
    RuntimeError
        If the input ROOT file cannot be opened
    ImportError
        If the awkward output is requested and the installed awkward is not
        version 0.12, 0.13 or 0.14
    """
    f_in = ROOT.TFile(input_filename)
    if f_in.IsZombie():
        raise RuntimeError("Cannot open ROOT file %s" % input_filename)
    tree = f_in.Get(tree_name)
    check_tobj(tree)

    tree_info, class_infos, method_list = parse_tree(tree)

    if verbose:
        print_tree_summary(tree_info, class_infos, 'tree')

    print(tree.GetEntries(), "entries in tree")
    print(len(method_list), "hists in tree")

    # store list of values for each method call, where each event is a dict of {method:value}
    tree_data = defaultdict(list)

    # Use tqdm for nice progressbar, disable on non-TTY
    for ind in trange(tree.GetEntries(), disable=None):
        this_data = get_data(tree, ind, method_list)
        # flatten all events into one long list per method, makes for a much
        # more compact output, we don't care about individual events
        # guess we could compare those events with the same number of entries
        # in both files? i.e. 1 or 0, but hard to do for
        # eg jets, in which jet #1 may not be the same object in both files
        # don't use items() as not iterator in python2
        for key in this_data:
            # may be a single scalar, or iterable - use extend where possible
            try:
                iter(this_data[key])
                tree_data[key].extend(this_data[key])
            except TypeError:
                tree_data[key].append(this_data[key])

    print("tree_data size:", get_size(tree_data))

    # Save JSON data
    if class_infos and class_json_filename:
        with open(class_json_filename, 'w') as jf:
            json.dump(class_infos, jf, indent=2, sort_keys=True)

    is_hdf5 = "hdf5" in os.path.splitext(output_filename)[1]
    if is_hdf5:
        # Save to HDF5
        import h5py
        import numpy as np
        with h5py.File(output_filename, "w") as f:
            for k in tree_data:
                f.create_dataset(k, data=np.array(tree_data[k]),
                                 compression="gzip", compression_opts=9)
    else:
        # Save to awkward array
        # make awkward table, save with compression
        import awkward
        # Use awkward 0.12/13/14 as 0.15 has a bug that means it can't load()
        # the file
        # And awkward 1 doesn't even allow this format
        # And the awkward 0.9 in CMSSW_10_6 is too old for this
        major, minor, _ = awkward.version.version_info
        major = int(major)
        minor = int(minor)
        # Reject every major version other than 0, not only 1: otherwise a
        # later major release with a minor number in 12-14 would slip
        # through the minor-version check below.
        if major != 0:
            raise ImportError("Need awkward 0.12.X, you have %s" % awkward.__version__)
        if not 12 <= minor <= 14:
            raise ImportError("Need awkward 0.12 / 0.13 / 0.14, you have %s" % awkward.__version__)

        awkd_table = awkward.fromiter([tree_data])
        awkward.save(output_filename, awkd_table, mode='w', compression=True)
Example #8
0
    # if any(n.startswith(name) for n in namelist):
    #     raise KeyError("cannot add {0} to zipfile because the following already exist: {1}".format(repr(name), ", ".join(repr(n) for n in namelist if n.startswith(name))))

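# Workaround for the KeyError raised by the prefix check shown above: patch the
# installed awkward so that it prints instead of raising (adjust the path to
# your own site-packages):
# sed -i 's/raise KeyError/print/' ~/Library/Python/3.7/lib/python/site-packages/awkward/persist.py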

if __name__ == "__main__":

    # The input ROOT file is the single positional command-line argument.
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="filename")
    args = parser.parse_args()

    f = uproot.open(str(args.filename))
    t = f["Events"]

    # Compression spec handed to awkward.save: compress with blosc (lz4hc)
    # and record ("blosc", "decompress") as the matching decompressor.
    # NOTE(review): minsize / contexts semantics are defined by awkward's
    # persistence layer -- confirm there.
    ac = [{
        "types": [np.bool_, bool, np.integer, np.float32],
        "pair": (lambda x: blosc.compress(x, cname="lz4hc"), ("blosc", "decompress")),
        "minsize": 8192,
        "contexts": "*",
    }]

    bnames = [bn.decode("ascii") for bn in t.keys()]

    t0 = time.time()
    fname = "table.awkd"
    for iname, name in enumerate(tqdm(bnames)):
        arr = t.array(name)
        # The first branch (re)creates the file, every later one appends.
        awkward.save(fname, arr, name=name, compression=ac, mode="a" if iname else "w")
    t1 = time.time()
    # Elapsed wall-clock seconds for writing all branches.
    print(t1 - t0)
Example #9
0
                    ew.selected_event = event_n
                    pts = getattr(ew, j_name + "_Total_PT")
                    top_4 = np.argsort(pts)[-4:]
                    num_found = len(top_4)
                    for var_num, var_name in enumerate(kinematics):
                        wanted = j_name + "_Total_" + var_name
                        vals = getattr(ew, wanted)[top_4]
                        all_kinematics[j_class][order, var_num, j, event_n, :num_found] = vals
                    energy, px, py, pz = all_kinematics[j_class][order, 1:, j, event_n, :num_found]
                    if num_found > 1:
                        shape_vals = ShapeVariables.shape(energy, px, py, pz)[1]
                        for var_num, var_name in enumerate(shapes):
                            all_shapes[j_class][order, var_num, j, event_n] = shape_vals[var_name]
                        
    content = {"shape_names": shapes, "kinematic_names": kinematics, "orders": ["nlo", "lo"], "jet_names": [spectral_names, traditional_names, iterative_names], "kinematics" : awkward.fromiter(all_kinematics), "shapes": awkward.fromiter(all_shapes)}
    awkward.save("../megaIgnore/IRC_shapes.awkd", content)
else:
    data = awkward.load("../megaIgnore/IRC_shapes.awkd")
    shapes = data["shape_names"]
    kinematics = data["kinematic_names"]
    spectral_names, traditional_names, iterative_names = data["jet_names"]
    all_kinematics = data["kinematics"]
    all_shapes = data["shapes"]


def plot_jet_name(name, variable, bounds=None, ax=None):
    """Look up the table and column index for ``variable``.

    When ``variable`` is one of the module-level ``kinematics`` names, the
    module-level ``all_kinematics`` table is selected and ``v_index`` is set
    to the position of ``variable`` in ``kinematics``.

    NOTE(review): the visible body ends here and never uses ``name``,
    ``bounds``, ``ax``, ``colours`` or ``line_styles``; the function looks
    truncated -- confirm against the full source before relying on it.
    """
    # Plot styling options; not referenced by the code visible here.
    colours = ['blue', 'purple', 'orange']
    line_styles = ['--', '-', '-.']
    if variable in kinematics:
        table = all_kinematics
        v_index = kinematics.index(variable)
def run_query(input_filenames=None, tree_name=None, branches=None):
    """Apply a fixed event selection chunk by chunk and collect one branch.

    The three arguments are forwarded unchanged to ``uproot.iterate``.  Each
    chunk it yields is wrapped in an ``awkward.Table``, filtered with the
    selection ``b`` and reduced with the projection ``a``; the per-chunk
    results are concatenated onto ``out``.

    Args:
        input_filenames: Passed as the first argument of ``uproot.iterate``.
        tree_name: Passed as the second argument of ``uproot.iterate``.
        branches: Passed as ``branches=`` to ``uproot.iterate``.
            NOTE(review): presumably this must include every branch that the
            selection ``b`` reads -- confirm against the caller.

    Returns:
        The accumulated ``out`` object produced by repeated
        ``awkward.concatenate`` calls.
    """
    # Imported inside the function, so the module can be loaded without
    # these packages as long as run_query is never called.
    import awkward, uproot    
    # Projection: build an awkward.Table from the event's
    # 'MVA3lCERN_weight_ttH' field.  Every field access in `a` and `b` uses
    # the same pattern: attribute access when hasattr() succeeds, item access
    # otherwise.
    a = (lambda event: (awkward.Table if hasattr(awkward, 'Table') else awkward['Table'])((event.MVA3lCERN_weight_ttH if hasattr(event, 'MVA3lCERN_weight_ttH') else event['MVA3lCERN_weight_ttH'])))
    # Selection: index `event` with a mask built by combining comparisons on
    # its branches with `&` and `|`.  The expression spans several physical
    # lines; the breaks fall inside open brackets, so it is one statement.
    # NOTE(review): the shape of this expression suggests generated code --
    # confirm before editing it by hand.
    b = (lambda event: event[(((((((((((((((((((((((((((event.trilep_type if hasattr(event, 'trilep_type') else event['trilep_type']) > 0) & ((event.nTaus_OR_Pt25 if hasattr(event, 'nTaus_OR_Pt25') else event['nTaus_OR_Pt25']) == 0)) & (abs((event.total_charge if hasattr(event, 'total_charge') else event['total_charge'])) == 1)) & ((event.nJets_OR_T if hasattr(event, 'nJets_OR_T') else event['nJets_OR_T']) >= 2)) & ((event.nJets_OR_T_MV2c10_70 if hasattr(event, 'nJets_OR_T_MV2c10_70') else event['nJets_OR_T_MV2c10_70']) > 0)) & ((event.lep_Pt_1 if hasattr(event, 'lep_Pt_1') else event['lep_Pt_1']) > 15000.0)) & ((event.lep_Pt_2 if hasattr(event, 'lep_Pt_2') else event['lep_Pt_2']) > 15000.0)) & ((event.lep_isolationFixedCutLoose_0 if hasattr(event, 'lep_isolationFixedCutLoose_0') else event['lep_isolationFixedCutLoose_0']) > 0)) & (abs(((event.Mlll012 if hasattr(event, 'Mlll012') else event['Mlll012']) - 91200.0)) > 10000.0)) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1']))) | ((event.Mll01 if hasattr(event, 'Mll01') else event['Mll01']) > 12000.0))) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2']))) | ((event.Mll02 if hasattr(event, 'Mll02') else event['Mll02']) > 12000.0))) & (((((abs((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0'])) == 13) & ((event.lep_isMedium_0 if hasattr(event, 'lep_isMedium_0') else event['lep_isMedium_0']) > 0)) | (abs((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0'])) == 11)) & (((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) == 11) & (abs((event.lep_Eta_1 if hasattr(event, 'lep_Eta_1') else event['lep_Eta_1'])) < 2.0)) | ((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) == 13) & ((event.lep_isMedium_1 if hasattr(event, 'lep_isMedium_1') else 
event['lep_isMedium_1']) > 0)))) & (((abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2'])) == 11) & (abs((event.lep_Eta_2 if hasattr(event, 'lep_Eta_2') else event['lep_Eta_2'])) < 2.0)) | ((abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2'])) == 13) & ((event.lep_isMedium_2 if hasattr(event, 'lep_isMedium_2') else event['lep_isMedium_2']) > 0))))) & ((((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) * abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2']))) != 169) & ((event.DRll12 if hasattr(event, 'DRll12') else event['DRll12']) > 0.5)) | ((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) * abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2']))) == 169))) & ((event.Mll12 if hasattr(event, 'Mll12') else event['Mll12']) > 12000.0)) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1']))) | (abs(((event.Mll01 if hasattr(event, 'Mll01') else event['Mll01']) - 91200.0)) > 10000.0))) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2']))) | (abs(((event.Mll02 if hasattr(event, 'Mll02') else event['Mll02']) - 91200.0)) > 10000.0))) & ((event.MVA3lCERN_weight_ttH if hasattr(event, 'MVA3lCERN_weight_ttH') else event['MVA3lCERN_weight_ttH']) > (-1))) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1']))) | (abs(((event.Mll01 if hasattr(event, 'Mll01') else event['Mll01']) - 91200.0)) > 10000.0))) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2']))) | (abs(((event.Mll02 if hasattr(event, 'Mll02') else event['Mll02']) - 91200.0)) > 10000.0))) & 
((event.MVA3lCERN_weight_ttH if hasattr(event, 'MVA3lCERN_weight_ttH') else event['MVA3lCERN_weight_ttH']) > 0.3)) & ((event.MVA3lCERN_weight_ttW if hasattr(event, 'MVA3lCERN_weight_ttW') else event['MVA3lCERN_weight_ttW']) < 0.75)) & ((event.MVA3lCERN_weight_ttZ if hasattr(event, 'MVA3lCERN_weight_ttZ') else event['MVA3lCERN_weight_ttZ']) < 0.75)) & ((event.MVA3lCERN_weight_VV if hasattr(event, 'MVA3lCERN_weight_VV') else event['MVA3lCERN_weight_VV']) < 0.75)) & ((event.MVA3lCERN_weight_ttbar if hasattr(event, 'MVA3lCERN_weight_ttbar') else event['MVA3lCERN_weight_ttbar']) < 0.3)) & ((((((((event.dilep_type if hasattr(event, 'dilep_type') else event['dilep_type']) > 0) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) * (event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) > 0)) & ((event.lep_isQMisID_1 if hasattr(event, 'lep_isQMisID_1') else event['lep_isQMisID_1']) == 0)) & ((event.lep_isQMisID_0 if hasattr(event, 'lep_isQMisID_0') else event['lep_isQMisID_0']) == 0)) | ((((event.trilep_type if hasattr(event, 'trilep_type') else event['trilep_type']) > 0) & ((event.lep_isQMisID_2 if hasattr(event, 'lep_isQMisID_2') else event['lep_isQMisID_2']) == 0)) & ((event.lep_isQMisID_1 if hasattr(event, 'lep_isQMisID_1') else event['lep_isQMisID_1']) == 0))) | (((event.quadlep_type if hasattr(event, 'quadlep_type') else event['quadlep_type']) > 0) & ((event.FSF_4L_tot if hasattr(event, 'FSF_4L_tot') else event['FSF_4L_tot']) == 1))) & (((((((((event.dilep_type if hasattr(event, 'dilep_type') else event['dilep_type']) > 0) & (((((abs((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0'])) == 13) & ((event.lep_isMedium_0 if hasattr(event, 'lep_isMedium_0') else event['lep_isMedium_0']) > 0)) & ((event.lep_isolationFixedCutLoose_0 if hasattr(event, 'lep_isolationFixedCutLoose_0') else event['lep_isolationFixedCutLoose_0']) > 0)) & ((event.lep_promptLeptonVeto_TagWeight_0 if hasattr(event, 
'lep_promptLeptonVeto_TagWeight_0') else event['lep_promptLeptonVeto_TagWeight_0']) < (-0.5))) | ((((((abs((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0'])) == 11) & ((event.lep_isolationFixedCutLoose_0 if hasattr(event, 'lep_isolationFixedCutLoose_0') else event['lep_isolationFixedCutLoose_0']) > 0)) & ((event.lep_isTightLH_0 if hasattr(event, 'lep_isTightLH_0') else event['lep_isTightLH_0']) > 0)) & ((event.lep_chargeIDBDTTight_0 if hasattr(event, 'lep_chargeIDBDTTight_0') else event['lep_chargeIDBDTTight_0']) > 0.7)) & ((event.lep_ambiguityType_0 if hasattr(event, 'lep_ambiguityType_0') else event['lep_ambiguityType_0']) == 0)) & ((event.lep_promptLeptonVeto_TagWeight_0 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_0') else event['lep_promptLeptonVeto_TagWeight_0']) < (-0.7))))) & (((((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) == 13) & ((event.lep_isMedium_1 if hasattr(event, 'lep_isMedium_1') else event['lep_isMedium_1']) > 0)) & ((event.lep_isolationFixedCutLoose_1 if hasattr(event, 'lep_isolationFixedCutLoose_1') else event['lep_isolationFixedCutLoose_1']) > 0)) & ((event.lep_promptLeptonVeto_TagWeight_1 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_1') else event['lep_promptLeptonVeto_TagWeight_1']) < (-0.5))) | ((((((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) == 11) & ((event.lep_isolationFixedCutLoose_1 if hasattr(event, 'lep_isolationFixedCutLoose_1') else event['lep_isolationFixedCutLoose_1']) > 0)) & ((event.lep_isTightLH_1 if hasattr(event, 'lep_isTightLH_1') else event['lep_isTightLH_1']) > 0)) & ((event.lep_chargeIDBDTTight_1 if hasattr(event, 'lep_chargeIDBDTTight_1') else event['lep_chargeIDBDTTight_1']) > 0.7)) & ((event.lep_ambiguityType_1 if hasattr(event, 'lep_ambiguityType_1') else event['lep_ambiguityType_1']) == 0)) & ((event.lep_promptLeptonVeto_TagWeight_1 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_1') else 
event['lep_promptLeptonVeto_TagWeight_1']) < (-0.7))))) | ((((event.trilep_type if hasattr(event, 'trilep_type') else event['trilep_type']) > 0) & ((event.nTaus_OR_Pt25 if hasattr(event, 'nTaus_OR_Pt25') else event['nTaus_OR_Pt25']) == 0)) & (((((abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2'])) == 13) & ((event.lep_isolationFixedCutLoose_2 if hasattr(event, 'lep_isolationFixedCutLoose_2') else event['lep_isolationFixedCutLoose_2']) > 0)) & ((event.lep_promptLeptonVeto_TagWeight_2 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_2') else event['lep_promptLeptonVeto_TagWeight_2']) < (-0.5))) | ((((((abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2'])) == 11) & ((event.lep_isolationFixedCutLoose_2 if hasattr(event, 'lep_isolationFixedCutLoose_2') else event['lep_isolationFixedCutLoose_2']) > 0)) & ((event.lep_isTightLH_2 if hasattr(event, 'lep_isTightLH_2') else event['lep_isTightLH_2']) > 0)) & ((event.lep_chargeIDBDTTight_2 if hasattr(event, 'lep_chargeIDBDTTight_2') else event['lep_chargeIDBDTTight_2']) > 0.7)) & ((event.lep_promptLeptonVeto_TagWeight_2 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_2') else event['lep_promptLeptonVeto_TagWeight_2']) < (-0.7))) & ((event.lep_ambiguityType_2 if hasattr(event, 'lep_ambiguityType_2') else event['lep_ambiguityType_2']) == 0))) & ((((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) == 13) & ((event.lep_isolationFixedCutLoose_1 if hasattr(event, 'lep_isolationFixedCutLoose_1') else event['lep_isolationFixedCutLoose_1']) > 0)) & ((event.lep_promptLeptonVeto_TagWeight_1 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_1') else event['lep_promptLeptonVeto_TagWeight_1']) < (-0.5))) | ((((((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) == 11) & ((event.lep_isolationFixedCutLoose_1 if hasattr(event, 'lep_isolationFixedCutLoose_1') else event['lep_isolationFixedCutLoose_1']) > 0)) & ((event.lep_isTightLH_1 if 
hasattr(event, 'lep_isTightLH_1') else event['lep_isTightLH_1']) > 0)) & ((event.lep_chargeIDBDTTight_1 if hasattr(event, 'lep_chargeIDBDTTight_1') else event['lep_chargeIDBDTTight_1']) > 0.7)) & ((event.lep_promptLeptonVeto_TagWeight_1 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_1') else event['lep_promptLeptonVeto_TagWeight_1']) < (-0.7))) & ((event.lep_ambiguityType_1 if hasattr(event, 'lep_ambiguityType_1') else event['lep_ambiguityType_1']) == 0)))))) | (((event.dilep_type if hasattr(event, 'dilep_type') else event['dilep_type']) > 0) & ((event.nTaus_OR_Pt25 if hasattr(event, 'nTaus_OR_Pt25') else event['nTaus_OR_Pt25']) > 1))) | ((((event.dilep_type if hasattr(event, 'dilep_type') else event['dilep_type']) > 0) | ((event.trilep_type if hasattr(event, 'trilep_type') else event['trilep_type']) > 0)) == 0)) | ((event.quadlep_type if hasattr(event, 'quadlep_type') else event['quadlep_type']) > 0)) | (((event.trilep_type if hasattr(event, 'trilep_type') else event['trilep_type']) > 0) & ((event.nTaus_OR_Pt25 if hasattr(event, 'nTaus_OR_Pt25') else event['nTaus_OR_Pt25']) > 0))))) | ((event.is1L2Tau if hasattr(event, 'is1L2Tau') else event['is1L2Tau']) > 0))])
    # Accumulator seeded with an empty column named '0'.
    # NOTE(review): presumably '0' matches the column name of the tables
    # produced by `a`, so the first concatenate lines up -- confirm.
    out = awkward.Table()
    out['0'] =[]
    # Read the input in steps of 50000 entries; with reportentries=False each
    # item `i` is passed straight to awkward.Table.
    for i in uproot.iterate(input_filenames,tree_name,branches=branches,namedecode="utf-8",entrysteps=50000, reportentries=False):
        # Filter first (b), then project (a), then append to the running result.
        # NOTE(review): `out` is rebuilt by concatenate on every chunk; if this
        # becomes slow for many chunks, consider collecting and joining once.
        out = awkward.concatenate([out, (a)((b)(awkward.Table(i)))])
        
    # for i in uproot.iterate(input_filenames,tree_name,branches=branches,namedecode="utf-8",entrysteps=10000, reportentries=True):
    #     print("Entry range: ", i[0], i[1])
    #     out = awkward.concatenate([out, (a)((b)(awkward.Table(i[2])))])
    
    return out

    

if __name__ == '__main__':
    import awkward

    # Branches handed to run_query; one name per line for easier diffing.
    branch_list = [
        'trilep_type',
        'nTaus_OR_Pt25',
        'total_charge',
        'nJets_OR_T',
        'nJets_OR_T_MV2c10_70',
        'lep_Pt_1',
        'lep_Pt_2',
        'lep_isolationFixedCutLoose_0',
        'Mlll012',
        'lep_ID_0',
        'lep_ID_1',
        'Mll01',
        'lep_ID_2',
        'Mll02',
        'lep_isMedium_0',
        'lep_Eta_1',
        'lep_isMedium_1',
        'lep_Eta_2',
        'lep_isMedium_2',
        'DRll12',
        'Mll12',
        'MVA3lCERN_weight_ttH',
        'MVA3lCERN_weight_ttW',
        'MVA3lCERN_weight_ttZ',
        'MVA3lCERN_weight_VV',
        'MVA3lCERN_weight_ttbar',
        'dilep_type',
        'lep_promptLeptonVeto_TagWeight_0',
        'lep_isTightLH_0',
        'lep_chargeIDBDTTight_0',
        'lep_ambiguityType_0',
        'lep_isolationFixedCutLoose_1',
        'lep_promptLeptonVeto_TagWeight_1',
        'lep_isTightLH_1',
        'lep_chargeIDBDTTight_1',
        'lep_ambiguityType_1',
        'lep_isolationFixedCutLoose_2',
        'lep_promptLeptonVeto_TagWeight_2',
        'lep_isTightLH_2',
        'lep_chargeIDBDTTight_2',
        'lep_ambiguityType_2',
        'quadlep_type',
        'lep_isQMisID_1',
        'lep_isQMisID_0',
        'lep_isQMisID_2',
        'FSF_4L_tot',
        'is1L2Tau',
    ]

    # Input file and tree used for this run.  A second, currently disabled
    # input that was kept alongside the active one:
    #   '/scratch/data/ttHML_v09_01/user.kchoi.ttHML_80fb_VV/ttHML_80fb_364253_mc16d.root'
    input_path = '/data/kyungeon/ttHML_v09_01/user.kchoi.ttHML_80fb_VV/ttHML_80fb_364286_mc16d.root'
    tree = 'nominal'

    # Run the selection and persist the result next to the script.
    selected = run_query(input_path, tree, branch_list)
    awkward.save('h.awkd', selected)
Example #11
0
            info["label"] = label
            info["iter"] = i

            ac = copy.deepcopy(awkward.persist.compression)
            ac[0]["types"] += [np.float32]
            if label.startswith("lz4"):
                ac[0]["pair"] = (fcomp, ("lz4.frame", "decompress"))
            if label.startswith("blosc"):
                ac[0]["pair"] = (fcomp, ("blosc", "decompress"))
            if label.startswith("lzma"):
                ac[0]["pair"] = (fcomp, ("backports.lzma", "decompress"))

            fname = "tables/table_{}.awkd".format(label)

            t0 = time.time()
            awkward.save(fname, table, compression=ac, mode="w")
            t1 = time.time()
            info["t_compress_ms"] = 1e3 * (t1 - t0)

            t0 = time.time()
            tmp = awkward.load(fname,
                               whitelist=awkward.persist.whitelist + [
                                   ['lz4.frame', 'decompress'],
                                   ['lz4.block', 'decompress'],
                                   ['blosc', 'decompress'],
                                   ['backports.lzma', 'decompress'],
                               ])
            t1 = time.time()
            info["t_decompress_ms"] = 1e3 * (t1 - t0)

            info["uncompressed_bytes"] = table.nbytes
Example #12
0
    start = time.time()
    nn = ParticleNetJetTagsProducer(args.model, args.preprocess)
    diff = time.time() - start
    print('--- Setup model: %f s total' % (diff,))

    start = time.time()
    outputs = nn.predict(taginfo, eval_flags)
    diff = time.time() - start
    print('--- Run prediction: %f s total, %f s per jet ---' % (diff, diff / outputs['probQCDbb'].counts.sum()))
#     print(outputs)
#     for k in outputs:
#         print(k, outputs[k].content.mean())

    if 'FatJet_ParticleNetMD_probXbb' in table:
        print('Compare w/ stored values')
        print('Stored values:\n ...', table['FatJet_ParticleNetMD_probXbb'][:5])
        print('Computed values:\n ...', outputs['probXbb'][:5])
        print('Diff (50%, 95%, 99%, 100%) = ', np.percentile(
            np.abs(outputs['probXbb'] - table['FatJet_ParticleNetMD_probXbb']).content, [50, 95, 99, 100]))

#     assert(np.array_equal(jetmass.counts, outputs['probQCDbb'].counts))
    alloutputs = awkward.JaggedArray.zip(outputs)
    if args.make_baseline:
        with open('baseline.awkd', 'wb') as fout:
            awkward.save(fout, alloutputs)
    else:
        if os.path.exists('baseline.awkd'):
            with open('baseline.awkd', 'rb') as fin:
                baseline = awkward.load(fin)
            print("Comparison to baseline:", (alloutputs == baseline).all().all())