def evaluate(self, ast_node):
    qv = ast_visitor()
    # print(ast.dump(ast_node))
    qv.visit(ast_node)
    if isinstance(self.dataset_source, str):
        data_pathname = self.dataset_source
    else:
        data_pathname = 'temp.awkd'
        awkward.save(data_pathname, self.dataset_source)
    f = open('temp.py', 'w')
    f.write('import awkward\n')
    source = ast_node.source
    while hasattr(source, 'source'):
        source = source.source
    if data_pathname[-5:] == '.awkd':
        f.write(source.rep + " = awkward.load('" + data_pathname + "')\n")
    elif data_pathname[-5:] == '.root':
        f.write('import uproot\n')
        f.write("input_file = uproot.open('" + data_pathname + "')\n")
        f.write(source.rep + " = input_file[input_file.keys()[0]].lazyarrays(namedecode='utf-8')\n")
    else:
        raise BaseException('unimplemented file type: ' + data_pathname)
    f.write('output_array = awkward.fromiter(' + ast_node.rep + ')\n')
    f.write("awkward.save('output.awkd', output_array)\n")
    f.close()
    os.system('python temp.py')
    if not isinstance(self.dataset_source, str):
        os.remove(data_pathname)
    os.remove('temp.py')
    output = awkward.load('output.awkd')
    os.remove('output.awkd')
    return output
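A minimal sketch of the awkward 0.x save/load round trip that the generated temp.py relies on; the file name roundtrip.awkd is only illustrative.

# Sketch (not part of the class above): write an array to .awkd and read it back unchanged,
# using the same awkward.save / awkward.load calls that evaluate() emits into temp.py.
import awkward
import numpy as np

original = awkward.fromiter([[1.1, 2.2], [], [3.3]])
awkward.save('roundtrip.awkd', original, mode='w')
restored = awkward.load('roundtrip.awkd')
assert np.allclose(original.flatten(), restored.flatten())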
def save_awk(scores, labels, observers):
    """
    Saves as .awkd
    :param scores:
    :param labels:
    :param observers:
    :return:
    """
    import awkward
    output = {'scores': scores}
    output.update(labels)
    output.update(observers)

    name_remap = {}
    arraynames = list(output)
    for i in range(len(arraynames)):
        for j in range(i + 1, len(arraynames)):
            if arraynames[i].startswith(arraynames[j]):
                name_remap[arraynames[j]] = '%s_%d' % (arraynames[j], len(name_remap))
            if arraynames[j].startswith(arraynames[i]):
                name_remap[arraynames[i]] = '%s_%d' % (arraynames[i], len(name_remap))
    _logger.info('Renamed the following variables in the output file: %s', str(name_remap))

    output = {name_remap[k] if k in name_remap else k: v for k, v in output.items()}
    awkward.save(args.predict_output, output, mode='w')
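A hedged sketch of reading the file written by save_awk() back for inspection; 'predictions.awkd' stands in for args.predict_output.

# Sketch: reload the saved prediction table (awkward 0.x) and look at the scores column.
import awkward

saved = awkward.load('predictions.awkd')
print(saved['scores'][:5])   # first few rows of the 'scores' array written above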
def merge_npzs_to_ak(rawdir, outfile=None, nmax=None):
    """
    Loops over all .npz files in rawdir, stacks all events into an ak array,
    and dumps it to a file.
    """
    if outfile is None:
        outfile = osp.dirname(rawdir) + '/merged.awkd'
    bbefp.logger.info(f'Merging {rawdir} --> {outfile}')
    merged = ak.fromiter(_iter_npzs(rawdir, nmax))
    ak.save(outfile, ak_transpose(merged))
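A hedged usage sketch: after merging, the output file can be reloaded with the same awkward 0.x API used above ('merged.awkd' is a placeholder path).

# Sketch: read the merged file back and check the number of top-level entries.
merged = ak.load('merged.awkd')
print(type(merged), len(merged))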
def main(): """ Loop over all combinations of mass and width. """ args = parse_input() masses = np.linspace( args.mass_min, args.mass_max, np.ceil((args.mass_max - args.mass_min) / args.mass_step) + 1) widths = np.linspace( args.width_min, args.width_max, np.ceil((args.width_max - args.width_min) / args.width_step) + 1) def generator(): with tqdm.tqdm(unit='event', total=masses.size * widths.size * args.nevents, desc='Generating') as pbar: for mass in masses: for width in widths: yield from run(args.nevents, mass, width) pbar.update(args.nevents) events = ak.fromiter(generator()) ak.save('events.awkd', events, mode='w')
def convert(source, destdir, basename, step=None, limit=None):
    df = pd.read_hdf(source, key='table')
    logging.info('Total events: %s' % str(df.shape[0]))
    if limit is not None:
        df = df.iloc[0:limit]
        logging.info('Restricting to the first %s events:' % str(df.shape[0]))
    if step is None:
        step = df.shape[0]
    idx = -1
    while True:
        idx += 1
        start = idx * step
        if start >= df.shape[0]:
            break
        if not os.path.exists(destdir):
            os.makedirs(destdir)
        output = os.path.join(destdir, '%s_%d.awkd' % (basename, idx))
        logging.info(output)
        if os.path.exists(output):
            logging.warning('... file already exists: continue ...')
            continue
        v = _transform(df, start=start, stop=start + step)
        awkward.save(output, v, mode='x')
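A hypothetical invocation of convert(); the paths and chunk size are placeholders. Each chunk of step rows from the HDF5 table lands in its own .awkd file, and mode='x' makes awkward.save refuse to overwrite an existing file.

# Hypothetical call: convert an HDF5 table into 100k-event .awkd chunks.
convert('input/events.h5', destdir='converted', basename='events', step=100000, limit=None)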
def main(): parser = argparse.ArgumentParser() parser.add_argument('-c', '--data-config', type=str, default='data/ak15_points_pf_sv_v0.yaml', help='data config YAML file') parser.add_argument('-i', '--data-train', nargs='*', default=[], help='training files') parser.add_argument('-t', '--data-test', nargs='*', default=[], help='testing files') parser.add_argument( '--data-fraction', type=float, default=1, help= 'fraction of events to load from each file; for training, the events are randomly selected for each epoch' ) parser.add_argument( '--data-dilation', type=int, default=1, help= 'reduce number of file by a factor of `d` for training. NOT recommended in general - use `--data-fraction` instead.' ) parser.add_argument( '--files-per-fetch', type=int, default=20, help= 'number of files to load each time; shuffling is done within these events, so choose a number large enough to get events from all classes' ) parser.add_argument('--train-val-split', type=float, default=0.8, help='training/validation split fraction') parser.add_argument( '--demo', action='store_true', default=False, help= 'quickly test the setup by running over only a small number of events') parser.add_argument( '--lr-finder', type=str, default=None, help= 'run learning rate finder instead of the actual training; format: ``start_lr, end_lr, num_iters``' ) parser.add_argument( '-n', '--network-config', type=str, default='networks/particle_net_pfcand_sv.py', help= 'network architecture configuration file; the path must be relative to the current dir' ) parser.add_argument( '--network-option', nargs=2, action='append', default=[], help= 'options to pass to the model class constructor, e.g., `--network-option use_counts False`' ) parser.add_argument( '-m', '--model-prefix', type=str, default='test_output/model_name', help= 'path to save or load the model; for training, this will be used as a prefix; for testing, this should be the full path including extension' ) parser.add_argument('--num-epochs', type=int, default=20, help='number of epochs') parser.add_argument( '--optimizer', type=str, default='ranger', choices=['adam', 'ranger'], # TODO: add more help='optimizer for the training') parser.add_argument( '--load-epoch', type=int, default=None, help= 'used to resume interrupted training, load model and optimizer state saved in the `epoch-%d_state.pt` and `epoch-%d_optimizer.pt` files' ) parser.add_argument('--start-lr', type=float, default=5e-3, help='start learning rate') parser.add_argument( '--lr-steps', type=str, default='10,20', help= 'steps to reduce the lr; currently only used when setting `--optimizer` to adam' ) parser.add_argument('--batch-size', type=int, default=128, help='batch size') parser.add_argument( '--use-amp', action='store_true', default=False, help='use mixed precision training (fp16); NOT WORKING YET') parser.add_argument( '--gpus', type=str, default='0', help='device for the training/testing; to use CPU, set to empty string (' '); to use multiple gpu, set it as a comma separated list, e.g., `1,2,3,4`' ) parser.add_argument( '--num-workers', type=int, default=2, help= 'number of threads to load the dataset; memory consuption and disk access load increases (~linearly) with this numbers' ) parser.add_argument('--predict', action='store_true', default=False, help='run prediction instead of training') parser.add_argument( '--predict-output', type=str, help= 'path to save the prediction output, support `.root` and `.awkd` format' ) parser.add_argument( '--export-onnx', type=str, default=None, help= 'export 
the PyTorch model to ONNX model and save it at the given path (path must ends w/ .onnx); ' 'needs to set `--data-config`, `--network-config`, and `--model-prefix` (requires the full model path)' ) args = parser.parse_args() _logger.info(args) if args.use_amp: raise NotImplementedError # from apex import amp if args.data_dilation > 1: _logger.warning( 'Use of `data-dilation` is not recomended in general -- consider using `data-fraction` instead.' ) # training/testing mode training_mode = not args.predict # device if args.gpus: gpus = [int(i) for i in args.gpus.split(',')] dev = torch.device(gpus[0]) else: gpus = None dev = torch.device('cpu') # load data if training_mode: filelist = sorted(sum([glob.glob(f) for f in args.data_train], [])) # np.random.seed(1) np.random.shuffle(filelist) if args.demo: filelist = filelist[:20] _logger.info(filelist) args.data_fraction = 0.1 args.files_per_fetch = 5 train_data = SimpleIterDataset(filelist, args.data_config, for_training=True, partial_load=((0, args.train_val_split), args.data_fraction), dilation=args.data_dilation, files_per_fetch=args.files_per_fetch) val_data = SimpleIterDataset(filelist, args.data_config, for_training=True, partial_load=((args.train_val_split, 1), args.data_fraction), dilation=args.data_dilation, files_per_fetch=args.files_per_fetch) train_loader = DataLoader(train_data, num_workers=args.num_workers, batch_size=args.batch_size, drop_last=True, pin_memory=True) val_loader = DataLoader(val_data, num_workers=args.num_workers, batch_size=args.batch_size, drop_last=True, pin_memory=True) data_config = train_data.config else: filelist = sorted(sum([glob.glob(f) for f in args.data_test], [])) test_data = SimpleIterDataset(filelist, args.data_config, for_training=False, files_per_fetch=1) test_loader = DataLoader(test_data, num_workers=args.num_workers, batch_size=args.batch_size, drop_last=False, pin_memory=True) data_config = test_data.config # model network_module = import_module( args.network_config.replace('.py', '').replace('/', '.')) network_options = {k: ast.literal_eval(v) for k, v in args.network_option} if args.export_onnx: network_options['for_inference'] = True model, model_info = network_module.get_model(data_config, **network_options) _logger.info(model) # export to ONNX if args.export_onnx: assert (args.export_onnx.endswith('.onnx')) model_path = args.model_prefix _logger.info('Exporting model %s to ONNX' % model_path) model.load_state_dict(torch.load(model_path, map_location='cpu')) model = model.cpu() model.eval() os.makedirs(os.path.dirname(args.export_onnx), exist_ok=True) inputs = tuple( torch.ones(model_info['input_shapes'][k], dtype=torch.float32) for k in model_info['input_names']) torch.onnx.export(model, inputs, args.export_onnx, input_names=model_info['input_names'], output_names=model_info['output_names'], dynamic_axes=model_info.get('dynamic_axes', None), opset_version=11) _logger.info('ONNX model saved to %s', args.export_onnx) return # note: we should always save/load the state_dict of the original model, not the one wrapped by nn.DataParallel # so we do not convert it to nn.DataParallel now model = model.to(dev) # loss function try: loss_func = network_module.get_loss(data_config, **network_options) _logger.info(loss_func) except AttributeError: loss_func = torch.nn.CrossEntropyLoss() _logger.warning( 'Loss function not defined in %s. 
Will use `torch.nn.CrossEntropyLoss()` by default.', args.network_config) if training_mode: # optimizer & learning rate if args.optimizer == 'adam': opt = torch.optim.Adam(model.parameters(), lr=args.start_lr) if args.lr_finder is None: lr_steps = [int(x) for x in args.lr_steps.split(',')] scheduler = torch.optim.lr_scheduler.MultiStepLR( opt, milestones=lr_steps, gamma=0.1) else: from utils.nn.optimizer.ranger import Ranger opt = Ranger(model.parameters(), lr=args.start_lr) if args.lr_finder is None: lr_decay_epochs = max(1, int(args.num_epochs * 0.3)) lr_decay_rate = 0.01**(1. / lr_decay_epochs) scheduler = torch.optim.lr_scheduler.MultiStepLR( opt, milestones=list( range(args.num_epochs - lr_decay_epochs, args.num_epochs)), gamma=lr_decay_rate) # TODO: mixed precision training if args.use_amp: # model, opt = amp.initialize( # model, opt, opt_level="O2", # keep_batchnorm_fp32=True, loss_scale="dynamic" # ) model, opt = amp.initialize(model, opt, opt_level="O1", keep_batchnorm_fp32=None, loss_scale="dynamic") # load previous training and resume if `--load-epoch` is set if args.load_epoch is not None: _logger.info('Resume training from epoch %d' % args.load_epoch) model_state = torch.load(args.model_prefix + '_epoch-%d_state.pt' % args.load_epoch, map_location=dev) model.load_state_dict(model_state) opt_state = torch.load(args.model_prefix + '_epoch-%d_optimizer.pt' % args.load_epoch, map_location=dev) opt.load_state_dict(opt_state) # mutli-gpu if gpus is not None and len(gpus) > 1: model = torch.nn.DataParallel( model, device_ids=gpus ) # model becomes `torch.nn.DataParallel` w/ model.module being the orignal `torch.nn.Module` model = model.to(dev) # lr finder: keep it after all other setups if args.lr_finder is not None: start_lr, end_lr, num_iter = args.lr_finder.replace(' ', '').split(',') from utils.lr_finder import LRFinder lr_finder = LRFinder(model, opt, loss_func, device=dev, input_names=train_data.config.input_names, label_names=train_data.config.label_names) lr_finder.range_test(train_loader, start_lr=float(start_lr), end_lr=float(end_lr), num_iter=int(num_iter)) lr_finder.plot(output='lr_finder.png' ) # to inspect the loss-learning rate graph return # training loop best_valid_acc = 0 for epoch in range(args.num_epochs): if args.load_epoch is not None: if epoch <= args.load_epoch: continue print('-' * 50) _logger.info('Epoch #%d training' % epoch) train(model, loss_func, opt, scheduler, train_loader, dev) if args.model_prefix: dirname = os.path.dirname(args.model_prefix) if dirname and not os.path.exists(dirname): os.makedirs(dirname) state_dict = model.module.state_dict() if isinstance( model, torch.nn.DataParallel) else model.state_dict() torch.save(state_dict, args.model_prefix + '_epoch-%d_state.pt' % epoch) torch.save( opt.state_dict(), args.model_prefix + '_epoch-%d_optimizer.pt' % epoch) _logger.info('Epoch #%d validating' % epoch) valid_acc = evaluate(model, val_loader, dev, loss_func=loss_func) if valid_acc > best_valid_acc: best_valid_acc = valid_acc if args.model_prefix: shutil.copy2( args.model_prefix + '_epoch-%d_state.pt' % epoch, args.model_prefix + '_best_acc_state.pt') torch.save(model, args.model_prefix + '_best_acc_full.pt') _logger.info( 'Epoch #%d: Current validation acc: %.5f (best: %.5f)' % (epoch, valid_acc, best_valid_acc)) else: # run prediction if args.model_prefix.endswith('.onnx'): _logger.info('Loading model %s for eval' % args.model_prefix) from utils.nn.tools import evaluate_onnx test_acc, scores, labels, observers = evaluate_onnx( 
args.model_prefix, test_loader) else: model_path = args.model_prefix if args.model_prefix.endswith( '.pt') else args.model_prefix + '_best_acc_state.pt' _logger.info('Loading model %s for eval' % model_path) model.load_state_dict(torch.load(model_path, map_location=dev)) if gpus is not None and len(gpus) > 1: model = torch.nn.DataParallel(model, device_ids=gpus) model = model.to(dev) test_acc, scores, labels, observers = evaluate(model, test_loader, dev, for_training=False) _logger.info('Test acc %.5f' % test_acc) if args.predict_output: os.makedirs(os.path.dirname(args.predict_output), exist_ok=True) if args.predict_output.endswith('.root'): from utils.data.fileio import _write_root output = {} for idx, label_name in enumerate(data_config.label_value): output[label_name] = ( labels[data_config.label_names[0]] == idx) output['score_' + label_name] = scores[:, idx] for k, v in labels.items(): if k == data_config.label_names[0]: continue if v.ndim > 1: _logger.warning('Ignoring %s, not a 1d array.', k) continue output[k] = v for k, v in observers.items(): if v.ndim > 1: _logger.warning('Ignoring %s, not a 1d array.', k) continue output[k] = v _write_root(args.predict_output, output) else: import awkward output = {'scores': scores} output.update(labels) output.update(observers) awkward.save(args.predict_output, output, mode='w') _logger.info('Written output to %s' % args.predict_output)
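A hedged sketch of consuming the .awkd prediction output written at the end of the prediction branch above. The label key is a placeholder for whatever data_config.label_names[0] is in a real configuration.

# Sketch: reload the prediction file and compute a simple argmax accuracy.
import awkward
import numpy as np

pred = awkward.load('test_output/predictions.awkd')   # placeholder for args.predict_output
scores = pred['scores']                                # (n_events, n_classes) score array
labels = pred['_label_']                               # placeholder key for the true class indices
print('accuracy:', np.mean(np.argmax(scores, axis=1) == labels))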
def flatten_ntuple_write(input_filename, tree_name, output_filename,
                         class_json_filename=None, verbose=False):
    """Convert ntuple to flattened file with awkward array table.

    All data for a given method are output as one long list, ignoring event splitting.

    Tried to output as ROOT TTree, but very difficult in PyROOT with variable size branches.

    Parameters
    ----------
    input_filename : str
        Input Ntuple filename
    tree_name : str
        Name of TTree inside input file
    output_filename : str
        Output filename
    class_json_filename : None, optional
        If a str, output class info dicts to this file in JSON format
    verbose : bool, optional
        If True, print class info
    """
    f_in = ROOT.TFile(input_filename)
    if f_in.IsZombie():
        raise RuntimeError("Cannot open ROOT file %s" % input_filename)
    tree = f_in.Get(tree_name)
    check_tobj(tree)

    tree_info, class_infos, method_list = parse_tree(tree)
    if verbose:
        print_tree_summary(tree_info, class_infos, 'tree')
    print(tree.GetEntries(), "entries in tree")
    print(len(method_list), "hists in tree")

    # store list of values for each method call, where each event is a dict of {method: value}
    tree_data = defaultdict(list)

    # Use tqdm for nice progressbar, disable on non-TTY
    for ind in trange(tree.GetEntries(), disable=None):
        this_data = get_data(tree, ind, method_list)
        # flatten all events into one long list per method, makes for a much
        # more compact output, we don't care about individual events
        # guess we could compare those events with the same number of entries
        # in both files? i.e. 1 or 0, but hard to do for
        # e.g. jets, in which jet #1 may not be the same object in both files
        # don't use items() as not iterator in python2
        for key in this_data:
            # may be a single scalar, or iterable - use extend where possible
            try:
                _ = iter(this_data[key])
                tree_data[key].extend(this_data[key])
            except TypeError:
                tree_data[key].append(this_data[key])

    print("tree_data size:", get_size(tree_data))

    # Save JSON data
    if class_infos and class_json_filename:
        with open(class_json_filename, 'w') as jf:
            json.dump(class_infos, jf, indent=2, sort_keys=True)

    is_hdf5 = "hdf5" in os.path.splitext(output_filename)[1]
    if is_hdf5:
        # Save to HDF5
        import h5py
        import numpy as np
        with h5py.File(output_filename, "w") as f:
            for k in tree_data:
                f.create_dataset(k, data=np.array(tree_data[k]),
                                 compression="gzip", compression_opts=9)
    else:
        # Save to awkward array
        # make awkward table, save with compression
        import awkward
        # Use awkward 0.12/13/14 as 0.15 has a bug that means it can't load() the file
        # And awkward 1 doesn't even allow this format
        # And the awkward 0.9 in CMSSW_10_6 is too old for this
        major, minor, _ = awkward.version.version_info
        major = int(major)
        minor = int(minor)
        if major == 1:
            raise ImportError("Need awkward 0.12.X, you have %s" % awkward.__version__)
        elif minor > 14:
            raise ImportError("Need awkward 0.12 / 0.13 / 0.14, you have %s" % awkward.__version__)
        elif minor < 12:
            raise ImportError("Need awkward 0.12 / 0.13 / 0.14, you have %s" % awkward.__version__)
        awkd_table = awkward.fromiter([tree_data])
        awkward.save(output_filename, awkd_table, mode='w', compression=True)
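A hedged reader sketch for the two output formats produced by flatten_ntuple_write(); the helper name and file paths are placeholders, and the awkward branch assumes the same 0.12-0.14 versions noted above.

# Sketch: load a flattened output file back, dispatching on the extension as the writer does.
import os

def read_flat_table(filename):
    if "hdf5" in os.path.splitext(filename)[1]:
        import h5py
        with h5py.File(filename, "r") as f:
            return {k: f[k][:] for k in f.keys()}
    else:
        import awkward
        return awkward.load(filename)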
# if any(n.startswith(name) for n in namelist):
#     raise KeyError("cannot add {0} to zipfile because the following already exist: {1}".format(repr(name), ", ".join(repr(n) for n in namelist if n.startswith(name))))
# sed -i 's/raise KeyError/print/' ~/Library/Python/3.7/lib/python/site-packages/awkward/persist.py

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="filename")
    args = parser.parse_args()

    f = uproot.open(str(args.filename))
    t = f["Events"]

    ac = [dict(
        types=[np.bool_, bool, np.integer, np.float32],
        pair=(lambda x: blosc.compress(x, cname="lz4hc"), ("blosc", "decompress")),
        minsize=8192,
        contexts="*",
    )]

    bnames = [bn.decode("ascii") for bn in t.keys()]

    t0 = time.time()
    fname = "table.awkd"
    for iname, name in enumerate(tqdm(bnames)):
        arr = t.array(name)
        awkward.save(fname, arr, name=name, compression=ac, mode="w" if iname == 0 else "a")
    t1 = time.time()
    print(t1 - t0)
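A hedged sketch of reading one branch back from the file written above. awkward 0.x refuses unknown decompression functions unless they are whitelisted, so the blosc entry is added explicitly, mirroring the whitelist usage further below.

    # Sketch: reload a single named branch from table.awkd, allowing blosc decompression.
    loaded = awkward.load(fname, whitelist=awkward.persist.whitelist + [["blosc", "decompress"]])
    first_branch = loaded[bnames[0]]
    print(bnames[0], len(first_branch))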
        ew.selected_event = event_n
        pts = getattr(ew, j_name + "_Total_PT")
        top_4 = np.argsort(pts)[-4:]
        num_found = len(top_4)
        for var_num, var_name in enumerate(kinematics):
            wanted = j_name + "_Total_" + var_name
            vals = getattr(ew, wanted)[top_4]
            all_kinematics[j_class][order, var_num, j, event_n, :num_found] = vals
        energy, px, py, pz = all_kinematics[j_class][order, 1:, j, event_n, :num_found]
        if num_found > 1:
            shape_vals = ShapeVariables.shape(energy, px, py, pz)[1]
            for var_num, var_name in enumerate(shapes):
                all_shapes[j_class][order, var_num, j, event_n] = shape_vals[var_name]

    content = {"shape_names": shapes,
               "kinematic_names": kinematics,
               "orders": ["nlo", "lo"],
               "jet_names": [spectral_names, traditional_names, iterative_names],
               "kinematics": awkward.fromiter(all_kinematics),
               "shapes": awkward.fromiter(all_shapes)}
    awkward.save("../megaIgnore/IRC_shapes.awkd", content)
else:
    data = awkward.load("../megaIgnore/IRC_shapes.awkd")
    shapes = data["shape_names"]
    kinematics = data["kinematic_names"]
    spectral_names, traditional_names, iterative_names = data["jet_names"]
    all_kinematics = data["kinematics"]
    all_shapes = data["shapes"]


def plot_jet_name(name, variable, bounds=None, ax=None):
    colours = ['blue', 'purple', 'orange']
    line_styles = ['--', '-', '-.']
    if variable in kinematics:
        table = all_kinematics
        v_index = kinematics.index(variable)
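A hypothetical usage sketch for plot_jet_name() as defined above (the function body is truncated in this excerpt); the chosen jet name and variable are placeholders drawn from the lists loaded from IRC_shapes.awkd.

# Sketch: plot one kinematic variable for one jet collection on a fresh matplotlib axis.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
plot_jet_name(spectral_names[0], kinematics[0], ax=ax)
plt.show()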
def run_query(input_filenames=None, tree_name=None, branches=None): import awkward, uproot a = (lambda event: (awkward.Table if hasattr(awkward, 'Table') else awkward['Table'])((event.MVA3lCERN_weight_ttH if hasattr(event, 'MVA3lCERN_weight_ttH') else event['MVA3lCERN_weight_ttH']))) b = (lambda event: event[(((((((((((((((((((((((((((event.trilep_type if hasattr(event, 'trilep_type') else event['trilep_type']) > 0) & ((event.nTaus_OR_Pt25 if hasattr(event, 'nTaus_OR_Pt25') else event['nTaus_OR_Pt25']) == 0)) & (abs((event.total_charge if hasattr(event, 'total_charge') else event['total_charge'])) == 1)) & ((event.nJets_OR_T if hasattr(event, 'nJets_OR_T') else event['nJets_OR_T']) >= 2)) & ((event.nJets_OR_T_MV2c10_70 if hasattr(event, 'nJets_OR_T_MV2c10_70') else event['nJets_OR_T_MV2c10_70']) > 0)) & ((event.lep_Pt_1 if hasattr(event, 'lep_Pt_1') else event['lep_Pt_1']) > 15000.0)) & ((event.lep_Pt_2 if hasattr(event, 'lep_Pt_2') else event['lep_Pt_2']) > 15000.0)) & ((event.lep_isolationFixedCutLoose_0 if hasattr(event, 'lep_isolationFixedCutLoose_0') else event['lep_isolationFixedCutLoose_0']) > 0)) & (abs(((event.Mlll012 if hasattr(event, 'Mlll012') else event['Mlll012']) - 91200.0)) > 10000.0)) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1']))) | ((event.Mll01 if hasattr(event, 'Mll01') else event['Mll01']) > 12000.0))) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2']))) | ((event.Mll02 if hasattr(event, 'Mll02') else event['Mll02']) > 12000.0))) & (((((abs((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0'])) == 13) & ((event.lep_isMedium_0 if hasattr(event, 'lep_isMedium_0') else event['lep_isMedium_0']) > 0)) | (abs((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0'])) == 11)) & (((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) == 11) & (abs((event.lep_Eta_1 if hasattr(event, 'lep_Eta_1') else event['lep_Eta_1'])) < 2.0)) | ((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) == 13) & ((event.lep_isMedium_1 if hasattr(event, 'lep_isMedium_1') else event['lep_isMedium_1']) > 0)))) & (((abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2'])) == 11) & (abs((event.lep_Eta_2 if hasattr(event, 'lep_Eta_2') else event['lep_Eta_2'])) < 2.0)) | ((abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2'])) == 13) & ((event.lep_isMedium_2 if hasattr(event, 'lep_isMedium_2') else event['lep_isMedium_2']) > 0))))) & ((((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) * abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2']))) != 169) & ((event.DRll12 if hasattr(event, 'DRll12') else event['DRll12']) > 0.5)) | ((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) * abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2']))) == 169))) & ((event.Mll12 if hasattr(event, 'Mll12') else event['Mll12']) > 12000.0)) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1']))) | (abs(((event.Mll01 if hasattr(event, 'Mll01') else event['Mll01']) - 91200.0)) > 10000.0))) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2']))) | 
(abs(((event.Mll02 if hasattr(event, 'Mll02') else event['Mll02']) - 91200.0)) > 10000.0))) & ((event.MVA3lCERN_weight_ttH if hasattr(event, 'MVA3lCERN_weight_ttH') else event['MVA3lCERN_weight_ttH']) > (-1))) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1']))) | (abs(((event.Mll01 if hasattr(event, 'Mll01') else event['Mll01']) - 91200.0)) > 10000.0))) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) != (-(event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2']))) | (abs(((event.Mll02 if hasattr(event, 'Mll02') else event['Mll02']) - 91200.0)) > 10000.0))) & ((event.MVA3lCERN_weight_ttH if hasattr(event, 'MVA3lCERN_weight_ttH') else event['MVA3lCERN_weight_ttH']) > 0.3)) & ((event.MVA3lCERN_weight_ttW if hasattr(event, 'MVA3lCERN_weight_ttW') else event['MVA3lCERN_weight_ttW']) < 0.75)) & ((event.MVA3lCERN_weight_ttZ if hasattr(event, 'MVA3lCERN_weight_ttZ') else event['MVA3lCERN_weight_ttZ']) < 0.75)) & ((event.MVA3lCERN_weight_VV if hasattr(event, 'MVA3lCERN_weight_VV') else event['MVA3lCERN_weight_VV']) < 0.75)) & ((event.MVA3lCERN_weight_ttbar if hasattr(event, 'MVA3lCERN_weight_ttbar') else event['MVA3lCERN_weight_ttbar']) < 0.3)) & ((((((((event.dilep_type if hasattr(event, 'dilep_type') else event['dilep_type']) > 0) & (((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0']) * (event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) > 0)) & ((event.lep_isQMisID_1 if hasattr(event, 'lep_isQMisID_1') else event['lep_isQMisID_1']) == 0)) & ((event.lep_isQMisID_0 if hasattr(event, 'lep_isQMisID_0') else event['lep_isQMisID_0']) == 0)) | ((((event.trilep_type if hasattr(event, 'trilep_type') else event['trilep_type']) > 0) & ((event.lep_isQMisID_2 if hasattr(event, 'lep_isQMisID_2') else event['lep_isQMisID_2']) == 0)) & ((event.lep_isQMisID_1 if hasattr(event, 'lep_isQMisID_1') else event['lep_isQMisID_1']) == 0))) | (((event.quadlep_type if hasattr(event, 'quadlep_type') else event['quadlep_type']) > 0) & ((event.FSF_4L_tot if hasattr(event, 'FSF_4L_tot') else event['FSF_4L_tot']) == 1))) & (((((((((event.dilep_type if hasattr(event, 'dilep_type') else event['dilep_type']) > 0) & (((((abs((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0'])) == 13) & ((event.lep_isMedium_0 if hasattr(event, 'lep_isMedium_0') else event['lep_isMedium_0']) > 0)) & ((event.lep_isolationFixedCutLoose_0 if hasattr(event, 'lep_isolationFixedCutLoose_0') else event['lep_isolationFixedCutLoose_0']) > 0)) & ((event.lep_promptLeptonVeto_TagWeight_0 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_0') else event['lep_promptLeptonVeto_TagWeight_0']) < (-0.5))) | ((((((abs((event.lep_ID_0 if hasattr(event, 'lep_ID_0') else event['lep_ID_0'])) == 11) & ((event.lep_isolationFixedCutLoose_0 if hasattr(event, 'lep_isolationFixedCutLoose_0') else event['lep_isolationFixedCutLoose_0']) > 0)) & ((event.lep_isTightLH_0 if hasattr(event, 'lep_isTightLH_0') else event['lep_isTightLH_0']) > 0)) & ((event.lep_chargeIDBDTTight_0 if hasattr(event, 'lep_chargeIDBDTTight_0') else event['lep_chargeIDBDTTight_0']) > 0.7)) & ((event.lep_ambiguityType_0 if hasattr(event, 'lep_ambiguityType_0') else event['lep_ambiguityType_0']) == 0)) & ((event.lep_promptLeptonVeto_TagWeight_0 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_0') else event['lep_promptLeptonVeto_TagWeight_0']) < (-0.7))))) & (((((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else 
event['lep_ID_1'])) == 13) & ((event.lep_isMedium_1 if hasattr(event, 'lep_isMedium_1') else event['lep_isMedium_1']) > 0)) & ((event.lep_isolationFixedCutLoose_1 if hasattr(event, 'lep_isolationFixedCutLoose_1') else event['lep_isolationFixedCutLoose_1']) > 0)) & ((event.lep_promptLeptonVeto_TagWeight_1 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_1') else event['lep_promptLeptonVeto_TagWeight_1']) < (-0.5))) | ((((((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) == 11) & ((event.lep_isolationFixedCutLoose_1 if hasattr(event, 'lep_isolationFixedCutLoose_1') else event['lep_isolationFixedCutLoose_1']) > 0)) & ((event.lep_isTightLH_1 if hasattr(event, 'lep_isTightLH_1') else event['lep_isTightLH_1']) > 0)) & ((event.lep_chargeIDBDTTight_1 if hasattr(event, 'lep_chargeIDBDTTight_1') else event['lep_chargeIDBDTTight_1']) > 0.7)) & ((event.lep_ambiguityType_1 if hasattr(event, 'lep_ambiguityType_1') else event['lep_ambiguityType_1']) == 0)) & ((event.lep_promptLeptonVeto_TagWeight_1 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_1') else event['lep_promptLeptonVeto_TagWeight_1']) < (-0.7))))) | ((((event.trilep_type if hasattr(event, 'trilep_type') else event['trilep_type']) > 0) & ((event.nTaus_OR_Pt25 if hasattr(event, 'nTaus_OR_Pt25') else event['nTaus_OR_Pt25']) == 0)) & (((((abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2'])) == 13) & ((event.lep_isolationFixedCutLoose_2 if hasattr(event, 'lep_isolationFixedCutLoose_2') else event['lep_isolationFixedCutLoose_2']) > 0)) & ((event.lep_promptLeptonVeto_TagWeight_2 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_2') else event['lep_promptLeptonVeto_TagWeight_2']) < (-0.5))) | ((((((abs((event.lep_ID_2 if hasattr(event, 'lep_ID_2') else event['lep_ID_2'])) == 11) & ((event.lep_isolationFixedCutLoose_2 if hasattr(event, 'lep_isolationFixedCutLoose_2') else event['lep_isolationFixedCutLoose_2']) > 0)) & ((event.lep_isTightLH_2 if hasattr(event, 'lep_isTightLH_2') else event['lep_isTightLH_2']) > 0)) & ((event.lep_chargeIDBDTTight_2 if hasattr(event, 'lep_chargeIDBDTTight_2') else event['lep_chargeIDBDTTight_2']) > 0.7)) & ((event.lep_promptLeptonVeto_TagWeight_2 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_2') else event['lep_promptLeptonVeto_TagWeight_2']) < (-0.7))) & ((event.lep_ambiguityType_2 if hasattr(event, 'lep_ambiguityType_2') else event['lep_ambiguityType_2']) == 0))) & ((((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) == 13) & ((event.lep_isolationFixedCutLoose_1 if hasattr(event, 'lep_isolationFixedCutLoose_1') else event['lep_isolationFixedCutLoose_1']) > 0)) & ((event.lep_promptLeptonVeto_TagWeight_1 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_1') else event['lep_promptLeptonVeto_TagWeight_1']) < (-0.5))) | ((((((abs((event.lep_ID_1 if hasattr(event, 'lep_ID_1') else event['lep_ID_1'])) == 11) & ((event.lep_isolationFixedCutLoose_1 if hasattr(event, 'lep_isolationFixedCutLoose_1') else event['lep_isolationFixedCutLoose_1']) > 0)) & ((event.lep_isTightLH_1 if hasattr(event, 'lep_isTightLH_1') else event['lep_isTightLH_1']) > 0)) & ((event.lep_chargeIDBDTTight_1 if hasattr(event, 'lep_chargeIDBDTTight_1') else event['lep_chargeIDBDTTight_1']) > 0.7)) & ((event.lep_promptLeptonVeto_TagWeight_1 if hasattr(event, 'lep_promptLeptonVeto_TagWeight_1') else event['lep_promptLeptonVeto_TagWeight_1']) < (-0.7))) & ((event.lep_ambiguityType_1 if hasattr(event, 'lep_ambiguityType_1') else event['lep_ambiguityType_1']) == 0)))))) | 
(((event.dilep_type if hasattr(event, 'dilep_type') else event['dilep_type']) > 0) & ((event.nTaus_OR_Pt25 if hasattr(event, 'nTaus_OR_Pt25') else event['nTaus_OR_Pt25']) > 1))) | ((((event.dilep_type if hasattr(event, 'dilep_type') else event['dilep_type']) > 0) | ((event.trilep_type if hasattr(event, 'trilep_type') else event['trilep_type']) > 0)) == 0)) | ((event.quadlep_type if hasattr(event, 'quadlep_type') else event['quadlep_type']) > 0)) | (((event.trilep_type if hasattr(event, 'trilep_type') else event['trilep_type']) > 0) & ((event.nTaus_OR_Pt25 if hasattr(event, 'nTaus_OR_Pt25') else event['nTaus_OR_Pt25']) > 0))))) | ((event.is1L2Tau if hasattr(event, 'is1L2Tau') else event['is1L2Tau']) > 0))]) out = awkward.Table() out['0'] =[] for i in uproot.iterate(input_filenames,tree_name,branches=branches,namedecode="utf-8",entrysteps=50000, reportentries=False): out = awkward.concatenate([out, (a)((b)(awkward.Table(i)))]) # for i in uproot.iterate(input_filenames,tree_name,branches=branches,namedecode="utf-8",entrysteps=10000, reportentries=True): # print("Entry range: ", i[0], i[1]) # out = awkward.concatenate([out, (a)((b)(awkward.Table(i[2])))]) return out if __name__ == '__main__': branch_list = ['trilep_type', 'nTaus_OR_Pt25', 'total_charge', 'nJets_OR_T', 'nJets_OR_T_MV2c10_70', 'lep_Pt_1', 'lep_Pt_2', 'lep_isolationFixedCutLoose_0', 'Mlll012', 'lep_ID_0', 'lep_ID_1', 'Mll01', 'lep_ID_2', 'Mll02', 'lep_isMedium_0', 'lep_Eta_1', 'lep_isMedium_1', 'lep_Eta_2', 'lep_isMedium_2', 'DRll12', 'Mll12', 'MVA3lCERN_weight_ttH', 'MVA3lCERN_weight_ttW', 'MVA3lCERN_weight_ttZ', 'MVA3lCERN_weight_VV', 'MVA3lCERN_weight_ttbar', 'dilep_type', 'lep_promptLeptonVeto_TagWeight_0', 'lep_isTightLH_0', 'lep_chargeIDBDTTight_0', 'lep_ambiguityType_0', 'lep_isolationFixedCutLoose_1', 'lep_promptLeptonVeto_TagWeight_1', 'lep_isTightLH_1', 'lep_chargeIDBDTTight_1', 'lep_ambiguityType_1', 'lep_isolationFixedCutLoose_2', 'lep_promptLeptonVeto_TagWeight_2', 'lep_isTightLH_2', 'lep_chargeIDBDTTight_2', 'lep_ambiguityType_2', 'quadlep_type', 'lep_isQMisID_1', 'lep_isQMisID_0', 'lep_isQMisID_2', 'FSF_4L_tot', 'is1L2Tau'] a = run_query('/data/kyungeon/ttHML_v09_01/user.kchoi.ttHML_80fb_VV/ttHML_80fb_364286_mc16d.root', 'nominal', branch_list) # a = run_query('/scratch/data/ttHML_v09_01/user.kchoi.ttHML_80fb_VV/ttHML_80fb_364253_mc16d.root', 'nominal', branch_list) import awkward awkward.save('h.awkd', a)
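A hedged sketch of reading the saved query result back. awkward.save('h.awkd', a) above stores the table under the default name, so a plain load should return it; the assumption that the single output column keeps its default name '0' follows the out['0'] = [] initialisation in run_query().

    # Sketch: reload the saved selection and count the surviving entries.
    import awkward
    a_loaded = awkward.load('h.awkd')
    print(len(a_loaded['0']), 'selected entries')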
info["label"] = label info["iter"] = i ac = copy.deepcopy(awkward.persist.compression) ac[0]["types"] += [np.float32] if label.startswith("lz4"): ac[0]["pair"] = (fcomp, ("lz4.frame", "decompress")) if label.startswith("blosc"): ac[0]["pair"] = (fcomp, ("blosc", "decompress")) if label.startswith("lzma"): ac[0]["pair"] = (fcomp, ("backports.lzma", "decompress")) fname = "tables/table_{}.awkd".format(label) t0 = time.time() awkward.save(fname, table, compression=ac, mode="w") t1 = time.time() info["t_compress_ms"] = 1e3 * (t1 - t0) t0 = time.time() tmp = awkward.load(fname, whitelist=awkward.persist.whitelist + [ ['lz4.frame', 'decompress'], ['lz4.block', 'decompress'], ['blosc', 'decompress'], ['backports.lzma', 'decompress'], ]) t1 = time.time() info["t_decompress_ms"] = 1e3 * (t1 - t0) info["uncompressed_bytes"] = table.nbytes
start = time.time()
nn = ParticleNetJetTagsProducer(args.model, args.preprocess)
diff = time.time() - start
print('--- Setup model: %f s total' % (diff,))

start = time.time()
outputs = nn.predict(taginfo, eval_flags)
diff = time.time() - start
print('--- Run prediction: %f s total, %f s per jet ---' %
      (diff, diff / outputs['probQCDbb'].counts.sum()))
# print(outputs)
# for k in outputs:
#     print(k, outputs[k].content.mean())

if 'FatJet_ParticleNetMD_probXbb' in table:
    print('Compare w/ stored values')
    print('Stored values:\n ...', table['FatJet_ParticleNetMD_probXbb'][:5])
    print('Computed values:\n ...', outputs['probXbb'][:5])
    print('Diff (50%, 95%, 99%, 100%) = ', np.percentile(
        np.abs(outputs['probXbb'] - table['FatJet_ParticleNetMD_probXbb']).content,
        [50, 95, 99, 100]))

# assert(np.array_equal(jetmass.counts, outputs['probQCDbb'].counts))

alloutputs = awkward.JaggedArray.zip(outputs)
if args.make_baseline:
    with open('baseline.awkd', 'wb') as fout:
        awkward.save(fout, alloutputs)
else:
    if os.path.exists('baseline.awkd'):
        with open('baseline.awkd', 'rb') as fin:
            baseline = awkward.load(fin)
        print("Comparison to baseline:", (alloutputs == baseline).all().all())
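A hedged usage note for the baseline comparison above: it implies a two-step workflow, shown here as comments. The script name is a placeholder; the flags mirror the args.model, args.preprocess, and args.make_baseline attributes visible in this snippet.

# Sketch of the intended workflow (script name is hypothetical):
#   python run_tagger.py --model model.onnx --preprocess preprocess.json --make-baseline
#   python run_tagger.py --model model.onnx --preprocess preprocess.json
# The first run writes baseline.awkd with awkward.save; the second loads it and
# prints whether the freshly computed outputs match element by element.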