# map the number of phi slices to the corresponding seeding configuration
seeding_configs = {
    53: nPhi53SeedingConfig,
    100: nPhi100SeedingConfig,
    75: nPhi75SeedingConfig,
    25: nPhi25SeedingConfig,
    10: nPhi10SeedingConfig,
    6: nPhi6SeedingConfig,
    1: nPhi1SeedingConfig,
}
seedingConfig = seeding_configs[nPhi]

# generate the doublets: the important part is the config_cls!
if os.path.exists(path + "-doublets.csv"):
    # doublets already cached on disk: reload them together with their timing
    # metadata (the pickle must be read *before* time_info is used)
    doublets = pd.read_csv(path + "-doublets.csv", index_col=0)
    with open(path + "-metaDoublets.pickle", 'rb') as f:
        time_info = pickle.load(f)
else:
    # build the doublets from scratch and cache them, along with the timing
    with time_this() as time_info:
        doublets = generate_doublets(hits_path=path + '-hits.csv', config_cls=seedingConfig)
    doublets.to_csv(path + '-doublets.csv')
    with open(path + "-metaDoublets.pickle", 'wb') as f:
        pickle.dump(time_info, f)

results[k]['TInitialDoubletBuilding'] = time_info[1]

print('number of doublets = ', len(doublets))
results[k]['Ndoublets'] = len(doublets)

from hepqpr.qallse.qallse import Config

config = Config()
config.tplet_max_curv = ptThr  # curvature cut corresponding to the pt threshold

dw = DataWrapper.from_path(path + '-hits.csv')

if modelName == "D0":
    from hepqpr.qallse.qallse_d0 import D0Config
    new_config = merge_dicts(D0Config().as_dict(), config.as_dict())
    model = QallseD0(dw, **new_config)
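# The caching logic above relies on a time_this() context manager whose result
# is pickled and later read as time_info[1]. Below is a minimal sketch of a
# compatible helper, ASSUMING time_info is a [start_timestamp, elapsed_seconds]
# pair -- an illustration only, not the library's actual implementation:
import time
from contextlib import contextmanager

@contextmanager
def time_this():
    # yield a mutable [start, elapsed] pair so the caller keeps a reference
    info = [time.time(), None]
    try:
        yield info
    finally:
        # filled in on exit, so time_info[1] is the wall-clock duration
        info[1] = time.time() - info[0]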
def create_dataset(input_path=_get_default_input_path(), output_path='.',
                   density=.1, min_hits_per_track=5, high_pt_cut=1.,
                   double_hits_ok=False, gen_doublets=False, prefix=None,
                   random_seed=None, phi_bounds=None) -> Tuple[Dict, str]:
    input_path = input_path.replace('-hits.csv', '')  # just in case

    # capture all parameters, so we can dump them to a file later
    input_params = locals()

    # initialise random
    if random_seed is None:
        random_seed = random.randint(0, 1 << 30)
    random.seed(random_seed)

    event_id = re.search('(event[0-9]+)', input_path)[0]

    # compute the prefix
    if prefix is None:
        prefix = f'ez-{density}'
        if high_pt_cut > 0:
            prefix += f'_hpt-{high_pt_cut}'
        else:
            prefix += '_baby'
        if double_hits_ok:
            prefix += '_dbl'

    # ---------- prepare data

    # load the data
    hits = pd.read_csv(input_path + '-hits.csv')
    particles = pd.read_csv(input_path + '-particles.csv')
    truth = pd.read_csv(input_path + '-truth.csv')

    # add indexes
    particles.set_index('particle_id', drop=False, inplace=True)
    truth.set_index('hit_id', drop=False, inplace=True)
    hits.set_index('hit_id', drop=False, inplace=True)

    # create a merged dataset with hits and truth
    df = hits.join(truth, rsuffix='_', how='inner')
    logger.debug(f'Loaded {len(df)} hits from {input_path}.')

    # ---------- filter hits

    # keep only hits in the barrel region
    df = df[df.volume_id.isin(BARREL_VOLUME_IDS)]
    logger.debug(f'Filtered hits from barrel. Remaining hits: {len(df)}.')

    if phi_bounds is not None:
        df['phi'] = np.arctan2(df.y, df.x)
        df = df[(df.phi >= phi_bounds[0]) & (df.phi <= phi_bounds[1])]
        logger.debug(f'Filtered using phi bounds {phi_bounds}. Remaining hits: {len(df)}.')

    # store the noise for later, then remove it from the main dataframe;
    # do this before filtering double hits, as noise would be thrown away as duplicates
    noise_df = df.loc[df.particle_id == 0]
    df = df[df.particle_id != 0]

    if not double_hits_ok:
        df.drop_duplicates(['particle_id', 'volume_id', 'layer_id'],
                           keep='first', inplace=True)
        logger.debug(f'Dropped double hits. Remaining hits: {len(df) + len(noise_df)}.')

    # ---------- sample tracks

    num_tracks = int(df.particle_id.nunique() * density)
    sampled_particle_ids = random.sample(df.particle_id.unique().tolist(), num_tracks)
    df = df[df.particle_id.isin(sampled_particle_ids)]

    # ---------- sample noise

    num_noise = int(len(noise_df) * density)
    sampled_noise = random.sample(noise_df.hit_id.values.tolist(), num_noise)
    noise_df = noise_df.loc[sampled_noise]

    # ---------- recreate hits, particles and truth

    new_hit_ids = df.hit_id.values.tolist() + noise_df.hit_id.values.tolist()
    new_hits = hits.loc[new_hit_ids]
    new_truth = truth.loc[new_hit_ids]
    new_particles = particles.loc[sampled_particle_ids]

    # ---------- fix truth

    if high_pt_cut > 0:
        # set low-pt weights to 0; compute the mask on new_truth so the
        # boolean index aligns with the sampled hits (and the log is accurate)
        hpt_mask = np.sqrt(new_truth.tpx ** 2 + new_truth.tpy ** 2) >= high_pt_cut
        new_truth.loc[~hpt_mask, 'weight'] = 0
        logger.debug(f'High Pt hits: {sum(hpt_mask)}/{len(new_truth)}')

    if min_hits_per_track > 0:
        # zero the weight of tracks with too few hits
        short_tracks = new_truth.groupby('particle_id').filter(
            lambda g: len(g) < min_hits_per_track)
        new_truth.loc[short_tracks.index, 'weight'] = 0

    new_truth.weight = new_truth.weight / new_truth.weight.sum()

    # ---------- write data

    # write the dataset to disk
    output_path = os.path.join(output_path, prefix)
    os.makedirs(output_path, exist_ok=True)
    output_path = os.path.join(output_path, event_id)

    new_hits.to_csv(output_path + '-hits.csv', index=False)
    new_truth.to_csv(output_path + '-truth.csv', index=False)
    new_particles.to_csv(output_path + '-particles.csv', index=False)

    # ---------- write metadata

    metadata = dict(
        num_hits=new_hits.shape[0],
        num_tracks=num_tracks,
        num_important_tracks=new_truth[new_truth.weight != 0].particle_id.nunique(),
        num_noise=num_noise,
        random_seed=random_seed,
        time=datetime.now().isoformat(),
    )
    for k, v in metadata.items():
        logger.debug(f'  {k}={v}')

    metadata['params'] = input_params

    with open(output_path + '-meta.json', 'w') as f:
        json.dump(metadata, f, indent=4)

    # ---------- gen doublets

    if gen_doublets:
        from hepqpr.qallse.seeding import generate_doublets
        doublets_df = generate_doublets(truth=new_truth, hits=new_hits)
        with open(output_path + '-doublets.csv', 'w') as f:
            doublets_df.to_csv(f, index=False)
        logger.info(f'Doublets (len={len(doublets_df)}) generated in {output_path}.')

    return metadata, output_path
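# Hypothetical usage of create_dataset: the event path and output directory
# below are placeholders (any TrackML event with -hits/-particles/-truth CSVs
# works), not values taken from the original code.
metadata, ds_path = create_dataset(
    input_path='data/event000001000',  # expects event000001000-{hits,particles,truth}.csv
    output_path='datasets',
    density=0.1,          # keep ~10% of the tracks and of the noise hits
    high_pt_cut=1.0,      # zero-weight hits from tracks below 1 GeV
    gen_doublets=True,    # also write event000001000-doublets.csv
    random_seed=42,       # reproducible sampling
)
print(f"dataset written to {ds_path} with {metadata['num_hits']} hits")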