Code Example #1
File: seedingLoss.py  Project: masamuch/hepqpr-qallse
        # pick the seeding configuration matching the requested number of phi slices
        seedingConfigs = {
            53: nPhi53SeedingConfig, 100: nPhi100SeedingConfig,
            75: nPhi75SeedingConfig, 25: nPhi25SeedingConfig,
            10: nPhi10SeedingConfig, 6: nPhi6SeedingConfig,
            1: nPhi1SeedingConfig,
        }
        seedingConfig = seedingConfigs[nPhi]
        # generate the doublets: the important part is the config_cls !
        if os.path.exists(path + "-doublets.csv"):
            # doublets were already generated: reload them and their timing info
            doublets = pd.read_csv(path + "-doublets.csv", index_col=0)
            with open(path + "-metaDoublets.pickle", 'rb') as f:
                time_info = pickle.load(f)
        else:
            with time_this() as time_info:
                doublets = generate_doublets(hits_path=path + '-hits.csv',
                                             config_cls=seedingConfig)
                doublets.to_csv(path + '-doublets.csv')
            with open(path + "-metaDoublets.pickle", 'wb') as f:
                pickle.dump(time_info, f)
        results[k]['TInitialDoubletBuilding'] = time_info[1]
        print('number of doublets = ', len(doublets))
        results[k]['Ndoublets'] = len(doublets)

        # model configuration: override the maximum triplet curvature cut
        from hepqpr.qallse.qallse import Config
        config = Config()
        config.tplet_max_curv = ptThr
        # load the event data (hits and truth) into a DataWrapper
        dw = DataWrapper.from_path(path + '-hits.csv')
        if modelName == "D0":
            # merge the D0-specific defaults with the overrides above
            from hepqpr.qallse.qallse_d0 import D0Config
            new_config = merge_dicts(D0Config().as_dict(), config.as_dict())
            model = QallseD0(dw, **new_config)
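
Note: the example stops right after QallseD0 is instantiated. The lines below are a minimal, hypothetical continuation (not part of seedingLoss.py), assuming the usual Qallse workflow of build_model / to_qubo and the neal simulated-annealing sampler; the variables model and doublets come from the snippet above.

# Hypothetical continuation, assuming the model and doublets built above.
model.build_model(doublets)    # assumed Qallse API: build the QUBO terms from the doublets
Q = model.to_qubo()            # assumed Qallse API: export the problem as {(var1, var2): weight}

from neal import SimulatedAnnealingSampler   # classical sampler from the dwave-neal package
response = SimulatedAnnealingSampler().sample_qubo(Q)
print('best energy =', response.first.energy)
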
Code Example #2
def create_dataset(input_path=_get_default_input_path(),
                   output_path='.',
                   density=.1,
                   min_hits_per_track=5,
                   high_pt_cut=1.,
                   double_hits_ok=False,
                   gen_doublets=False,
                   prefix=None,
                   random_seed=None,
                   phi_bounds=None) -> Tuple[Dict, str]:
    input_path = input_path.replace('-hits.csv', '')  # just in case

    # capture all parameters, so we can dump them to a file later
    input_params = locals()

    # initialise random
    if random_seed is None:
        random_seed = random.randint(0, 1 << 30)
    random.seed(random_seed)

    # extract the event id (e.g. "event000001000") from the input path
    event_id = re.search('(event[0-9]+)', input_path)[0]

    # compute the prefix
    if prefix is None:
        prefix = f'ez-{density}'
        if high_pt_cut > 0:
            prefix += f'_hpt-{high_pt_cut}'
        else:
            prefix += '_baby'
        if double_hits_ok:
            prefix += '_dbl'

    # ---------- prepare data

    # load the data
    hits = pd.read_csv(input_path + '-hits.csv')
    particles = pd.read_csv(input_path + '-particles.csv')
    truth = pd.read_csv(input_path + '-truth.csv')

    # add indexes
    particles.set_index('particle_id', drop=False, inplace=True)
    truth.set_index('hit_id', drop=False, inplace=True)
    hits.set_index('hit_id', drop=False, inplace=True)

    # create a merged dataset with hits and truth
    df = hits.join(truth, rsuffix='_', how='inner')

    logger.debug(f'Loaded {len(df)} hits from {input_path}.')

    # ---------- filter hits

    # keep only hits in the barrel region
    df = df[hits.volume_id.isin(BARREL_VOLUME_IDS)]
    logger.debug(f'Filtered hits from barrel. Remaining hits: {len(df)}.')

    if phi_bounds is not None:
        df['phi'] = np.arctan2(df.y, df.x)
        df = df[(df.phi >= phi_bounds[0]) & (df.phi <= phi_bounds[1])]
        logger.debug(
            f'Filtered using phi bounds {phi_bounds}. Remaining hits: {len(df)}.'
        )

    # store the noise for later, then remove them from the main dataframe
    # do this before filtering double hits, as noise will be thrown away as duplicates
    noise_df = df.loc[df.particle_id == 0]
    df = df[df.particle_id != 0]

    if not double_hits_ok:
        df = df.drop_duplicates(['particle_id', 'volume_id', 'layer_id'],
                                keep='first')
        logger.debug(
            f'Dropped double hits. Remaining hits: {len(df) + len(noise_df)}.')

    # ---------- sample tracks

    num_tracks = int(df.particle_id.nunique() * density)
    sampled_particle_ids = random.sample(df.particle_id.unique().tolist(),
                                         num_tracks)
    df = df[df.particle_id.isin(sampled_particle_ids)]

    # ---------- sample noise

    num_noise = int(len(noise_df) * density)
    sampled_noise = random.sample(noise_df.hit_id.values.tolist(), num_noise)
    noise_df = noise_df.loc[sampled_noise]

    # ---------- recreate hits, particle and truth

    new_hit_ids = df.hit_id.values.tolist() + noise_df.hit_id.values.tolist()
    new_hits = hits.loc[new_hit_ids]
    new_truth = truth.loc[new_hit_ids]
    new_particles = particles.loc[sampled_particle_ids]

    # ---------- fix truth

    if high_pt_cut > 0:
        # set the weight of low-pt hits to 0 (pt computed from the true momentum)
        hpt_mask = np.sqrt(new_truth.tpx**2 + new_truth.tpy**2) >= high_pt_cut
        new_truth.loc[~hpt_mask, 'weight'] = 0
        logger.debug(f'High Pt hits: {hpt_mask.sum()}/{len(new_truth)}')

    if min_hits_per_track > 0:
        # set the weight of hits from tracks that are too short to 0
        short_tracks = new_truth.groupby('particle_id').filter(
            lambda g: len(g) < min_hits_per_track)
        new_truth.loc[short_tracks.index, 'weight'] = 0

    # renormalise the weights so they sum to 1
    new_truth.weight = new_truth.weight / new_truth.weight.sum()

    # ---------- write data

    # write the dataset to disk
    output_path = os.path.join(output_path, prefix)
    os.makedirs(output_path, exist_ok=True)
    output_path = os.path.join(output_path, event_id)

    new_hits.to_csv(output_path + '-hits.csv', index=False)
    new_truth.to_csv(output_path + '-truth.csv', index=False)
    new_particles.to_csv(output_path + '-particles.csv', index=False)

    # ---------- write metadata

    metadata = dict(
        num_hits=new_hits.shape[0],
        num_tracks=num_tracks,
        num_important_tracks=new_truth[
            new_truth.weight != 0].particle_id.nunique(),
        num_noise=num_noise,
        random_seed=random_seed,
        time=datetime.now().isoformat(),
    )
    for k, v in metadata.items():
        logger.debug(f'  {k}={v}')

    metadata['params'] = input_params

    with open(output_path + '-meta.json', 'w') as f:
        json.dump(metadata, f, indent=4)

    # ------------ gen doublets

    if gen_doublets:

        from hepqpr.qallse.seeding import generate_doublets
        doublets_df = generate_doublets(truth=new_truth, hits=new_hits)
        doublets_df.to_csv(output_path + '-doublets.csv', index=False)
        logger.info(
            f'Doublets (len={len(doublets_df)}) generated in {output_path}.')

    return metadata, output_path
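
A minimal usage sketch for create_dataset, assuming a TrackML event (event000001000-hits.csv, -particles.csv, -truth.csv) is available under the given input path; all paths and parameter values below are illustrative only.

# Illustrative call (paths and values are hypothetical)
metadata, out_path = create_dataset(
    input_path='train_sample/event000001000',  # event prefix; '-hits.csv' etc. are appended internally
    output_path='/tmp/datasets',
    density=0.1,          # keep ~10% of the tracks and of the noise hits
    high_pt_cut=1.0,      # hits from particles below 1 GeV pt get weight 0
    gen_doublets=True,    # also write event000001000-doublets.csv
    random_seed=42,
)
print(f"{metadata['num_hits']} hits written, output prefix: {out_path}")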