Code example #1
def main(path, cuts, threshold):
    config = 'configs/data_mc.yaml'

    if cuts:
        precuts = '_precuts'
    else:
        precuts = ''

    print('Read in data...')
    gamma = read_h5py(path + '/gamma{}.hdf5'.format(precuts), key='events')
    proton = read_h5py(path + '/proton{}.hdf5'.format(precuts), key='events')
    data = read_h5py(path + '/crab_data{}.hdf5'.format(precuts), key='events')
    print('Done...')

    if config is not None:
        with open(config) as f:
            config = yaml.safe_load(f)
    else:
        config = {}
    print(config)

    fig = plt.figure()
    ax_hist = fig.add_subplot(1, 1, 1)

    keys = set(data.columns).intersection(set(proton.columns))
    wech = [
        'run', 'event', 'pointing_position_az', 'pointing_position_zd',
        'theta_deg_off_1', 'theta_deg_off_2', 'theta_deg_off_3',
        'theta_deg_off_4', 'theta_deg_off_5'
    ]
    for key in wech:
        if key in keys and key in set(gamma.columns):
            keys.remove(key)
    dfs = OrderedDict([
        ('data', data),
        ('proton', proton),
        ('gamma', gamma),
    ])

    # Dict for histograms
    h = {}
    print('Plotting keys...')
    with PdfPages('{}/pdf/feature_comp_cuts.pdf'.format(path)) as pdf:
        for key in sorted(list(keys)):
            print(key)

            kwargs = config.get(key, {})
            if 'transform' in kwargs:
                kwargs['transform'] = eval(kwargs['transform'])

            ax_hist.cla()
            plot_histograms(dfs, key, h, threshold, ax=ax_hist, **kwargs)

            pdf.savefig(fig)
Code example #2
def main(infile):

    with h5py.File(infile, mode='r') as f:
        is_simulation = 'corsika_runs' in f

    if is_simulation:
        df = read_h5py(infile, key='events', columns=columns_sim)
        obstime = None
    else:
        df = read_h5py(infile, key='events', columns=columns_obs)
        obstime = Time(df.dragon_time, format='unix')

    altaz = AltAz(obstime=obstime, location=location)

    pointing = SkyCoord(
        alt=u.Quantity(df.alt_tel.values, u.rad, copy=False),
        az=u.Quantity(df.az_tel.values, u.rad, copy=False),
        frame=altaz,
    )

    camera_frame = CameraFrame(telescope_pointing=pointing,
                               location=location,
                               obstime=obstime,
                               focal_length=28 * u.m)

    prediction_cam = SkyCoord(
        x=u.Quantity(df.source_x_prediction.values, u.m, copy=False),
        y=u.Quantity(df.source_y_prediction.values, u.m, copy=False),
        frame=camera_frame,
    )

    prediction_altaz = prediction_cam.transform_to(altaz)

    append_column_to_hdf5(infile, prediction_altaz.alt.rad, 'events',
                          'source_alt_prediction')
    append_column_to_hdf5(infile, prediction_altaz.az.rad, 'events',
                          'source_az_prediction')

    if not is_simulation:
        prediction_icrs = prediction_altaz.transform_to('icrs')
        pointing_icrs = pointing.transform_to('icrs')

        append_column_to_hdf5(infile, prediction_icrs.ra.rad, 'events',
                              'source_ra_prediction')
        append_column_to_hdf5(infile, prediction_icrs.dec.rad, 'events',
                              'source_dec_prediction')
        append_column_to_hdf5(infile, pointing_icrs.ra.rad, 'events',
                              'pointing_ra')
        append_column_to_hdf5(infile, pointing_icrs.dec.rad, 'events',
                              'pointing_dec')
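
The function above relies on module-level names that are not shown in the snippet (location, columns_sim, columns_obs). A minimal sketch of plausible definitions, assuming the LST-1 site coordinates that appear in code example #15:

import astropy.units as u
from astropy.coordinates import EarthLocation

# assumed definitions -- not part of the original snippet
location = EarthLocation.from_geodetic(
    -17.89139 * u.deg, 28.76139 * u.deg, 2184 * u.m  # LST-1 site, as in example #15
)
columns_sim = ['alt_tel', 'az_tel', 'source_x_prediction', 'source_y_prediction']
columns_obs = columns_sim + ['dragon_time']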
Code example #3
def theta_cut(
    path_gamma,
    path_hadron,
    theta_cut,
    length=None,
    path_feature='/home/msackel/Desktop/gammaClassification/config/feature.yaml'
):
    '''
    Read the feature list from a *.yaml file and load the data from the
    given files. theta_deg is added to the hadron columns so that a theta
    cut can be applied to the data.
    '''
    with open(path_feature) as f:
        feature = yaml.safe_load(f)

    gamma_data = pd.read_hdf(path_gamma, key='events')[feature]
    hadron_data = read_h5py(path_hadron,
                            key='events',
                            columns=feature + ['theta_deg'])
    # Theta cut on the hadron data, then label both datasets.
    hadron_data = hadron_data[hadron_data['theta_deg']**2 >= theta_cut]
    hadron_data['label'] = 0
    gamma_data['label'] = 1
    # Truncate both datasets to the length of the smaller one (unless a
    # length is given) and return them concatenated.
    if length is None:
        length = min([len(hadron_data), len(gamma_data)])

    return pd.concat(
        [hadron_data.drop('theta_deg', axis=1)[:length], gamma_data[:length]])
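
For reference, code example #31 below calls this helper positionally; a minimal usage sketch with placeholder paths:

# hypothetical paths; theta2 cut of 0.5 as in code example #31
train_data = theta_cut('gamma_precuts.hdf5', 'hadron_precuts.hdf5', 0.5)
X, y = train_data.drop('label', axis=1), train_data['label']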
Code example #4
def main(data_path, key):

    events = read_h5py(data_path, key='events', columns=columns)

    theta2_cuts = np.arange(0.1, 0.0, -0.001)
    prediction_thresholds = np.arange(0.75, 1, 0.001)

    max_significance = 0
    selected = events
    for threshold in tqdm(prediction_thresholds):
        selected = selected.query('gamma_prediction >= {}'.format(threshold))

        theta2_on = selected.theta_deg**2
        theta2_off = pd.concat(
            [selected['theta_deg_off_{}'.format(i)] for i in range(1, 6)])**2

        for theta2_cut in theta2_cuts:
            theta2_on = theta2_on[theta2_on <= theta2_cut]
            theta2_off = theta2_off[theta2_off <= theta2_cut]

            n_on = len(theta2_on)
            n_off = len(theta2_off)

            sig = li_ma_significance(n_on, n_off, 0.2)
            if sig >= max_significance:
                max_significance = sig
                best_threshold = threshold
                best_theta2_cut = theta2_cut

    print('Threshold:', best_threshold)
    print('θ² cut:   ', best_theta2_cut)
    print('Li&Ma    :', max_significance)
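
li_ma_significance is imported from fact.analysis; a minimal sketch of the standard Li & Ma (1983, Eq. 17) formula it is expected to implement, with alpha the on/off exposure ratio:

import numpy as np

def li_ma_significance_sketch(n_on, n_off, alpha=0.2):
    # Li & Ma 1983, Eq. 17
    if n_on == 0 or n_off == 0:
        return 0.0
    term_on = n_on * np.log((1 + alpha) / alpha * n_on / (n_on + n_off))
    term_off = n_off * np.log((1 + alpha) * n_off / (n_on + n_off))
    return np.sqrt(2 * (term_on + term_off))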
Code example #5
def read_timestamp(path):
    try:
        timestamp = read_h5py(path, key='events', columns=['timestamp'])
        timestamp = pd.to_datetime(timestamp['timestamp'])
    except KeyError:
        try:
            col = 'unix_time_utc'
            unix_time_utc = read_h5py(path, key='events', columns=[col])
            timestamp = pd.to_datetime(
                unix_time_utc[col + '_0'] * 1e6 + unix_time_utc[col + '_1'],
                unit='us',
            )
        except KeyError:
            raise KeyError(
                'File contains neither "timestamp" nor "unix_time_utc"')
    return timestamp
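
The fallback branch rebuilds the timestamp from two integer columns: whole seconds (unix_time_utc_0) and microseconds (unix_time_utc_1). A quick standalone check of that conversion:

import pandas as pd

seconds, micros = 1484006400, 250000  # example values
print(pd.to_datetime(seconds * 1e6 + micros, unit='us'))
# 2017-01-10 00:00:00.250000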
Code example #6
def test_to_h5py():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')

        with h5py.File(f.name, 'r') as hf:

            assert 'test' in hf.keys()

            g = hf['test']

            assert 'x' in g.keys()
            assert 'N' in g.keys()

        df2 = read_h5py(f.name, key='test')
        df2.sort_index(axis=1, inplace=True)
        df.sort_index(axis=1, inplace=True)

        assert all(df.dtypes == df2.dtypes)
        assert all(df['x'] == df2['x'])
        assert all(df['N'] == df2['N'])
Code example #7
def load_gamma_subset(sourcefile,
                      theta2_cut=0.0, conf_cut=0.9, num_off_positions=1, analysis_type='classic', with_runs=False):
    events = read_h5py(sourcefile, key='events')

    selection_columns = ['theta_deg', 'gamma_prediction', 'zd_tracking', 'conc_core']
    # off positions are numbered 1..num_off_positions (inclusive)
    theta_off_columns = ['theta_deg_off_{}'.format(i)
                         for i in range(1, num_off_positions + 1)]
    bg_prediction_columns = ['gamma_prediction_off_{}'.format(i)
                             for i in range(1, num_off_positions + 1)]

    if analysis_type == 'source':
        log.info('\tSelection events for source dependent analysis')
        log.info("\t\tgamma_pred_cut={0:.2f}".format(conf_cut))
        on_data, off_data = split_on_off_source_dependent(
            events=events,
            prediction_threshold=conf_cut,
            on_prediction_key='gamma_prediction',
            off_prediction_keys=bg_prediction_columns)
        on_mc = events.query('gamma_prediction >= {}'.format(conf_cut))
    elif analysis_type == 'classic':
        log.info('\tSelection events for source independent analysis')
        log.info("\t\tgamma_pred_cut={0:.2f}".format(conf_cut))
        log.info("\t\ttheta2_cut={0:.2f}".format(theta2_cut))
        on_data, off_data = split_on_off_source_independent(
            events=events.query('gamma_prediction >= {}'.format(conf_cut)),
            theta2_cut=theta2_cut,
            theta_key='theta_deg',
            theta_off_keys=theta_off_columns)
        on_mc = events.query(
            '(theta_deg**2 <= {}) & (gamma_prediction >= {})'.format(
                theta2_cut, conf_cut))

    log.info("\t{} Data Events (on region)".format(len(on_data)))
    log.info("\t\t{} Data Events ({} off regions)".format(len(off_data),
                                                          num_off_positions))
    log.info("\t{} MC gammas after selection".format(len(on_mc)))

    if with_runs:
        runs = read_h5py(sourcefile, key='runs')
        t_obs = runs.ontime.sum()

    n_events_per_off_region = len(off_data) / num_off_positions
    n_events_on_region = len(on_data)
    n_events_expected_signal = n_events_on_region - n_events_per_off_region

    return on_mc, on_data, off_data
Code example #8
def create_mask(input_file, mask_config):
    columns = list(mask_config.keys())
    df = read_h5py(input_file, key='events', columns=columns)

    mask = np.ones(len(df), dtype='bool')
    for key, (op, val) in mask_config.items():
        mask &= OPERATORS[op](df[key], val)
    return mask
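
OPERATORS is not shown in the snippet; a plausible sketch mapping the config's operator strings to functions from the operator module:

import operator

# assumed mapping -- the real OPERATORS dict is defined elsewhere in the project
OPERATORS = {
    '<': operator.lt, '<=': operator.le,
    '==': operator.eq, '!=': operator.ne,
    '>': operator.gt, '>=': operator.ge,
}

# each config entry maps a column name to an (operator string, value) pair
mask_config = {'gamma_prediction': ('>=', 0.8), 'theta_deg': ('<=', 0.1)}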
Code example #9
def test_write_lists_h5py():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({'x': [[1.0, 2.0], [3.0, 4.0]]})

    with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
        to_h5py(df, f.name)

        df = read_h5py(f.name, columns=['x'])

        assert df['x_0'].iloc[0] == 1.0
Code example #10
def test_hdf5():
    from erna.io import Writer

    with tempfile.NamedTemporaryFile(prefix='erna_test_', suffix='.hdf5') as f:
        with Writer(f.name) as writer:
            assert writer.fmt == 'hdf5'
            for i in range(n_dfs):
                writer.append(random_df(n_rows))

        df = read_h5py(f.name, key='events')
        assert len(df) == n_rows * n_dfs
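
The test uses module-level fixtures that are not part of the snippet; a minimal sketch of what they might look like:

import numpy as np
import pandas as pd

# assumed fixtures for the test above
n_dfs, n_rows = 5, 100

def random_df(n):
    return pd.DataFrame({'x': np.random.normal(size=n)})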
Code example #11
def read_dfs_for_column(datasets, column, masks=None):
    dfs = []
    for d, dataset in enumerate(datasets):
        if 'parts' in dataset:
            parts = []
            for p, part in enumerate(dataset['parts']):
                df = read_h5py(
                    part['path'], key='events', columns=[column]
                )
                if masks is not None:
                    mask = masks[d][p]
                    df = df.loc[mask].copy()
                parts.append(df)
            dfs.append(parts)
        else:
            df = read_h5py(dataset['path'], key='events', columns=[column])
            if masks is not None:
                mask = masks[d]
                df = df.loc[mask].copy()
            dfs.append(df)
    return dfs
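
The nesting of the masks argument mirrors the datasets configuration; a sketch of the assumed shape, inferred from the access patterns above:

# single-file datasets carry a 'path'; multi-part datasets carry 'parts'
datasets = [
    {'path': 'gammas.hdf5'},
    {'parts': [{'path': 'run1.hdf5'}, {'path': 'run2.hdf5'}]},
]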
Code example #12
def read_file(infile):
    log.debug(f"Reading {infile}")
    events = read_h5py(infile, key='events', columns=list(COLUMN_MAP.keys()))
    sim_runs = read_h5py(infile, key='corsika_runs')

    events.rename(columns=COLUMN_MAP, inplace=True)

    n_showers = np.sum(sim_runs.num_showers * sim_runs.shower_reuse)
    log.debug(f"Number of events from corsika_runs: {n_showers}")

    sim_info = SimulatedEventsInfo(
        n_showers=n_showers,
        energy_min=u.Quantity(sim_runs["energy_range_min"][0], u.TeV),
        energy_max=u.Quantity(sim_runs["energy_range_max"][0], u.TeV),
        max_impact=u.Quantity(sim_runs["max_scatter_range"][0], u.m),
        spectral_index=sim_runs["spectral_index"][0],
        viewcone=u.Quantity(
            sim_runs["max_viewcone_radius"][0] -
            sim_runs["min_viewcone_radius"][0], u.deg),
    )
    return table.QTable.from_pandas(events, units=UNIT_MAP), sim_info
Code example #13
def test_to_h5py_string():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        'name': ['Mrk 501', 'Mrk 421', 'Crab'],
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')
        df2 = read_h5py(f.name, key='test')

        assert all(df.dtypes == df2.dtypes)
        assert all(df['name'] == df2['name'])
Code example #14
def test_to_h5py_append_second_group():
    from fact.io import to_h5py, read_h5py

    df1 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })
    df2 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df1, f.name, key='g1', index=False)
        to_h5py(df2, f.name, key='g2', index=False)

        df_g1 = read_h5py(f.name, key='g1')
        df_g2 = read_h5py(f.name, key='g2')

        for col in df_g1.columns:
            assert all(df_g1[col] == df1[col])

        for col in df_g2.columns:
            assert all(df_g2[col] == df2[col])
Code example #15
def read_run_calculate_thetas(run, columns, threshold, source: SkyCoord,
                              n_offs):

    df = read_h5py(run, key='events', columns=columns)

    ontime = calc_ontime(df).to(u.hour)

    if isinstance(threshold, float):
        df_selected = df.query(f'gammaness > {threshold}')
    else:
        df['selected_gh'] = evaluate_binned_cut(
            df.gammaness.to_numpy(),
            df.gamma_energy_prediction.to_numpy() * u.TeV, threshold,
            operator.ge)
        df_selected = df.query('selected_gh')

    location = EarthLocation.from_geodetic(-17.89139 * u.deg, 28.76139 * u.deg,
                                           2184 * u.m)
    obstime = Time(df_selected.dragon_time, format='unix')

    altaz = AltAz(obstime=obstime, location=location)

    pointing = SkyCoord(
        alt=u.Quantity(df_selected.alt_tel.values, u.rad, copy=False),
        az=u.Quantity(df_selected.az_tel.values, u.rad, copy=False),
        frame=altaz,
    )
    pointing_icrs = pointing.transform_to('icrs')

    prediction_icrs = SkyCoord(df_selected.source_ra_prediction.values * u.rad,
                               df_selected.source_dec_prediction.values *
                               u.rad,
                               frame='icrs')

    theta, theta_off = calc_theta_off(
        source_coord=source,
        reco_coord=prediction_icrs,
        pointing_coord=pointing_icrs,
        n_off=n_offs,
    )

    # generate a df with n_offs copies of df_selected so energies etc.
    # line up with the concatenated theta_off values
    df_selected5 = pd.concat([df_selected] * n_offs)

    return df_selected, ontime, theta, df_selected5, theta_off
Code example #16
def init(self):
    self.dl2_file = read_h5py(
        self.dl2_file,
        key="events",
        columns=[
            "event_num",
            "run_id",
            "night",
            "source_position_az",
            "source_position_zd",
            "source_position_x",
            "source_position_y",
            "cog_x",
            "cog_y",
            "timestamp",
            "pointing_position_az",
            "pointing_position_zd",
        ],
    )
Code example #17
def test_to_h5py_append():
    from fact.io import to_h5py, read_h5py

    df1 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })
    df2 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df1, f.name, key='test', index=False)
        to_h5py(df2, f.name, key='test', mode='a', index=False)

        df_read = read_h5py(f.name, key='test')
        df_written = pd.concat([df1, df2], ignore_index=True)

        for col in df_written.columns:
            assert all(df_read[col] == df_written[col])
Code example #18
def test_to_h5py_datetime():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        't_ns': pd.date_range('2017-01-01', freq='1ns', periods=100),
        't_us': pd.date_range('2017-01-01', freq='1us', periods=100),
        't_ms': pd.date_range('2017-01-01', freq='1ms', periods=100),
        't_s': pd.date_range('2017-01-01', freq='1s', periods=100),
        't_d': pd.date_range('2017-01-01', freq='1d', periods=100),
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')
        df2 = read_h5py(f.name, key='test')

        for col in df.columns:
            assert all(df[col] == df2[col])
Code example #19
File: io.py Project: fact-project/aict-tools
def read_data(file_path, key=None, columns=None, first=None, last=None, **kwargs):
    """
    This is similar to the read_data function in fact.io
    pandas hdf5:   pd.HDFStore
    h5py hdf5:     fact.io.read_h5py
    """
    _, extension = os.path.splitext(file_path)

    if extension in [".hdf", ".hdf5", ".h5"]:
        try:
            df = pd.read_hdf(
                file_path, key=key, columns=columns, start=first, stop=last, **kwargs
            )
        except (TypeError, ValueError):
            df = read_h5py(
                file_path, key=key, columns=columns, first=first, last=last, **kwargs
            )
        return df
    else:
        raise NotImplementedError(
            f"AICT tools cannot handle data with extension {extension} yet."
        )
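
A short usage sketch (the file path is a placeholder): pd.read_hdf only understands pandas-format HDF5 files, so read_data falls back to fact.io.read_h5py on the TypeError/ValueError that pandas raises for plain h5py files.

# hypothetical file; works for both pandas-format and h5py-format HDF5
df = read_data('events.hdf5', key='events', columns=['gamma_prediction'], first=0, last=1000)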
Code example #20
def plot_binned(binned, key, label, energy_unit, ax):
    # assumed signature: the original snippet begins mid-function
    ax.errorbar(
        binned['center'],
        binned[key],
        xerr=0.5 * binned['width'],
        label=label,
        linestyle='',
    )
    ax.legend()
    ax.set_xscale('log')
    ax.set_xlabel(
            rf'$\log_{{10}}(E_{{\mathrm{{MC}}}} \,\, / \,\, \mathrm{{{energy_unit}}})$'
    )

    return ax

gamma_2_150 = read_h5py('../build/dl2_gamma_south_pointing_20200706_v0.5.2_local_DL1_testing.h5', key='events')
gamma_2_300 = read_h5py('../HDD/build_scaling_300/dl2_gamma_south_pointing_20200706_v0.5.2_local_DL1_testing.h5', key='events')
gamma_1_150 = read_h5py('../HDD/build_noscaling/dl2_gamma_south_pointing_20200514_v0.5.1_v01_DL1_testing.h5', key='events')
gamma_1_300 = read_h5py('../HDD/build_noscaling_300/dl2_gamma_south_pointing_20200514_v0.5.1_v01_DL1_testing.h5', key='events')

gammaness_threshold = 0.6

figures = []

figures.append(plt.figure())
ax = figures[-1].add_subplot(1, 1, 1)
plotting.angular_res(gamma_1_150, 'mc_energy', ax, label='v0.5.1 and intensity > 150')
plotting.angular_res(gamma_1_300, 'mc_energy', ax, label='v0.5.1 and intensity > 300')
plotting.angular_res(gamma_2_150, 'mc_energy', ax, label='v0.5.2 and intensity > 150')
plotting.angular_res(gamma_2_300, 'mc_energy', ax, label='v0.5.2 and intensity > 300')
#ax.set_title('All events')
Code example #21
def main(outdir, gamma_diff_file, gamma_file, output):
    offs = [
        f'{outdir}/dl2_v0.5.1_LST-1.Run01837.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01840.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01841.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01842.h5'
    ]

    ons = [
        f'{outdir}/dl2_v0.5.1_LST-1.Run01832.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01833.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01834.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01835.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01836.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01843.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01844.h5'
    ]

    df_off = pd.DataFrame()
    for i, run in enumerate(offs):
        df_off = pd.concat(
            [df_off, read_h5py(run, key='events', columns=columns)],
            ignore_index=True)

    df_on = pd.DataFrame()
    for i, run in enumerate(ons):
        df_on = pd.concat(
            [df_on, read_h5py(run, key='events', columns=columns)],
            ignore_index=True)

    gamma_diff = read_h5py(gamma_diff_file, key='events')
    gamma = read_h5py(gamma_file, key='events')

    figures = []
    theta2_cut = 0.04
    gammaness_threshold = 0.6

    #theta2 camera center
    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.theta2(df_on, theta2_cut, gammaness_threshold, df_off, ax)
    #ax.set_title('Crab camera center, total-time scaling')

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.theta2(df_on,
                    theta2_cut,
                    gammaness_threshold,
                    df_off,
                    ax,
                    alpha='manuel')
    #ax.set_title('Crab camera center, furthest $50\%$ scaling')

    #crab coordinates
    on_pointing = []
    for i, run in enumerate(ons):
        df = read_h5py(run, key='events', columns=columns)
        on_pointing.append(df)

    #figures.append(plt.figure())
    #ax = figures[-1].add_subplot(1, 1, 1)
    #plotting.plot2D_runs(on_pointing, ons, 'crab', gammaness_threshold, ax)
    #
    #figures.append(plt.figure())
    #ax = figures[-1].add_subplot(1, 1, 1)
    #plotting.plot2D(df_on, gammaness_threshold, ax)

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.theta2(df_on, 0.1, gammaness_threshold, df_off, ax, coord='crab')
    ax.set_title('Crab coordinates, total-time scaling')

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.theta2(df_on,
                    0.1,
                    gammaness_threshold,
                    df_off,
                    ax,
                    alpha='manuel',
                    coord='crab')
    ax.set_title('Crab coordinates, furthest $50\%$ scaling')

    #test plots
    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    ax.hist(gamma_diff.disp_prediction, bins=100, histtype='step')
    ax.set_xlabel('disp prediction')
    ax.set_title('gamma-diffuse testing')

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    ax.hist(gamma_diff.gammaness, bins=100, histtype='step')
    ax.set_xlabel('gammaness')
    ax.set_title('gamma-diffuse testing')

    #figures.append(plt.figure())
    #ax = figures[-1].add_subplot(1, 1, 1)
    #plotting.theta2(gamma_diff, theta2_cut, gammaness_threshold, ax=ax, range=None)
    #ax.set_title('gamma-diffuse testing')

    #angular resolution
    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.angular_res(gamma, 'mc_energy', ax)
    ax.set_title('Angular resolution (no cuts)')

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)

    gamma['sign_prediction'] = np.sign(gamma.disp_prediction)
    gamma_cuts = gamma.query('sign_prediction == disp_sign')
    gamma_cuts = gamma_cuts.query(f'gammaness > {gammaness_threshold}')
    plotting.angular_res(gamma_cuts, 'mc_energy', ax)
    ax.set_title(
        f'Angular resolution (correct sign prediction & gammaness > {gammaness_threshold})'
    )

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.angular_res(gamma, 'mc_energy', ax, label='All events')
    plotting.angular_res(
        gamma_cuts,
        'mc_energy',
        ax,
        label=rf'correct sign and $p_\gamma > {gammaness_threshold}$')

    #saving
    with PdfPages(output) as pdf:
        for fig in figures:
            fig.tight_layout()
            pdf.savefig(fig)
Code example #22
def main(
    config,
    gamma_file,
    corsika_file,
    output_file,
    obstime,
    seed,
    label,
    threshold,
    theta2_cut,
):
    '''
    unfold fact simulations
    '''
    setup_logging()
    log = logging.getLogger('fact_funfolding')
    log.setLevel(logging.INFO)

    random_state = np.random.RandomState(seed)
    np.random.set_state(random_state.get_state())

    config = Config.from_yaml(config)
    e_ref = config.e_ref
    threshold = threshold or config.threshold
    theta2_cut = theta2_cut or config.theta2_cut

    log.info(f'Using threshold {threshold}')
    log.info(f'Using theta2 cut {theta2_cut}')

    # define binning in e_est and e_true
    bins_obs = logspace_binning(config.e_est_low, config.e_est_high, e_ref,
                                config.n_bins_est)
    bins_true = logspace_binning(config.e_true_low, config.e_true_high, e_ref,
                                 config.n_bins_true)

    # read in files
    query = 'gamma_prediction > {} and theta_deg**2 < {}'.format(
        threshold, theta2_cut)

    gammas = read_h5py(gamma_file, key='events').query(query)
    with h5py.File(gamma_file, 'r') as f:
        sample_fraction = f.attrs.get('sample_fraction', 1.0)
        log.info('Using sampling fraction of {:.3f}'.format(sample_fraction))

    query = 'gamma_prediction > {}'.format(threshold)
    corsika_events = read_h5py(
        corsika_file,
        key='corsika_events',
        columns=['total_energy'],
    )
    simulated_spectrum = read_simulated_spectrum(corsika_file)

    weights = calc_weights_powerlaw(
        u.Quantity(gammas['corsika_event_header_total_energy'].values,
                   u.GeV,
                   copy=False),
        obstime=obstime,
        n_events=simulated_spectrum['n_showers'],
        e_min=simulated_spectrum['energy_min'],
        e_max=simulated_spectrum['energy_max'],
        simulated_index=simulated_spectrum['energy_spectrum_slope'],
        scatter_radius=simulated_spectrum['x_scatter'],
        target_index=HEGRA_INDEX,
        flux_normalization=HEGRA_NORM,
        e_ref=HEGRA_E_REF,
        sample_fraction=sample_fraction,
    )

    # calculate effective area in given binning
    a_eff, bin_center, bin_width, a_eff_low, a_eff_high = collection_area(
        corsika_events.total_energy.values,
        gammas[E_TRUE].values,
        impact=simulated_spectrum['x_scatter'],
        bins=bins_true.to_value(u.GeV),
        sample_fraction=sample_fraction,
    )

    gammas['bin'] = np.digitize(gammas[E_TRUE], bins_true.to(u.GeV).value)
    # split dataframes in train / test set
    gammas['test'] = False
    n_test = np.random.poisson(weights.sum())
    idx = np.random.choice(gammas.index, n_test, p=weights / weights.sum())
    gammas.loc[idx, 'test'] = True

    df_test = gammas[gammas.test]
    df_model = gammas[~gammas.test]

    X_model = df_model[E_PRED].values
    y_model = df_model[E_TRUE].values

    X_test = df_test[E_PRED].values
    y_test = df_test[E_TRUE].values

    g_model = np.digitize(X_model, bins_obs.to(u.GeV).value)
    f_model = np.digitize(y_model, bins_true.to(u.GeV).value)

    g_test = np.digitize(X_test, bins_obs.to(u.GeV).value)
    f_test = np.digitize(y_test, bins_true.to(u.GeV).value)

    model = ff.model.LinearModel(random_state=random_state)
    model.initialize(digitized_obs=g_model, digitized_truth=f_model)

    vec_g_test, vec_f_test = model.generate_vectors(digitized_obs=g_test,
                                                    digitized_truth=f_test)
    vec_g_model, vec_f_model = model.generate_vectors(digitized_obs=g_model,
                                                      digitized_truth=f_model)

    llh = ff.solution.StandardLLH(
        tau=config.tau,
        log_f=True,
        reg_factor_f=1 / a_eff.value[1:-1] if config.tau else None,
    )
    llh.initialize(
        vec_g=vec_g_test,
        model=model,
        ignore_n_bins_low=1,
        ignore_n_bins_high=1,
    )

    sol_mcmc = ff.solution.LLHSolutionMCMC(
        n_burn_steps=config.n_burn_steps,
        n_used_steps=config.n_used_steps,
        random_state=random_state,
    )
    sol_mcmc.initialize(llh=llh, model=model)
    sol_mcmc.set_x0_and_bounds(x0=np.random.poisson(vec_f_test))

    vec_f_est, sigma_vec_f, sample, probs, autocorr_time = sol_mcmc.fit()

    additional_features_to_save = dict()
    additional_features_to_save['a_eff'] = a_eff
    additional_features_to_save['a_eff_low'] = a_eff_low
    additional_features_to_save['a_eff_high'] = a_eff_high

    save_spectrum(
        output_file,
        bins_true,
        vec_f_est / a_eff / bin_width / u.GeV / obstime,
        sigma_vec_f / a_eff / bin_width / u.GeV / obstime,
        counts=vec_f_est,
        counts_err=sigma_vec_f,
        tau=config.tau,
        label=label or config.label,
        add_features=additional_features_to_save,
    )
Code example #23
File: fact_to_dl3.py Project: LukasBeiske/aict-tools
def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    random_source,
    wobble_distance,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    '''
    Apply given model to data. Two columns are added to the file, energy_prediction
    and energy_prediction_std

    CONFIGURATION_PATH: Path to the config yaml file

    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output

    SEPARATOR_MODEL_PATH: Path to the pickled separation model.

    ENERGY_MODEL_PATH: Path to the pickled energy regression model.

    DISP_MODEL_PATH: Path to the pickled disp model.

    SIGN_MODEL_PATH: Path to the pickled sign model.
    '''
    log = setup_logging()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                'Outputfile {} exists. Overwrite?'.format(output),
                abort=True,
            )
        open(output, 'w').close()

    log.info('Loading model')
    separator_model = load_model(separator_model_path)
    energy_model = load_model(energy_model_path)
    disp_model = load_model(disp_model_path)
    sign_model = load_model(sign_model_path)
    log.info('Done')

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ('separator', 'energy', 'disp'):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)
    try:
        runs = read_h5py(data_path, key='runs')
        sources = runs['source'].unique()
        if len(sources) > 1:
            raise click.ClickException(
                'to_dl3 only supports files with a single source')
        source = SkyCoord.from_name(sources[0])
        columns.update(['timestamp', 'night'])
    except (KeyError, OSError):
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path,
        config,
        chunksize=chunksize,
        columns=columns,
    )

    log.info('Predicting on data...')
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df['gamma_prediction'] = predict_separator(
            df_sep[config.separator.features],
            separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df['gamma_energy_prediction'] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features],
            disp_model,
            sign_model,
            log_target=config.disp.log_target,
        )

        prediction_x = df.cog_x + disp * np.cos(df.delta)
        prediction_y = df.cog_y + disp * np.sin(df.delta)
        df['source_x_prediction'] = prediction_x
        df['source_y_prediction'] = prediction_y
        df['disp_prediction'] = disp

        if source:
            obstime = Time(
                pd.to_datetime(df['timestamp'].values).to_pydatetime())
            source_altaz = concat_results_altaz(
                parallelize_array_computation(
                    partial(to_altaz, source=source),
                    obstime,
                    n_jobs=n_jobs,
                ))

            result = parallelize_array_computation(
                calc_source_features_obs,
                prediction_x,
                prediction_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                obstime,
                n_jobs=n_jobs,
            )
        else:

            if random_source:
                zd, az = calc_random_source(
                    df['pointing_position_zd'],
                    df['pointing_position_az'],
                    wobble_distance,
                )
                df['source_position_zd'] = zd
                df['source_position_az'] = az

            result = parallelize_array_computation(
                calc_source_features_sim,
                prediction_x,
                prediction_y,
                df['source_position_zd'].values,
                df['source_position_az'].values,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                df['cog_x'].values,
                df['cog_y'].values,
                df['delta'].values,
                project_disp=config.disp.project_disp,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key='events', mode='a')
        else:
            to_h5py(df[dl3_columns_sim], output, key='events', mode='a')

    with h5py.File(data_path, 'r') as f:
        sample_fraction = f.attrs.get('sample_fraction', 1.0)

    set_sample_fraction(output, sample_fraction)
    copy_runs_group(data_path, output)
Code example #24
def main(gamma_path, std, n_bins, threshold, theta2_cut, preliminary, config, output):

    df = read_h5py(
        gamma_path,
        key='events',
        columns=[
            'gamma_energy_prediction',
            'corsika_event_header_total_energy',
            'gamma_prediction',
            'theta_deg'
        ],
    )

    if config:
        with open(config) as f:
            plot_config.update(yaml.safe_load(f))

    if threshold:
        df = df.query('gamma_prediction >= @threshold').copy()
    if theta2_cut:
        df = df.query('theta_deg**2 <= @theta2_cut').copy()

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    divider = make_axes_locatable(ax)
    cax = divider.append_axes('right', size='5%', pad=0.025)

    ax.set_aspect(1)
    ax.set_xscale('log')
    ax.set_yscale('log')

    e_min = min(
        df.gamma_energy_prediction.min(),
        df.corsika_event_header_total_energy.min()
    )
    e_max = max(
        df.gamma_energy_prediction.max(),
        df.corsika_event_header_total_energy.max()
    )

    limits = np.log10([e_min, e_max])
    bins = np.logspace(limits[0], limits[1], n_bins + 1)

    hist, xedges, yedges = np.histogram2d(
        df.corsika_event_header_total_energy.values,
        df.gamma_energy_prediction.values,
        bins=bins,
    )
    plot = ax.pcolormesh(
        xedges, yedges, hist.T,
        norm=LogNorm() if plot_config['logz'] else None,
        cmap=plot_config['cmap'],
    )
    plot.set_rasterized(True)

    fig.colorbar(plot, cax=cax)

    if preliminary:
        add_preliminary(
            plot_config['preliminary_position'],
            size=plot_config['preliminary_size'],
            color=plot_config['preliminary_color'],
            ax=ax,
        )

    ax.set_xlabel(plot_config['xlabel'])
    ax.set_ylabel(plot_config['ylabel'])

    fig.tight_layout(pad=0)

    if output:
        fig.savefig(output, dpi=300)
    else:
        plt.show()
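
plot_config is a module-level dict that is not part of the snippet; a plausible sketch covering the keys used above (the values are assumptions):

# assumed defaults -- the real plot_config lives elsewhere in the project
plot_config = {
    'logz': True,
    'cmap': 'inferno',
    'xlabel': r'$E_\mathrm{true} \,/\, \mathrm{GeV}$',
    'ylabel': r'$E_\mathrm{est} \,/\, \mathrm{GeV}$',
    'preliminary_position': 'lower right',
    'preliminary_size': 20,
    'preliminary_color': 'lightgray',
}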
Code example #25
def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    random_source,
    wobble_distance,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    """
    Apply given model to data. Two columns are added to the file, energy_prediction
    and energy_prediction_std

    CONFIGURATION_PATH: Path to the config yaml file

    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output

    SEPARATOR_MODEL_PATH: Path to the pickled separation model.

    ENERGY_MODEL_PATH: Path to the pickled energy regression model.

    DISP_MODEL_PATH: Path to the pickled disp model.

    SIGN_MODEL_PATH: Path to the pickled sign model.
    """
    log = setup_logging()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                "Outputfile {} exists. Overwrite?".format(output),
                abort=True,
            )
        open(output, "w").close()

    log.info("Loading model")
    separator_model = load_model(separator_model_path)
    energy_model = load_model(energy_model_path)
    disp_model = load_model(disp_model_path)
    sign_model = load_model(sign_model_path)
    log.info("Done")

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ("separator", "energy", "disp"):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)
    try:
        runs = read_h5py(data_path, key="runs")
        sources = runs["source"].unique()
        if len(sources) > 1:
            raise click.ClickException(
                "to_dl3 only supports files with a single source")
        source = SkyCoord.from_name(sources[0])
        columns.update(["timestamp", "night"])
    except (KeyError, OSError):
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path,
        config,
        chunksize=chunksize,
        columns=columns,
    )

    log.info("Predicting on data...")
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df["gamma_prediction"] = predict_separator(
            df_sep[config.separator.features],
            separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df["gamma_energy_prediction"] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features],
            disp_model,
            sign_model,
            log_target=config.disp.log_target,
        )

        prediction_x = df.cog_x + disp * np.cos(df.delta)
        prediction_y = df.cog_y + disp * np.sin(df.delta)
        df["source_x_prediction"] = prediction_x
        df["source_y_prediction"] = prediction_y
        df["disp_prediction"] = disp

        if source:
            obstime = Time(df["timestamp"].to_numpy().astype("U"))
            source_altaz = concat_results_altaz(
                parallelize_array_computation(
                    partial(to_altaz, source=source),
                    obstime,
                    n_jobs=n_jobs,
                ))

            result = parallelize_array_computation(
                calc_source_features_obs,
                prediction_x,
                prediction_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df["pointing_position_zd"].to_numpy(),
                df["pointing_position_az"].to_numpy(),
                obstime,
                n_jobs=n_jobs,
            )
        else:

            if random_source:
                zd, az = calc_random_source(
                    df["pointing_position_zd"],
                    df["pointing_position_az"],
                    wobble_distance,
                )
                df["source_position_zd"] = zd
                df["source_position_az"] = az

            result = parallelize_array_computation(
                calc_source_features_sim,
                prediction_x,
                prediction_y,
                df["source_position_zd"].to_numpy(),
                df["source_position_az"].to_numpy(),
                df["pointing_position_zd"].to_numpy(),
                df["pointing_position_az"].to_numpy(),
                df["cog_x"].to_numpy(),
                df["cog_y"].to_numpy(),
                df["delta"].to_numpy(),
                project_disp=config.disp.project_disp,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key="events", mode="a")
        else:
            to_h5py(df[dl3_columns_sim], output, key="events", mode="a")

    with h5py.File(data_path, "r") as f:
        sample_fraction = f.attrs.get("sample_fraction", 1.0)

    set_sample_fraction(output, sample_fraction)
    copy_group(data_path, output, "runs")
    copy_group(data_path, output, "corsika_runs")
Code example #26
def theta_square_plot(theta2_cut=0.8,
                      data_path=plotting_path,
                      key='events',
                      start=None,
                      end=None,
                      threshold=0.5,
                      bins=40,
                      alpha=0.2,
                      output=False):
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    import h5py
    from dateutil.parser import parse as parse_date

    from fact.io import read_h5py
    from fact.analysis import (
        li_ma_significance,
        split_on_off_source_dependent,
    )
    import click

    columns = [
        'gamma_prediction',
        'theta_deg',
        'theta_deg_off_1',
        'theta_deg_off_2',
        'theta_deg_off_3',
        'theta_deg_off_4',
        'theta_deg_off_5',
        'unix_time_utc',
    ]

    stats_box_template = r'''Source: {source}, $t_\mathrm{{obs}} = {t_obs:.2f}\,\mathrm{{h}}$
    $N_\mathrm{{On}} = {n_on}$, $N_\mathrm{{Off}} = {n_off}$, $\alpha = {alpha}$
    $N_\mathrm{{Exc}} = {n_excess:.1f} \pm {n_excess_err:.1f}$, $S_\mathrm{{Li&Ma}} = {significance:.1f}\,\sigma$
    '''

    theta_cut = np.sqrt(theta2_cut)

    with h5py.File(data_path, 'r') as f:
        source_dependent = 'gamma_prediction_off_1' in f[key].keys()

    if source_dependent:
        print('Separation was using source dependent features')
        columns.extend('gamma_prediction_off_' + str(i) for i in range(1, 6))
        theta_cut = np.inf
        theta2_cut = np.inf

    events = read_h5py(data_path, key='events', columns=columns)
    events['timestamp'] = pd.to_datetime(
        events['unix_time_utc_0'] * 1e6 + events['unix_time_utc_1'],
        unit='us',
    )
    runs = read_h5py(data_path, key='runs')
    runs['run_start'] = pd.to_datetime(runs['run_start'])
    runs['run_stop'] = pd.to_datetime(runs['run_stop'])

    if start is not None:
        events = events.query('timestamp >= @start')
        runs = runs.query('run_start >= @start')
    if end is not None:
        events = events.query('timestamp <= @end')
        runs = runs.query('run_stop <= @end')

    if source_dependent:
        on_data, off_data = split_on_off_source_dependent(events, threshold)
        theta_on = on_data.theta_deg
        theta_off = off_data.theta_deg
    else:
        selected = events.query('gamma_prediction >= {}'.format(threshold))
        theta_on = selected.theta_deg
        theta_off = pd.concat(
            [selected['theta_deg_off_{}'.format(i)] for i in range(1, 6)])

    del events

    if source_dependent:
        limits = [
            0,
            max(
                np.percentile(theta_on, 99)**2,
                np.percentile(theta_off, 99)**2),
        ]
    else:
        limits = [0, 0.3]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    h_on, bin_edges = np.histogram(theta_on.apply(lambda x: x**2).values,
                                   bins=bins,
                                   range=limits)
    h_off, bin_edges, _ = ax.hist(
        theta_off.apply(lambda x: x**2).values,
        bins=bin_edges,
        range=limits,
        weights=np.full(len(theta_off), 0.2),
        histtype='stepfilled',
        color='lightgray',
    )

    bin_center = bin_edges[1:] - np.diff(bin_edges) * 0.5
    bin_width = np.diff(bin_edges)

    ax.errorbar(
        bin_center,
        h_on,
        yerr=np.sqrt(h_on) / 2,
        xerr=bin_width / 2,
        linestyle='',
        label='On',
    )
    ax.errorbar(
        bin_center,
        h_off,
        yerr=alpha * np.sqrt(h_off) / 2,
        xerr=bin_width / 2,
        linestyle='',
        label='Off',
    )

    if not source_dependent:
        ax.axvline(theta_cut**2, color='gray', linestyle='--')

    n_on = np.sum(theta_on < theta_cut)
    n_off = np.sum(theta_off < theta_cut)
    significance = li_ma_significance(n_on, n_off, alpha=alpha)

    ax.text(
        0.5,
        0.95,
        stats_box_template.format(
            source='Crab',
            t_obs=83.656,
            n_on=n_on,
            n_off=n_off,
            alpha=alpha,
            n_excess=n_on - alpha * n_off,
            n_excess_err=np.sqrt(n_on + alpha**2 * n_off),
            significance=significance,
        ),
        transform=ax.transAxes,
        fontsize=12,
        va='top',
        ha='center',
    )

    ax.set_xlabel(r'$(\theta / {}^\circ )^2$')
    ax.legend()
    fig.tight_layout()
    plt.xlim(0.0, 0.3)

    if output:
        fig.savefig(output, dpi=300)
    else:
        #plt.show()
        pass
Code example #27
def main(
    config,
    observation_file,
    gamma_file,
    corsika_file,
    output_file,
    seed,
    label,
    threshold,
    theta2_cut,
):
    '''
    unfold fact data
    '''
    setup_logging()
    log = logging.getLogger('fact_funfolding')
    log.setLevel(logging.INFO)

    random_state = np.random.RandomState(seed)
    np.random.set_state(random_state.get_state())

    config = Config.from_yaml(config)
    e_ref = config.e_ref
    threshold = threshold or config.threshold
    theta2_cut = theta2_cut or config.theta2_cut

    log.info(f'Using threshold {threshold}')
    log.info(f'Using theta2 cut {theta2_cut}')

    # define binning in e_est and e_true
    bins_obs = logspace_binning(config.e_est_low, config.e_est_high, e_ref,
                                config.n_bins_est)
    bins_true = logspace_binning(config.e_true_low, config.e_true_high, e_ref,
                                 config.n_bins_true)

    # read in files
    query = 'gamma_prediction > {} and theta_deg**2 < {}'.format(
        threshold, theta2_cut)

    log.info('Reading simulated gammas')
    gammas = read_h5py(gamma_file, key='events').query(query)
    with h5py.File(gamma_file, 'r') as f:
        sample_fraction = f.attrs.get('sample_fraction', 1.0)
        log.info('Using sampling fraction of {:.3f}'.format(sample_fraction))

    query = 'gamma_prediction > {}'.format(threshold)

    log.info('Reading observations')
    observations = read_h5py(observation_file, key='events').query(query)

    on, off = split_on_off_source_independent(observations,
                                              theta2_cut=theta2_cut)

    observation_runs = read_h5py(observation_file, key='runs')
    obstime = observation_runs.ontime.sum() * u.s

    corsika_events = read_h5py(
        corsika_file,
        key='corsika_events',
        columns=['total_energy'],
    )

    simulated_spectrum = read_simulated_spectrum(corsika_file)

    a_eff, bin_center, bin_width, a_eff_low, a_eff_high = collection_area(
        corsika_events.total_energy.values,
        gammas[E_TRUE].values,
        impact=simulated_spectrum['x_scatter'],
        bins=bins_true.to_value(u.GeV),
        sample_fraction=sample_fraction,
    )

    # unfold using funfolding
    X_model = gammas[E_PRED].values
    y_model = gammas[E_TRUE].values

    X_data = on[E_PRED].values

    g_model = np.digitize(X_model, bins_obs.to(u.GeV).value)
    f_model = np.digitize(y_model, bins_true.to(u.GeV).value)

    g_data = np.digitize(X_data, bins_obs.to(u.GeV).value)

    model = ff.model.LinearModel(random_state=random_state)
    model.initialize(digitized_obs=g_model, digitized_truth=f_model)

    vec_g_data, _ = model.generate_vectors(digitized_obs=g_data)
    vec_g_model, vec_f_model = model.generate_vectors(digitized_obs=g_model,
                                                      digitized_truth=f_model)

    vec_g_bg = None  # stays None when no background model is used
    if config.background:
        X_bg = off[E_PRED].values
        g_bg = np.digitize(X_bg, bins_obs.to(u.GeV).value)
        vec_g_bg, _ = model.generate_vectors(digitized_obs=g_bg)
        model.add_background(vec_g_bg * 0.2)

    llh = ff.solution.StandardLLH(
        tau=config.tau,
        log_f=True,
        reg_factor_f=1 / a_eff.value[1:-1] if config.tau else None,
    )
    llh.initialize(
        vec_g=vec_g_data,
        model=model,
        ignore_n_bins_low=1,
        ignore_n_bins_high=1,
    )

    sol_mcmc = ff.solution.LLHSolutionMCMC(
        n_burn_steps=config.n_burn_steps,
        n_used_steps=config.n_used_steps,
        random_state=random_state,
    )
    sol_mcmc.initialize(llh=llh, model=model)
    sol_mcmc.set_x0_and_bounds(x0=np.random.poisson(vec_f_model *
                                                    vec_g_data.sum() /
                                                    vec_g_model.sum()))

    vec_f_est, sigma_vec_f, sample, probs, autocorr_time = sol_mcmc.fit()

    additional_features_to_save = dict()
    additional_features_to_save['a_eff'] = a_eff
    additional_features_to_save['a_eff_low'] = a_eff_low
    additional_features_to_save['a_eff_high'] = a_eff_high

    save_spectrum(
        output_file,
        bins_true,
        vec_f_est / a_eff / obstime / bin_width / u.GeV,
        sigma_vec_f / a_eff / obstime / bin_width / u.GeV,
        counts=vec_f_est,
        counts_err=sigma_vec_f,
        g=vec_g_data,
        bg=vec_g_bg,
        tau=config.tau,
        label=label or config.label,
        add_features=additional_features_to_save,
    )
Code example #28
def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    '''
    Apply given model to data. Two columns are added to the file, energy_prediction
    and energy_prediction_std

    CONFIGURATION_PATH: Path to the config yaml file

    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output

    SEPARATOR_MODEL_PATH: Path to the pickled separation model.

    ENERGY_MODEL_PATH: Path to the pickled energy regression model.

    DISP_MODEL_PATH: Path to the pickled disp model.

    SIGN_MODEL_PATH: Path to the pickled sign model.
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                'Outputfile {} exists. Overwrite?'.format(output),
                abort=True,
            )
        open(output, 'w').close()

    log.info('Loading model')
    separator_model = joblib.load(separator_model_path)
    energy_model = joblib.load(energy_model_path)
    disp_model = joblib.load(disp_model_path)
    sign_model = joblib.load(sign_model_path)
    log.info('Done')

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ('separator', 'energy', 'disp'):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)
    try:
        runs = read_h5py(data_path, key='runs')
        sources = runs['source'].unique()
        if len(sources) > 1:
            raise click.ClickException(
                'to_dl3 only supports files with a single source'
            )
        source = SkyCoord.from_name(sources[0])
        columns.update(['timestamp', 'night'])
    except (KeyError, OSError) as e:
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path,
        config,
        chunksize=chunksize,
        columns=columns,
    )

    log.info('Predicting on data...')
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df['gamma_prediction'] = predict_separator(
            df_sep[config.separator.features], separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df['gamma_energy_prediction'] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features], disp_model, sign_model
        )

        source_x = df.cog_x + disp * np.cos(df.delta)
        source_y = df.cog_y + disp * np.sin(df.delta)
        df['source_x_prediction'] = source_x
        df['source_y_prediction'] = source_y
        df['disp_prediction'] = disp

        if source:
            obstime = Time(pd.to_datetime(df['timestamp'].values).to_pydatetime())
            source_altaz = concat_results_altaz(parallelize_array_computation(
                partial(to_altaz, source=source),
                obstime,
                n_jobs=n_jobs,
            ))

            result = parallelize_array_computation(
                calc_source_features_obs,
                source_x,
                source_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                obstime,
                n_jobs=n_jobs,
            )
        else:

            result = parallelize_array_computation(
                calc_source_features_sim,
                source_x,
                source_y,
                df['source_position_zd'].values,
                df['source_position_az'].values,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                df['cog_x'].values,
                df['cog_y'].values,
                df['delta'].values,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key='events', mode='a')
        else:
            to_h5py(df[dl3_columns_sim], output, key='events', mode='a')

    if source:
        log.info('Copying "runs" group')
        to_h5py(runs, output, key='runs', mode='a')
Code example #29
def main(data_path, threshold, theta2_cut, key, bins, alpha, start, end,
         preliminary, ymax, config, output):
    '''
    Given the DATA_PATH to a data hdf5 file (e.g. the output of ERNA's gather scripts)
    this script will create the infamous theta square plot.

    This plot shows the (selected gamma-like) events which have been
    reconstructed as coming from the source region and those coming from a
    (more or less arbitrary) off region.

    In a traditional IACT analysis this plot is used to calculate the
    significance of a detection.

    The HDF files are expected to have a group called 'runs' and a group
    called 'events'. The events group has to have the columns:
        'theta',
        'theta_deg_off_1',
        'theta_deg_off_2',
        'theta_deg_off_3',
        'theta_deg_off_4',
        'theta_deg_off_5',

    If a prediction threshold is to be used, 'gamma_prediction' must also
    be in the group. The 'gamma_prediction' column can be added to the data
    using 'klaas_apply_separation_model' for example.
    '''
    if config:
        with open(config) as f:
            plot_config.update(yaml.safe_load(f))

    theta_cut = np.sqrt(theta2_cut)

    if threshold > 0.0:
        columns.append('gamma_prediction')

    events = read_h5py(data_path, key='events', columns=columns)

    if start or end:
        events['timestamp'] = read_timestamp(data_path)

    try:
        runs = read_h5py(data_path, key='runs')
        runs['run_start'] = pd.to_datetime(runs['run_start'])
        runs['run_stop'] = pd.to_datetime(runs['run_stop'])
    except IOError:
        runs = pd.DataFrame(
            columns=['run_start', 'run_stop', 'ontime', 'source'])

    if start is not None:
        events = events.query('timestamp >= @start')
        runs = runs.query('run_start >= @start')
    if end is not None:
        events = events.query('timestamp <= @end')
        runs = runs.query('run_stop <= @end')

    if threshold > 0:
        selected = events.query('gamma_prediction >= {}'.format(threshold))
    else:
        selected = events
    theta_on = selected.theta_deg
    theta_off = pd.concat(
        [selected['theta_deg_off_{}'.format(i)] for i in range(1, 6)])

    del events

    max_theta2 = 0.3
    width = max_theta2 / bins
    rounded_width = theta2_cut / np.round(theta2_cut / width)
    bins = np.arange(0, max_theta2 + 0.1 * rounded_width, rounded_width)

    print('Using {} bins to get theta_cut on a bin edge'.format(len(bins) - 1))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    h_on, bin_edges = np.histogram(
        theta_on.apply(lambda x: x**2).values,
        bins=bins,
    )
    h_off, bin_edges, _ = ax.hist(
        theta_off.apply(lambda x: x**2).values,
        bins=bin_edges,
        weights=np.full(len(theta_off), 0.2),
        histtype='stepfilled',
        color='lightgray',
        zorder=0,
    )

    bin_center = bin_edges[1:] - np.diff(bin_edges) * 0.5
    bin_width = np.diff(bin_edges)

    ax.errorbar(
        bin_center,
        h_on,
        yerr=np.sqrt(h_on),
        xerr=bin_width / 2,
        linestyle='',
        label='On',
    )

    ax.errorbar(bin_center,
                h_off,
                yerr=alpha * np.sqrt(h_off),
                xerr=bin_width / 2,
                linestyle='',
                label='Off',
                zorder=1)

    ax.axvline(theta_cut**2, color='black', alpha=0.3, linestyle='--')

    n_on = np.sum(theta_on < theta_cut)
    n_off = np.sum(theta_off < theta_cut)
    significance = li_ma_significance(n_on, n_off, alpha=alpha)

    print('N_on', n_on)
    print('N_off', n_off)
    print('Li&Ma: {}'.format(significance))

    ax.text(
        0.5,
        0.95,
        stats_box_template.format(
            source=runs.source.iloc[0] if len(runs) > 0 else '',
            t_obs=runs.ontime.sum() / 3600,
            n_on=n_on,
            n_off=n_off,
            alpha=alpha,
            n_excess=n_on - alpha * n_off,
            n_excess_err=np.sqrt(n_on + alpha**2 * n_off),
            significance=significance,
        ),
        transform=ax.transAxes,
        va='top',
        ha='center',
    )

    if preliminary:
        add_preliminary(
            plot_config['preliminary_position'],
            size=plot_config['preliminary_size'],
            color=plot_config['preliminary_color'],
            ax=ax,
        )

    if ymax:
        ax.set_ylim(0, ymax)

    ax.set_xlim(0, bins.max())
    ax.set_xlabel(plot_config['xlabel'])
    ax.legend(loc=plot_config['legend_loc'])
    fig.tight_layout(pad=0)

    if output:
        fig.savefig(output, dpi=300)
    else:
        plt.show()
Code example #30
from fact.io import read_h5py
from fact.analysis import li_ma_significance, split_on_off_source_independent
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

crab_data = read_h5py(
    '/home/msackel/Desktop/gammaClassification/data/raw_data/crab_precuts.hdf5',
    key='events',
    columns=[
        'theta_deg',
        'theta_deg_off_1',
        'theta_deg_off_2',
        'theta_deg_off_3',
        'theta_deg_off_4',
        'theta_deg_off_5',
    ])

theta_on = crab_data['theta_deg']
theta_off = pd.concat(
    [crab_data['theta_deg_off_' + str(i)] for i in range(1, 6)])

# plt.style.use('msackel')

plt.figure(figsize=(4.5, 3.375))
plt.hist(theta_on**2, range=[0, 0.2], bins=50, histtype='step', label='On')
plt.hist(theta_off**2,
         range=[0, 0.2],
         bins=50,
         alpha=0.6,
         label='Off')
Code example #31
import yaml
from sklearn.ensemble import RandomForestClassifier
from fact.io import read_h5py

exec(open('/home/msackel/Desktop/gammaClassification/programm/theta_cut/theta_cut.py').read())
exec(open('/home/msackel/Desktop/gammaClassification/programm/model_significance/model_significance.py').read())

Tree = RandomForestClassifier(max_depth=15, max_features=7, criterion='entropy', n_estimators=100, n_jobs=10)

with open('/home/msackel/Desktop/gammaClassification/config/feature.yaml') as f:
    feature = yaml.safe_load(f)

eval_data = read_h5py(
    '/home/msackel/Desktop/gammaClassification/data/raw_data/mrk501_2014_precuts.hdf5',
    key='events',
    columns=list(feature) + [
        'theta_deg',
        'theta_deg_off_1',
        'theta_deg_off_2',
        'theta_deg_off_3',
        'theta_deg_off_4',
        'theta_deg_off_5',
    ]
)

print('---Theta**2 = 0.5')
train_data = theta_cut('/home/msackel/Desktop/gammaClassification/data/raw_data/gamma_precuts.hdf5',
                       '/home/msackel/Desktop/gammaClassification/data/raw_data/mrk501_2014_precuts.hdf5', 0.5)

Tree.fit(train_data.drop('label', axis=1), train_data.label)

plot_significance(Tree, eval_data, path='plots/significance_mrk.pdf')