def test_to_h5py():
    import tempfile

    import h5py
    import numpy as np
    import pandas as pd

    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, size=50, dtype='uint8'),
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')

        with h5py.File(f.name, 'r') as hf:
            assert 'test' in hf.keys()
            g = hf['test']
            assert 'x' in g.keys()
            assert 'N' in g.keys()

        df2 = read_h5py(f.name, key='test')

        # sort columns alphabetically so the comparison is order-independent
        df = df.sort_index(axis=1)
        df2 = df2.sort_index(axis=1)

        assert all(df.dtypes == df2.dtypes)
        assert all(df['x'] == df2['x'])
        assert all(df['N'] == df2['N'])
def main(infile, outfile, tel_name):
    sim_runs = None
    parameters = pd.read_hdf(
        infile, key=f'dl1/event/telescope/parameters/{tel_name}'
    )
    focal_length = (
        pd.read_hdf(infile, key='instrument/telescope/optics')
        .drop_duplicates()
        .set_index('name')
        .loc['LST', 'equivalent_focal_length']
    )

    # renaming for simulations
    if 'mc_az' in parameters.columns:
        with tables.open_file(infile) as f:
            sim_runs = f.root.simulation.run_config[:]

        sim_runs_df = pd.DataFrame()
        for name in sim_runs.dtype.names:
            if name != 'run_array_direction':
                sim_runs_df[name] = sim_runs[name]

        parameters['az_tel'] = parameters.mc_az_tel
        parameters['alt_tel'] = parameters.mc_alt_tel

    parameters['focal_length'] = focal_length

    to_h5py(parameters, outfile, key='events', mode='w')
    if sim_runs is not None:
        to_h5py(sim_runs_df, outfile, key='corsika_runs', mode='a')
def test_write_lists_h5py():
    import tempfile

    import pandas as pd

    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({'x': [[1.0, 2.0], [3.0, 4.0]]})

    with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
        to_h5py(df, f.name)

        # list columns are stored element-wise, so 'x' comes back as 'x_0', 'x_1'
        df = read_h5py(f.name, columns=['x'])
        assert df['x_0'].iloc[0] == 1.0
def test_multiple_config():
    import tempfile

    import h5py

    from aict_tools.apply import create_mask_h5py
    from fact.io import to_h5py

    # `df` is a module-level fixture of the original test file
    # (see the sketch after test_dict_config below)
    config = [{"b": [">", 0]}, {"b": ["<", 5]}]

    with tempfile.NamedTemporaryFile(prefix="test_aict_", suffix=".hdf5") as f:
        to_h5py(df, f.name, key="events")

        mask = create_mask_h5py(
            h5py.File(f.name, "r"), n_events=len(df), selection_config=config
        )
        assert all(mask == [False, True, False, True])
def main(infile, outfile, tel_name):
    parameters = pd.read_hdf(
        infile, key=f'dl1/event/telescope/parameters/{tel_name}'
    )
    focal_length = (
        pd.read_hdf(infile, key='instrument/telescope/optics')
        .drop_duplicates()
        .set_index('name')
        .loc['LST', 'equivalent_focal_length']
    )

    # renaming for simulations
    if 'mc_az' in parameters.columns:
        parameters['az_tel'] = parameters.mc_az_tel
        parameters['alt_tel'] = parameters.mc_alt_tel

    parameters['focal_length'] = focal_length

    to_h5py(parameters, outfile, key='events', mode='w')
def test_dict_config():
    import tempfile

    import h5py

    from aict_tools.apply import create_mask_h5py
    from fact.io import to_h5py

    # `df` is a module-level fixture of the original test file (see the sketch below)
    config = {'a': ['>', 2], 'b': ['<', 5]}

    with tempfile.NamedTemporaryFile(prefix='test_aict_', suffix='.hdf5') as f:
        to_h5py(df, f.name, key='events')

        mask = create_mask_h5py(
            h5py.File(f.name, 'r'), n_events=len(df), selection_config=config
        )
        assert all(mask == [False, False, False, True])
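# The two create_mask_h5py tests above reference a module-level `df` fixture
# that is not part of this excerpt. A minimal sketch of a fixture consistent
# with both expected masks (hypothetical values, not the original test data):
import pandas as pd

df = pd.DataFrame({
    'a': [1, 2, 3, 4],  # 'a' > 2 and 'b' < 5  ->  [False, False, False, True]
    'b': [0, 3, 5, 4],  # 'b' > 0 and 'b' < 5  ->  [False, True, False, True]
})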
def test_to_h5py_string():
    import tempfile

    import pandas as pd

    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        'name': ['Mrk 501', 'Mrk 421', 'Crab'],
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')
        df2 = read_h5py(f.name, key='test')

        assert all(df.dtypes == df2.dtypes)
        assert all(df['name'] == df2['name'])
def main(outputfile, inputdir):
    inputfiles = []
    for d in inputdir:
        inputfiles.extend(glob(os.path.join(d, 'cer*')))

    # drop the gzipped duplicate if the uncompressed file is also present
    for f in inputfiles[:]:
        if f + '.gz' in inputfiles:
            inputfiles.remove(f + '.gz')

    print('Processing', len(inputfiles), 'files')

    with Pool(cpu_count()) as pool:
        results = pool.imap_unordered(get_headers, inputfiles)

        run_headers = []
        run_ends = []
        for run_header, event_headers, run_end in tqdm(results, total=len(inputfiles)):
            run_headers.append(run_header)
            run_ends.append(run_end)

            df = pd.DataFrame(event_headers[event_columns])
            to_h5py(df, outputfile, key='corsika_events', mode='a')

    print('saving runwise information')
    runs = pd.DataFrame(np.array(run_headers)[run_header_columns])

    # some runs might have failed and thus have no run end block
    for run_end in run_ends:
        if run_end is not None:
            dtype = run_end.dtype
            break
    else:
        raise IOError('All run_end blocks are None, all runs failed.')

    dummy = np.array([(b'RUNE', np.nan, np.nan)], dtype=dtype)[0]
    run_ends = [r if r is not None else dummy for r in run_ends]
    run_ends = np.array(run_ends)

    print('Number of failed runs:', np.count_nonzero(np.isnan(run_ends['n_events'])))
    runs['n_events'] = run_ends['n_events']

    to_h5py(runs, outputfile, key='corsika_runs', mode='a')
    print('done')
def main(output_file, input_file, n_jobs):
    if n_jobs == -1:
        n_jobs = 15

    print('Calculating features using', n_jobs, 'cores')

    if is_simulation_file(input_file[0]):
        print('Received simulation files as input.')
    else:
        print('Received data files as input.')

    with Pool(n_jobs) as pool:
        results = pool.imap_unordered(cluster_labels, input_file)
        for df in tqdm(results, total=len(input_file)):
            to_h5py(df, output_file, key='events', mode='a', index=False)
def test_to_h5py_append():
    import tempfile

    import numpy as np
    import pandas as pd

    from fact.io import to_h5py, read_h5py

    df1 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, size=50, dtype='uint8'),
    })
    df2 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, size=50, dtype='uint8'),
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df1, f.name, key='test', index=False)
        to_h5py(df2, f.name, key='test', mode='a', index=False)

        df_read = read_h5py(f.name, key='test')
        df_written = pd.concat([df1, df2], ignore_index=True)

        for col in df_written.columns:
            assert all(df_read[col] == df_written[col])
def write(typename, output_path, site_location, array_events_data,
          telescope_events_data, runs_all, positions, stereo, id_no):
    print('Writing ' + typename + ' data...',
          datetime.now().time().strftime('%H:%M:%S'))

    telescope_events = pd.DataFrame(telescope_events_data)
    array_events = pd.DataFrame(array_events_data)
    runs = pd.DataFrame(runs_all)

    # Calculate and add telescope location to telescope_events
    telescope_events = add_tel_location(telescope_events, site_location, positions)

    if typename == 'gamma-diffuse':
        output_file = output_path + 'gammas-diffuse' + str(id_no) + '.hdf5'
    else:
        output_file = output_path + typename + 's' + str(id_no) + '.hdf5'

    # Save to hdf5 file
    to_h5py(telescope_events, output_file, key='telescope_events', mode='w')
    to_h5py(array_events, output_file, key='array_events', mode='a')
    to_h5py(runs, output_file, key='runs', mode='a')
def test_to_h5py_datetime():
    import tempfile

    import pandas as pd

    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        't_ns': pd.date_range('2017-01-01', freq='1ns', periods=100),
        't_us': pd.date_range('2017-01-01', freq='1us', periods=100),
        't_ms': pd.date_range('2017-01-01', freq='1ms', periods=100),
        't_s': pd.date_range('2017-01-01', freq='1s', periods=100),
        't_d': pd.date_range('2017-01-01', freq='1d', periods=100),
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')
        df2 = read_h5py(f.name, key='test')

        for col in df.columns:
            assert all(df[col] == df2[col])
def main(inputfile, outputfile):
    logging.info('Opening file')
    f = uproot.open(inputfile)

    logging.info('Getting tree')
    tree = f['Events']
    branches = set(k.decode('ascii') for k in tree.keys())
    ids = np.arange(tree.numentries)

    dfs = []
    logging.info('Start reading telescope events')
    telescope_id = 1
    while f'MHillas_{telescope_id}.' in branches:
        columns = {
            k.format(telescope_id=telescope_id): v
            for k, v in TELESCOPE_COLUMNS.items()
        }
        df = tree.pandas.df(columns.keys())
        df.rename(columns=columns, inplace=True)
        df['event_id'] = ids
        df['telescope_id'] = telescope_id
        dfs.append(df)
        telescope_id += 1

    df = pd.concat(dfs)
    df = df[df.trigger_time != -100]

    logging.info(f'Writing {len(df)} telescope events to hdf5 file')
    to_h5py(df, outputfile, mode='w', key='telescope_events')
    logging.info('done')

    df = tree.pandas.df(ARRAY_COLUMNS.keys())
    df.rename(columns=ARRAY_COLUMNS, inplace=True)
    df['event_id'] = ids

    logging.info(f'Writing {len(df)} array events to hdf5 file')
    to_h5py(df, outputfile, mode='a', key='array_events')
    logging.info('done')
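# The function above uses the uproot 3 API (bytes keys, tree.numentries,
# tree.pandas.df). Under uproot 4+ the equivalent reads would look roughly
# like this sketch; 'file.root' and the branch name are placeholders, not
# part of the original script:
import uproot

with uproot.open('file.root') as f:
    tree = f['Events']
    branches = set(tree.keys())      # keys are str, no .decode needed
    n_events = tree.num_entries      # replaces tree.numentries
    # replaces tree.pandas.df(...); returns a pandas DataFrame
    df = tree.arrays(['MHillas_1.fSize'], library='pd')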
def main():
    args = parser.parse_args()

    runs = pd.read_csv(args.runlist)
    runs['night_date'] = pd.to_datetime(runs['night'].astype(str), format='%Y%m%d')

    initialised = False
    for idx, run in tqdm(runs.iterrows(), total=len(runs)):
        night = int('{:%Y%m%d}'.format(run.night_date))
        base = datepath(args.ganymed_base, run.night_date)
        ganymed_file = os.path.join(
            base, '{}_{:03d}-summary.root'.format(night, run.run_id)
        )

        df = read_mars(ganymed_file, tree='Events')
        df['night'] = night
        df['run_id'] = run.run_id

        if not initialised:
            to_h5py(df, args.outputfile, key='events', mode='w')
            initialised = True
        else:
            to_h5py(df, args.outputfile, key='events', mode='a')
def append(self, df):
    if self.outputfile is None:
        return

    if self.fmt == 'jsonl':
        if self._file is None:
            self._file = open(self.outputfile, 'w')
        df.to_json(self._file, lines=True, date_format='iso', orient='records')
        self._file.write('\n')
    elif self.fmt == 'csv':
        if self._file is None:
            self._file = open(self.outputfile, 'w')
        # only write the csv header for the first chunk
        df.to_csv(self._file, header=not self.header_written)
    elif self.fmt == 'hdf5':
        mode = 'a' if self.header_written else 'w'
        to_h5py(df, self.outputfile, key='events', mode=mode)

    self.header_written = True
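# A minimal usage sketch for the `append` method above. The class name
# `EventWriter` and its constructor signature are assumptions; the original
# excerpt only shows the method:
import pandas as pd

writer = EventWriter(outputfile='events.hdf5', fmt='hdf5')  # hypothetical constructor
for chunk in [pd.DataFrame({'x': [1.0, 2.0]}), pd.DataFrame({'x': [3.0, 4.0]})]:
    writer.append(chunk)  # first call writes with mode='w', later calls append with mode='a'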
def main(output_file, input_file, eps, n_jobs, lower, upper):
    if n_jobs == -1:
        n_jobs = 48  # cpu_count()

    print('Calculating features using', n_jobs, 'cores')

    if is_simulation_file(input_file[0]):
        print('Received simulation files as input.')
    else:
        print('Received data files as input.')

    with Pool(n_jobs) as pool:
        results = [
            pool.apply_async(
                gen_features_norm,
                kwds={'data_file': f, 'lower': lower, 'upper': upper},
            )
            for f in input_file
        ]
        # `as_completed` is assumed to be a project-local helper that yields
        # the finished DataFrames: concurrent.futures.as_completed does not
        # accept multiprocessing.Pool AsyncResult objects
        for df in tqdm(as_completed(results), total=len(input_file)):
            to_h5py(df, output_file, key='events', mode='a', index=False)
def test_to_h5py_append_second_group():
    import tempfile

    import numpy as np
    import pandas as pd

    from fact.io import to_h5py, read_h5py

    df1 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, size=50, dtype='uint8'),
    })
    df2 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, size=50, dtype='uint8'),
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df1, f.name, key='g1', index=False)
        # mode='a' is required here, the default mode='w' would truncate
        # the file and drop the 'g1' group
        to_h5py(df2, f.name, key='g2', mode='a', index=False)

        df_g1 = read_h5py(f.name, key='g1')
        df_g2 = read_h5py(f.name, key='g2')

        for col in df_g1.columns:
            assert all(df_g1[col] == df1[col])

        for col in df_g2.columns:
            assert all(df_g2[col] == df2[col])
def main(outputfile, inputdir, infile_re, n_jobs):
    inputfiles = []
    file_re = re.compile(infile_re)
    for d in tqdm(inputdir):
        for root, dirs, files in os.walk(os.path.abspath(d)):
            for f in files:
                if file_re.match(f):
                    inputfiles.append(os.path.join(root, f))

    print('Processing', len(inputfiles), 'files')

    with ProcessPoolExecutor(n_jobs) as pool:
        futures = [pool.submit(get_headers, f) for f in inputfiles]

        run_headers = []
        run_ends = []
        reuses = []
        for future in tqdm(as_completed(futures), total=len(inputfiles)):
            run_header, event_headers, run_end = future.result()
            run_headers.append(run_header)
            run_ends.append(run_end)

            df = pd.DataFrame(event_headers[event_columns])
            to_h5py(df, outputfile, key='corsika_events', mode='a')
            reuses.append(df['n_reuse'].iloc[0])

    print('saving runwise information')
    runs = pd.DataFrame(np.array(run_headers)[run_header_columns])
    runs['n_events'] = np.array(run_ends)['n_events']
    runs['n_reuse'] = reuses

    to_h5py(runs, outputfile, key='corsika_runs', mode='a')
    print('done')
def main():
    description = 'Convert hillas file to h5py.'
    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=Formatter)
    parser.add_argument('-f', '--files', dest='input_path',
                        help='path to the HDF5 hillas files')
    parser.add_argument('-o', dest='output_path', required=True,
                        help='output path to store the h5py file')
    args = parser.parse_args()

    input_path = args.input_path
    output_path = args.output_path

    with HDF5Reader(input_path) as reader:
        tel = reader.read('data')
        arr = reader.read('mc')
        pnt = reader.read('pointing')
        run = reader.read('mcheader')

    arr = arr.rename(columns={
        'energy': 'mc_energy',
        'alt': 'mc_alt',
        'az': 'mc_az',
        'core_x': 'mc_core_x',
        'core_y': 'mc_core_y',
        'h_first_int': 'mc_h_first_int',
        'shower_primary_id': 'mc_shower_primary_id',
        'x_max': 'mc_x_max',
        # 'iobs': 'run_id',
        # 'iev': 'array_event_id'
    })
    pnt = pnt.drop(columns=['t_cpu'])
    arr = pd.merge(arr, pnt, on=['iobs', 'iev'])
    arr = arr.rename(columns={'iobs': 'run_id', 'iev': 'array_event_id'})

    tel['array_event_id'] = tel.iev.values
    tel = tel.rename(columns={'iev': 'telescope_event_id', 'iobs': 'run_id'})
    tel = tel.drop(columns=['t_cpu'])

    plate_scale = 37.56
    tel.x = tel.x * plate_scale
    tel.y = tel.y * plate_scale

    run = run.rename(columns={'iobs': 'run_id'})

    to_h5py(tel, output_path, key='telescope_events', mode='w')
    to_h5py(arr, output_path, key='array_events', mode='a')
    to_h5py(run, output_path, key='runs', mode='a')
def main():
    args = parser.parse_args()
    df = read_mars(args.inputfile, tree=args.tree, verbose=True)
    to_h5py(df, args.outputfile, key=args.tree, mode='w')
def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    '''
    Apply the given models to the data. Two columns are added to the file:
    energy_prediction and energy_prediction_std.

    CONFIGURATION_PATH: Path to the config yaml file
    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output
    SEPARATOR_MODEL_PATH: Path to the pickled separation model.
    ENERGY_MODEL_PATH: Path to the pickled energy regression model.
    DISP_MODEL_PATH: Path to the pickled disp model.
    SIGN_MODEL_PATH: Path to the pickled sign model.
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                'Outputfile {} exists. Overwrite?'.format(output),
                abort=True,
            )
        open(output, 'w').close()

    log.info('Loading models')
    separator_model = joblib.load(separator_model_path)
    energy_model = joblib.load(energy_model_path)
    disp_model = joblib.load(disp_model_path)
    sign_model = joblib.load(sign_model_path)
    log.info('Done')

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ('separator', 'energy', 'disp'):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)

    try:
        runs = read_h5py(data_path, key='runs')
        sources = runs['source'].unique()
        if len(sources) > 1:
            raise click.ClickException(
                'to_dl3 only supports files with a single source'
            )
        source = SkyCoord.from_name(sources[0])
        columns.update(['timestamp', 'night'])
    except (KeyError, OSError):
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path, config, chunksize=chunksize, columns=columns,
    )

    log.info('Predicting on data...')
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df['gamma_prediction'] = predict_separator(
            df_sep[config.separator.features], separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df['gamma_energy_prediction'] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features], disp_model, sign_model,
        )

        source_x = df.cog_x + disp * np.cos(df.delta)
        source_y = df.cog_y + disp * np.sin(df.delta)
        df['source_x_prediction'] = source_x
        df['source_y_prediction'] = source_y
        df['disp_prediction'] = disp

        if source:
            obstime = Time(pd.to_datetime(df['timestamp'].values).to_pydatetime())
            source_altaz = concat_results_altaz(parallelize_array_computation(
                partial(to_altaz, source=source),
                obstime,
                n_jobs=n_jobs,
            ))

            result = parallelize_array_computation(
                calc_source_features_obs,
                source_x,
                source_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                obstime,
                n_jobs=n_jobs,
            )
        else:
            result = parallelize_array_computation(
                calc_source_features_sim,
                source_x,
                source_y,
                df['source_position_zd'].values,
                df['source_position_az'].values,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                df['cog_x'].values,
                df['cog_y'].values,
                df['delta'].values,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key='events', mode='a')
        else:
            to_h5py(df[dl3_columns_sim], output, key='events', mode='a')

    if source:
        log.info('Copying "runs" group')
        to_h5py(runs, output, key='runs', mode='a')
def main(file):
    run_meta = read_data('~/phs_analysis/open_crab_sample_runs.csv')
    to_h5py(run_meta.iloc[:], file, key='runs', mode='a')
def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    random_source,
    wobble_distance,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    '''
    Apply the given models to the data. Two columns are added to the file:
    energy_prediction and energy_prediction_std.

    CONFIGURATION_PATH: Path to the config yaml file
    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output
    SEPARATOR_MODEL_PATH: Path to the pickled separation model.
    ENERGY_MODEL_PATH: Path to the pickled energy regression model.
    DISP_MODEL_PATH: Path to the pickled disp model.
    SIGN_MODEL_PATH: Path to the pickled sign model.
    '''
    log = setup_logging()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                'Outputfile {} exists. Overwrite?'.format(output),
                abort=True,
            )
        open(output, 'w').close()

    log.info('Loading models')
    separator_model = load_model(separator_model_path)
    energy_model = load_model(energy_model_path)
    disp_model = load_model(disp_model_path)
    sign_model = load_model(sign_model_path)
    log.info('Done')

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ('separator', 'energy', 'disp'):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)

    try:
        runs = read_h5py(data_path, key='runs')
        sources = runs['source'].unique()
        if len(sources) > 1:
            raise click.ClickException(
                'to_dl3 only supports files with a single source'
            )
        source = SkyCoord.from_name(sources[0])
        columns.update(['timestamp', 'night'])
    except (KeyError, OSError):
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path, config, chunksize=chunksize, columns=columns,
    )

    log.info('Predicting on data...')
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df['gamma_prediction'] = predict_separator(
            df_sep[config.separator.features], separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df['gamma_energy_prediction'] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features],
            disp_model,
            sign_model,
            log_target=config.disp.log_target,
        )

        prediction_x = df.cog_x + disp * np.cos(df.delta)
        prediction_y = df.cog_y + disp * np.sin(df.delta)
        df['source_x_prediction'] = prediction_x
        df['source_y_prediction'] = prediction_y
        df['disp_prediction'] = disp

        if source:
            obstime = Time(pd.to_datetime(df['timestamp'].values).to_pydatetime())
            source_altaz = concat_results_altaz(parallelize_array_computation(
                partial(to_altaz, source=source),
                obstime,
                n_jobs=n_jobs,
            ))

            result = parallelize_array_computation(
                calc_source_features_obs,
                prediction_x,
                prediction_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                obstime,
                n_jobs=n_jobs,
            )
        else:
            if random_source:
                zd, az = calc_random_source(
                    df['pointing_position_zd'],
                    df['pointing_position_az'],
                    wobble_distance,
                )
                df['source_position_zd'] = zd
                df['source_position_az'] = az

            result = parallelize_array_computation(
                calc_source_features_sim,
                prediction_x,
                prediction_y,
                df['source_position_zd'].values,
                df['source_position_az'].values,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                df['cog_x'].values,
                df['cog_y'].values,
                df['delta'].values,
                project_disp=config.disp.project_disp,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key='events', mode='a')
        else:
            to_h5py(df[dl3_columns_sim], output, key='events', mode='a')

    with h5py.File(data_path, 'r') as f:
        sample_fraction = f.attrs.get('sample_fraction', 1.0)

    set_sample_fraction(output, sample_fraction)
    copy_runs_group(data_path, output)
def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    random_source,
    wobble_distance,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    """
    Apply the given models to the data. Two columns are added to the file:
    energy_prediction and energy_prediction_std.

    CONFIGURATION_PATH: Path to the config yaml file
    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output
    SEPARATOR_MODEL_PATH: Path to the pickled separation model.
    ENERGY_MODEL_PATH: Path to the pickled energy regression model.
    DISP_MODEL_PATH: Path to the pickled disp model.
    SIGN_MODEL_PATH: Path to the pickled sign model.
    """
    log = setup_logging()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                "Outputfile {} exists. Overwrite?".format(output),
                abort=True,
            )
        open(output, "w").close()

    log.info("Loading models")
    separator_model = load_model(separator_model_path)
    energy_model = load_model(energy_model_path)
    disp_model = load_model(disp_model_path)
    sign_model = load_model(sign_model_path)
    log.info("Done")

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ("separator", "energy", "disp"):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)

    try:
        runs = read_h5py(data_path, key="runs")
        sources = runs["source"].unique()
        if len(sources) > 1:
            raise click.ClickException(
                "to_dl3 only supports files with a single source"
            )
        source = SkyCoord.from_name(sources[0])
        columns.update(["timestamp", "night"])
    except (KeyError, OSError):
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path, config, chunksize=chunksize, columns=columns,
    )

    log.info("Predicting on data...")
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df["gamma_prediction"] = predict_separator(
            df_sep[config.separator.features], separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df["gamma_energy_prediction"] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features],
            disp_model,
            sign_model,
            log_target=config.disp.log_target,
        )

        prediction_x = df.cog_x + disp * np.cos(df.delta)
        prediction_y = df.cog_y + disp * np.sin(df.delta)
        df["source_x_prediction"] = prediction_x
        df["source_y_prediction"] = prediction_y
        df["disp_prediction"] = disp

        if source:
            obstime = Time(df["timestamp"].to_numpy().astype("U"))
            source_altaz = concat_results_altaz(
                parallelize_array_computation(
                    partial(to_altaz, source=source),
                    obstime,
                    n_jobs=n_jobs,
                ))

            result = parallelize_array_computation(
                calc_source_features_obs,
                prediction_x,
                prediction_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df["pointing_position_zd"].to_numpy(),
                df["pointing_position_az"].to_numpy(),
                obstime,
                n_jobs=n_jobs,
            )
        else:
            if random_source:
                zd, az = calc_random_source(
                    df["pointing_position_zd"],
                    df["pointing_position_az"],
                    wobble_distance,
                )
                df["source_position_zd"] = zd
                df["source_position_az"] = az

            result = parallelize_array_computation(
                calc_source_features_sim,
                prediction_x,
                prediction_y,
                df["source_position_zd"].to_numpy(),
                df["source_position_az"].to_numpy(),
                df["pointing_position_zd"].to_numpy(),
                df["pointing_position_az"].to_numpy(),
                df["cog_x"].to_numpy(),
                df["cog_y"].to_numpy(),
                df["delta"].to_numpy(),
                project_disp=config.disp.project_disp,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key="events", mode="a")
        else:
            to_h5py(df[dl3_columns_sim], output, key="events", mode="a")

    with h5py.File(data_path, "r") as f:
        sample_fraction = f.attrs.get("sample_fraction", 1.0)

    set_sample_fraction(output, sample_fraction)
    copy_group(data_path, output, "runs")
    copy_group(data_path, output, "corsika_runs")
def main(xml_name, ft_version, outputfile, config, start, end, source,
         datacheck, runlist, run_type):
    '''
    Gather the fits output files of the erna automatic processing into a hdf5 file.
    The hdf5 file is written using h5py and contains the level 2 features in the
    `events` group and some metadata for each run in the `runs` group.

    It is possible to only gather files that pass a given datacheck with the
    --datacheck option. The possible conditions are implemented in
    erna.datacheck_conditions/

    XML_NAME: name of the xml for which you want to gather output
    FT_VERSION: FACT Tools version for which you want to gather output
    OUTPUTFILE: the outputfile
    '''
    config = load_config(config)
    database.init(**config['processing_database'])
    database.connect()

    if datacheck and runlist:
        print('Only one of datacheck or runlist allowed')
        sys.exit(1)

    if datacheck is not None:
        if not (datacheck in datacheck_conditions or os.path.isfile(datacheck)):
            print('Conditions must be a file or any of: ')
            for key in datacheck_conditions:
                print(key)
            sys.exit(1)

    processing_db = create_mysql_engine(**config['processing_database'])
    fact_db = create_mysql_engine(**config['fact_database'])

    try:
        jar = (
            Jar.select(Jar.id, Jar.version)
            .where(Jar.version == ft_version)
            .get()
        )
    except Jar.DoesNotExist:
        print('FACT-Tools version not found, available jars are')
        for jar in Jar.select(Jar.version):
            print(jar.version)
        sys.exit(1)

    try:
        xml = XML.get(jar=jar, name=xml_name)
    except XML.DoesNotExist:
        print('XML not found, available xmls are:')
        for xml in XML.select(XML.name).join(Jar).where(Jar.version == ft_version):
            print(xml.name)
        sys.exit(1)

    job_query = (
        Job.select(
            RawDataFile.night.alias('night'),
            RawDataFile.run_id.alias('run_id'),
            Job.result_file,
            ProcessingState.description.alias('status'),
        )
        .join(RawDataFile)
        .switch(Job)
        .join(ProcessingState)
        .where(
            Job.jar == jar,
            Job.xml == xml,
            RawDataFile.run_type_name == run_type,
        )
    )
    if start:
        start = dateutil.parser.parse(start).date()
        job_query = job_query.where(RawDataFile.night >= start)
    if end:
        end = dateutil.parser.parse(end).date()
        job_query = job_query.where(RawDataFile.night <= end)

    sql, params = job_query.sql()
    with processing_db.connect() as conn:
        jobs = pd.read_sql_query(sql, conn, params=params)

    if runlist is None:
        conditions = [
            'fNight <= {}'.format(jobs.night.max()),
            'fNight >= {}'.format(jobs.night.min()),
            'fSourceName = "{}"'.format(source),
        ]
    else:
        wanted_runs = pd.read_csv(runlist)
        conditions = [
            'fNight <= {}'.format(wanted_runs.night.max()),
            'fNight >= {}'.format(wanted_runs.night.min()),
        ]

    if datacheck is not None:
        if os.path.isfile(datacheck):
            with open(datacheck, 'r') as f:
                conditions.extend(f.read().splitlines())
        else:
            conditions.extend(datacheck_conditions[datacheck])

    runs = get_runs(fact_db, conditions=conditions).set_index(['night', 'run_id'])
    jobs = jobs.join(runs, on=['night', 'run_id'], how='inner')

    if runlist is not None:
        jobs = wanted_runs.join(
            jobs.set_index(['night', 'run_id']),
            on=['night', 'run_id'],
            how='inner',
            lsuffix='user_input_',
        )

    successful_jobs = jobs.query('status == "success"')
    total = len(jobs)
    successful = len(successful_jobs)

    if runlist is not None:
        if len(wanted_runs) != len(jobs):
            click.confirm(
                'Only {} of {} runs available, continue?'.format(
                    total, len(wanted_runs)),
                abort=True,
            )

    if total != successful:
        click.confirm(
            'Only {} of {} jobs successful, continue?'.format(successful, total),
            abort=True,
        )

    print('Found {} runs with a total ontime of {:1.2f} h'.format(
        len(jobs), jobs.ontime.sum() / 3600))

    if os.path.isfile(outputfile):
        a = input('Outputfile exists! Overwrite? [y, N]: ')
        if not a.lower().startswith('y'):
            sys.exit()

    columns = [
        'night',
        'run_id',
        'source',
        'ontime',
        'right_ascension',
        'declination',
        'zenith',
        'azimuth',
        'run_start',
        'run_stop',
    ]
    to_h5py(successful_jobs[columns], outputfile, key='runs', mode='w')

    with h5py.File(outputfile, 'a') as f:
        if runlist is not None:
            f['runs'].attrs['datacheck'] = 'RUNLIST'
        else:
            f['runs'].attrs['datacheck'] = ' AND '.join(conditions)

    write_fits_to_hdf5(outputfile, successful_jobs.result_file, mode='a')