Example #1
    def test_module(self):
        outfile = tempfile.NamedTemporaryFile(delete=True)

        pipe = kp.Pipeline()
        pipe.attach(kp.io.OfflinePump,
                    filename=data_path("offline/numucc.root"))
        pipe.attach(km.io.MCTracksTabulator)
        pipe.attach(kp.io.HDF5Sink, filename=outfile.name)
        pipe.drain()

        pipe = kp.Pipeline()
        pipe.attach(kp.io.HDF5Pump, filename=outfile.name)
        pipe.attach(km.common.Observer, count=10, required_keys=["McTracks"])
        pipe.drain()
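The write-then-verify pattern above (convert ROOT to HDF5, re-read it, assert blob keys) uses km.common.Observer. A minimal sketch of a comparable checker module, built only from the kp.Module API shown in Examples #18 and #26; the class name RequireKeys and its keyword are hypothetical:

import km3pipe as kp

class RequireKeys(kp.Module):
    """Hypothetical checker, modeled on km3modules.common.Observer."""

    def configure(self):
        # require() raises at attach time if the keyword is missing
        self.keys = self.require("keys")

    def process(self, blob):
        # every blob passing through must carry all requested keys
        for key in self.keys:
            assert key in blob, "missing blob key: {}".format(key)
        return blob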
Example #2
def main():
    from docopt import docopt
    args = docopt(__doc__, version=VERSION)

    det_id = int(args['-d'])
    plot_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])
    du = int(args['-u'])
    interval = int(args['-i'])

    detector = kp.hardware.Detector(det_id=det_id)

    pipe = kp.Pipeline(timeit=True)
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_MONIT',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(PMTRates,
                detector=detector,
                du=du,
                interval=interval,
                plot_path=plot_path)
    pipe.drain()
Example #3
    def test_module(self):
        outfile = tempfile.NamedTemporaryFile(delete=True)

        pipe = kp.Pipeline()
        pipe.attach(
            kp.io.OfflinePump,
            filename=data_path(
                "offline/mcv6.0.gsg_muon_highE-CC_50-500GeV.km3sim.jterbr00008357.jorcarec.aanet.905.root"
            ),
        )
        pipe.attach(km.io.EventInfoTabulator)
        pipe.attach(kp.io.HDF5Sink, filename=outfile.name)
        pipe.drain(10)

        pipe = kp.Pipeline()
        pipe.attach(kp.io.HDF5Pump, filename=outfile.name)
        pipe.attach(km.common.Observer, count=10, required_keys=["EventInfo"])
        pipe.attach(CheckW2listContents)
        pipe.drain()
Example #4
def main():
    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host='192.168.0.21',
                port=5553,
                tags='IO_MONIT',
                timeout=30,
                max_queue=2000000)
    pipe.attach(OOSAnalyzer)
    pipe.drain()
Example #5
def main():
    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host='192.168.0.21',
                port=5553,
                tags='IO_TSL0',
                timeout=60 * 5,
                max_queue=2000)
    pipe.attach(kp.io.daq.TimesliceParser)
    pipe.attach(OOSAnalyzer)
    pipe.drain()
Example #6
def main():
    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host='192.168.0.21',
                port=5553,
                tags='IO_EVT',
                timeout=60 * 5,
                max_queue=2000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(OOSAnalyzer)
    pipe.drain()
Example #7
    def test_module(self):
        outfile = tempfile.NamedTemporaryFile(delete=True)

        pipe = kp.Pipeline()
        pipe.attach(
            kp.io.OfflinePump,
            filename=data_path(
                "offline/mcv6.0.gsg_muon_highE-CC_50-500GeV.km3sim.jterbr00008357.jorcarec.aanet.905.root"
            ),
        )
        pipe.attach(km.io.RecoTracksTabulator, best_tracks=True)
        pipe.attach(kp.io.HDF5Sink, filename=outfile.name)
        pipe.drain(5)

        pipe = kp.Pipeline()
        pipe.attach(kp.io.HDF5Pump, filename=outfile.name)
        pipe.attach(km.common.Observer, count=5, required_keys=["Tracks"])
        pipe.attach(km.common.Observer, count=5, required_keys=["RecStages"])
        pipe.attach(km.common.Observer, count=5, required_keys=["BestJmuon"])
        pipe.attach(CheckRecoContents)
        pipe.drain()
Example #8
def main():
    detx_delay = open('D_BCI_0004_calibrated.detx', 'r')
    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host='192.168.0.21',
                port=5553,
                tags='IO_TSL0',
                timeout=60 * 5,
                max_queue=2000)
    pipe.attach(kp.io.daq.TimesliceParser)
    pipe.attach(OOSAnalyzer, filee=detx_delay)
    pipe.drain()
Example #9
def main():
    detector = kp.hardware.Detector(det_id=29)
    pipe = kp.Pipeline(timeit=True)
    pipe.attach(
        kp.io.CHPump,
        host="192.168.0.110",
        port=5553,
        tags="IO_MONIT",
        timeout=60 * 60 * 24 * 7,
        max_queue=1000,
    )
    pipe.attach(PMTRates, detector=detector, du=2, interval=2)
    pipe.drain()
Example #10
    def build_pipe(self, infile, outfile, timeit=True):
        """Initialize and connect the modules from the different stages."""
        components = [
            *self.get_cmpts_pre(infile=infile),
            *self.get_cmpts_main(),
            *self.get_cmpts_post(outfile=outfile),
        ]
        pipe = kp.Pipeline(timeit=timeit)
        if self.n_statusbar is not None:
            pipe.attach(km.common.StatusBar, every=self.n_statusbar)
        if self.n_memory_observer is not None:
            pipe.attach(km.common.MemoryObserver, every=self.n_memory_observer)
        for cmpt, kwargs in components:
            pipe.attach(cmpt, **kwargs)
        return pipe
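For context, a minimal sketch of a host class for build_pipe, assuming each get_cmpts_* hook returns (module, kwargs) pairs as the attach loop suggests; all names and values below are hypothetical:

import km3pipe as kp

class MinimalProcessor:
    """Hypothetical host for the build_pipe method above."""

    n_statusbar = 100          # attach km.common.StatusBar every 100 blobs
    n_memory_observer = None   # skip the MemoryObserver

    def get_cmpts_pre(self, infile):
        # reader stage: (module, attach-kwargs) pairs
        return [(kp.io.OfflinePump, {"filename": infile})]

    def get_cmpts_main(self):
        return []  # no processing modules in this sketch

    def get_cmpts_post(self, outfile):
        # writer stage
        return [(kp.io.HDF5Sink, {"filename": outfile})]

With build_pipe defined on such a class, MinimalProcessor().build_pipe("events.root", "events.h5").drain() runs the whole chain.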
Example #11
def main():
    from docopt import docopt

    args = docopt(__doc__, version=kp.version)

    tag = args["TAG"]
    outfile = args["OUTFILE"]
    port = int(args["-p"])
    ip = args["-i"]
    n = int(args["-n"])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump, host=ip, port=port, tags=tag)
    pipe.attach(Dumper, filename=outfile)
    pipe.drain(n)
Example #12
def main():
    from docopt import docopt
    args = docopt(__doc__, version=VERSION)

    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_EVT',
                timeout=60 * 60 * 24 * 7,
                max_queue=200000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(TriggerRate, interval=300, plots_path=plots_path)
    pipe.drain()
Example #13
def main():
    from docopt import docopt
    args = docopt(__doc__, version=VERSION)

    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(
        kp.io.ch.CHPump,
        host=ligier_ip,
        port=ligier_port,
        tags='IO_TSL0,IO_TSL1,IO_TSL2,IO_TSSN',
        timeout=60 * 60 * 24 * 7,
        max_queue=200000)
    pipe.attach(TimesliceRate, interval=10, plots_path=plots_path)
    pipe.drain()
Example #14
def main():
    from docopt import docopt
    args = docopt(__doc__)

    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_OLINE',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(RecoPlotter, plots_path=plots_path)
    pipe.drain()
Example #15
def main():
    from docopt import docopt
    args = docopt(__doc__)

    det_id = int(args['-d'])
    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_MONIT',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(CalibrateAHRS, det_id=det_id, plots_path=plots_path)
    pipe.drain()
Example #16
def main():
    from docopt import docopt
    args = docopt(__doc__)

    ligier_ip = args['-l']
    ligier_port = int(args['-p'])
    logging_ligier_ip = args['-m']
    logging_ligier_port = int(args['-q'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags="IO_TSSN")
    pipe.attach(kp.io.daq.TimesliceParser)
    pipe.attach(TimeSyncChecker,
                logging_ligier_ip=logging_ligier_ip,
                logging_ligier_port=logging_ligier_port)
    pipe.drain()
Example #17
def main():
    from docopt import docopt
    args = docopt(__doc__, version=VERSION)

    det_id = int(args['-d'])
    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_SUM',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(DOMRates, det_id=det_id, plots_path=plots_path)
    pipe.drain()
Example #18
    def test_calibration_in_pipeline(self):
        class DummyPump(kp.Module):
            def configure(self):
                self.index = 0

            def process(self, blob):
                self.index += 1
                mc_hits = Table({"pmt_id": [1, 2, 1], "time": [10.1, 11.2, 12.3]})
                hits = Table(
                    {
                        "dom_id": [2, 3, 3],
                        "channel_id": [0, 1, 2],
                        "time": [10.1, 11.2, 12.3],
                        "tot": [0, 10, 255],
                    }
                )

                blob["Hits"] = hits
                blob["McHits"] = mc_hits
                return blob

        _self = self

        class Observer(kp.Module):
            def process(self, blob):
                assert "Hits" in blob
                assert "McHits" in blob
                assert "CalibHits" in blob
                assert "CalibMcHits" in blob
                assert not hasattr(blob["Hits"], "pmt_id")
                assert hasattr(blob["CalibHits"], "pmt_id")
                assert not hasattr(blob["McHits"], "dom_id")
                assert hasattr(blob["CalibHits"], "dom_id")
                assert np.allclose([10.1, 11.2, 12.3], blob["Hits"].time)
                assert np.allclose([42.09, 87.31, 111.34], blob["CalibHits"].time)
                assert np.allclose(blob["McHits"].time, blob["CalibMcHits"].time)
                return blob

        pipe = kp.Pipeline()
        pipe.attach(DummyPump)
        pipe.attach(Calibration, filename=data_path("detx/detx_v1.detx"))
        pipe.attach(Observer)
        pipe.drain(3)
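The same calibration can also be applied outside a pipeline; a minimal sketch, assuming km3pipe's Calibration.apply() and that data_path comes from the km3net_testdata package (verify both against the km3pipe docs):

import km3pipe as kp
from km3pipe.calib import Calibration
from km3net_testdata import data_path  # assumption: source of data_path in the tests above

calib = Calibration(filename=data_path("detx/detx_v1.detx"))
hits = kp.Table({
    "dom_id": [2, 3, 3],
    "channel_id": [0, 1, 2],
    "time": [10.1, 11.2, 12.3],
    "tot": [0, 10, 255],
})
# apply() is assumed to attach PMT information and t0-corrected times
calib_hits = calib.apply(hits)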
Example #19
def k40calib(filename, tmax, ctmin, stream, filter_hrv, det_id,
             calib_filename):
    pipe = kp.Pipeline()
    pipe.attach(kp.io.jpp.TimeslicePump, filename=filename, stream=stream)
    pipe.attach(StatusBar, every=5000)
    pipe.attach(MemoryObserver, every=10000)
    pipe.attach(k40.HRVFIFOTimesliceFilter,
                filter_hrv=filter_hrv,
                filename=filename)
    pipe.attach(k40.SummaryMedianPMTRateService, filename=filename)
    pipe.attach(k40.TwofoldCounter, tmax=tmax)
    pipe.attach(k40.K40BackgroundSubtractor, mode="offline")
    pipe.attach(
        k40.IntraDOMCalibrator,
        ctmin=ctmin,
        mode="offline",
        det_id=det_id,
        calib_filename=calib_filename,
    )
    pipe.drain()
Example #20
def main():
    """The main script"""
    from docopt import docopt

    args = docopt(__doc__, version=kp.version)

    kp.logger.set_level("km3pipe", args["-d"])

    pipe = kp.Pipeline()
    pipe.attach(
        kp.io.ch.CHPump,
        host=args["SOURCE_IP"],
        port=int(args["-p"]),
        tags=args["-m"],
        timeout=int(args["-x"]),
        max_queue=int(args["-s"]),
        show_statistics=True,
    )
    pipe.attach(LigierSender, target_ip=args["-t"], port=int(args["-q"]))
    pipe.drain()
Example #21
def main():
    from docopt import docopt

    args = docopt(__doc__)

    dom_id = int(args["DOM_ID"])

    pipe = kp.Pipeline(timeit=True)
    pipe.attach(
        kp.io.CHPump,
        host="127.0.0.1",
        port=5553,
        tags="IO_SUM, IO_MONIT",
        timeout=60 * 60 * 24 * 7,
        max_queue=1000,
    )
    pipe.attach(MonitoringChannelPicker, dom_id=dom_id)
    pipe.attach(SummarysliceMatcher,
                dom_id=dom_id,
                n_timeslices=int(args["-n"]))
    pipe.drain()
Example #22
def main():
    from docopt import docopt
    args = docopt(__doc__)

    det_id = int(args['-d'])
    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(LocalDBService, thread_safety=False)
    pipe.attach(ELOGService)
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_EVT, IO_SUM',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(ZTPlot, det_id=det_id, plots_path=plots_path, elog=False)
    pipe.drain()
Example #23
def main():
    from docopt import docopt
    args = docopt(__doc__)

    det_id = int(args['-d'])
    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_EVT',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(TriggerMap,
                det_id=det_id,
                plots_path=plots_path,
                only_if="Hits")
    pipe.drain()
Example #24
def main():
    from docopt import docopt
    args = docopt(__doc__)

    det_id = int(args['-d'])
    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    det_oid = kp.db.DBManager().get_det_oid(det_id)

    pipe = kp.Pipeline(timeit=True)
    pipe.attach(
        kp.io.ch.CHPump,
        host=ligier_ip,
        port=ligier_port,
        tags='IO_TSL1, IO_MONIT',
        timeout=7 * 60 * 60 * 24,
        max_queue=200000)
    pipe.attach(kp.io.ch.CHTagger)
    pipe.attach(StatusBar, every=50000)
    pipe.attach(MemoryObserver, every=100000)
    pipe.attach(k40.MedianPMTRatesService, only_if='IO_MONIT')
    pipe.attach(kp.io.daq.TimesliceParser)
    pipe.attach(
        k40.TwofoldCounter,
        tmax=10,
        dump_filename=os.path.join(plots_path, 'twofold_counts.p'))
    pipe.attach(Siphon, volume=10 * 60 * 180, flush=True)
    pipe.attach(k40.K40BackgroundSubtractor)
    pipe.attach(k40.IntraDOMCalibrator, ctmin=-1, det_id=det_id)
    pipe.attach(
        IntraDOMCalibrationPlotter,
        det_oid=det_oid,
        data_path=plots_path,
        plots_path=plots_path)
    pipe.attach(k40.ResetTwofoldCounts)
    pipe.drain()
Example #25
def main():
    args = docopt(__doc__, version=kp.version)

    du = int(args["-u"]) if args["-u"] else None

    try:
        det_id = int(args["-d"])
        det = kp.hardware.Detector(det_id=det_id)
    except ValueError:
        detx = args["-d"]
        det = kp.hardware.Detector(filename=detx)

    if args["-s"] is not None:
        subtitle = args["-s"]
    else:
        subtitle = ", ".join(args["FILENAMES"])

    pipe = kp.Pipeline()
    if args["--offline"]:
        pipe.attach(km.common.MultiFilePump,
                    pump=kp.io.OfflinePump,
                    filenames=args["FILENAMES"])
        pipe.attach(km.io.HitsTabulator, kind="offline")
    else:
        pipe.attach(
            km.common.MultiFilePump,
            pump=kp.io.online.EventPump,
            filenames=args["FILENAMES"],
        )
    pipe.attach(StatusBar, every=2500)
    pipe.attach(
        TriggerMap,
        detector=det,
        du=du,
        plot_filename=args["-p"],
        subtitle=subtitle,
    )
    pipe.drain()
Example #26
class RandomNumberGenerator(kp.Module):
    def configure(self):
        self.h5loc = self.require("h5loc")
        self.n = self.get("n", default=10)

    def process(self, blob):
        table = kp.Table({"x": np.random.randn(self.n)}, h5loc=self.h5loc)
        blob["RandomNumbers"] = table
        return blob


#####################################################
# Creating a simple pipeline
# --------------------------
# We create a very basic pipeline:

pipe = kp.Pipeline()
pipe.attach(km.StatusBar, every=1)
pipe.attach(km.mc.GlobalRandomState, seed=23)
pipe.attach(RandomNumberGenerator, h5loc="/rnd", n=5)
pipe.attach(kp.io.HDF5Sink, filename="rnd.h5")
pipe.drain(11)

#####################################################
# Provenance
# ----------
# The provenance information is managed by the singleton class
# ``Provenance``. To access all the provenance information,
# use the ``as_json()`` method:

print(kp.Provenance().as_json(indent=2))
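Example #28 below additionally persists this record to disk by setting the singleton's outfile attribute; a one-line sketch (the filename is an assumption):

# write the provenance record to a JSON file, as Example #28 does
kp.Provenance().outfile = "rnd.h5.prov.json"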
Example #27
"""
The following script calculates the PMT time offsets using K40 coincidences.

"""

# Author: Jonas Reubelt <*****@*****.**> and Tamas Gal <*****@*****.**>
# License: MIT
import km3pipe as kp
from km3modules import k40
from km3modules.common import StatusBar, MemoryObserver, Siphon
from km3modules.plot import IntraDOMCalibrationPlotter
import km3pipe.style

km3pipe.style.use("km3pipe")

pipe = kp.Pipeline(timeit=True)
pipe.attach(
    kp.io.ch.CHPump,
    host="127.0.0.1",
    port=5553,
    tags="IO_TSL, IO_MONIT",
    timeout=7 * 60 * 60 * 24,
    max_queue=42,
)
pipe.attach(kp.io.ch.CHTagger)
pipe.attach(StatusBar, every=1000)
pipe.attach(MemoryObserver, every=5000)
pipe.attach(k40.MedianPMTRatesService, only_if="IO_MONIT")
pipe.attach(kp.io.daq.TimesliceParser)
pipe.attach(k40.TwofoldCounter, tmax=10)
pipe.attach(Siphon, volume=10 * 10 * 1, flush=True)
Example #28
def main():
    from docopt import docopt

    args = docopt(__doc__, version=kp.version)
    step_size = int(args["--step-size"])

    default_flags = (
        "--offline-header",
        "--event-info",
        "--offline-hits",
        "--mc-hits",
        "--mc-tracks",
        "--with-calibration",
        "--reco-tracks",
        "--best-tracks",
    )
    if not any([args[k] for k in default_flags]):
        for k in default_flags:
            args[k] = True

    outfile = args["-o"]
    if outfile is None:
        outfile = args["FILENAME"] + ".h5"

    provfile = args["--provenance-file"]
    if provfile is None:
        provfile = outfile + ".prov.json"

    Provenance().outfile = provfile
    pipe = kp.Pipeline(timeit=args["--timeit"])
    pipe.attach(kp.io.OfflinePump,
                filename=args["FILENAME"],
                step_size=step_size)
    pipe.attach(km.StatusBar, every=1000)
    pipe.attach(km.common.MemoryObserver, every=500)
    if args["--offline-header"]:
        pipe.attach(km.io.OfflineHeaderTabulator)
    if args["--event-info"]:
        pipe.attach(km.io.EventInfoTabulator)
    if args["--offline-hits"]:
        pipe.attach(
            km.io.HitsTabulator,
            name="Offline",
            kind="offline",
            with_calibration=args["--with-calibration"],
        )
    if args["--online-hits"]:
        pipe.attach(km.io.HitsTabulator, name="Online", kind="online")
    if args["--mc-hits"]:
        pipe.attach(km.io.HitsTabulator, name="MC", kind="mc")
    if args["--mc-tracks"]:
        pipe.attach(km.io.MCTracksTabulator,
                    read_usr_data=args["--mc-tracks-usr-data"])
    if args["--reco-tracks"]:
        pipe.attach(
            km.io.RecoTracksTabulator,
            best_tracks=args["--best-tracks"],
            aashower_legacy=args["--aashower-legacy"],
        )
    pipe.attach(kp.io.HDF5Sink, filename=outfile)
    if args["-n"] is not None:
        pipe.drain(int(args["-n"]))
    else:
        pipe.drain()
Example #29
def make_nn_images(fname, detx_filepath, config):
    """
    Main code with config parameters. Reads raw .hdf5 files and creates 2D/3D histogram projections that can be used
    for a CNN.

    Parameters
    ----------
    fname : str
        Filename (full path!) of the input file.
    detx_filepath : str
        String with the full filepath to the corresponding .detx file of the input file.
        Used for the binning and for the hits calibration if the input file is not calibrated yet
        (e.g. hits do not contain pos_x/y/z, time, ...).
    config : dict
        Dictionary that contains all configuration options of the make_nn_images function.
        An explanation of the config parameters can be found in orcasong/default_config.toml.

    """
    # Load all parameters from the config # TODO put everything in a config class, this is horrible
    output_dirpath = config['output_dirpath']
    chunksize, complib, complevel = config['chunksize'], config['complib'], config['complevel']
    flush_freq = config['flush_freq']
    n_bins = tuple(config['n_bins'])
    timecut = (config['timecut_mode'], config['timecut_timespan'])
    do_mc_hits = config['do_mc_hits']
    det_geo = config['det_geo']
    do2d = config['do2d']
    do2d_plots = (config['do2d_plots'], config['do2d_plots_n'])
    do3d = config['do3d']
    do4d = (config['do4d'], config['do4d_mode'])
    # the TOML config encodes missing values as the string 'None'
    prod_ident = config['prod_ident'] if config['prod_ident'] != 'None' else None
    data_cuts = dict()
    data_cuts['triggered'] = config['data_cut_triggered']
    data_cuts['energy_lower_limit'] = config['data_cut_e_low'] if config['data_cut_e_low'] != 'None' else None
    data_cuts['energy_upper_limit'] = config['data_cut_e_high'] if config['data_cut_e_high'] != 'None' else None
    data_cuts['throw_away_prob'] = config['data_cut_throw_away'] if config['data_cut_throw_away'] != 'None' else None
    data_cuts['custom_skip_function'] = config['data_cut_custom_func'] if config['data_cut_custom_func'] != 'None' else None

    make_output_dirs(output_dirpath, do2d, do3d, do4d)

    filename = os.path.basename(os.path.splitext(fname)[0])
    filename_output = filename.replace('.', '_')

    # set random km3pipe (=numpy) seed
    print('Setting a Global Random State with the seed < 42 >.')
    km.GlobalRandomState(seed=42)

    geo, x_bin_edges, y_bin_edges, z_bin_edges = calculate_bin_edges(
        n_bins, det_geo, detx_filepath, do4d)
    if do2d_plots[0] is True:
        pdf_2d_plots = PdfPages(output_dirpath + '/orcasong_output/4dTo2d/' + filename_output + '_plots.pdf')
    else:
        pdf_2d_plots = None

    file_particle_type = get_file_particle_type(fname)

    print('Generating histograms from the hits for files based on ' + fname)

    # Initialize OrcaSong Event Pipeline

    pipe = kp.Pipeline()  # add timeit=True argument for profiling
    pipe.attach(km.common.StatusBar, every=200)
    pipe.attach(km.common.MemoryObserver, every=400)
    pipe.attach(kp.io.hdf5.HDF5Pump, filename=fname)
    pipe.attach(km.common.Keep,
                keys=[
                    'EventInfo', 'Header', 'RawHeader', 'McTracks', 'Hits',
                    'McHits'
                ])
    pipe.attach(EventDataExtractor,
                file_particle_type=file_particle_type,
                geo=geo,
                do_mc_hits=do_mc_hits,
                data_cuts=data_cuts,
                do4d=do4d,
                prod_ident=prod_ident)
    pipe.attach(km.common.Keep, keys=['event_hits', 'event_track'])
    pipe.attach(EventSkipper, data_cuts=data_cuts)
    pipe.attach(HistogramMaker,
                x_bin_edges=x_bin_edges,
                y_bin_edges=y_bin_edges,
                z_bin_edges=z_bin_edges,
                n_bins=n_bins,
                timecut=timecut,
                do2d=do2d,
                do2d_plots=do2d_plots,
                pdf_2d_plots=pdf_2d_plots,
                do3d=do3d,
                do4d=do4d)
    pipe.attach(km.common.Delete, keys=['event_hits'])

    if do2d:
        for proj in ['xy', 'xz', 'yz', 'xt', 'yt', 'zt']:
            savestr = output_dirpath + '/orcasong_output/4dTo2d/' + proj + '/' + filename_output + '_' + proj + '.h5'
            pipe.attach(kp.io.HDF5Sink,
                        filename=savestr,
                        blob_keys=[proj, 'event_track'],
                        complib=complib,
                        complevel=complevel,
                        chunksize=chunksize,
                        flush_frequency=flush_freq)

    if do3d:
        for proj in ['xyz', 'xyt', 'xzt', 'yzt', 'rzt']:
            savestr = output_dirpath + '/orcasong_output/4dTo3d/' + proj + '/' + filename_output + '_' + proj + '.h5'
            pipe.attach(kp.io.HDF5Sink,
                        filename=savestr,
                        blob_keys=[proj, 'event_track'],
                        complib=complib,
                        complevel=complevel,
                        chunksize=chunksize,
                        flush_frequency=flush_freq)

    if do4d[0]:
        proj = 'xyzt' if not do4d[1] == 'channel_id' else 'xyzc'
        savestr = output_dirpath + '/orcasong_output/4dTo4d/' + proj + '/' + filename_output + '_' + proj + '.h5'
        pipe.attach(kp.io.HDF5Sink,
                    filename=savestr,
                    blob_keys=[proj, 'event_track'],
                    complib=complib,
                    complevel=complevel,
                    chunksize=chunksize,
                    flush_frequency=flush_freq)

    # Execute Pipeline
    pipe.drain()

    if do2d_plots[0] is True:
        pdf_2d_plots.close()
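The config dict consumed above maps one-to-one to the keys in orcasong/default_config.toml; a hypothetical minimal call, with purely illustrative values (note that 'None' is spelled as a string, as the parsing code above expects):

# illustrative only; keys mirror those read at the top of make_nn_images
config = {
    'output_dirpath': './out',
    'chunksize': 32, 'complib': 'zlib', 'complevel': 1,
    'flush_freq': 1000,
    'n_bins': [11, 13, 18, 60],
    'timecut_mode': 'trigger_cluster', 'timecut_timespan': 'tight-1',
    'do_mc_hits': False,
    'det_geo': 'Orca_115l_23m_h_9m_v',
    'do2d': False, 'do2d_plots': False, 'do2d_plots_n': 10,
    'do3d': False,
    'do4d': True, 'do4d_mode': 'time',
    'prod_ident': 'None',
    'data_cut_triggered': False,
    'data_cut_e_low': 'None', 'data_cut_e_high': 'None',
    'data_cut_throw_away': 'None', 'data_cut_custom_func': 'None',
}
make_nn_images('sample.h5', 'orca.detx', config)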
Example #30
def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None,
               complib=None, complevel=None, legacy_mode=False, shuffle=True,
               event_skipper=None, filepath_output=None):
    """
    Shuffles a .h5 file where each dataset needs to have the same number of rows (axis_0).
    The shuffled data is saved to a new .h5 file with the suffix < _shuffled.h5 >.

    Can also skip certain events if an event_skipper is given.

    Parameters
    ----------
    filepath_input : str
        Filepath of the unshuffled input file.
    tool : bool
        Specifies whether the function is accessed from the shuffle_h5_tool.
        In this case, the shuffled .h5 file is returned (still open).
    seed : int
        Sets a fixed random seed for the shuffling.
    delete : bool
        Specifies whether the old, unshuffled file should be deleted after extracting the data.
    chunksize : None/int
        Specifies the chunksize for axis_0 in the shuffled output files.
        If None, the chunksize is read from the input files.
        Otherwise, the given chunksize is used.
    complib : None/str
        Specifies the compression library used for saving the shuffled output files.
        If None, the compression library is read from the input files.
        Otherwise, the given library is used.
        Currently available: 'gzip' or 'lzf'.
    complevel : None/int
        Specifies the compression level used for saving the shuffled output files.
        A compression level is only available for gzip compression, not lzf!
        If None, the compression level is read from the input files.
        Otherwise, the given level is used.
    legacy_mode : bool
        Boolean flag that specifies whether the legacy shuffle mode should be used instead of the standard one.
        A more detailed description of this mode can be found in the summary at the top of this Python file.
    shuffle : bool
        If False, events will not be shuffled.
    event_skipper : func, optional
        Function that takes a blob as input and returns a bool.
        If the bool is True, the blob will be skipped.
    filepath_output : str, optional
        If given, this will be the name of the output file. Otherwise, a name
        is auto-generated.

    Returns
    -------
    output_file_shuffled : h5py.File
        H5py file instance of the shuffled output file.

    """
    if event_skipper is None and not shuffle:
        raise ValueError("Either event_skipper or shuffle has to be set")

    complib_f, complevel_f, chunksize_f = get_f_compression_and_chunking(filepath_input)

    chunksize = chunksize_f if chunksize is None else chunksize
    complib = complib_f if complib is None else complib
    complevel = complevel_f if complevel is None else complevel

    if complib == 'lzf':
        complevel = None

    if filepath_output is None:
        filepath_output = get_filepath_output(filepath_input, shuffle,
                                              event_skipper)

    if not legacy_mode:
        # set random km3pipe (=numpy) seed
        print('Setting a Global Random State with the seed < {} >.'.format(seed))
        km.GlobalRandomState(seed=seed)

        # km3pipe uses pytables for saving the shuffled output file, which has the name 'zlib' for the 'gzip' filter
        if complib == 'gzip':
            complib = 'zlib'

        pipe = kp.Pipeline(timeit=True)  # add timeit=True argument for profiling
        pipe.attach(km.common.StatusBar, every=200)
        pipe.attach(km.common.MemoryObserver, every=200)
        pipe.attach(kp.io.hdf5.HDF5Pump, filename=filepath_input, shuffle=shuffle, reset_index=True)

        if event_skipper is not None:
            pipe.attach(EventSkipper, event_skipper=event_skipper)

        pipe.attach(kp.io.hdf5.HDF5Sink, filename=filepath_output, complib=complib, complevel=complevel, chunksize=chunksize, flush_frequency=1000)
        pipe.drain()

        # copy the used_files dataset to the new file
        copy_used_files(filepath_input, filepath_output)

        if delete:
            os.remove(filepath_input)

        # delete the '_i_' index groups created by pytables in the HDF5Sink; we don't need them
        output_file_shuffled = h5py.File(filepath_output, 'r+')
        for folder_name in output_file_shuffled:
            if folder_name.startswith('_i_'):
                del output_file_shuffled[folder_name]

    else:
        input_file = h5py.File(filepath_input, 'r')
        folder_data_array_dict = {}

        for folder_name in input_file:
            folder_data_array = input_file[folder_name][()]  # get whole numpy array into memory
            folder_data_array_dict[folder_name] = folder_data_array  # workaround in order to be able to close the input file at the next step

        input_file.close()

        if delete:
            os.remove(filepath_input)

        output_file_shuffled = h5py.File(filepath_output, 'w')
        for n, dataset_key in enumerate(folder_data_array_dict):

            dataset = folder_data_array_dict[dataset_key]

            if n == 0:
                # get a particular seed for the first dataset such that the shuffling is consistent across the datasets
                r = np.random.RandomState(seed)
                state = r.get_state()
                r.shuffle(dataset)

            else:
                r.set_state(state)  # recover shuffle seed of the first dataset
                r.shuffle(dataset)

            chunks = (chunksize,) + dataset.shape[1:]
            output_file_shuffled.create_dataset(dataset_key, data=dataset, dtype=dataset.dtype, chunks=chunks,
                                                compression=complib, compression_opts=complevel)

    # close the file unless the caller (tool=True) needs the open handle returned
    if tool is False:
        output_file_shuffled.close()
    else:
        return output_file_shuffled
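A usage sketch for shuffle_h5, based only on the signature and docstring above; the file name and compression settings are assumptions:

# shuffle a file and keep the returned handle open, as shuffle_h5_tool would
shuffled = shuffle_h5(
    "train_unshuffled.h5",
    tool=True,       # return the open h5py.File instead of closing it
    seed=42,
    chunksize=32,
    complib='gzip',  # mapped internally to pytables' 'zlib'
    complevel=1,
)
print(list(shuffled.keys()))
shuffled.close()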