def test_module(self):
    outfile = tempfile.NamedTemporaryFile(delete=True)

    pipe = kp.Pipeline()
    pipe.attach(kp.io.OfflinePump, filename=data_path("offline/numucc.root"))
    pipe.attach(km.io.MCTracksTabulator)
    pipe.attach(kp.io.HDF5Sink, filename=outfile.name)
    pipe.drain()

    pipe = kp.Pipeline()
    pipe.attach(kp.io.HDF5Pump, filename=outfile.name)
    pipe.attach(km.common.Observer, count=10, required_keys=["McTracks"])
    pipe.drain()

def main():
    from docopt import docopt

    args = docopt(__doc__, version=VERSION)

    det_id = int(args['-d'])
    plot_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])
    du = int(args['-u'])
    interval = int(args['-i'])

    detector = kp.hardware.Detector(det_id=det_id)

    pipe = kp.Pipeline(timeit=True)
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_MONIT',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(PMTRates,
                detector=detector,
                du=du,
                interval=interval,
                plot_path=plot_path)
    pipe.drain()

def test_module(self):
    outfile = tempfile.NamedTemporaryFile(delete=True)

    pipe = kp.Pipeline()
    pipe.attach(
        kp.io.OfflinePump,
        filename=data_path(
            "offline/mcv6.0.gsg_muon_highE-CC_50-500GeV.km3sim.jterbr00008357.jorcarec.aanet.905.root"
        ),
    )
    pipe.attach(km.io.EventInfoTabulator)
    pipe.attach(kp.io.HDF5Sink, filename=outfile.name)
    pipe.drain(10)

    pipe = kp.Pipeline()
    pipe.attach(kp.io.HDF5Pump, filename=outfile.name)
    pipe.attach(km.common.Observer, count=10, required_keys=["EventInfo"])
    pipe.attach(CheckW2listContents)
    pipe.drain()

def main():
    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host='192.168.0.21',
                port=5553,
                tags='IO_MONIT',
                timeout=30,
                max_queue=2000000)
    pipe.attach(OOSAnalyzer)
    pipe.drain()

def main():
    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host='192.168.0.21',
                port=5553,
                tags='IO_TSL0',
                timeout=60 * 5,
                max_queue=2000)
    pipe.attach(kp.io.daq.TimesliceParser)
    pipe.attach(OOSAnalyzer)
    pipe.drain()

def main():
    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host='192.168.0.21',
                port=5553,
                tags='IO_EVT',
                timeout=60 * 5,
                max_queue=2000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(OOSAnalyzer)
    pipe.drain()

def test_module(self):
    outfile = tempfile.NamedTemporaryFile(delete=True)

    pipe = kp.Pipeline()
    pipe.attach(
        kp.io.OfflinePump,
        filename=data_path(
            "offline/mcv6.0.gsg_muon_highE-CC_50-500GeV.km3sim.jterbr00008357.jorcarec.aanet.905.root"
        ),
    )
    pipe.attach(km.io.RecoTracksTabulator, best_tracks=True)
    pipe.attach(kp.io.HDF5Sink, filename=outfile.name)
    pipe.drain(5)

    pipe = kp.Pipeline()
    pipe.attach(kp.io.HDF5Pump, filename=outfile.name)
    pipe.attach(km.common.Observer, count=5, required_keys=["Tracks"])
    pipe.attach(km.common.Observer, count=5, required_keys=["RecStages"])
    pipe.attach(km.common.Observer, count=5, required_keys=["BestJmuon"])
    pipe.attach(CheckRecoContents)
    pipe.drain()

def main():
    detx_delay = open('D_BCI_0004_calibrated.detx', 'r')

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host='192.168.0.21',
                port=5553,
                tags='IO_TSL0',
                timeout=60 * 5,
                max_queue=2000)
    pipe.attach(kp.io.daq.TimesliceParser)
    pipe.attach(OOSAnalyzer, filee=detx_delay)
    pipe.drain()

def main():
    detector = kp.hardware.Detector(det_id=29)

    pipe = kp.Pipeline(timeit=True)
    pipe.attach(
        kp.io.CHPump,
        host="192.168.0.110",
        port=5553,
        tags="IO_MONIT",
        timeout=60 * 60 * 24 * 7,
        max_queue=1000,
    )
    pipe.attach(PMTRates, detector=detector, du=2, interval=2)
    pipe.drain()

def build_pipe(self, infile, outfile, timeit=True):
    """Initialize and connect the modules from the different stages."""
    components = [
        *self.get_cmpts_pre(infile=infile),
        *self.get_cmpts_main(),
        *self.get_cmpts_post(outfile=outfile),
    ]

    pipe = kp.Pipeline(timeit=timeit)
    if self.n_statusbar is not None:
        pipe.attach(km.common.StatusBar, every=self.n_statusbar)
    if self.n_memory_observer is not None:
        pipe.attach(km.common.MemoryObserver, every=self.n_memory_observer)
    for cmpt, kwargs in components:
        pipe.attach(cmpt, **kwargs)
    return pipe

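# A minimal sketch of how the get_cmpts_* hooks might look; the class and
# the concrete modules below are illustrative assumptions, not the original
# code. Each hook returns a list of (module, kwargs) tuples which
# build_pipe attaches in order. This assumes build_pipe is available as a
# plain function, as defined above.
import km3pipe as kp
import km3modules as km


class ExampleConverter:
    n_statusbar = 100
    n_memory_observer = 1000

    build_pipe = build_pipe  # reuse the function above as a method

    def get_cmpts_pre(self, infile):
        # components that read and prepare the input
        return [(kp.io.OfflinePump, {"filename": infile})]

    def get_cmpts_main(self):
        # the actual processing stage(s)
        return [(km.io.EventInfoTabulator, {})]

    def get_cmpts_post(self, outfile):
        # components that write the output
        return [(kp.io.HDF5Sink, {"filename": outfile})]


pipe = ExampleConverter().build_pipe("in.root", "out.h5")
pipe.drain()
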
def main():
    from docopt import docopt

    args = docopt(__doc__, version=kp.version)

    tag = args["TAG"]
    outfile = args["OUTFILE"]
    port = int(args["-p"])
    ip = args["-i"]
    n = int(args["-n"])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump, host=ip, port=port, tags=tag)
    pipe.attach(Dumper, filename=outfile)
    pipe.drain(n)

def main():
    from docopt import docopt

    args = docopt(__doc__, version=VERSION)

    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_EVT',
                timeout=60 * 60 * 24 * 7,
                max_queue=200000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(TriggerRate, interval=300, plots_path=plots_path)
    pipe.drain()

def main():
    from docopt import docopt

    args = docopt(__doc__, version=VERSION)

    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(
        kp.io.ch.CHPump,
        host=ligier_ip,
        port=ligier_port,
        tags='IO_TSL0,IO_TSL1,IO_TSL2,IO_TSSN',
        timeout=60 * 60 * 24 * 7,
        max_queue=200000)
    pipe.attach(TimesliceRate, interval=10, plots_path=plots_path)
    pipe.drain()

def main():
    from docopt import docopt

    args = docopt(__doc__)

    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_OLINE',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(RecoPlotter, plots_path=plots_path)
    pipe.drain()

def main():
    from docopt import docopt

    args = docopt(__doc__)

    det_id = int(args['-d'])
    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_MONIT',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(CalibrateAHRS, det_id=det_id, plots_path=plots_path)
    pipe.drain()

def main():
    from docopt import docopt

    args = docopt(__doc__)

    ligier_ip = args['-l']
    ligier_port = int(args['-p'])
    logging_ligier_ip = args['-m']
    logging_ligier_port = int(args['-q'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump, host=ligier_ip, port=ligier_port, tags="IO_TSSN")
    pipe.attach(kp.io.daq.TimesliceParser)
    pipe.attach(TimeSyncChecker,
                logging_ligier_ip=logging_ligier_ip,
                logging_ligier_port=logging_ligier_port)
    pipe.drain()

def main():
    from docopt import docopt

    args = docopt(__doc__, version=VERSION)

    det_id = int(args['-d'])
    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_SUM',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(DOMRates, det_id=det_id, plots_path=plots_path)
    pipe.drain()

def test_calibration_in_pipeline(self):
    class DummyPump(kp.Module):
        def configure(self):
            self.index = 0

        def process(self, blob):
            self.index += 1
            mc_hits = Table({"pmt_id": [1, 2, 1], "time": [10.1, 11.2, 12.3]})
            hits = Table(
                {
                    "dom_id": [2, 3, 3],
                    "channel_id": [0, 1, 2],
                    "time": [10.1, 11.2, 12.3],
                    "tot": [0, 10, 255],
                }
            )
            blob["Hits"] = hits
            blob["McHits"] = mc_hits
            return blob

    _self = self

    class Observer(kp.Module):
        def process(self, blob):
            assert "Hits" in blob
            assert "McHits" in blob
            assert "CalibHits" in blob
            assert "CalibMcHits" in blob
            assert not hasattr(blob["Hits"], "pmt_id")
            assert hasattr(blob["CalibHits"], "pmt_id")
            assert not hasattr(blob["McHits"], "dom_id")
            assert hasattr(blob["CalibHits"], "dom_id")
            assert np.allclose([10.1, 11.2, 12.3], blob["Hits"].time)
            assert np.allclose([42.09, 87.31, 111.34], blob["CalibHits"].time)
            assert np.allclose(blob["McHits"].time, blob["CalibMcHits"].time)
            return blob

    pipe = kp.Pipeline()
    pipe.attach(DummyPump)
    pipe.attach(Calibration, filename=data_path("detx/detx_v1.detx"))
    pipe.attach(Observer)
    pipe.drain(3)

def k40calib(filename, tmax, ctmin, stream, filter_hrv, det_id, calib_filename):
    pipe = kp.Pipeline()
    pipe.attach(kp.io.jpp.TimeslicePump, filename=filename, stream=stream)
    pipe.attach(StatusBar, every=5000)
    pipe.attach(MemoryObserver, every=10000)
    pipe.attach(k40.HRVFIFOTimesliceFilter, filter_hrv=filter_hrv, filename=filename)
    pipe.attach(k40.SummaryMedianPMTRateService, filename=filename)
    pipe.attach(k40.TwofoldCounter, tmax=tmax)
    pipe.attach(k40.K40BackgroundSubtractor, mode="offline")
    pipe.attach(
        k40.IntraDOMCalibrator,
        ctmin=ctmin,
        mode="offline",
        det_id=det_id,
        calib_filename=calib_filename,
    )
    pipe.drain()

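# A hypothetical call of k40calib; the filename and all parameter values
# below are illustrative only (the available stream names depend on the
# input file and DAQ setup):
k40calib(
    "KM3NeT_00000029_00008357.root",
    tmax=10,
    ctmin=-1,
    stream="L1",
    filter_hrv=False,
    det_id=29,
    calib_filename="k40_cal.p",
)
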
def main():
    """The main script"""
    from docopt import docopt

    args = docopt(__doc__, version=kp.version)

    kp.logger.set_level("km3pipe", args["-d"])

    pipe = kp.Pipeline()
    pipe.attach(
        kp.io.ch.CHPump,
        host=args["SOURCE_IP"],
        port=int(args["-p"]),
        tags=args["-m"],
        timeout=int(args["-x"]),
        max_queue=int(args["-s"]),
        show_statistics=True,
    )
    pipe.attach(LigierSender, target_ip=args["-t"], port=int(args["-q"]))
    pipe.drain()

def main():
    from docopt import docopt

    args = docopt(__doc__)

    dom_id = int(args["DOM_ID"])

    pipe = kp.Pipeline(timeit=True)
    pipe.attach(
        kp.io.CHPump,
        host="127.0.0.1",
        port=5553,
        tags="IO_SUM, IO_MONIT",
        timeout=60 * 60 * 24 * 7,
        max_queue=1000,
    )
    pipe.attach(MonitoringChannelPicker, dom_id=dom_id)
    pipe.attach(SummarysliceMatcher, dom_id=dom_id, n_timeslices=int(args["-n"]))
    pipe.drain()

def main():
    from docopt import docopt

    args = docopt(__doc__)

    det_id = int(args['-d'])
    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(LocalDBService, thread_safety=False)
    pipe.attach(ELOGService)
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_EVT, IO_SUM',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(ZTPlot, det_id=det_id, plots_path=plots_path, elog=False)
    pipe.drain()

def main():
    from docopt import docopt

    args = docopt(__doc__)

    det_id = int(args['-d'])
    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    pipe = kp.Pipeline()
    pipe.attach(kp.io.ch.CHPump,
                host=ligier_ip,
                port=ligier_port,
                tags='IO_EVT',
                timeout=60 * 60 * 24 * 7,
                max_queue=2000)
    pipe.attach(kp.io.daq.DAQProcessor)
    pipe.attach(TriggerMap, det_id=det_id, plots_path=plots_path, only_if="Hits")
    pipe.drain()

def main():
    from docopt import docopt

    args = docopt(__doc__)

    det_id = int(args['-d'])
    plots_path = args['-o']
    ligier_ip = args['-l']
    ligier_port = int(args['-p'])

    det_oid = kp.db.DBManager().get_det_oid(det_id)

    pipe = kp.Pipeline(timeit=True)
    pipe.attach(
        kp.io.ch.CHPump,
        host=ligier_ip,
        port=ligier_port,
        tags='IO_TSL1, IO_MONIT',
        timeout=7 * 60 * 60 * 24,
        max_queue=200000)
    pipe.attach(kp.io.ch.CHTagger)
    pipe.attach(StatusBar, every=50000)
    pipe.attach(MemoryObserver, every=100000)
    pipe.attach(k40.MedianPMTRatesService, only_if='IO_MONIT')
    pipe.attach(kp.io.daq.TimesliceParser)
    pipe.attach(
        k40.TwofoldCounter,
        tmax=10,
        dump_filename=os.path.join(plots_path, 'twofold_counts.p'))
    pipe.attach(Siphon, volume=10 * 60 * 180, flush=True)
    pipe.attach(k40.K40BackgroundSubtractor)
    pipe.attach(k40.IntraDOMCalibrator, ctmin=-1, det_id=det_id)
    pipe.attach(
        IntraDOMCalibrationPlotter,
        det_oid=det_oid,
        data_path=plots_path,
        plots_path=plots_path)
    pipe.attach(k40.ResetTwofoldCounts)
    pipe.drain()

def main():
    args = docopt(__doc__, version=kp.version)

    du = int(args["-u"]) if args["-u"] else None

    try:
        det_id = int(args["-d"])
        det = kp.hardware.Detector(det_id=det_id)
    except ValueError:
        detx = args["-d"]
        det = kp.hardware.Detector(filename=detx)

    if args["-s"] is not None:
        subtitle = args["-s"]
    else:
        subtitle = ", ".join(args["FILENAMES"])

    pipe = kp.Pipeline()
    if args["--offline"]:
        pipe.attach(km.common.MultiFilePump,
                    pump=kp.io.OfflinePump,
                    filenames=args["FILENAMES"])
        pipe.attach(km.io.HitsTabulator, kind="offline")
    else:
        pipe.attach(
            km.common.MultiFilePump,
            pump=kp.io.online.EventPump,
            filenames=args["FILENAMES"],
        )
    pipe.attach(StatusBar, every=2500)
    pipe.attach(
        TriggerMap,
        detector=det,
        du=du,
        plot_filename=args["-p"],
        subtitle=subtitle,
    )
    pipe.drain()

    def configure(self):
        self.h5loc = self.require("h5loc")
        self.n = self.get("n", default=10)

    def process(self, blob):
        table = kp.Table({"x": np.random.randn(self.n)}, h5loc=self.h5loc)
        blob["RandomNumbers"] = table
        return blob


#####################################################
# Creating a simple pipeline
# --------------------------
# We create a very basic pipeline:

pipe = kp.Pipeline()
pipe.attach(km.StatusBar, every=1)
pipe.attach(km.mc.GlobalRandomState, seed=23)
pipe.attach(RandomNumberGenerator, h5loc="/rnd", n=5)
pipe.attach(kp.io.HDF5Sink, filename="rnd.h5")
pipe.drain(11)

#####################################################
# Provenance
# ----------
# The provenance information is managed by the singleton class
# ``Provenance``. To access all the provenance information,
# use the ``as_json()`` method:

print(kp.Provenance().as_json(indent=2))

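#####################################################
# Reading the file back
# ---------------------
# A minimal sketch, assuming ``rnd.h5`` was just written by the pipeline
# above: read it back with the ``HDF5Pump`` and print the blob keys to see
# under which name the table stored at ``/rnd`` reappears:


class PrintKeys(kp.Module):
    def process(self, blob):
        print(list(blob.keys()))
        return blob


pipe = kp.Pipeline()
pipe.attach(kp.io.HDF5Pump, filename="rnd.h5")
pipe.attach(PrintKeys)
pipe.drain()
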
The following script calculates the PMT time offsets using K40 coincidences
"""
# Author: Jonas Reubelt <*****@*****.**> and Tamas Gal <*****@*****.**>
# License: MIT
import km3pipe as kp
from km3modules import k40
from km3modules.common import StatusBar, MemoryObserver, Siphon
from km3modules.plot import IntraDOMCalibrationPlotter
import km3pipe.style

km3pipe.style.use("km3pipe")

pipe = kp.Pipeline(timeit=True)
pipe.attach(
    kp.io.ch.CHPump,
    host="127.0.0.1",
    port=5553,
    tags="IO_TSL, IO_MONIT",
    timeout=7 * 60 * 60 * 24,
    max_queue=42,
)
pipe.attach(kp.io.ch.CHTagger)
pipe.attach(StatusBar, every=1000)
pipe.attach(MemoryObserver, every=5000)
pipe.attach(k40.MedianPMTRatesService, only_if="IO_MONIT")
pipe.attach(kp.io.daq.TimesliceParser)
pipe.attach(k40.TwofoldCounter, tmax=10)
pipe.attach(Siphon, volume=10 * 10 * 1, flush=True)

def main():
    from docopt import docopt

    args = docopt(__doc__, version=kp.version)

    step_size = int(args["--step-size"])

    default_flags = (
        "--offline-header",
        "--event-info",
        "--offline-hits",
        "--mc-hits",
        "--mc-tracks",
        "--with-calibration",
        "--reco-tracks",
        "--best-tracks",
    )
    if not any([args[k] for k in default_flags]):
        for k in default_flags:
            args[k] = True

    outfile = args["-o"]
    if outfile is None:
        outfile = args["FILENAME"] + ".h5"

    provfile = args["--provenance-file"]
    if provfile is None:
        provfile = outfile + ".prov.json"
    Provenance().outfile = provfile

    pipe = kp.Pipeline(timeit=args["--timeit"])
    pipe.attach(kp.io.OfflinePump, filename=args["FILENAME"], step_size=step_size)
    pipe.attach(km.StatusBar, every=1000)
    pipe.attach(km.common.MemoryObserver, every=500)
    if args["--offline-header"]:
        pipe.attach(km.io.OfflineHeaderTabulator)
    if args["--event-info"]:
        pipe.attach(km.io.EventInfoTabulator)
    if args["--offline-hits"]:
        pipe.attach(
            km.io.HitsTabulator,
            name="Offline",
            kind="offline",
            with_calibration=args["--with-calibration"],
        )
    if args["--online-hits"]:
        pipe.attach(km.io.HitsTabulator, name="Online", kind="online")
    if args["--mc-hits"]:
        pipe.attach(km.io.HitsTabulator, name="MC", kind="mc")
    if args["--mc-tracks"]:
        pipe.attach(km.io.MCTracksTabulator, read_usr_data=args["--mc-tracks-usr-data"])
    if args["--reco-tracks"]:
        pipe.attach(
            km.io.RecoTracksTabulator,
            best_tracks=args["--best-tracks"],
            aashower_legacy=args["--aashower-legacy"],
        )
    pipe.attach(kp.io.HDF5Sink, filename=outfile)

    if args["-n"] is not None:
        pipe.drain(int(args["-n"]))
    else:
        pipe.drain()

def make_nn_images(fname, detx_filepath, config):
    """
    Main code with config parameters. Reads raw .hdf5 files and creates
    2D/3D histogram projections that can be used for a CNN.

    Parameters
    ----------
    fname : str
        Filename (full path!) of the input file.
    detx_filepath : str
        String with the full filepath to the corresponding .detx file of the
        input file. Used for the binning and for the hits calibration if the
        input file is not calibrated yet (e.g. hits do not contain
        pos_x/y/z, time, ...).
    config : dict
        Dictionary that contains all configuration options of the
        make_nn_images function. An explanation of the config parameters can
        be found in orcasong/default_config.toml.
    """
    # Load all parameters from the config
    # TODO put everything in a config class, this is horrible
    output_dirpath = config['output_dirpath']
    chunksize, complib, complevel = (config['chunksize'], config['complib'],
                                     config['complevel'])
    flush_freq = config['flush_freq']
    n_bins = tuple(config['n_bins'])
    timecut = (config['timecut_mode'], config['timecut_timespan'])
    do_mc_hits = config['do_mc_hits']
    det_geo = config['det_geo']
    do2d = config['do2d']
    do2d_plots = (config['do2d_plots'], config['do2d_plots_n'])
    do3d = config['do3d']
    do4d = (config['do4d'], config['do4d_mode'])
    prod_ident = config['prod_ident'] if config['prod_ident'] != 'None' else None

    data_cuts = dict()
    data_cuts['triggered'] = config['data_cut_triggered']
    data_cuts['energy_lower_limit'] = (config['data_cut_e_low']
                                       if config['data_cut_e_low'] != 'None'
                                       else None)
    data_cuts['energy_upper_limit'] = (config['data_cut_e_high']
                                       if config['data_cut_e_high'] != 'None'
                                       else None)
    data_cuts['throw_away_prob'] = (config['data_cut_throw_away']
                                    if config['data_cut_throw_away'] != 'None'
                                    else None)
    data_cuts['custom_skip_function'] = (config['data_cut_custom_func']
                                         if config['data_cut_custom_func'] != 'None'
                                         else None)

    make_output_dirs(output_dirpath, do2d, do3d, do4d)

    filename = os.path.basename(os.path.splitext(fname)[0])
    filename_output = filename.replace('.', '_')

    # set random km3pipe (=numpy) seed
    print('Setting a Global Random State with the seed < 42 >.')
    km.GlobalRandomState(seed=42)

    geo, x_bin_edges, y_bin_edges, z_bin_edges = calculate_bin_edges(
        n_bins, det_geo, detx_filepath, do4d)

    pdf_2d_plots = PdfPages(
        output_dirpath + '/orcasong_output/4dTo2d/' + filename_output +
        '_plots.pdf') if do2d_plots[0] is True else None

    file_particle_type = get_file_particle_type(fname)

    print('Generating histograms from the hits for files based on ' + fname)

    # Initialize OrcaSong Event Pipeline
    pipe = kp.Pipeline()  # add timeit=True argument for profiling
    pipe.attach(km.common.StatusBar, every=200)
    pipe.attach(km.common.MemoryObserver, every=400)
    pipe.attach(kp.io.hdf5.HDF5Pump, filename=fname)
    pipe.attach(km.common.Keep, keys=[
        'EventInfo', 'Header', 'RawHeader', 'McTracks', 'Hits', 'McHits'
    ])
    pipe.attach(EventDataExtractor,
                file_particle_type=file_particle_type,
                geo=geo,
                do_mc_hits=do_mc_hits,
                data_cuts=data_cuts,
                do4d=do4d,
                prod_ident=prod_ident)
    pipe.attach(km.common.Keep, keys=['event_hits', 'event_track'])
    pipe.attach(EventSkipper, data_cuts=data_cuts)
    pipe.attach(HistogramMaker,
                x_bin_edges=x_bin_edges,
                y_bin_edges=y_bin_edges,
                z_bin_edges=z_bin_edges,
                n_bins=n_bins,
                timecut=timecut,
                do2d=do2d,
                do2d_plots=do2d_plots,
                pdf_2d_plots=pdf_2d_plots,
                do3d=do3d,
                do4d=do4d)
    pipe.attach(km.common.Delete, keys=['event_hits'])

    if do2d:
        for proj in ['xy', 'xz', 'yz', 'xt', 'yt', 'zt']:
            savestr = (output_dirpath + '/orcasong_output/4dTo2d/' + proj +
                       '/' + filename_output + '_' + proj + '.h5')
            pipe.attach(kp.io.HDF5Sink,
                        filename=savestr,
                        blob_keys=[proj, 'event_track'],
                        complib=complib,
                        complevel=complevel,
                        chunksize=chunksize,
                        flush_frequency=flush_freq)

    if do3d:
        for proj in ['xyz', 'xyt', 'xzt', 'yzt', 'rzt']:
            savestr = (output_dirpath + '/orcasong_output/4dTo3d/' + proj +
                       '/' + filename_output + '_' + proj + '.h5')
            pipe.attach(kp.io.HDF5Sink,
                        filename=savestr,
                        blob_keys=[proj, 'event_track'],
                        complib=complib,
                        complevel=complevel,
                        chunksize=chunksize,
                        flush_frequency=flush_freq)

    if do4d[0]:
        proj = 'xyzt' if not do4d[1] == 'channel_id' else 'xyzc'
        savestr = (output_dirpath + '/orcasong_output/4dTo4d/' + proj + '/' +
                   filename_output + '_' + proj + '.h5')
        pipe.attach(kp.io.HDF5Sink,
                    filename=savestr,
                    blob_keys=[proj, 'event_track'],
                    complib=complib,
                    complevel=complevel,
                    chunksize=chunksize,
                    flush_frequency=flush_freq)

    # Execute Pipeline
    pipe.drain()

    if do2d_plots[0] is True:
        pdf_2d_plots.close()

def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None,
               complib=None, complevel=None, legacy_mode=False, shuffle=True,
               event_skipper=None, filepath_output=None):
    """
    Shuffles a .h5 file where each dataset needs to have the same number of
    rows (axis_0). The shuffled data is saved to a new .h5 file with the
    suffix < _shuffled.h5 >.

    Can also skip certain events if an event_skipper is given.

    Parameters
    ----------
    filepath_input : str
        Filepath of the unshuffled input file.
    tool : bool
        Specifies if the function is accessed from the shuffle_h5_tool.
        In this case, the shuffled .h5 file is returned.
    seed : int
        Sets a fixed random seed for the shuffling.
    delete : bool
        Specifies if the old, unshuffled file should be deleted after
        extracting the data.
    chunksize : None/int
        Specifies the chunksize for axis_0 in the shuffled output files.
        If None, the chunksize is read from the input files.
        Else, a custom chunksize will be used.
    complib : None/str
        Specifies the compression library that should be used for saving
        the shuffled output files. If None, the compression library is read
        from the input files. Else, a custom compression library will be
        used. Currently available: 'gzip', or 'lzf'.
    complevel : None/int
        Specifies the compression level that should be used for saving
        the shuffled output files. A compression level is only available
        for gzip compression, not lzf! If None, the compression level is
        read from the input files. Else, a custom compression level will
        be used.
    legacy_mode : bool
        Boolean flag that specifies if the legacy shuffle mode should be
        used instead of the standard one. A more detailed description of
        this mode can be found in the summary at the top of this python file.
    shuffle : bool
        If False, events will not be shuffled.
    event_skipper : func, optional
        Function that takes the blob as an input, and returns a bool.
        If the bool is True, the blob will be skipped.
    filepath_output : str, optional
        If given, this will be the name of the output file. Otherwise, a
        name is auto generated.

    Returns
    -------
    output_file_shuffled : h5py.File
        H5py file instance of the shuffled output file.
    """
    if event_skipper is None and not shuffle:
        raise ValueError("Either event_skipper or shuffle has to be set")

    complib_f, complevel_f, chunksize_f = get_f_compression_and_chunking(filepath_input)
    chunksize = chunksize_f if chunksize is None else chunksize
    complib = complib_f if complib is None else complib
    complevel = complevel_f if complevel is None else complevel

    if complib == 'lzf':
        complevel = None

    if filepath_output is None:
        filepath_output = get_filepath_output(filepath_input, shuffle, event_skipper)

    if not legacy_mode:
        # set random km3pipe (=numpy) seed
        print('Setting a Global Random State with the seed < 42 >.')
        km.GlobalRandomState(seed=seed)

        # km3pipe uses pytables for saving the shuffled output file,
        # which has the name 'zlib' for the 'gzip' filter
        if complib == 'gzip':
            complib = 'zlib'

        pipe = kp.Pipeline(timeit=True)  # add timeit=True argument for profiling
        pipe.attach(km.common.StatusBar, every=200)
        pipe.attach(km.common.MemoryObserver, every=200)
        pipe.attach(kp.io.hdf5.HDF5Pump,
                    filename=filepath_input,
                    shuffle=shuffle,
                    reset_index=True)
        if event_skipper is not None:
            pipe.attach(EventSkipper, event_skipper=event_skipper)
        pipe.attach(kp.io.hdf5.HDF5Sink,
                    filename=filepath_output,
                    complib=complib,
                    complevel=complevel,
                    chunksize=chunksize,
                    flush_frequency=1000)
        pipe.drain()

        # copy the used_files dataset to the new file
        copy_used_files(filepath_input, filepath_output)

        if delete:
            os.remove(filepath_input)

        # delete folders with '_i_' that are created by pytables in the
        # HDF5Sink, we don't need them; iterate over a copy of the keys,
        # since we delete entries while iterating
        output_file_shuffled = h5py.File(filepath_output, 'r+')
        for folder_name in list(output_file_shuffled.keys()):
            if folder_name.startswith('_i_'):
                del output_file_shuffled[folder_name]

    else:
        input_file = h5py.File(filepath_input, 'r')
        folder_data_array_dict = {}

        for folder_name in input_file:
            # get the whole numpy array into memory as a workaround, in
            # order to be able to close the input file at the next step
            folder_data_array = input_file[folder_name][()]
            folder_data_array_dict[folder_name] = folder_data_array

        input_file.close()

        if delete:
            os.remove(filepath_input)

        output_file_shuffled = h5py.File(filepath_output, 'w')
        for n, dataset_key in enumerate(folder_data_array_dict):
            dataset = folder_data_array_dict[dataset_key]

            if n == 0:
                # save the RNG state for the first dataset such that the
                # shuffling is consistent across all datasets
                r = np.random.RandomState(seed)
                state = r.get_state()
                r.shuffle(dataset)
            else:
                r.set_state(state)  # recover the RNG state of the first dataset
                r.shuffle(dataset)

            chunks = (chunksize,) + dataset.shape[1:]
            output_file_shuffled.create_dataset(dataset_key,
                                                data=dataset,
                                                dtype=dataset.dtype,
                                                chunks=chunks,
                                                compression=complib,
                                                compression_opts=complevel)

    # return the file handle if accessed from the shuffle_h5_tool,
    # otherwise close it
    if tool is False:
        output_file_shuffled.close()
    else:
        return output_file_shuffled
