def test_find_fuzzy(self):
    changed_keys = []
    rucio_local = straxen.RucioLocalFrontend(path=self.rucio_path)
    for key in self.test_keys:
        changed_key = strax.DataKey(
            run_id=key.run_id,
            data_type=key.data_type,
            lineage={'dtype': ['Plugin', '1.0.0', {}]},
        )
        changed_keys += [changed_key]

        # We shouldn't find this data
        with self.assertRaises(strax.DataNotAvailable):
            rucio_local.find(changed_key)

    # find_several shouldn't work either
    find_several_keys = rucio_local.find_several(changed_keys)
    self.assertFalse(any(find_several_keys))

    # Now test fuzzy matching
    with self.assertWarns(UserWarning):
        find_several_keys_fuzzy = rucio_local.find_several(
            changed_keys,
            fuzzy_for=changed_keys[0].data_type,
        )
    self.assertTrue(all(find_several_keys_fuzzy))
def metadata(self, run_id):
    """Metadata to save along with produced data"""
    return dict(
        run_id=run_id,
        data_type=self.provides,
        data_kind=self.data_kind,
        dtype=self.dtype,
        lineage_hash=strax.DataKey(
            run_id, self.provides, self.lineage).lineage_hash,
        compressor=self.compressor,
        lineage=self.lineage)
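# Hedged example (not part of the snippets above): how strax.DataKey turns a
# run, a data type, and a lineage into the deterministic hash stored in the
# metadata's `lineage_hash` field. Assumes only that strax is installed; the
# run id, data type and lineage below are made up.
import strax

key = strax.DataKey(
    run_id='000001',
    data_type='peaklets',
    lineage={'peaklets': ['Peaklets', '0.0.0', {}]},
)
print(key.lineage_hash)  # deterministic: the same lineage always gives the same hash
print(str(key))          # '<run_id>-<data_type>-<hash>', used as the on-disk folder name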
def setUp(self) -> None:
    self.test_keys = [
        strax.DataKey(
            run_id=run_id,
            data_type='dtype',
            lineage={'dtype': ['Plugin', '0.0.0', {}]},
        )
        for run_id in ('-1', '-2')
    ]
    self.rucio_path = './.test_rucio'
    self.write_test_data()
def merge(
        runid_str,   # run number padded with 0s
        dtype,       # data type 'level', e.g. records, peaklets
        st,          # strax context
        path,        # path where the data is stored
):
    # Remember the storage paths, since we will need to reset them later
    _storage_paths = [storage.path for storage in st.storage]

    # Initialize the plugin needed for processing
    plugin = st._get_plugins((dtype,), runid_str)[dtype]
    st._set_plugin_config(plugin, runid_str, tolerant=False)
    plugin.setup()

    for keystring in plugin.provides:
        key = strax.DataKey(runid_str, keystring, plugin.lineage)
        saver = st.storage[0].saver(key, plugin.metadata(runid_str, keystring))
        # Monkey-patch the saver so it writes to `path`
        tmpname = os.path.split(saver.tempdirname)[1]
        dirname = os.path.split(saver.dirname)[1]
        saver.tempdirname = os.path.join(path, tmpname)
        saver.dirname = os.path.join(path, dirname)
        saver.is_forked = True
        # Merge the metadata JSONs
        saver.close()

    # Change the storage frontend to use the merged data
    st.storage[0] = strax.DataDirectory(path)

    # Rechunk the data if we can
    for keystring in plugin.provides:
        rechunk = True
        if isinstance(plugin.rechunk_on_save, immutabledict):
            if not plugin.rechunk_on_save[keystring]:
                rechunk = False
        elif not plugin.rechunk_on_save:
            rechunk = False

        if rechunk:
            print(f"Rechunking {keystring}")
            st.copy_to_frontend(runid_str, keystring, 1, rechunk=True)
        else:
            print(f"Not rechunking {keystring}. "
                  f"Just copying to the staging directory.")
            key = st.key_for(runid_str, keystring)
            src = os.path.join(st.storage[0].path, str(key))
            dest = os.path.join(st.storage[1].path, str(key))
            shutil.copytree(src, dest)

    # Reset the storage in case we need to merge more data
    st.storage = [strax.DataDirectory(path) for path in _storage_paths]
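# A minimal standalone sketch (hypothetical helper name `wants_rechunk`) of
# the rechunk_on_save branch in `merge` above: multi-output plugins carry an
# immutabledict mapping data_type -> bool, single-output plugins a plain bool.
from immutabledict import immutabledict

def wants_rechunk(plugin, data_type: str) -> bool:
    if isinstance(plugin.rechunk_on_save, immutabledict):
        # Multi-output plugin: per-data-type rechunking preference
        return bool(plugin.rechunk_on_save[data_type])
    # Single-output plugin: one flag for everything
    return bool(plugin.rechunk_on_save)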
def metadata(self, run_id, data_type):
    """Metadata to save along with produced data"""
    if data_type not in self.provides:
        raise RuntimeError(f"{data_type} not in {self.provides}?")
    return dict(
        run_id=run_id,
        data_type=data_type,
        data_kind=self.data_kind_for(data_type),
        dtype=self.dtype_for(data_type),
        lineage_hash=strax.DataKey(
            run_id, data_type, self.lineage).lineage_hash,
        compressor=self.compressor,
        lineage=self.lineage)
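# Hedged usage sketch: what a saver might do with the dict returned by
# `metadata` above. Assumes `plugin` is an already-initialized strax plugin
# (e.g. obtained via st._get_plugins as in the snippets below), so this is
# not runnable on its own.
import json

md = plugin.metadata('000001', plugin.provides[0])
print(json.dumps(md, indent=2, default=str))  # numpy dtype objects need str()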
def setUpClass(cls) -> None:
    """
    For testing purposes, slightly alter the RucioFrontend such that
    we can run tests outside of dali too
    """
    # Some non-existing keys that we will try finding in the test cases.
    cls.test_keys = [
        strax.DataKey(
            run_id=run_id,
            data_type='dtype',
            lineage={'dtype': ['Plugin', '0.0.0.', {}]},
        )
        for run_id in ('-1', '-2')
    ]
def main():
    parser = argparse.ArgumentParser(description="Combine strax output")
    parser.add_argument('dataset', help='Run number', type=int)
    parser.add_argument('dtype', help='dtype to combine')
    parser.add_argument('--context', help='Strax context')
    parser.add_argument('--input_path', help='path where the temp directory is')
    parser.add_argument('--output_path', help='final location of combined data')
    args = parser.parse_args()

    if os.path.exists(args.output_path):
        raise FileExistsError(f"Output path {args.output_path} already exists")

    runid = args.dataset
    runid_str = "%06d" % runid
    dtype = args.dtype
    path = args.input_path
    output_path = args.output_path

    # Get the context
    st = eval(f'straxen.contexts.{args.context}()')
    st.storage = [strax.DataDirectory(output_path)]

    # Initialize the plugin needed for processing
    plugin = st._get_plugins((dtype,), runid_str)[dtype]
    st._set_plugin_config(plugin, runid_str, tolerant=False)
    plugin.setup()

    # Set up the rucio client
    rc = RucioSummoner()

    for keystring in plugin.provides:
        key = strax.DataKey(runid_str, keystring, plugin.lineage)
        # Use the lineage hash of this key in the directory name
        dirname = f"{runid_str}-{keystring}-{key.lineage_hash}"
        upload_path = os.path.join(output_path, dirname)
        saver = st.storage[0].saver(key, plugin.metadata(runid_str, keystring))
        saver.is_forked = True
        tmpdir, tmpname = os.path.split(saver.tempdirname)
        rmtree(saver.tempdirname)
        copytree(os.path.join(path, tmpname), saver.tempdirname)
        saver.close()
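# Hedged aside: eval() on a CLI argument executes arbitrary code. A safer
# equivalent for the context lookup above (a sketch; behaves the same for
# well-formed context names) is plain attribute access on straxen.contexts:
import straxen

def get_context(name: str):
    # Raises AttributeError for unknown names instead of executing input
    return getattr(straxen.contexts, name)()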
def setUpClass(cls) -> None:
    """
    For testing purposes, slightly alter the RucioFrontend such that
    we can run tests outside of dali too
    """
    if not straxen.utilix_is_configured():
        return
    if 'rcc' not in socket.getfqdn():
        # If we are not on RCC, add a dummy site for testing
        straxen.RucioFrontend.local_rses = {
            'UC_DALI_USERDISK': r'.rcc.',
            'test_rucio': f'{socket.getfqdn()}',
        }
        straxen.RucioFrontend.get_rse_prefix = lambda *x: 'test_rucio'

    # Some non-existing keys that we will try finding in the test cases.
    cls.test_keys = [
        strax.DataKey(
            run_id=run_id,
            data_type='dtype',
            lineage={'dtype': ['Plugin', '0.0.0.', {}]},
        )
        for run_id in ('-1', '-2')
    ]
def keys_for_runs(self,
                  target: str,
                  run_ids: ty.Union[np.ndarray, list, tuple, str],
                  ) -> ty.List[strax.DataKey]:
    """
    Get the data-keys for a multitude of runs. If use_per_run_defaults
    is False, as it preferably is (#246), getting many keys is fast,
    since we only compute the lineage once.

    :param run_ids: runs to get data-keys for
    :param target: data type requested
    :return: list of data-keys of the target for the given runs
    """
    run_ids = strax.to_str_tuple(run_ids)
    if self.context_config['use_per_run_defaults']:
        return [self.key_for(r, target) for r in run_ids]
    elif len(run_ids):
        # Get the lineage once, since the context specifies that the
        # defaults may not change!
        p = self._get_plugins((target,), run_ids[0])[target]
        return [strax.DataKey(r, target, p.lineage) for r in run_ids]
    else:
        return []
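# Hedged demonstration of why the fast path above is valid: with a fixed
# lineage, every run shares the same lineage_hash, so one plugin lookup
# suffices for all runs. Runnable with just strax installed; the lineage
# below is made up.
import strax

lineage = {'event_info': ['EventInfo', '0.0.0', {}]}
keys = [strax.DataKey(r, 'event_info', lineage) for r in ('000001', '000002')]
assert len({k.lineage_hash for k in keys}) == 1  # one hash for all runs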
def _key_for(self, run_id, target):
    p = self._get_plugins((target,), run_id)[target]
    return strax.DataKey(run_id, target, p.lineage)
def check_cache(d):
    nonlocal plugins, loaders, savers, seen
    if d in seen:
        return
    seen.add(d)
    p = plugins[d]
    key = strax.DataKey(run_id, d, p.lineage)

    for sb_i, sf in enumerate(self.storage):
        try:
            # Bit clunky... but allows specifying the executor later
            sf.find(key, **self._find_options)
            loaders[d] = partial(sf.loader,
                                 key,
                                 n_range=n_range,
                                 **self._find_options)
            # Found it! No need to make it
            del plugins[d]
            break
        except strax.DataNotAvailable:
            continue
    else:
        if time_range is not None:
            # While the data type providing the time information is
            # available (else we'd have failed earlier), one of the
            # other requested data types is not.
            raise strax.DataNotAvailable(
                f"Time range selection assumes data is already "
                f"available, but {d} for {run_id} is not.")
        if d in self.context_config['forbid_creation_of']:
            raise strax.DataNotAvailable(
                f"{d} for {run_id} not found in any storage, and "
                "your context specifies it cannot be created.")
        # Not in any cache. We will be computing it.
        to_compute[d] = p
        for dep_d in p.depends_on:
            check_cache(dep_d)

    # Should we save this data?
    if time_range is not None:
        # No, since we're not even getting the whole data.
        # Without this check, saving could be attempted if the
        # storage converter mode is enabled.
        self.log.warning(f"Not saving {d} while "
                         f"selecting a time range in the run")
        return
    if any([len(v) > 0
            for k, v in self._find_options.items()
            if 'fuzzy' in k]):
        # In fuzzy matching mode, we cannot (yet) derive the lineage
        # of any data we are creating. To avoid creating false
        # data entries, we currently do not save at all.
        self.log.warning(f"Not saving {d} while fuzzy matching is "
                         f"turned on.")
        return
    if self.context_config['allow_incomplete']:
        self.log.warning(f"Not saving {d} while loading incomplete "
                         f"data is allowed.")
        return
    elif p.save_when == strax.SaveWhen.NEVER:
        if d in save:
            raise ValueError(f"Plugin forbids saving of {d}")
        return
    elif p.save_when == strax.SaveWhen.TARGET:
        if d not in targets:
            return
    elif p.save_when == strax.SaveWhen.EXPLICIT:
        if d not in save:
            return
    else:
        assert p.save_when == strax.SaveWhen.ALWAYS

    for sf in self.storage:
        if sf.readonly:
            continue
        if d not in to_compute:
            if not self.context_config['storage_converter']:
                continue
            try:
                sf.find(key, **self._find_options)
                # Already have this data in this backend
                continue
            except strax.DataNotAvailable:
                # Don't have it, so let's convert it!
                pass
        try:
            savers[d].append(sf.saver(key, metadata=p.metadata(run_id)))
        except strax.DataNotAvailable:
            # This frontend cannot save. Too bad.
            pass
def check_cache(d):
    nonlocal plugins, loaders, savers, seen
    if d in seen:
        return
    seen.add(d)
    p = plugins[d]

    # Can we load this data, or must we compute it?
    loading_this_data = False
    key = strax.DataKey(run_id, d, p.lineage)
    for sb_i, sf in enumerate(self.storage):
        try:
            # Partial is clunky... but allows specifying the executor
            # later. Since it doesn't run until later, we must do a
            # find now, while we can still handle DataNotAvailable.
            sf.find(key, **self._find_options)
            loaders[d] = partial(sf.loader,
                                 key,
                                 n_range=n_range,
                                 **self._find_options)
        except strax.DataNotAvailable:
            continue
        else:
            # Found it! No need to make it or look in other frontends
            loading_this_data = True
            del plugins[d]
            break
    else:
        # Data not found anywhere. We will be computing it.
        if time_range is not None and not d.startswith('_temp'):
            # While the data type providing the time information is
            # available (else we'd have failed earlier), one of the
            # other requested data types is not.
            raise strax.DataNotAvailable(
                f"Time range selection assumes data is already "
                f"available, but {d} for {run_id} is not.")
        if d in self.context_config['forbid_creation_of']:
            raise strax.DataNotAvailable(
                f"{d} for {run_id} not found in any storage, and "
                "your context specifies it cannot be created.")
        to_compute[d] = p
        for dep_d in p.depends_on:
            check_cache(dep_d)

    # Should we save this data? If not, return.
    if (loading_this_data
            and not self.context_config['storage_converter']):
        return
    if p.save_when == strax.SaveWhen.NEVER:
        if d in save:
            raise ValueError(f"Plugin forbids saving of {d}")
        return
    elif p.save_when == strax.SaveWhen.TARGET:
        if d not in targets:
            return
    elif p.save_when == strax.SaveWhen.EXPLICIT:
        if d not in save:
            return
    else:
        assert p.save_when == strax.SaveWhen.ALWAYS

    # Warn about conditions that preclude saving, but that the user
    # might not expect.
    if time_range is not None:
        # We're not even getting the whole data.
        # Without this check, saving could be attempted if the
        # storage converter mode is enabled.
        self.log.warning(f"Not saving {d} while "
                         f"selecting a time range in the run")
        return
    if any([len(v) > 0
            for k, v in self._find_options.items()
            if 'fuzzy' in k]):
        # In fuzzy matching mode, we cannot (yet) derive the
        # lineage of any data we are creating. To avoid creating
        # false data entries, we currently do not save at all.
        self.log.warning(f"Not saving {d} while fuzzy matching is"
                         f" turned on.")
        return
    if self.context_config['allow_incomplete']:
        self.log.warning(f"Not saving {d} while loading incomplete"
                         f" data is allowed.")
        return

    # Save the target and any other outputs of the plugin.
    for d_to_save in set([d] + list(p.provides)):
        if d_to_save in savers and len(savers[d_to_save]):
            # This multi-output plugin was scanned before;
            # let's not create duplicate savers
            assert p.multi_output
            continue
        key = strax.DataKey(run_id, d_to_save, p.lineage)
        for sf in self.storage:
            if sf.readonly:
                continue
            if loading_this_data:
                # Usually, we don't save if we're loading...
                if not self.context_config['storage_converter']:
                    continue
                # ... but in storage converter mode we do:
                try:
                    sf.find(key, **self._find_options)
                    # Already have this data in this backend
                    continue
                except strax.DataNotAvailable:
                    # Don't have it, so let's save it!
                    pass
            # If we get here, we must try to save
            try:
                savers[d_to_save].append(
                    sf.saver(key,
                             metadata=p.metadata(run_id, d_to_save)))
            except strax.DataNotAvailable:
                # This frontend cannot save. Too bad.
                pass
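# Hedged reference for the save_when branches above: strax.SaveWhen is an
# IntEnum, so the options are ordered from "never save" to "always save".
# Runnable with just strax installed.
import strax

assert (strax.SaveWhen.NEVER
        < strax.SaveWhen.EXPLICIT
        < strax.SaveWhen.TARGET
        < strax.SaveWhen.ALWAYS)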
def main():
    parser = argparse.ArgumentParser(
        description="Upload combined output to rucio")
    parser.add_argument('dataset', help='Run number', type=int)
    parser.add_argument('dtype', help='dtype to upload')
    parser.add_argument('rse', help='Target RSE')
    parser.add_argument('--context', help='Strax context')
    args = parser.parse_args()

    tmp_path = tempfile.mkdtemp()

    runid = args.dataset
    runid_str = "%06d" % runid
    dtype = args.dtype
    rse = args.rse

    # Get the context
    st = eval(f'straxen.contexts.{args.context}()')
    st.storage = [strax.DataDirectory(tmp_path)]

    plugin = st._get_plugins((dtype,), runid_str)[dtype]

    rc = RucioSummoner()

    for keystring in plugin.provides:
        key = strax.DataKey(runid_str, keystring, plugin.lineage)
        hash = key.lineage_hash
        # TODO check with a utilix DB call that the hashes match?
        dirname = f"{runid_str}-{keystring}-{hash}"
        upload_path = os.path.join('combined', dirname)
        print(f"Uploading {dirname}")
        os.listdir(upload_path)  # raises if the directory is missing

        # Make a rucio DID
        did = make_did(runid, keystring, hash)

        # Check if a rule already exists for this DID
        rucio_rule = rc.GetRule(upload_structure=did)

        # If not in rucio already and no rule exists, upload into rucio
        if not rucio_rule['exists']:
            result = rc.Upload(did, upload_path, rse, lifetime=None)

            # Check that the upload was successful
            new_rule = rc.GetRule(upload_structure=did, rse=rse)
            # TODO check the number of files

            new_data_dict = dict(
                location=rse,
                did=did,
                status='transferred',
                host='rucio-catalogue',
                type=keystring,
                lifetime=new_rule['expires'],
                protocol='rucio',
                creation_time=datetime.datetime.utcnow().isoformat(),
                checksum='shit',  # placeholder checksum
            )
            db.update_data(runid, new_data_dict)
        else:
            print(f"Rucio rule already exists for {did}")
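# Hypothetical sketch of the DID convention assumed by make_did() above
# (a rucio 'scope:name' pair, with the scope derived from the run number).
# The real helper lives in the utilix/admix tooling and may differ.
def make_did_sketch(run_number: int, dtype: str, lineage_hash: str) -> str:
    scope = f"xnt_{run_number:06d}"   # per-run scope (assumption)
    name = f"{dtype}-{lineage_hash}"  # matches the on-disk dirname above
    return f"{scope}:{name}"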
def check_cache(d):
    nonlocal plugins, loaders, savers, seen
    if d in seen:
        return
    seen.add(d)
    p = plugins[d]
    key = strax.DataKey(run_id, d, p.lineage)

    for sb_i, sf in enumerate(self.storage):
        try:
            loaders[d] = sf.loader(key,
                                   n_range=n_range,
                                   **self._fuzzy_options)
            # Found it! No need to make it
            del plugins[d]
            break
        except strax.DataNotAvailable:
            continue
    else:
        if time_range is not None:
            # While the data type providing the time information is
            # available (else we'd have failed earlier), one of the
            # other requested data types is not.
            raise strax.DataNotAvailable(
                f"Time range selection assumes data is already "
                f"available, but {d} for {run_id} is not.")
        # Not in any cache. We will be computing it.
        to_compute[d] = p
        for dep_d in p.depends_on:
            check_cache(dep_d)

    # Should we save this data?
    if time_range is not None:
        # No, since we're not even getting the whole data
        return
    elif p.save_when == strax.SaveWhen.NEVER:
        if d in save:
            raise ValueError(f"Plugin forbids saving of {d}")
        return
    elif p.save_when == strax.SaveWhen.TARGET:
        if d not in targets:
            return
    elif p.save_when == strax.SaveWhen.EXPLICIT:
        if d not in save:
            return
    else:
        assert p.save_when == strax.SaveWhen.ALWAYS

    for sf in self.storage:
        if sf.readonly:
            continue
        if d not in to_compute:
            if not self.context_config['storage_converter']:
                continue
            try:
                sf.find(key, **self._fuzzy_options)
                # Already have this data in this backend
                continue
            except strax.DataNotAvailable:
                pass
        try:
            savers[d].append(
                sf.saver(key,
                         metadata=p.metadata(run_id),
                         meta_only=p.save_meta_only))
        except strax.DataNotAvailable:
            # This frontend cannot save. Too bad.
            pass
def check_cache(d):
    nonlocal plugins, loaders, savers, seen
    if d in seen:
        return
    seen.add(d)
    p = plugins[d]

    # Can we load this data?
    loading_this_data = False
    key = strax.DataKey(run_id, d, p.lineage)
    ldr = self._get_partial_loader_for(key,
                                       chunk_number=chunk_number,
                                       time_range=time_range)

    if not ldr and run_id.startswith('_'):
        if time_range is not None:
            raise NotImplementedError("time range loading is not yet "
                                      "supported for superruns")
        sub_run_spec = self.run_metadata(
            run_id, 'sub_run_spec')['sub_run_spec']
        self.make(list(sub_run_spec.keys()), d)

        ldrs = []
        for subrun in sub_run_spec:
            sub_key = strax.DataKey(
                subrun,
                d,
                self._get_plugins((d,), subrun)[d].lineage)
            if sub_run_spec[subrun] == 'all':
                _subrun_time_range = None
            else:
                _subrun_time_range = sub_run_spec[subrun]
            ldr = self._get_partial_loader_for(
                sub_key,
                time_range=_subrun_time_range,
                chunk_number=chunk_number)
            if not ldr:
                raise RuntimeError(
                    f"Could not load {d} for subrun {subrun} "
                    f"even though we made it??")
            ldrs.append(ldr)

        def concat_loader(*args, **kwargs):
            for x in ldrs:
                yield from x(*args, **kwargs)

        ldr = lambda *args, **kwargs: concat_loader(*args, **kwargs)

    if ldr:
        # Found it! No need to make it or look in other frontends
        loading_this_data = True
        loaders[d] = ldr
        del plugins[d]
    else:
        # Data not found anywhere. We will be computing it.
        if (time_range is not None
                and plugins[d].save_when != strax.SaveWhen.NEVER):
            # While the data type providing the time information is
            # available (else we'd have failed earlier), one of the
            # other requested data types is not.
            raise strax.DataNotAvailable(
                f"Time range selection assumes data is already "
                f"available, but {d} for {run_id} is not.")
        if '*' in self.context_config['forbid_creation_of']:
            raise strax.DataNotAvailable(
                f"{d} for {run_id} not found in any storage, and "
                "your context specifies no new data can be created.")
        if d in self.context_config['forbid_creation_of']:
            raise strax.DataNotAvailable(
                f"{d} for {run_id} not found in any storage, and "
                "your context specifies it cannot be created.")
        to_compute[d] = p
        for dep_d in p.depends_on:
            check_cache(dep_d)

    # Should we save this data? If not, return.
    if (loading_this_data
            and not self.context_config['storage_converter']):
        return
    if p.save_when == strax.SaveWhen.NEVER:
        if d in save:
            raise ValueError(f"Plugin forbids saving of {d}")
        return
    elif p.save_when == strax.SaveWhen.TARGET:
        if d not in targets:
            return
    elif p.save_when == strax.SaveWhen.EXPLICIT:
        if d not in save:
            return
    else:
        assert p.save_when == strax.SaveWhen.ALWAYS

    # Warn about conditions that preclude saving, but that the user
    # might not expect.
    if time_range is not None:
        # We're not even getting the whole data.
        # Without this check, saving could be attempted if the
        # storage converter mode is enabled.
        self.log.warning(f"Not saving {d} while "
                         f"selecting a time range in the run")
        return
    if any([len(v) > 0
            for k, v in self._find_options.items()
            if 'fuzzy' in k]):
        # In fuzzy matching mode, we cannot (yet) derive the
        # lineage of any data we are creating. To avoid creating
        # false data entries, we currently do not save at all.
        self.log.warning(f"Not saving {d} while fuzzy matching is"
                         f" turned on.")
        return
    if self.context_config['allow_incomplete']:
        self.log.warning(f"Not saving {d} while loading incomplete"
                         f" data is allowed.")
        return

    # Save the target and any other outputs of the plugin.
    for d_to_save in set([d] + list(p.provides)):
        if d_to_save in savers and len(savers[d_to_save]):
            # This multi-output plugin was scanned before;
            # let's not create duplicate savers
            assert p.multi_output
            continue
        key = strax.DataKey(run_id, d_to_save, p.lineage)
        for sf in self.storage:
            if sf.readonly:
                continue
            if loading_this_data:
                # Usually, we don't save if we're loading...
                if not self.context_config['storage_converter']:
                    continue
                # ... but in storage converter mode we do:
                try:
                    sf.find(key, **self._find_options)
                    # Already have this data in this backend
                    continue
                except strax.DataNotAvailable:
                    # Don't have it, so let's save it!
                    pass
            # If we get here, we must try to save
            try:
                savers[d_to_save].append(
                    sf.saver(key,
                             metadata=p.metadata(run_id, d_to_save)))
            except strax.DataNotAvailable:
                # This frontend cannot save. Too bad.
                pass
def main():
    parser = argparse.ArgumentParser(
        description="Strax Processing With Outsource")
    parser.add_argument('dataset', help='Run number', type=int)
    parser.add_argument('--input_dtype', help='strax input')
    parser.add_argument('--output_dtype', help='strax output')
    parser.add_argument('--context', help='name of context')
    parser.add_argument('--chunks', nargs='*', help='chunk ids to download')
    args = parser.parse_args()

    # Directory where we will be putting everything
    data_dir = './data'

    # Get the context
    st = eval(f'straxen.contexts.{args.context}()')
    st.storage = [strax.DataDirectory(data_dir)]

    runid = args.dataset
    in_dtype = args.input_dtype
    out_dtype = args.output_dtype

    hash = db.get_hash(args.context, in_dtype)

    # Download the input data
    admix.download(runid, in_dtype, hash, chunks=args.chunks,
                   location=data_dir)

    runid_str = "%06d" % runid
    input_metadata = st.get_metadata(runid_str, in_dtype)
    input_key = strax.DataKey(runid_str,
                              in_dtype,
                              input_metadata['lineage'])

    # Initialize the plugin needed for processing
    plugin = st._get_plugins((out_dtype,), runid_str)[out_dtype]
    st._set_plugin_config(plugin, runid_str, tolerant=False)
    plugin.setup()

    # Set up the savers
    savers = dict()
    for keystring in plugin.provides:
        key = strax.DataKey(runid_str, keystring, plugin.lineage)
        saver = st.storage[0].saver(key,
                                    plugin.metadata(runid_str, keystring))
        saver.is_forked = True
        savers[keystring] = saver

    # Set up a few more variables
    backend = st.storage[0].backends[0]
    dtype = literal_eval(input_metadata['dtype'])
    chunk_kwargs = dict(data_type=input_metadata['data_type'],
                        data_kind=input_metadata['data_kind'],
                        dtype=dtype)

    # Process the chunks
    for chunk in args.chunks:
        # Read in the input data for this chunk
        in_data = backend._read_and_format_chunk(
            backend_key=st.storage[0].find(input_key)[1],
            metadata=input_metadata,
            chunk_info=input_metadata['chunks'][int(chunk)],
            dtype=dtype,
            time_range=None,
            chunk_construction_kwargs=chunk_kwargs)

        # Process this chunk
        output_data = plugin.do_compute(chunk_i=chunk,
                                        **{in_dtype: in_data})

        # Save the output; loop, since there could be more than one
        # output dtype
        for keystring, strax_chunk in output_data.items():
            savers[keystring].save(strax_chunk, chunk_i=int(chunk))
def process(runid,
            out_dtype,
            st,
            chunks,
            close_savers=False,
            tmp_path='.tmp_for_strax',
            ):
    runid_str = "%06d" % runid
    t0 = time.time()

    # Initialize the plugin needed for processing this output type
    plugin = st._get_plugins((out_dtype,), runid_str)[out_dtype]
    st._set_plugin_config(plugin, runid_str, tolerant=False)
    plugin.setup()

    # Now move on to processing. If we didn't pass any chunks, we
    # process the whole run; otherwise just the chunks we listed.
    if chunks is None:
        print("Chunks is None -- processing the whole run!")
        for keystring in plugin.provides:
            print(f"Making {keystring}")
            st.make(runid_str, keystring,
                    max_workers=8,
                    allow_multiple=True,
                    )
            print(f"DONE processing {keystring}")

    # Process chunk-by-chunk
    else:
        # Set up the savers
        savers = dict()
        for keystring in plugin.provides:
            print(f"Making {keystring}")
            key = strax.DataKey(runid_str, keystring, plugin.lineage)
            saver = st.storage[0].saver(key,
                                        plugin.metadata(runid_str, keystring))
            saver.is_forked = True
            savers[keystring] = saver

        # Set up a few more variables.
        # TODO not sure exactly how this works when an output plugin
        #  depends on more than one plugin -- maybe that doesn't matter?
        in_dtype = plugin.depends_on[0]
        input_metadata = st.get_metadata(runid_str, in_dtype)
        input_key = strax.DataKey(runid_str,
                                  in_dtype,
                                  input_metadata['lineage'])
        backend = st.storage[0].backends[0]
        dtype = literal_eval(input_metadata['dtype'])
        chunk_kwargs = dict(data_type=input_metadata['data_type'],
                            data_kind=input_metadata['data_kind'],
                            dtype=dtype)

        for chunk in chunks:
            # Read in the input data for this chunk
            chunk_info = None
            for chunk_md in input_metadata['chunks']:
                if chunk_md['chunk_i'] == int(chunk):
                    chunk_info = chunk_md
                    break
            assert chunk_info is not None, f"Could not find chunk_id: {chunk}"

            in_data = backend._read_and_format_chunk(
                backend_key=st.storage[0].find(input_key)[1],
                metadata=input_metadata,
                chunk_info=chunk_info,
                dtype=dtype,
                time_range=None,
                chunk_construction_kwargs=chunk_kwargs,
                )

            # Process this chunk
            output_data = plugin.do_compute(chunk_i=chunk,
                                            **{in_dtype: in_data})

            # Save the output; loop, since there could be more than
            # one output dtype
            for keystring, strax_chunk in output_data.items():
                savers[keystring].save(strax_chunk, chunk_i=int(chunk))

        if close_savers:
            for dtype, saver in savers.items():
                # Copy the metadata to a tmp directory
                tmpdir = os.path.join(tmp_path,
                                      os.path.split(saver.tempdirname)[1])
                os.makedirs(tmpdir, exist_ok=True)
                for file in os.listdir(saver.tempdirname):
                    if file.endswith('json'):
                        src = os.path.join(saver.tempdirname, file)
                        dest = os.path.join(tmpdir, file)
                        copyfile(src, dest)
                saver.close()

    process_time = time.time() - t0
    print(f"=== Processing time for {out_dtype}: "
          f"{process_time / 60:0.2f} minutes ===")
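# Hedged usage sketch for `process` above: drive one run chunk-by-chunk.
# Assumes a configured straxen installation with database access;
# `xenonnt_online` is a real straxen context factory, but the run number,
# dtype, and chunk ids here are made up for illustration.
import strax
import straxen

st = straxen.contexts.xenonnt_online()
st.storage = [strax.DataDirectory('./data')]
process(1, 'peak_basics', st, chunks=[0, 1], close_savers=True)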