def _find(self, key, write, fuzzy_for, fuzzy_for_options): # Check exact match / write case dirname = osp.join(self.path, str(key)) bk = self.backend_key(dirname) if osp.exists(dirname): if write: if self._can_overwrite(key): return bk raise strax.DataExistsError(at=bk) return bk if write: return bk if not fuzzy_for and not fuzzy_for_options: raise strax.DataNotAvailable # Check metadata of all potentially matching data dirs for match... for dirname in os.listdir(self.path): fn = osp.join(self.path, dirname) if not osp.isdir(fn): continue _run_id, _data_type, _ = dirname.split('_') if _run_id != key.run_id or _data_type != key.data_type: continue # TODO: check for broken data metadata = self.backends[0].get_metadata(fn) if self._matches(metadata['lineage'], key.lineage, fuzzy_for, fuzzy_for_options): return self.backend_key(dirname) raise strax.DataNotAvailable
def _find(self, key, write, allow_incomplete, fuzzy_for, fuzzy_for_options): """Determine if data exists Search the S3 store to see if data is there. """ if fuzzy_for or fuzzy_for_options: raise NotImplementedError("Can't do fuzzy with S3") key_str = str(key) bk = self.backend_key(key_str) try: self.backends[0].get_metadata(key) except ClientError as ex: if ex.response['Error']['Code'] == 'NoSuchKey': if write: return bk else: raise strax.DataNotAvailable else: raise ex if write and not self._can_overwrite(key): raise strax.DataExistsError(at=bk) return bk
def _find(self, key, write, allow_incomplete, fuzzy_for, fuzzy_for_options): dirname = osp.join(self.path, str(key)) exists = os.path.exists(dirname) bk = self.backend_key(dirname) if write: if exists and not self._can_overwrite(key): raise strax.DataExistsError(at=dirname) return bk if allow_incomplete: # Check for incomplete data (only exact matching for now) if fuzzy_for or fuzzy_for_options: raise NotImplementedError( "Mixing of fuzzy matching and allow_incomplete " "not supported by DataDirectory.") tempdirname = dirname + '_temp' bk = self.backend_key(tempdirname) if osp.exists(tempdirname): return bk # Check exact match if exists and self._folder_matches(dirname, key, None, None): return bk # Check metadata of all potentially matching data dirs for match... for fn in self._subfolders(): if self._folder_matches(fn, key, fuzzy_for, fuzzy_for_options): return self.backend_key(fn) raise strax.DataNotAvailable
def _find(self, key, write, fuzzy_for, fuzzy_for_options):
    """Determine if data exists

    Search the S3 store to see if data is there.
    """
    if fuzzy_for or fuzzy_for_options:
        raise NotImplementedError("Can't do fuzzy with S3")

    prefix = str(key)
    result_key = self.backend_key(prefix)

    # Any object stored under this prefix means data for the key exists
    listing = self.s3.list_objects(Bucket=BUCKET_NAME, Prefix=prefix)
    data_present = 'Contents' in listing

    if data_present:
        if write and not self._can_overwrite(key):
            raise strax.DataExistsError(at=result_key)
        return result_key

    # No objects yet: writers get a fresh key, readers get nothing
    if write:
        return result_key
    raise strax.DataNotAvailable
def _find(self, key: strax.DataKey, write, allow_incomplete, fuzzy_for, fuzzy_for_options): if fuzzy_for or fuzzy_for_options: raise NotImplementedError("Can't do fuzzy with RunDB yet.") # Check if the run exists if self.runid_field == 'name': run_query = {'name': key.run_id} else: run_query = {'number': int(key.run_id)} dq = self._data_query(key) doc = self.collection.find_one({**run_query, **dq}, projection=dq) if doc is None: # Data was not found if not write: raise strax.DataNotAvailable output_path = os.path.join(self.new_data_path, str(key)) if self.new_data_path is not None: doc = self.collection.find_one(run_query, projection={'_id'}) if not doc: raise ValueError( f"Attempt to register new data for non-existing run {key.run_id}" ) # noqa self.collection.find_one_and_update( {'_id': doc['_id']}, { '$push': { 'data': { 'location': output_path, 'host': self.hostname, 'type': key.data_type, 'protocol': strax.FileSytemBackend.__name__, # TODO: duplication with metadata stuff elsewhere? 'meta': { 'lineage': key.lineage } } } }) return (strax.FileSytemBackend.__name__, output_path) datum = doc['data'][0] if write and not self._can_overwrite(key): raise strax.DataExistsError(at=datum['location']) return datum['protocol'], datum['location']
def _find(self, key, write, allow_incomplete, fuzzy_for, fuzzy_for_options): self.raise_if_non_compatible_run_id(key.run_id) dirname = osp.join(self.path, str(key)) exists = os.path.exists(dirname) bk = self.backend_key(dirname) if write: if exists and not self._can_overwrite(key): raise strax.DataExistsError(at=dirname) return bk if allow_incomplete and not exists: # Check for incomplete data (only exact matching for now) if fuzzy_for or fuzzy_for_options: raise NotImplementedError( "Mixing of fuzzy matching and allow_incomplete " "not supported by DataDirectory.") tempdirname = dirname + '_temp' bk = self.backend_key(tempdirname) if osp.exists(tempdirname): return bk # Check exact match if exists and self._folder_matches(dirname, key, None, None): return bk # Check metadata of all potentially matching data dirs for # matches. This only makes sense for fuzzy searches since # otherwise we should have had an exact match already. (Also # really slows down st.select runs otherwise because we doing an # entire search over all the files in self._subfolders for all # non-available keys). if fuzzy_for or fuzzy_for_options: for fn in self._subfolders(): if self._folder_matches(fn, key, fuzzy_for, fuzzy_for_options): return self.backend_key(fn) raise strax.DataNotAvailable
def _find(self, key: strax.DataKey, write, allow_incomplete, fuzzy_for, fuzzy_for_options): if fuzzy_for or fuzzy_for_options: raise NotImplementedError("Can't do fuzzy with RunDB yet.") # Check if the run exists if self.runid_field == 'name': run_query = {'name': str(key.run_id)} else: run_query = {'number': int(key.run_id)} # Check that we are in rucio backend if self.rucio_path is not None: rucio_key = self.key_to_rucio_did(key) dq = { 'data': { '$elemMatch': { # TODO can we query smart on the lineage_hash? 'type': key.data_type, 'did': rucio_key, 'protocol': 'rucio' } } } doc = self.collection.find_one({**run_query, **dq}, projection=dq) if doc is not None: datum = doc['data'][0] assert datum.get( 'did', '' ) == rucio_key, f'Expected {rucio_key} got data on {datum["location"]}' backend_name, backend_key = datum[ 'protocol'], f'{key.run_id}-{key.data_type}-{key.lineage_hash}' return backend_name, backend_key dq = self._data_query(key) doc = self.collection.find_one({**run_query, **dq}, projection=dq) if doc is None: # Data was not found if not write: raise strax.DataNotAvailable output_path = os.path.join(self.new_data_path, str(key)) if self.new_data_path is not None: doc = self.collection.find_one(run_query, projection={'_id'}) if not doc: raise ValueError( f"Attempt to register new data for non-existing run {key.run_id}" ) # noqa self.collection.find_one_and_update( {'_id': doc['_id']}, { '$push': { 'data': { 'location': output_path, 'host': self.hostname, 'type': key.data_type, 'protocol': strax.FileSytemBackend.__name__, # TODO: duplication with metadata stuff elsewhere? 'meta': { 'lineage': key.lineage } } } }) return (strax.FileSytemBackend.__name__, output_path) datum = doc['data'][0] if write and not self._can_overwrite(key): raise strax.DataExistsError(at=datum['location']) return datum['protocol'], datum['location']
def _find(self, key: strax.DataKey, write, allow_incomplete, fuzzy_for, fuzzy_for_options): if key.run_id.startswith('_'): # Superruns are currently not supprorted.. raise strax.DataNotAvailable if fuzzy_for or fuzzy_for_options: warnings.warn( "Can't do fuzzy with RunDB yet. Only returning exact matches") # Check if the run exists if self.runid_field == 'name': run_query = {'name': str(key.run_id)} else: run_query = {'number': int(key.run_id)} # Check that we are in rucio backend if self.rucio_path is not None: rucio_key = key_to_rucio_did(key) rucio_available_query = self.available_query[-1] dq = { 'data': { '$elemMatch': { 'type': key.data_type, 'did': rucio_key, **rucio_available_query, }, } } doc = self.collection.find_one({ **run_query, **dq, }, projection=dq) if doc is not None: datum = doc['data'][0] error_message = f'Expected {rucio_key} got data on {datum["location"]}' if datum.get('did', '') != rucio_key: raise RuntimeError(error_message) backend_name = 'RucioLocalBackend' backend_key = key_to_rucio_did(key) return backend_name, backend_key dq = self._data_query(key) doc = self.collection.find_one({**run_query, **dq}, projection=dq) if doc is None: # Data was not found if not write: raise strax.DataNotAvailable output_path = os.path.join(self.new_data_path, str(key)) if self.new_data_path is not None: doc = self.collection.find_one(run_query, projection={'_id'}) if not doc: raise ValueError(f"Attempt to register new data for" f" non-existing run {key.run_id}") self.collection.find_one_and_update({'_id': doc['_id']}, { '$push': { 'data': { 'location': output_path, 'host': self.hostname, 'type': key.data_type, 'protocol': strax.FileSytemBackend.__name__, 'meta': { 'lineage': key.lineage } } } }) return (strax.FileSytemBackend.__name__, output_path) datum = doc['data'][0] if datum['host'] == 'rucio-catalogue': raise strax.DataNotAvailable if write and not self._can_overwrite(key): raise strax.DataExistsError(at=datum['location']) return datum['protocol'], 
datum['location']