def _index(doc, metadata, op_type='index', _addon=MappingProxyType({})):
    """
    Build a dataset document from the given parts and save it.

    Args:
        doc: a ValidatedDict whose items become the ESDataset fields.
        metadata: a ValidatedDict stored under the '_meta' field.
        op_type: forwarded unchanged to the document's save() call.
        _addon: read-only mapping of internal values to carry over.
            Recognized keys are "n3c_url", "n3c_status",
            "n3c_timestamp" and "date_created"; missing keys yield None
            (or, for "date_created", the current UTC time).

    Returns:
        The saved ESDataset instance.

    Raises:
        DatasetValidationError: the document failed elasticsearch_dsl
            validation on save.
        ConflictError: Elasticsearch reported a conflict on save.
    """
    assert isinstance(doc, ValidatedDict)
    assert isinstance(metadata, ValidatedDict)

    # One timezone-aware timestamp, shared by both '_ts' entries.
    _now = datetime.now(timezone.utc)

    dataset = ESDataset(**doc)
    dataset['_meta'] = metadata
    dataset['_n3c'] = {
        "url": _addon.get("n3c_url"),
        "status": _addon.get("n3c_status"),
        "timestamp": _addon.get("n3c_timestamp"),
    }
    dataset['_ts'] = {
        # Keep the carried-over creation time when there is one.
        "date_created": _addon.get("date_created") or _now,
        "last_updated": _now,
    }

    try:
        dataset.save(op_type=op_type)
    except elasticsearch_dsl.ValidationException as exc:
        # Chain the cause so the underlying error stays in the traceback.
        raise DatasetValidationError(str(exc)) from exc
    except elasticsearch.exceptions.ConflictError as exc:
        raise ConflictError("document already exists.") from exc

    return dataset
def update_url():
    """
    Set the '_n3c' url on datasets whose name equals a known title.

    For every (title, url) pair from get_titles(), run a match query on
    the name field, and for each hit whose name is exactly the title,
    fetch the document by id, assign {"url": url} to '_n3c' and save it.
    Progress is printed to stdout.
    """
    for title, url in get_titles():
        print(title)
        print(url)
        matches = Dataset.search().query("match", name=title)
        for hit in matches:
            # since we did a match query on a text field,
            # only act on hits whose name is exactly the title
            if hit.name != title:
                continue
            doc_id = hit.meta.id
            print(doc_id)
            record = Dataset.get(doc_id)
            record["_n3c"] = {"url": url}
            record.save()
        print()
def _clean(metadict, defdict=None):
    """
    Normalize a metadata dict to the field names declared for '_meta'.

    Keys that are declared fields are kept; known aliases are rewritten
    to their field name; anything else is rejected. Entries of defdict
    (cleaned the same way) fill in fields that metadict did not set.

    Returns a ValidatedDict subclass instance whose item lookup also
    accepts the alias names.

    Raises RegistryError on an unsupported key, or when two keys
    resolve to the same field.
    """
    defdict = defdict or {}  # defaults

    assert isinstance(metadict, dict)
    assert isinstance(defdict, dict)

    # declared fields in database
    meta_doc = ESDataset._ObjectBase__get_field('_meta')._doc_class()
    declared = {
        name for name, _field, _flag in meta_doc._ObjectBase__list_fields()
    }

    # auto-correct
    field_to_aliases = {
        # database key: (aliases, )
        "username": ("user", "owner"),
        "class_id": ("schema", "schema_class", "schema_class_id", "type"),
    }
    alias_lookup = {
        alias: field
        for field, aliases in field_to_aliases.items()
        for alias in aliases
    }

    E01 = "repeated metadata field '{}'."
    E02 = "unsupported metadata field '{}'."

    cleaned = {}
    for key, val in metadict.items():
        # Declared field names take precedence over aliases.
        if key in declared:
            target = key
        elif key in alias_lookup:
            target = alias_lookup[key]
        else:  # undefined key
            raise RegistryError(E02.format(key))
        if target in cleaned:
            raise RegistryError(E01.format(key))
        cleaned[target] = val

    # default values
    if defdict:
        for key, val in _clean(defdict).items():
            cleaned.setdefault(key, val)

    # result
    class AliasDict(ValidatedDict):
        def __getitem__(self, key):
            # translate an alias to its field name before the lookup
            return super().__getitem__(alias_lookup.get(key, key))

    return AliasDict(cleaned)
def exists(anyid=None, **multi_match):  # TODO multimatch
    """
    Tell whether a dataset document exists.

    With anyid, the value is checked against the document _id first and
    then against the identifier field; keyword conditions are not
    consulted in that case. Without anyid, the keyword conditions are
    passed on as the match criteria.

    Examples:
        dataset.exists('83dc3401f86819de')
        dataset.exists('EGAD00001003941')
        dataset.exists(name="Wellderly Dataset from Scripps CTSA center")

    Raises RegistryError when no condition is given at all.
    """
    if not anyid and not multi_match:
        raise RegistryError("specify at least one condition.")

    if not anyid:
        return ESDataset.exists(**multi_match)

    by_id = ESDataset.exists(_id=anyid)
    if by_id:
        return by_id
    return ESDataset.exists(identifier=anyid)
def get_meta(_id):
    """
    Return the metadata of the dataset document with this _id.

    Only the "_meta" source field is requested from the index.
    Raises NoEntityError when no document is found.
    """
    found = ESDataset.get(id=_id, ignore=404, _source="_meta")
    if not found:
        raise NoEntityError(f"dataset {_id} does not exist.")

    wrapped = RegistryDocument.wraps(found)
    return wrapped.meta
def get(_id):
    """
    Return the dataset document with this _id, wrapped as a
    RegistryDocument.

    Lookup is by _id only; the identifier field cannot be used here,
    which gives a weak privacy assurance.
    Raises NoEntityError when no document is found.
    """
    found = ESDataset.get(id=_id, ignore=404)
    if not found:
        raise NoEntityError(f"dataset {_id} does not exist.")

    return RegistryDocument.wraps(found)
def delete(_id):
    """
    Delete the dataset metadata document with this _id.

    If only the identifier is known, look up the _id first and then
    delete by _id.

    Returns the name of the deleted document as confirmation.
    Raises NoEntityError when no document is found.
    """
    target = ESDataset.get(id=_id, ignore=404)
    if target:
        target.delete()
        return target.name

    raise NoEntityError(f"dataset {_id} does not exist.")
def _build(metafilter):
    """
    Turn a metadata filter into a search over datasets.

    The 'private' entry is removed from metafilter (the argument is
    modified in place) and handled separately; the remaining entries
    are passed to ESDataset.find as filters. Private and public
    datasets are never returned together.
    """
    assert isinstance(metafilter, ValidatedDict)

    # 'private' is not passed through as an ordinary filter
    wants_private = metafilter.pop('private', None)

    # everything else goes in as _meta field filters
    base = ESDataset.find(**metafilter)

    if wants_private:
        # explicitly asked for private datasets: return only those
        return base.filter('match', _meta__private=True)

    # otherwise return only the public datasets of that criterion
    return base.exclude('match', _meta__private=True)
def update(_id, new_doc, **metadata):
    """
    Replace a dataset metadata document with a revalidated new version.

    The stored metadata is patched first with the '_meta' entry of
    new_doc (if any) and then with the keyword metadata; the result is
    used to validate and re-index the document.

    Returns the document version after the update (1, 2, ...).
    Raises NoEntityError when the _id is unknown, and ConflictError
    when new_doc carries a different identifier.
    """
    # NOTE
    # Internally, the update is performed by
    # revalidating and replacing the original document.
    document = ensure_document(new_doc)
    existing = ESDataset.get(id=_id, ignore=404)
    if not existing:
        raise NoEntityError(f"dataset {_id} does not exist.")

    # The identifier field cannot change, because that would change
    # the document _id. Delete and add again instead.
    if document.get('identifier') != existing.identifier:
        raise ConflictError("cannot change identifier field.")

    # NOTE **important**
    # Patch the original document metadata with the partial update:
    # the embedded '_meta' first, then the keyword metadata.
    merged = existing['_meta'].to_dict()
    for patch in (document.pop('_meta', {}), metadata):
        merged.update(_clean(patch))
    merged = _clean(merged)

    document = validate(document, merged['schema'])

    # Carry over our internal metadata like
    # N3C ticket info and creation timestamp.
    carried = {
        "date_created": existing._ts.date_created,
        "n3c_url": existing._n3c.url,
        "n3c_status": existing._n3c.status,
        "n3c_timestamp": existing._n3c.timestamp,
    }
    saved = _index(document, merged, _addon=carried)
    return saved.meta.version