def _get_metadata(self, ds_identifier, meta, full):
    foreign = jsonload(self.get_core_metadata_filenames()[0])

    for term in ('name', 'title', 'description', 'keywords', 'version',
                 'homepage'):
        if term in foreign:
            meta[term] = foreign[term]
    if 'author' in foreign:
        meta['author'] = _compact_author(foreign['author'])
    if 'contributors' in foreign:
        meta['contributors'] = [
            _compact_author(c) for c in foreign['contributors']]
    # two license terms were supported at some point
    if 'license' in foreign:
        meta['license'] = _compact_license(foreign['license'])
    if 'licenses' in foreign:
        meta['license'] = [
            _compact_license(l) for l in foreign['licenses']]
    meta['dcterms:conformsTo'] = [
        'http://specs.frictionlessdata.io/data-packages',
        'http://docs.datalad.org/metadata.html#v0-1']
    return meta
def _get_dataset_metadata(self):
    meta = {}
    metadata_path = opj(self.ds.path, self.metadatasrc_fname)
    if not exists(metadata_path):
        return meta
    foreign = jsonload(metadata_path)

    for term in self._key2stdkey:
        if term in foreign:
            meta[self._key2stdkey[term]] = foreign[term]
    if 'author' in foreign:
        meta['author'] = _compact_author(foreign['author'])
    if 'contributors' in foreign:
        meta['contributors'] = [
            _compact_author(c) for c in foreign['contributors']]
    # two license terms were supported at some point
    if 'license' in foreign:
        meta['license'] = _compact_license(foreign['license'])
    if 'licenses' in foreign:
        meta['license'] = [
            _compact_license(l) for l in foreign['licenses']]
    meta['conformsto'] = 'http://specs.frictionlessdata.io/data-packages'
    return meta
def _load_json_object(fpath, cache=None):
    if cache is None:
        cache = {}
    obj = cache.get(
        fpath,
        jsonload(fpath, fixup=True) if lexists(fpath) else {})
    cache[fpath] = obj
    return obj
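# Hedged usage sketch for _load_json_object (the path below is hypothetical,
# not taken from the source): a shared dict keeps one parsed object per file
# across repeated lookups. Note that dict.get() evaluates its default argument
# eagerly, so a call still reads and parses the file when it exists; only the
# object handed back to the caller is the previously cached one.
json_cache = {}
first = _load_json_object('/tmp/ds/.datalad/metadata/ds.json', cache=json_cache)
again = _load_json_object('/tmp/ds/.datalad/metadata/ds.json', cache=json_cache)
assert again is json_cache['/tmp/ds/.datalad/metadata/ds.json']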
def _yield_dsmeta(ds):
    srcfiles, cfg_srcfiles = _get_dsmeta_srcfiles(ds)
    dsmeta = {}
    for srcfile in srcfiles:
        abssrcfile = ds.pathobj / PurePosixPath(srcfile)
        # TODO get annexed files, or do in a central place?
        if not abssrcfile.exists():
            # nothing to load
            # warn if this was configured
            if srcfile in cfg_srcfiles:
                yield dict(
                    path=ds.path,
                    type='dataset',
                    status='impossible',
                    message=(
                        'configured custom metadata source is not '
                        'available in %s: %s',
                        ds, srcfile),
                )
                # no further operation on half-broken metadata
                return
            # an absent, merely optional source: nothing else to do
            continue
        lgr.debug('Load custom metadata from %s', abssrcfile)
        meta = jsonload(text_type(abssrcfile))
        dsmeta.update(meta)
    if dsmeta:
        yield dict(
            path=ds.path,
            metadata=dsmeta,
            type='dataset',
            status='ok',
        )
def _load_json_object(fpath, cache=None):
    if cache is None:
        cache = {}
    obj = cache.get(
        fpath,
        jsonload(fpath, fixup=True) if op.lexists(fpath) else {})
    cache[fpath] = obj
    return obj
def __call__(self, dataset, refcommit, process_type, status):
    # shortcut
    ds = dataset

    log_progress(
        lgr.info,
        'extractorcustom',
        'Start custom metadata extraction from %s', ds,
        total=len(status) + 1,
        label='Custom metadata extraction',
        unit=' Files',
    )
    if process_type in ('all', 'content'):
        mfile_expr = _get_fmeta_expr(ds)
        for rec in status:
            log_progress(
                lgr.info,
                'extractorcustom',
                'Extracted custom metadata from %s', rec['path'],
                update=1,
                increment=True)
            # build metadata file path
            meta_fpath = _get_fmeta_objpath(ds, mfile_expr, rec)
            if meta_fpath is not None and op.exists(meta_fpath):
                try:
                    meta = jsonload(text_type(meta_fpath))
                    if meta:
                        yield dict(
                            path=rec['path'],
                            metadata=meta,
                            type=rec['type'],
                            status='ok',
                        )
                except Exception as e:
                    yield dict(
                        path=rec['path'],
                        type=rec['type'],
                        status='error',
                        message=exc_str(e),
                    )

    if process_type in ('all', 'dataset'):
        for r in _yield_dsmeta(ds):
            yield r
        log_progress(
            lgr.info,
            'extractorcustom',
            'Extracted custom metadata from %s', ds.path,
            update=1,
            increment=True)

    log_progress(
        lgr.info,
        'extractorcustom',
        'Finished custom metadata extraction from %s', ds.path
    )
def get_metadata(ds, guess_type=False, ignore_subdatasets=False,
                 ignore_cache=False):
    # common identifier
    ds_identifier = ds.id
    # metadata receptacle
    meta = []
    # where things are
    meta_path = opj(ds.path, metadata_basepath)
    main_meta_fname = opj(meta_path, metadata_filename)

    # from cache?
    if ignore_cache or not exists(main_meta_fname):
        # start with the implicit meta data, currently there is no cache for
        # this type of meta data, as it will change with every clone.
        # In contrast, native meta data is cached.
        implicit_meta = _get_implicit_metadata(ds, ds_identifier)
        meta.append(implicit_meta)
        # and any native meta data
        meta.extend(
            get_native_metadata(
                ds,
                guess_type=guess_type,
                ds_identifier=ds_identifier))
    else:
        # from cache
        cached_meta = jsonload(main_meta_fname)
        if isinstance(cached_meta, list):
            meta.extend(cached_meta)
        else:
            meta.append(cached_meta)
        # cached meta data doesn't have version info for the top-level
        # dataset -> look for the item and update it
        for m in meta:
            if not is_implicit_metadata(m):
                continue
            if m.get('@id', None) == ds_identifier:
                m.update(_get_implicit_metadata(ds, ds_identifier))
                break

    if ignore_subdatasets:
        # all done now
        return meta

    from datalad.metadata.parsers.aggregate import MetadataParser as AggregateParser
    agg_parser = AggregateParser(ds)
    if agg_parser.has_metadata():
        agg_meta = agg_parser.get_metadata(ds_identifier)
        # try hard to keep things a simple non-nested list
        if isinstance(agg_meta, list):
            meta.extend(agg_meta)
        else:
            meta.append(agg_meta)

    return meta
def _get_dataset_metadata(self):
    """
    Returns
    -------
    dict
      keys are homogenized datalad metadata keys, values are arbitrary
    """
    fpath = opj(self.ds.path, self._dataset_metadata_filename)
    obj = {}
    if exists(fpath):
        obj = jsonload(fpath, fixup=True)
    if 'definition' in obj:
        obj['@context'] = obj['definition']
        del obj['definition']
    obj['@id'] = self.ds.id
    return obj
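# Hedged illustration (hypothetical file content, not from the source): the
# dataset metadata JSON may carry a 'definition' term mapping, which the
# extractor above renames to the JSON-LD '@context' key before adding '@id'.
#
#   {
#       "definition": {"species": "http://example.com/terms#species"},
#       "species": "penguin"
#   }
#
# yields {'@context': {...}, 'species': 'penguin', '@id': <dataset id>}.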
def _finalize_record(r):
    msg, rec = _split_record_message(r.pop('body', []))
    r['message'] = msg
    # TODO this can also just be a runrecord ID in which case we need
    # to load the file and report its content
    rec = jsonloads(rec)
    if not isinstance(rec, dict):
        # this is a runinfo file name
        rec = jsonload(
            text_type(ds.pathobj / '.datalad' / 'runinfo' / rec),
            # TODO this should not be necessary, instead jsonload()
            # should be left on auto, and `run` should save compressed
            # files with an appropriate extension
            compressed=True,
        )
    r['run_record'] = rec
    return r
def _get_metadata(self, ds_identifier, meta, full):
    bids = jsonload(self.get_core_metadata_filenames()[0])

    # TODO maybe normalize labels of standard licenses to definition URIs
    # perform mapping
    for bidsterm, dataladterm in (('Name', 'name'),
                                  ('License', 'license'),
                                  ('Authors', 'author'),
                                  ('ReferencesAndLinks', 'citation'),
                                  ('Funding', 'foaf:fundedBy'),
                                  ('Description', 'description')):
        if bidsterm in bids:
            meta[dataladterm] = bids[bidsterm]

    README_fname = opj(self.ds.path, 'README')
    if not meta.get('description') and exists(README_fname):
        # BIDS uses the README to provide a description, so if none was
        # given explicitly (which could override a longer README), just
        # load the README
        try:
            desc = open(README_fname, encoding="utf-8").read()
        except UnicodeDecodeError as exc:
            lgr.warning(
                "Failed to decode content of %s. "
                "Re-loading allowing for UTF-8 errors with replacement: %s"
                % (README_fname, exc_str(exc))
            )
            desc = open(README_fname, encoding="utf-8",
                        errors="replace").read()
        meta['description'] = desc.strip()

    compliance = ["http://docs.datalad.org/metadata.html#v0-1"]
    # special case
    if bids.get('BIDSVersion'):
        compliance.append(
            'http://bids.neuroimaging.io/bids_spec{}.pdf'.format(
                bids['BIDSVersion'].strip()))
    else:
        compliance.append('http://bids.neuroimaging.io')
    meta['dcterms:conformsTo'] = compliance
    return meta
def get_metadata(self, dsid=None, full=False):
    base_meta = _get_base_dataset_metadata(dsid if dsid else self.ds.id)
    meta = [base_meta]
    basepath = opj(self.ds.path, '.datalad', 'meta')

    parts = []
    for subds_meta_fname in self.get_core_metadata_filenames():
        # get the part between the 'meta' dir and the filename,
        # which is the subdataset mountpoint
        # (the -10 presumably strips a trailing '/meta.json')
        subds_path = subds_meta_fname[len(basepath) + 1:-10]
        if not subds_path:
            # this is a potentially existing cache of the native meta data
            # of the superdataset, not for us...
            continue
        submeta_info = {'location': subds_path}
        # load aggregated meta data
        subds_meta = jsonload(subds_meta_fname)
        # we cannot simply append, or we get weird nested graphs;
        # the proper way would be to expand the JSON-LD, extend the list and
        # compact/flatten at the end. However, assuming a single context
        # we can cheat.
        subds_meta = _simplify_meta_data_structure(subds_meta)
        _adjust_subdataset_location(subds_meta, subds_path)

        # sift through all meta data sets and look for one that
        # knows about being part of this dataset, so we record its @id as
        # a part
        for md in subds_meta:
            cand_id = md.get('dcterms:isPartOf', None)
            if cand_id == dsid and '@id' in md:
                submeta_info['@id'] = md['@id']
                break

        if subds_meta:
            meta.extend(subds_meta)
        parts.append(submeta_info)

    if len(parts):
        if len(parts) == 1:
            parts = parts[0]
        base_meta['dcterms:hasPart'] = parts

    return meta
def _get_metadata(self, ds_identifier, meta, full):
    bids = jsonload(self.get_core_metadata_filenames()[0])

    # TODO maybe normalize labels of standard licenses to definition URIs
    # perform mapping
    for bidsterm, dataladterm in (('Name', 'name'),
                                  ('License', 'license'),
                                  ('Authors', 'author'),
                                  ('ReferencesAndLinks', 'citation'),
                                  ('Funding', 'foaf:fundedBy'),
                                  ('Description', 'description')):
        if bidsterm in bids:
            meta[dataladterm] = bids[bidsterm]

    compliance = ["http://docs.datalad.org/metadata.html#v0-1"]
    # special case
    if bids.get('BIDSVersion'):
        compliance.append(
            'http://bids.neuroimaging.io/bids_spec{}.pdf'.format(
                bids['BIDSVersion'].strip()))
    else:
        compliance.append('http://bids.neuroimaging.io')
    meta['dcterms:conformsTo'] = compliance
    return meta
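# Hedged illustration of the BIDS -> datalad term mapping above, using a
# made-up dataset_description.json payload (not from the source):
bids_example = {'Name': 'demo study', 'BIDSVersion': '1.0.2',
                'Authors': ['A. Person', 'B. Person']}
mapped = {}
for bidsterm, dataladterm in (('Name', 'name'), ('Authors', 'author')):
    if bidsterm in bids_example:
        mapped[dataladterm] = bids_example[bidsterm]
# mapped == {'name': 'demo study', 'author': ['A. Person', 'B. Person']},
# and 'dcterms:conformsTo' would additionally point at
# 'http://bids.neuroimaging.io/bids_spec1.0.2.pdf'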
def _get_dataset_metadata(self):
    """
    Returns
    -------
    dict
      keys are homogenized datalad metadata keys, values are arbitrary
    """
    fpath = opj(self.ds.path, DATASET_METADATA_FILE)
    obj = {}
    if exists(fpath):
        obj = jsonload(fpath, fixup=True)
    if 'definition' in obj:
        obj['@context'] = obj['definition']
        del obj['definition']
    obj['@id'] = self.ds.id
    subdsinfo = [
        {
            # this version would change anytime we aggregate metadata,
            # let's not do this for now
            #'version': sds['revision'],
            'type': sds['type'],
            'name': sds['gitmodule_name'],
        }
        for sds in subdatasets(
            dataset=self.ds,
            recursive=False,
            return_type='generator',
            result_renderer='disabled',
            on_failure='ignore')
    ]
    if subdsinfo:
        obj['haspart'] = subdsinfo
    superds = self.ds.get_superdataset(registered_only=True, topmost=False)
    if superds:
        obj['ispartof'] = {
            '@id': superds.id,
            'type': 'dataset',
        }
    return obj
def _get_dataset_metadata(self):
    """
    Returns
    -------
    dict
      keys are homogenized datalad metadata keys, values are arbitrary
    """
    fpath = opj(self.ds.path, self._dataset_metadata_filename)
    obj = {}
    if exists(fpath):
        obj = jsonload(fpath, fixup=True)
    if 'definition' in obj:
        obj['@context'] = obj['definition']
        del obj['definition']
    obj['@id'] = self.ds.id
    subdsinfo = [{
        # this version would change anytime we aggregate metadata,
        # let's not do this for now
        #'version': sds['revision'],
        'type': sds['type'],
        'name': sds['gitmodule_name'],
    } for sds in subdatasets(
        dataset=self.ds,
        recursive=False,
        return_type='generator',
        result_renderer='disabled')
    ]
    if subdsinfo:
        obj['haspart'] = subdsinfo
    superds = self.ds.get_superdataset(registered_only=True, topmost=False)
    if superds:
        obj['ispartof'] = {
            '@id': superds.id,
            'type': 'dataset',
        }
    return obj
def __call__(self, dataset, refcommit, process_type, status):
    # shortcut
    ds = dataset

    log_progress(
        lgr.info,
        'extractorcustom',
        'Start custom metadata extraction from %s', ds,
        total=len(status) + 1,
        label='Custom metadata extraction',
        unit=' Files',
    )
    if process_type in ('all', 'content'):
        mfile_expr = _get_fmeta_expr(ds)
        for rec in status:
            log_progress(
                lgr.info, 'extractorcustom',
                'Extracted custom metadata from %s', rec['path'],
                update=1, increment=True)
            # build metadata file path
            meta_fpath = _get_fmeta_objpath(ds, mfile_expr, rec)
            if meta_fpath is not None and op.exists(meta_fpath):
                try:
                    meta = jsonload(text_type(meta_fpath))
                    if isinstance(meta, dict) and meta \
                            and '@id' not in meta:
                        # in case we have a single, top-level
                        # document, and it has no ID: assume that
                        # it describes the file and assign the
                        # datalad file ID
                        meta['@id'] = get_file_id(rec)
                    if meta:
                        yield dict(
                            path=rec['path'],
                            metadata=meta,
                            type=rec['type'],
                            status='ok',
                        )
                except Exception as e:
                    yield dict(
                        path=rec['path'],
                        type=rec['type'],
                        status='error',
                        message=exc_str(e),
                    )

    if process_type in ('all', 'dataset'):
        for r in _yield_dsmeta(ds):
            yield r
        log_progress(
            lgr.info, 'extractorcustom',
            'Extracted custom metadata from %s', ds.path,
            update=1, increment=True)

    log_progress(
        lgr.info, 'extractorcustom',
        'Finished custom metadata extraction from %s', ds.path)