Example #1
    def _get_metadata(self, ds_identifier, meta, full):
        foreign = jsonload(self.get_core_metadata_filenames()[0])

        for term in ('name', 'title', 'description', 'keywords', 'version',
                     'homepage'):
            if term in foreign:
                meta[term] = foreign[term]
        if 'author' in foreign:
            meta['author'] = _compact_author(foreign['author'])
        if 'contributors' in foreign:
            meta['contributors'] = [
                _compact_author(c) for c in foreign['contributors']
            ]
        # two license terms were supported at some point
        if 'license' in foreign:
            meta['license'] = _compact_license(foreign['license'])
        if 'licenses' in foreign:
            meta['license'] = [
                _compact_license(l) for l in foreign['licenses']
            ]

        meta['dcterms:conformsTo'] = [
            'http://specs.frictionlessdata.io/data-packages',
            'http://docs.datalad.org/metadata.html#v0-1'
        ]

        return meta
Example #2
    def _get_metadata(self, ds_identifier, meta, full):
        foreign = jsonload(
            self.get_core_metadata_filenames()[0])

        for term in (
                'name', 'title', 'description', 'keywords', 'version',
                'homepage'):
            if term in foreign:
                meta[term] = foreign[term]
        if 'author' in foreign:
            meta['author'] = _compact_author(foreign['author'])
        if 'contributors' in foreign:
            meta['contributors'] = [_compact_author(c)
                                    for c in foreign['contributors']]
        # two license terms were supported at some point
        if 'license' in foreign:
            meta['license'] = _compact_license(foreign['license'])
        if 'licenses' in foreign:
            meta['license'] = [_compact_license(l) for l in foreign['licenses']]

        meta['dcterms:conformsTo'] = [
            'http://specs.frictionlessdata.io/data-packages',
            'http://docs.datalad.org/metadata.html#v0-1']

        return meta
Example #3
    def _get_dataset_metadata(self):
        meta = {}
        metadata_path = opj(self.ds.path, self.metadatasrc_fname)
        if not exists(metadata_path):
            return meta
        foreign = jsonload(metadata_path)

        for term in self._key2stdkey:
            if term in foreign:
                meta[self._key2stdkey[term]] = foreign[term]
        if 'author' in foreign:
            meta['author'] = _compact_author(foreign['author'])
        if 'contributors' in foreign:
            meta['contributors'] = [
                _compact_author(c) for c in foreign['contributors']
            ]
        # two license terms were supported at some point
        if 'license' in foreign:
            meta['license'] = _compact_license(foreign['license'])
        if 'licenses' in foreign:
            meta['license'] = [
                _compact_license(l) for l in foreign['licenses']
            ]

        meta['conformsto'] = 'http://specs.frictionlessdata.io/data-packages'

        return meta
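
The term remapping above can be illustrated with plain dictionaries. In the sketch below, the key2stdkey mapping and the datapackage.json content are made up for demonstration; they are not DataLad's actual tables, and _compact_author()/_compact_license() are left out.

# hypothetical stand-ins for self._key2stdkey and a loaded datapackage.json
key2stdkey = {'title': 'name', 'description': 'description', 'keywords': 'tag'}
foreign = {'title': 'Demo package', 'keywords': ['demo', 'json'], 'other': 1}

# copy only known terms, under their homogenized key names
meta = {key2stdkey[term]: foreign[term] for term in key2stdkey if term in foreign}
# meta is now {'name': 'Demo package', 'tag': ['demo', 'json']}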
Example #4
def _load_json_object(fpath, cache=None):
    if cache is None:
        cache = {}
    obj = cache.get(fpath,
                    jsonload(fpath, fixup=True) if lexists(fpath) else {})
    cache[fpath] = obj
    return obj
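
One detail of the caching pattern above: dict.get() evaluates its default argument eagerly, so jsonload() is executed on every call even when fpath is already in the cache. Below is a minimal, self-contained sketch of the same idea that skips the read on a cache hit; it uses only the standard library, and load_json_cached is an illustrative name, not part of DataLad.

import json
import os.path as op

def load_json_cached(fpath, cache=None):
    # one shared dict per call site lets repeated lookups reuse parsed objects
    if cache is None:
        cache = {}
    if fpath not in cache:
        if op.lexists(fpath):
            # only touch the filesystem for paths not seen before
            with open(fpath) as f:
                cache[fpath] = json.load(f)
        else:
            cache[fpath] = {}
    return cache[fpath]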
Example #5
def _yield_dsmeta(ds):
    srcfiles, cfg_srcfiles = _get_dsmeta_srcfiles(ds)
    dsmeta = {}
    for srcfile in srcfiles:
        abssrcfile = ds.pathobj / PurePosixPath(srcfile)
        # TODO get annexed files, or do in a central place?
        if not abssrcfile.exists():
            # nothing to load
            # warn if this was configured
            if srcfile in cfg_srcfiles:
                yield dict(
                    path=ds.path,
                    type='dataset',
                    status='impossible',
                    message=(
                        'configured custom metadata source is not '
                        'available in %s: %s',
                        ds, srcfile),
                )
                # no further operation on half-broken metadata
                return
        lgr.debug('Load custom metadata from %s', abssrcfile)
        meta = jsonload(text_type(abssrcfile))
        dsmeta.update(meta)
    if dsmeta:
        yield dict(
            path=ds.path,
            metadata=dsmeta,
            type='dataset',
            status='ok',
        )
Example #6
def _load_json_object(fpath, cache=None):
    if cache is None:
        cache = {}
    obj = cache.get(
        fpath,
        jsonload(fpath, fixup=True) if op.lexists(fpath) else {})
    cache[fpath] = obj
    return obj
Example #7
    def __call__(self, dataset, refcommit, process_type, status):
        # shortcut
        ds = dataset

        log_progress(
            lgr.info,
            'extractorcustom',
            'Start custom metadata extraction from %s', ds,
            total=len(status) + 1,
            label='Custom metadata extraction',
            unit=' Files',
        )
        if process_type in ('all', 'content'):
            mfile_expr = _get_fmeta_expr(ds)
            for rec in status:
                log_progress(
                    lgr.info,
                    'extractorcustom',
                    'Extracted custom metadata from %s', rec['path'],
                    update=1,
                    increment=True)
                # build metadata file path
                meta_fpath = _get_fmeta_objpath(ds, mfile_expr, rec)
                if meta_fpath is not None and op.exists(meta_fpath):
                    try:
                        meta = jsonload(text_type(meta_fpath))
                        if meta:
                            yield dict(
                                path=rec['path'],
                                metadata=meta,
                                type=rec['type'],
                                status='ok',
                            )
                    except Exception as e:
                        yield dict(
                            path=rec['path'],
                            type=rec['type'],
                            status='error',
                            message=exc_str(e),
                        )

        if process_type in ('all', 'dataset'):
            for r in _yield_dsmeta(ds):
                yield r
            log_progress(
                lgr.info,
                'extractorcustom',
                'Extracted custom metadata from %s', ds.path,
                update=1,
                increment=True)

        log_progress(
            lgr.info,
            'extractorcustom',
            'Finished custom metadata extraction from %s', ds.path
        )
Example #8
def get_metadata(ds,
                 guess_type=False,
                 ignore_subdatasets=False,
                 ignore_cache=False):
    # common identifier
    ds_identifier = ds.id
    # metadata receptacle
    meta = []
    # where things are
    meta_path = opj(ds.path, metadata_basepath)
    main_meta_fname = opj(meta_path, metadata_filename)

    # from cache?
    if ignore_cache or not exists(main_meta_fname):
        # start with the implicit meta data; currently there is no cache for
        # this type of meta data, as it will change with every clone.
        # In contrast, native meta data is cached.
        implicit_meta = _get_implicit_metadata(ds, ds_identifier)
        meta.append(implicit_meta)
        # and any native meta data
        meta.extend(
            get_native_metadata(ds,
                                guess_type=guess_type,
                                ds_identifier=ds_identifier))
    else:
        # from cache
        cached_meta = jsonload(main_meta_fname)
        if isinstance(cached_meta, list):
            meta.extend(cached_meta)
        else:
            meta.append(cached_meta)
        # cached meta data doesn't have version info for the top-level
        # dataset -> look for the item and update it
        for m in meta:
            if not is_implicit_metadata(m):
                continue
            if m.get('@id', None) == ds_identifier:
                m.update(_get_implicit_metadata(ds, ds_identifier))
                break

    if ignore_subdatasets:
        # all done now
        return meta

    from datalad.metadata.parsers.aggregate import MetadataParser as AggregateParser
    agg_parser = AggregateParser(ds)
    if agg_parser.has_metadata():
        agg_meta = agg_parser.get_metadata(ds_identifier)
        # try hard to keep things a simple non-nested list
        if isinstance(agg_meta, list):
            meta.extend(agg_meta)
        else:
            meta.append(agg_meta)

    return meta
Example #9
def get_metadata(ds, guess_type=False, ignore_subdatasets=False,
                 ignore_cache=False):
    # common identifier
    ds_identifier = ds.id
    # metadata receptacle
    meta = []
    # where things are
    meta_path = opj(ds.path, metadata_basepath)
    main_meta_fname = opj(meta_path, metadata_filename)

    # from cache?
    if ignore_cache or not exists(main_meta_fname):
        # start with the implicit meta data; currently there is no cache for
        # this type of meta data, as it will change with every clone.
        # In contrast, native meta data is cached.
        implicit_meta = _get_implicit_metadata(ds, ds_identifier)
        meta.append(implicit_meta)
        # and any native meta data
        meta.extend(
            get_native_metadata(
                ds,
                guess_type=guess_type,
                ds_identifier=ds_identifier))
    else:
        # from cache
        cached_meta = jsonload(main_meta_fname)
        if isinstance(cached_meta, list):
            meta.extend(cached_meta)
        else:
            meta.append(cached_meta)
        # cached meta data doesn't have version info for the top-level
        # dataset -> look for the item and update it
        for m in meta:
            if not is_implicit_metadata(m):
                continue
            if m.get('@id', None) == ds_identifier:
                m.update(_get_implicit_metadata(ds, ds_identifier))
                break

    if ignore_subdatasets:
        # all done now
        return meta

    from datalad.metadata.parsers.aggregate import MetadataParser as AggregateParser
    agg_parser = AggregateParser(ds)
    if agg_parser.has_metadata():
        agg_meta = agg_parser.get_metadata(ds_identifier)
        # try hard to keep things a simple non-nested list
        if isinstance(agg_meta, list):
            meta.extend(agg_meta)
        else:
            meta.append(agg_meta)

    return meta
Example #10
    def _get_dataset_metadata(self):
        """
        Returns
        -------
        dict
          keys are homogenized datalad metadata keys, values are arbitrary
        """
        fpath = opj(self.ds.path, self._dataset_metadata_filename)
        obj = {}
        if exists(fpath):
            obj = jsonload(fpath, fixup=True)
        if 'definition' in obj:
            obj['@context'] = obj['definition']
            del obj['definition']
        obj['@id'] = self.ds.id
        return obj
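
The only transformation this example applies to the loaded JSON is renaming 'definition' to '@context' and injecting the dataset ID. A hypothetical input/output pair, with made-up content and a made-up ID standing in for self.ds.id:

obj = {'definition': {'name': 'http://schema.org/name'}, 'name': 'demo dataset'}
if 'definition' in obj:
    # same rename as above, written with dict.pop for brevity
    obj['@context'] = obj.pop('definition')
obj['@id'] = '46f05a24-example-dataset-id'  # stands in for self.ds.id
# obj == {'@context': {'name': 'http://schema.org/name'},
#         'name': 'demo dataset', '@id': '46f05a24-example-dataset-id'}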
Example #11
    def _finalize_record(r):
        msg, rec = _split_record_message(r.pop('body', []))
        r['message'] = msg
        # TODO this can also just be a runrecord ID in which case we need
        # to load the file and report its content
        rec = jsonloads(rec)
        if not isinstance(rec, dict):
            # this is a runinfo file name
            rec = jsonload(
                text_type(ds.pathobj / '.datalad' / 'runinfo' / rec),
                # TODO this should not be necessary, instead jsonload()
                # should be left on auto, and `run` should save compressed
                # files with an appropriate extension
                compressed=True,
            )
        r['run_record'] = rec
        return r
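
The isinstance() branch above distinguishes an inline JSON run record from a bare run-record file name. A minimal stand-alone illustration, using json.loads in place of DataLad's jsonloads() and a made-up commit-message payload:

import json

body = '{"cmd": "echo hello", "exit": 0}'  # hypothetical inline run record
rec = json.loads(body)
if not isinstance(rec, dict):
    # a bare string here would name a run-info file under .datalad/runinfo/
    print('would load run-info file:', rec)
else:
    print('inline run record:', rec)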
Example #12
    def _get_metadata(self, ds_identifier, meta, full):
        bids = jsonload(
            self.get_core_metadata_filenames()[0])

        # TODO maybe normalize labels of standard licenses to definition URIs
        # perform mapping
        for bidsterm, dataladterm in (('Name', 'name'),
                                      ('License', 'license'),
                                      ('Authors', 'author'),
                                      ('ReferencesAndLinks', 'citation'),
                                      ('Funding', 'foaf:fundedBy'),
                                      ('Description', 'description')):
            if bidsterm in bids:
                meta[dataladterm] = bids[bidsterm]

        README_fname = opj(self.ds.path, 'README')
        if not meta.get('description') and exists(README_fname):
            # BIDS uses README to provide the description, so if it was not
            # explicitly provided, possibly to override a longer README, let's
            # just load the README
            try:
                desc = open(README_fname, encoding="utf-8").read()
            except UnicodeDecodeError as exc:
                lgr.warning(
                    "Failed to decode content of %s. "
                    "Re-loading allowing for UTF-8 errors with replacement: %s"
                    % (README_fname, exc_str(exc))
                )
                desc = open(README_fname, encoding="utf-8", errors="replace").read()

            meta['description'] = desc.strip()

        compliance = ["http://docs.datalad.org/metadata.html#v0-1"]

        # special case
        if bids.get('BIDSVersion'):
            compliance.append(
                'http://bids.neuroimaging.io/bids_spec{}.pdf'.format(
                    bids['BIDSVersion'].strip()))
        else:
            compliance.append('http://bids.neuroimaging.io')
        meta['dcterms:conformsTo'] = compliance
        return meta
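
The BIDS-to-DataLad term mapping above operates on the parsed dataset_description.json. Here is a stand-alone illustration; the mapping tuples are copied from the example, while the sample BIDS content is invented and not taken from any real dataset:

bids = {'Name': 'demo study', 'Authors': ['A. Author'], 'BIDSVersion': '1.0.2'}
meta = {}
for bidsterm, dataladterm in (('Name', 'name'), ('License', 'license'),
                              ('Authors', 'author'),
                              ('ReferencesAndLinks', 'citation'),
                              ('Funding', 'foaf:fundedBy'),
                              ('Description', 'description')):
    if bidsterm in bids:
        meta[dataladterm] = bids[bidsterm]
# meta is now {'name': 'demo study', 'author': ['A. Author']}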
Example #13
    def get_metadata(self, dsid=None, full=False):
        base_meta = _get_base_dataset_metadata(dsid if dsid else self.ds.id)
        meta = [base_meta]
        basepath = opj(self.ds.path, '.datalad', 'meta')
        parts = []
        for subds_meta_fname in self.get_core_metadata_filenames():
            # get the part between the 'meta' dir and the filename
            # which is the subdataset mountpoint
            subds_path = subds_meta_fname[len(basepath) + 1:-10]
            if not subds_path:
                # this is a potentially existing cache of the native meta data
                # of the superdataset, not for us...
                continue
            submeta_info = {
                'location': subds_path}
            # load aggregated meta data
            subds_meta = jsonload(subds_meta_fname)
            # we cannot simply append, or we get weird nested graphs
            # proper way would be to expand the JSON-LD, extend the list and
            # compact/flatten at the end. However assuming a single context
            # we can cheat.
            subds_meta = _simplify_meta_data_structure(subds_meta)
            _adjust_subdataset_location(subds_meta, subds_path)
            # sift through all meta data sets to look for a meta data set that
            # knows about being part of this dataset, so we record its @id as
            # part
            for md in subds_meta:
                cand_id = md.get('dcterms:isPartOf', None)
                if cand_id == dsid and '@id' in md:
                    submeta_info['@id'] = md['@id']
                    break

            if subds_meta:
                meta.extend(subds_meta)
            parts.append(submeta_info)
        if len(parts):
            if len(parts) == 1:
                parts = parts[0]
            base_meta['dcterms:hasPart'] = parts

        return meta
Example #14
    def get_metadata(self, dsid=None, full=False):
        base_meta = _get_base_dataset_metadata(dsid if dsid else self.ds.id)
        meta = [base_meta]
        basepath = opj(self.ds.path, '.datalad', 'meta')
        parts = []
        for subds_meta_fname in self.get_core_metadata_filenames():
            # get the part between the 'meta' dir and the filename
            # which is the subdataset mountpoint
            subds_path = subds_meta_fname[len(basepath) + 1:-10]
            if not subds_path:
                # this is a potentially existing cache of the native meta data
                # of the superdataset, not for us...
                continue
            submeta_info = {'location': subds_path}
            # load aggregated meta data
            subds_meta = jsonload(subds_meta_fname)
            # we cannot simply append, or we get weird nested graphs
            # proper way would be to expand the JSON-LD, extend the list and
            # compact/flatten at the end. However assuming a single context
            # we can cheat.
            subds_meta = _simplify_meta_data_structure(subds_meta)
            _adjust_subdataset_location(subds_meta, subds_path)
            # sift through all meta data sets to look for a meta data set that
            # knows about being part of this dataset, so we record its @id as
            # part
            for md in subds_meta:
                cand_id = md.get('dcterms:isPartOf', None)
                if cand_id == dsid and '@id' in md:
                    submeta_info['@id'] = md['@id']
                    break

            if subds_meta:
                meta.extend(subds_meta)
            parts.append(submeta_info)
        if len(parts):
            if len(parts) == 1:
                parts = parts[0]
            base_meta['dcterms:hasPart'] = parts

        return meta
Example #15
    def _get_metadata(self, ds_identifier, meta, full):
        bids = jsonload(self.get_core_metadata_filenames()[0])

        # TODO maybe normalize labels of standard licenses to definition URIs
        # perform mapping
        for bidsterm, dataladterm in (('Name', 'name'), ('License', 'license'),
                                      ('Authors', 'author'),
                                      ('ReferencesAndLinks', 'citation'),
                                      ('Funding', 'foaf:fundedBy'),
                                      ('Description', 'description')):
            if bidsterm in bids:
                meta[dataladterm] = bids[bidsterm]
        compliance = ["http://docs.datalad.org/metadata.html#v0-1"]
        # special case
        if bids.get('BIDSVersion'):
            compliance.append(
                'http://bids.neuroimaging.io/bids_spec{}.pdf'.format(
                    bids['BIDSVersion'].strip()))
        else:
            compliance.append('http://bids.neuroimaging.io')
        meta['dcterms:conformsTo'] = compliance
        return meta
Example #16
    def _get_dataset_metadata(self):
        """
        Returns
        -------
        dict
          keys are homogenized datalad metadata keys, values are arbitrary
        """
        fpath = opj(self.ds.path, DATASET_METADATA_FILE)
        obj = {}
        if exists(fpath):
            obj = jsonload(fpath, fixup=True)
        if 'definition' in obj:
            obj['@context'] = obj['definition']
            del obj['definition']
        obj['@id'] = self.ds.id
        subdsinfo = [
            {
                # this version would change anytime we aggregate metadata, let's not
                # do this for now
                #'version': sds['revision'],
                'type': sds['type'],
                'name': sds['gitmodule_name'],
            } for sds in subdatasets(dataset=self.ds,
                                     recursive=False,
                                     return_type='generator',
                                     result_renderer='disabled',
                                     on_failure='ignore')
        ]
        if subdsinfo:
            obj['haspart'] = subdsinfo
        superds = self.ds.get_superdataset(registered_only=True, topmost=False)
        if superds:
            obj['ispartof'] = {
                '@id': superds.id,
                'type': 'dataset',
            }

        return obj
Example #17
    def _get_dataset_metadata(self):
        """
        Returns
        -------
        dict
          keys are homogenized datalad metadata keys, values are arbitrary
        """
        fpath = opj(self.ds.path, self._dataset_metadata_filename)
        obj = {}
        if exists(fpath):
            obj = jsonload(fpath, fixup=True)
        if 'definition' in obj:
            obj['@context'] = obj['definition']
            del obj['definition']
        obj['@id'] = self.ds.id
        subdsinfo = [{
            # this version would change anytime we aggregate metadata, let's not
            # do this for now
            #'version': sds['revision'],
            'type': sds['type'],
            'name': sds['gitmodule_name'],
        }
            for sds in subdatasets(
                dataset=self.ds,
                recursive=False,
                return_type='generator',
                result_renderer='disabled')
        ]
        if subdsinfo:
            obj['haspart'] = subdsinfo
        superds = self.ds.get_superdataset(registered_only=True, topmost=False)
        if superds:
            obj['ispartof'] = {
                '@id': superds.id,
                'type': 'dataset',
            }

        return obj
Example #18
    def _get_dataset_metadata(self):
        meta = {}
        metadata_path = opj(self.ds.path, self.metadatasrc_fname)
        if not exists(metadata_path):
            return meta
        foreign = jsonload(metadata_path)

        for term in self._key2stdkey:
            if term in foreign:
                meta[self._key2stdkey[term]] = foreign[term]
        if 'author' in foreign:
            meta['author'] = _compact_author(foreign['author'])
        if 'contributors' in foreign:
            meta['contributors'] = [_compact_author(c)
                                    for c in foreign['contributors']]
        # two license terms were supported at some point
        if 'license' in foreign:
            meta['license'] = _compact_license(foreign['license'])
        if 'licenses' in foreign:
            meta['license'] = [_compact_license(l) for l in foreign['licenses']]

        meta['conformsto'] = 'http://specs.frictionlessdata.io/data-packages'

        return meta
Example #19
    def __call__(self, dataset, refcommit, process_type, status):
        # shortcut
        ds = dataset

        log_progress(
            lgr.info,
            'extractorcustom',
            'Start custom metadata extraction from %s',
            ds,
            total=len(status) + 1,
            label='Custom metadata extraction',
            unit=' Files',
        )
        if process_type in ('all', 'content'):
            mfile_expr = _get_fmeta_expr(ds)
            for rec in status:
                log_progress(lgr.info,
                             'extractorcustom',
                             'Extracted custom metadata from %s',
                             rec['path'],
                             update=1,
                             increment=True)
                # build metadata file path
                meta_fpath = _get_fmeta_objpath(ds, mfile_expr, rec)
                if meta_fpath is not None and op.exists(meta_fpath):
                    try:
                        meta = jsonload(text_type(meta_fpath))
                        if isinstance(meta, dict) and meta \
                                and '@id' not in meta:
                            # in case we have a single, top-level
                            # document, and it has no ID: assume that
                            # it describes the file and assign the
                            # datalad file ID
                            meta['@id'] = get_file_id(rec)
                        if meta:
                            yield dict(
                                path=rec['path'],
                                metadata=meta,
                                type=rec['type'],
                                status='ok',
                            )
                    except Exception as e:
                        yield dict(
                            path=rec['path'],
                            type=rec['type'],
                            status='error',
                            message=exc_str(e),
                        )

        if process_type in ('all', 'dataset'):
            for r in _yield_dsmeta(ds):
                yield r
            log_progress(lgr.info,
                         'extractorcustom',
                         'Extracted custom metadata from %s',
                         ds.path,
                         update=1,
                         increment=True)

        log_progress(lgr.info, 'extractorcustom',
                     'Finished custom metadata extraction from %s', ds.path)