Example #1
    def __call__(types, files=None, dataset=None):
        dataset = require_dataset(dataset or curdir,
                                  purpose="extract metadata",
                                  check_installed=not files)
        if not files:
            ds = require_dataset(dataset, check_installed=True)
            subds = ds.subdatasets(recursive=False, result_xfm='relpaths')
            files = list(_get_metadatarelevant_paths(ds, subds))

        dsmeta, contentmeta, error = _get_metadata(dataset,
                                                   types,
                                                   global_meta=True,
                                                   content_meta=bool(files),
                                                   paths=files)

        if dataset is not None and dataset.is_installed():
            res = get_status_dict(action='metadata',
                                  ds=dataset,
                                  refds=dataset.path,
                                  metadata=dsmeta,
                                  status='error' if error else 'ok')
            yield res

        for p in contentmeta:
            res = get_status_dict(action='metadata',
                                  path=opj(dataset.path, p) if dataset else p,
                                  refds=dataset.path,
                                  metadata=contentmeta[p],
                                  type='file',
                                  status='error' if error else 'ok')
            if dataset:
                res['parentds'] = dataset.path
            yield res
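
The snippet above shows the only contract `_get_metadatarelevant_paths` is used with here: it takes a dataset and the relative paths of its immediate subdatasets, and returns the paths worth feeding into metadata extraction. A minimal sketch of that call sequence, assuming an installed DataLad and assuming the helper is importable from datalad.metadata.metadata (the module these examples appear to come from):

from os import curdir
from datalad.distribution.dataset import require_dataset
from datalad.metadata.metadata import _get_metadatarelevant_paths  # assumed location

# resolve the dataset the command would operate on (here: the current directory)
ds = require_dataset(curdir, purpose="extract metadata", check_installed=True)
# immediate subdatasets as relative paths; their content is not extracted here
subds = ds.subdatasets(recursive=False, result_xfm='relpaths')
# everything else that is relevant for metadata extraction
paths = list(_get_metadatarelevant_paths(ds, subds))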
Example #2
def test_get_metadata(path1, path2):
    for p in (path1, path2):
        print('PATH')
        ds = create(p, force=True)
        ds.save()
        meta = MetadataExtractor(ds, _get_metadatarelevant_paths(
            ds, []))._get_dataset_metadata()
        assert_equal(
            dumps(meta, sort_keys=True, indent=2), """\
{
  "author": [
    "Last1, First1",
    "Last2, First2"
  ],
  "citation": [
    "10.1016/j.cub.2011.08.031"
  ],
  "description": "Some long description.",
  "formats": [
    "application/matlab",
    "NIFTY"
  ],
  "name": "CRCNS.org xxx-1",
  "sameas": "10.6080/K0QN64NG",
  "shortdescription": "Main title",
  "tag": [
    "Neuroscience",
    "fMRI"
  ],
  "version": "1.0"
}""")
Example #3
def test_get_metadata(path1, path2):
    for p in (path1, path2):
        print('PATH')
        ds = create(p, force=True)
        ds.save()
        meta = MetadataExtractor(
                ds,
                _get_metadatarelevant_paths(ds, []))._get_dataset_metadata()
        assert_equal(
            dumps(meta, sort_keys=True, indent=2),
            """\
{
  "author": [
    "Last1, First1",
    "Last2, First2"
  ],
  "citation": [
    "10.1016/j.cub.2011.08.031"
  ],
  "description": "Some long description.",
  "formats": [
    "application/matlab",
    "NIFTY"
  ],
  "name": "CRCNS.org xxx-1",
  "sameas": "10.6080/K0QN64NG",
  "shortdescription": "Main title",
  "tag": [
    "Neuroscience",
    "fMRI"
  ],
  "version": "1.0"
}""")
Example #4
    def __call__(types, files=None, dataset=None):
        dataset = require_dataset(dataset or curdir,
                                  purpose="extract metadata",
                                  check_installed=not files)
        if not files:
            ds = require_dataset(dataset, check_installed=True)
            subds = ds.subdatasets(recursive=False, result_xfm='relpaths')
            files = list(_get_metadatarelevant_paths(ds, subds))

        dsmeta, contentmeta, error = _get_metadata(
            dataset,
            types,
            global_meta=True,
            content_meta=bool(files),
            paths=files)

        if dataset is not None and dataset.is_installed():
            res = get_status_dict(
                action='metadata',
                ds=dataset,
                refds=dataset.path,
                metadata=dsmeta,
                status='error' if error else 'ok')
            yield res

        for p in contentmeta:
            res = get_status_dict(
                action='metadata',
                path=opj(dataset.path, p) if dataset else p,
                refds=dataset.path,
                metadata=contentmeta[p],
                type='file',
                status='error' if error else 'ok')
            if dataset:
                res['parentds'] = dataset.path
            yield res
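
Like Example #1, this variant is a generator that yields one result record per dataset and per file, each built by `get_status_dict()`. A hedged sketch of consuming such records, with `results` standing in for whatever iterable the command returns:

def report(results):
    # each record carries at least 'action', 'status', 'metadata', and either
    # a 'path' (file-level) or a 'refds' (dataset-level) entry, as yielded above
    for res in results:
        if res.get('status') != 'ok':
            print('extraction problem for', res.get('path', res.get('refds')))
            continue
        print('metadata for', res.get('type', 'dataset'),
              res.get('path', res.get('refds')),
              sorted(res.get('metadata', {})))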
Example #5
def _extract_metadata(agginto_ds, aggfrom_ds, db, to_save, objid, metasources,
                      refcommit, subds_relpaths, agg_base_path):
    lgr.debug('Performing metadata extraction from %s', aggfrom_ds)
    # we will replace any conflicting info on this dataset with fresh stuff
    agginfo = db.get(aggfrom_ds.path, {})
    # paths to extract from
    relevant_paths = sorted(_get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    # get extractors to engage from source dataset
    nativetypes = ['datalad_core', 'annex'] + ensure_list(get_metadata_type(aggfrom_ds))
    # store essential extraction config in dataset record
    agginfo['extractors'] = nativetypes
    agginfo['datalad_version'] = datalad.__version__

    # perform the actual extraction
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        nativetypes,
        # None indicates to honor a dataset's per-extractor configuration and to be
        # on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)

    meta = {
        'ds': dsmeta,
        'cn': (dict(contentmeta[k], path=k) for k in sorted(contentmeta))
    }

    # inject the info about which commit we are describing into the core metadata
    # this is done here in order to avoid feeding it all the way down
    coremeta = dsmeta.get('datalad_core', {})
    version = aggfrom_ds.repo.describe(commitish=refcommit)
    if version:
        coremeta['version'] = version
    coremeta['refcommit'] = refcommit
    dsmeta['datalad_core'] = coremeta

    # for both types of metadata
    for label, props in metasources.items():
        dest = props['targetds']
        if not meta[label]:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label, props['dumper'])
        # place metadata object into the source dataset
        objpath = op.join(dest.path, agg_base_path, objrelpath)

        # write obj files
        if op.exists(objpath):
            dest.unlock(objpath)
        elif op.lexists(objpath):
            # if it gets here, we have a symlink that is pointing nowhere
            # kill it, to be replaced with the newly aggregated content
            dest.repo.remove(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        props['dumper'](meta[label], objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))

        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(props['type'])] = objpath

    # overwrite existing info with stuff from just finished extraction
    db[aggfrom_ds.path] = agginfo

    return errored
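
This version no longer assembles its own source list; judging from how the function reads props['targetds'], props['dumper'], and props['type'], the caller passes a metasources mapping with one entry per metadata flavor. A hypothetical sketch of such a mapping, reusing the dumpers that appear in Example #6 (everything else here is an assumption, not the actual caller):

def build_metasources(agginto_ds):
    # hypothetical helper: agginto_ds is the Dataset that receives the
    # aggregated objects; json_py.dump / dump2xzstream are the dumpers
    # Example #6 uses for dataset- and content-level metadata
    from datalad.support import json_py
    return {
        'ds': {'type': 'dataset', 'targetds': agginto_ds, 'dumper': json_py.dump},
        'cn': {'type': 'content', 'targetds': agginto_ds, 'dumper': json_py.dump2xzstream},
    }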
Example #6
def _extract_metadata(agginto_ds, aggfrom_ds, db, merge_native, to_save):
    """Dump metadata from a dataset into object in the metadata store of another

    Info on the metadata objects is placed into a DB dict under the
    absolute path of the dataset whose metadata was aggregated.

    Parameters
    ----------
    agginto_ds : Dataset
    aggfrom_ds : Dataset
    db : dict
    merge_native : str
      Merge mode.
    """
    subds_relpaths = aggfrom_ds.subdatasets(result_xfm='relpaths',
                                            return_type='list')
    # figure out a "state" of the dataset wrt its metadata that we are describing
    # 1. the latest commit that changed any file for which we could have native metadata
    refcommit = _get_latest_refcommit(aggfrom_ds, subds_relpaths)
    objid = refcommit if refcommit else ''
    # 2. our own dataset-global metadata
    dsmetafile = opj(aggfrom_ds.path, '.datalad', 'metadata', 'dataset.json')
    if exists(dsmetafile):
        objid += md5(open(dsmetafile, 'r').read().encode()).hexdigest()
    # 3. potential annex-based metadata
    if isinstance(aggfrom_ds.repo, AnnexRepo) and \
            aggfrom_ds.config.obtain(
                'datalad.metadata.aggregate-content-datalad-core',
                default=True,
                valtype=EnsureBool()):
        # if there is no annex metadata, this will come out empty,
        # hence hash would be same as for a plain GitRepo
        # and no, we cannot use the shasum of the annex branch,
        # because this will change even when no metadata has changed
        timestamps, _ = aggfrom_ds.repo._run_annex_command(
            'metadata', '.', '-g', 'lastchanged')
        objid += timestamps.strip()

    if not objid:
        lgr.debug('%s has no metadata-relevant content', aggfrom_ds)
    else:
        lgr.debug('Dump metadata of %s (merge mode: %s) into %s', aggfrom_ds,
                  merge_native, agginto_ds)

    agginfo = {}
    # dataset global
    if aggfrom_ds.id:
        agginfo['id'] = aggfrom_ds.id
    agginfo['refcommit'] = refcommit
    # put in DB
    db[aggfrom_ds.path] = agginfo

    if not objid:
        dsmeta = contentmeta = None
        # this is no error, there is simply no metadata whatsoever
        return False

    # if there is any chance for metadata
    # obtain metadata for dataset and content
    relevant_paths = sorted(
        _get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    nativetypes = get_metadata_type(aggfrom_ds)
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        # core must come first
        ['datalad_core'] + assure_list(nativetypes),
        merge_native,
        # None indicates to honor a dataset's per-parser configuration and to be
        # on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)

    # shorten to MD5sum
    objid = md5(objid.encode()).hexdigest()

    metasources = [('ds', 'dataset', dsmeta, aggfrom_ds, json_py.dump)]

    # do not store content metadata if either the source or the target dataset
    # do not want it
    if aggfrom_ds.config.obtain(
            'datalad.metadata.store-aggregate-content',
            default=True,
            valtype=EnsureBool()) or \
            agginto_ds.config.obtain(
                'datalad.metadata.store-aggregate-content',
                default=True,
                valtype=EnsureBool()):
        metasources.append((
            'cn',
            'content',
            # sort by path key to get deterministic dump content
            (dict(contentmeta[k], path=k) for k in sorted(contentmeta)),
            aggfrom_ds,
            json_py.dump2xzstream))

    # for both types of metadata
    for label, mtype, meta, dest, store in metasources:
        if not meta:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label)
        if store is json_py.dump2xzstream:
            objrelpath += '.xz'
        # place metadata object into the source dataset
        objpath = opj(dest.path, dirname(agginfo_relpath), objrelpath)

        # write obj files
        if exists(objpath):
            dest.unlock(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        store(meta, objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))

        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(mtype)] = objpath

    return errored
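
Whether content metadata is written at all is governed by the `datalad.metadata.store-aggregate-content` switch that this function reads from both the source and the target dataset. A minimal sketch of querying that switch for a single dataset, assuming `ds` is an existing Dataset instance:

from datalad.support.constraints import EnsureBool

def wants_aggregated_content(ds):
    # same lookup as above: a boolean config option that defaults to True
    return ds.config.obtain(
        'datalad.metadata.store-aggregate-content',
        default=True,
        valtype=EnsureBool())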
Example #7
def _extract_metadata(agginto_ds, aggfrom_ds, db, to_save, objid, metasources,
                      refcommit, subds_relpaths, agg_base_path):
    lgr.debug('Performing metadata extraction from %s', aggfrom_ds)
    # we will replace any conflicting info on this dataset with fresh stuff
    agginfo = db.get(aggfrom_ds.path, {})
    # paths to extract from
    relevant_paths = sorted(_get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    # get extractors to engage from source dataset
    nativetypes = ['datalad_core', 'annex'] + assure_list(get_metadata_type(aggfrom_ds))
    # store essential extraction config in dataset record
    agginfo['extractors'] = nativetypes
    agginfo['datalad_version'] = datalad.__version__

    # perform the actual extraction
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        nativetypes,
        # None indicates to honor a dataset's per-extractor configuration and to be
        # on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)

    meta = {
        'ds': dsmeta,
        'cn': (dict(contentmeta[k], path=k) for k in sorted(contentmeta))
    }

    # inject the info about which commit we are describing into the core metadata
    # this is done here in order to avoid feeding it all the way down
    coremeta = dsmeta.get('datalad_core', {})
    version = aggfrom_ds.repo.describe(commitish=refcommit)
    if version:
        coremeta['version'] = version
    coremeta['refcommit'] = refcommit
    dsmeta['datalad_core'] = coremeta

    # for both types of metadata
    for label, props in metasources.items():
        dest = props['targetds']
        if not meta[label]:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label, props['dumper'])
        # place metadata object into the source dataset
        objpath = op.join(dest.path, agg_base_path, objrelpath)

        # write obj files
        if op.exists(objpath):
            dest.unlock(objpath)
        elif op.lexists(objpath):
            # if it gets here, we have a symlink that is pointing nowhere
            # kill it, to be replaced with the newly aggregated content
            dest.repo.remove(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        props['dumper'](meta[label], objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))

        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(props['type'])] = objpath

    # overwrite existing info with stuff from just finished extraction
    db[aggfrom_ds.path] = agginfo

    return errored
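
The write path above has to cope with git-annex: an existing annexed object file must be unlocked before it can be rewritten, and a dangling symlink (lexists but not exists) is removed first. A hedged sketch of that idiom in isolation, with `dest` assumed to be a DataLad Dataset and `dump` any callable with the dumper signature used above:

import os.path as op

def rewrite_object(dest, objpath, dump, payload):
    if op.exists(objpath):
        # annexed files are read-only symlinks; unlock to make them writable
        dest.unlock(objpath)
    elif op.lexists(objpath):
        # a symlink pointing nowhere: remove it before writing fresh content
        dest.repo.remove(objpath)
    dump(payload, objpath)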