Example #1
def test_get_metadata_type(path):
    Dataset(path).create()
    # nothing set, nothing found
    assert_equal(get_metadata_type(Dataset(path)), [])
    # got section, but no setting
    open(opj(path, '.datalad', 'config'), 'w').write('[datalad "metadata"]\n')
    assert_equal(get_metadata_type(Dataset(path)), [])
    # minimal setting
    open(opj(path, '.datalad', 'config'), 'w+').write('[datalad "metadata"]\nnativetype = mamboschwambo\n')
    assert_equal(get_metadata_type(Dataset(path)), 'mamboschwambo')
Example #2
def test_get_metadata_type(path=None):
    ds = Dataset(path).create()
    # nothing set, nothing found
    assert_equal(get_metadata_type(ds), [])
    # got section, but no setting
    open(opj(path, '.datalad', 'config'), 'w').write('[datalad "metadata"]\n')
    # not relying on automagical pick up of changes done
    # by external powers to the config
    # see https://github.com/datalad/datalad/issues/4363 for more info
    ds.config.reload()
    assert_equal(get_metadata_type(ds), [])
    # minimal setting
    open(opj(path, '.datalad', 'config'), 'w+').write('[datalad "metadata"]\nnativetype = mamboschwambo\n')
    ds.config.reload()
    assert_equal(get_metadata_type(ds), 'mamboschwambo')
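In the DataLad test suite a test like this does not receive path from a caller; the argument is injected by a temporary-directory decorator, which is why this newer variant declares path=None. A minimal sketch of that invocation pattern, assuming with_tempfile and assert_equal are importable from datalad.tests.utils and get_metadata_type from datalad.metadata.metadata (module paths are assumptions, not checked against a specific release):

from datalad.distribution.dataset import Dataset
from datalad.metadata.metadata import get_metadata_type
from datalad.tests.utils import assert_equal, with_tempfile

@with_tempfile(mkdir=True)
def test_metadata_type_default(path=None):
    # the decorator supplies a fresh temporary directory as path
    ds = Dataset(path).create()
    # a freshly created dataset has no nativetype configured
    assert_equal(get_metadata_type(ds), [])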
Example #3
def _extract_metadata(agginto_ds, aggfrom_ds, db, to_save, objid, metasources,
                      refcommit, subds_relpaths, agg_base_path):
    lgr.debug('Performing metadata extraction from %s', aggfrom_ds)
    # we will replace any conflicting info on this dataset with fresh stuff
    agginfo = db.get(aggfrom_ds.path, {})
    # paths to extract from
    relevant_paths = sorted(_get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    # get extractors to engage from source dataset
    nativetypes = ['datalad_core', 'annex'] + ensure_list(get_metadata_type(aggfrom_ds))
    # store essential extraction config in dataset record
    agginfo['extractors'] = nativetypes
    agginfo['datalad_version'] = datalad.__version__

    # perform the actual extraction
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        nativetypes,
        # None indicates to honor a dataset's per-extractor configuration and to be
        # on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)

    meta = {
        'ds': dsmeta,
        'cn': (dict(contentmeta[k], path=k) for k in sorted(contentmeta))
    }

    # inject the info about which commit we are describing into the core metadata
    # this is done here in order to avoid feeding it all the way down
    coremeta = dsmeta.get('datalad_core', {})
    version = aggfrom_ds.repo.describe(commitish=refcommit)
    if version:
        coremeta['version'] = version
    coremeta['refcommit'] = refcommit
    dsmeta['datalad_core'] = coremeta

    # for both types of metadata
    for label, props in metasources.items():
        dest = props['targetds']
        if not meta[label]:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label, props['dumper'])
        # place metadata object into the source dataset
        objpath = op.join(dest.path, agg_base_path, objrelpath)

        # write obj files
        if op.exists(objpath):
            dest.unlock(objpath)
        elif op.lexists(objpath):
            # if it gets here, we have a symlink that is pointing nowhere
            # kill it, to be replaced with the newly aggregated content
            dest.repo.remove(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        props['dumper'](meta[label], objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))

        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(props['type'])] = objpath

    # overwrite existing info with stuff from just finished extraction
    db[aggfrom_ds.path] = agginfo

    return errored
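This variant expects its serialization targets in the metasources mapping: judging from the props[...] lookups above, each label ('ds' for dataset-level, 'cn' for content-level metadata) maps to a target dataset, a dumper callable, and a type tag that becomes the '<type>_info' key in the aggregation record. A hypothetical illustration of that shape, reusing the json_py dumpers from the next example and the variables from this function's signature (a sketch of the expected structure, not the library's actual call site):

from datalad.support import json_py

metasources = {
    'ds': {
        'targetds': agginto_ds,           # dataset receiving the dataset-level object
        'dumper': json_py.dump,           # plain JSON dump
        'type': 'dataset',                # -> agginfo['dataset_info']
    },
    'cn': {
        'targetds': agginto_ds,
        'dumper': json_py.dump2xzstream,  # xz-compressed JSON stream
        'type': 'content',                # -> agginfo['content_info']
    },
}
errored = _extract_metadata(
    agginto_ds, aggfrom_ds, db, to_save, objid, metasources,
    refcommit, subds_relpaths, agg_base_path)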
Example #4
def _extract_metadata(agginto_ds, aggfrom_ds, db, merge_native, to_save):
    """Dump metadata from a dataset into object in the metadata store of another

    Info on the metadata objects is placed into a DB dict under the
    absolute path of the dataset whose metadata was aggregated.

    Parameters
    ----------
    agginto_ds : Dataset
    aggfrom_ds : Dataset
    db : dict
    merge_native : str
      Merge mode.
    to_save : list
      Collector for paths that the caller will pass to save() afterwards.
    """
    subds_relpaths = aggfrom_ds.subdatasets(result_xfm='relpaths',
                                            return_type='list')
    # figure out a "state" of the dataset wrt its metadata that we are describing
    # 1. the latest commit that changed any file for which we could have native metadata
    refcommit = _get_latest_refcommit(aggfrom_ds, subds_relpaths)
    objid = refcommit if refcommit else ''
    # 2. our own dataset-global metadata
    dsmetafile = opj(aggfrom_ds.path, '.datalad', 'metadata', 'dataset.json')
    if exists(dsmetafile):
        objid += md5(open(dsmetafile, 'r').read().encode()).hexdigest()
    # 3. potential annex-based metadata
    if isinstance(aggfrom_ds.repo, AnnexRepo) and \
            aggfrom_ds.config.obtain(
                'datalad.metadata.aggregate-content-datalad-core',
                default=True,
                valtype=EnsureBool()):
        # if there is no annex metadata, this will come out empty,
        # hence hash would be same as for a plain GitRepo
        # and no, we cannot use the shasum of the annex branch,
        # because this will change even when no metadata has changed
        timestamps, _ = aggfrom_ds.repo._run_annex_command(
            'metadata', '.', '-g', 'lastchanged')
        objid += timestamps.strip()

    if not objid:
        lgr.debug('%s has no metadata-relevant content', aggfrom_ds)
    else:
        lgr.debug('Dump metadata of %s (merge mode: %s) into %s', aggfrom_ds,
                  merge_native, agginto_ds)

    agginfo = {}
    # dataset global
    if aggfrom_ds.id:
        agginfo['id'] = aggfrom_ds.id
    agginfo['refcommit'] = refcommit
    # put in DB
    db[aggfrom_ds.path] = agginfo

    if not objid:
        dsmeta = contentmeta = None
        # this is no error, there is simply no metadata whatsoever
        return False

    # if there is any chance for metadata
    # obtain metadata for dataset and content
    relevant_paths = sorted(
        _get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    nativetypes = get_metadata_type(aggfrom_ds)
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        # core must come first
        ['datalad_core'] + assure_list(nativetypes),
        merge_native,
        # None indicates to honor a dataset's per-parser configuration and to be
        # on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)

    # shorten to MD5sum
    objid = md5(objid.encode()).hexdigest()

    metasources = [('ds', 'dataset', dsmeta, aggfrom_ds, json_py.dump)]

    # do not store content metadata if either the source or the target dataset
    # do not want it
    if aggfrom_ds.config.obtain(
            'datalad.metadata.store-aggregate-content',
            default=True,
            valtype=EnsureBool()) or \
            agginto_ds.config.obtain(
                'datalad.metadata.store-aggregate-content',
                default=True,
                valtype=EnsureBool()):
        metasources.append((
            'cn',
            'content',
            # sort by path key to get deterministic dump content
            (dict(contentmeta[k], path=k) for k in sorted(contentmeta)),
            aggfrom_ds,
            json_py.dump2xzstream))

    # for both types of metadata
    for label, mtype, meta, dest, store in metasources:
        if not meta:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label)
        if store is json_py.dump2xzstream:
            objrelpath += '.xz'
        # place metadata object into the source dataset
        objpath = opj(dest.path, dirname(agginfo_relpath), objrelpath)

        # write obj files
        if exists(objpath):
            dest.unlock(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        store(meta, objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))

        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(mtype)] = objpath

    return errored
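After aggregation the DB dict holds one record per source dataset, keyed by its absolute path. A hypothetical shape of such a record as this variant builds it (keys taken from the code above, values are invented placeholders):

db = {
    '/abs/path/of/aggregated/dataset': {
        'id': '<dataset UUID, if the dataset has one>',
        'refcommit': '<sha of the latest metadata-relevant commit>',
        'dataset_info': '<abs path of the dumped dataset-level metadata object>',
        'content_info': '<abs path of the dumped content-level .xz stream>',
    },
}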
Example #5
def _extract_metadata(agginto_ds, aggfrom_ds, db, to_save, objid, metasources,
                      refcommit, subds_relpaths, agg_base_path):
    lgr.debug('Performing metadata extraction from %s', aggfrom_ds)
    # we will replace any conflicting info on this dataset with fresh stuff
    agginfo = db.get(aggfrom_ds.path, {})
    # paths to extract from
    relevant_paths = sorted(_get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    # get extractors to engage from source dataset
    nativetypes = ['datalad_core', 'annex'] + assure_list(get_metadata_type(aggfrom_ds))
    # store essential extraction config in dataset record
    agginfo['extractors'] = nativetypes
    agginfo['datalad_version'] = datalad.__version__

    # perform the actual extraction
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        nativetypes,
        # None indicates to honor a dataset's per-extractor configuration and to be
        # on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)

    meta = {
        'ds': dsmeta,
        'cn': (dict(contentmeta[k], path=k) for k in sorted(contentmeta))
    }

    # inject the info about which commit we are describing into the core metadata
    # this is done here in order to avoid feeding it all the way down
    coremeta = dsmeta.get('datalad_core', {})
    version = aggfrom_ds.repo.describe(commitish=refcommit)
    if version:
        coremeta['version'] = version
    coremeta['refcommit'] = refcommit
    dsmeta['datalad_core'] = coremeta

    # for both types of metadata
    for label, props in metasources.items():
        dest = props['targetds']
        if not meta[label]:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label, props['dumper'])
        # place metadata object into the source dataset
        objpath = op.join(dest.path, agg_base_path, objrelpath)

        # write obj files
        if op.exists(objpath):
            dest.unlock(objpath)
        elif op.lexists(objpath):
            # if it gets here, we have a symlink that is pointing nowhere
            # kill it, to be replaced with the newly aggregated content
            dest.repo.remove(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        props['dumper'](meta[label], objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))

        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(props['type'])] = objpath

    # overwrite existing info with stuff from just finished extraction
    db[aggfrom_ds.path] = agginfo

    return errored
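The to_save list collected here is meant for a follow-up save() by the aggregation driver. A rough, hedged sketch of that step (the actual driver batches and annotates this differently; the message text is illustrative only):

agginto_ds.save(
    path=[entry['path'] for entry in to_save],
    message='Aggregated metadata objects')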