Example #1
0
def test_basic_aggregate(path):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    #base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    # we will first aggregate the middle dataset on its own, this will
    # serve as a smoke test for the reuse of metadata objects later on
    sub.aggregate_metadata()
    base.save()
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.uninstall('subsub', check=False)
    # now we should be able to re-aggregate the metadata and lose nothing,
    # because we can aggregate the aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True, update_mode='all')
    # same result for the aggregate query as for the (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree and lose no metadata
    base.uninstall('sub', recursive=True, check=False)
    assert(not sub.is_installed())
    ok_clean_git(base.path)
    # same result for the aggregate query as for the (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)
Example #2
0
def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    # weird that it comes out as a string...
    objs = [o for o in sorted(base.repo.find(objpath).split('\n')) if o]
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    ok_clean_git(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True, update_mode='all')
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(objs, [o for o in sorted(base.repo.find(objpath).split('\n')) if o])
Example #3
0
def test_aggregate_removal(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.save(recursive=True)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=subsub.path)
    # check that we only have object files that are listed in agginfo
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # now delete the deepest subdataset to test cleanup of aggregated objects
    # in the top-level ds
    base.remove(opj('sub', 'subsub'), check=False)
    # now aggregation has to detect that subsub is not simply missing, but gone
    # for good
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    # internally consistent state
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # info on subsub was removed at all levels
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 2)
    res = sub.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 1)
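# Note: several examples above and below (e.g. #3, #6, #19, #20) rely on the
# helpers _get_contained_objs() and _get_referenced_objs(), which are not part
# of this collection. Below is a minimal sketch of what they could look like,
# assuming aggregated metadata objects live under .datalad/metadata/objects and
# that get_aggregates results carry 'content_info'/'dataset_info' path fields
# (both assumptions, not the verbatim test helpers).
import os.path as op
from os.path import join as opj


def _get_contained_objs(ds):
    # all aggregated metadata object files tracked in the dataset's git index
    objtop = opj('.datalad', 'metadata', 'objects', '')
    return set(f for f in ds.repo.get_indexed_files()
               if f.startswith(objtop))


def _get_referenced_objs(ds):
    # all metadata object files referenced by the aggregate info records
    return set(
        op.relpath(rec[key], start=ds.path)
        for rec in ds.metadata(get_aggregates=True)
        for key in ('content_info', 'dataset_info')
        if key in rec)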
Example #4
0
def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    assert_repo_status(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True,
                            update_mode='all',
                            force_extraction=True)
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(objs, list(sorted(base.repo.find(objpath))))
Example #5
0
def test_publish_aggregated(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    base.create('sub', force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)

    # create sibling and publish to it
    spath = opj(path, 'remote')
    base.create_sibling(
        name="local_target",
        sshurl="ssh://localhost",
        target_dir=spath)
    base.publish('.', to='local_target', transfer_data='all')
    remote = Dataset(spath)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # all object files are present in both datasets
    eq_(all(base.repo.file_has_content(objs)), True)
    eq_(all(remote.repo.file_has_content(objs)), True)
    # and we can squeeze the same metadata out
    eq_(
        [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')}
         for i in base.metadata('sub')],
        [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')}
         for i in remote.metadata('sub')],
    )
Example #6
0
def test_aggregate_removal(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.add('.', recursive=True)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=subsub.path)
    # check that we only have object files that are listed in agginfo
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # now delete the deepest subdataset to test cleanup of aggregated objects
    # in the top-level ds
    base.remove(opj('sub', 'subsub'), check=False)
    # now aggregation has to detect that subsub is not simply missing, but gone
    # for good
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    # internally consistent state
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # info on subsub was removed at all levels
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 2)
    res = sub.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 1)
Example #7
0
def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    ok_clean_git(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True, update_mode='all', force_extraction=True)
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(
        objs,
        list(sorted(base.repo.find(objpath)))
    )
Example #8
0
def test_aggregate_with_unavailable_objects_from_subds(path, target):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)

    # now make that a subdataset of a new one, so aggregation needs to get the
    # metadata objects first:
    super = Dataset(target).create()
    super.install("base", source=base.path)
    ok_clean_git(super.path)
    clone = Dataset(opj(super.path, "base"))
    ok_clean_git(clone.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = [o for o in sorted(clone.repo.get_annexed_files(with_content_only=False)) if o.startswith(objpath)]
    eq_(len(objs), 6)
    eq_(all(clone.repo.file_has_content(objs)), False)

    # now aggregate should get those metadata objects
    super.aggregate_metadata(recursive=True, update_mode='all',
                             force_extraction=False)
    eq_(all(clone.repo.file_has_content(objs)), True)
Example #9
0
def test_aggregate_with_unavailable_objects_from_subds(path, target):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)

    # now make that a subdataset of a new one, so aggregation needs to get the
    # metadata objects first:
    super = Dataset(target).create()
    super.install("base", source=base.path)
    ok_clean_git(super.path)
    clone = Dataset(opj(super.path, "base"))
    ok_clean_git(clone.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = [
        o
        for o in sorted(clone.repo.get_annexed_files(with_content_only=False))
        if o.startswith(objpath)
    ]
    eq_(len(objs), 6)
    eq_(all(clone.repo.file_has_content(objs)), False)

    # now aggregate should get those metadata objects
    super.aggregate_metadata(recursive=True,
                             update_mode='all',
                             force_extraction=False)
    eq_(all(clone.repo.file_has_content(objs)), True)
Example #10
0
def test_basic_aggregate(path):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True)
    ok_clean_git(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.uninstall('subsub', check=False)
    # now we should be able to re-aggregate the metadata and lose nothing,
    # because we can aggregate the aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True)
    # same result for the aggregate query as for the (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree and lose no metadata
    base.uninstall('sub', recursive=True, check=False)
    assert (not sub.is_installed())
    ok_clean_git(base.path)
    # same result for the aggregate query as for the (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)
Example #11
0
def test_basic_aggregate(path):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    #base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # we will first aggregate the middle dataset on its own, this will
    # serve as a smoke test for the reuse of metadata objects later on
    sub.aggregate_metadata()
    base.save()
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.uninstall('subsub', check=False)
    # now we should be able to re-aggregate the metadata and lose nothing,
    # because we can aggregate the aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True, update_mode='all')
    # same result for the aggregate query as for the (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree and lose no metadata
    base.uninstall('sub', recursive=True, check=False)
    assert (not sub.is_installed())
    assert_repo_status(base.path)
    # same result for the aggregate query as for the (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)
Example #12
0
def test_partial_aggregation(path):
    ds = Dataset(path).create(force=True)
    sub1 = ds.create('sub1', force=True)
    sub2 = ds.create('sub2', force=True)
    ds.add('.', recursive=True)
    ds.aggregate_metadata(recursive=True)
    # baseline, recursive aggregation gets us something for all three datasets
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # now let's do partial aggregation from just one subdataset
    # we should not lose information on the other datasets,
    # as this would be a problem any time anything in a dataset
    # subtree is missing: not installed, too expensive to reaggregate, ...
    ds.aggregate_metadata(path='sub1', incremental=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # from-scratch aggregation kills datasets that were not listed
    ds.aggregate_metadata(path='sub1', incremental=False)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 2)
    assert_result_count(res, 0, path=sub2.path)
    # now reaggregated in full
    ds.aggregate_metadata(recursive=True)
    # make change in sub1
    sub1.unlock('here')
    with open(opj(sub1.path, 'here'), 'w') as f:
        f.write('fresh')
    ds.save(recursive=True)
    ok_clean_git(path)
Example #13
0
def test_nested_metadata(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    ds.aggregate_metadata()
    # BIDS returns participant info as a nested dict for each file in the
    # content metadata. On the dataset-level this should automatically
    # yield a sequence of participant info dicts, without any further action
    # or BIDS-specific configuration
    meta = ds.metadata('.', reporton='datasets',
                       return_type='item-or-list')['metadata']
    for i in zip(
            sorted(
                meta['datalad_unique_content_properties']['bids']['subject'],
                key=lambda x: x['id']),
            sorted([{
                "age(years)": "20-25",
                "id": "03",
                "gender": "female",
                "handedness": "r",
                "hearing_problems_current": "n",
                "language": "english"
            }, {
                "age(years)": "30-35",
                "id": "01",
                "gender": 'n/a',
                "handedness": "r",
                "hearing_problems_current": "n",
                "language": u"русский"
            }],
                   key=lambda x: x['id'])):
        assert_dict_equal(i[0], i[1])
    # we can turn off this kind of auto-summary
    ds.config.add('datalad.metadata.generate-unique-bids',
                  'false',
                  where='dataset')
    ds.aggregate_metadata()
    meta = ds.metadata('.', reporton='datasets',
                       return_type='item-or-list')['metadata']
    # protect next test a little, in case we enhance our core extractor in the future
    # to provide more info
    if 'datalad_unique_content_properties' in meta:
        assert_not_in('bids', meta['datalad_unique_content_properties'])
Example #14
0
def test_partial_aggregation(path):
    ds = Dataset(path).create(force=True)
    sub1 = ds.create('sub1', force=True)
    sub2 = ds.create('sub2', force=True)
    ds.add('.', recursive=True)

    # if we aggregate given path(s) with recursion enabled, we must not recurse
    # into the dataset itself and aggregate others
    ds.aggregate_metadata(path='sub1', recursive=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 1, path=ds.path)
    assert_result_count(res, 1, path=sub1.path)
    # so no metadata aggregates for sub2 yet
    assert_result_count(res, 0, path=sub2.path)

    ds.aggregate_metadata(recursive=True)
    # baseline, recursive aggregation gets us something for all three datasets
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # now let's do partial aggregation from just one subdataset
    # we should not lose information on the other datasets,
    # as this would be a problem any time anything in a dataset
    # subtree is missing: not installed, too expensive to reaggregate, ...
    ds.aggregate_metadata(path='sub1', incremental=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # from-scratch aggregation kills datasets that were not listed
    ds.aggregate_metadata(path='sub1', incremental=False)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # now reaggregated in full
    ds.aggregate_metadata(recursive=True)
    # make change in sub1
    sub1.unlock('here')
    with open(opj(sub1.path, 'here'), 'w') as f:
        f.write('fresh')
    ds.save(recursive=True)
    ok_clean_git(path)
Example #15
0
def test_publish_aggregated(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    base.create('sub', force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)

    # create sibling and publish to it
    # Note: Use realpath() below because we know that the resolved temporary
    # test directory exists in the target (many tests rely on that), but it
    # doesn't necessarily have the unresolved variant.
    spath = op.realpath(opj(path, 'remote'))
    base.create_sibling(name="local_target",
                        sshurl="ssh://datalad-test",
                        target_dir=spath)
    base.publish('.', to='local_target', transfer_data='all')
    remote = Dataset(spath)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # all object files are present in both datasets
    eq_(all(base.repo.file_has_content(objs)), True)
    eq_(all(remote.repo.file_has_content(objs)), True)
    # and we can squeeze the same metadata out
    eq_(
        [{
            k: v
            for k, v in i.items() if k not in ('path', 'refds', 'parentds')
        } for i in base.metadata('sub')],
        [{
            k: v
            for k, v in i.items() if k not in ('path', 'refds', 'parentds')
        } for i in remote.metadata('sub')],
    )
Example #16
0
def test_add_readme(path=None):
    ds = Dataset(path).create(force=True)
    ds.save()
    ds.aggregate_metadata()
    assert_repo_status(ds.path)
    assert_status('ok', ds.add_readme())
    # should use default name
    content = open(opj(path, 'README.md')).read()
    ok_startswith(
        content, """\
# Dataset "demo_ds"

this is for play

### Authors

- Betty
- Tom

### License

PDDL

## General information

This is a DataLad dataset (id: {id}).
""".format(id=ds.id))
    # make sure that central README references are present
    assert_in(
        """More information on how to install DataLad and [how to install](http://handbook.datalad.org/en/latest/intro/installation.html)
it can be found in the [DataLad Handbook](https://handbook.datalad.org/en/latest/index.html).
""", content)
    # no unexpectedly long lines
    assert all([len(l) < 160 for l in content.splitlines()])

    # should skip on re-run
    assert_status('notneeded', ds.add_readme())
Example #17
0
def test_aggregate_with_unavailable_objects_from_subds(path, target):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)

    # now make that a subdataset of a new one, so aggregation needs to get the
    # metadata objects first:
    super = Dataset(target).create()
    super.install("base", source=base.path)
    assert_repo_status(super.path)
    clone = Dataset(opj(super.path, "base"))
    assert_repo_status(clone.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = clone.repo.get_content_annexinfo(paths=[objpath],
                                            init=None,
                                            eval_availability=True)
    eq_(len(objs), 6)
    assert_false(any(st["has_content"] for st in objs.values()))

    # now aggregate should get those metadata objects
    super.aggregate_metadata(recursive=True,
                             update_mode='all',
                             force_extraction=False)
    objs_after = clone.repo.get_content_annexinfo(paths=objs,
                                                  init=None,
                                                  eval_availability=True)
    assert_true(all(st["has_content"] for st in objs_after.values()))
Example #18
0
def test_publish_aggregated(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    base.create('sub', force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)

    # create sibling and publish to it
    spath = opj(path, 'remote')
    base.create_sibling(name="local_target",
                        sshurl="ssh://localhost",
                        target_dir=spath)
    base.publish('.', to='local_target', transfer_data='all')
    remote = Dataset(spath)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = [o for o in sorted(base.repo.find(objpath).split('\n')) if o]
    # all object files are present in both datasets
    eq_(all(base.repo.file_has_content(objs)), True)
    eq_(all(remote.repo.file_has_content(objs)), True)
    # and we can squeeze the same metadata out
    eq_(
        [{
            k: v
            for k, v in i.items() if k not in ('path', 'refds', 'parentds')
        } for i in base.metadata('sub')],
        [{
            k: v
            for k, v in i.items() if k not in ('path', 'refds', 'parentds')
        } for i in remote.metadata('sub')],
    )
Example #19
0
def test_update_strategy(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    # we start clean
    for ds in base, sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the base dataset only, nothing below changes
    base.aggregate_metadata()
    eq_(len(_get_contained_objs(base)), 2)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the entire tree, but by default this only updates
    # the top-level dataset with all objects; none of the leaf
    # or intermediate datasets gets touched
    base.aggregate_metadata(recursive=True)
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_referenced_objs(base)), 6)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # it is impossible to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'impossible',
            ds.metadata(get_aggregates=True, on_failure='ignore'))
    # get the full metadata report
    target_meta = base.metadata(return_type='list')

    # now redo full aggregation, this time updating all
    # (intermediate) datasets
    base.aggregate_metadata(recursive=True, update_mode='all')
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_contained_objs(sub)), 4)
    eq_(len(_get_contained_objs(subsub)), 2)
    # it is now OK to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'ok',
            ds.metadata(get_aggregates=True, on_failure='ignore'))

    # all of that has no impact on the reported metadata
    eq_(target_meta, base.metadata(return_type='list'))
Example #20
0
def test_update_strategy(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # we start clean
    for ds in base, sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the base dataset only, nothing below changes
    base.aggregate_metadata()
    eq_(len(_get_contained_objs(base)), 2)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the entire tree, but by default this only updates
    # the top-level dataset with all objects; none of the leaf
    # or intermediate datasets gets touched
    base.aggregate_metadata(recursive=True)
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_referenced_objs(base)), 6)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # it is impossible to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status('impossible',
                      ds.metadata(get_aggregates=True, on_failure='ignore'))
    # get the full metadata report
    target_meta = base.metadata(return_type='list')

    # now redo full aggregation, this time updating all
    # (intermediate) datasets
    base.aggregate_metadata(recursive=True, update_mode='all')
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_contained_objs(sub)), 4)
    eq_(len(_get_contained_objs(subsub)), 2)
    # it is now OK to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status('ok',
                      ds.metadata(get_aggregates=True, on_failure='ignore'))

    # all of that has no impact on the reported metadata
    eq_(target_meta, base.metadata(return_type='list'))
Example #21
0
def test_custom_native_merge(path):
    ds = Dataset(path).create(force=True)
    # no metadata, because nothing is committed
    _assert_metadata_empty(
        ds.metadata(reporton='datasets',
                    result_xfm='metadata',
                    return_type='item-or-list'))
    # enable BIDS metadata, BIDS metadata should become THE metadata
    ds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    ds.aggregate_metadata()
    # no metadata, because still nothing is committed
    _assert_metadata_empty(
        ds.metadata(reporton='datasets',
                    result_xfm='metadata',
                    return_type='item-or-list'))
    ds.add('.')
    ds.aggregate_metadata()
    meta = ds.metadata(reporton='datasets',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'myds', 'author': ['one', 'two']}, meta)
    # now give the ds a custom name, must override the native one
    # but authors still come from BIDS
    ds.metadata(apply2global=True, add=dict(name='mycustom'))
    meta = ds.metadata(reporton='datasets',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'mycustom', 'author': ['one', 'two']}, meta)
    # we can disable the merge
    meta = ds.metadata(reporton='datasets',
                       merge_native='none',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'mycustom'}, meta)
    # we can accumulate values
    meta = ds.metadata(reporton='datasets',
                       merge_native='add',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({
        'name': ['mycustom', 'myds'],
        'author': ['one', 'two']
    }, meta)
    # we can have native override custom (not sure when needed, though)
    # add one more custom to make visible
    ds.metadata(apply2global=True, init=dict(homepage='fresh'))
    meta = ds.metadata(reporton='datasets',
                       merge_native='reset',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal(
        {
            'name': u'myds',
            'author': ['one', 'two'],
            'homepage': u'fresh'
        }, meta)
    # enable an additional metadata source
    ds.config.add('datalad.metadata.nativetype',
                  'frictionless_datapackage',
                  where='dataset')
    # we need to reaggregate after the config change
    ds.aggregate_metadata(merge_native='add')
    meta = ds.metadata(reporton='datasets',
                       merge_native='add',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal(
        {
            'name': ['mycustom', 'myds', 'someother'],
            'author': ['one', 'two'],
            'homepage': u'fresh'
        }, meta)
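# Note: Example #21 uses _assert_metadata_empty() and _clean_meta(), which are
# not shown in this collection. A minimal sketch is given below, assuming that
# an otherwise "empty" metadata record may still carry bookkeeping keys such as
# '@id' or '@context' (the exact key set is an assumption).


def _assert_metadata_empty(meta):
    # a record counts as empty if it has nothing beyond bookkeeping keys
    ignore = {'@id', '@context'}
    assert not len(meta) or set(meta.keys()).issubset(ignore), \
        'metadata record is not empty: {}'.format(
            {k: meta[k] for k in meta if k not in ignore})


def _clean_meta(meta):
    # strip bookkeeping keys in place so only extractor-provided fields remain
    for key in ('@id', '@context', 'datalad_unique_content_properties'):
        meta.pop(key, None)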