def test_basic_aggregate(path):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    #base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    # we will first aggregate the middle dataset on its own, this will
    # serve as a smoke test for the reuse of metadata objects later on
    sub.aggregate_metadata()
    base.save()
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.uninstall('subsub', check=False)
    # now we should be able to reaggregate metadata, and lose nothing,
    # because we can aggregate the aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True, update_mode='all')
    # same result for an aggregate query as for a (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree, and lose no metadata
    base.uninstall('sub', recursive=True, check=False)
    assert not sub.is_installed()
    ok_clean_git(base.path)
    # same result for an aggregate query as for a (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)


def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    # weird that it comes out as a string...
    objs = [o for o in sorted(base.repo.find(objpath).split('\n')) if o]
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    ok_clean_git(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True, update_mode='all')
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(objs, [o for o in sorted(base.repo.find(objpath).split('\n')) if o])


def test_aggregate_removal(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.save(recursive=True)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=subsub.path)
    # check that we only have object files that are listed in agginfo
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # now delete the deepest subdataset to test cleanup of aggregated objects
    # in the top-level ds
    base.remove(opj('sub', 'subsub'), check=False)
    # now aggregation has to detect that subsub is not simply missing, but gone
    # for good
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    # internally consistent state
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # info on subsub was removed at all levels
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 2)
    res = sub.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 1)


def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    assert_repo_status(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True,
                            update_mode='all',
                            force_extraction=True)
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(objs, list(sorted(base.repo.find(objpath))))


def test_publish_aggregated(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    base.create('sub', force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    # create sibling and publish to it
    spath = opj(path, 'remote')
    base.create_sibling(
        name="local_target",
        sshurl="ssh://localhost",
        target_dir=spath)
    base.publish('.', to='local_target', transfer_data='all')
    remote = Dataset(spath)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # all object files are present in both datasets
    eq_(all(base.repo.file_has_content(objs)), True)
    eq_(all(remote.repo.file_has_content(objs)), True)
    # and we can squeeze the same metadata out
    eq_(
        [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')}
         for i in base.metadata('sub')],
        [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')}
         for i in remote.metadata('sub')],
    )


def test_aggregate_removal(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.add('.', recursive=True)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=subsub.path)
    # check that we only have object files that are listed in agginfo
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # now delete the deepest subdataset to test cleanup of aggregated objects
    # in the top-level ds
    base.remove(opj('sub', 'subsub'), check=False)
    # now aggregation has to detect that subsub is not simply missing, but gone
    # for good
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    # internally consistent state
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # info on subsub was removed at all levels
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 2)
    res = sub.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 1)


def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    ok_clean_git(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True,
                            update_mode='all',
                            force_extraction=True)
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(
        objs,
        list(sorted(base.repo.find(objpath)))
    )


def test_aggregate_with_unavailable_objects_from_subds(path, target):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    # now make that a subdataset of a new one, so aggregation needs to get the
    # metadata objects first:
    super = Dataset(target).create()
    super.install("base", source=base.path)
    ok_clean_git(super.path)
    clone = Dataset(opj(super.path, "base"))
    ok_clean_git(clone.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = [o for o in sorted(clone.repo.get_annexed_files(with_content_only=False))
            if o.startswith(objpath)]
    eq_(len(objs), 6)
    eq_(all(clone.repo.file_has_content(objs)), False)
    # now aggregate should get those metadata objects
    super.aggregate_metadata(recursive=True, update_mode='all',
                             force_extraction=False)
    eq_(all(clone.repo.file_has_content(objs)), True)


def test_basic_aggregate(path):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True)
    ok_clean_git(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.uninstall('subsub', check=False)
    # now we should be able to reaggregate metadata, and lose nothing,
    # because we can aggregate the aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True)
    # same result for an aggregate query as for a (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree, and lose no metadata
    base.uninstall('sub', recursive=True, check=False)
    assert not sub.is_installed()
    ok_clean_git(base.path)
    # same result for an aggregate query as for a (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)


def test_basic_aggregate(path):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    #base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # we will first aggregate the middle dataset on its own, this will
    # serve as a smoke test for the reuse of metadata objects later on
    sub.aggregate_metadata()
    base.save()
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.uninstall('subsub', check=False)
    # now we should be able to reaggregate metadata, and lose nothing,
    # because we can aggregate the aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True, update_mode='all')
    # same result for an aggregate query as for a (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree, and lose no metadata
    base.uninstall('sub', recursive=True, check=False)
    assert not sub.is_installed()
    assert_repo_status(base.path)
    # same result for an aggregate query as for a (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)


def test_partial_aggregation(path):
    ds = Dataset(path).create(force=True)
    sub1 = ds.create('sub1', force=True)
    sub2 = ds.create('sub2', force=True)
    ds.add('.', recursive=True)
    ds.aggregate_metadata(recursive=True)
    # baseline, recursive aggregation gets us something for all three datasets
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # now let's do partial aggregation from just one subdataset
    # we should not lose information on the other datasets,
    # as this would be a problem any time anything in a dataset
    # subtree is missing: not installed, too expensive to reaggregate, ...
    ds.aggregate_metadata(path='sub1', incremental=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # from-scratch aggregation kills datasets that were not listed
    ds.aggregate_metadata(path='sub1', incremental=False)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 2)
    assert_result_count(res, 0, path=sub2.path)
    # now reaggregate in full
    ds.aggregate_metadata(recursive=True)
    # make a change in sub1
    sub1.unlock('here')
    with open(opj(sub1.path, 'here'), 'w') as f:
        f.write('fresh')
    ds.save(recursive=True)
    ok_clean_git(path)


def test_nested_metadata(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    ds.aggregate_metadata()
    # BIDS returns participant info as a nested dict for each file in the
    # content metadata. On the dataset-level this should automatically
    # yield a sequence of participant info dicts, without any further action
    # or BIDS-specific configuration
    meta = ds.metadata('.', reporton='datasets',
                       return_type='item-or-list')['metadata']
    for i in zip(
            sorted(
                meta['datalad_unique_content_properties']['bids']['subject'],
                key=lambda x: x['id']),
            sorted([{
                "age(years)": "20-25",
                "id": "03",
                "gender": "female",
                "handedness": "r",
                "hearing_problems_current": "n",
                "language": "english"
            }, {
                "age(years)": "30-35",
                "id": "01",
                "gender": 'n/a',
                "handedness": "r",
                "hearing_problems_current": "n",
                "language": u"русский"
            }], key=lambda x: x['id'])):
        assert_dict_equal(i[0], i[1])
    # we can turn off this kind of auto-summary
    ds.config.add('datalad.metadata.generate-unique-bids', 'false',
                  where='dataset')
    ds.aggregate_metadata()
    meta = ds.metadata('.', reporton='datasets',
                       return_type='item-or-list')['metadata']
    # protect next test a little, in case we enhance our core extractor in
    # the future to provide more info
    if 'datalad_unique_content_properties' in meta:
        assert_not_in('bids', meta['datalad_unique_content_properties'])


def test_partial_aggregation(path):
    ds = Dataset(path).create(force=True)
    sub1 = ds.create('sub1', force=True)
    sub2 = ds.create('sub2', force=True)
    ds.add('.', recursive=True)
    # if we aggregate specific path(s) and say to recurse, we must not recurse
    # into the dataset itself and aggregate others
    ds.aggregate_metadata(path='sub1', recursive=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 1, path=ds.path)
    assert_result_count(res, 1, path=sub1.path)
    # so no metadata aggregates for sub2 yet
    assert_result_count(res, 0, path=sub2.path)
    ds.aggregate_metadata(recursive=True)
    # baseline, recursive aggregation gets us something for all three datasets
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # now let's do partial aggregation from just one subdataset
    # we should not lose information on the other datasets,
    # as this would be a problem any time anything in a dataset
    # subtree is missing: not installed, too expensive to reaggregate, ...
    ds.aggregate_metadata(path='sub1', incremental=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # from-scratch aggregation kills datasets that were not listed
    ds.aggregate_metadata(path='sub1', incremental=False)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # now reaggregate in full
    ds.aggregate_metadata(recursive=True)
    # make a change in sub1
    sub1.unlock('here')
    with open(opj(sub1.path, 'here'), 'w') as f:
        f.write('fresh')
    ds.save(recursive=True)
    ok_clean_git(path)


def test_publish_aggregated(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    base.create('sub', force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    # create sibling and publish to it
    # Note: use realpath() below because we know that the resolved temporary
    # test directory exists in the target (many tests rely on that), but it
    # doesn't necessarily have the unresolved variant.
    spath = op.realpath(opj(path, 'remote'))
    base.create_sibling(name="local_target",
                        sshurl="ssh://datalad-test",
                        target_dir=spath)
    base.publish('.', to='local_target', transfer_data='all')
    remote = Dataset(spath)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # all object files are present in both datasets
    eq_(all(base.repo.file_has_content(objs)), True)
    eq_(all(remote.repo.file_has_content(objs)), True)
    # and we can squeeze the same metadata out
    eq_(
        [{k: v for k, v in i.items()
          if k not in ('path', 'refds', 'parentds')}
         for i in base.metadata('sub')],
        [{k: v for k, v in i.items()
          if k not in ('path', 'refds', 'parentds')}
         for i in remote.metadata('sub')],
    )


def test_add_readme(path=None):
    ds = Dataset(path).create(force=True)
    ds.save()
    ds.aggregate_metadata()
    assert_repo_status(ds.path)
    assert_status('ok', ds.add_readme())
    # should use default name
    content = open(opj(path, 'README.md')).read()
    ok_startswith(
        content,
        """\
# Dataset "demo_ds"

this is for play

### Authors

- Betty
- Tom

### License

PDDL

## General information

This is a DataLad dataset (id: {id}).
""".format(id=ds.id))
    # make sure that central README references are present
    assert_in(
        """More information on how to install DataLad and [how to install](http://handbook.datalad.org/en/latest/intro/installation.html) it can be found in the [DataLad Handbook](https://handbook.datalad.org/en/latest/index.html).
""", content)
    # no unexpectedly long lines
    assert all([len(l) < 160 for l in content.splitlines()])
    # should skip on re-run
    assert_status('notneeded', ds.add_readme())


def test_aggregate_with_unavailable_objects_from_subds(path, target):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    # now make that a subdataset of a new one, so aggregation needs to get the
    # metadata objects first:
    super = Dataset(target).create()
    super.install("base", source=base.path)
    assert_repo_status(super.path)
    clone = Dataset(opj(super.path, "base"))
    assert_repo_status(clone.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = clone.repo.get_content_annexinfo(paths=[objpath],
                                            init=None,
                                            eval_availability=True)
    eq_(len(objs), 6)
    assert_false(any(st["has_content"] for st in objs.values()))
    # now aggregate should get those metadata objects
    super.aggregate_metadata(recursive=True, update_mode='all',
                             force_extraction=False)
    objs_after = clone.repo.get_content_annexinfo(paths=objs,
                                                  init=None,
                                                  eval_availability=True)
    assert_true(all(st["has_content"] for st in objs_after.values()))


def test_publish_aggregated(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    base.create('sub', force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    # create sibling and publish to it
    spath = opj(path, 'remote')
    base.create_sibling(name="local_target",
                        sshurl="ssh://localhost",
                        target_dir=spath)
    base.publish('.', to='local_target', transfer_data='all')
    remote = Dataset(spath)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = [o for o in sorted(base.repo.find(objpath).split('\n')) if o]
    # all object files are present in both datasets
    eq_(all(base.repo.file_has_content(objs)), True)
    eq_(all(remote.repo.file_has_content(objs)), True)
    # and we can squeeze the same metadata out
    eq_(
        [{k: v for k, v in i.items()
          if k not in ('path', 'refds', 'parentds')}
         for i in base.metadata('sub')],
        [{k: v for k, v in i.items()
          if k not in ('path', 'refds', 'parentds')}
         for i in remote.metadata('sub')],
    )


def test_update_strategy(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    # we start clean
    for ds in base, sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the base dataset only, nothing below changes
    base.aggregate_metadata()
    eq_(len(_get_contained_objs(base)), 2)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the entire tree, but by default this only updates
    # the top-level dataset with all objects; none of the leaf
    # or intermediate datasets gets touched
    base.aggregate_metadata(recursive=True)
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_referenced_objs(base)), 6)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # it is impossible to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'impossible',
            ds.metadata(get_aggregates=True, on_failure='ignore'))
    # get the full metadata report
    target_meta = base.metadata(return_type='list')
    # now redo full aggregation, this time updating all
    # (intermediate) datasets
    base.aggregate_metadata(recursive=True, update_mode='all')
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_contained_objs(sub)), 4)
    eq_(len(_get_contained_objs(subsub)), 2)
    # it is now OK to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'ok',
            ds.metadata(get_aggregates=True, on_failure='ignore'))
    # all of that has no impact on the reported metadata
    eq_(target_meta, base.metadata(return_type='list'))


def test_update_strategy(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # we start clean
    for ds in base, sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the base dataset only, nothing below changes
    base.aggregate_metadata()
    eq_(len(_get_contained_objs(base)), 2)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the entire tree, but by default this only updates
    # the top-level dataset with all objects; none of the leaf
    # or intermediate datasets gets touched
    base.aggregate_metadata(recursive=True)
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_referenced_objs(base)), 6)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # it is impossible to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status('impossible',
                      ds.metadata(get_aggregates=True, on_failure='ignore'))
    # get the full metadata report
    target_meta = base.metadata(return_type='list')
    # now redo full aggregation, this time updating all
    # (intermediate) datasets
    base.aggregate_metadata(recursive=True, update_mode='all')
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_contained_objs(sub)), 4)
    eq_(len(_get_contained_objs(subsub)), 2)
    # it is now OK to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status('ok',
                      ds.metadata(get_aggregates=True, on_failure='ignore'))
    # all of that has no impact on the reported metadata
    eq_(target_meta, base.metadata(return_type='list'))


def test_custom_native_merge(path):
    ds = Dataset(path).create(force=True)
    # no metadata, because nothing is committed
    _assert_metadata_empty(
        ds.metadata(reporton='datasets',
                    result_xfm='metadata',
                    return_type='item-or-list'))
    # enable BIDS metadata, BIDS metadata should become THE metadata
    ds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    ds.aggregate_metadata()
    # no metadata, because still nothing is committed
    _assert_metadata_empty(
        ds.metadata(reporton='datasets',
                    result_xfm='metadata',
                    return_type='item-or-list'))
    ds.add('.')
    ds.aggregate_metadata()
    meta = ds.metadata(reporton='datasets',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'myds', 'author': ['one', 'two']}, meta)
    # now give the ds a custom name, must override the native one,
    # but authors still come from BIDS
    ds.metadata(apply2global=True, add=dict(name='mycustom'))
    meta = ds.metadata(reporton='datasets',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'mycustom', 'author': ['one', 'two']}, meta)
    # we can disable the merge
    meta = ds.metadata(reporton='datasets',
                       merge_native='none',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'mycustom'}, meta)
    # we can accumulate values
    meta = ds.metadata(reporton='datasets',
                       merge_native='add',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal(
        {'name': ['mycustom', 'myds'], 'author': ['one', 'two']},
        meta)
    # we can have native override custom (not sure when needed, though)
    # add one more custom value to make it visible
    ds.metadata(apply2global=True, init=dict(homepage='fresh'))
    meta = ds.metadata(reporton='datasets',
                       merge_native='reset',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal(
        {'name': u'myds',
         'author': ['one', 'two'],
         'homepage': u'fresh'},
        meta)
    # enable an additional metadata source
    ds.config.add('datalad.metadata.nativetype',
                  'frictionless_datapackage',
                  where='dataset')
    # we need to reaggregate after the config change
    ds.aggregate_metadata(merge_native='add')
    meta = ds.metadata(reporton='datasets',
                       merge_native='add',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal(
        {'name': ['mycustom', 'myds', 'someother'],
         'author': ['one', 'two'],
         'homepage': u'fresh'},
        meta)