def test_copy_file_errors(dspath1, dspath2, nondspath): ds1 = Dataset(dspath1) # nothing given assert_raises(ValueError, copy_file) # no target directory given assert_raises(ValueError, ds1.copy_file, 'somefile') # using multiple sources and --specs-from assert_raises(ValueError, ds1.copy_file, ['1', '2', '3'], specs_from='-') # trying to copy to a dir that is not in a dataset ds1.create() assert_status( 'error', ds1.copy_file('somepath', target_dir=nondspath, on_failure='ignore')) # copy into a dataset that is not in the reference dataset ds2 = Dataset(dspath2).create() assert_status( 'error', ds1.copy_file('somepath', target_dir=dspath2, on_failure='ignore')) # attempt to copy from a directory, but no recursion is enabled. # use no reference ds to exercise a different code path assert_status('impossible', copy_file([nondspath, dspath1], on_failure='ignore')) # attempt to copy a file that doesn't exist assert_status('impossible', copy_file(['funky', dspath1], on_failure='ignore')) # attempt to copy a file without a destination given assert_raises(ValueError, copy_file, 'somepath') assert_status('impossible', copy_file(specs_from=['somepath'], on_failure='ignore'))
def test_aggregate_with_unavailable_objects_from_subds(path, target): base = Dataset(opj(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n') sub = base.create('sub', force=True) subsub = base.create(opj('sub', 'subsub'), force=True) base.add('.', recursive=True) ok_clean_git(base.path) base.aggregate_metadata(recursive=True, update_mode='all') ok_clean_git(base.path) # now make that a subdataset of a new one, so aggregation needs to get the # metadata objects first: super = Dataset(target).create() super.install("base", source=base.path) ok_clean_git(super.path) clone = Dataset(opj(super.path, "base")) ok_clean_git(clone.path) objpath = opj('.datalad', 'metadata', 'objects') objs = [o for o in sorted(clone.repo.get_annexed_files(with_content_only=False)) if o.startswith(objpath)] eq_(len(objs), 6) eq_(all(clone.repo.file_has_content(objs)), False) # now aggregate should get those metadata objects super.aggregate_metadata(recursive=True, update_mode='all', force_extraction=False) eq_(all(clone.repo.file_has_content(objs)), True)
def test_run_from_subds_gh3551(path): ds = Dataset(path).create(force=True) ds.save() ds.create("output") with chpwd(op.join(ds.path, "sub")): assert_in_results( run("echo", inputs=[op.join(op.pardir, "sub", "input")], outputs=[op.join(op.pardir, "output")], return_type="list", result_filter=None, result_xfm=None), action="get", status="notneeded") assert_repo_status(ds.path) subds_path = op.join("output", "subds") ds.create(subds_path) with chpwd(op.join(ds.path, "sub")): output_dir = op.join(op.pardir, "output", "subds") # The below command is trying to be compatible. It could be made better # (e.g., actually using the input file) by someone that knows something # about Windows. assert_in_results( run("cd .> {}".format(op.join(output_dir, "f")), inputs=[op.join(op.pardir, "sub", "input")], outputs=[output_dir], return_type="list", result_filter=None, result_xfm=None), action="save", status="ok") assert_repo_status(ds.path) subds = Dataset(op.join(ds.path, subds_path)) ok_exists(op.join(subds.path, "f")) if not ds.repo.is_managed_branch(): # FIXME # This check fails on Windows: # https://github.com/datalad/datalad/pull/3747/checks?check_run_id=248506560#step:8:254 ok_(subds.repo.file_has_content("f"))
def test_subsuperdataset_save(path): # Verify that when invoked without recursion save does not # cause querying of subdatasets of the subdataset # see https://github.com/datalad/datalad/issues/4523 parent = Dataset(path).create() # Create 3 levels of subdatasets so later to check operation # with or without --dataset being specified sub1 = parent.create('sub1') sub2 = parent.create(sub1.pathobj / 'sub2') sub3 = parent.create(sub2.pathobj / 'sub3') assert_repo_status(path) # now we will lobotomize that sub3 so git would fail if any query is performed. (sub3.pathobj / '.git' / 'config').chmod(0o000) try: sub3.repo.call_git(['ls-files'], read_only=True) raise SkipTest except CommandError: # desired outcome pass # the call should proceed fine since neither should care about sub3 # default is no recursion parent.save('sub1') sub1.save('sub2') assert_raises(CommandError, parent.save, 'sub1', recursive=True) # and should not fail in the top level superdataset with chpwd(parent.path): save('sub1') # or in a subdataset above the problematic one with chpwd(sub1.path): save('sub2')
def test_add_recursive(path): # make simple hierarchy parent = Dataset(path).create() assert_repo_status(parent.path) sub1 = parent.create(op.join('down', 'sub1')) assert_repo_status(parent.path) sub2 = parent.create('sub2') # next one make the parent dirty subsub = sub2.create('subsub') assert_repo_status(parent.path, modified=['sub2']) res = parent.save() assert_repo_status(parent.path) # now add content deep in the hierarchy create_tree(subsub.path, {'new': 'empty'}) assert_repo_status(parent.path, modified=['sub2']) # recursive add should not even touch sub1, because # it knows that it is clean res = parent.save(recursive=True, jobs=5) # the key action is done assert_result_count(res, 1, path=op.join(subsub.path, 'new'), action='add', status='ok') # saved all the way up assert_result_count(res, 3, action='save', status='ok') assert_repo_status(parent.path)
def test_add_recursive(path): # make simple hierarchy parent = Dataset(path).create() assert_repo_status(parent.path) sub1 = parent.create(op.join('down', 'sub1')) assert_repo_status(parent.path) sub2 = parent.create('sub2') # next one make the parent dirty subsub = sub2.create('subsub') assert_repo_status(parent.path, modified=['sub2']) res = parent.save() assert_repo_status(parent.path) # now add content deep in the hierarchy create_tree(subsub.path, {'new': 'empty'}) assert_repo_status(parent.path, modified=['sub2']) # recursive add should not even touch sub1, because # it knows that it is clean res = parent.save(recursive=True) # the key action is done assert_result_count( res, 1, path=op.join(subsub.path, 'new'), action='add', status='ok') # saved all the way up assert_result_count(res, 3, action='save', status='ok') assert_repo_status(parent.path)
def test_subsuperdataset_save(path): # Verify that when invoked without recursion save does not # cause querying of subdatasets of the subdataset # see https://github.com/datalad/datalad/issues/4523 parent = Dataset(path).create() # Create 3 levels of subdatasets so later to check operation # with or without --dataset being specified sub1 = parent.create('sub1') sub2 = parent.create(sub1.pathobj / 'sub2') sub3 = parent.create(sub2.pathobj / 'sub3') assert_repo_status(path) # now we will lobotomize that sub2 so git would fail if any query is performed. rmtree(str(sub3.pathobj / '.git' / 'objects')) # the call should proceed fine since neither should care about sub3 # default is no recursion parent.save('sub1') sub1.save('sub2') assert_raises(CommandError, parent.save, 'sub1', recursive=True) # and should fail if we request saving while in the parent directory # but while not providing a dataset, since operation would run within # pointed subdataset with chpwd(sub1.path): assert_raises(CommandError, save, 'sub2') # but should not fail in the top level superdataset with chpwd(parent.path): save('sub1')
def test_no_leaks(path1, path2): ds1 = Dataset(path1).create() ds1.config.set('i.was.here', 'today', where='local') assert_in('i.was.here', ds1.config.keys()) ds1.config.reload() assert_in('i.was.here', ds1.config.keys()) # now we move into this one repo, and create another # make sure that no config from ds1 leaks into ds2 with chpwd(path1): ds2 = Dataset(path2) assert_not_in('i.was.here', ds2.config.keys()) ds2.config.reload() assert_not_in('i.was.here', ds2.config.keys()) ds2.create() assert_not_in('i.was.here', ds2.config.keys()) # and that we do not track the wrong files assert_not_in(ds1.pathobj / '.git' / 'config', ds2.config._stores['git']['files']) assert_not_in(ds1.pathobj / '.datalad' / 'config', ds2.config._stores['dataset']['files']) # these are the right ones assert_in(ds2.pathobj / '.git' / 'config', ds2.config._stores['git']['files']) assert_in(ds2.pathobj / '.datalad' / 'config', ds2.config._stores['dataset']['files'])
def test_dirty(path): for mode in _dirty_modes: # does nothing without a dataset handle_dirty_dataset(None, mode) # placeholder, but not yet created ds = Dataset(path) # unknown mode assert_raises(ValueError, handle_dirty_dataset, ds, 'MADEUP') # not yet created is very dirty assert_raises(RuntimeError, handle_dirty_dataset, ds, 'fail') handle_dirty_dataset(ds, 'ignore') assert_raises(RuntimeError, handle_dirty_dataset, ds, 'save-before') # should yield a clean repo ds.create() orig_state = ds.repo.get_hexsha() _check_all_clean(ds, orig_state) # tainted: untracked with open(opj(ds.path, 'something'), 'w') as f: f.write('some') # we don't want to auto-add untracked files by saving (anymore) assert_raises(AssertionError, _check_auto_save, ds, orig_state) # tainted: staged ds.repo.add('something', git=True) orig_state = _check_auto_save(ds, orig_state) # tainted: submodule # not added to super on purpose! subds = ds.create('subds') _check_all_clean(subds, subds.repo.get_hexsha()) assert_repo_status(ds.path) # subdataset must be added as a submodule! assert_equal(ds.subdatasets(result_xfm='relpaths'), ['subds'])
def test_basic_aggregate(path): # TODO give datasets some more metadata to actually aggregate stuff base = Dataset(opj(path, 'origin')).create(force=True) sub = base.create('sub', force=True) #base.metadata(sub.path, init=dict(homepage='this'), apply2global=True) subsub = base.create(opj('sub', 'subsub'), force=True) base.save(recursive=True) assert_repo_status(base.path) # we will first aggregate the middle dataset on its own, this will # serve as a smoke test for the reuse of metadata objects later on sub.aggregate_metadata() base.save() assert_repo_status(base.path) base.aggregate_metadata(recursive=True, update_mode='all') assert_repo_status(base.path) direct_meta = base.metadata(recursive=True, return_type='list') # loose the deepest dataset sub.uninstall('subsub', check=False) # no we should eb able to reaggregate metadata, and loose nothing # because we can aggregate aggregated metadata of subsub from sub base.aggregate_metadata(recursive=True, update_mode='all') # same result for aggregate query than for (saved) direct query agg_meta = base.metadata(recursive=True, return_type='list') for d, a in zip(direct_meta, agg_meta): print(d['path'], a['path']) assert_dict_equal(d, a) # no we can throw away the subdataset tree, and loose no metadata base.uninstall('sub', recursive=True, check=False) assert (not sub.is_installed()) assert_repo_status(base.path) # same result for aggregate query than for (saved) direct query agg_meta = base.metadata(recursive=True, return_type='list') for d, a in zip(direct_meta, agg_meta): assert_dict_equal(d, a)
def test_reaggregate_with_unavailable_objects(path): base = Dataset(opj(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n' ) sub = base.create('sub', force=True) subsub = base.create(opj('sub', 'subsub'), force=True) base.save(recursive=True) assert_repo_status(base.path) base.aggregate_metadata(recursive=True, update_mode='all') assert_repo_status(base.path) objpath = opj('.datalad', 'metadata', 'objects') objs = list(sorted(base.repo.find(objpath))) # we have 3x2 metadata sets (dataset/files) under annex eq_(len(objs), 6) eq_(all(base.repo.file_has_content(objs)), True) # drop all object content base.drop(objs, check=False) eq_(all(base.repo.file_has_content(objs)), False) assert_repo_status(base.path) # now re-aggregate, the state hasn't changed, so the file names will # be the same base.aggregate_metadata(recursive=True, update_mode='all', force_extraction=True) eq_(all(base.repo.file_has_content(objs)), True) # and there are no new objects eq_(objs, list(sorted(base.repo.find(objpath))))
def test_gh1426(origin_path, target_path): # set up a pair of repos, one the published copy of the other origin = Dataset(origin_path).create() target = mk_push_target(origin, 'target', target_path, annex=True, bare=False) origin.push(to='target') assert_repo_status(origin.path) assert_repo_status(target.path) eq_(origin.repo.get_hexsha(DEFAULT_BRANCH), target.get_hexsha(DEFAULT_BRANCH)) # gist of #1426 is that a newly added subdataset does not cause the # superdataset to get published origin.create('sub') assert_repo_status(origin.path) neq_(origin.repo.get_hexsha(DEFAULT_BRANCH), target.get_hexsha(DEFAULT_BRANCH)) # now push res = origin.push(to='target') assert_result_count(res, 1, status='ok', type='dataset', path=origin.path, action='publish', target='target', operations=['fast-forward']) eq_(origin.repo.get_hexsha(DEFAULT_BRANCH), target.get_hexsha(DEFAULT_BRANCH))
def test_reaggregate_with_unavailable_objects(path): base = Dataset(opj(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n') sub = base.create('sub', force=True) subsub = base.create(opj('sub', 'subsub'), force=True) base.add('.', recursive=True) ok_clean_git(base.path) base.aggregate_metadata(recursive=True, update_mode='all') ok_clean_git(base.path) objpath = opj('.datalad', 'metadata', 'objects') objs = list(sorted(base.repo.find(objpath))) # we have 3x2 metadata sets (dataset/files) under annex eq_(len(objs), 6) eq_(all(base.repo.file_has_content(objs)), True) # drop all object content base.drop(objs, check=False) eq_(all(base.repo.file_has_content(objs)), False) ok_clean_git(base.path) # now re-aggregate, the state hasn't changed, so the file names will # be the same base.aggregate_metadata(recursive=True, update_mode='all', force_extraction=True) eq_(all(base.repo.file_has_content(objs)), True) # and there are no new objects eq_( objs, list(sorted(base.repo.find(objpath))) )
def test_aggregate_with_unavailable_objects_from_subds(path, target): base = Dataset(op.join(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(op.join(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n' ) sub = base.create('sub', force=True) subsub = base.create(op.join('sub', 'subsub'), force=True) base.save(recursive=True) assert_repo_status(base.path) base.meta_aggregate(recursive=True, into='all') assert_repo_status(base.path) # now make that a subdataset of a new one, so aggregation needs to get the # metadata objects first: super = Dataset(target).create() super.install("base", source=base.path) assert_repo_status(super.path) clone = Dataset(op.join(super.path, "base")) assert_repo_status(clone.path) objpath = PurePosixPath('.datalad/metadata/objects') objs = [ o for o in sorted(clone.repo.get_annexed_files(with_content_only=False)) if objpath in PurePosixPath(o).parents ] eq_(len(objs), 6) eq_(all(clone.repo.file_has_content(objs)), False) # now aggregate should get those metadata objects super.meta_aggregate(recursive=True, into='all') eq_(all(clone.repo.file_has_content(objs)), True)
def test_aggregate_with_unavailable_objects_from_subds(path, target): base = Dataset(opj(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n' ) sub = base.create('sub', force=True) subsub = base.create(opj('sub', 'subsub'), force=True) base.add('.', recursive=True) ok_clean_git(base.path) base.aggregate_metadata(recursive=True, update_mode='all') ok_clean_git(base.path) # now make that a subdataset of a new one, so aggregation needs to get the # metadata objects first: super = Dataset(target).create() super.install("base", source=base.path) ok_clean_git(super.path) clone = Dataset(opj(super.path, "base")) ok_clean_git(clone.path) objpath = opj('.datalad', 'metadata', 'objects') objs = [ o for o in sorted(clone.repo.get_annexed_files(with_content_only=False)) if o.startswith(objpath) ] eq_(len(objs), 6) eq_(all(clone.repo.file_has_content(objs)), False) # now aggregate should get those metadata objects super.aggregate_metadata(recursive=True, update_mode='all', force_extraction=False) eq_(all(clone.repo.file_has_content(objs)), True)
def test_reaggregate_with_unavailable_objects(path): base = Dataset(op.join(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(op.join(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n' ) sub = base.create('sub', force=True) subsub = base.create(op.join('sub', 'subsub'), force=True) base.save(recursive=True) assert_repo_status(base.path) # first a quick check that an unsupported 'into' mode causes an exception assert_raises(ValueError, base.meta_aggregate, recursive=True, into='spaceship') # no for real base.meta_aggregate(recursive=True, into='all') assert_repo_status(base.path) objpath = op.join('.datalad', 'metadata', 'objects') objs = list(sorted(base.repo.find(objpath))) # we have 3x2 metadata sets (dataset/files) under annex eq_(len(objs), 6) eq_(all(base.repo.file_has_content(objs)), True) # drop all object content base.drop(objs, check=False) eq_(all(base.repo.file_has_content(objs)), False) assert_repo_status(base.path) # now re-aggregate, the state hasn't changed, so the file names will # be the same base.meta_aggregate(recursive=True, into='all', force='fromscratch') eq_(all(base.repo.file_has_content(objs)), True) # and there are no new objects eq_(objs, list(sorted(base.repo.find(objpath))))
def test_publish_aggregated(path): base = Dataset(opj(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n') base.create('sub', force=True) base.add('.', recursive=True) ok_clean_git(base.path) base.aggregate_metadata(recursive=True, update_mode='all') ok_clean_git(base.path) # create sibling and publish to it spath = opj(path, 'remote') base.create_sibling( name="local_target", sshurl="ssh://localhost", target_dir=spath) base.publish('.', to='local_target', transfer_data='all') remote = Dataset(spath) objpath = opj('.datalad', 'metadata', 'objects') objs = list(sorted(base.repo.find(objpath))) # all object files a present in both datasets eq_(all(base.repo.file_has_content(objs)), True) eq_(all(remote.repo.file_has_content(objs)), True) # and we can squeeze the same metadata out eq_( [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')} for i in base.metadata('sub')], [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')} for i in remote.metadata('sub')], )
def test_basic_aggregate(path): # TODO give datasets some more metadata to actually aggregate stuff base = Dataset(opj(path, 'origin')).create(force=True) sub = base.create('sub', force=True) #base.metadata(sub.path, init=dict(homepage='this'), apply2global=True) subsub = base.create(opj('sub', 'subsub'), force=True) base.add('.', recursive=True) ok_clean_git(base.path) # we will first aggregate the middle dataset on its own, this will # serve as a smoke test for the reuse of metadata objects later on sub.aggregate_metadata() base.save() ok_clean_git(base.path) base.aggregate_metadata(recursive=True, update_mode='all') ok_clean_git(base.path) direct_meta = base.metadata(recursive=True, return_type='list') # loose the deepest dataset sub.uninstall('subsub', check=False) # no we should eb able to reaggregate metadata, and loose nothing # because we can aggregate aggregated metadata of subsub from sub base.aggregate_metadata(recursive=True, update_mode='all') # same result for aggregate query than for (saved) direct query agg_meta = base.metadata(recursive=True, return_type='list') for d, a in zip(direct_meta, agg_meta): print(d['path'], a['path']) assert_dict_equal(d, a) # no we can throw away the subdataset tree, and loose no metadata base.uninstall('sub', recursive=True, check=False) assert(not sub.is_installed()) ok_clean_git(base.path) # same result for aggregate query than for (saved) direct query agg_meta = base.metadata(recursive=True, return_type='list') for d, a in zip(direct_meta, agg_meta): assert_dict_equal(d, a)
def test_dirty(path): for mode in _dirty_modes: # does nothing without a dataset handle_dirty_dataset(None, mode) # placeholder, but not yet created ds = Dataset(path) # unknown mode assert_raises(ValueError, handle_dirty_dataset, ds, 'MADEUP') # not yet created is very dirty assert_raises(RuntimeError, handle_dirty_dataset, ds, 'fail') handle_dirty_dataset(ds, 'ignore') assert_raises(RuntimeError, handle_dirty_dataset, ds, 'save-before') # should yield a clean repo ds.create() orig_state = ds.repo.get_hexsha() _check_all_clean(ds, orig_state) # tainted: untracked with open(opj(ds.path, 'something'), 'w') as f: f.write('some') # we don't want to auto-add untracked files by saving (anymore) assert_raises(AssertionError, _check_auto_save, ds, orig_state) # tainted: staged ds.repo.add('something', git=True) orig_state = _check_auto_save(ds, orig_state) # tainted: submodule # not added to super on purpose! subds = ds.create('subds') _check_all_clean(subds, subds.repo.get_hexsha()) ok_clean_git(ds.path) # subdataset must be added as a submodule! assert_equal(ds.subdatasets(result_xfm='relpaths'), ['subds'])
def test_ls_uninstalled(path): ds = Dataset(path) ds.create() ds.create('sub') ds.uninstall('sub', check=False) with swallow_outputs() as cmo: ls([path], recursive=True) assert_in('not installed', cmo.out)
def test_gh2927(path, linkpath): if has_symlink_capability(): # make it more complicated by default Path(linkpath).symlink_to(path, target_is_directory=True) path = linkpath ds = Dataset(path).create() ds.create('subds_clean') assert_status('ok', ds.create(op.join('subds_clean', 'subds_lvl1_clean'), result_xfm=None, return_type='list'))
def test_create_raises(path=None, outside_path=None): ds = Dataset(path) # incompatible arguments (annex only): assert_raises(ValueError, ds.create, annex=False, description='some') with open(op.join(path, "somefile.tst"), 'w') as f: f.write("some") # non-empty without `force`: assert_in_results( ds.create(force=False, **raw), status='error', message= 'will not create a dataset in a non-empty directory, use `--force` option to ignore' ) # non-empty with `force`: ds.create(force=True) # create sub outside of super: assert_in_results( ds.create(outside_path, **raw), status='error', message=( 'dataset containing given paths is not underneath the reference ' 'dataset %s: %s', ds, outside_path)) obscure_ds = u"ds-" + OBSCURE_FILENAME # create a sub: ds.create(obscure_ds) # fail when doing it again assert_in_results(ds.create(obscure_ds, **raw), status='error', message=('collision with %s (dataset) in dataset %s', str(ds.pathobj / obscure_ds), ds.path)) # now deinstall the sub and fail trying to create a new one at the # same location ds.drop(obscure_ds, what='all', reckless='kill', recursive=True) assert_in(obscure_ds, ds.subdatasets(state='absent', result_xfm='relpaths')) # and now should fail to also create inplace or under assert_in_results(ds.create(obscure_ds, **raw), status='error', message=('collision with %s (dataset) in dataset %s', str(ds.pathobj / obscure_ds), ds.path)) assert_in_results(ds.create(op.join(obscure_ds, 'subsub'), **raw), status='error', message=('collision with %s (dataset) in dataset %s', str(ds.pathobj / obscure_ds), ds.path)) os.makedirs(op.join(ds.path, 'down')) with open(op.join(ds.path, 'down', "someotherfile.tst"), 'w') as f: f.write("someother") ds.save() assert_in_results( ds.create('down', **raw), status='error', message=('collision with content in parent dataset at %s: %s', ds.path, [str(ds.pathobj / 'down' / 'someotherfile.tst')]), )
def _mk_submodule_annex(path, fname, fcontent): ca = dict(result_renderer='disabled') # a remote dataset with a subdataset underneath origds = Dataset(path).create(**ca) (origds.pathobj / fname).write_text(fcontent) # naming is weird, but a legacy artifact s1 = origds.create('subm 1', **ca) (s1.pathobj / fname).write_text(fcontent) s2 = origds.create('2', **ca) (s2.pathobj / fname).write_text(fcontent) origds.save(recursive=True, **ca) return origds
def test_create_sub_gh3463(path): ds = Dataset(path) ds.create() # Test non-bound call. with chpwd(ds.path): create("subds0", dataset=".") assert_repo_status(ds.path) # Test command-line invocation directly. Runner(cwd=ds.path).run(["datalad", "create", "-d.", "subds1"]) assert_repo_status(ds.path)
def check_create_initopts_form(form, path): path = Path(path) template_dir = path / "templates" template_dir.mkdir() (template_dir / "foo").write_text("") forms = {"list": [f"--template={template_dir}"], "dict": {"template": str(template_dir)}} ds = Dataset(path / "ds") ds.create(initopts=forms[form]) ok_exists(ds.repo.dot_git / "foo")
def test_remove_subdataset_nomethod(path=None): ds = Dataset(path).create() ds.create('subds') with chpwd(path): # fails due to unique state res = remove('subds', on_failure='ignore') assert_in_results(res, action='uninstall', status='error', type='dataset') res = remove('subds', reckless='availability', on_failure='ignore') assert_in_results(res, action='uninstall', status='ok', type='dataset') assert_in_results(res, action='remove', status='ok') assert_in_results(res, action='save', status='ok')
def make_demo_hierarchy_datasets(path, tree): created_ds = [] for node, items in tree.items(): node_path = opj(path, node) if isinstance(items, dict): ds = make_demo_hierarchy_datasets(node_path, items) created_ds.append(ds) topds = Dataset(path) if not topds.is_installed(): topds.create(force=True) # TODO this farce would not be necessary if add() could add subdatasets for ds in created_ds: _install_subds_inplace(ds=topds, path=ds.path, relativepath=relpath(ds.path, topds.path)) ds.save() return topds
def test_subdataset_save(path): parent = Dataset(path).create() sub = parent.create('sub') ok_clean_git(parent.path) create_tree(parent.path, {"untracked": 'ignore', 'sub': {"new": "wanted"}}) sub.add('new') # defined state: one untracked, modified (but clean in itself) subdataset ok_clean_git(sub.path) ok_clean_git(parent.path, untracked=['untracked'], index_modified=['sub']) # `save sub` does not save the parent!! with chpwd(parent.path): assert_status('notneeded', save(path=sub.path)) ok_clean_git(parent.path, untracked=['untracked'], index_modified=['sub']) # `save -d .` saves the state change in the subdataset, but leaves any untracked # content alone with chpwd(parent.path): assert_status('ok', parent.save()) ok_clean_git(parent.path, untracked=['untracked']) # get back to the original modified state and check that -S behaves in # exactly the same way create_tree(parent.path, {'sub': {"new2": "wanted2"}}) sub.add('new2') ok_clean_git(parent.path, untracked=['untracked'], index_modified=['sub']) with chpwd(parent.path): assert_status( # notneeded to save sub, but need to save parent ['ok', 'notneeded'], # the key condition of this test is that no reference dataset is # given! save(path='sub', super_datasets=True)) # save super must not cause untracked content to be commited! ok_clean_git(parent.path, untracked=['untracked'])
def test_create_subdataset_hierarchy_from_top(path): # how it would look like to overlay a subdataset hierarchy onto # an existing directory tree ds = Dataset(op.join(path, 'origin')).create(force=True) # we got a dataset .... ok_(ds.is_installed()) # ... but it has untracked content ok_(ds.repo.dirty) subds = ds.create(u"ds-" + OBSCURE_FILENAME, force=True) ok_(subds.is_installed()) ok_(subds.repo.dirty) subsubds = subds.create('subsub', force=True) ok_(subsubds.is_installed()) ok_(subsubds.repo.dirty) ok_(ds.id != subds.id != subsubds.id) ds.save(updated=True, recursive=True) # 'file*' in each repo was untracked before and should remain as such # (we don't want a #1419 resurrection ok_(ds.repo.dirty) ok_(subds.repo.dirty) ok_(subsubds.repo.dirty) # if we add these three, we should get clean ds.save([ 'file1', op.join(subds.path, 'file2'), op.join(subsubds.path, 'file3')]) assert_repo_status(ds.path) ok_(ds.id != subds.id != subsubds.id)
def test_inherit_src_candidates(lcl, storepath, url): lcl = Path(lcl) storepath = Path(storepath) # dataset with a subdataset ds1 = Dataset(lcl / 'ds1').create() ds1sub = ds1.create('sub') # a different dataset into which we install ds1, but do not touch its subds ds2 = Dataset(lcl / 'ds2').create() ds2.clone(source=ds1.path, path='mysub') # we give no dataset a source candidate config! # move all dataset into the store for d in (ds1, ds1sub, ds2): _move2store(storepath, d) # now we must be able to obtain all three datasets from the store riaclone = clone( 'ria+{}#{}'.format( # store URL url, # ID of the root dataset ds2.id), lcl / 'clone', ) # what happens is the the initial clone call sets a source candidate # config, because it sees the dataset coming from a store # all obtained subdatasets get the config inherited on-clone datasets = riaclone.get('.', get_data=False, recursive=True, result_xfm='datasets') # we get two subdatasets eq_(len(datasets), 2) for ds in datasets: eq_(ConfigManager(dataset=ds, source='dataset-local').get( 'datalad.get.subdataset-source-candidate-200origin'), 'ria+%s#{id}' % url)
def test_invalid_call(origin, tdir): ds = Dataset(origin).create() # no target assert_status('impossible', ds.push(on_failure='ignore')) # no dataset with chpwd(tdir): assert_raises(InsufficientArgumentsError, Push.__call__) # dataset, but outside path assert_raises(IncompleteResultsError, ds.push, path=tdir) # given a path constraint that doesn't match anything, will cause # nothing to be done assert_status('notneeded', ds.push(path=ds.pathobj / 'nothere')) # unavailable subdataset dummy_sub = ds.create('sub') dummy_sub.uninstall() assert_in('sub', ds.subdatasets(fulfilled=False, result_xfm='relpaths')) # now an explicit call to publish the unavailable subdataset assert_raises(ValueError, ds.push, 'sub') target = mk_push_target(ds, 'target', tdir, annex=True) # revision that doesn't exist assert_raises(ValueError, ds.push, to='target', since='09320957509720437523')
def test_subdataset_save(path): parent = Dataset(path).create() sub = parent.create('sub') assert_repo_status(parent.path) create_tree(parent.path, {"untracked": 'ignore', 'sub': {"new": "wanted"}}) sub.save('new') # defined state: one untracked, modified (but clean in itself) subdataset assert_repo_status(sub.path) assert_repo_status(parent.path, untracked=['untracked'], modified=['sub']) # `save sub` does not save the parent!! with chpwd(parent.path): assert_status('notneeded', save(dataset=sub.path)) assert_repo_status(parent.path, untracked=['untracked'], modified=['sub']) # `save -u .` saves the state change in the subdataset, # but leaves any untracked content alone with chpwd(parent.path): assert_status('ok', parent.save(updated=True)) assert_repo_status(parent.path, untracked=['untracked']) # get back to the original modified state and check that -S behaves in # exactly the same way create_tree(parent.path, {'sub': {"new2": "wanted2"}}) sub.save('new2') assert_repo_status(parent.path, untracked=['untracked'], modified=['sub'])
def test_push_subds_no_recursion(src_path, dst_top, dst_sub, dst_subsub): # dataset with one submodule and one subsubmodule top = Dataset(src_path).create() sub = top.create('sub m') test_file = sub.pathobj / 'subdir' / 'test_file' test_file.parent.mkdir() test_file.write_text('some') subsub = sub.create(sub.pathobj / 'subdir' / 'subsub m') top.save(recursive=True) assert_repo_status(top.path) target_top = mk_push_target(top, 'target', dst_top, annex=True) target_sub = mk_push_target(sub, 'target', dst_sub, annex=True) target_subsub = mk_push_target(subsub, 'target', dst_subsub, annex=True) # now publish, but NO recursion, instead give the parent dir of # both a subdataset and a file in the middle subdataset res = top.push( to='target', # give relative to top dataset to elevate the difficulty a little path=str(test_file.relative_to(top.pathobj).parent)) assert_status('ok', res) assert_in_results(res, action='publish', type='dataset', path=top.path) assert_in_results(res, action='publish', type='dataset', path=sub.path) assert_in_results(res, action='copy', type='file', path=str(test_file)) # the lowest-level subdataset isn't touched assert_not_in_results(res, action='publish', type='dataset', path=subsub.path)
def test_subdataset_save(path): parent = Dataset(path).create() sub = parent.create('sub') assert_repo_status(parent.path) create_tree(parent.path, { "untracked": 'ignore', 'sub': { "new": "wanted"}}) sub.save('new') # defined state: one untracked, modified (but clean in itself) subdataset assert_repo_status(sub.path) assert_repo_status(parent.path, untracked=['untracked'], modified=['sub']) # `save sub` does not save the parent!! with chpwd(parent.path): assert_status('notneeded', save(dataset=sub.path)) assert_repo_status(parent.path, untracked=['untracked'], modified=['sub']) # `save -u .` saves the state change in the subdataset, # but leaves any untracked content alone with chpwd(parent.path): assert_status('ok', parent.save(updated=True)) assert_repo_status(parent.path, untracked=['untracked']) # get back to the original modified state and check that -S behaves in # exactly the same way create_tree(parent.path, { 'sub': { "new2": "wanted2"}}) sub.save('new2') assert_repo_status(parent.path, untracked=['untracked'], modified=['sub'])
def test_aggregate_removal(path): base = Dataset(opj(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n' ) sub = base.create('sub', force=True) subsub = sub.create(opj('subsub'), force=True) base.save(recursive=True) base.aggregate_metadata(recursive=True, update_mode='all') assert_repo_status(base.path) res = base.metadata(get_aggregates=True) assert_result_count(res, 3) assert_result_count(res, 1, path=subsub.path) # check that we only have object files that are listed in agginfo eq_(_get_contained_objs(base), _get_referenced_objs(base)) # now delete the deepest subdataset to test cleanup of aggregated objects # in the top-level ds base.remove(opj('sub', 'subsub'), check=False) # now aggregation has to detect that subsub is not simply missing, but gone # for good base.aggregate_metadata(recursive=True, update_mode='all') assert_repo_status(base.path) # internally consistent state eq_(_get_contained_objs(base), _get_referenced_objs(base)) # info on subsub was removed at all levels res = base.metadata(get_aggregates=True) assert_result_count(res, 0, path=subsub.path) assert_result_count(res, 2) res = sub.metadata(get_aggregates=True) assert_result_count(res, 0, path=subsub.path) assert_result_count(res, 1)
def test_rerun(path, nodspath): ds = Dataset(path).create() sub = ds.create('sub') probe_path = opj(sub.path, 'sequence') # run inside the dataset with chpwd(path), \ swallow_outputs(): ds.run('echo x$(cat sub/sequence) > sub/sequence') # command ran once, all clean ok_clean_git(ds.path) eq_('x\n', open(probe_path).read()) # now, for a rerun we can be anywhere, PWD and all are recorded # moreover, rerun must figure out which bits to unlock, even in # subdatasets with chpwd(nodspath), \ swallow_outputs(): ds.run(rerun=True) ok_clean_git(ds.path) # ran twice now eq_('xx\n', open(probe_path).read()) # if I give another command, it will be ignored with chpwd(nodspath), \ swallow_logs(new_level=logging.WARNING) as cml, \ swallow_outputs(): ds.run('30BANG3934', rerun=True) cml.assert_logged("Ignoring provided command in --rerun mode", level="WARNING") ok_clean_git(ds.path) eq_('xxx\n', open(probe_path).read())
def test_aggregate_removal(path): base = Dataset(opj(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n') sub = base.create('sub', force=True) subsub = sub.create(opj('subsub'), force=True) base.add('.', recursive=True) base.aggregate_metadata(recursive=True, update_mode='all') ok_clean_git(base.path) res = base.metadata(get_aggregates=True) assert_result_count(res, 3) assert_result_count(res, 1, path=subsub.path) # check that we only have object files that are listed in agginfo eq_(_get_contained_objs(base), _get_referenced_objs(base)) # now delete the deepest subdataset to test cleanup of aggregated objects # in the top-level ds base.remove(opj('sub', 'subsub'), check=False) # now aggregation has to detect that subsub is not simply missing, but gone # for good base.aggregate_metadata(recursive=True, update_mode='all') ok_clean_git(base.path) # internally consistent state eq_(_get_contained_objs(base), _get_referenced_objs(base)) # info on subsub was removed at all levels res = base.metadata(get_aggregates=True) assert_result_count(res, 0, path=subsub.path) assert_result_count(res, 2) res = sub.metadata(get_aggregates=True) assert_result_count(res, 0, path=subsub.path) assert_result_count(res, 1)
def test_partial_aggregation(path): ds = Dataset(path).create(force=True) sub1 = ds.create('sub1', force=True) sub2 = ds.create('sub2', force=True) ds.add('.', recursive=True) ds.aggregate_metadata(recursive=True) # baseline, recursive aggregation gets us something for all three datasets res = ds.metadata(get_aggregates=True) assert_result_count(res, 3) # now let's do partial aggregation from just one subdataset # we should not loose information on the other datasets # as this would be a problem any time anything in a dataset # subtree is missing: no installed, too expensive to reaggregate, ... ds.aggregate_metadata(path='sub1', incremental=True) res = ds.metadata(get_aggregates=True) assert_result_count(res, 3) assert_result_count(res, 1, path=sub2.path) # from-scratch aggregation kills datasets that where not listed ds.aggregate_metadata(path='sub1', incremental=False) res = ds.metadata(get_aggregates=True) assert_result_count(res, 2) assert_result_count(res, 0, path=sub2.path) # now reaggregated in full ds.aggregate_metadata(recursive=True) # make change in sub1 sub1.unlock('here') with open(opj(sub1.path, 'here'), 'w') as f: f.write('fresh') ds.save(recursive=True) ok_clean_git(path)
def test_aggregate_query(path, randompath): ds = Dataset(path).create(force=True) # no magic change to actual dataset metadata due to presence of # aggregated metadata res = ds.meta_dump(reporton='datasets', on_failure='ignore') assert_result_count(res, 0) # but we can now ask for metadata of stuff that is unknown on disk res = ds.meta_dump(op.join('sub', 'deep', 'some'), reporton='datasets') assert_result_count(res, 1) eq_({'homepage': 'http://top.example.com'}, res[0]['metadata']) sub = ds.create('sub', force=True) # when no reference dataset there is NO magic discovery of the relevant # dataset with chpwd(randompath): assert_raises(ValueError, meta_dump, op.join(path, 'sub', 'deep', 'some'), reporton='datasets') # but inside a dataset things work with chpwd(ds.path): res = meta_dump(op.join(path, 'sub', 'deep', 'some'), reporton='datasets') assert_result_count(res, 1) # the metadata in the discovered top dataset is return, not the # metadata in the subdataset eq_({'homepage': 'http://top.example.com'}, res[0]['metadata']) # when a reference dataset is given, it will be used as the metadata # provider res = sub.meta_dump(op.join('deep', 'some'), reporton='datasets') assert_result_count(res, 1) eq_({'homepage': 'http://sub.example.com'}, res[0]['metadata'])
def make_demo_hierarchy_datasets(path, tree, parent=None): if parent is None: parent = Dataset(path).create(force=True) for node, items in tree.items(): if isinstance(items, dict): node_path = opj(path, node) nodeds = parent.create(node_path, force=True) make_demo_hierarchy_datasets(node_path, items, parent=nodeds) return parent
def test_create_sub(path): ds = Dataset(path) ds.create() # 1. create sub and add to super: subds = ds.create(op.join("some", "what", "deeper")) ok_(isinstance(subds, Dataset)) ok_(subds.is_installed()) assert_repo_status(subds.path, annex=True) assert_in( 'submodule.some/what/deeper.datalad-id={}'.format( subds.id), ds.repo._git_custom_command( '', ['git', 'config', '--file', '.gitmodules', '--list'])[0] ) # subdataset is known to superdataset: assert_in(op.join("some", "what", "deeper"), ds.subdatasets(result_xfm='relpaths')) # and was committed: assert_repo_status(ds.path) # subds finds superdataset ok_(subds.get_superdataset() == ds) # 2. create sub without adding to super: subds2 = Dataset(op.join(path, "someother")).create() ok_(isinstance(subds2, Dataset)) ok_(subds2.is_installed()) assert_repo_status(subds2.path, annex=True) # unknown to superdataset: assert_not_in("someother", ds.subdatasets(result_xfm='relpaths')) # 3. create sub via super: subds3 = ds.create("third", no_annex=True) ok_(isinstance(subds3, Dataset)) ok_(subds3.is_installed()) assert_repo_status(subds3.path, annex=False) assert_in("third", ds.subdatasets(result_xfm='relpaths'))
def test_create(path): ds = Dataset(path) ds.create( description="funny", # custom git init option initopts=dict(shared='world')) ok_(ds.is_installed()) assert_repo_status(ds.path, annex=True) # check default backend eq_(ds.config.get("annex.backends"), 'MD5E') eq_(ds.config.get("core.sharedrepository"), '2') runner = Runner() # check description in `info` cmd = ['git', 'annex', 'info'] cmlout = runner.run(cmd, cwd=path) assert_in('funny [here]', cmlout[0]) # check datset ID eq_(ds.config.get_value('datalad.dataset', 'id'), ds.id)
def test_update_strategy(path): base = Dataset(opj(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n') sub = base.create('sub', force=True) subsub = sub.create(opj('subsub'), force=True) base.add('.', recursive=True) ok_clean_git(base.path) # we start clean for ds in base, sub, subsub: eq_(len(_get_contained_objs(ds)), 0) # aggregate the base dataset only, nothing below changes base.aggregate_metadata() eq_(len(_get_contained_objs(base)), 2) for ds in sub, subsub: eq_(len(_get_contained_objs(ds)), 0) # aggregate the entire tree, but by default only updates # the top-level dataset with all objects, none of the leaf # or intermediate datasets get's touched base.aggregate_metadata(recursive=True) eq_(len(_get_contained_objs(base)), 6) eq_(len(_get_referenced_objs(base)), 6) for ds in sub, subsub: eq_(len(_get_contained_objs(ds)), 0) res = base.metadata(get_aggregates=True) assert_result_count(res, 3) # it is impossible to query an intermediate or leaf dataset # for metadata for ds in sub, subsub: assert_status( 'impossible', ds.metadata(get_aggregates=True, on_failure='ignore')) # get the full metadata report target_meta = base.metadata(return_type='list') # now redo full aggregation, this time updating all # (intermediate) datasets base.aggregate_metadata(recursive=True, update_mode='all') eq_(len(_get_contained_objs(base)), 6) eq_(len(_get_contained_objs(sub)), 4) eq_(len(_get_contained_objs(subsub)), 2) # it is now OK to query an intermediate or leaf dataset # for metadata for ds in sub, subsub: assert_status( 'ok', ds.metadata(get_aggregates=True, on_failure='ignore')) # all of that has no impact on the reported metadata eq_(target_meta, base.metadata(return_type='list'))
def test_bf1886(path): parent = Dataset(path).create() parent.create('sub') assert_repo_status(parent.path) # create a symlink pointing down to the subdataset, and add it os.symlink('sub', op.join(parent.path, 'down')) parent.save('down') assert_repo_status(parent.path) # now symlink pointing up os.makedirs(op.join(parent.path, 'subdir', 'subsubdir')) os.symlink(op.join(op.pardir, 'sub'), op.join(parent.path, 'subdir', 'up')) parent.save(op.join('subdir', 'up')) # 'all' to avoid the empty dir being listed assert_repo_status(parent.path, untracked_mode='all') # now symlink pointing 2xup, as in #1886 os.symlink( op.join(op.pardir, op.pardir, 'sub'), op.join(parent.path, 'subdir', 'subsubdir', 'upup')) parent.save(op.join('subdir', 'subsubdir', 'upup')) assert_repo_status(parent.path) # simulatenously add a subds and a symlink pointing to it # create subds, but don't register it create(op.join(parent.path, 'sub2')) os.symlink( op.join(op.pardir, op.pardir, 'sub2'), op.join(parent.path, 'subdir', 'subsubdir', 'upup2')) parent.save(['sub2', op.join('subdir', 'subsubdir', 'upup2')]) assert_repo_status(parent.path) # full replication of #1886: the above but be in subdir of symlink # with no reference dataset create(op.join(parent.path, 'sub3')) os.symlink( op.join(op.pardir, op.pardir, 'sub3'), op.join(parent.path, 'subdir', 'subsubdir', 'upup3')) # need to use absolute paths with chpwd(op.join(parent.path, 'subdir', 'subsubdir')): save([op.join(parent.path, 'sub3'), op.join(parent.path, 'subdir', 'subsubdir', 'upup3')]) assert_repo_status(parent.path)
def test_aggregate_query(path): ds = Dataset(path).create(force=True) # no magic change to actual dataset metadata due to presence of # aggregated metadata res = ds.metadata(reporton='datasets', on_failure='ignore') assert_result_count(res, 1) assert_not_in('metadata', res[0]) # but we can now ask for metadata of stuff that is unknown on disk res = ds.metadata(opj('sub', 'deep', 'some'), reporton='datasets') assert_result_count(res, 1) eq_({'homepage': 'http://top.example.com'}, res[0]['metadata']) # when no reference dataset is given the command will report the # aggregated metadata as it is recorded in the dataset that is the # closest parent on disk ds.create('sub', force=True) res = metadata(opj(path, 'sub', 'deep', 'some'), reporton='datasets') assert_result_count(res, 1) eq_({'homepage': 'http://sub.example.com'}, res[0]['metadata']) # when a reference dataset is given, it will be used as the metadata # provider res = ds.metadata(opj('sub', 'deep', 'some'), reporton='datasets') assert_result_count(res, 1) eq_({'homepage': 'http://top.example.com'}, res[0]['metadata'])
def test_diff_helper(path): # make test dataset components of interesting states ds = Dataset.create(path, force=True) # detached dataset, not a submodule nosub = Dataset.create(opj(path, 'nosub')) # unmodified, proper submodule sub_clean = ds.create('sub_clean', force=True) # proper submodule, but commited modifications not commited in parent sub_modified = ds.create('sub_modified', force=True) sub_modified.add('modified') # proper submodule with untracked changes sub_dirty = ds.create('sub_dirty', force=True) ds.add(['clean', 'modified']) ds.unlock('modified') with open(opj(ds.path, 'modified'), 'w') as f: f.write('modified_content') file_mod = opj(ds.path, 'modified') # standard `git diff` no special args, reports modified, but not untracked res = list(_parse_git_diff(ds.path)) assert_result_count(res, 3) assert_result_count(res, 1, path=file_mod) assert_result_count(res, 1, path=sub_modified.path) assert_result_count(res, 1, path=sub_dirty.path)
def test_diff_recursive(path): ds = Dataset(path).create() sub = ds.create('sub') # look at the last change, and confirm a dataset was added res = ds.diff(revision='HEAD~1..HEAD') assert_result_count(res, 1, action='diff', state='added', path=sub.path, type='dataset') # now recursive res = ds.diff(recursive=True, revision='HEAD~1..HEAD') # we also get the entire diff of the subdataset from scratch assert_status('ok', res) ok_(len(res) > 3) # one specific test assert_result_count(res, 1, action='diff', state='added', path=opj(sub.path, '.datalad', 'config')) # now we add a file to just the parent create_tree(ds.path, {'onefile': 'tobeadded', 'sub': {'twofile': 'tobeadded'}}) res = ds.diff(recursive=True, report_untracked='all') assert_result_count(res, 3) assert_result_count(res, 1, action='diff', state='untracked', path=opj(ds.path, 'onefile'), type='file') assert_result_count(res, 1, action='diff', state='modified', path=sub.path, type='dataset') assert_result_count(res, 1, action='diff', state='untracked', path=opj(sub.path, 'twofile'), type='file') # save sub sub.add('.') # save sub in parent ds.save() # save addition in parent ds.add('.') ok_clean_git(ds.path) # look at the last change, only one file was added res = ds.diff(revision='HEAD~1..HEAD') assert_result_count(res, 1) assert_result_count(res, 1, action='diff', state='added', path=opj(ds.path, 'onefile'), type='file') # now the exact same thing with recursion, must not be different from the call # above res = ds.diff(recursive=True, revision='HEAD~1..HEAD') assert_result_count(res, 1) # last change in parent assert_result_count(res, 1, action='diff', state='added', path=opj(ds.path, 'onefile'), type='file') # one further back brings in the modified subdataset, and the added file within it res = ds.diff(recursive=True, revision='HEAD~2..HEAD') assert_result_count(res, 3) assert_result_count(res, 1, action='diff', state='added', path=opj(ds.path, 'onefile'), type='file') assert_result_count(res, 1, action='diff', state='added', path=opj(sub.path, 'twofile'), type='file') assert_result_count(res, 1, action='diff', state='modified', path=sub.path, type='dataset')
def test_subds_path(path): # a dataset with a subdataset with a file, all neatly tracked ds = Dataset(path).create() subds = ds.create('sub') assert_repo_status(path) with (subds.pathobj / 'some.txt').open('w') as f: f.write(u'test') ds.save(recursive=True) assert_repo_status(path) # querying the toplevel dataset repo for a subdspath should # report the subdataset record in the dataset # (unlike `git status`, which is silent for subdataset paths), # but definitely not report the subdataset as deleted # https://github.com/datalad/datalad-revolution/issues/17 stat = ds.repo.status(paths=[op.join('sub', 'some.txt')]) assert_equal(list(stat.keys()), [subds.repo.pathobj]) assert_equal(stat[subds.repo.pathobj]['state'], 'clean')
def test_nested_create(path): # to document some more organic usage pattern ds = Dataset(path).create() assert_repo_status(ds.path) lvl2relpath = op.join('lvl1', 'lvl2') lvl2path = op.join(ds.path, lvl2relpath) os.makedirs(lvl2path) os.makedirs(op.join(ds.path, 'lvl1', 'empty')) with open(op.join(lvl2path, 'file'), 'w') as f: f.write('some') ok_(ds.save()) # Empty directories are filtered out. assert_repo_status(ds.path, untracked=[]) # later create subdataset in a fresh dir # WINDOWS FAILURE IS NEXT LINE subds1 = ds.create(op.join('lvl1', 'subds')) assert_repo_status(ds.path, untracked=[]) eq_(ds.subdatasets(result_xfm='relpaths'), [op.join('lvl1', 'subds')]) # later create subdataset in an existing empty dir subds2 = ds.create(op.join('lvl1', 'empty')) assert_repo_status(ds.path) # later try to wrap existing content into a new subdataset # but that won't work assert_in_results( ds.create(lvl2relpath, **raw), status='error', message=( 'collision with content in parent dataset at %s: %s', ds.path, [op.join(lvl2path, 'file')])) # even with force, as to do this properly complicated surgery would need to # take place # MIH disable shaky test till proper dedicated upfront check is in-place in `create` # gh-1725 #assert_in_results( # ds.create(lvl2relpath, force=True, # on_failure='ignore', result_xfm=None, result_filter=None), # status='error', action='add') # only way to make it work is to unannex the content upfront ds.repo._run_annex_command('unannex', annex_options=[op.join(lvl2relpath, 'file')]) # nothing to save, git-annex commits the unannex itself, but only on v5 ds.repo.commit() # still nothing without force # "err='lvl1/lvl2' already exists in the index" assert_in_results( ds.create(lvl2relpath, **raw), status='error', message='will not create a dataset in a non-empty directory, use `force` option to ignore') # XXX even force doesn't help, because (I assume) GitPython doesn't update # its representation of the Git index properly ds.create(lvl2relpath, force=True) assert_in(lvl2relpath, ds.subdatasets(result_xfm='relpaths'))
def test_diff_rsync_syntax(path): # three nested datasets ds = Dataset(path).create() subds = ds.create('sub') subsubds = subds.create('deep') justtop = ds.diff(fr=PRE_INIT_COMMIT_SHA, path='sub') # we only get a single result, the subdataset in question assert_result_count(justtop, 1) assert_result_count(justtop, 1, type='dataset', path=subds.path) # now with "peak inside the dataset" syntax inside = ds.diff(fr=PRE_INIT_COMMIT_SHA, path='sub' + os.sep) # we get both subdatasets, but nothing else inside the nested one assert_result_count(inside, 2, type='dataset') assert_result_count(inside, 1, type='dataset', path=subds.path) assert_result_count(inside, 1, type='dataset', path=subsubds.path) assert_result_count(inside, 0, type='file', parentds=subsubds.path) # just for completeness, we get more when going full recursive rec = ds.diff(fr=PRE_INIT_COMMIT_SHA, recursive=True, path='sub' + os.sep) assert(len(inside) < len(rec))
def test_save(path): ds = Dataset(path) with open(opj(path, "new_file.tst"), "w") as f: f.write("something") ds.repo.add("new_file.tst", git=True) ok_(ds.repo.dirty) ds.save("add a new file", all_changes=False) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) with open(opj(path, "new_file.tst"), "w") as f: f.write("modify") ok_(ds.repo.dirty) ds.save("modified new_file.tst", all_changes=True) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) files = ['one.txt', 'two.txt'] for fn in files: with open(opj(path, fn), "w") as f: f.write(fn) ds.add([opj(path, f) for f in files]) # superfluous call to save (add saved it already), should not fail # but report that nothing was saved assert_false(ds.save("set of new files")) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) # create subdataset subds = ds.create('subds') ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) # modify subds with open(opj(subds.path, "some_file.tst"), "w") as f: f.write("something") subds.add('.') ok_clean_git(subds.path, annex=isinstance(ds.repo, AnnexRepo)) ok_(ds.repo.dirty) # ensure modified subds is committed ds.save(all_changes=True) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
def test_bf1886(path): parent = Dataset(path).create() sub = parent.create('sub') ok_clean_git(parent.path) # create a symlink pointing down to the subdataset, and add it os.symlink('sub', opj(parent.path, 'down')) parent.add('down') ok_clean_git(parent.path) # now symlink pointing up os.makedirs(opj(parent.path, 'subdir', 'subsubdir')) os.symlink(opj(pardir, 'sub'), opj(parent.path, 'subdir', 'up')) parent.add(opj('subdir', 'up')) ok_clean_git(parent.path) # now symlink pointing 2xup, as in #1886 os.symlink(opj(pardir, pardir, 'sub'), opj(parent.path, 'subdir', 'subsubdir', 'upup')) parent.add(opj('subdir', 'subsubdir', 'upup')) ok_clean_git(parent.path) # simulatenously add a subds and a symlink pointing to it # create subds, but don't register it sub2 = create(opj(parent.path, 'sub2')) os.symlink( opj(pardir, pardir, 'sub2'), opj(parent.path, 'subdir', 'subsubdir', 'upup2')) parent.add(['sub2', opj('subdir', 'subsubdir', 'upup2')]) ok_clean_git(parent.path) # full replication of #1886: the above but be in subdir of symlink # with no reference dataset sub3 = create(opj(parent.path, 'sub3')) os.symlink( opj(pardir, pardir, 'sub3'), opj(parent.path, 'subdir', 'subsubdir', 'upup3')) # need to use absolute paths with chpwd(opj(parent.path, 'subdir', 'subsubdir')): add([opj(parent.path, 'sub3'), opj(parent.path, 'subdir', 'subsubdir', 'upup3')]) # here is where we need to disagree with the repo in #1886 # we would not expect that `add` registers sub3 as a subdataset # of parent, because no reference dataset was given and the # command cannot decide (with the current semantics) whether # it should "add anything in sub3 to sub3" or "add sub3 to whatever # sub3 is in" ok_clean_git(parent.path, untracked=['sub3/'])
def test_bf1886(path):
    parent = Dataset(path).create()
    sub = parent.create('sub')
    ok_clean_git(parent.path)
    # create a symlink pointing down to the subdataset, and add it
    os.symlink('sub', opj(parent.path, 'down'))
    parent.add('down')
    ok_clean_git(parent.path)
    # now a symlink pointing up
    os.makedirs(opj(parent.path, 'subdir', 'subsubdir'))
    os.symlink(opj(pardir, 'sub'), opj(parent.path, 'subdir', 'up'))
    parent.add(opj('subdir', 'up'))
    ok_clean_git(parent.path)
    # now a symlink pointing two levels up, as in #1886
    os.symlink(opj(pardir, pardir, 'sub'),
               opj(parent.path, 'subdir', 'subsubdir', 'upup'))
    parent.add(opj('subdir', 'subsubdir', 'upup'))
    ok_clean_git(parent.path)
    # simultaneously add a subds and a symlink pointing to it;
    # create the subds, but don't register it
    sub2 = create(opj(parent.path, 'sub2'))
    os.symlink(
        opj(pardir, pardir, 'sub2'),
        opj(parent.path, 'subdir', 'subsubdir', 'upup2'))
    parent.add(['sub2', opj('subdir', 'subsubdir', 'upup2')])
    ok_clean_git(parent.path)
    # full replication of #1886: as above, but executed from a subdir of the
    # symlink, with no reference dataset
    sub3 = create(opj(parent.path, 'sub3'))
    os.symlink(
        opj(pardir, pardir, 'sub3'),
        opj(parent.path, 'subdir', 'subsubdir', 'upup3'))
    # need to use absolute paths
    with chpwd(opj(parent.path, 'subdir', 'subsubdir')):
        rev_save([opj(parent.path, 'sub3'),
                  opj(parent.path, 'subdir', 'subsubdir', 'upup3')])
    # in contrast to `add`, `rev_save` only operates on a single top-level
    # dataset; although none is specified, it gets discovered based on the PWD.
    # The logic behind that feels a bit shaky; consult the discussion in
    # https://github.com/datalad/datalad/issues/3230 if this comes up as an
    # issue at some point
    ok_clean_git(parent.path)

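# Side-by-side summary of the two variants above (hypothetical call forms; no
# reference dataset is given in either case; the assertions are authoritative):
#
#     add([<abspath of sub3>, <abspath of upup3>])       # sub3 stays untracked in parent
#     rev_save([<abspath of sub3>, <abspath of upup3>])  # sub3 ends up recorded in the
#                                                        # parent discovered from the PWD
#
# The difference stems from rev_save resolving a single top-level dataset from
# the working directory, while add declines to guess when no dataset= is given.
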
def test_gh1597(path):
    if 'APPVEYOR' in os.environ:
        # issue only happens on appveyor; Python itself implodes,
        # and it cannot be reproduced on a real windows box
        raise SkipTest(
            'this test causes appveyor to crash, reason unknown')
    ds = Dataset(path).create()
    sub = ds.create('sub')
    res = ds.subdatasets()
    assert_result_count(res, 1, path=sub.path)
    # now modify .gitmodules with another command
    ds.subdatasets(contains=sub.path, set_property=[('this', 'that')])
    # now modify it low-level
    with open(op.join(ds.path, '.gitmodules'), 'a') as f:
        f.write('\n')
    assert_repo_status(ds.path, modified=['.gitmodules'])
    ds.save('.gitmodules')
    # must not come under annex management
    assert_not_in(
        'key',
        ds.repo.annexstatus(paths=['.gitmodules']).popitem()[1])

def test_state(path):
    ds = Dataset.create(path)
    sub = ds.create('sub')
    res = ds.subdatasets()
    assert_result_count(res, 1, path=sub.path)
    # by default we are not reporting any state info
    assert_not_in('state', res[0])
    # uninstall the subdataset
    ds.uninstall('sub')
    # a normal 'gone' is reported as "absent"
    assert_false(sub.is_installed())
    assert_result_count(
        ds.subdatasets(), 1, path=sub.path, state='absent')
    # same with the directory completely gone
    os.rmdir(sub.path)
    assert_result_count(
        ds.subdatasets(), 1, path=sub.path, state='absent')
    # putting the dir back changes nothing
    os.makedirs(sub.path)
    assert_result_count(
        ds.subdatasets(), 1, path=sub.path, state='absent')

def test_get_dataset_directories(path):
    assert_raises(ValueError, get_dataset_directories, path)
    ds = Dataset(path).create()
    # ignores .git always and .datalad by default
    assert_equal(get_dataset_directories(path), [])
    assert_equal(get_dataset_directories(path, ignore_datalad=False),
                 [opj(path, '.datalad')])
    # find any directory, not just those known to git
    testdir = opj(path, 'newdir')
    os.makedirs(testdir)
    assert_equal(get_dataset_directories(path), [testdir])
    # do not find files
    with open(opj(path, 'somefile'), 'w') as f:
        f.write('some')
    assert_equal(get_dataset_directories(path), [testdir])
    # find more than one directory
    testdir2 = opj(path, 'newdir2')
    os.makedirs(testdir2)
    assert_equal(sorted(get_dataset_directories(path)),
                 sorted([testdir, testdir2]))
    # find subdataset mount points
    subdsdir = opj(path, 'sub')
    subds = ds.create(subdsdir)
    assert_equal(sorted(get_dataset_directories(path)),
                 sorted([testdir, testdir2, subdsdir]))
    # do not find content within subdataset dirs
    os.makedirs(opj(path, 'sub', 'deep'))
    assert_equal(sorted(get_dataset_directories(path)),
                 sorted([testdir, testdir2, subdsdir]))
    subsubdsdir = opj(subdsdir, 'subsub')
    subds.create(subsubdsdir)
    assert_equal(sorted(get_dataset_directories(path)),
                 sorted([testdir, testdir2, subdsdir]))
    # find nested directories
    testdir3 = opj(testdir2, 'newdir21')
    os.makedirs(testdir3)
    assert_equal(sorted(get_dataset_directories(path)),
                 sorted([testdir, testdir2, testdir3, subdsdir]))
    # only return hits below the search path
    assert_equal(sorted(get_dataset_directories(testdir2)),
                 sorted([testdir3]))
    # empty subdataset mount points are reported too
    ds.uninstall(subds.path, check=False, recursive=True)
    ok_(not subds.is_installed())
    ok_(os.path.exists(subds.path))
    assert_equal(sorted(get_dataset_directories(path)),
                 sorted([testdir, testdir2, testdir3, subdsdir]))

def test_partial_aggregation(path):
    ds = Dataset(path).create(force=True)
    sub1 = ds.create('sub1', force=True)
    sub2 = ds.create('sub2', force=True)
    ds.add('.', recursive=True)
    # if we aggregate given path(s) and say to recurse, we must not recurse into
    # the dataset itself and aggregate the others
    ds.aggregate_metadata(path='sub1', recursive=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 1, path=ds.path)
    assert_result_count(res, 1, path=sub1.path)
    # so no metadata aggregates for sub2 yet
    assert_result_count(res, 0, path=sub2.path)
    ds.aggregate_metadata(recursive=True)
    # baseline: recursive aggregation gets us something for all three datasets
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # now let's do partial aggregation from just one subdataset;
    # we should not lose information on the other datasets,
    # as this would be a problem any time anything in a dataset
    # subtree is missing: not installed, too expensive to reaggregate, ...
    ds.aggregate_metadata(path='sub1', incremental=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # from-scratch aggregation must also not remove aggregates for datasets
    # that were not listed
    ds.aggregate_metadata(path='sub1', incremental=False)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # now reaggregate in full
    ds.aggregate_metadata(recursive=True)
    # make a change in sub1
    sub1.unlock('here')
    with open(opj(sub1.path, 'here'), 'w') as f:
        f.write('fresh')
    ds.save(recursive=True)
    ok_clean_git(path)

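# For reference, the aggregation call forms exercised above (dataset layout
# hypothetical; the assertions describe the guaranteed outcomes):
#
#     ds.aggregate_metadata(path='sub1', recursive=True)     # only the given path(s)
#     ds.aggregate_metadata(recursive=True)                   # everything underneath ds
#     ds.aggregate_metadata(path='sub1', incremental=True)    # update sub1, keep other aggregates
#     ds.aggregate_metadata(path='sub1', incremental=False)   # re-aggregate sub1 from scratch
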
def test_subdataset_save(path):
    parent = Dataset(path).create()
    sub = parent.create('sub')
    ok_clean_git(parent.path)
    create_tree(parent.path, {
        "untracked": 'ignore',
        'sub': {
            "new": "wanted"}})
    sub.add('new')
    # defined state: one untracked file, one modified (but internally clean) subdataset
    ok_clean_git(sub.path)
    ok_clean_git(parent.path,
                 untracked=['untracked'], index_modified=['sub'])
    # `save sub` does not save the parent!!
    with chpwd(parent.path):
        assert_status('notneeded', save(path=sub.path))
    ok_clean_git(parent.path,
                 untracked=['untracked'], index_modified=['sub'])
    # `save -d .` saves the state change in the subdataset, but leaves any untracked
    # content alone
    with chpwd(parent.path):
        assert_status('ok', parent.save())
    ok_clean_git(parent.path, untracked=['untracked'])
    # get back to the original modified state and check that -S behaves in
    # exactly the same way
    create_tree(parent.path, {
        'sub': {
            "new2": "wanted2"}})
    sub.add('new2')
    ok_clean_git(parent.path,
                 untracked=['untracked'], index_modified=['sub'])
    with chpwd(parent.path):
        assert_status(
            # notneeded to save sub, but need to save parent
            ['ok', 'notneeded'],
            # the key condition of this test is that no reference dataset is
            # given!
            save(path='sub', super_datasets=True))
    # saving the superdataset must not cause untracked content to be committed!
    ok_clean_git(parent.path, untracked=['untracked'])

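# Call forms for the key behaviours checked above (both issued without a
# reference dataset; paths hypothetical):
#
#     save(path=<subds path>)                 # 'notneeded': the subdataset itself is clean,
#                                             # and the parent is not touched
#     save(path='sub', super_datasets=True)   # additionally records the new subdataset
#                                             # state in the parent
#
# In neither case is untracked content in the parent committed.
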