def test_rerun_assume_ready(path):
    ds = Dataset(path).create()
    repo = ds.repo
    (repo.pathobj / "f1").write_text("f1\n")
    ds.save()

    def double_in_both_cmd(src, dest1, dest2):
        return [
            sys.executable, "-c",
            "import sys; import os; import os.path as op; "
            "content = open(sys.argv[-3]).read() * 2; "
            "d1 = sys.argv[-2]; d2 = sys.argv[-1]; "
            "op.lexists(d1) and os.unlink(d1); "
            "op.lexists(d2) and os.unlink(d2); "
            "open(d1, 'w').write(content); open(d2, 'w').write(content)",
            src, dest1, dest2
        ]

    ds.run(double_in_both_cmd("f1", "out1", "out2"), outputs=["out1"])
    # Drop the content so that we remove instead of unlock, making the test
    # more meaningful on an adjusted branch.
    ds.drop(["out1", "out2"], check=False)
    # --assume-ready affects both explicitly specified and automatic outputs.
    res = ds.rerun(assume_ready="outputs")
    assert_not_in_results(res, action="remove")
def test_basic_aggregate(path=None):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    #base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # we will first aggregate the middle dataset on its own, this will
    # serve as a smoke test for the reuse of metadata objects later on
    sub.aggregate_metadata()
    base.save()
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.drop('subsub', what='all', reckless='kill', recursive=True)
    # now we should be able to reaggregate metadata, and lose nothing,
    # because we can aggregate the aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True, update_mode='all')
    # same result for the aggregate query as for the (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree, and lose no metadata
    base.drop('sub', what='all', reckless='kill', recursive=True)
    assert (not sub.is_installed())
    assert_repo_status(base.path)
    # same result for the aggregate query as for the (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)
def test_report_absent_keys(path):
    ds = Dataset(path).create()
    # create an annexed file
    testfile = ds.pathobj / 'dummy'
    testfile.write_text(u'nothing')
    ds.save()
    # present in a full report and in a partial report
    # based on worktree of HEAD ref
    for ai in (ds.repo.get_content_annexinfo(eval_availability=True),
               ds.repo.get_content_annexinfo(paths=['dummy'],
                                             eval_availability=True),
               ds.repo.get_content_annexinfo(ref='HEAD',
                                             eval_availability=True),
               ds.repo.get_content_annexinfo(ref='HEAD',
                                             paths=['dummy'],
                                             eval_availability=True)):
        assert_in(testfile, ai)
        assert_equal(ai[testfile]['has_content'], True)
    # drop the key, not available anywhere else
    ds.drop('dummy', check=False)
    # does not change a thing, except the key is gone
    for ai in (ds.repo.get_content_annexinfo(eval_availability=True),
               ds.repo.get_content_annexinfo(paths=['dummy'],
                                             eval_availability=True),
               ds.repo.get_content_annexinfo(ref='HEAD',
                                             eval_availability=True),
               ds.repo.get_content_annexinfo(ref='HEAD',
                                             paths=['dummy'],
                                             eval_availability=True)):
        assert_in(testfile, ai)
        assert_equal(ai[testfile]['has_content'], False)
def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    ok_clean_git(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True,
                            update_mode='all',
                            force_extraction=True)
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(
        objs,
        list(sorted(base.repo.find(objpath)))
    )
def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(op.join(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(op.join(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = base.create(op.join('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # first a quick check that an unsupported 'into' mode causes an exception
    assert_raises(ValueError, base.meta_aggregate,
                  recursive=True, into='spaceship')
    # now for real
    base.meta_aggregate(recursive=True, into='all')
    assert_repo_status(base.path)
    objpath = op.join('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    assert_repo_status(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.meta_aggregate(recursive=True, into='all', force='fromscratch')
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(objs, list(sorted(base.repo.find(objpath))))
def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    assert_repo_status(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True,
                            update_mode='all',
                            force_extraction=True)
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(objs, list(sorted(base.repo.find(objpath))))
def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    # weird that it comes out as a string...
    objs = [o for o in sorted(base.repo.find(objpath).split('\n')) if o]
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    ok_clean_git(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True, update_mode='all')
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(objs, [o for o in sorted(base.repo.find(objpath).split('\n')) if o])
def test_push_wanted(srcpath, dstpath):
    src = Dataset(srcpath).create()
    if src.repo.is_managed_branch():
        # on crippled FS post-update hook enabling via create-sibling doesn't
        # work ATM
        raise SkipTest("no create-sibling on crippled FS")
    (src.pathobj / 'data.0').write_text('0')
    (src.pathobj / 'secure.1').write_text('1')
    (src.pathobj / 'secure.2').write_text('2')
    src.save()
    # Dropping a file to mimic a case of simply not having it locally (thus
    # not to be "pushed")
    src.drop('secure.2', check=False)
    # Annotate sensitive content, actual value "verysecure" does not matter
    # in this example
    src.repo.set_metadata(add={'distribution-restrictions': 'verysecure'},
                          files=['secure.1', 'secure.2'])
    src.create_sibling(
        dstpath,
        annex_wanted="not metadata=distribution-restrictions=*",
        name='target',
    )
    # check that wanted is obeyed, if instructed by configuration
    src.config.set('datalad.push.copy-auto-if-wanted', 'true', where='local')
    res = src.push(to='target')
    assert_in_results(res,
                      action='copy',
                      path=str(src.pathobj / 'data.0'),
                      status='ok')
    for p in ('secure.1', 'secure.2'):
        assert_not_in_results(res, path=str(src.pathobj / p))
    assert_status('notneeded', src.push(to='target'))
    # check that dataset-config cannot overrule this
    src.config.set('datalad.push.copy-auto-if-wanted', 'false',
                   where='dataset')
    res = src.push(to='target')
    assert_status('notneeded', res)
    # check the target to really make sure
    dst = Dataset(dstpath)
    # normal file, yes
    eq_((dst.pathobj / 'data.0').read_text(), '0')
    # secure file, no
    if dst.repo.is_managed_branch():
        neq_((dst.pathobj / 'secure.1').read_text(), '1')
    else:
        assert_raises(FileNotFoundError,
                      (dst.pathobj / 'secure.1').read_text)
    # remove local config, must enable push of secure file
    src.config.unset('datalad.push.copy-auto-if-wanted', where='local')
    res = src.push(to='target')
    assert_in_results(res, path=str(src.pathobj / 'secure.1'))
    eq_((dst.pathobj / 'secure.1').read_text(), '1')
def test_drop_file_need_nocheck(path=None):
    ds = Dataset(path).create(force=True)
    ds.save()
    with assert_raises(IncompleteResultsError) as cme:
        ds.drop("foo")
    # The --force suggestion from git-annex-drop is translated to --reckless.
    assert_in("--reckless", str(cme.value))
    assert_status("ok", ds.drop("foo", reckless='kill', on_failure="ignore"))
def test_drop_file_need_nocheck(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    with assert_raises(IncompleteResultsError) as cme:
        ds.drop("foo")
    # The --force suggestion from git-annex-drop is translated to --nocheck.
    assert_in("--nocheck", str(cme.exception))
    assert_status("ok", ds.drop("foo", check=False, on_failure="ignore"))
def test_create_raises(path=None, outside_path=None):
    ds = Dataset(path)
    # incompatible arguments (annex only):
    assert_raises(ValueError, ds.create, annex=False, description='some')

    with open(op.join(path, "somefile.tst"), 'w') as f:
        f.write("some")
    # non-empty without `force`:
    assert_in_results(
        ds.create(force=False, **raw),
        status='error',
        message='will not create a dataset in a non-empty directory, use '
                '`--force` option to ignore')
    # non-empty with `force`:
    ds.create(force=True)
    # create sub outside of super:
    assert_in_results(
        ds.create(outside_path, **raw),
        status='error',
        message=(
            'dataset containing given paths is not underneath the reference '
            'dataset %s: %s', ds, outside_path))
    obscure_ds = u"ds-" + OBSCURE_FILENAME
    # create a sub:
    ds.create(obscure_ds)
    # fail when doing it again
    assert_in_results(
        ds.create(obscure_ds, **raw),
        status='error',
        message=('collision with %s (dataset) in dataset %s',
                 str(ds.pathobj / obscure_ds), ds.path))
    # now deinstall the sub and fail trying to create a new one at the
    # same location
    ds.drop(obscure_ds, what='all', reckless='kill', recursive=True)
    assert_in(obscure_ds,
              ds.subdatasets(state='absent', result_xfm='relpaths'))
    # and now it should also fail to create in place or underneath
    assert_in_results(
        ds.create(obscure_ds, **raw),
        status='error',
        message=('collision with %s (dataset) in dataset %s',
                 str(ds.pathobj / obscure_ds), ds.path))
    assert_in_results(
        ds.create(op.join(obscure_ds, 'subsub'), **raw),
        status='error',
        message=('collision with %s (dataset) in dataset %s',
                 str(ds.pathobj / obscure_ds), ds.path))
    os.makedirs(op.join(ds.path, 'down'))
    with open(op.join(ds.path, 'down', "someotherfile.tst"), 'w') as f:
        f.write("someother")
    ds.save()
    assert_in_results(
        ds.create('down', **raw),
        status='error',
        message=('collision with content in parent dataset at %s: %s',
                 ds.path, [str(ds.pathobj / 'down' / 'someotherfile.tst')]),
    )
def test_report_absent_keys(path=None):
    ds = Dataset(path).create()
    # create an annexed file
    testfile = ds.pathobj / 'dummy'
    testfile.write_text(u'nothing')
    ds.save()
    # present in a full report and in a partial report
    # based on worktree of HEAD ref
    for ai in (ds.repo.get_content_annexinfo(eval_availability=True),
               ds.repo.get_content_annexinfo(paths=['dummy'],
                                             eval_availability=True),
               ds.repo.get_content_annexinfo(ref='HEAD',
                                             eval_availability=True),
               ds.repo.get_content_annexinfo(ref='HEAD',
                                             paths=['dummy'],
                                             eval_availability=True)):
        assert_in(testfile, ai)
        assert_equal(ai[testfile]['has_content'], True)
    # drop the key, not available anywhere else
    ds.drop('dummy', reckless='kill')
    # does not change a thing, except the key is gone
    for ai in (ds.repo.get_content_annexinfo(eval_availability=True),
               ds.repo.get_content_annexinfo(paths=['dummy'],
                                             eval_availability=True),
               ds.repo.get_content_annexinfo(ref='HEAD',
                                             eval_availability=True),
               ds.repo.get_content_annexinfo(ref='HEAD',
                                             paths=['dummy'],
                                             eval_availability=True)):
        assert_in(testfile, ai)
        assert_equal(ai[testfile]['has_content'], False)
    # make sure files with URL keys are correctly reported:
    from datalad.conftest import test_http_server
    remote_file_name = 'imaremotefile.dat'
    local_file_name = 'mehasurlkey'
    (Path(test_http_server.path) / remote_file_name).write_text("weee")
    remote_file_url = f'{test_http_server.url}/{remote_file_name}'
    # we need to get a file with a URL key and check its local availability
    ds.repo.call_annex(
        ['addurl', '--relaxed', remote_file_url, '--file', local_file_name])
    ds.save("URL keys!")
    # should not be there
    res = ds.repo.get_file_annexinfo(local_file_name, eval_availability=True)
    assert_equal(res['has_content'], False)
    ds.get(local_file_name)
    # should be there
    res = ds.repo.get_file_annexinfo(local_file_name, eval_availability=True)
    assert_equal(res['has_content'], True)
def test_clone_unborn_head(path):
    ds_origin = Dataset(op.join(path, "a")).create()
    repo = ds_origin.repo
    managed = repo.is_managed_branch()

    # The setup below is involved, mostly because it's accounting for
    # adjusted branches. The scenario itself isn't so complicated, though:
    #
    #   * a checked out default branch with no commits
    #   * a (potentially adjusted) "abc" branch with commits.
    #   * a (potentially adjusted) "chooseme" branch whose tip commit has a
    #     more recent commit than any in "abc".
    (ds_origin.pathobj / "foo").write_text("foo content")
    ds_origin.save(message="foo")
    for res in repo.for_each_ref_(fields="refname"):
        ref = res["refname"]
        if DEFAULT_BRANCH in ref:
            repo.update_ref(ref.replace(DEFAULT_BRANCH, "abc"), ref)
            repo.call_git(["update-ref", "-d", ref])
    repo.update_ref("HEAD",
                    "refs/heads/{}".format(
                        "adjusted/abc(unlocked)" if managed else "abc"),
                    symbolic=True)
    abc_ts = int(repo.format_commit("%ct"))
    repo.call_git(["checkout", "-b", "chooseme", "abc~1"])
    if managed:
        repo.adjust()
    (ds_origin.pathobj / "bar").write_text("bar content")
    with set_date(abc_ts + 1):
        ds_origin.save(message="bar")
    # Make the git-annex branch the most recently updated ref so that we
    # test that it is skipped.
    with set_date(abc_ts + 2):
        ds_origin.drop("bar", check=False)
    ds_origin.repo.checkout(DEFAULT_BRANCH, options=["--orphan"])

    ds = clone(ds_origin.path, op.join(path, "b"))
    # We landed on the branch with the most recent commit, ignoring the
    # git-annex branch.
    branch = ds.repo.get_active_branch()
    eq_(ds.repo.get_corresponding_branch(branch) or branch, "chooseme")
    eq_(ds_origin.repo.get_hexsha("chooseme"),
        ds.repo.get_hexsha("chooseme"))
    # In the context of this test, the clone should be on an adjusted branch
    # if the source landed there initially because we're on the same file
    # system.
    eq_(managed, ds.repo.is_managed_branch())
def test_get_file_annexinfo(path=None):
    ds = Dataset(path).create(force=True)
    ds.save('ingit.txt', to_git=True)
    ds.save()
    # have some content-less component for testing
    ds.drop(ds.pathobj / 'dir1', reckless='kill')

    repo = ds.repo
    # only handles a single file at a time
    assert_raises(ValueError, repo.get_file_annexinfo, repo.pathobj / 'dir2')
    # however, it only functionally matters that there is a single file to
    # report on, not that the exact query path matches; the matching path is
    # in the report
    assert_equal(repo.pathobj / 'dir1' / 'dropped',
                 repo.get_file_annexinfo(repo.pathobj / 'dir1')['path'])
    # does not raise on a non-annex file, instead it returns no properties
    assert_equal(repo.get_file_annexinfo('ingit.txt'), {})
    # but does raise on a path that doesn't exist
    assert_raises(NoSuchPathError, repo.get_file_annexinfo, 'nothere')

    # check return properties for utility
    props = repo.get_file_annexinfo('inannex.txt')
    # to replace get_file_backend()
    assert_equal(props['backend'], 'MD5E')
    # to replace get_file_key()
    assert_equal(props['key'],
                 'MD5E-s7--3b158c5b0a18c247ebad28c09fc3e180.txt')
    # for size reporting
    assert_equal(props['bytesize'], 7)
    # all records have a pathobj
    assert_equal(props['path'], repo.pathobj / 'inannex.txt')
    # test if `eval_availability` has desired effect
    assert_not_in('has_content', props)

    # extended set of properties, after more expensive availability check
    props = repo.get_file_annexinfo('inannex.txt', eval_availability=True)
    # to replace file_has_content()
    assert_equal(props['has_content'], True)
    # to replace get_contentlocation()
    assert_equal(Path(props['objloc']).read_text(), 'inannex')

    # make sure has_content is not always True
    props = repo.get_file_annexinfo(ds.pathobj / 'dir1' / 'dropped',
                                    eval_availability=True)
    assert_equal(props['has_content'], False)
    assert_not_in('objloc', props)
def test_push_wanted(srcpath, dstpath):
    src = Dataset(srcpath).create()
    (src.pathobj / 'data.0').write_text('0')
    (src.pathobj / 'secure.1').write_text('1')
    (src.pathobj / 'secure.2').write_text('2')
    src.save()
    # Dropping a file to mimic a case of simply not having it locally (thus
    # not to be "pushed")
    src.drop('secure.2', check=False)
    # Annotate sensitive content, actual value "verysecure" does not matter
    # in this example
    src.repo.set_metadata(add={'distribution-restrictions': 'verysecure'},
                          files=['secure.1', 'secure.2'])
    src.create_sibling(
        dstpath,
        annex_wanted="not metadata=distribution-restrictions=*",
        name='target',
    )
    # check that wanted is obeyed, since set in sibling configuration
    res = src.push(to='target')
    assert_in_results(res,
                      action='copy',
                      path=str(src.pathobj / 'data.0'),
                      status='ok')
    for p in ('secure.1', 'secure.2'):
        assert_not_in_results(res, path=str(src.pathobj / p))
    assert_status('notneeded', src.push(to='target'))
    # check the target to really make sure
    dst = Dataset(dstpath)
    # normal file, yes
    eq_((dst.pathobj / 'data.0').read_text(), '0')
    # secure file, no
    if dst.repo.is_managed_branch():
        neq_((dst.pathobj / 'secure.1').read_text(), '1')
    else:
        assert_raises(FileNotFoundError,
                      (dst.pathobj / 'secure.1').read_text)
    # reset wanted config, which must enable push of secure file
    src.repo.set_preferred_content('wanted', '', remote='target')
    res = src.push(to='target')
    assert_in_results(res, path=str(src.pathobj / 'secure.1'))
    eq_((dst.pathobj / 'secure.1').read_text(), '1')
def test_copy_file(workdir=None, webdir=None, weburl=None):
    workdir = Path(workdir)
    webdir = Path(webdir)
    src_ds = Dataset(workdir / 'src').create()
    # put a file into the dataset by URL and drop it again
    src_ds.download_url('/'.join((weburl, 'webfile1')),
                        path='myfile1.txt')
    src_ds.download_url('/'.join((weburl, 'webfile2')),
                        path=opj('subdir', 'myfile2.txt'))
    ok_file_has_content(src_ds.pathobj / 'myfile1.txt', '123')

    # now create a fresh dataset
    dest_ds = Dataset(workdir / 'dest').create()
    if dest_ds.repo._check_version_kludges("fromkey-supports-unlocked") or \
            not dest_ds.repo.is_managed_branch():
        # unless we have a target ds on a crippled FS (where `annex fromkey`
        # doesn't work until after 8.20210428), we can even drop the file
        # content in the source repo
        src_ds.drop('myfile1.txt', reckless='kill')
        nok_(src_ds.repo.file_has_content('myfile1.txt'))
    # copy the file from the source dataset into it.
    # it must copy enough info to actually put datalad into the position
    # to obtain the file content from the original URL
    dest_ds.copy_file(src_ds.pathobj / 'myfile1.txt')
    dest_ds.get('myfile1.txt')
    ok_file_has_content(dest_ds.pathobj / 'myfile1.txt', '123')
    # purposefully pollute the employed tmp folder to check that we do not
    # trip over such a condition
    tmploc = dest_ds.pathobj / '.git' / 'tmp' / 'datalad-copy' / 'some'
    tmploc.parent.mkdir(parents=True)
    tmploc.touch()
    # copy again, but to a different target file name
    # (source+dest pair now)
    dest_ds.copy_file(
        [src_ds.pathobj / 'myfile1.txt', dest_ds.pathobj / 'renamed.txt'])
    ok_file_has_content(dest_ds.pathobj / 'renamed.txt', '123')
    # copying more than one at once
    dest_ds.copy_file([
        src_ds.pathobj / 'myfile1.txt',
        src_ds.pathobj / 'subdir' / 'myfile2.txt',
        dest_ds.pathobj
    ])
    # copy directly from a non-dataset location
    dest_ds.copy_file(webdir / 'webfile1')

    # copy from an annex dataset into a plain git repo
    git_ds = Dataset(workdir / 'git').create(annex=False)
    git_ds.copy_file(src_ds.pathobj / 'subdir' / 'myfile2.txt')
def test_copy_file_datalad_specialremote(workdir=None,
                                         webdir=None,
                                         weburl=None):
    workdir = Path(workdir)
    src_ds = Dataset(workdir / 'src').create()
    # enable datalad special remote
    src_ds.repo.init_remote(DATALAD_SPECIAL_REMOTE, [
        'encryption=none', 'type=external',
        'externaltype={}'.format(DATALAD_SPECIAL_REMOTE),
        'autoenable=true'
    ])
    # put files into the dataset by URL
    src_ds.download_url('/'.join((weburl, 'webfile1')), path='myfile1.txt')
    src_ds.download_url('/'.join((weburl, 'webfile2')), path='myfile2.txt')
    # approx test that the file is known to a remote
    # that is not the web remote
    assert_in_results(
        src_ds.repo.whereis('myfile1.txt', output='full').values(),
        here=False,
        description='[{}]'.format(DATALAD_SPECIAL_REMOTE),
    )
    # now a new dataset
    dest_ds = Dataset(workdir / 'dest').create()
    # no special remotes
    eq_(dest_ds.repo.get_special_remotes(), {})
    # must call with a dataset to get the change saved, in order for drop
    # below to work properly without getting into reckless mode
    dest_ds.copy_file([src_ds.pathobj / 'myfile1.txt', dest_ds.pathobj])
    # we have a special remote in the destination dataset now
    assert_in_results(
        dest_ds.repo.get_special_remotes().values(),
        externaltype=DATALAD_SPECIAL_REMOTE,
    )
    # and it works
    dest_ds.drop('myfile1.txt')
    dest_ds.repo.get('myfile1.txt', remote='datalad')
    ok_file_has_content(dest_ds.pathobj / 'myfile1.txt', '123')
    # now replace the file in dest with different content at the same path;
    # must call with a dataset to get the change saved, in order for drop
    dest_ds.copy_file(
        [src_ds.pathobj / 'myfile2.txt', dest_ds.pathobj / 'myfile1.txt'])
    dest_ds.drop('myfile1.txt')
    dest_ds.repo.get('myfile1.txt', remote='datalad')
    # now gets the "same path" but yields different content
    ok_file_has_content(dest_ds.pathobj / 'myfile1.txt', 'abc')
def test_copy_file_into_nonannex(workdir):
    workdir = Path(workdir)
    src_ds = Dataset(workdir / 'src').create()
    (src_ds.pathobj / 'present.txt').write_text('123')
    (src_ds.pathobj / 'gone.txt').write_text('abc')
    src_ds.save()
    src_ds.drop('gone.txt', check=False)

    # destination has no annex
    dest_ds = Dataset(workdir / 'dest').create(annex=False)
    # no issue copying a file that has content
    copy_file([src_ds.pathobj / 'present.txt', dest_ds.pathobj])
    ok_file_has_content(dest_ds.pathobj / 'present.txt', '123')
    # but cannot handle a dropped file, no chance to register
    # availability info in an annex
    assert_status(
        'impossible',
        copy_file([src_ds.pathobj / 'gone.txt', dest_ds.pathobj],
                  on_failure='ignore'))
def test_unlock_directory(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    ds.unlock(path="dir")
    dirpath = Path("dir")
    dirpath_abs = Path(ds.pathobj / "dir")

    # On adjusted branches (for the purposes of this test, crippled
    # filesystems), the files were already unlocked and the committed state
    # is the unlocked pointer file.
    is_managed_branch = ds.repo.is_managed_branch()
    if is_managed_branch:
        assert_repo_status(ds.path)
    else:
        assert_repo_status(ds.path, modified=[dirpath / "a", dirpath / "b"])
    ds.save()
    ds.drop(text_type(dirpath / "a"), check=False)
    assert_false(ds.repo.file_has_content(text_type(dirpath / "a")))

    # Unlocking without an explicit non-directory path doesn't fail if one
    # of the directory's files doesn't have content.
    res = ds.unlock(path="dir")
    assert_not_in_results(res,
                          action="unlock",
                          path=text_type(dirpath_abs / "a"))
    if is_managed_branch:
        assert_not_in_results(res,
                              action="unlock",
                              path=text_type(dirpath_abs / "b"))
    else:
        assert_in_results(res,
                          action="unlock",
                          status="ok",
                          path=text_type(dirpath_abs / "b"))
        assert_repo_status(ds.path, modified=[dirpath / "b"])

    # If we explicitly provide a path that lacks content, we get a result
    # for it.
    assert_in_results(ds.unlock(path=dirpath / "a", on_failure="ignore"),
                      action="unlock",
                      status="impossible",
                      path=text_type(dirpath_abs / "a"))
def test_copy_file_nourl(serv_path=None, orig_path=None, tst_path=None):
    """Tests availability transfer to normal git-annex remote"""
    # prep source dataset that will have the file content
    srv_ds = Dataset(serv_path).create()
    (srv_ds.pathobj / 'myfile.dat').write_text('I am content')
    (srv_ds.pathobj / 'noavail.dat').write_text('null')
    srv_ds.save()
    srv_ds.drop('noavail.dat', reckless='kill')
    # make an empty superdataset, with the test dataset as a subdataset
    orig_ds = Dataset(orig_path).create()
    orig_ds.clone(source=serv_path, path='serv')
    assert_repo_status(orig_ds.path)
    # now copy the test file into the superdataset
    no_avail_file = orig_ds.pathobj / 'serv' / 'noavail.dat'
    assert_in_results(
        orig_ds.copy_file(no_avail_file, on_failure='ignore'),
        status='impossible',
        message='no known location of file content',
        path=str(no_avail_file),
    )
def test_report_absent_keys(path):
    ds = Dataset(path).create()
    # create an annexed file
    testfile = ds.pathobj / 'dummy'
    testfile.write_text(u'nothing')
    ds.save()
    # present in a full report and in a partial report
    # based on worktree of HEAD ref
    for ai in (
            ds.repo.get_content_annexinfo(eval_availability=True),
            ds.repo.get_content_annexinfo(
                paths=['dummy'], eval_availability=True),
            ds.repo.get_content_annexinfo(
                ref='HEAD', eval_availability=True),
            ds.repo.get_content_annexinfo(
                ref='HEAD', paths=['dummy'], eval_availability=True)):
        assert_in(testfile, ai)
        assert_equal(ai[testfile]['has_content'], True)
    # drop the key, not available anywhere else
    ds.drop('dummy', check=False)
    # does not change a thing, except the key is gone
    for ai in (
            ds.repo.get_content_annexinfo(eval_availability=True),
            ds.repo.get_content_annexinfo(
                paths=['dummy'], eval_availability=True),
            ds.repo.get_content_annexinfo(
                ref='HEAD', eval_availability=True),
            ds.repo.get_content_annexinfo(
                ref='HEAD', paths=['dummy'], eval_availability=True)):
        assert_in(testfile, ai)
        assert_equal(ai[testfile]['has_content'], False)
def test_clean_subds_removal(path=None):
    ds = Dataset(path).create()
    subds1 = ds.create('one')
    subds2 = ds.create('two')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['one', 'two'])
    assert_repo_status(ds.path)
    # now kill one
    res = ds.remove('one', reckless='availability', result_xfm=None)
    # subds1 got uninstalled, and ds got the removal of subds1 saved
    assert_result_count(res, 1,
                        path=subds1.path, action='uninstall', status='ok')
    assert_result_count(res, 1,
                        path=subds1.path, action='remove', status='ok')
    assert_result_count(res, 1,
                        path=ds.path, action='save', status='ok')
    ok_(not subds1.is_installed())
    assert_repo_status(ds.path)
    # two must remain
    eq_(ds.subdatasets(result_xfm='relpaths'), ['two'])
    # one is gone
    nok_(subds1.pathobj.exists())
    # and now again, but this time remove something that is not installed
    ds.create('three')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['three', 'two'])
    ds.drop('two', what='all', reckless='availability')
    assert_repo_status(ds.path)
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['three', 'two'])
    nok_(subds2.is_installed())
    # an orderly empty mountpoint is maintained
    ok_(subds2.pathobj.exists())
    res = ds.remove('two', reckless='availability')
    assert_in_results(res, path=str(ds.pathobj / 'two'), action='remove')
    assert_repo_status(ds.path)
    # subds2 was already uninstalled, now ds got the removal of subds2 saved
    nok_(subds2.pathobj.exists())
    eq_(ds.subdatasets(result_xfm='relpaths'), ['three'])
def test_rerun(path, nodspath):
    ds = Dataset(path).create()
    sub = ds.create('sub')
    probe_path = opj(sub.path, 'sequence')
    # run inside the dataset
    with chpwd(path), \
            swallow_outputs():
        ds.run('echo x$(cat sub/sequence) > sub/sequence')
    # command ran once, all clean
    ok_clean_git(ds.path)
    eq_('x\n', open(probe_path).read())
    # now, for a rerun we can be anywhere, PWD and all are recorded
    # moreover, rerun must figure out which bits to unlock, even in
    # subdatasets
    with chpwd(nodspath), \
            swallow_outputs():
        ds.rerun()
    ok_clean_git(ds.path)
    # ran twice now
    eq_('xx\n', open(probe_path).read())

    # Rerunning from a subdataset skips the command.
    _, sub_info = get_run_info(ds, sub.repo.repo.head.commit.message)
    eq_(ds.id, sub_info["dsid"])
    assert_result_count(sub.rerun(return_type="list", on_failure="ignore"),
                        1, status="impossible", action="run",
                        rerun_action="skip")
    eq_('xx\n', open(probe_path).read())

    # Rerun fails with a dirty repo.
    dirt = opj(path, "dirt")
    with open(dirt, "w") as fh:
        fh.write("")
    assert_status('impossible', ds.rerun(on_failure="ignore"))
    remove(dirt)
    ok_clean_git(ds.path)

    # Make a non-run commit.
    with open(opj(path, "nonrun-file"), "w") as f:
        f.write("foo")
    ds.add("nonrun-file")
    # Now rerun the buried command.
    ds.rerun(revision="HEAD~", message="rerun buried")
    eq_('xxx\n', open(probe_path).read())
    # Also check that the message override worked.
    eq_(ds.repo.format_commit("%B").splitlines()[0],
        "[DATALAD RUNCMD] rerun buried")
    # Or a range of commits, skipping non-run commits.
    ds.rerun(since="HEAD~3")
    eq_('xxxxx\n', open(probe_path).read())
    # Or --since= to run all reachable commits.
    ds.rerun(since="")
    eq_('xxxxxxxxxx\n', open(probe_path).read())

    # We can get back a report of what would happen rather than actually
    # rerunning anything.
    report = ds.rerun(since="", report=True, return_type="list")
    # Nothing changed.
    eq_('xxxxxxxxxx\n', open(probe_path).read())
    assert_result_count(report, 1, rerun_action="skip")
    report[-1]["commit"] == ds.repo.get_hexsha()

    # If a file is dropped, we remove it instead of unlocking it.
    ds.drop(probe_path, check=False)
    with swallow_outputs():
        ds.rerun()
    eq_('x\n', open(probe_path).read())
    # If the history to rerun has a merge commit, we abort.
    ds.repo.checkout("HEAD~3", options=["-b", "topic"])
    with open(opj(path, "topic-file"), "w") as f:
        f.write("topic")
    ds.add("topic-file")
    ds.repo.checkout("master")
    ds.repo.merge("topic")
    ok_clean_git(ds.path)
    assert_raises(IncompleteResultsError, ds.rerun)
def test_dry_run(path=None):
    ds = Dataset(path).create(force=True)

    # The dataset is reported as dirty, and the custom result renderer
    # relays that to the default renderer.
    with swallow_outputs() as cmo:
        with assert_raises(IncompleteResultsError):
            ds.run("blah ", dry_run="basic")
        assert_in("run(impossible)", cmo.out)
        assert_not_in("blah", cmo.out)

    ds.save()

    # unknown dry-run mode
    assert_raises(ValueError, ds.run, 'blah', dry_run='absurd')

    with swallow_outputs() as cmo:
        ds.run("blah ", dry_run="basic")
        assert_in("Dry run", cmo.out)
        assert_in("location", cmo.out)
        assert_in("blah", cmo.out)
        assert_not_in("expanded inputs", cmo.out)
        assert_not_in("expanded outputs", cmo.out)

    with swallow_outputs() as cmo:
        ds.run("blah {inputs} {outputs}", dry_run="basic",
               inputs=["fo*"], outputs=["b*r"])
        assert_in('blah "foo" "bar"' if on_windows else "blah foo bar",
                  cmo.out)
        assert_in("expanded inputs", cmo.out)
        assert_in("['foo']", cmo.out)
        assert_in("expanded outputs", cmo.out)
        assert_in("['bar']", cmo.out)

    # Just the command.
    with swallow_outputs() as cmo:
        ds.run("blah ", dry_run="command")
        assert_not_in("Dry run", cmo.out)
        assert_in("blah", cmo.out)
        assert_not_in("inputs", cmo.out)

    # The output file wasn't unlocked.
    assert_repo_status(ds.path)

    # Subdataset handling
    subds = ds.create("sub")
    (subds.pathobj / "baz").write_text("z")
    ds.save(recursive=True)
    # If a subdataset is installed, it works as usual.
    with swallow_outputs() as cmo:
        ds.run("blah {inputs}", dry_run="basic", inputs=["sub/b*"])
        assert_in('blah "sub\\baz"' if on_windows else 'blah sub/baz',
                  cmo.out)
    # However, a dry run will not do the install/reglob procedure.
    ds.drop("sub", what='all', reckless='kill', recursive=True)
    with swallow_outputs() as cmo:
        ds.run("blah {inputs}", dry_run="basic", inputs=["sub/b*"])
        assert_in("sub/b*", cmo.out)
        assert_not_in("baz", cmo.out)
def test_ls_json(topdir, topurl):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'))  # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir', 'subgit'))  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')  # commit to git to init git repo
    git.commit()
    # annex.add doesn't add submodule, so using ds.add
    ds.add(opj('dir', 'subgit'))  # add the non-dataset git repo to annex
    ds.add('dir')  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit'))
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file(
        'fromweb', topurl + '/noteventhere', options=['--relaxed'])

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(
                    topdir,
                    json=state,
                    all_=all_,
                    recursive=recursive
                )
                ok_startswith(dsj['tags'], '1-')

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all
                # cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and
                # deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in
                # any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if it's updated in its nodes sublist too.
                # used by web-ui json. regression test
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of subdataset
                subds = [item for item in dsj['nodes']
                         if item['name'] == ('subdsfile.txt' or 'subds')][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top
                # dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_
                # # and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                    # and not in topds
                    assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset
                # metadata was already created to verify sub-dataset metadata
                # being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == ('subdsfile.txt' or 'subds')
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

                assert_equal(
                    topds_nodes['fromweb']['size']['total'],
                    UNKNOWN_SIZE
                )
def test_run_assume_ready(path):
    ds = Dataset(path).create()
    repo = ds.repo
    adjusted = repo.is_managed_branch()

    # --assume-ready=inputs

    (repo.pathobj / "f1").write_text("f1")
    ds.save()

    def cat_cmd(fname):
        return [
            sys.executable, "-c",
            "import sys; print(open(sys.argv[-1]).read())",
            fname
        ]

    assert_in_results(ds.run(cat_cmd("f1"), inputs=["f1"]),
                      action="get", type="file")
    # Same thing, but without the get() call.
    assert_not_in_results(ds.run(cat_cmd("f1"), inputs=["f1"],
                                 assume_ready="inputs"),
                          action="get", type="file")

    ds.drop("f1", check=False)
    if not adjusted:
        # If the input is not actually ready, the command will fail.
        with assert_raises(CommandError):
            ds.run(cat_cmd("f1"), inputs=["f1"], assume_ready="inputs")

    # --assume-ready=outputs

    def unlink_and_write_cmd(fname):
        # This command doesn't care whether the output file is unlocked
        # because it removes it ahead of time anyway.
        return [
            sys.executable, "-c",
            "import sys; import os; import os.path as op; "
            "f = sys.argv[-1]; op.lexists(f) and os.unlink(f); "
            "open(f, mode='w').write(str(sys.argv))",
            fname
        ]

    (repo.pathobj / "f2").write_text("f2")
    ds.save()

    res = ds.run(unlink_and_write_cmd("f2"), outputs=["f2"])
    if not adjusted:
        assert_in_results(res, action="unlock", type="file")
    # Same thing, but without the unlock() call.
    res = ds.run(unlink_and_write_cmd("f2"), outputs=["f2"],
                 assume_ready="outputs")
    assert_not_in_results(res, action="unlock", type="file")

    # --assume-ready=both

    res = ds.run(unlink_and_write_cmd("f2"), outputs=["f2"], inputs=["f2"])
    assert_in_results(res, action="get", type="file")
    if not adjusted:
        assert_in_results(res, action="unlock", type="file")
    res = ds.run(unlink_and_write_cmd("f2"), outputs=["f2"], inputs=["f2"],
                 assume_ready="both")
    assert_not_in_results(res, action="get", type="file")
    assert_not_in_results(res, action="unlock", type="file")
def test_run_inputs_outputs(path):
    ds = Dataset(path)
    assert_false(ds.repo.file_has_content("test-annex.dat"))

    # If we specify test-annex.dat as an input, it will be retrieved before
    # the run.
    ds.run("cat test-annex.dat test-annex.dat >doubled.dat",
           inputs=["test-annex.dat"])

    ok_clean_git(ds.path)
    ok_(ds.repo.file_has_content("test-annex.dat"))
    ok_(ds.repo.file_has_content("doubled.dat"))

    # Rerunning the commit will also get the input file.
    ds.repo.drop("test-annex.dat", options=["--force"])
    assert_false(ds.repo.file_has_content("test-annex.dat"))
    ds.rerun()
    ok_(ds.repo.file_has_content("test-annex.dat"))

    with swallow_logs(new_level=logging.WARN) as cml:
        ds.run("touch dummy", inputs=["*.not-an-extension"])
        assert_in("No matching files found for '*.not-an-extension'", cml.out)

    # Test different combinations of globs and explicit files.
    inputs = ["a.dat", "b.dat", "c.txt", "d.txt"]
    create_tree(ds.path, {i: i for i in inputs})

    ds.add(".")
    ds.repo.copy_to(inputs, remote="origin")
    ds.repo.drop(inputs, options=["--force"])

    test_cases = [(["*.dat"], ["a.dat", "b.dat"]),
                  (["*.dat", "c.txt"], ["a.dat", "b.dat", "c.txt"]),
                  (["*"], inputs)]

    for idx, (inputs_arg, expected_present) in enumerate(test_cases):
        assert_false(any(ds.repo.file_has_content(i) for i in inputs))

        ds.run("touch dummy{}".format(idx), inputs=inputs_arg)
        ok_(all(ds.repo.file_has_content(f) for f in expected_present))
        # Globs are stored unexpanded by default.
        assert_in(inputs_arg[0], ds.repo.repo.head.commit.message)
        ds.repo.drop(inputs, options=["--force"])

    # --input can be passed a subdirectory.
    create_tree(ds.path, {"subdir": {"a": "subdir a", "b": "subdir b"}})
    ds.add("subdir")
    ds.repo.copy_to(["subdir/a", "subdir/b"], remote="origin")
    ds.repo.drop("subdir", options=["--force"])
    ds.run("touch subdir-dummy", inputs=[opj(ds.path, "subdir")])
    ok_(all(ds.repo.file_has_content(opj("subdir", f)) for f in ["a", "b"]))

    # Inputs are specified relative to a dataset's subdirectory.
    ds.repo.drop(opj("subdir", "a"), options=["--force"])
    with chpwd(opj(path, "subdir")):
        run("touch subdir-dummy1", inputs=["a"])
    ok_(ds.repo.file_has_content(opj("subdir", "a")))

    # --input=. runs "datalad get ."
    ds.run("touch dot-dummy", inputs=["."])
    eq_(ds.repo.get_annexed_files(),
        ds.repo.get_annexed_files(with_content_only=True))
    # On rerun, we get all files, even those that weren't in the tree at the
    # time of the run.
    create_tree(ds.path, {"after-dot-run": "after-dot-run content"})
    ds.add(".")
    ds.repo.copy_to(["after-dot-run"], remote="origin")
    ds.repo.drop(["after-dot-run"], options=["--force"])
    ds.rerun("HEAD^")
    ds.repo.file_has_content("after-dot-run")

    # --output will unlock files that are present.
    ds.repo.get("a.dat")
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), "a.dat appended\n")

    # --output will remove files that are not present.
    ds.repo.drop("a.dat", options=["--force"])
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), " appended\n")

    # --input can be combined with --output.
    ds.repo.repo.git.reset("--hard", "HEAD~2")
    ds.run("echo ' appended' >>a.dat", inputs=["a.dat"], outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), "a.dat appended\n")

    with swallow_logs(new_level=logging.WARN) as cml:
        ds.run("echo blah", outputs=["*.not-an-extension"])
        assert_in("No matching files found for '*.not-an-extension'", cml.out)

    ds.create('sub')
    ds.run("echo sub_orig >sub/subfile")
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])
    ds.drop("sub/subfile", check=False)
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])

    # --input/--output globs can be stored in expanded form.
    ds.run("touch expand-dummy", inputs=["a.*"], outputs=["b.*"],
           expand="both")
    assert_in("a.dat", ds.repo.repo.head.commit.message)
    assert_in("b.dat", ds.repo.repo.head.commit.message)
    res = ds.rerun(report=True, return_type='item-or-list')
    eq_(res["run_info"]['inputs'], ["a.dat"])
    eq_(res["run_info"]['outputs'], ["b.dat"])
def test_rerun(path, nodspath):
    ds = Dataset(path).create()
    sub = ds.create('sub')
    probe_path = opj(sub.path, 'sequence')
    # run inside the dataset
    with chpwd(path), \
            swallow_outputs():
        ds.run('echo x$(cat sub/sequence) > sub/sequence')
    # command ran once, all clean
    ok_clean_git(ds.path)
    eq_('x\n', open(probe_path).read())
    # now, for a rerun we can be anywhere, PWD and all are recorded
    # moreover, rerun must figure out which bits to unlock, even in
    # subdatasets
    with chpwd(nodspath), \
            swallow_outputs():
        ds.rerun()
    ok_clean_git(ds.path)
    # ran twice now
    eq_('xx\n', open(probe_path).read())

    # Rerunning from a subdataset skips the command.
    _, sub_info = get_run_info(ds, sub.repo.repo.head.commit.message)
    eq_(ds.id, sub_info["dsid"])
    assert_result_count(
        sub.rerun(return_type="list", on_failure="ignore"),
        1, status="impossible", action="run", rerun_action="skip")
    eq_('xx\n', open(probe_path).read())

    # Rerun fails with a dirty repo.
    dirt = opj(path, "dirt")
    with open(dirt, "w") as fh:
        fh.write("")
    assert_status('impossible', ds.rerun(on_failure="ignore"))
    remove(dirt)
    ok_clean_git(ds.path)

    # Make a non-run commit.
    with open(opj(path, "nonrun-file"), "w") as f:
        f.write("foo")
    ds.add("nonrun-file")
    # Now rerun the buried command.
    ds.rerun(revision="HEAD~", message="rerun buried")
    eq_('xxx\n', open(probe_path).read())
    # Also check that the message override worked.
    eq_(ds.repo.format_commit("%B").splitlines()[0],
        "[DATALAD RUNCMD] rerun buried")
    # Or a range of commits, skipping non-run commits.
    ds.rerun(since="HEAD~3")
    eq_('xxxxx\n', open(probe_path).read())
    # Or --since= to run all reachable commits.
    ds.rerun(since="")
    eq_('xxxxxxxxxx\n', open(probe_path).read())

    # We can get back a report of what would happen rather than actually
    # rerunning anything.
    report = ds.rerun(since="", report=True, return_type="list")
    # Nothing changed.
    eq_('xxxxxxxxxx\n', open(probe_path).read())
    assert_result_count(report, 1, rerun_action="skip")
    report[-1]["commit"] == ds.repo.get_hexsha()

    # If a file is dropped, we remove it instead of unlocking it.
    ds.drop(probe_path, check=False)
    ds.rerun()
    eq_('x\n', open(probe_path).read())
    # If the history to rerun has a merge commit, we abort.
    ds.repo.checkout("HEAD~3", options=["-b", "topic"])
    with open(opj(path, "topic-file"), "w") as f:
        f.write("topic")
    ds.add("topic-file")
    ds.repo.checkout("master")
    ds.repo.merge("topic")
    ok_clean_git(ds.path)
    assert_raises(IncompleteResultsError, ds.rerun)
def test_rerun(path, nodspath):
    ds = Dataset(path).create()
    sub = ds.create('sub')
    probe_path = op.join(sub.path, 'sequence')
    # run inside the dataset
    with chpwd(path), \
            swallow_outputs():
        ds.run('echo x$(cat sub/sequence) > sub/sequence')
    # command ran once, all clean
    assert_repo_status(ds.path)
    eq_('x\n', open(probe_path).read())
    # now, for a rerun we can be anywhere, PWD and all are recorded
    # moreover, rerun must figure out which bits to unlock, even in
    # subdatasets
    with chpwd(nodspath), \
            swallow_outputs():
        ds.rerun()
    assert_repo_status(ds.path)
    # ran twice now
    eq_('xx\n', open(probe_path).read())

    # Rerunning from a subdataset skips the command.
    _, sub_info = get_run_info(ds, sub.repo.format_commit("%B"))
    eq_(ds.id, sub_info["dsid"])
    assert_result_count(sub.rerun(return_type="list", on_failure="ignore"),
                        1, status="impossible", action="run",
                        rerun_action="skip")
    eq_('xx\n', open(probe_path).read())

    # Rerun fails with a dirty repo.
    dirt = op.join(path, "dirt")
    with open(dirt, "w") as fh:
        fh.write("")
    assert_status('impossible', ds.rerun(on_failure="ignore"))
    remove(dirt)
    assert_repo_status(ds.path)

    # Make a non-run commit.
    with open(op.join(path, "nonrun-file"), "w") as f:
        f.write("foo")
    ds.save("nonrun-file")
    # Now rerun the buried command.
    ds.rerun(revision="HEAD~", message="rerun buried")
    eq_('xxx\n', open(probe_path).read())
    # Also check that the message override worked.
    eq_(ds.repo.format_commit("%B").splitlines()[0],
        "[DATALAD RUNCMD] rerun buried")
    # Or a range of commits, skipping non-run commits.
    ds.rerun(since="HEAD~3")
    eq_('xxxxx\n', open(probe_path).read())
    # Or --since= to run all reachable commits.
    ds.rerun(since="")
    eq_('xxxxxxxxxx\n', open(probe_path).read())

    # We can get back a report of what would happen rather than actually
    # rerunning anything.
    report = ds.rerun(since="", report=True, return_type="list")
    # The "diff" section of the report doesn't include the unchanged files
    # that would come in "-f json diff" output.
    for entry in report:
        if entry["rerun_action"] == "run":
            # None of the run commits touch .datalad/config or any other
            # config file.
            assert_false(
                any(r["path"].endswith("config") for r in entry["diff"]))
    # Nothing changed.
    eq_('xxxxxxxxxx\n', open(probe_path).read())
    assert_result_count(report, 1, rerun_action="skip-or-pick")
    report[-1]["commit"] == ds.repo.get_hexsha()

    # If a file is dropped, we remove it instead of unlocking it.
    ds.drop(probe_path, check=False)
    with swallow_outputs():
        ds.rerun()
    eq_('x\n', open(probe_path).read())
def test_ls_json(topdir, topurl):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'))  # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir', 'subgit'))  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')  # commit to git to init git repo
    git.commit()
    # annex.add doesn't add submodule, so using ds.add
    ds.add(opj('dir', 'subgit'))  # add the non-dataset git repo to annex
    ds.add('dir')  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit'))
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file('fromweb', topurl + '/noteventhere',
                            options=['--relaxed'])

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(topdir,
                               json=state,
                               all_=all_,
                               recursive=recursive)
                ok_startswith(dsj['tags'], '1-')

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all
                # cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and
                # deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in
                # any case
                for subdir in [('.hidden', ), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if it's updated in its nodes sublist too.
                # used by web-ui json. regression test
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of subdataset
                subds = [
                    item for item in dsj['nodes']
                    if item['name'] == ('subdsfile.txt' or 'subds')
                ][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top
                # dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_
                # # and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                    # and not in topds
                    assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset
                # metadata was already created to verify sub-dataset metadata
                # being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == ('subdsfile.txt' or 'subds')
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

                assert_equal(topds_nodes['fromweb']['size']['total'],
                             UNKNOWN_SIZE)