def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets', return_type='item-or-list')['metadata']['datalad_unique_content_properties']
    # the test file has it, but uniques have it blanked out, because the extractor
    # considers it worthless for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # the 'date' field carries no value, hence it gets excluded from the unique report
    assert_in('date', meta)
    assert(not meta['date'])
    assert_not_in('date', uniques['audio'])
def test_diff_nonexistent_ref_unicode(path):
    ds = Dataset(path).create()
    assert_result_count(
        ds.diff(fr="HEAD", to=u"β", on_failure="ignore"),
        1,
        path=ds.path,
        status="impossible")
def test_add_files(path):
    ds = Dataset(path).create(force=True)

    test_list_1 = ['test_annex.txt']
    test_list_2 = ['test.txt']
    test_list_3 = ['test1.dat', 'test2.dat']
    test_list_4 = [op.join('dir', 'testindir'),
                   op.join('dir', OBSCURE_FILENAME)]

    for arg in [(test_list_1[0], False),
                (test_list_2[0], True),
                (test_list_3, False),
                (test_list_4, False)]:
        # special case 4: give the dir:
        if arg[0] == test_list_4:
            result = ds.save('dir', to_git=arg[1])
            status = ds.repo.annexstatus(['dir'])
        else:
            result = ds.save(arg[0], to_git=arg[1])
            for a in assure_list(arg[0]):
                assert_result_count(result, 1, path=text_type(ds.pathobj / a))
            status = ds.repo.get_content_annexinfo(
                ut.Path(p) for p in assure_list(arg[0]))
        for f, p in iteritems(status):
            if arg[1]:
                assert p.get('key', None) is None, f
            else:
                assert p.get('key', None) is not None, f
def test_rerun_just_one_commit(path):
    ds = Dataset(path).create()

    # Check out an orphan branch so that we can test the "one commit
    # in a repo" case.
    ds.repo.checkout("orph", options=["--orphan"])
    ds.repo.repo.git.reset("--hard")
    ds.repo.config.reload()

    ds.run('echo static-content > static')
    assert_result_count(ds.repo.repo.git.rev_list("HEAD").split(), 1)

    # Rerunning with just one commit doesn't raise an error ...
    ds.rerun()
    # ... but we're still at one commit because the content didn't
    # change.
    assert_result_count(ds.repo.repo.git.rev_list("HEAD").split(), 1)

    # We abort rather than trying to do anything when --onto='' and
    # --since='' are given together and the first commit contains a
    # run command.
    ds.repo.commit(msg="empty", options=["--allow-empty"])
    assert_raises(IncompleteResultsError, ds.rerun, since="", onto="")

    # --script propagates the error.
    with swallow_outputs():
        assert_raises(IncompleteResultsError,
                      ds.rerun, since="", onto="", script="-")
    # --dry-run propagates the error.
    assert_raises(IncompleteResultsError,
                  ds.rerun,
                  since="", onto="",
                  report=True, return_type="list")
def test_add_recursive(path):
    # make a simple hierarchy
    parent = Dataset(path).create()
    assert_repo_status(parent.path)
    sub1 = parent.create(op.join('down', 'sub1'))
    assert_repo_status(parent.path)
    sub2 = parent.create('sub2')
    # the next one makes the parent dirty
    subsub = sub2.create('subsub')
    assert_repo_status(parent.path, modified=['sub2'])
    res = parent.save()
    assert_repo_status(parent.path)

    # now add content deep in the hierarchy
    create_tree(subsub.path, {'new': 'empty'})
    assert_repo_status(parent.path, modified=['sub2'])

    # recursive add should not even touch sub1, because
    # it knows that it is clean
    res = parent.save(recursive=True)
    # the key action is done
    assert_result_count(
        res, 1, path=op.join(subsub.path, 'new'), action='add', status='ok')
    # saved all the way up
    assert_result_count(res, 3, action='save', status='ok')
    assert_repo_status(parent.path)
def test_install_skip_list_arguments(src, path, path_outside):
    ds = install(path, source=src)
    ok_(ds.is_installed())

    # install a list with valid and invalid items:
    result = ds.install(
        path=['subm 1', 'not_existing', path_outside, '2'],
        get_data=False,
        on_failure='ignore', result_xfm=None, return_type='list')
    # good and bad results together
    ok_(isinstance(result, list))
    eq_(len(result), 4)
    # check that we have an 'impossible' status for both invalid args,
    # but all the other tasks have been accomplished
    for skipped, msg in [(opj(ds.path, 'not_existing'), "path does not exist"),
                         (path_outside, "path not associated with any dataset")]:
        assert_result_count(
            result, 1, status='impossible', message=msg, path=skipped)
    for sub in [Dataset(opj(path, 'subm 1')), Dataset(opj(path, '2'))]:
        assert_result_count(
            result, 1, status='ok',
            message=('Installed subdataset in order to get %s', sub.path))
        ok_(sub.is_installed())

    # return of get is always a list, by default, even if just one thing was gotten
    # in this case 'subm 1' was already obtained above, so this will get the
    # content of the subdataset
    with assert_raises(IncompleteResultsError) as cme:
        ds.install(path=['subm 1', 'not_existing'])
    with assert_raises(IncompleteResultsError) as cme:
        ds.get(path=['subm 1', 'not_existing'])
def test_invalid_call(path):
    with chpwd(path):
        # ^ Change directory so that we don't fail with an
        # InvalidGitRepositoryError if the test is executed from a git
        # worktree.

        # needs an SSH URL
        assert_raises(InsufficientArgumentsError, create_sibling, '')
        assert_raises(ValueError, create_sibling, 'http://ignore.me')
        # needs an actual dataset
        assert_raises(
            ValueError, create_sibling,
            'localhost:/tmp/somewhere', dataset='/nothere')
        # pre-configure a bogus remote
        ds = Dataset(path).create()
        ds.repo.add_remote('bogus', 'http://bogus.url.com')
        # fails to reconfigure by default with the generated name,
        # and also when given an existing name
        for res in (ds.create_sibling('bogus:/tmp/somewhere', on_failure='ignore'),
                    ds.create_sibling('localhost:/tmp/somewhere',
                                      name='bogus', on_failure='ignore')):
            assert_result_count(
                res, 1,
                status='error',
                message=(
                    "sibling '%s' already configured (specify alternative name, or force reconfiguration via --existing",
                    'bogus'))
def test_get_mixed_hierarchy(src, path):
    origin = Dataset(src).create(no_annex=True)
    origin_sub = origin.create('subds')
    with open(opj(origin.path, 'file_in_git.txt'), "w") as f:
        f.write('no idea')
    with open(opj(origin_sub.path, 'file_in_annex.txt'), "w") as f:
        f.write('content')
    origin.add('file_in_git.txt', to_git=True)
    origin_sub.add('file_in_annex.txt')
    origin.save()

    # now, install that thing:
    ds, subds = install(
        path, source=src, recursive=True,
        result_xfm='datasets', return_type='item-or-list', result_filter=None)
    ok_(subds.repo.file_has_content("file_in_annex.txt") is False)

    # and get:
    result = ds.get(curdir, recursive=True)
    # git repo and subds
    assert_status(['ok', 'notneeded'], result)
    assert_result_count(
        result, 1, path=opj(subds.path, "file_in_annex.txt"), status='ok')
    ok_(subds.repo.file_has_content("file_in_annex.txt") is True)
def test_our_metadataset_search(tdir):
    # TODO: re-enable when a dataset with new aggregated metadata is
    # available at some public location
    raise SkipTest
    # smoke test for basic search operations on our super-megadataset
    # expensive operation but ok
    #ds = install(
    #    path=tdir,
    #    # TODO: re-enable test when /// metadata actually conforms to the new metadata
    #    #source="///",
    #    source="smaug:/mnt/btrfs/datasets-meta6-4/datalad/crawl",
    #    result_xfm='datasets', return_type='item-or-list')
    assert list(ds.search('haxby'))
    assert_result_count(
        ds.search('id:873a6eae-7ae6-11e6-a6c8-002590f97d84', mode='textblob'),
        1,
        type='dataset',
        path=opj(ds.path, 'crcns', 'pfc-2'))

    # there is a problem with argparse not decoding into utf8 in PY2
    from datalad.cmdline.tests.test_main import run_main  # TODO: make it into an independent lean test
    from datalad.cmd import Runner
    out, err = Runner(cwd=ds.path)('datalad search Buzsáki')
    assert_in('crcns/pfc-2 ', out)  # has it in the description
    # and then another aspect: this entry is among multiple authors, need to
    # check if aggregating them into a searchable entity was done correctly
    assert_in('crcns/hc-1 ', out)
def test_invalid_call(origin, tdir):
    ds = Dataset(origin)
    ds.uninstall('subm 1', check=False)
    # nothing
    assert_status('error', publish('/notthere', on_failure='ignore'))
    # known, but not present
    assert_status('impossible', publish(opj(ds.path, 'subm 1'), on_failure='ignore'))
    # --since without dataset is now supported as long as it
    # could be identified
    # assert_raises(InsufficientArgumentsError, publish, since='HEAD')
    # but if it couldn't be, then it should indeed crash
    with chpwd(tdir):
        assert_raises(InsufficientArgumentsError, publish, since='HEAD')
    # new dataset, with unavailable subdataset
    dummy = Dataset(tdir).create()
    dummy_sub = dummy.create('sub')
    dummy_sub.uninstall()
    assert_in('sub', dummy.subdatasets(fulfilled=False, result_xfm='relpaths'))
    # now an explicit call to publish the unavailable subdataset
    assert_result_count(
        dummy.publish('sub', on_failure='ignore'),
        1,
        path=dummy_sub.path,
        status='impossible',
        type='dataset')
def test_kill(path):
    # nested datasets with load
    ds = Dataset(path).create()
    testfile = opj(ds.path, "file.dat")
    with open(testfile, 'w') as f:
        f.write("load")
    ds.save("file.dat")
    subds = ds.create('deep1')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['deep1'])
    ok_clean_git(ds.path)

    # and we fail to remove since content can't be dropped
    res = ds.remove(on_failure='ignore')
    assert_result_count(
        res, 1,
        status='error', path=testfile)
    # The following two assertions on the message rely on the actual error.
    # We have a second result with status 'impossible' for the ds, which we need
    # to filter out for those assertions:
    err_result = [r for r in res if r['status'] == 'error'][0]
    assert_result_values_cond(
        [err_result], 'message',
        lambda x: "configured minimum number of copies not found" in x or
                  "Could only verify the existence of 0 out of 1 necessary copies" in x
    )
    eq_(ds.remove(recursive=True, check=False, result_xfm='datasets'),
        [subds, ds])
    ok_(not exists(path))
def test_uninstall_git_file(path):
    ds = Dataset(path)
    ok_(ds.is_installed())
    ok_(exists(opj(path, 'INFO.txt')))
    ok_file_under_git(ds.repo.path, 'INFO.txt')

    # drop file in Git in an annex repo
    # regardless of the type of repo this is 'notneeded'...
    # it is less about education than about "can we
    # get the content back?", and for a file in Git we can
    assert_result_count(
        ds.drop(path='INFO.txt'),
        1,
        status='notneeded',
        message="no annex'ed content")

    res = ds.uninstall(path="INFO.txt", on_failure='ignore')
    assert_result_count(
        res, 1,
        status='impossible',
        message='can only uninstall datasets (consider the `drop` command)')

    # remove the file:
    res = ds.remove(path='INFO.txt', result_xfm='paths',
                    result_filter=lambda x: x['action'] == 'remove')
    assert_raises(AssertionError, ok_file_under_git, ds.repo.path, 'INFO.txt')
    ok_(not exists(opj(path, 'INFO.txt')))
    eq_(res, ['INFO.txt'])
def test_clone_isnot_recursive(src, path_nr, path_r):
    ds = clone(src, path_nr, result_xfm='datasets', return_type='item-or-list')
    ok_(ds.is_installed())
    # check nothing is unintentionally installed
    subdss = ds.subdatasets(recursive=True)
    assert_result_count(subdss, len(subdss), state='absent')
    # this also means the subdatasets are to be listed as not fulfilled:
    eq_(set(ds.subdatasets(recursive=True, fulfilled=False,
                           result_xfm='relpaths')),
        {'subm 1', '2'})
def test_publish_simple(origin, src_path, dst_path):
    # prepare src
    source = install(src_path, source=origin, recursive=True)
    # forget we cloned it (provide no 'origin' anymore), which should lead to
    # setting the tracking branch to target:
    source.repo.remove_remote("origin")

    # create plain git at target:
    target = GitRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    res = publish(dataset=source, to="target", result_xfm='datasets')
    eq_(res, [source])

    ok_clean_git(source.repo, annex=None)
    ok_clean_git(target, annex=None)
    eq_(list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")))

    # don't fail when doing it again
    res = publish(dataset=source, to="target")
    # and nothing is pushed
    assert_result_count(res, 1, status='notneeded')

    ok_clean_git(source.repo, annex=None)
    ok_clean_git(target, annex=None)
    eq_(list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")))
    eq_(list(target.get_branch_commits("git-annex")),
        list(source.repo.get_branch_commits("git-annex")))

    # 'target/master' should be the tracking branch at this point, so
    # try publishing without `to`:
    # MIH: Nope, we don't automatically add this anymore

    # some modification:
    with open(opj(src_path, 'test_mod_file'), "w") as f:
        f.write("Some additional stuff.")
    source.add(opj(src_path, 'test_mod_file'), to_git=True,
               message="Modified.")
    ok_clean_git(source.repo, annex=None)

    res = publish(dataset=source, to='target', result_xfm='datasets')
    eq_(res, [source])

    ok_clean_git(dst_path, annex=None)
    eq_(list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")))

    # Since git-annex 6.20170220, the post-receive hook gets triggered,
    # which results in an entry being added for that repo into uuid.log on the
    # remote end; since then git-annex finally senses that it needs to init that
    # remote, so it might have 1 more commit than local.
    # see https://github.com/datalad/datalad/issues/1319
    ok_(set(source.repo.get_branch_commits("git-annex")).issubset(
        set(target.get_branch_commits("git-annex"))))
def test_update_fetch_all(src, remote_1, remote_2):
    rmt1 = AnnexRepo.clone(src, remote_1)
    rmt2 = AnnexRepo.clone(src, remote_2)

    ds = Dataset(src)
    ds.siblings('add', name="sibling_1", url=remote_1)
    ds.siblings('add', name="sibling_2", url=remote_2)

    # modify the remotes:
    with open(opj(remote_1, "first.txt"), "w") as f:
        f.write("some file load")
    rmt1.add("first.txt")
    rmt1.commit()
    # TODO: Modify an already present file!

    with open(opj(remote_2, "second.txt"), "w") as f:
        f.write("different file load")
    rmt2.add("second.txt", git=True)
    rmt2.commit(msg="Add file to git.")

    # Let's init some special remote which we couldn't really update/fetch
    if not os.environ.get('DATALAD_TESTS_DATALADREMOTE'):
        ds.repo.init_remote(
            'datalad',
            ['encryption=none', 'type=external', 'externaltype=datalad'])

    # fetch all remotes
    assert_result_count(
        ds.update(), 1, status='ok', type='dataset')

    # no merge, so changes are not in active branch:
    assert_not_in("first.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    assert_not_in("second.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    # but we know the changes in remote branches:
    assert_in("first.txt", ds.repo.get_files("sibling_1/master"))
    assert_in("second.txt", ds.repo.get_files("sibling_2/master"))

    # no merge strategy for multiple remotes yet:
    # more clever now, there is a tracking branch that provides a remote
    #assert_raises(NotImplementedError, ds.update, merge=True)

    # merge a certain remote:
    assert_result_count(
        ds.update(sibling='sibling_1', merge=True),
        1, status='ok', type='dataset')

    # changes from sibling_2 still not present:
    assert_not_in("second.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    # changes from sibling_1 merged:
    assert_in("first.txt",
              ds.repo.get_files(ds.repo.get_active_branch()))
    # it's known to annex, but has no content yet:
    ds.repo.get_file_key("first.txt")  # raises if unknown
    eq_([False], ds.repo.file_has_content(["first.txt"]))
def test_placeholders(path):
    ds = Dataset(path).create(force=True)
    ds.add(".")
    ds.run("echo {inputs} >{outputs}", inputs=[".", "*.in"], outputs=["c.out"])
    ok_file_has_content(opj(path, "c.out"), "a.in b.in\n")

    hexsha_before = ds.repo.get_hexsha()
    ds.rerun()
    eq_(hexsha_before, ds.repo.get_hexsha())

    ds.run("echo {inputs[0]} >getitem", inputs=["*.in"])
    ok_file_has_content(opj(path, "getitem"), "a.in\n")

    ds.run("echo {pwd} >expanded-pwd")
    ok_file_has_content(opj(path, "expanded-pwd"), path, strip=True)

    ds.run("echo {dspath} >expanded-dspath")
    ok_file_has_content(opj(path, "expanded-dspath"), ds.path, strip=True)

    subdir_path = opj(path, "subdir")
    with chpwd(subdir_path):
        run("echo {pwd} >expanded-pwd")
    ok_file_has_content(opj(path, "subdir", "expanded-pwd"), subdir_path,
                        strip=True)
    eq_(get_run_info(ds, ds.repo.format_commit("%B"))[1]["pwd"],
        "subdir")

    # Double brackets can be used to escape placeholders.
    ds.run("touch {{inputs}}", inputs=["*.in"])
    ok_exists(opj(path, "{inputs}"))

    # rerun --script expands the placeholders.
    with patch("sys.stdout", new_callable=StringIO) as cmout:
        ds.rerun(script="-", since="")
    script_out = cmout.getvalue()
    assert_in("echo a.in b.in >c.out", script_out)
    assert_in("echo {} >expanded-pwd".format(subdir_path), script_out)
    assert_in("echo {} >expanded-dspath".format(ds.path), script_out)

    assert_result_count(
        ds.run("{unknown_placeholder}", on_failure="ignore"),
        1, status="impossible", action="run")

    # Configured placeholders.
    ds.config.add("datalad.run.substitutions.license", "gpl3", where="local")
    ds.run("echo {license} >configured-license")
    ok_file_has_content(opj(path, "configured-license"), "gpl3", strip=True)
    # --script handles configured placeholders.
    with patch("sys.stdout", new_callable=StringIO) as cmout:
        ds.rerun(script="-")
    assert_in("gpl3", cmout.getvalue())
def test_publish_plain_git(origin, src_path, dst_path):
    # TODO: Since it's mostly the same, melt with test_publish_simple

    # prepare src
    source = install(src_path, source=origin, recursive=True)
    # forget we cloned it (provide no 'origin' anymore), which should lead to
    # setting the tracking branch to target:
    source.repo.remove_remote("origin")

    # create plain git at target:
    target = GitRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    res = publish(dataset=source, to="target", result_xfm='datasets')
    eq_(res, [source])

    ok_clean_git(source.repo, annex=None)
    ok_clean_git(target, annex=None)
    eq_(list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")))

    # don't fail when doing it again
    res = publish(dataset=source, to="target")
    # and nothing is pushed
    assert_result_count(res, 1, status='notneeded')

    ok_clean_git(source.repo, annex=None)
    ok_clean_git(target, annex=None)
    eq_(list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")))

    # some modification:
    with open(opj(src_path, 'test_mod_file'), "w") as f:
        f.write("Some additional stuff.")
    source.add(opj(src_path, 'test_mod_file'), to_git=True,
               message="Modified.")
    ok_clean_git(source.repo, annex=None)

    res = publish(dataset=source, to='target', result_xfm='datasets')
    eq_(res, [source])

    ok_clean_git(dst_path, annex=None)
    eq_(list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")))

    # amend and change commit msg in order to test for force push:
    source.repo.commit("amended", options=['--amend'])
    # push should be rejected (non-fast-forward):
    assert_raises(IncompleteResultsError,
                  publish, dataset=source, to='target', result_xfm='datasets')
    # push with force=True works:
    res = publish(dataset=source, to='target', result_xfm='datasets', force=True)
    eq_(res, [source])
def test_get_single_file(path):
    ds = Dataset(path)
    ok_(ds.is_installed())
    ok_(ds.repo.file_has_content('test-annex.dat') is False)
    result = ds.get("test-annex.dat")
    assert_result_count(result, 1)
    assert_status('ok', result)
    eq_(result[0]['path'], opj(ds.path, 'test-annex.dat'))
    eq_(result[0]['annexkey'], ds.repo.get_file_key('test-annex.dat'))
    ok_(ds.repo.file_has_content('test-annex.dat') is True)
def test_update_git_smoke(src_path, dst_path):
    # Apparently this was just failing on git repos for basic lack of coverage,
    # hence this quick test
    ds = Dataset(src_path).create(no_annex=True)
    target = install(
        dst_path, source=src_path,
        result_xfm='datasets', return_type='item-or-list')
    create_tree(ds.path, {'file.dat': '123'})
    ds.save('file.dat')
    assert_result_count(
        target.update(recursive=True, merge=True),
        1, status='ok', type='dataset')
    ok_file_has_content(opj(target.path, 'file.dat'), '123')
def test_newthings_coming_down(originpath, destpath):
    origin = GitRepo(originpath, create=True)
    create_tree(originpath, {'load.dat': 'heavy'})
    Dataset(originpath).save('load.dat')
    ds = install(
        source=originpath, path=destpath,
        result_xfm='datasets', return_type='item-or-list')
    assert_is_instance(ds.repo, GitRepo)
    assert_in('origin', ds.repo.get_remotes())
    # turn origin into an annex
    origin = AnnexRepo(originpath, create=True)
    # clone doesn't know yet
    assert_false(knows_annex(ds.path))
    # but after an update it should
    # no merge, only one sibling, no parameters should be specific enough
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    assert(knows_annex(ds.path))
    # no branches appeared
    eq_(ds.repo.get_branches(), ['master'])
    # now merge, and get an annex
    assert_result_count(ds.update(merge=True), 1, status='ok', type='dataset')
    assert_in('git-annex', ds.repo.get_branches())
    assert_is_instance(ds.repo, AnnexRepo)
    # should be fully functional
    testfname = opj(ds.path, 'load.dat')
    assert_false(ds.repo.file_has_content(testfname))
    ds.get('.')
    ok_file_has_content(opj(ds.path, 'load.dat'), 'heavy')
    # check that a new tag comes down
    origin.tag('first!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[0], 'first!')

    # and now we destroy the remote annex
    origin._git_custom_command([], ['git', 'config', '--remove-section', 'annex'])
    rmtree(opj(origin.path, '.git', 'annex'), chmod_files=True)
    origin._git_custom_command([], ['git', 'branch', '-D', 'git-annex'])
    origin = GitRepo(originpath)
    assert_false(knows_annex(originpath))

    # and update the local clone
    # for now this should simply not fail (see gh-793), later it might be
    # enhanced to a graceful downgrade
    before_branches = ds.repo.get_branches()
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(before_branches, ds.repo.get_branches())
    # annex branch got pruned
    eq_(['origin/HEAD', 'origin/master'], ds.repo.get_remote_branches())
    # check that a new tag comes down even if repo types mismatch
    origin.tag('second!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[-1], 'second!')
def test_update_strategy(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    # we start clean
    for ds in base, sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the base dataset only, nothing below changes
    base.aggregate_metadata()
    eq_(len(_get_contained_objs(base)), 2)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the entire tree, but by default this only updates
    # the top-level dataset with all objects, none of the leaf
    # or intermediate datasets gets touched
    base.aggregate_metadata(recursive=True)
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_referenced_objs(base)), 6)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # it is impossible to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'impossible',
            ds.metadata(get_aggregates=True, on_failure='ignore'))
    # get the full metadata report
    target_meta = base.metadata(return_type='list')

    # now redo full aggregation, this time updating all
    # (intermediate) datasets
    base.aggregate_metadata(recursive=True, update_mode='all')
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_contained_objs(sub)), 4)
    eq_(len(_get_contained_objs(subsub)), 2)
    # it is now OK to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'ok',
            ds.metadata(get_aggregates=True, on_failure='ignore'))

    # all of that has no impact on the reported metadata
    eq_(target_meta, base.metadata(return_type='list'))
def test_clone_report_permission_issue(tdir):
    pdir = _path_(tdir, 'protected')
    mkdir(pdir)
    # make it read-only
    chmod(pdir, 0o555)
    with chpwd(pdir):
        res = clone('///', result_xfm=None, return_type='list', on_failure='ignore')
        assert_status('error', res)
        assert_result_count(
            res, 1, status='error',
            message="could not create work tree dir '%s/%s': Permission denied"
                    % (pdir, get_datasets_topdir())
        )
def test_autoresolve_multiple_datasets(src, path):
    with chpwd(path):
        ds1 = install(
            'ds1', source=src,
            result_xfm='datasets', return_type='item-or-list')
        ds2 = install(
            'ds2', source=src,
            result_xfm='datasets', return_type='item-or-list')
        results = get([opj('ds1', 'test-annex.dat')] + glob(opj('ds2', '*.dat')))
        # each ds has one file
        assert_result_count(results, 2, type='file', action='get', status='ok')
        ok_(ds1.repo.file_has_content('test-annex.dat') is True)
        ok_(ds2.repo.file_has_content('test-annex.dat') is True)
def test_here(path):
    # a few smoke tests regarding the 'here' sibling
    ds = create(path)
    res = ds.siblings(
        'query',
        on_failure='ignore',
        result_renderer=None)
    assert_status('ok', res)
    assert_result_count(res, 1)
    assert_result_count(res, 1, name='here')
    here = res[0]
    eq_(ds.repo.uuid, here['annex-uuid'])
    assert_in('annex-description', here)
    assert_in('annex-bare', here)
    assert_in('available_local_disk_space', here)

    # set a description
    res = ds.siblings(
        'configure',
        name='here',
        description='very special',
        on_failure='ignore',
        result_renderer=None)
    assert_status('ok', res)
    assert_result_count(res, 1)
    assert_result_count(res, 1, name='here')
    here = res[0]
    eq_('very special', here['annex-description'])
def _compare_metadata_helper(origres, compds):
    for ores in origres:
        rpath = relpath(ores['path'], ores['refds'])
        cres = compds.metadata(
            rpath,
            reporton='{}s'.format(ores['type']))
        if ores['type'] == 'file':
            # TODO implement file-based lookup
            continue
        assert_result_count(cres, 1)
        cres = cres[0]
        assert_dict_equal(ores['metadata'], cres['metadata'])
        if ores['type'] == 'dataset':
            for i in ('dsid', ):
                eq_(ores[i], cres[i])
def test_archive(path):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.save()
    committed_date = ds.repo.get_commit_date()
    default_outname = opj(path, 'datalad_{}.tar.gz'.format(ds.id))
    with chpwd(path):
        res = list(ds.export_archive())
        assert_status('ok', res)
        assert_result_count(res, 1)
        assert(isabs(res[0]['path']))
    assert_true(os.path.exists(default_outname))
    custom_outname = opj(path, 'myexport.tar.gz')
    # feed in without extension
    ds.export_archive(filename=custom_outname[:-7])
    assert_true(os.path.exists(custom_outname))
    custom1_md5 = md5sum(custom_outname)
    # encodes the original archive filename -> different checksum, despite
    # same content
    assert_not_equal(md5sum(default_outname), custom1_md5)
    # should really sleep so if they stop using time.time - we know
    time.sleep(1.1)
    ds.export_archive(filename=custom_outname)
    # should not encode mtime, so should be identical
    assert_equal(md5sum(custom_outname), custom1_md5)

    def check_contents(outname, prefix):
        with tarfile.open(outname) as tf:
            nfiles = 0
            for ti in tf:
                # any annex links resolved
                assert_false(ti.issym())
                ok_startswith(ti.name, prefix + '/')
                assert_equal(ti.mtime, committed_date)
                if '.datalad' not in ti.name:
                    # ignore any files in .datalad for this test to not be
                    # susceptible to changes in how much meta info we generate
                    nfiles += 1
            # we have exactly four files (includes .gitattributes for default
            # MD5E backend), and expect no content for any directory
            assert_equal(nfiles, 4)

    check_contents(default_outname, 'datalad_%s' % ds.id)
    check_contents(custom_outname, 'myexport')

    # now lose some content
    ds.drop('file_up', check=False)
    assert_raises(IOError, ds.export_archive, filename=opj(path, 'my'))
    ds.export_archive(filename=opj(path, 'partial'), missing_content='ignore')
    assert_true(os.path.exists(opj(path, 'partial.tar.gz')))
def test_status_nods(path, otherpath):
    ds = Dataset(path).create()
    assert_result_count(
        ds.status(path=otherpath, on_failure='ignore'),
        1,
        status='error',
        message='path not underneath this dataset')
    otherds = Dataset(otherpath).create()
    assert_result_count(
        ds.status(path=otherpath, on_failure='ignore'),
        1,
        path=otherds.path,
        status='error',
        message=(
            'dataset containing given paths is not underneath the reference dataset %s: %s',
            ds, [])
    )
def test_gh1597(path):
    ds = Dataset(path).create()
    sub = ds.create('sub', save=False)
    # only staged at this point, but known, and not annexed
    ok_file_under_git(ds.path, '.gitmodules', annexed=False)
    res = ds.subdatasets()
    assert_result_count(res, 1, path=sub.path)
    # now modify .gitmodules with another command
    ds.subdatasets(contains=sub.path, set_property=[('this', 'that')])
    ok_clean_git(ds.path, index_modified=['sub'])
    # now modify low-level
    with open(opj(ds.path, '.gitmodules'), 'a') as f:
        f.write('\n')
    ok_clean_git(ds.path, index_modified=['.gitmodules', 'sub'])
    ds.add('.gitmodules')
    # must not come under annex management
    ok_file_under_git(ds.path, '.gitmodules', annexed=False)
def test_smth_about_not_supported(p1, p2):
    source = Dataset(p1).create()
    from datalad.support.network import PathRI
    source.create_sibling(
        'ssh://localhost' + PathRI(p2).posixpath,
        name='target1')
    # source.publish(to='target1')
    with chpwd(p1):
        # since we have only two commits (set backend, init dataset)
        # -- there is no HEAD^^
        assert_result_count(
            publish(to='target1', since='HEAD^^', on_failure='ignore'),
            1,
            status='impossible',
            message="fatal: bad revision 'HEAD^^'")
        # but now let's add one more commit, then we should be able to publish
        source.repo.commit("msg", options=['--allow-empty'])
        publish(to='target1', since='HEAD^')  # must not fail now
def test_exif(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['exif']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)
def test_within_ds_file_search(path):
    try:
        import nibabel
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('nifti1.nii.gz', opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')),
            ('nifti1.nii.gz', opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('bids', 'nifti1'):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
bids.BIDSVersion
bids.author
bids.citation
bids.conformsto
bids.description
bids.fundedby
bids.license
bids.modality
bids.name
bids.participant.age(years)
bids.participant.gender
bids.participant.handedness
bids.participant.hearing_problems_current
bids.participant.id
bids.participant.language
bids.subject
bids.task
bids.type
datalad_core.id
datalad_core.refcommit
id
nifti1.cal_max
nifti1.cal_min
nifti1.datatype
nifti1.description
nifti1.dim
nifti1.freq_axis
nifti1.intent
nifti1.magic
nifti1.phase_axis
nifti1.pixdim
nifti1.qform_code
nifti1.sform_code
nifti1.sizeof_hdr
nifti1.slice_axis
nifti1.slice_duration
nifti1.slice_end
nifti1.slice_order
nifti1.slice_start
nifti1.spatial_resolution(mm)
nifti1.t_unit
nifti1.temporal_spacing(s)
nifti1.toffset
nifti1.vox_offset
nifti1.xyz_unit
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            # multi-word query implies AND
            ('textblob',
             ['bold', 'male'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'meta', 'male'),
            # report which field matched with auto-field
            ('autofield',
             'female',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'),
             'bids.participant.gender', 'female'),
            # autofield multi-word query is also AND
            ('autofield',
             ['bids.type:bold', 'bids.participant.id:01'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'bids.type', 'bold'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (but could be configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always have a file and the dataset, because they carry metadata in
            # the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from, critical for
                # discovering related content
                dsid=ds.id)
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)
def test_ephemeral(origin_path, bare_path,
                   clone1_path, clone2_path, clone3_path):

    file_test = Path('ds') / 'test.txt'
    file_testsub = Path('ds') / 'subdir' / 'testsub.txt'

    origin = Dataset(origin_path).create(force=True)
    if origin.repo.is_managed_branch():
        raise SkipTest('Ephemeral clones cannot use adjusted mode repos')
    origin.save()

    # 1. clone via path
    clone1 = clone(origin_path, clone1_path, reckless='ephemeral')

    can_symlink = has_symlink_capability()

    if can_symlink:
        clone1_annex = (clone1.repo.dot_git / 'annex')
        ok_(clone1_annex.is_symlink())
        ok_(clone1_annex.resolve().samefile(origin.repo.dot_git / 'annex'))
        if not clone1.repo.is_managed_branch():
            # TODO: We can't properly handle adjusted branch yet
            eq_((clone1.pathobj / file_test).read_text(), 'some')
            eq_((clone1.pathobj / file_testsub).read_text(), 'somemore')

    # 2. clone via file-scheme URL
    clone2 = clone('file://' + Path(origin_path).as_posix(), clone2_path,
                   reckless='ephemeral')

    if can_symlink:
        clone2_annex = (clone2.repo.dot_git / 'annex')
        ok_(clone2_annex.is_symlink())
        ok_(clone2_annex.resolve().samefile(origin.repo.dot_git / 'annex'))
        if not clone2.repo.is_managed_branch():
            # TODO: We can't properly handle adjusted branch yet
            eq_((clone2.pathobj / file_test).read_text(), 'some')
            eq_((clone2.pathobj / file_testsub).read_text(), 'somemore')

    # 3. add something to clone1 and push back to origin; availability from
    # clone1 should not be propagated (we declared 'here' dead to that end)
    (clone1.pathobj / 'addition.txt').write_text("even more")
    clone1.save()
    origin.config.set("receive.denyCurrentBranch", "updateInstead",
                      where="local")
    # Note, that the only thing to test is git-annex-dead here,
    # if we couldn't symlink:
    clone1.publish(to='origin', transfer_data='none' if can_symlink else 'auto')
    if not origin.repo.is_managed_branch():
        # test logic cannot handle adjusted branches
        eq_(origin.repo.get_hexsha(), clone1.repo.get_hexsha())
    res = origin.repo.whereis("addition.txt")
    if can_symlink:
        # obviously present in origin, but this is not yet known to origin:
        eq_(res, [])
        res = origin.repo.fsck()
        assert_result_count(res, 3, success=True)
        # TODO: Double check whether annex reports POSIX paths on windows!
        eq_({str(file_test), str(file_testsub), "addition.txt"},
            {r['file'] for r in res})
        # now origin knows:
    res = origin.repo.whereis("addition.txt")
    eq_(res, [origin.config.get("annex.uuid")])

    # 4. ephemeral clone from a bare repo
    runner = GitWitlessRunner()
    runner.run(['git', 'clone', '--bare', origin_path, bare_path])
    runner.run(['git', 'annex', 'init'], cwd=bare_path)

    eph_from_bare = clone(bare_path, clone3_path, reckless='ephemeral')
    can_symlink = has_symlink_capability()

    if can_symlink:
        # Bare repo uses dirhashlower by default, while a standard repo uses
        # dirhashmixed. Symlinking different object trees doesn't really work.
        # Don't test that here, since this is not a matter of the "ephemeral"
        # option alone. We should have such a setup in the RIA tests and test
        # for data access there.
        # Here we only test for the correct linking.
        eph_annex = eph_from_bare.repo.dot_git / 'annex'
        ok_(eph_annex.is_symlink())
        ok_(eph_annex.resolve().samefile(Path(bare_path) / 'annex'))
def test_run_subdataset_install(path):
    path = Path(path)
    ds_src = Dataset(path / "src").create()
    # Repository setup
    #
    # .
    # |-- a/
    # |   |-- a2/
    # |   |   `-- img
    # |   `-- img
    # |-- b/
    # |   `-- b2/
    # |       `-- img
    # |-- c/
    # |   `-- c2/
    # |       `-- img
    # `-- d/
    #     `-- d2/
    #         `-- img
    ds_src_a = ds_src.create("a")
    ds_src_a2 = ds_src_a.create("a2")
    ds_src_b = Dataset(ds_src.pathobj / "b").create()
    ds_src_b2 = ds_src_b.create("b2")
    ds_src_c = ds_src.create("c")
    ds_src_c2 = ds_src_c.create("c2")
    ds_src_d = Dataset(ds_src.pathobj / "d").create()
    ds_src_d2 = ds_src_d.create("d2")

    ds_src.save()

    add_pyscript_image(ds_src_a, "in-a", "img")
    add_pyscript_image(ds_src_a2, "in-a2", "img")
    add_pyscript_image(ds_src_b2, "in-b2", "img")
    add_pyscript_image(ds_src_c2, "in-c2", "img")
    add_pyscript_image(ds_src_d2, "in-d2", "img")

    ds_src.save(recursive=True)

    ds_dest = clone(ds_src.path, str(path / "dest"))
    ds_dest_a2 = Dataset(ds_dest.pathobj / "a" / "a2")
    ds_dest_b2 = Dataset(ds_dest.pathobj / "b" / "b2")
    ds_dest_c2 = Dataset(ds_dest.pathobj / "c" / "c2")
    ds_dest_d2 = Dataset(ds_dest.pathobj / "d" / "d2")
    assert_false(ds_dest_a2.is_installed())
    assert_false(ds_dest_b2.is_installed())
    assert_false(ds_dest_c2.is_installed())
    assert_false(ds_dest_d2.is_installed())

    # Needed subdatasets are installed if the container name is given...
    res = ds_dest.containers_run(["arg"], container_name="a/a2/in-a2")
    assert_result_count(
        res, 1, action="install", status="ok", path=ds_dest_a2.path)
    assert_result_count(
        res, 1, action="get", status="ok",
        path=str(ds_dest_a2.pathobj / "img"))
    ok_(ds_dest_a2.is_installed())
    # ... even if the name and path do not match.
    res = ds_dest.containers_run(["arg"], container_name="b/b2/in-b2")
    assert_result_count(
        res, 1, action="install", status="ok", path=ds_dest_b2.path)
    assert_result_count(
        res, 1, action="get", status="ok",
        path=str(ds_dest_b2.pathobj / "img"))
    ok_(ds_dest_b2.is_installed())
    # Subdatasets will also be installed if given an image path...
    res = ds_dest.containers_run(["arg"], container_name=str(Path("c/c2/img")))
    assert_result_count(
        res, 1, action="install", status="ok", path=ds_dest_c2.path)
    assert_result_count(
        res, 1, action="get", status="ok",
        path=str(ds_dest_c2.pathobj / "img"))
    ok_(ds_dest_c2.is_installed())
    ds_dest.containers_run(["arg"], container_name=str(Path("d/d2/img")))

    # There's no install record if the subdataset is already present.
    res = ds_dest.containers_run(["arg"], container_name="a/a2/in-a2")
    assert_not_in_results(res, action="install")
def test_publish_plain_git(origin, src_path, dst_path):
    # TODO: Since it's mostly the same, melt with test_publish_simple

    # prepare src
    source = install(src_path, source=origin, recursive=True)
    # forget we cloned it (provide no 'origin' anymore), which should lead to
    # setting the tracking branch to target:
    source.repo.remove_remote("origin")

    # create plain git at target:
    target = GitRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    res = publish(dataset=source, to="target", result_xfm='datasets')
    eq_(res, [source])

    assert_repo_status(source.repo, annex=None)
    assert_repo_status(target, annex=None)
    eq_(list(target.get_branch_commits_("master")),
        list(source.repo.get_branch_commits_("master")))

    # don't fail when doing it again
    res = publish(dataset=source, to="target")
    # and nothing is pushed
    assert_result_count(res, 1, status='notneeded')

    assert_repo_status(source.repo, annex=None)
    assert_repo_status(target, annex=None)
    eq_(list(target.get_branch_commits_("master")),
        list(source.repo.get_branch_commits_("master")))

    # some modification:
    with open(opj(src_path, 'test_mod_file'), "w") as f:
        f.write("Some additional stuff.")
    source.save(opj(src_path, 'test_mod_file'), to_git=True,
                message="Modified.")
    assert_repo_status(source.repo, annex=None)

    res = publish(dataset=source, to='target', result_xfm='datasets')
    eq_(res, [source])

    assert_repo_status(dst_path, annex=None)
    eq_(list(target.get_branch_commits_("master")),
        list(source.repo.get_branch_commits_("master")))

    # amend and change commit msg in order to test for force push:
    source.repo.commit("amended", options=['--amend'])
    # push should be rejected (non-fast-forward):
    assert_raises(IncompleteResultsError,
                  publish, dataset=source, to='target', result_xfm='datasets')
    # push with force=True works:
    res = publish(dataset=source, to='target', result_xfm='datasets', force=True)
    eq_(res, [source])
def test_unlock(path):
    ds = Dataset(path)

    # file is currently locked:
    # TODO: use get_annexed_files instead of hardcoded filename
    assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")

    # Note: In V6+ we can unlock even if the file's content isn't present, but
    # doing so when unlock() is called with no paths isn't consistent with the
    # current behavior when an explicit path is given (it doesn't unlock) or
    # with the behavior in V5, so we don't do it.

    # Unlocking the dataset without an explicit path does not fail if there
    # are files without content.
    eq_(ds.unlock(path=None, on_failure="ignore"), [])
    eq_(ds.unlock(path=[], on_failure="ignore"), [])

    # cannot unlock without content (annex get wasn't called)
    assert_in_results(ds.unlock(path="test-annex.dat", on_failure="ignore"),
                      path=opj(path, "test-annex.dat"),
                      status="impossible")

    ds.repo.get('test-annex.dat')
    result = ds.unlock()
    assert_result_count(result, 1)
    assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='ok')

    with open(opj(path, 'test-annex.dat'), "w") as f:
        f.write("change content")

    ds.repo.add('test-annex.dat')
    # in V6+ we need to explicitly re-lock it:
    if ds.repo.supports_unlocked_pointers:
        # TODO: RF: make 'lock' a command as well
        # re-lock to further on have a consistent situation with V5:
        ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock'])
    ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again")

    # after commit, file is locked again:
    assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")

    # content was changed:
    with open(opj(path, 'test-annex.dat'), "r") as f:
        eq_("change content", f.read())

    # unlock again, this time more specific:
    result = ds.unlock(path='test-annex.dat')
    assert_result_count(result, 1)
    assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='ok')

    with open(opj(path, 'test-annex.dat'), "w") as f:
        f.write("change content again")

    ds.repo.add('test-annex.dat')
    # in V6+ we need to explicitly re-lock it:
    if ds.repo.supports_unlocked_pointers:
        # TODO: RF: make 'lock' a command as well
        # re-lock to further on have a consistent situation with V5:
        ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock'])
    ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again")

    # TODO:
    # BOOOM: test-annex.dat writeable in V6!
    # Why the hell is this different than the first time we wrote to the file
    # and locked it again?
    # Also: After opening, the file is empty.

    # after commit, file is locked again:
    assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")
    # content was changed:
    with open(opj(path, 'test-annex.dat'), "r") as f:
        eq_("change content again", f.read())
def test_container_files(path, super_path):
    raise SkipTest('SingularityHub is gone for now')
    ds = Dataset(path).create()
    cmd = ['dir'] if on_windows else ['ls']

    # plug in a proper singularity image
    ds.containers_add(
        'mycontainer',
        url=testimg_url,
        image='righthere',
        # the next one is auto-guessed
        #call_fmt='singularity exec {img} {cmd}'
    )
    assert_result_count(
        ds.containers_list(), 1,
        path=op.join(ds.path, 'righthere'), name='mycontainer')
    ok_clean_git(path)

    def assert_no_change(res, path):
        # this command changed nothing
        #
        # Avoid specifying the action because it will change from "add" to
        # "save" in DataLad v0.12.
        assert_result_count(res, 1, status='notneeded', path=path, type='dataset')

    # now we can run stuff in the container
    # and because there is just one, we don't even have to name the container
    res = ds.containers_run(cmd)
    # container becomes an 'input' for `run` -> get request, but "notneeded"
    assert_result_count(
        res, 1, action='get', status='notneeded',
        path=op.join(ds.path, 'righthere'), type='file')
    assert_no_change(res, ds.path)
    # same thing when we specify the container by its name:
    res = ds.containers_run(cmd, container_name='mycontainer')
    # container becomes an 'input' for `run` -> get request, but "notneeded"
    assert_result_count(
        res, 1, action='get', status='notneeded',
        path=op.join(ds.path, 'righthere'), type='file')
    assert_no_change(res, ds.path)
    # we can also specify the container by its path:
    res = ds.containers_run(cmd, container_name=op.join(ds.path, 'righthere'))
    # container becomes an 'input' for `run` -> get request, but "notneeded"
    assert_result_count(
        res, 1, action='get', status='notneeded',
        path=op.join(ds.path, 'righthere'), type='file')
    assert_no_change(res, ds.path)

    # Now, test the same thing, but with this dataset being a subdataset of
    # another one:
    super_ds = Dataset(super_path).create()
    super_ds.install("sub", source=path)

    # When running, we don't discover containers in subdatasets
    with assert_raises(ValueError) as cm:
        super_ds.containers_run(cmd)
    assert_in("No known containers", str(cm.exception))
    # ... unless we specify the name
    res = super_ds.containers_run(cmd, container_name="sub/mycontainer")
    # container becomes an 'input' for `run` -> get request (needed this time)
    assert_result_count(
        res, 1, action='get', status='ok',
        path=op.join(super_ds.path, 'sub', 'righthere'), type='file')
    assert_no_change(res, super_ds.path)
def test_rerun_onto(path):
    ds = Dataset(path).create()

    grow_file = opj(path, "grows")

    # Make sure we can handle range-specifications that yield no results.
    for since in ["", "HEAD"]:
        assert_result_count(
            ds.rerun("HEAD", onto="", since=since, on_failure="ignore"),
            1, status="impossible", action="run")

    ds.run('echo static-content > static')
    ds.repo.tag("static")
    ds.run('echo x$(cat grows) > grows')
    ds.rerun()
    eq_('xx\n', open(grow_file).read())

    # If we run the "static" change on top of itself, we end up in the
    # same (but detached) place.
    ds.rerun(revision="static", onto="static")
    ok_(ds.repo.get_active_branch() is None)
    eq_(ds.repo.repo.git.rev_parse("HEAD"),
        ds.repo.repo.git.rev_parse("static"))

    # If we run the "static" change from the same "base", we end up
    # with a new commit.
    ds.repo.checkout("master")
    ds.rerun(revision="static", onto="static^")
    ok_(ds.repo.get_active_branch() is None)
    neq_(ds.repo.repo.git.rev_parse("HEAD"),
         ds.repo.repo.git.rev_parse("static"))
    assert_result_count(ds.diff(revision="HEAD..static"), 0)
    for revrange in ["..static", "static.."]:
        assert_result_count(
            ds.repo.repo.git.rev_list(revrange).split(), 1)

    # Unlike the static change, if we run the ever-growing change on
    # top of itself, we end up with a new commit.
    ds.repo.checkout("master")
    ds.rerun(onto="HEAD")
    ok_(ds.repo.get_active_branch() is None)
    neq_(ds.repo.repo.git.rev_parse("HEAD"),
         ds.repo.repo.git.rev_parse("master"))

    # An empty `onto` means use the parent of the first revision.
    ds.repo.checkout("master")
    ds.rerun(since="static^", onto="")
    ok_(ds.repo.get_active_branch() is None)
    for revrange in ["..master", "master.."]:
        assert_result_count(
            ds.repo.repo.git.rev_list(revrange).split(), 3)

    # An empty `onto` means use the parent of the first revision that
    # has a run command.
    ds.repo.checkout("master")
    ds.rerun(since="", onto="", branch="from-base")
    eq_(ds.repo.get_active_branch(), "from-base")
    assert_result_count(ds.diff(revision="master..from-base"), 0)
    eq_(ds.repo.get_merge_base(["static", "from-base"]),
        ds.repo.repo.git.rev_parse("static^"))
def test_rerun(path, nodspath):
    ds = Dataset(path).create()
    sub = ds.create('sub')
    probe_path = opj(sub.path, 'sequence')
    # run inside the dataset
    with chpwd(path), \
            swallow_outputs():
        ds.run('echo x$(cat sub/sequence) > sub/sequence')
    # command ran once, all clean
    ok_clean_git(ds.path)
    eq_('x\n', open(probe_path).read())
    # now, for a rerun we can be anywhere, PWD and all are recorded
    # moreover, rerun must figure out which bits to unlock, even in
    # subdatasets
    with chpwd(nodspath), \
            swallow_outputs():
        ds.rerun()
    ok_clean_git(ds.path)
    # ran twice now
    eq_('xx\n', open(probe_path).read())

    # Rerunning from a subdataset skips the command.
    _, sub_info = get_run_info(ds, sub.repo.repo.head.commit.message)
    eq_(ds.id, sub_info["dsid"])
    assert_result_count(
        sub.rerun(return_type="list", on_failure="ignore"),
        1, status="impossible", action="run", rerun_action="skip")
    eq_('xx\n', open(probe_path).read())

    # Rerun fails with a dirty repo.
    dirt = opj(path, "dirt")
    with open(dirt, "w") as fh:
        fh.write("")
    assert_status('impossible', ds.rerun(on_failure="ignore"))
    remove(dirt)
    ok_clean_git(ds.path)

    # Make a non-run commit.
    with open(opj(path, "nonrun-file"), "w") as f:
        f.write("foo")
    ds.add("nonrun-file")
    # Now rerun the buried command.
    ds.rerun(revision="HEAD~", message="rerun buried")
    eq_('xxx\n', open(probe_path).read())
    # Also check that the message override worked.
    eq_(ds.repo.repo.head.commit.message.splitlines()[0],
        "[DATALAD RUNCMD] rerun buried")
    # Or a range of commits, skipping non-run commits.
    ds.rerun(since="HEAD~3")
    eq_('xxxxx\n', open(probe_path).read())
    # Or --since= to run all reachable commits.
    ds.rerun(since="")
    eq_('xxxxxxxxxx\n', open(probe_path).read())

    # We can get back a report of what would happen rather than actually
    # rerunning anything.
    report = ds.rerun(since="", report=True, return_type="list")
    # Nothing changed.
    eq_('xxxxxxxxxx\n', open(probe_path).read())
    assert_result_count(report, 1, rerun_action="skip")
    report[-1]["commit"] == ds.repo.get_hexsha()

    # If a file is dropped, we remove it instead of unlocking it.
    ds.drop(probe_path, check=False)
    ds.rerun()
    eq_('x\n', open(probe_path).read())
    # If the history to rerun has a merge commit, we abort.
    ds.repo.checkout("HEAD~3", options=["-b", "topic"])
    with open(opj(path, "topic-file"), "w") as f:
        f.write("topic")
    ds.add("topic-file")
    ds.repo.checkout("master")
    ds.repo.merge("topic")
    ok_clean_git(ds.path)
    assert_raises(IncompleteResultsError, ds.rerun)
def test_aggregation(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(op.join(path, 'origin')).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        where='dataset')
    assert_status('ok', ds.save(recursive=True))
    # while we are at it: do it again, nothing should happen
    assert_status('notneeded', ds.save(recursive=True))

    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.meta_aggregate(recursive=True, into='all')
    # we get a success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='meta_aggregate')
    # the respective super datasets see two saves, one to record the change
    # in the subdataset after its own aggregation, and one after the super
    # was updated with aggregated metadata
    assert_result_count(res, 5, status='ok', action='save', type='dataset')
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.meta_dump(reporton='aggregates', recursive=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.meta_dump(recursive=True)
    # basic sanity check
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(
        [r for r in origres if r['path'].endswith('.json')],
        3, type='file')
    # Now that we have annex.key
    # three different IDs
    eq_(
        3,
        len(set([
            _get_dsid_from_core_metadata(s['metadata']['metalad_core'])
            for s in origres
            if s['type'] == 'dataset'
        ])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] ==
                 assure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        op.join(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    eq_(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.meta_dump()
    # basic sanity check
    assert_result_count(cloneres, 1, type='dataset')
    # payload file
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
def test_unlock(path):
    ds = Dataset(path)

    # file is currently locked:
    # TODO: use get_annexed_files instead of hardcoded filename
    assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")

    # in direct mode there is no unlock:
    if ds.repo.is_direct_mode():
        res = ds.unlock()
        assert_result_count(res, 1)
        assert_status('notneeded', res)
    # in V6 we can unlock even if the file's content isn't present:
    elif ds.repo.config.getint("annex", "version") == 6:
        res = ds.unlock()
        assert_result_count(res, 1)
        assert_status('ok', res)
        # TODO: RF: make 'lock' a command as well
        # re-lock to further on have a consistent situation with V5:
        ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock'])
    else:
        # cannot unlock without content (annex get wasn't called)
        assert_raises(CommandError, ds.unlock)  # FIXME

    ds.repo.get('test-annex.dat')
    result = ds.unlock()
    assert_result_count(result, 1)
    if ds.repo.is_direct_mode():
        assert_status('notneeded', result)
    else:
        assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='ok')

    with open(opj(path, 'test-annex.dat'), "w") as f:
        f.write("change content")

    ds.repo.add('test-annex.dat')
    # in V6 we need to explicitly re-lock it:
    if ds.repo.config.getint("annex", "version") == 6:
        # TODO: RF: make 'lock' a command as well
        # re-lock to further on have a consistent situation with V5:
        ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock'])
    ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again")

    if not ds.repo.is_direct_mode():
        # after commit, file is locked again:
        assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")

    # content was changed:
    with open(opj(path, 'test-annex.dat'), "r") as f:
        eq_("change content", f.read())

    # unlock again, this time more specific:
    result = ds.unlock(path='test-annex.dat')
    assert_result_count(result, 1)
    if ds.repo.is_direct_mode():
        assert_in_results(result, path=opj(ds.path, 'test-annex.dat'),
                          status='notneeded')
    else:
        assert_in_results(result, path=opj(ds.path, 'test-annex.dat'),
                          status='ok')

    with open(opj(path, 'test-annex.dat'), "w") as f:
        f.write("change content again")

    ds.repo.add('test-annex.dat')
    # in V6 we need to explicitly re-lock it:
    if ds.repo.config.getint("annex", "version") == 6:
        # TODO: RF: make 'lock' a command as well
        # re-lock to further on have a consistent situation with V5:
        ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock'])
    ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again")

    # TODO:
    # BOOOM: test-annex.dat writeable in V6!
    # Why the hell is this different than the first time we wrote to the file
    # and locked it again?
    # Also: After opening, the file is empty.

    if not ds.repo.is_direct_mode():
        # after commit, file is locked again:
        assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w")
    # content was changed:
    with open(opj(path, 'test-annex.dat'), "r") as f:
        eq_("change content again", f.read())
def test_save(path):
    ds = Dataset(path)
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("something")
    ds.repo.add("new_file.tst", git=True)
    ok_(ds.repo.dirty)
    ds.save(message="add a new file")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("modify")
    ok_(ds.repo.dirty)
    ds.save(message="modified new_file.tst")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    # save works without ds and files given in the PWD
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("rapunzel")
    with chpwd(path):
        save(message="love rapunzel")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    # and also without `-a` when things are staged
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("exotic")
    ds.repo.add("new_file.tst", git=True)
    with chpwd(path):
        save(message="love marsians")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    files = ['one.txt', 'two.txt']
    for fn in files:
        with open(op.join(path, fn), "w") as f:
            f.write(fn)
    ds.save([op.join(path, f) for f in files])
    # superfluous call to save (all was saved already), should not fail
    # but report that nothing was saved
    assert_status('notneeded', ds.save(message="set of new files"))
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    # create subdataset
    subds = ds.create('subds')
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    # modify subds
    with open(op.join(subds.path, "some_file.tst"), "w") as f:
        f.write("something")
    subds.save()
    assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    # ensure modified subds is committed
    ds.save()
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    # now introduce a change downstairs
    subds.create('someotherds')
    assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    ok_(ds.repo.dirty)
    # and save via subdataset path
    ds.save('subds', version_tag='new_sub')
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    tags = ds.repo.get_tags()
    ok_(len(tags) == 1)
    eq_(tags[0], dict(hexsha=ds.repo.get_hexsha(), name='new_sub'))
    # fails when retagged, like git does
    res = ds.save(version_tag='new_sub', on_failure='ignore')
    assert_status('error', res)
    assert_result_count(
        res, 1, action='save', type='dataset', path=ds.path,
        message=('cannot tag this version: %s',
                 "fatal: tag 'new_sub' already exists"))
def test_get_modified_subpaths(path):
    ds = Dataset(path).create(force=True)
    suba = ds.create('ba', force=True)
    subb = ds.create('bb', force=True)
    subsub = ds.create(opj('bb', 'bba', 'bbaa'), force=True)
    ds.add('.', recursive=True)
    ok_clean_git(path)
    orig_base_commit = ds.repo.repo.commit().hexsha
    # nothing was modified compared to the status quo, output must be empty
    eq_([],
        list(get_modified_subpaths(
            [dict(path=ds.path)], ds, orig_base_commit)))
    # modify one subdataset
    create_tree(subsub.path, {'added': 'test'})
    subsub.add('added')
    # it will replace the requested path with the path of the closest
    # submodule that is modified
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)], ds, orig_base_commit),
        1,
        type='dataset', path=subb.path)
    # make another one dirty
    create_tree(suba.path, {'added': 'test'})
    # now a single query path will result in the two modified subdatasets
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)], ds, orig_base_commit),
        2,
        type='dataset')
    # now save up top; this will save the new state of subb, but keep suba dirty
    ds.save(subb.path, recursive=True)
    # now if we ask for what was last saved, we only get the new state of subb
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)], ds, 'HEAD~1..HEAD'),
        1,
        type='dataset', path=subb.path)
    # comparing the working tree to HEAD will show the dirty suba instead
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)], ds, 'HEAD'),
        1,
        type='dataset', path=suba.path)
    # add/save everything, become clean
    ds.add('.', recursive=True)
    ok_clean_git(path)
    # nothing is reported as modified
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)], ds, 'HEAD'),
        0)
    # but looking all the way back, we find all changes
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)], ds, orig_base_commit),
        2,
        type='dataset')
    # now we ask specifically for the file we added to subsub above
    query = [dict(path=opj(subsub.path, 'added'))]
    res = list(get_modified_subpaths(query, ds, orig_base_commit))
    # we only get this one result back, and not all the submodule state changes
    # that were also saved in the superdatasets
    assert_result_count(res, 1)
    assert_result_count(
        res, 1, type='file', path=opj(subsub.path, 'added'), state='added')
    # but if we are only looking at the last saved change (suba), our query
    # will not return anything
    res = get_modified_subpaths(query, ds, 'HEAD^')
    assert_result_count(res, 0)
    # deal with removal (force insufficient copies error)
    ds.remove(suba.path, check=False)
    ok_clean_git(path)
    res = list(get_modified_subpaths([dict(path=ds.path)], ds, 'HEAD~1..HEAD'))
    # removed submodule + .gitmodules update
    assert_result_count(res, 2)
    assert_result_count(
        res, 1, type_src='dataset', path=suba.path)
def test_update_simple(origin, src_path, dst_path): # prepare src source = install(src_path, source=origin, recursive=True) # forget we cloned it (provide no 'origin' anymore), which should lead to # setting tracking branch to target: source.repo.remove_remote("origin") # get a clone to update later on: dest = install(dst_path, source=src_path, recursive=True) # test setup done; # assert all fine ok_clean_git(dst_path) ok_clean_git(src_path) # update yields nothing => up-to-date assert_status('ok', dest.update()) ok_clean_git(dst_path) # modify origin: with open(opj(src_path, "update.txt"), "w") as f: f.write("Additional content") source.add(path="update.txt") source.save("Added update.txt") ok_clean_git(src_path) # update without `merge` only fetches: assert_status('ok', dest.update()) # modification is not known to active branch: assert_not_in("update.txt", dest.repo.get_files(dest.repo.get_active_branch())) # modification is known to branch origin/master assert_in("update.txt", dest.repo.get_files("origin/master")) # merge: assert_status('ok', dest.update(merge=True)) # modification is now known to active branch: assert_in("update.txt", dest.repo.get_files(dest.repo.get_active_branch())) # it's known to annex, but has no content yet: dest.repo.get_file_key("update.txt") # raises if unknown eq_([False], dest.repo.file_has_content(["update.txt"])) # smoke-test if recursive update doesn't fail if submodule is removed # and that we can run it from within a dataset without providing it # explicitly assert_result_count(dest.remove('subm 1'), 1, status='ok', action='remove', path=opj(dest.path, 'subm 1')) with chpwd(dest.path): assert_result_count(update(recursive=True), 2, status='ok', type='dataset') assert_result_count(dest.update(merge=True, recursive=True), 2, status='ok', type='dataset') # and now test recursive update with merging in differences create_tree(opj(source.path, 'subm 2'), {'load.dat': 'heavy'}) source.add(opj('subm 2', 'load.dat'), message="saving changes within subm2", recursive=True) assert_result_count(dest.update(merge=True, recursive=True), 2, status='ok', type='dataset') # and now we can get new file dest.get('subm 2/load.dat') ok_file_has_content(opj(dest.path, 'subm 2', 'load.dat'), 'heavy')
def test_siblings(origin, repo_path, local_clone_path):
    sshurl = "ssh://push-remote.example.com"
    httpurl1 = "http://remote1.example.com/location"
    httpurl2 = "http://remote2.example.com/location"
    # insufficient arguments
    # we need a dataset to work at
    with chpwd(repo_path):
        # not yet there
        assert_raises(InsufficientArgumentsError, siblings, 'add', url=httpurl1)
    # prepare src
    source = install(repo_path, source=origin, recursive=True)
    # pollute config
    depvar = 'remote.test-remote.datalad-publish-depends'
    source.config.add(depvar, 'stupid', where='local')
    # cannot configure unknown remotes as dependencies
    res = siblings(
        'configure',
        dataset=source, name="test-remote",
        url=httpurl1,
        publish_depends=['r1', 'r2'],
        on_failure='ignore',
        result_renderer=None)
    assert_status('error', res)
    eq_(res[0]['message'],
        ('unknown sibling(s) specified as publication dependency: %s',
         set(('r1', 'r2'))))
    # prior config was not changed by failed call above
    eq_(source.config.get(depvar, None), 'stupid')
    res = siblings('configure',
                   dataset=source, name="test-remote",
                   url=httpurl1,
                   result_xfm='paths',
                   result_renderer=None)
    eq_(res, [source.path])
    assert_in("test-remote", source.repo.get_remotes())
    eq_(httpurl1, source.repo.get_remote_url("test-remote"))
    # reconfiguring doesn't change anything
    siblings('configure', dataset=source, name="test-remote",
             url=httpurl1,
             result_renderer=None)
    assert_in("test-remote", source.repo.get_remotes())
    eq_(httpurl1, source.repo.get_remote_url("test-remote"))
    # re-adding doesn't work
    res = siblings('add', dataset=source, name="test-remote", url=httpurl1,
                   on_failure='ignore', result_renderer=None)
    assert_status('error', res)
    # only after removal
    res = siblings('remove', dataset=source, name="test-remote",
                   result_renderer=None)
    assert_status('ok', res)
    assert_not_in("test-remote", source.repo.get_remotes())
    res = siblings('add', dataset=source, name="test-remote", url=httpurl1,
                   on_failure='ignore', result_renderer=None)
    assert_status('ok', res)
    # add to another remote automagically taking it from the url
    # and being in the dataset directory
    with chpwd(source.path):
        res = siblings('add', url=httpurl2, result_renderer=None)
    assert_result_count(
        res, 1,
        name="remote2.example.com", type='sibling')
    assert_in("remote2.example.com", source.repo.get_remotes())
    # don't fail with conflicting url, when using force:
    res = siblings('configure',
                   dataset=source, name="test-remote",
                   url=httpurl1 + "/elsewhere",
                   result_renderer=None)
    assert_status('ok', res)
    eq_(httpurl1 + "/elsewhere", source.repo.get_remote_url("test-remote"))
    # no longer a use case, I would need additional convincing that
    # this is anyhow useful other than triple-checking other people's
    # errors. For an actual check use 'query'
    # maybe it could be turned into a set of warnings when `configure`
    # alters an existing setting, but then why call configure, if you
    # want to keep the old values
    #with assert_raises(RuntimeError) as cm:
    #    add_sibling(dataset=source, name="test-remote",
    #                url=httpurl1 + "/elsewhere")
    #assert_in("""'test-remote' already exists with conflicting settings""",
    #          str(cm.exception))
    ## add a push url without force fails, since in a way the fetch url is the
    ## configured push url, too, in that case:
    #with assert_raises(RuntimeError) as cm:
    #    add_sibling(dataset=source, name="test-remote",
    #                url=httpurl1 + "/elsewhere",
    #                pushurl=sshurl, force=False)
    #assert_in("""'test-remote' already exists with conflicting settings""",
    #          str(cm.exception))
    # add push url (force):
    res = siblings('configure',
                   dataset=source, name="test-remote",
                   url=httpurl1 + "/elsewhere",
                   pushurl=sshurl,
                   result_renderer=None)
    assert_status('ok', res)
    eq_(httpurl1 + "/elsewhere", source.repo.get_remote_url("test-remote"))
    eq_(sshurl, source.repo.get_remote_url("test-remote", push=True))
    # recursively:
    for r in siblings(
            'configure',
            dataset=source, name="test-remote",
            url=httpurl1 + "/%NAME",
            pushurl=sshurl + "/%NAME",
            recursive=True,
            # we need to disable annex queries, as it will try to access
            # the fake URL configured above
            get_annex_info=False):
        repo = GitRepo(r['path'], create=False)
        assert_in("test-remote", repo.get_remotes())
        url = repo.get_remote_url("test-remote")
        pushurl = repo.get_remote_url("test-remote", push=True)
        ok_(url.startswith(httpurl1 + '/' + basename(source.path)))
        ok_(url.endswith(basename(repo.path)))
        ok_(pushurl.startswith(sshurl + '/' + basename(source.path)))
        ok_(pushurl.endswith(basename(repo.path)))
        eq_(url, r['url'])
        eq_(pushurl, r['pushurl'])
    # recursively without template:
    for r in siblings(
            'configure',
            dataset=source, name="test-remote-2",
            url=httpurl1,
            pushurl=sshurl,
            recursive=True,
            # we need to disable annex queries, as it will try to access
            # the fake URL configured above
            get_annex_info=False,
            result_renderer=None):
        repo = GitRepo(r['path'], create=False)
        assert_in("test-remote-2", repo.get_remotes())
        url = repo.get_remote_url("test-remote-2")
        pushurl = repo.get_remote_url("test-remote-2", push=True)
        ok_(url.startswith(httpurl1))
        ok_(pushurl.startswith(sshurl))
        # FIXME: next condition used to compare the *Repo objects instead of
        # their paths. Due to missing annex-init in
        # datalad/tests/utils.py:clone_url this might not be the same, since
        # `source` actually is an annex, but after flavor 'clone' in
        # `with_testrepos` and then `install` any trace of an annex might be
        # gone in v5 (branch 'master' only), while in direct mode it still is
        # considered an annex. `repo` is forced to be a `GitRepo`, so we might
        # compare two objects of different classes while they actually are
        # pointing to the same repository.
        # See github issue #1854
        if repo.path != source.repo.path:
            ok_(url.endswith('/' + basename(repo.path)))
            ok_(pushurl.endswith(basename(repo.path)))
        eq_(url, r['url'])
        eq_(pushurl, r['pushurl'])
    # recursively without template and pushurl but full "hierarchy"
    # to a local clone
    for r in siblings(
            'configure',
            dataset=source, name="test-remote-3",
            url=local_clone_path,
            recursive=True,
            # we need to disable annex queries, as it will try to access
            # the fake URL configured above
            get_annex_info=False,
            result_renderer=None):
        repo = GitRepo(r['path'], create=False)
        assert_in("test-remote-3", repo.get_remotes())
        url = repo.get_remote_url("test-remote-3")
        pushurl = repo.get_remote_url("test-remote-3", push=True)
        eq_(normpath(url),
            normpath(opj(local_clone_path,
                         relpath(str(r['path']), source.path))))
        # https://github.com/datalad/datalad/issues/3951
        ok_(not pushurl)  # no pushurl should be defined
def test_basics(path, nodspath): ds = Dataset(path).create() last_state = ds.repo.get_hexsha() # run inside the dataset with chpwd(path), \ swallow_outputs(): # provoke command failure with assert_raises(CommandError) as cme: ds.run('7i3amhmuch9invalid') # let's not speculate that the exit code is always 127 ok_(cme.code > 0) eq_(last_state, ds.repo.get_hexsha()) # now one that must work res = ds.run('touch empty', message='TEST') ok_clean_git(ds.path) assert_result_count(res, 2) # TODO 'state' is still untracked!!! assert_result_count(res, 1, action='add', path=opj(ds.path, 'empty'), type='file') assert_result_count(res, 1, action='save', path=ds.path) commit_msg = ds.repo.repo.head.commit.message ok_(commit_msg.startswith('[DATALAD RUNCMD] TEST')) # crude test that we have a record for the PWD assert_in('"pwd": "."', commit_msg) last_state = ds.repo.get_hexsha() # now run a command that will not alter the dataset res = ds.run('touch empty', message='NOOP_TEST') assert_status('notneeded', res) eq_(last_state, ds.repo.get_hexsha()) # We can also run the command via a single-item list because this is # what the CLI interface passes in for quoted commands. res = ds.run(['touch empty'], message='NOOP_TEST') assert_status('notneeded', res) # run outside the dataset, should still work but with limitations with chpwd(nodspath), \ swallow_outputs(): res = ds.run(['touch', 'empty2'], message='TEST') assert_status('ok', res) assert_result_count(res, 1, action='add', path=opj(ds.path, 'empty2'), type='file') # running without a command is a noop with chpwd(path): with swallow_logs(new_level=logging.WARN) as cml: ds.run() assert_in("No command given", cml.out) # Simple sidecar message checks. ds.run(["touch", "dummy0"], message="sidecar arg", sidecar=True) assert_not_in('"cmd":', ds.repo.repo.head.commit.message) real_get = ds.config.get def mocked_get(key, default=None): if key == "datalad.run.record-sidecar": return True return real_get(key, default) with patch.object(ds.config, "get", mocked_get): ds.run(["touch", "dummy1"], message="sidecar config") assert_not_in('"cmd":', ds.repo.repo.head.commit.message)
def test_aggregate_removal(path): base = Dataset(op.join(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(op.join(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n' ) sub = base.create('sub', force=True) subsub = sub.create(op.join('subsub'), force=True) base.save(recursive=True) base.meta_aggregate(recursive=True, into='all') assert_repo_status(base.path) res = base.meta_dump(reporton='aggregates', recursive=True) assert_result_count(res, 3) assert_result_count(res, 1, path=subsub.path) # check that we only have object files that are listed in agginfo eq_(_get_contained_objs(base), _get_referenced_objs(base)) # now delete the deepest subdataset to test cleanup of aggregated objects # in the top-level ds base.remove(op.join('sub', 'subsub'), check=False) # now aggregation has to detect that subsub is not simply missing, but gone # for good base.meta_aggregate(recursive=True, into='all') assert_repo_status(base.path) # internally consistent state eq_(_get_contained_objs(base), _get_referenced_objs(base)) # info on subsub was removed at all levels res = base.meta_dump(reporton='aggregates', recursive=True) assert_result_count(res, 0, path=subsub.path) assert_result_count(res, 2) res = sub.meta_dump(reporton='aggregates', recursive=True) assert_result_count(res, 0, path=subsub.path) assert_result_count(res, 1)
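# `test_aggregate_removal` above compares `_get_contained_objs()` with
# `_get_referenced_objs()`, neither of which appears in this section. A rough sketch of
# the intent follows, assuming the metadata object store lives under
# '.datalad/metadata/objects' and that each aggregate record points at its object files
# via 'content_info'/'dataset_info' keys; both names and layout are assumptions for
# illustration only.
def _get_contained_objs(ds):
    # every object file physically present in the dataset's metadata store
    objdir = ds.pathobj / '.datalad' / 'metadata' / 'objects'
    return set(str(p.relative_to(ds.pathobj))
               for p in objdir.glob('**/*') if p.is_file())


def _get_referenced_objs(ds):
    # every object file referenced by an aggregate record
    return set(
        str(ut.Path(rec[k]).relative_to(ds.pathobj))
        for rec in ds.meta_dump(reporton='aggregates', recursive=True)
        for k in ('content_info', 'dataset_info') if k in rec)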
def _test_bare_git_version_1(host, dspath, store):
    # This test should take a dataset and create a bare repository at the
    # remote end from it.
    # Given that it is placed correctly within a tree of datasets, that remote
    # should then be usable as an ora-remote as well as a git-type remote.
    # Note: Usability of git remote by annex depends on dataset layout version
    # (dirhashlower vs. -mixed).
    # For version 1 (lower) upload and consumption should be
    # interchangeable. It doesn't matter which remote is used for what
    # direction.
    ds_path = Path(dspath)
    store = Path(store)
    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()
    bare_repo_path, _, _ = get_layout_locations(1, store, ds.id)
    # Use git to make sure the remote end is what git thinks a bare clone of it
    # should look like
    subprocess.run(['git', 'clone', '--bare',
                    quote_cmdlinearg(str(dspath)),
                    quote_cmdlinearg(str(bare_repo_path))])
    if host:
        url = "ria+ssh://{host}{path}".format(host=host, path=store)
    else:
        url = "ria+{}".format(store.as_uri())
    init_opts = common_init_opts + ['url={}'.format(url)]
    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    create_store(io, store, '1')
    # set up the dataset location, too.
    # Note: Dataset layout version 1 (dirhash lower):
    create_ds_in_store(io, store, ds.id, '1', '1')
    # Now, let's have the bare repo as a git remote and use it with annex
    git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \
        if host else bare_repo_path.as_uri()
    ds.repo.add_remote('bare-git', git_url)
    ds.repo.enable_remote('bare-git')
    # copy files to the remote
    ds.repo.copy_to('.', 'bare-git')
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # now we can drop all content locally, reobtain it, and survive an
    # fsck
    ds.drop('.')
    ds.get('.')
    assert_status('ok', [annexjson2result(r, ds) for r in ds.repo.fsck()])
    # Now, add the ora remote:
    ds.repo.init_remote('ora-remote', options=init_opts)
    # fsck to make availability known
    assert_status(
        'ok',
        [annexjson2result(r, ds)
         for r in ds.repo.fsck(remote='ora-remote', fast=True)])
    eq_(len(ds.repo.whereis('one.txt')), 3)
    # Now move content from git-remote to local and see it not being available
    # via bare-git anymore.
    ds.repo.call_git(['annex', 'move', '--all', '--from=bare-git'])
    # ora-remote doesn't know yet:
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # But after fsck it does:
    fsck_res = [annexjson2result(r, ds)
                for r in ds.repo.fsck(remote='ora-remote', fast=True)]
    assert_result_count(fsck_res, 1,
                        status='error',
                        message='** Based on the location log, one.txt\n'
                                '** was expected to be present, '
                                'but its content is missing.')
    assert_result_count(fsck_res, 1,
                        status='error',
                        message='** Based on the location log, subdir/two\n'
                                '** was expected to be present, '
                                'but its content is missing.')
    eq_(len(ds.repo.whereis('one.txt')), 1)
    # and the other way around: upload via ora-remote and have it available via
    # git-remote:
    ds.repo.copy_to('.', 'ora-remote')
    # fsck to make availability known
    assert_status(
        'ok',
        [annexjson2result(r, ds)
         for r in ds.repo.fsck(remote='bare-git', fast=True)])
    eq_(len(ds.repo.whereis('one.txt')), 3)
def _test_bare_git_version_2(host, dspath, store): # Similarly to test_bare_git_version_1, this should ensure a bare git repo # at the store location for a dataset doesn't conflict with the ORA remote. # Note: Usability of git remote by annex depends on dataset layout version # (dirhashlower vs. -mixed). # For version 2 (mixed) upload via ORA and consumption via git should # work. But not the other way around, since git-annex uses # dirhashlower with bare repos. ds_path = Path(dspath) store = Path(store) ds = Dataset(ds_path).create() populate_dataset(ds) ds.save() bare_repo_path, _, _ = get_layout_locations(1, store, ds.id) # Use git to make sure the remote end is what git thinks a bare clone of it # should look like subprocess.run(['git', 'clone', '--bare', quote_cmdlinearg(str(dspath)), quote_cmdlinearg(str(bare_repo_path)) ]) if host: url = "ria+ssh://{host}{path}".format(host=host, path=store) else: url = "ria+{}".format(store.as_uri()) init_opts = common_init_opts + ['url={}'.format(url)] # set up store: io = SSHRemoteIO(host) if host else LocalIO() create_store(io, store, '1') # set up the dataset location, too. # Note: Dataset layout version 2 (dirhash mixed): create_ds_in_store(io, store, ds.id, '2', '1') # Now, let's have the bare repo as a git remote git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \ if host else bare_repo_path.as_uri() ds.repo.add_remote('bare-git', git_url) ds.repo.enable_remote('bare-git') # and the ORA remote in addition: ds.repo.init_remote('ora-remote', options=init_opts) # upload keys via ORA: ds.repo.copy_to('.', 'ora-remote') # bare-git doesn't know yet: eq_(len(ds.repo.whereis('one.txt')), 2) # fsck to make availability known assert_status( 'ok', [annexjson2result(r, ds) for r in ds.repo.fsck(remote='bare-git', fast=True)]) eq_(len(ds.repo.whereis('one.txt')), 3) ds.drop('.') eq_(len(ds.repo.whereis('one.txt')), 2) # actually consumable via git remote: ds.repo.call_git(['annex', 'move', 'one.txt', '--from', 'bare-git']) eq_(len(ds.repo.whereis('one.txt')), 2) # now, move back via git - shouldn't be consumable via ORA ds.repo.call_git(['annex', 'move', 'one.txt', '--to', 'bare-git']) # fsck to make availability known, but there's nothing from POV of ORA: fsck_res = [annexjson2result(r, ds) for r in ds.repo.fsck(remote='ora-remote', fast=True)] assert_result_count(fsck_res, 1, status='error', message='** Based on the location log, one.txt\n' '** was expected to be present, ' 'but its content is missing.') assert_result_count(fsck_res, 1, status='ok') eq_(len(fsck_res), 2) eq_(len(ds.repo.whereis('one.txt')), 1)
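# Both bare-git/ORA tests above call `populate_dataset(ds)` before saving; its
# definition is not part of this section. The sketch below merely creates the payload
# files that the later assertions refer to ('one.txt' and 'subdir/two') and is an
# assumption about the helper's behaviour rather than its actual code.
def populate_dataset(ds):
    # one annexed file at the top level and one in a subdirectory
    (ds.pathobj / 'one.txt').write_text(u'content1')
    (ds.pathobj / 'subdir').mkdir(exist_ok=True)
    (ds.pathobj / 'subdir' / 'two').write_text(u'content2')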
def test_annotate_paths(dspath, nodspath): # this test doesn't use API`remove` to avoid circularities ds = make_demo_hierarchy_datasets(dspath, demo_hierarchy) ds.add('.', recursive=True) ok_clean_git(ds.path) with chpwd(dspath): # with and without an explicitly given path the result is almost the # same inside a dataset without_path = annotate_paths(on_failure='ignore') pwd_res = annotate_paths(path='.', on_failure='ignore') assert_result_count( without_path, 1, type='dataset', path=dspath) assert_result_count( pwd_res, 1, type='dataset', path=dspath, orig_request='.', raw_input=True) # make sure going into a subdataset vs giving it as a path has no # structural impact eq_( [{k: v for k, v in ap.items() if k not in ('registered_subds', 'raw_input', 'orig_request', 'refds')} for ap in annotate_paths(path='b', recursive=True)], [{k: v for k, v in ap.items() if k not in ('registered_subds', 'raw_input', 'orig_request', 'refds')} for ap in annotate_paths(dataset='b', recursive=True)]) # now do it again, pointing to the ds directly res = ds.annotate_paths(on_failure='ignore') # no request, no refds, but otherwise the same eq_(len(res), len(pwd_res)) eq_({k: pwd_res[0][k] for k in pwd_res[0] if k in ('path', 'type', 'action', 'status')}, {k: res[0][k] for k in res[0] if k not in ('refds',)}) # will refuse a path that is not a dataset as refds res = annotate_paths(dataset=nodspath, on_failure='ignore') assert_result_count( res, 1, status='error', path=nodspath, message='given reference dataset is not a dataset') # recursion with proper base dataset parentds = Dataset(opj(dspath, 'a')) base_res = parentds.annotate_paths(recursive=True) # needs to find 'aa' and the base assert_result_count(base_res, 2) assert_result_count(base_res, 2, type='dataset') assert_result_count( base_res, 1, type='dataset', parentds=parentds.path, path=opj(parentds.path, 'aa'), status='') # same recursion but without a base dataset res = annotate_paths(path=opj(dspath, 'a'), recursive=True) # needs to find 'aa' and 'a' again assert_result_count(res, 2) eq_(res[-1], {k: base_res[-1][k] for k in base_res[-1] if k not in ('refds',)}) assert_result_count( res, 1, type='dataset', status='', # it does not auto-discover parent datasets without force or a refds #parentds=parentds.path, path=parentds.path) # but we can force parent discovery res = parentds.annotate_paths( path=opj(dspath, 'a'), recursive=True, force_parentds_discovery=True) assert_result_count(res, 2) assert_result_count( res, 1, type='dataset', status='', parentds=dspath, path=parentds.path) # recursion with multiple disjoint seeds, no common base eq_([basename(p) for p in annotate_paths( path=[opj(dspath, 'a'), opj(dspath, 'b', 'bb', 'bba')], recursive=True, result_xfm='paths')], ['a', 'aa', 'bba', 'bbaa']) # recursion with partially overlapping seeds, no duplicate results eq_([basename(p) for p in annotate_paths( path=[opj(dspath, 'b'), opj(dspath, 'b', 'bb', 'bba')], recursive=True, result_xfm='paths')], ['b', 'ba', 'bb', 'bba', 'bbaa']) # get straight from a file fpath = opj('a', 'aa', 'file_aa') res = ds.annotate_paths(fpath) assert_result_count(res, 1) assert_result_count( res, 1, orig_request=fpath, raw_input=True, type='file', path=opj(ds.path, fpath), parentds=opj(ds.path, 'a', 'aa'), status='') # now drop it dropres = ds.drop(fpath, check=False) assert_result_count(dropres, 1, path=res[0]['path'], status='ok') # ask for same file again, use 'notneeded' for unavailable to try trigger # any difference droppedres = ds.annotate_paths(fpath, 
unavailable_path_status='notneeded') # but we get the same result eq_(res, droppedres) # now try the same on an uninstalled dataset subdspath = opj('b', 'bb') # before before_res = ds.annotate_paths(subdspath, recursive=True, unavailable_path_status='error') assert_result_count(before_res, 3, status='', type='dataset') uninstall_res = ds.uninstall(subdspath, recursive=True, check=False) assert_result_count(uninstall_res, 3, status='ok', type='dataset') # after after_res = ds.annotate_paths(subdspath, unavailable_path_status='error', on_failure='ignore') # uninstall hides all low-level datasets assert_result_count(after_res, 1) # but for the top-most uninstalled one it merely reports absent state now assert_result_count( after_res, 1, state='absent', **{k: before_res[0][k] for k in before_res[0] if k not in ('state', 'status')}) # however, this beauty doesn't come for free, so it can be disabled # which will make the uninstalled subdataset like a directory in the # parent (or even just a non-existing path, if the mountpoint dir isn't # present) after_res = ds.annotate_paths(subdspath, force_subds_discovery=False) assert_result_count( after_res, 1, type='directory', path=before_res[0]['path'], parentds=before_res[0]['parentds']) # feed annotated paths into annotate_paths, it shouldn't change things # upon second run # datasets and file res = ds.annotate_paths(['.', fpath], recursive=True) # make a copy, just to be sure orig_res = deepcopy(res) assert_result_count(res, 7) # and in again, no recursion this time res_again = ds.annotate_paths(res) # doesn't change a thing eq_(orig_res, res_again) # and in again, with recursion this time res_recursion_again = ds.annotate_paths(res, recursive=True) assert_result_count(res_recursion_again, 7) # doesn't change a thing eq_(orig_res, res_recursion_again)
def test_status(_path, linkpath): # do the setup on the real path, not the symlink, to have its # bugs not affect this test of status() ds = get_deeply_nested_structure(str(_path)) if has_symlink_capability(): # make it more complicated by default ut.Path(linkpath).symlink_to(_path, target_is_directory=True) path = linkpath else: path = _path ds = Dataset(path) if has_symlink_capability(): assert ds.pathobj != ds.repo.pathobj # spotcheck that annex status reporting and availability evaluation # works assert_result_count( ds.status(annex='all', result_renderer=None), 1, path=str(ds.pathobj / 'subdir' / 'annexed_file.txt'), key='MD5E-s5--275876e34cf609db118f3d84b799a790.txt', has_content=True, objloc=str(ds.repo.pathobj / '.git' / 'annex' / 'objects' / # hashdir is different on windows ('f33' if ds.repo.is_managed_branch() else '7p') / ('94b' if ds.repo.is_managed_branch() else 'gp') / 'MD5E-s5--275876e34cf609db118f3d84b799a790.txt' / 'MD5E-s5--275876e34cf609db118f3d84b799a790.txt')) plain_recursive = ds.status(recursive=True, result_renderer=None) # check integrity of individual reports with a focus on how symlinks # are reported for res in plain_recursive: # anything that is an "intended" symlink should be reported # as such. In contrast, anything that is a symlink for mere # technical reasons (annex using it for something in some mode) # should be reported as the thing it is representing (i.e. # a file) if 'link2' in str(res['path']): assert res['type'] == 'symlink', res else: assert res['type'] != 'symlink', res # every item must report its parent dataset assert_in('parentds', res) # bunch of smoke tests # query of '.' is same as no path eq_(plain_recursive, ds.status(path='.', recursive=True, result_renderer=None)) # duplicate paths do not change things eq_(plain_recursive, ds.status(path=['.', '.'], recursive=True, result_renderer=None)) # neither do nested paths eq_( plain_recursive, ds.status(path=['.', 'subds_modified'], recursive=True, result_renderer=None)) # when invoked in a subdir of a dataset it still reports on the full thing # just like `git status`, as long as there are no paths specified with chpwd(op.join(path, 'directory_untracked')): plain_recursive = status(recursive=True, result_renderer=None) # should be able to take absolute paths and yield the same # output eq_(plain_recursive, ds.status(path=ds.path, recursive=True, result_renderer=None)) # query for a deeply nested path from the top, should just work with a # variety of approaches rpath = op.join('subds_modified', 'subds_lvl1_modified', OBSCURE_FILENAME + u'_directory_untracked') apathobj = ds.pathobj / rpath apath = str(apathobj) # ds.repo.pathobj will have the symlink resolved arealpath = ds.repo.pathobj / rpath # TODO include explicit relative path in test for p in (rpath, apath, arealpath, None): if p is None: # change into the realpath of the dataset and # query with an explicit path with chpwd(ds.repo.path): res = ds.status(path=op.join('.', rpath), result_renderer=None) else: res = ds.status(path=p, result_renderer=None) assert_result_count( res, 1, state='untracked', type='directory', refds=ds.path, # path always comes out a full path inside the queried dataset path=apath, ) assert_result_count(ds.status(recursive=True, result_renderer=None), 1, path=apath) # limiting recursion will exclude this particular path assert_result_count(ds.status(recursive=True, recursion_limit=1, result_renderer=None), 0, path=apath) # negative limit is unlimited limit eq_(ds.status(recursive=True, recursion_limit=-1, 
result_renderer=None), ds.status(recursive=True, result_renderer=None))
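# `test_status` guards its symlink setup with `has_symlink_capability()`, which comes
# from the shared test utilities and is not defined here. A minimal sketch of such a
# probe, assuming it simply attempts to create a symlink in a scratch directory:
def has_symlink_capability():
    import tempfile
    with tempfile.TemporaryDirectory() as tmp:
        target = ut.Path(tmp) / 'target'
        link = ut.Path(tmp) / 'link'
        target.write_text(u'probe')
        try:
            # if this raises, the platform/filesystem cannot do symlinks
            link.symlink_to(target)
            return True
        except (OSError, NotImplementedError):
            return False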
def test_publish_recursive(pristine_origin, origin_path, src_path, dst_path, sub1_pub, sub2_pub): # we will be publishing back to origin, so to not alter testrepo # we will first clone it origin = install(origin_path, source=pristine_origin, recursive=True) # prepare src source = install(src_path, source=origin.path, recursive=True) # we will be trying to push into this later on, need to give permissions... origin_sub2 = Dataset(opj(origin_path, '2')) origin_sub2.config.set('receive.denyCurrentBranch', 'updateInstead', where='local') ## TODO this manual fixup is needed due to gh-1548 -- needs proper solution #os.remove(opj(origin_sub2.path, '.git')) #os.rename(opj(origin_path, '.git', 'modules', '2'), opj(origin_sub2.path, '.git')) # create plain git at target: target = GitRepo(dst_path, create=True) target.checkout("TMP", ["-b"]) source.repo.add_remote("target", dst_path) # subdatasets have no remote yet, so recursive publishing should fail: res = publish(dataset=source, to="target", recursive=True, on_failure='ignore') assert_result_count(res, 3) assert_result_count(res, 1, status='ok', type='dataset', path=source.path) assert_result_count(res, 2, status='error', message=("Unknown target sibling '%s' for publication", 'target')) # now, set up targets for the submodules: sub1_target = GitRepo(sub1_pub, create=True) sub1_target.checkout("TMP", ["-b"]) sub2_target = AnnexRepo(sub2_pub, create=True) # we will be testing presence of the file content, so let's make it progress sub2_target.config.set('receive.denyCurrentBranch', 'updateInstead', where='local') sub1 = GitRepo(opj(src_path, 'subm 1'), create=False) sub2 = GitRepo(opj(src_path, '2'), create=False) sub1.add_remote("target", sub1_pub) sub2.add_remote("target", sub2_pub) # publish recursively with swallow_logs(new_level=logging.DEBUG) as cml: res = publish(dataset=source, to="target", recursive=True) assert_not_in('forced update', cml.out, "we probably haven't merged git-annex before pushing") # testing result list # base dataset was already published above, notneeded again assert_status(('ok', 'notneeded'), res) # nothing failed assert_result_count(res, 3, type='dataset') eq_({r['path'] for r in res}, {src_path, sub1.path, sub2.path}) eq_(list(target.get_branch_commits_("master")), list(source.repo.get_branch_commits_("master"))) assert_git_annex_branch_published(source.repo, target) eq_(list(sub1_target.get_branch_commits_("master")), list(sub1.get_branch_commits_("master"))) assert_git_annex_branch_published(sub1, sub1_target) eq_(list(sub2_target.get_branch_commits_("master")), list(sub2.get_branch_commits_("master"))) assert_git_annex_branch_published(sub2, sub2_target) # we are tracking origin but origin has different git-annex, since we # cloned from it, so it is not aware of our git-annex neq_(list(origin.repo.get_branch_commits_("git-annex")), list(source.repo.get_branch_commits_("git-annex"))) # So if we first publish to it recursively, we would update # all sub-datasets since git-annex branch would need to be pushed res_ = publish(dataset=source, recursive=True) assert_result_count(res_, 1, status='ok', path=source.path) assert_result_count(res_, 1, status='ok', path=sub1.path) assert_result_count(res_, 1, status='ok', path=sub2.path) # and now should carry the same state for git-annex assert_git_annex_branch_published(source.repo, origin.repo) # test for publishing with --since. 
By default, since there are no changes, nothing is pushed res_ = publish(dataset=source, recursive=True) assert_result_count(res_, 3, status='notneeded', type='dataset') # still nothing gets pushed, because origin is up to date res_ = publish(dataset=source, recursive=True, since='HEAD^') assert_result_count(res_, 3, status='notneeded', type='dataset') # and we should not fail if we run it from within the dataset with chpwd(source.path): res_ = publish(recursive=True, since='HEAD^') assert_result_count(res_, 3, status='notneeded', type='dataset') # Let's now update one subm with open(opj(sub2.path, "file.txt"), 'w') as f: f.write('') # add to subdataset, does not alter super dataset! # MIH: use `to_git` because original test author used # an explicit `GitRepo.add` -- keeping this for now Dataset(sub2.path).save('file.txt', to_git=True) # Let's now update one subm create_tree(sub2.path, {'file.dat': 'content'}) # add to subdataset, without reflecting the change in its super(s) Dataset(sub2.path).save('file.dat') # note: will publish to origin here since that is what it tracks res_ = publish(dataset=source, recursive=True, on_failure='ignore') ## only updates published, i.e. just the subdataset, super wasn't altered ## nothing copied! assert_status(('ok', 'notneeded'), res_) assert_result_count(res_, 1, status='ok', path=sub2.path, type='dataset') assert_result_count(res_, 0, path=opj(sub2.path, 'file.dat'), type='file') # since published to origin -- destination should not get that file nok_(lexists(opj(sub2_target.path, 'file.dat'))) res_ = publish(dataset=source, to='target', recursive=True) assert_status(('ok', 'notneeded'), res_) assert_result_count(res_, 1, status='ok', path=sub2.path, type='dataset') assert_result_count(res_, 0, path=opj(sub2.path, 'file.dat'), type='file') # Note: with updateInstead only in target2 and not saving change in # super-dataset we would have made the remote dataset, if we had the entire # hierarchy, somewhat inconsistent. # But here, since target datasets are independent -- it is ok # and the file itself was transferred ok_(lexists(opj(sub2_target.path, 'file.dat'))) nok_(sub2_target.file_has_content('file.dat')) ## but now we can redo publish recursively, with explicitly requested data transfer res_ = publish(dataset=source, to='target', recursive=True, transfer_data='all') ok_(sub2_target.file_has_content('file.dat')) assert_result_count(res_, 1, status='ok', path=opj(sub2.path, 'file.dat')) # Let's save those present changes and publish while implying "since last # merge point" source.save(message="Changes in subm2") # and test if it could deduce the remote/branch to push to source.config.set('branch.master.remote', 'target', where='local') with chpwd(source.path): res_ = publish(since='', recursive=True) # TODO: somehow test that there was not even an attempt to diff within "subm 1" # since if `--since=''` worked correctly, nothing has changed there and it # should have not been even touched assert_status(('ok', 'notneeded'), res_) assert_result_count(res_, 1, status='ok', path=source.path, type='dataset') # Don't fail when a string is passed as `dataset` and since="". assert_status("notneeded", publish(since='', dataset=source.path))
def test_save_amend(dspath): dspath = Path(dspath) file_in_super = dspath / 'somefile' file_in_sub = dspath / 'subds' / 'file_in_sub' # test on a hierarchy including a plain git repo: ds = Dataset(dspath).create(force=True, no_annex=True) subds = ds.create('subds', force=True) ds.save(recursive=True) assert_repo_status(ds.repo) # recursive and amend are mutually exclusive: for d in (ds, subds): assert_raises(ValueError, d.save, recursive=True, amend=True) # in an annex repo the branch we are interested in might not be the active # branch (adjusted): sub_branch = subds.repo.get_corresponding_branch() # amend in subdataset w/ new message; otherwise empty amendment: last_sha = subds.repo.get_hexsha(sub_branch) subds.save(message="new message in sub", amend=True) # we did in fact commit something: neq_(last_sha, subds.repo.get_hexsha(sub_branch)) # repo is clean: assert_repo_status(subds.repo) # message is correct: eq_(subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # actually replaced the previous commit: assert_not_in(last_sha, subds.repo.get_branch_commits_(sub_branch)) # amend modifications in subdataset w/o new message if not subds.repo.is_managed_branch(): subds.unlock('file_in_sub') file_in_sub.write_text("modified again") last_sha = subds.repo.get_hexsha(sub_branch) subds.save(amend=True) neq_(last_sha, subds.repo.get_hexsha(sub_branch)) assert_repo_status(subds.repo) # message unchanged: eq_(subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # actually replaced the previous commit: assert_not_in(last_sha, subds.repo.get_branch_commits_(sub_branch)) # save --amend with nothing to amend with: res = subds.save(amend=True) assert_result_count(res, 1) assert_result_count(res, 1, status='notneeded', action='save') # amend in superdataset w/ new message; otherwise empty amendment: last_sha = ds.repo.get_hexsha() ds.save(message="new message in super", amend=True) neq_(last_sha, ds.repo.get_hexsha()) assert_repo_status(subds.repo) eq_(ds.repo.format_commit("%B").strip(), "new message in super") assert_not_in(last_sha, ds.repo.get_branch_commits_()) # amend modifications in superdataset w/o new message file_in_super.write_text("changed content") if not subds.repo.is_managed_branch(): subds.unlock('file_in_sub') file_in_sub.write_text("modified once again") last_sha = ds.repo.get_hexsha() last_sha_sub = subds.repo.get_hexsha(sub_branch) ds.save(amend=True) neq_(last_sha, ds.repo.get_hexsha()) eq_(ds.repo.format_commit("%B").strip(), "new message in super") assert_not_in(last_sha, ds.repo.get_branch_commits_()) # we didn't mess with the subds: assert_repo_status(ds.repo, modified=["subds"]) eq_(last_sha_sub, subds.repo.get_hexsha(sub_branch)) eq_(subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # save --amend with nothing to amend with: last_sha = ds.repo.get_hexsha() res = ds.save(amend=True) assert_result_count(res, 1) assert_result_count(res, 1, status='notneeded', action='save') eq_(last_sha, ds.repo.get_hexsha()) # we didn't mess with the subds: assert_repo_status(ds.repo, modified=["subds"]) eq_(last_sha_sub, subds.repo.get_hexsha(sub_branch)) eq_(subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # amend with different identity: orig_author = ds.repo.format_commit("%an") orig_email = ds.repo.format_commit("%ae") orig_date = ds.repo.format_commit("%ad") orig_committer = ds.repo.format_commit("%cn") orig_committer_mail = ds.repo.format_commit("%ce") eq_(orig_author, orig_committer) 
eq_(orig_email, orig_committer_mail) with patch.dict('os.environ', {'GIT_COMMITTER_NAME': 'Hopefully Different', 'GIT_COMMITTER_EMAIL': '*****@*****.**'}): ds.config.reload(force=True) ds.save(amend=True, message="amend with hope") # author was kept: eq_(orig_author, ds.repo.format_commit("%an")) eq_(orig_email, ds.repo.format_commit("%ae")) eq_(orig_date, ds.repo.format_commit("%ad")) # committer changed: eq_(ds.repo.format_commit("%cn"), "Hopefully Different") eq_(ds.repo.format_commit("%ce"), "*****@*****.**") # corner case: amend empty commit with no parent: rmtree(str(dspath)) # When adjusted branch is enforced by git-annex detecting a crippled FS, # git-annex produces an empty commit before switching to adjusted branch: # "commit before entering adjusted branch" # The commit by `create` would be the second one already. # Therefore go with plain annex repo and create an (empty) commit only when # not on adjusted branch: repo = AnnexRepo(dspath, create=True) if not repo.is_managed_branch(): repo.commit(msg="initial", options=['--allow-empty']) ds = Dataset(dspath) branch = ds.repo.get_corresponding_branch() or ds.repo.get_active_branch() # test pointless if we start with more than one commit eq_(len(list(ds.repo.get_branch_commits_(branch))), 1, msg="More than one commit '{}': {}".format( branch, ds.repo.call_git(['log', branch])) ) last_sha = ds.repo.get_hexsha(branch) ds.save(message="new initial commit", amend=True) assert_repo_status(ds.repo) eq_(len(list(ds.repo.get_branch_commits_(branch))), 1, msg="More than one commit '{}': {}".format( branch, ds.repo.call_git(['log', branch])) ) assert_not_in(last_sha, ds.repo.get_branch_commits_(branch)) eq_(ds.repo.format_commit("%B", branch).strip(), "new initial commit")
def test_basics(path, nodspath): ds = Dataset(path).create() last_state = ds.repo.get_hexsha() # run inside the dataset with chpwd(path), \ swallow_outputs(): # provoke command failure with assert_raises(CommandError) as cme: ds.run('7i3amhmuch9invalid') # let's not speculate that the exit code is always 127 ok_(cme.code > 0) eq_(last_state, ds.repo.get_hexsha()) # now one that must work res = ds.run('cd .> empty', message='TEST') assert_repo_status(ds.path) assert_result_count(res, 2) # TODO 'state' is still untracked!!! assert_result_count(res, 1, action='add', path=op.join(ds.path, 'empty'), type='file') assert_result_count(res, 1, action='save', path=ds.path) # ATTN: Use master explicitly so that this check works when we're on an # adjusted branch too (e.g., when this test is executed under Windows). commit_msg = ds.repo.format_commit("%B", "master") ok_(commit_msg.startswith('[DATALAD RUNCMD] TEST')) # crude test that we have a record for the PWD assert_in('"pwd": "."', commit_msg) last_state = ds.repo.get_hexsha() # now run a command that will not alter the dataset noop_cmd = ':' res = ds.run(noop_cmd, message='NOOP_TEST') assert_result_count(res, 1, action='save', status='notneeded') eq_(last_state, ds.repo.get_hexsha()) # We can also run the command via a single-item list because this is # what the CLI interface passes in for quoted commands. res = ds.run([noop_cmd], message='NOOP_TEST') assert_result_count(res, 1, action='save', status='notneeded') # run outside the dataset, should still work but with limitations with chpwd(nodspath), \ swallow_outputs(): res = ds.run('cd . > empty2', message='TEST') assert_result_count(res, 1, action='add', path=op.join(ds.path, 'empty2'), type='file', status='ok') assert_result_count(res, 1, action='save', status='ok') # running without a command is a noop with chpwd(path): with swallow_logs(new_level=logging.WARN) as cml: ds.run() assert_in("No command given", cml.out)
def test_publish_depends(origin, src_path, target1_path, target2_path, target3_path):
    # prepare src
    source = install(src_path, source=origin, recursive=True)
    source.repo.get('test-annex.dat')
    # pollute config
    depvar = 'remote.target2.datalad-publish-depends'
    source.config.add(depvar, 'stupid', where='local')
    eq_(source.config.get(depvar, None), 'stupid')
    # two remote siblings on two "different" hosts
    source.create_sibling(
        'ssh://localhost' + target1_path,
        annex_wanted='standard',
        annex_group='backup',
        name='target1')
    # fails with unknown remote
    res = source.create_sibling(
        'ssh://datalad-test' + target2_path,
        name='target2',
        existing='reconfigure',  # because 'target2' is known in polluted cfg
        publish_depends='bogus',
        on_failure='ignore')
    assert_result_count(
        res, 1,
        status='error',
        message=('unknown sibling(s) specified as publication dependency: %s',
                 set(['bogus'])))
    # for real
    source.create_sibling(
        'ssh://datalad-test' + target2_path,
        name='target2',
        existing='reconfigure',  # because 'target2' is known in polluted cfg
        annex_wanted='standard',
        annex_group='backup',
        publish_depends='target1')
    # wiped out previous dependencies
    eq_(source.config.get(depvar, None), 'target1')
    # and one more remote, on the same host but associated with a dependency
    source.create_sibling('ssh://datalad-test' + target3_path, name='target3')
    assert_repo_status(src_path)
    # introduce change in source
    create_tree(src_path, {'probe1': 'probe1'})
    source.save('probe1')
    assert_repo_status(src_path)
    # only the source has the probe
    ok_file_has_content(opj(src_path, 'probe1'), 'probe1')
    for p in (target1_path, target2_path, target3_path):
        assert_false(lexists(opj(p, 'probe1')))
    # publish to a standalone remote
    source.publish(to='target3')
    ok_(lexists(opj(target3_path, 'probe1')))
    # but it has no data copied
    target3 = Dataset(target3_path)
    nok_(target3.repo.file_has_content('probe1'))
    # but if we publish specifying its path, it gets copied
    source.publish('probe1', to='target3')
    ok_file_has_content(opj(target3_path, 'probe1'), 'probe1')
    # no others are affected in either case
    for p in (target1_path, target2_path):
        assert_false(lexists(opj(p, 'probe1')))
    # publish to all remaining, but via a dependency
    source.publish(to='target2')
    for p in (target1_path, target2_path, target3_path):
        ok_file_has_content(opj(p, 'probe1'), 'probe1')
def test_recurseinto(dspath, dest): # make fresh dataset hierarchy ds = make_demo_hierarchy_datasets(dspath, demo_hierarchy) ds.add('.', recursive=True) # label intermediate dataset as 'norecurseinto' res = Dataset(opj(ds.path, 'b')).subdatasets( contains='bb', set_property=[('datalad-recursiveinstall', 'skip')]) assert_result_count(res, 1, path=opj(ds.path, 'b', 'bb')) ds.add('b/', recursive=True) ok_clean_git(ds.path) # recursive install, should skip the entire bb branch res = install(source=ds.path, path=dest, recursive=True, result_xfm=None, result_filter=None) assert_result_count(res, 5) assert_result_count(res, 5, type='dataset') # we got the neighbor subdataset assert_result_count(res, 1, type='dataset', path=opj(dest, 'b', 'ba')) # we did not get the one we wanted to skip assert_result_count(res, 0, type='dataset', path=opj(dest, 'b', 'bb')) assert_not_in( opj(dest, 'b', 'bb'), Dataset(dest).subdatasets(fulfilled=True, result_xfm='paths')) assert(not Dataset(opj(dest, 'b', 'bb')).is_installed()) # cleanup Dataset(dest).remove(recursive=True) assert(not lexists(dest)) # again but just clone the base, and then get content and grab 'bb' # explicitly -- must get it installed dest = install(source=ds.path, path=dest) res = dest.get(['.', opj('b', 'bb')], get_data=False, recursive=True) assert_result_count(res, 8) assert_result_count(res, 8, type='dataset') assert_result_count(res, 1, type='dataset', path=opj(dest.path, 'b', 'bb')) assert(Dataset(opj(dest.path, 'b', 'bb')).is_installed())
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub, dst_clone_path):
    # prepare src
    source = install(src_path, source=origin, recursive=True)
    source.repo.get('test-annex.dat')
    # create plain git at target:
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)
    # now, set up targets for the submodules:
    # they need to be annexes, because we want to be able to copy data to them
    # further down
    sub1_target = AnnexRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, '2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)
    res = publish(dataset=source, to="target", path=['test-annex.dat'],
                  result_xfm='paths')
    # first it would publish data and then push
    # TODO order is not fixed (yet)
    #eq_(res, [opj(source.path, 'test-annex.dat'), source.path])
    eq_(set(res), set([opj(source.path, 'test-annex.dat'), source.path]))
    # XXX master was not checked out in dst!
    eq_(list(target.get_branch_commits_("master")),
        list(source.repo.get_branch_commits_("master")))
    assert_git_annex_branch_published(source.repo, target)
    # we need to compare target/master:
    target.checkout("master")
    ok_(target.file_has_content('test-annex.dat'))
    # make sure that whatever we published is actually consumable
    dst_clone = install(
        dst_clone_path, source=dst_path,
        result_xfm='datasets', return_type='item-or-list')
    nok_(dst_clone.repo.file_has_content('test-annex.dat'))
    res = dst_clone.get('test-annex.dat')
    ok_(dst_clone.repo.file_has_content('test-annex.dat'))
    res = publish(dataset=source, to="target", path=['.'])
    # there is nothing to publish on 2nd attempt
    #eq_(res, ([source, 'test-annex.dat'], []))
    assert_result_count(res, 1, status='notneeded')
    import glob
    res = publish(dataset=source, to="target",
                  path=glob.glob1(source.path, '*'))
    # Note: This leads to recursive publishing, since expansion of '*'
    # contains the submodules themselves in this setup
    # only the subdatasets, targets are plain git repos, hence
    # no file content is pushed, all content in super was pushed
    # before
    assert_result_count(res, 3)
    assert_result_count(res, 1, status='ok', path=sub1.path)
    assert_result_count(res, 1, status='ok', path=sub2.path)
    assert_result_count(res, 1, status='notneeded', path=source.path)
    # if we publish again -- nothing to be published
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)
    # if we drop a file and publish again -- dataset should be published
    # since git-annex branch was updated
    source.drop('test-annex.dat')
    res = source.publish(to="target")
    assert_result_count(res, 1, status='ok', path=source.path)
    # and empty again if we try again
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)
    # data integrity check looks identical from all perspectives
    # minus "note" statements from git-annex
    eq_(filter_fsck_error_msg(source.repo.fsck()),
        filter_fsck_error_msg(source.repo.fsck(remote='target')))
    eq_(filter_fsck_error_msg(target.fsck()),
        filter_fsck_error_msg(source.repo.fsck(remote='target')))
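# `test_publish_with_data` ends by comparing fsck reports via `filter_fsck_error_msg()`,
# which is not shown in this section. Going by the comment above ("minus 'note'
# statements from git-annex"), it presumably strips the volatile 'note' field from the
# JSON records before comparison; the helper below is only an assumed illustration.
def filter_fsck_error_msg(fsck_results):
    # drop free-form 'note' entries, which differ between local and remote fsck runs
    return [{k: v for k, v in r.items() if k != 'note'} for r in fsck_results]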
def check_push(annex, src_path, dst_path):
    # prepare src
    src = Dataset(src_path).create(annex=annex)
    src_repo = src.repo
    # push should not add branches to the local dataset
    orig_branches = src_repo.get_branches()
    assert_not_in('synced/' + DEFAULT_BRANCH, orig_branches)
    res = src.push(on_failure='ignore')
    assert_result_count(res, 1)
    assert_in_results(
        res, status='impossible',
        message='No push target given, and none could be auto-detected, '
                'please specify via --to')
    eq_(orig_branches, src_repo.get_branches())
    # target sibling
    target = mk_push_target(src, 'target', dst_path, annex=annex)
    eq_(orig_branches, src_repo.get_branches())
    res = src.push(to="target")
    eq_(orig_branches, src_repo.get_branches())
    assert_result_count(res, 2 if annex else 1)
    assert_in_results(res,
                      action='publish', status='ok', target='target',
                      refspec=DEFAULT_REFSPEC,
                      operations=['new-branch'])
    assert_repo_status(src_repo, annex=annex)
    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))
    # configure a default merge/upstream target
    src.config.set('branch.{}.remote'.format(DEFAULT_BRANCH),
                   'target', where='local')
    src.config.set('branch.{}.merge'.format(DEFAULT_BRANCH),
                   DEFAULT_BRANCH, where='local')
    # don't fail when doing it again, no explicit target specification
    # needed anymore
    res = src.push()
    eq_(orig_branches, src_repo.get_branches())
    # and nothing is pushed
    assert_status('notneeded', res)
    assert_repo_status(src_repo, annex=annex)
    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))
    # some modification:
    (src.pathobj / 'test_mod_file').write_text("Some additional stuff.")
    src.save(to_git=True, message="Modified.")
    (src.pathobj / 'test_mod_annex_file').write_text("Heavy stuff.")
    src.save(to_git=not annex, message="Modified again.")
    assert_repo_status(src_repo, annex=annex)
    # we could say since='HEAD~2' to make things fast, or we are lazy
    # and say since='^' to indicate the state of the tracking remote,
    # which is the same, because we made two commits since the last push.
    res = src.push(to='target', since="^", jobs=2)
    assert_in_results(
        res,
        action='publish', status='ok', target='target',
        refspec=DEFAULT_REFSPEC,
        # we get to see what happened
        operations=['fast-forward'])
    if annex:
        # we got to see the copy result for the annexed files
        assert_in_results(
            res,
            action='copy', status='ok',
            path=str(src.pathobj / 'test_mod_annex_file'))
        # we published, so we can drop and reobtain
        ok_(src_repo.file_has_content('test_mod_annex_file'))
        src_repo.drop('test_mod_annex_file')
        ok_(not src_repo.file_has_content('test_mod_annex_file'))
        src_repo.get('test_mod_annex_file')
        ok_(src_repo.file_has_content('test_mod_annex_file'))
        ok_file_has_content(src_repo.pathobj / 'test_mod_annex_file',
                            'Heavy stuff.')
    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))
    if not (annex and src_repo.is_managed_branch()):
        # the following doesn't make sense in managed branches, because
        # a commit that could be amended is no longer the last commit
        # of a branch after a sync has happened (which did happen
        # during the last push above)
        # amend and change commit msg in order to test for force push:
        src_repo.commit("amended", options=['--amend'])
        # push should be rejected (non-fast-forward):
        res = src.push(to='target', since='HEAD~2', on_failure='ignore')
        # fails before even touching the annex branch
        assert_in_results(res,
                          action='publish', status='error', target='target',
                          refspec=DEFAULT_REFSPEC,
                          operations=['rejected', 'error'])
        # push with force=True works:
        res = src.push(to='target', since='HEAD~2', force='gitpush')
        assert_in_results(res,
                          action='publish', status='ok', target='target',
                          refspec=DEFAULT_REFSPEC,
                          operations=['forced-update'])
        eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
            list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))
    # we do not have more branches than we had in the beginning
    # in particular no 'synced/<default branch>'
    eq_(orig_branches, src_repo.get_branches())
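# `check_push` sets up its sibling via `mk_push_target()`, which is not defined in this
# section. The sketch below captures the gist under stated assumptions: create a
# (possibly annex-enabled) repository at the destination and register it as a sibling
# of the given name; the real helper additionally handles managed-branch and SSH cases.
def mk_push_target(ds, name, path, annex=True):
    # create the repository that will receive the push ...
    target = AnnexRepo(path, create=True) if annex else GitRepo(path, create=True)
    # ... and make it known to the source dataset under the requested name
    ds.siblings('add', name=name, url=path, result_renderer=None)
    return target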
def test_uninstall_without_super(path):
    # a parent dataset with a proper subdataset, and another dataset that
    # is just placed underneath the parent, but not an actual subdataset
    parent = Dataset(path).create()
    sub = parent.create('sub')
    ok_clean_git(parent.path)
    nosub = create(opj(parent.path, 'nosub'))
    ok_clean_git(nosub.path)
    subreport = parent.subdatasets()
    assert_result_count(subreport, 1, path=sub.path)
    assert_result_count(subreport, 0, path=nosub.path)
    # it should be possible to uninstall the proper subdataset, even without
    # explicitly calling the uninstall methods of the parent -- things should
    # be figured out by datalad
    uninstall(sub.path)
    assert not sub.is_installed()
    # no present subdatasets anymore
    subreport = parent.subdatasets()
    assert_result_count(subreport, 1)
    assert_result_count(subreport, 1, path=sub.path, state='absent')
    assert_result_count(subreport, 0, path=nosub.path)
    # but we should fail on an attempt to uninstall the non-subdataset
    res = uninstall(nosub.path, on_failure='ignore')
    assert_result_count(
        res, 1, path=nosub.path, status='error',
        message="will not uninstall top-level dataset "
                "(consider `remove` command)")
def test_partial_aggregation(path):
    ds = Dataset(path).create(force=True)
    sub1 = ds.create('sub1', force=True)
    sub2 = ds.create('sub2', force=True)
    ds.save(recursive=True)

    # if we aggregate a path and say to recurse, we must not recurse into
    # the dataset itself and aggregate others
    ds.meta_aggregate(path='sub1', recursive=True)
    res = ds.meta_dump(reporton='aggregates', recursive=True)
    assert_result_count(res, 1, path=ds.path)
    assert_result_count(res, 1, path=sub1.path)
    # so no metadata aggregates for sub2 yet
    assert_result_count(res, 0, path=sub2.path)

    ds.meta_aggregate(recursive=True)
    origsha = ds.repo.get_hexsha()
    assert_repo_status(ds.path)
    # baseline, recursive aggregation gets us something for all three datasets
    res = ds.meta_dump(reporton='aggregates', recursive=True)
    assert_result_count(res, 3)

    # now let's do partial aggregation from just one subdataset
    # we should not lose information on the other datasets,
    # as this would be a problem any time anything in a dataset
    # subtree is missing: not installed, too expensive to reaggregate, ...
    ds.meta_aggregate(path='sub1')
    eq_(origsha, ds.repo.get_hexsha())
    res = ds.meta_dump(reporton='aggregates', recursive=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # nothing changes, so no commit
    ds.meta_aggregate(path='sub1')
    eq_(origsha, ds.repo.get_hexsha())
    # and the same thing again doesn't ruin the state either
    ds.meta_aggregate(path='sub1')
    eq_(origsha, ds.repo.get_hexsha())

    # from-scratch aggregation kills datasets that were not listed
    # note the trailing separator that indicates that the path refers
    # to the content of the subdataset, not the subdataset record
    # in the superdataset
    ds.meta_aggregate(path='sub1' + op.sep, force='fromscratch')
    res = ds.meta_dump(reporton='aggregates', recursive=True)
    assert_result_count(res, 1)
    assert_result_count(res, 1, path=sub1.path)
    # now reaggregate in full
    ds.meta_aggregate(recursive=True)
    # make a change in sub1
    sub1.unlock('here')
    with open(op.join(sub1.path, 'here'), 'w') as f:
        f.write('fresh')
    ds.save(recursive=True)
    assert_repo_status(path)
def test_bare_git(origin, remote_base_path):
    remote_base_path = Path(remote_base_path)

    # This test should take a dataset and create a bare repository at the
    # remote end from it. Given that it is placed correctly within a tree of
    # datasets, that remote should then be usable as a ria-remote as well as
    # a git-type remote.
    ds = create(origin)
    populate_dataset(ds)
    ds.save()
    assert_repo_status(ds.path)

    # use git to make sure the remote end is what git thinks a bare clone of
    # it should look like
    bare_repo_path = remote_base_path / ds.id[:3] / ds.id[3:]
    subprocess.run(['git', 'clone', '--bare', origin, str(bare_repo_path)])

    # now, let's have the bare repo as a git remote and use it with annex
    eq_(subprocess.run(['git', 'remote', 'add', 'bare-git',
                        str(bare_repo_path)],
                       cwd=origin).returncode, 0)
    eq_(subprocess.run(['git', 'annex', 'enableremote', 'bare-git'],
                       cwd=origin).returncode, 0)
    eq_(subprocess.run(['git', 'annex', 'testremote', 'bare-git'],
                       cwd=origin).returncode, 0)
    # copy files to the remote
    ds.repo.copy_to('.', 'bare-git')
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # now we can drop all content locally, reobtain it, and survive an
    # fsck
    ds.drop('.')
    ds.get('.')
    assert_status('ok', [annexjson2result(r, ds) for r in ds.repo.fsck()])

    # Since we created the remote this particular way instead of letting
    # ria-remote create it, we need to put ria-layout-version files into it.
    # Then we should be able to also add it as a ria-remote.
    with open(str(remote_base_path / 'ria-layout-version'), 'w') as f:
        f.write('1')
    with open(str(bare_repo_path / 'ria-layout-version'), 'w') as f:
        f.write('1')

    # now, add the ria remote:
    initexternalremote(ds.repo, 'riaremote', 'ria',
                       config={'base-path': str(remote_base_path)})
    # fsck to make availability known
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='riaremote', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)

    # now move content from the git remote to local and see it not being
    # available via bare-git anymore
    eq_(subprocess.run(['git', 'annex', 'move', '--all', '--from=bare-git'],
                       cwd=origin).returncode, 0)

    # ria-remote doesn't know yet:
    eq_(len(ds.repo.whereis('one.txt')), 2)

    # but after fsck it does:
    fsck_res = [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='riaremote', fast=True)
    ]
    assert_result_count(
        fsck_res, 1, status='error',
        message='** Based on the location log, one.txt\n'
                '** was expected to be present, '
                'but its content is missing.')
    assert_result_count(
        fsck_res, 1, status='error',
        message='** Based on the location log, subdir/two\n'
                '** was expected to be present, '
                'but its content is missing.')
    eq_(len(ds.repo.whereis('one.txt')), 1)