def test_overrides():
    cfg = ConfigManager()
    # any sensible (and also our CI) test environment(s) should have this
    assert_in('user.name', cfg)
    # set
    cfg.set('user.name', 'myoverride', where='override')
    assert_equal(cfg['user.name'], 'myoverride')
    # unset just removes override, not entire config
    cfg.unset('user.name', where='override')
    assert_in('user.name', cfg)
    assert_not_equal(cfg['user.name'], 'myoverride')
    # add
    # there is no initial increment
    cfg.add('user.name', 'myoverride', where='override')
    assert_equal(cfg['user.name'], 'myoverride')
    # same as with add, not a list
    assert_equal(cfg['user.name'], 'myoverride')
    # but then there is
    cfg.add('user.name', 'myother', where='override')
    assert_equal(cfg['user.name'], ['myoverride', 'myother'])
    # rename
    assert_not_in('ups.name', cfg)
    cfg.rename_section('user', 'ups', where='override')
    # original variable still there
    assert_in('user.name', cfg)
    # rename of override in effect
    assert_equal(cfg['ups.name'], ['myoverride', 'myother'])
    # remove entirely by section
    cfg.remove_section('ups', where='override')
    assert_not_in('ups.name', cfg, (
        cfg._stores,
        cfg.overrides,
    ))

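# The test above exercises ConfigManager's 'override' scope: overrides shadow,
# but never erase, values coming from the underlying git/user/system
# configuration, and the first add() behaves like set() while later add()s
# grow a list. A minimal, self-contained sketch of that layering idea (a toy
# illustration only, NOT DataLad's actual implementation):
class LayeredConfig:
    """Toy two-layer config: overrides win on read, the base survives unset()."""

    def __init__(self, base=None):
        self._base = dict(base or {})
        self._overrides = {}

    def __contains__(self, key):
        return key in self._overrides or key in self._base

    def __getitem__(self, key):
        # an override, if present, shadows the base value
        if key in self._overrides:
            return self._overrides[key]
        return self._base[key]

    def set(self, key, value):
        self._overrides[key] = value

    def add(self, key, value):
        # there is no initial increment: first add() behaves like set(),
        # subsequent add()s grow a list
        if key not in self._overrides:
            self._overrides[key] = value
        else:
            old = self._overrides[key]
            old = old if isinstance(old, list) else [old]
            self._overrides[key] = old + [value]

    def unset(self, key):
        # drops only the override -- the base value becomes visible again
        self._overrides.pop(key, None)


cfg = LayeredConfig(base={'user.name': 'base-value'})
cfg.set('user.name', 'myoverride')
assert cfg['user.name'] == 'myoverride'
cfg.unset('user.name')
assert 'user.name' in cfg and cfg['user.name'] == 'base-value'
cfg.add('user.name', 'a')
cfg.add('user.name', 'b')
assert cfg['user.name'] == ['a', 'b']
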
def _check_auto_save(ds, orig_state):
    handle_dirty_dataset(ds, 'ignore')
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'fail')
    handle_dirty_dataset(ds, 'save-before')
    state = ds.repo.get_hexsha()
    assert_not_equal(orig_state, state)
    _check_all_clean(ds, state)
    return state

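# `_check_all_clean` is called above but not defined in this section. A
# minimal sketch of what it is assumed to verify (an assumption about the
# missing helper, not necessarily its original body): on a clean dataset
# every dirty-handling mode is a no-op and HEAD does not move.
def _check_all_clean(ds, state):
    assert state is not None
    for mode in ('fail', 'ignore', 'save-before'):
        # nothing is dirty, so no mode should raise or create a commit
        handle_dirty_dataset(ds, mode)
        assert_equal(state, ds.repo.get_hexsha())
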
def test_balsa_pipeline1(ind, topurl, outd, clonedir):
    list(initiate_dataset(
        template="balsa",
        dataset_name='dataladtest-WG33',
        path=outd,
        data_fields=['dataset_id'])({'dataset_id': 'WG33'}))
    with chpwd(outd):
        pipeline = ofpipeline('WG33', url=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)

    # since now we base incoming on master -- and there was nothing custom
    # in master after incoming-processed, both branches should be the same
    eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # but that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    get_branch_commits = repo.get_branch_commits_ \
        if hasattr(repo, 'get_branch_commits_') else repo.get_branch_commits
    commits = {b: list(get_branch_commits(b)) for b in branches}
    # all commits out there -- init ds + init crawler + 1*(incoming, processed)
    # The number of commits in master differs based on the create variant used
    # (the one in DataLad's master makes only one commit).
    ncommits_master = len(commits["master"])
    assert_in(ncommits_master, [4, 5])
    # incoming branches off master but lacks one merge commit.
    eq_(len(commits['incoming']), ncommits_master - 1)
    # incoming-processed is on master.
    eq_(len(commits['incoming-processed']), ncommits_master)

    with chpwd(outd):
        eq_(set(glob('*')), {'dir1', 'file1.nii'})
        all_files = sorted(find_files('.'))

    fpath = opj(outd, 'file1.nii')
    ok_file_has_content(fpath, "content of file1.nii")
    ok_file_under_git(fpath, annexed=True)
    fpath2 = opj(outd, 'dir1', 'file2.nii')
    ok_file_has_content(fpath2, "content of file2.nii")
    ok_file_under_git(fpath2, annexed=True)

    target_files = {
        './.datalad/crawl/crawl.cfg',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/meta/balsa.json',
        './.datalad/config',
        './file1.nii',
        './dir1/file2.nii',
    }
    eq_(set(all_files), target_files)

def test_archive(path):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.add('.')
    committed_date = ds.repo.get_commit_date()
    default_outname = opj(path, 'datalad_{}.tar.gz'.format(ds.id))
    with chpwd(path):
        res = list(ds.export_archive())
        assert_status('ok', res)
        assert_result_count(res, 1)
        assert isabs(res[0]['path'])
    assert_true(os.path.exists(default_outname))
    custom_outname = opj(path, 'myexport.tar.gz')
    # feed in without extension
    ds.export_archive(filename=custom_outname[:-7])
    assert_true(os.path.exists(custom_outname))
    custom1_md5 = md5sum(custom_outname)
    # encodes the original archive filename -> different checksum, despite
    # same content
    assert_not_equal(md5sum(default_outname), custom1_md5)
    # should really sleep so if they stop using time.time - we know
    time.sleep(1.1)
    ds.export_archive(filename=custom_outname)
    # should not encode mtime, so should be identical
    assert_equal(md5sum(custom_outname), custom1_md5)

    def check_contents(outname, prefix):
        with tarfile.open(outname) as tf:
            nfiles = 0
            for ti in tf:
                # any annex links resolved
                assert_false(ti.issym())
                ok_startswith(ti.name, prefix + '/')
                assert_equal(ti.mtime, committed_date)
                if '.datalad' not in ti.name:
                    # ignore any files in .datalad for this test to not be
                    # susceptible to changes in how much we generate a meta info
                    nfiles += 1
            # we have exactly four files (includes .gitattributes for default
            # MD5E backend), and expect no content for any directory
            assert_equal(nfiles, 4)

    check_contents(default_outname, 'datalad_%s' % ds.id)
    check_contents(custom_outname, 'myexport')

    # now lose some content
    if ds.repo.is_direct_mode():
        # in direct mode the add() above committed directly to the
        # annex/direct/master branch, hence drop will have no effect
        # (notneeded); this might be undesired behavior (or not), but this
        # is not the place to test for it
        return
    ds.drop('file_up', check=False)
    assert_raises(IOError, ds.export_archive, filename=opj(path, 'my'))
    ds.export_archive(filename=opj(path, 'partial'), missing_content='ignore')
    assert_true(os.path.exists(opj(path, 'partial.tar.gz')))

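# `md5sum` is used throughout these archive tests but not defined in this
# section; a minimal sketch of the assumed helper (hex md5 digest of a file's
# content -- an assumption, not necessarily DataLad's implementation):
import hashlib

def md5sum(filename):
    """Return the hexadecimal md5 digest of the file's content."""
    with open(filename, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()
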
def test_balsa_pipeline1(ind, topurl, outd, clonedir):
    list(initiate_dataset(
        template="balsa",
        dataset_name='dataladtest-WG33',
        path=outd,
        data_fields=['dataset_id'])({'dataset_id': 'WG33'}))
    with chpwd(outd):
        pipeline = ofpipeline('WG33', url=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    commits = {b: list(repo.get_branch_commits(b)) for b in branches}
    eq_(len(commits['incoming']), 1)
    eq_(len(commits['incoming-processed']), 2)
    # all commits out there -- init ds + init crawler
    # + 1*(incoming, processed, merge)
    eq_(len(commits['master']), 6)

    with chpwd(outd):
        eq_(set(glob('*')), {'dir1', 'file1.nii'})
        all_files = sorted(find_files('.'))

    fpath = opj(outd, 'file1.nii')
    ok_file_has_content(fpath, "content of file1.nii")
    ok_file_under_git(fpath, annexed=True)
    fpath2 = opj(outd, 'dir1', 'file2.nii')
    ok_file_has_content(fpath2, "content of file2.nii")
    ok_file_under_git(fpath2, annexed=True)

    target_files = {
        './.datalad/crawl/crawl.cfg',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/meta/balsa.json',
        './.datalad/config',
        './file1.nii',
        './dir1/file2.nii',
    }
    eq_(set(all_files), target_files)

def test_archive(path):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.save()
    committed_date = ds.repo.get_commit_date()
    default_outname = opj(path, 'datalad_{}.tar.gz'.format(ds.id))
    with chpwd(path):
        res = list(ds.export_archive())
        assert_status('ok', res)
        assert_result_count(res, 1)
        assert isabs(res[0]['path'])
    assert_true(os.path.exists(default_outname))
    custom_outname = opj(path, 'myexport.tar.gz')
    # feed in without extension
    ds.export_archive(filename=custom_outname[:-7])
    assert_true(os.path.exists(custom_outname))
    custom1_md5 = md5sum(custom_outname)
    # encodes the original archive filename -> different checksum, despite
    # same content
    assert_not_equal(md5sum(default_outname), custom1_md5)
    # should really sleep so if they stop using time.time - we know
    time.sleep(1.1)
    ds.export_archive(filename=custom_outname)
    # should not encode mtime, so should be identical
    assert_equal(md5sum(custom_outname), custom1_md5)

    def check_contents(outname, prefix):
        with tarfile.open(outname) as tf:
            nfiles = 0
            for ti in tf:
                # any annex links resolved
                assert_false(ti.issym())
                ok_startswith(ti.name, prefix + '/')
                assert_equal(ti.mtime, committed_date)
                if '.datalad' not in ti.name:
                    # ignore any files in .datalad for this test to not be
                    # susceptible to changes in how much we generate a meta info
                    nfiles += 1
            # we have exactly four files (includes .gitattributes for default
            # MD5E backend), and expect no content for any directory
            assert_equal(nfiles, 4)

    check_contents(default_outname, 'datalad_%s' % ds.id)
    check_contents(custom_outname, 'myexport')

    # now lose some content
    ds.drop('file_up', check=False)
    assert_raises(IOError, ds.export_archive, filename=opj(path, 'my'))
    ds.export_archive(filename=opj(path, 'partial'), missing_content='ignore')
    assert_true(os.path.exists(opj(path, 'partial.tar.gz')))

def test_property_reevaluation(repo1):
    from os.path import lexists
    from datalad.tests.utils import ok_clean_git

    ds = Dataset(repo1)
    assert_is_none(ds.repo)
    assert_is_not_none(ds.config)
    first_config = ds.config
    assert_false(ds._cfg_bound)
    assert_is_none(ds.id)

    ds.create()
    ok_clean_git(repo1)
    # after creation, we have `repo`, and `config` was reevaluated to point
    # to the repo's config:
    assert_is_not_none(ds.repo)
    assert_is_not_none(ds.config)
    second_config = ds.config
    assert_true(ds._cfg_bound)
    assert_is(ds.config, ds.repo.config)
    assert_is_not(first_config, second_config)
    assert_is_not_none(ds.id)
    first_id = ds.id

    ds.remove()
    # repo is gone, and config is again reevaluated to only provide user/system
    # level config:
    assert_false(lexists(ds.path))
    assert_is_none(ds.repo)
    assert_is_not_none(ds.config)
    third_config = ds.config
    assert_false(ds._cfg_bound)
    assert_is_not(second_config, third_config)
    assert_is_none(ds.id)

    ds.create()
    ok_clean_git(repo1)
    # after recreation everything is sane again:
    assert_is_not_none(ds.repo)
    assert_is_not_none(ds.config)
    assert_is(ds.config, ds.repo.config)
    fourth_config = ds.config
    assert_true(ds._cfg_bound)
    assert_is_not(third_config, fourth_config)
    assert_is_not_none(ds.id)
    assert_not_equal(ds.id, first_id)

def test_tarball(path):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.save(all_changes=True)
    committed_date = ds.repo.get_committed_date()
    with chpwd(path):
        _mod, tarball1 = ds.export('tarball')
        assert not isabs(tarball1)
        tarball1 = opj(path, tarball1)
    default_outname = opj(path, 'datalad_{}.tar.gz'.format(ds.id))
    assert_equal(tarball1, default_outname)
    assert_true(os.path.exists(default_outname))
    custom_outname = opj(path, 'myexport.tar.gz')
    # feed in without extension
    ds.export('tarball', output=custom_outname[:-7])
    assert_true(os.path.exists(custom_outname))
    custom1_md5 = md5sum(custom_outname)
    # encodes the original tarball filename -> different checksum, despite
    # same content
    assert_not_equal(md5sum(default_outname), custom1_md5)
    # should really sleep so if they stop using time.time - we know
    time.sleep(1.1)
    ds.export('tarball', output=custom_outname)
    # should not encode mtime, so should be identical
    assert_equal(md5sum(custom_outname), custom1_md5)

    def check_contents(outname, prefix):
        with tarfile.open(outname) as tf:
            nfiles = 0
            for ti in tf:
                # any annex links resolved
                assert_false(ti.issym())
                ok_startswith(ti.name, prefix + '/')
                assert_equal(ti.mtime, committed_date)
                if '.datalad' not in ti.name:
                    # ignore any files in .datalad for this test to not be
                    # susceptible to changes in how much we generate a meta info
                    nfiles += 1
            # we have exactly three files, and expect no content for any directory
            assert_equal(nfiles, 3)

    check_contents(default_outname, 'datalad_%s' % ds.id)
    check_contents(custom_outname, 'myexport')

def test_crawl_s3_commit_versions_one_at_a_time(path):
    annex = _annex(path)

    # Fancier setup so we could do any of the desired actions within a single sweep
    pipeline = [
        crawl_s3('datalad-test0-versioned', strategy='commit-versions',
                 repo=annex.repo, ncommits=1),
        switch('datalad_action',
               {
                   'commit': annex.finalize(tag=True),
                   'remove': annex.remove,
                   'annex': annex,
               })
    ]

    with externals_use_cassette('test_crawl_s3-pipeline1'):
        with swallow_logs(new_level=logging.WARN) as cml:
            out = run_pipeline(pipeline)
            assert_not_in("There is already a tag %s" % target_version, cml.out)
    # things are committed and thus stats are empty
    eq_(out, [{'datalad_stats': ActivityStats()}])
    total_stats_all = total_stats = out[0]['datalad_stats'].get_total()
    eq_(total_stats,
        # Deletions come as 'files' as well atm
        ActivityStats(files=3, downloaded=3, urls=3, add_annex=3,
                      downloaded_size=24, versions=[target_version]))

    # and there should be 7 more, every time changing the total stats
    for t in range(1, 8):
        with externals_use_cassette('test_crawl_s3-pipeline1'):
            with swallow_logs(new_level=logging.WARN) as cml:
                out = run_pipeline(pipeline)
                assert_in("There is already a tag %s" % target_version, cml.out)
        total_stats_ = out[0]['datalad_stats'].get_total()
        assert_not_equal(total_stats, total_stats_)
        total_stats = total_stats_
        total_stats_all += total_stats

    # with total stats at the end to be the same as if all at once
    total_stats_all.versions = []
    eq_(total_stats_all,
        # Deletions come as 'files' as well atm
        ActivityStats(files=17, skipped=72, overwritten=3, downloaded=14,
                      urls=14, add_annex=14, removed=3, downloaded_size=112))

def test_gh1426(origin_path, target_path):
    # set up a pair of repos, one the published copy of the other
    origin = create(origin_path)
    target = AnnexRepo(target_path, create=True)
    target.config.set(
        'receive.denyCurrentBranch', 'updateInstead', where='local')
    origin.siblings('add', name='target', url=target_path)
    origin.publish(to='target')
    ok_clean_git(origin.path)
    ok_clean_git(target.path)
    eq_(origin.repo.get_hexsha(), target.get_hexsha())

    # gist of #1426 is that a newly added subdataset does not cause the
    # superdataset to get published
    origin.create('sub')
    ok_clean_git(origin.path)
    assert_not_equal(origin.repo.get_hexsha(), target.get_hexsha())
    # now push
    res = origin.publish(to='target')
    assert_result_count(res, 1)
    assert_result_count(
        res, 1, status='ok', type='dataset', path=origin.path)
    eq_(origin.repo.get_hexsha(), target.get_hexsha())

def test_openfmri_pipeline2(ind, topurl, outd):
    # no versioned files -- should still work!  ;)

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }
    # all commits out there:
    # backend set, dataset init, crawler init, incoming (shares with master -1),
    #   (2 or 3 commits, depending on create variant)
    # incoming-processed, merge, aggregate metadata:
    ncommits_master = len(commits_hexsha['master'])
    assert_in(ncommits_master, [5, 6])
    assert_in(len(commits_l['master']), [4, 5])

    eq_(len(commits_hexsha['incoming']), ncommits_master - 2)
    eq_(len(commits_l['incoming']), ncommits_master - 2)
    eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 1)
    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 2)

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    eq_(out[0]['datalad_stats'], ActivityStats(files=2, skipped=2, urls=2))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    os.rename(opj(ind, 'ds666', 'ds666_R2.0.0.tar.gz'),
              opj(ind, 'ds666', 'ds666.tar.gz'))

    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)
    eq_(out[0]['datalad_stats'], ActivityStats())  # was committed
    stats_total = out[0]['datalad_stats'].get_total()
    stats_total.downloaded_size = 0
    eq_(stats_total,
        ActivityStats(files=4, overwritten=1, skipped=1, downloaded=1,
                      merges=[['incoming', 'incoming-processed']],
                      versions=['1.0.0'],
                      renamed=1, urls=2, add_annex=2))

    # in reality there is also the 1.0.0+1 tag since the file changed but has
    # no version suffix
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1'])

    check_dropall_get(repo)

def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content;
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes(
        [('metadata/objects/**', {'annex.largefiles': '(nothing)'})],
        dotdatalad_attributes_file)
    # --amend so we do not cause change in the number of commits below
    repo.commit("gitattributes", files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }
    # all commits out there:
    # dataset init, crawler init
    #   (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    #   - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of the now absent
    #   2*remove of obsolete metadata object files,
    #   see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])
    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents),
    #    (commits_l['incoming-processed'][1],
    #     commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents),
    #    (commits_l['incoming-processed'][3],  # also in master
    #     commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents),
    #    (commits_l['master'][2],
    #     commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents),
    #    (commits_l['master'][4],
    #     commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the
    # number of created object files in addition to that comparison
    eq_(set([f for f in all_files
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and
    # committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the
    # number of created object files in addition to that comparison
    eq_(set([f for f in all_files_updated
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }
    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'], ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(total_stats,
        ActivityStats(
            files=8, skipped=5, downloaded=1, renamed=1, urls=6,
            add_annex=2,  # add_git=1,  # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely
            # (incoming-processed) and merged into master -- new commits will come.
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository", cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)

    commits_hexsha_removed = {b: list(_get_branch_commits(repo, b))
                              for b in branches}
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming), 2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(set(dincoming.keys()),
        {repo.pathobj / '.datalad/crawl/statuses/incoming.json',
         repo.pathobj / 'ds666_R1.0.0.tar.gz'})

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as well
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))

def test_cached_dataset(cache_dir):
    # patch DATALAD_TESTS_CACHE to not use the actual cache with
    # the test testing that very cache.
    cache_dir = Path(cache_dir)

    ds_url = "https://github.com/datalad/testrepo--minimalds"
    name_in_cache = url2filename(ds_url)
    annexed_file = Path("inannex") / "animated.gif"

    with patch(CACHE_PATCH_STR, new=cache_dir):

        @cached_dataset(url=ds_url)
        def decorated_test1(ds):
            # we get a Dataset instance
            assert_is_instance(ds, Dataset)
            # it's a clone in a temp. location, not within the cache
            assert_not_in(cache_dir, ds.pathobj.parents)
            assert_result_count(ds.siblings(), 1, type="sibling",
                                name="origin",
                                url=str(cache_dir / name_in_cache))
            here = ds.config.get("annex.uuid")
            origin = ds.config.get("remote.origin.annex-uuid")
            where = ds.repo.whereis(str(annexed_file))
            assert_not_in(here, where)
            assert_not_in(origin, where)

            return ds.pathobj, ds.repo.pathobj

        @cached_dataset(url=ds_url, paths=str(annexed_file))
        def decorated_test2(ds):
            # we get a Dataset instance
            assert_is_instance(ds, Dataset)
            # it's a clone in a temp. location, not within the cache
            assert_not_in(cache_dir, ds.pathobj.parents)
            assert_result_count(ds.siblings(), 1, type="sibling",
                                name="origin",
                                url=str(cache_dir / name_in_cache))
            here = ds.config.get("annex.uuid")
            origin = ds.config.get("remote.origin.annex-uuid")
            where = ds.repo.whereis(str(annexed_file))
            assert_in(here, where)
            assert_in(origin, where)

            return ds.pathobj, ds.repo.pathobj

        @cached_dataset(url=ds_url)
        def decorated_test3(ds):
            # we get a Dataset instance
            assert_is_instance(ds, Dataset)
            # it's a clone in a temp. location, not within the cache
            assert_not_in(cache_dir, ds.pathobj.parents)
            assert_result_count(ds.siblings(), 1, type="sibling",
                                name="origin",
                                url=str(cache_dir / name_in_cache))
            # origin is the same cached dataset that got this content in
            # decorated_test2 before. Should still be there. But "here" we
            # didn't request it.
            here = ds.config.get("annex.uuid")
            origin = ds.config.get("remote.origin.annex-uuid")
            where = ds.repo.whereis(str(annexed_file))
            assert_not_in(here, where)
            assert_in(origin, where)

            return ds.pathobj, ds.repo.pathobj

        @cached_dataset(url=ds_url,
                        version="541cf855d13c2a338ff2803d4488daf0035e568f")
        def decorated_test4(ds):
            # we get a Dataset instance
            assert_is_instance(ds, Dataset)
            # it's a clone in a temp. location, not within the cache
            assert_not_in(cache_dir, ds.pathobj.parents)
            assert_result_count(ds.siblings(), 1, type="sibling",
                                name="origin",
                                url=str(cache_dir / name_in_cache))
            # origin is the same cached dataset that got this content in
            # decorated_test2 before. Should still be there. But "here" we
            # didn't request it.
            here = ds.config.get("annex.uuid")
            origin = ds.config.get("remote.origin.annex-uuid")
            where = ds.repo.whereis(str(annexed_file))
            assert_not_in(here, where)
            assert_in(origin, where)
            assert_equal(ds.repo.get_hexsha(),
                         "541cf855d13c2a338ff2803d4488daf0035e568f")

            return ds.pathobj, ds.repo.pathobj

        first_dspath, first_repopath = decorated_test1()
        second_dspath, second_repopath = decorated_test2()
        decorated_test3()
        decorated_test4()

        # first and second are not the same, only their origin is:
        assert_not_equal(first_dspath, second_dspath)
        assert_not_equal(first_repopath, second_repopath)

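# `url2filename` above maps a clone URL to the name of its cache
# subdirectory; it is not defined in this section. A plausible minimal sketch
# (an assumption, not the actual helper): it only needs to be deterministic
# and yield a single, filesystem-safe path component.
from urllib.parse import quote

def url2filename(url):
    """Map a URL to a filesystem-safe directory name for the dataset cache."""
    return quote(url, safe='')
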
def test_target_ssh_simple(origin, src_path, target_rootpath):
    # prepare src
    source = install(
        src_path, source=origin,
        result_xfm='datasets', return_type='item-or-list')

    target_path = opj(target_rootpath, "basic")
    with swallow_logs(new_level=logging.ERROR) as cml:
        create_sibling(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            ui=True)
        assert_not_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())
    # Both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # And target one should be known to have a known UUID within the source if annex
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # basic config in place
        eq_(local_target_cfg('annex-ignore'), 'false')
        ok_(local_target_cfg('annex-uuid'))

    # do it again without force, but use a different name to avoid initial checks
    # for existing remotes:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(
            dataset=source,
            name="local_target_alt",
            sshurl="ssh://localhost",
            target_dir=target_path)
    ok_(text_type(cm.exception).startswith(
        "Target path %s already exists. And it fails to rmdir" % target_path))
    if src_is_annex:
        target_description = AnnexRepo(target_path,
                                       create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        # on yoh's laptop TMPDIR is under HOME, so things start to become
        # tricky since then target_path is shortened and we would need to know
        # remote $HOME. To not over-complicate and still test, test only for
        # the basename of the target_path
        ok_endswith(target_description, basename(target_path))

    # now, with force and correct url, which is also used to determine
    # target_dir
    # Note: on windows an absolute path is not url conform. But this way it's
    # easy to test that the ssh path is correctly used.
    if not on_windows:
        # add random file under target_path, to explicitly test existing=replace
        open(opj(target_path, 'random'), 'w').write('123')

        assert_create_sshwebserver(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost" + target_path,
            publish_by_default='master',
            existing='replace')
        eq_("ssh://localhost" + urlquote(target_path),
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid
            # should be added too, even if URL matches prior state
            eq_(local_target_cfg('push'), 'master')

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path,
            source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # it should have triggered a hook, which would have created log and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [k for k in digests
                             if k.startswith(_path_('.git/datalad/%s/' % part))]
                # This is in effect ONLY if we have "compatible" datalad installed on remote
                # end. ATM we don't have easy way to guarantee that AFAIK (yoh),
                # so let's not check/enforce (TODO)
                # assert(len(metafiles) >= 1)

                # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert len(metafiles) >= 1
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))

                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)

            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time
        time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        assert_dict_equal(orig_digests, digests)  # nothing should change in terms of content

        # but some files should have been modified
        modified_files = {k for k in mtimes
                          if orig_mtimes.get(k, 0) != mtimes.get(k, 0)}
        # collect which files were expected to be modified without incurring any changes
        ok_modified_files = {
            _path_('.git/hooks/post-update'),
            'index.html',
            # files which hook would manage to generate
            _path_('.git/info/refs'), '.git/objects/info/packs'
        }
        # on elderly git we don't change receive setting
        ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update(
            {f for f in digests if f.startswith(_path_('.git/datalad/web'))})
        # it seems that with some recent git versions behavior has changed a bit
        # and the index might get touched
        if _path_('.git/index') in modified_files:
            ok_modified_files.add(_path_('.git/index'))
        assert_set_equal(modified_files, ok_modified_files)

def test_recursive_save(path):
    ds = Dataset(path).create()
    # nothing to save
    assert_false(ds.save())
    subds = ds.create('sub')
    # subdataset presence already saved
    ok_clean_git(ds.path)
    subsubds = subds.create('subsub')
    assert_equal(
        ds.get_subdatasets(recursive=True, absolute=True, fulfilled=True),
        [subsubds.path, subds.path])
    newfile_name = opj(subsubds.path, 'test')
    with open(newfile_name, 'w') as f:
        f.write('some')
    # saves the status change of the subdataset due to the subsubdataset addition
    assert_equal(ds.save(all_changes=True), [ds])

    # make the new file known to its dataset
    # with #1141 this would be
    #ds.add(newfile_name, save=False)
    subsubds.add(newfile_name, save=False)

    # but remains dirty because of the untracked file down below
    assert ds.repo.dirty
    # auto-add will save nothing deep down without recursive
    assert_equal(ds.save(all_changes=True), [])
    assert ds.repo.dirty
    # with recursive pick up the change in subsubds
    assert_equal(ds.save(all_changes=True, recursive=True),
                 [subsubds, subds, ds])

    # modify content in subsub and try saving
    testfname = newfile_name
    subsubds.unlock(testfname)
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')

    # the following should all do nothing
    # no auto_add
    assert_false(ds.save())
    # no recursive
    assert_false(ds.save(all_changes=True))
    # an explicit target saves only the corresponding dataset
    assert_equal(save(files=[testfname]), [subsubds])
    # plain recursive without any files given will save the beast
    assert_equal(ds.save(recursive=True), [subds, ds])
    # there is nothing else to save
    assert_false(ds.save(all_changes=True, recursive=True))

    # one more time and check that all datasets in the hierarchy get updated
    states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    testfname = opj('sub', 'subsub', 'saveme2')
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')
    assert_true(ds.save(all_changes=True, recursive=True))
    newstates = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    for old, new in zip(states, newstates):
        assert_not_equal(old, new)

    # now let's check saving "upwards"
    assert not subds.repo.dirty
    create_tree(subds.path, {"testnew": 'smth', "testadded": "added"})
    subds.repo.add("testadded")
    indexed_files = subds.repo.get_indexed_files()
    assert subds.repo.dirty
    assert ds.repo.dirty

    assert not subsubds.repo.dirty
    create_tree(subsubds.path, {"testnew2": 'smth'})
    assert subsubds.repo.dirty
    # and indexed files didn't change
    assert_equal(indexed_files, subds.repo.get_indexed_files())
    ok_clean_git(subds.repo, untracked=['testnew'],
                 index_modified=['subsub'], head_modified=['testadded'])

    subsubds.save(message="savingtestmessage", super_datasets=True,
                  all_changes=True)
    ok_clean_git(subsubds.repo)
    # but its super should have got only the subsub saved
    # not the file we created
    ok_clean_git(subds.repo, untracked=['testnew'],
                 head_modified=['testadded'])

    # check commits to have correct messages
    # there are no more dedicated superdataset-save commits anymore, because
    # superdatasets get saved as part of the processed hierarchy and can contain
    # other parts in the commit (if so instructed)
    assert_equal(
        next(subsubds.repo.get_branch_commits('master')).message.rstrip(),
        'savingtestmessage')
    assert_equal(
        next(subds.repo.get_branch_commits('master')).message.rstrip(),
        'savingtestmessage')
    assert_equal(
        next(ds.repo.get_branch_commits('master')).message.rstrip(),
        'savingtestmessage')

def test_target_ssh_simple(origin, src_path, target_rootpath):
    # prepare src
    source = install(src_path, source=origin)

    target_path = opj(target_rootpath, "basic")
    # it will try to fetch it so would fail as well since sshurl is wrong
    with swallow_logs(new_level=logging.ERROR) as cml, \
            assert_raises(GitCommandError):
        create_sibling(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            ui=True)
    # is not actually happening on one of the two basic cases -- TODO figure it out
    # assert_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())
    eq_("ssh://localhost", source.repo.get_remote_url("local_target"))
    # should NOT be able to push now, since url isn't correct:
    # TODO: assumption is wrong if ~ does have .git! fix up!
    assert_raises(GitCommandError, publish, dataset=source, to="local_target")
    # Both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # And target one should be known to have a known UUID within the source if annex
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # for some reason this was "correct"
        # eq_(local_target_cfg('annex-ignore'), 'false')
        # but after fixing creating siblings in
        # 21f6dd012b2c7b9c0b8b348dcfb3b0ace7e8b2ec it started to fail
        # I think it is legit since we are trying to fetch now before calling
        # annex.enable_remote so it doesn't set it up, and fails before
        assert_raises(Exception, local_target_cfg, 'annex-ignore')
        # hm, but ATM wouldn't get a uuid since url is wrong
        assert_raises(Exception, local_target_cfg, 'annex-uuid')

    # do it again without force:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path)
    eq_("Target directory %s already exists." % target_path,
        str(cm.exception))
    if src_is_annex:
        target_description = AnnexRepo(target_path,
                                       create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        ok_endswith(target_description, target_path)

    # now, with force and correct url, which is also used to determine
    # target_dir
    # Note: on windows an absolute path is not url conform. But this way it's
    # easy to test that the ssh path is correctly used.
    if not on_windows:
        # add random file under target_path, to explicitly test existing=replace
        open(opj(target_path, 'random'), 'w').write('123')

        assert_create_sshwebserver(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost" + target_path,
            existing='replace')
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path,
            source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # it should have triggered a hook, which would have created log and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [k for k in digests
                             if k.startswith(_path_('.git/datalad/%s/' % part))]
                # This is in effect ONLY if we have "compatible" datalad installed on remote
                # end. ATM we don't have easy way to guarantee that AFAIK (yoh),
                # so let's not check/enforce (TODO)
                # assert(len(metafiles) >= 1)

                # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert len(metafiles) >= 1
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))

                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)

            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time
        time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        assert_dict_equal(orig_digests, digests)  # nothing should change in terms of content

        # but some files should have been modified
        modified_files = {k for k in mtimes
                          if orig_mtimes.get(k, 0) != mtimes.get(k, 0)}
        # collect which files were expected to be modified without incurring any changes
        ok_modified_files = {
            _path_('.git/hooks/post-update'),
            'index.html',
            # files which hook would manage to generate
            _path_('.git/info/refs'), '.git/objects/info/packs'
        }
        if external_versions['cmd:system-git'] >= '2.4':
            # on elderly git we don't change receive setting
            ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update(
            {f for f in digests if f.startswith(_path_('.git/datalad/web'))})
        assert_set_equal(modified_files, ok_modified_files)

def test_FileStatus_basic():
    assert_equal(FileStatus(size=0), FileStatus(size=0))
    assert_not_equal(FileStatus(size=0), FileStatus(size=1))
    # mtimes allow trimming if one is an int
    assert_equal(FileStatus(mtime=0), FileStatus(mtime=0.9999))
    assert_equal(FileStatus(mtime=0), FileStatus(mtime=0.0001))
    assert_not_equal(FileStatus(mtime=0.2), FileStatus(mtime=0.1))
    assert_not_equal(FileStatus(mtime=0.2), FileStatus(mtime=None))
    assert_not_equal(FileStatus(mtime=1), FileStatus(mtime=None))
    # and comparison with None should be False
    assert_not_equal(FileStatus(mtime=1), None)
    assert_not_equal(None, FileStatus(mtime=1))
    # adding more information would result in not-equal
    assert_not_equal(FileStatus(size=0), FileStatus(size=0, mtime=123))
    # empty ones can't be compared
    # TODO: actually not clear why that NotImplemented singleton is not returned
    assert_not_equal(FileStatus(), FileStatus())

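# A minimal sketch (an assumption, not the actual FileStatus class) of the
# equality semantics the test above encodes: attribute-by-attribute equality,
# mtimes compared with the fractional part trimmed when either side is an
# int, and a status with no attributes set never comparing equal.
class FileStatusSketch:
    def __init__(self, size=None, mtime=None):
        self.size = size
        self.mtime = mtime

    def __eq__(self, other):
        if not isinstance(other, FileStatusSketch):
            return False
        if self.size is None and self.mtime is None:
            # an empty status carries no information to compare on
            return False
        if self.size != other.size:
            return False
        if self.mtime is None or other.mtime is None:
            # equal only if both are None
            return self.mtime is other.mtime
        # trim fractional seconds if either side is an int
        if isinstance(self.mtime, int) or isinstance(other.mtime, int):
            return int(self.mtime) == int(other.mtime)
        return self.mtime == other.mtime


assert FileStatusSketch(mtime=0) == FileStatusSketch(mtime=0.9999)
assert FileStatusSketch(mtime=0.2) != FileStatusSketch(mtime=0.1)
assert FileStatusSketch() != FileStatusSketch()  # empty ones can't be compared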
def test_target_ssh_simple(origin, src_path, target_rootpath):

    # prepare src
    source = install(src_path, source=origin)

    target_path = opj(target_rootpath, "basic")
    # it will try to fetch it, so it would fail as well since sshurl is wrong
    with swallow_logs(new_level=logging.ERROR) as cml, \
            assert_raises(GitCommandError):
        create_sibling(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            ui=True)
    # is not actually happening on one of the two basic cases -- TODO figure it out
    # assert_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())
    eq_("ssh://localhost", source.repo.get_remote_url("local_target"))
    # should NOT be able to push now, since url isn't correct:
    # TODO: assumption is wrong if ~ does have .git! fix up!
    assert_raises(GitCommandError, publish, dataset=source, to="local_target")

    # both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # and, if annex, the target should be known to the source by UUID
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # for some reason this was "correct"
        # eq_(local_target_cfg('annex-ignore'), 'false')
        # but after fixing creating siblings in
        # 21f6dd012b2c7b9c0b8b348dcfb3b0ace7e8b2ec it started to fail
        # I think it is legit since we are trying to fetch now before calling
        # annex.enable_remote so it doesn't set it up, and fails before
        assert_raises(Exception, local_target_cfg, 'annex-ignore')
        # hm, but ATM wouldn't get a uuid since url is wrong
        assert_raises(Exception, local_target_cfg, 'annex-uuid')

    # do it again without force:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path)
    eq_("Target directory %s already exists." % target_path,
        str(cm.exception))
    if src_is_annex:
        target_description = AnnexRepo(target_path,
                                       create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        ok_endswith(target_description, target_path)

    # now, with force and a correct url, which is also used to determine
    # target_dir.
    # Note: on windows an absolute path is not url conform. But this way it's
    # easy to test that the ssh path is correctly used.
    if not on_windows:
        # add a random file under target_path, to explicitly test existing=replace
        open(opj(target_path, 'random'), 'w').write('123')

        assert_create_sshwebserver(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost" + target_path,
            existing='replace')
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree was actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path,
            source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # it should have triggered a hook, which would have created log
            # and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [k for k in digests
                             if k.startswith(_path_('.git/datalad/%s/' % part))]
                # This is in effect ONLY if we have "compatible" datalad
                # installed on the remote end. ATM we don't have an easy way
                # to guarantee that AFAIK (yoh), so let's not check/enforce (TODO)
                # assert(len(metafiles) >= 1)

                # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert (len(metafiles) >= 1)
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))

                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)

            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time
        time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        # nothing should change in terms of content
        assert_dict_equal(orig_digests, digests)
        # but some files should have been modified
        modified_files = {k for k in mtimes
                          if orig_mtimes.get(k, 0) != mtimes.get(k, 0)}
        # collect which files were expected to be modified without incurring
        # any changes
        ok_modified_files = {
            _path_('.git/hooks/post-update'),
            'index.html',
            # files which the hook would manage to generate
            _path_('.git/info/refs'),
            '.git/objects/info/packs'
        }
        if external_versions['cmd:system-git'] >= '2.4':
            # on elderly git we don't change the receive setting
            ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update(
            {f for f in digests if f.startswith(_path_('.git/datalad/web'))})
        assert_set_equal(modified_files, ok_modified_files)
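# The reconfigure check above hinges on get_mtimes_and_digests returning a
# snapshot of the target tree. A minimal sketch of what such a helper is
# assumed to produce -- two dicts keyed by path relative to the top directory,
# one with a content digest, one with the mtime. This is an illustration under
# that assumption (_mtimes_and_digests_sketch is hypothetical, not DataLad's
# actual implementation, which handles annexed symlinks and path notation).
import hashlib
import os


def _mtimes_and_digests_sketch(top):
    digests, mtimes = {}, {}
    for root, _, files in os.walk(top):
        for name in files:
            fpath = os.path.join(root, name)
            rpath = os.path.relpath(fpath, top)
            # digest detects content changes; mtime detects mere touches
            with open(fpath, 'rb') as f:
                digests[rpath] = hashlib.md5(f.read()).hexdigest()
            mtimes[rpath] = os.stat(fpath).st_mtime
    return digests, mtimes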
def test_target_ssh_simple(origin, src_path, target_rootpath):

    # prepare src
    source = install(src_path, source=origin,
                     result_xfm='datasets', return_type='item-or-list')

    target_path = opj(target_rootpath, "basic")
    with swallow_logs(new_level=logging.ERROR) as cml:
        create_sibling(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost:22",
            target_dir=target_path,
            ui=True)
        assert_not_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())

    # both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # and, if annex, the target should be known to the source by UUID
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # basic config in place
        eq_(local_target_cfg('annex-ignore'), 'false')
        ok_(local_target_cfg('annex-uuid'))

    # do it again without force, but use a different name to avoid initial
    # checks for existing remotes:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(
            dataset=source,
            name="local_target_alt",
            sshurl="ssh://localhost",
            target_dir=target_path)
    ok_(text_type(cm.exception).startswith(
        "Target path %s already exists. And it fails to rmdir" % target_path))
    if src_is_annex:
        target_description = AnnexRepo(target_path,
                                       create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        # on yoh's laptop TMPDIR is under HOME, so things start to become
        # tricky since then target_path is shortened and we would need to know
        # remote $HOME. To not over-complicate and still test, test only for
        # the basename of the target_path
        ok_endswith(target_description, basename(target_path))

    # now, with force and a correct url, which is also used to determine
    # target_dir.
    # Note: on windows an absolute path is not url conform. But this way it's
    # easy to test that the ssh path is correctly used.
    if not on_windows:
        # add a random file under target_path, to explicitly test existing=replace
        open(opj(target_path, 'random'), 'w').write('123')

        assert_create_sshwebserver(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost" + target_path,
            publish_by_default='master',
            existing='replace')
        eq_("ssh://localhost" + urlquote(target_path),
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree was actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes[
                "local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid
            # should be added too, even if URL matches prior state
            eq_(local_target_cfg('push'), 'master')

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path,
            source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # it should have triggered a hook, which would have created log
            # and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [
                    k for k in digests
                    if k.startswith(_path_('.git/datalad/%s/' % part))
                ]
                # This is in effect ONLY if we have "compatible" datalad
                # installed on the remote end. ATM we don't have an easy way
                # to guarantee that AFAIK (yoh), so let's not check/enforce (TODO)
                # assert(len(metafiles) >= 1)

                # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert (len(metafiles) >= 1)
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))

                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)

            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time
        time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        # nothing should change in terms of content
        assert_dict_equal(orig_digests, digests)
        # but some files should have been modified
        modified_files = {
            k for k in mtimes
            if orig_mtimes.get(k, 0) != mtimes.get(k, 0)
        }
        # collect which files were expected to be modified without incurring
        # any changes
        ok_modified_files = {
            _path_('.git/hooks/post-update'),
            'index.html',
            # files which the hook would manage to generate
            _path_('.git/info/refs'),
            '.git/objects/info/packs'
        }
        # on elderly git we don't change the receive setting
        ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update(
            {f for f in digests if f.startswith(_path_('.git/datalad/web'))})
        # it seems that with some recent git the behavior has changed a bit
        # and the index might get touched
        if _path_('.git/index') in modified_files:
            ok_modified_files.add(_path_('.git/index'))
        assert_set_equal(modified_files, ok_modified_files)
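# The expected-file sets above go through _path_ so they compare equal to
# natively-notated paths on Windows as well. A minimal sketch of that
# conversion (_path_sketch is a hypothetical stand-in; the real helper lives
# in DataLad's test utilities and may handle more cases):
import os.path


def _path_sketch(posix_path):
    # split a '/'-separated relative path and rejoin it with the platform's
    # native separator
    return os.path.join(*posix_path.split('/'))

# e.g. _path_sketch('.git/hooks/post-update') yields
# '.git\\hooks\\post-update' on Windows and '.git/hooks/post-update' elsewhere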
def test_recursive_save(path):
    ds = Dataset(path).create()
    # nothing to save
    assert_false(ds.save())
    subds = ds.create('sub')
    # subdataset presence already saved
    ok_clean_git(ds.path)
    subsubds = subds.create('subsub')
    with open(opj(subsubds.path, 'test'), 'w') as f:
        f.write('some')
    # does not save anything in the topdataset
    assert_false(ds.save())
    # auto-add will save the addition of subsubds to subds
    assert_true(ds.save(auto_add_changes=True))
    # with recursive it will add the file in subsubds
    assert_true(ds.save(auto_add_changes=True, recursive=True))
    # add content to subsub and try saving
    testfname = opj('sub', 'subsub', 'saveme')
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')
    # the following should all do nothing
    # no auto_add
    assert_false(ds.save())
    # no recursive
    assert_false(ds.save(auto_add_changes=True))
    # no recursive and auto_add
    assert_false(ds.save(recursive=True))
    # even with an explicit target, no recursive save
    assert_false(ds.save(files=[testfname]))
    # insufficient recursion depth
    for rlevel in (0, 1):
        assert_false(
            ds.save(files=[testfname], recursive=True, recursion_limit=rlevel))
    # and finally with the right settings
    assert_true(
        ds.save(files=[testfname], recursive=True, recursion_limit=2))
    # there is nothing else to save
    assert_false(ds.save(auto_add_changes=True, recursive=True))
    # one more time and check that all datasets in the hierarchy get updated
    states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    testfname = opj('sub', 'subsub', 'saveme2')
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')
    assert_true(ds.save(auto_add_changes=True, recursive=True))
    newstates = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    for old, new in zip(states, newstates):
        assert_not_equal(old, new)

    # now let's check saving "upwards"
    assert not subds.repo.dirty
    create_tree(subds.path, {"testnew": 'smth', "testadded": "added"})
    subds.repo.add("testadded")
    indexed_files = subds.repo.get_indexed_files()
    assert subds.repo.dirty
    assert ds.repo.dirty

    assert not subsubds.repo.dirty
    create_tree(subsubds.path, {"testnew2": 'smth'})
    assert subsubds.repo.dirty
    # and indexed files didn't change
    assert_equal(indexed_files, subds.repo.get_indexed_files())
    ok_clean_git(subds.repo, untracked=['testnew'],
                 index_modified=['subsub'], head_modified=['testadded'])
    subsubds.save(message="saving", super_datasets=True,
                  auto_add_changes=True)
    ok_clean_git(subsubds.repo)
    # but its super should have got only the subsub saved,
    # not the file we created
    ok_clean_git(subds.repo, untracked=['testnew'],
                 head_modified=['testadded'])

    # check that the commits have correct messages
    assert_equal(
        next(ds.repo.get_branch_commits('master')).message.rstrip(),
        'saving [origin: sub/subsub]')
    assert_equal(
        next(subds.repo.get_branch_commits('master')).message.rstrip(),
        'saving [origin: subsub]')
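# The recursion_limit checks above read as a simple bounded traversal: depth 0
# is the top dataset itself, so a file two levels down (sub/subsub/saveme) is
# only reached once the limit is 2. A schematic sketch under that assumption
# (_datasets_within_limit_sketch and its inputs are hypothetical, not the
# actual save() machinery):
def _datasets_within_limit_sketch(subdataset_relpaths, recursion_limit):
    """Yield subdataset paths no deeper than recursion_limit below the top."""
    for sub in subdataset_relpaths:  # e.g. ['sub', 'sub/subsub']
        depth = len(sub.split('/'))
        # None means unbounded recursion in this sketch
        if recursion_limit is None or depth <= recursion_limit:
            yield sub

# e.g. list(_datasets_within_limit_sketch(['sub', 'sub/subsub'], 1)) -> ['sub'],
# which is why rlevel in (0, 1) above cannot save a file inside subsub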