def test_proxying_open_testrepobased(repo):
    TEST_CONTENT = "content to be annex-addurl'd"
    fname = 'test-annex.dat'
    fpath = opj(repo, fname)
    assert_raises(IOError, open, fpath)

    aio = AutomagicIO(activate=True)
    try:
        with swallow_outputs():
            # now we should be able just to request to open this file
            with open(fpath) as f:
                content = f.read()
                eq_(content, TEST_CONTENT)
    finally:
        aio.deactivate()

    # and now that we have fetched it, nothing should forbid us to open it again
    with open(fpath) as f:
        eq_(f.read(), TEST_CONTENT)

    annex = AnnexRepo(repo, create=False)
    # Let's create another file deeper under the directory with the same content
    # so it would point to the same key, which we would drop and repeat the drill
    fpath2 = opj(repo, 'd1', 'd2', 'test2.dat')
    os.makedirs(dirname(fpath2))
    with open(fpath2, 'w') as f:
        f.write(content)
    annex.add(fpath2)
    annex.drop(fpath2)
    annex.commit("added and dropped")
    assert_raises(IOError, open, fpath2)

    # Let's use context manager form
    with AutomagicIO() as aio:
        ok_(isinstance(aio, AutomagicIO))
        ok_(aio.active)
        # swallowing output would cause trouble while testing with
        # DATALAD_ASSERT_NO_OPEN_FILES mode on.  Reason is not 100% clear
        # on why underlying git-annex process would be dumping to stdout or err
        #with swallow_outputs():

        # now we should be able just to request to open this file
        with open(fpath2) as f:
            content = f.read()
            eq_(content, TEST_CONTENT)

    annex.drop(fpath2)
    assert_raises(IOError, open, fpath2)

    # Let's use relative path
    with chpwd(opj(repo, 'd1')):
        # Let's use context manager form
        with AutomagicIO() as aio, \
                swallow_outputs(), \
                open(opj('d2', 'test2.dat')) as f:
            content = f.read()
            eq_(content, TEST_CONTENT)


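# The test above exercises AutomagicIO both via explicit activate()/deactivate()
# wrapped in try/finally and via the `with` form.  Below is a minimal,
# self-contained sketch of that dual-use pattern -- illustrative only, not
# DataLad's implementation; all names here are assumptions.
class _Activatable:
    """Toy object usable as `obj = _Activatable(activate=True)` or `with obj:`."""

    def __init__(self, activate=False):
        self.active = False
        if activate:
            self.activate()

    def activate(self):
        self.active = True

    def deactivate(self):
        self.active = False

    def __enter__(self):
        self.activate()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.deactivate()
        return False  # do not suppress exceptions


# usage mirroring the two styles in the test
aio = _Activatable(activate=True)
try:
    assert aio.active
finally:
    aio.deactivate()
with _Activatable() as aio:
    assert aio.active
assert not aio.active

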
def test_get_contentlocation(tdir):
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')
    key = repo.get_file_key('file.dat')

    cr = AnnexCustomRemote(tdir)
    key_path = cr.get_contentlocation(key, absolute=False)
    assert not isabs(key_path)
    key_path_abs = cr.get_contentlocation(key, absolute=True)
    assert isabs(key_path_abs)
    assert cr._contentlocations == {key: key_path}
    repo.drop('file.dat', options=['--force'])
    assert not cr.get_contentlocation(key, absolute=True)


def test_get_contentlocation(tdir=None):
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')
    # TODO contentlocation would come with eval_availability=True
    key = repo.get_file_annexinfo('file.dat')['key']

    cr = ArchiveAnnexCustomRemote(None, path=tdir)
    key_path = cr.get_contentlocation(key, absolute=False)
    assert not isabs(key_path)
    key_path_abs = cr.get_contentlocation(key, absolute=True)
    assert isabs(key_path_abs)
    assert cr._contentlocations == {key: key_path}
    repo.drop('file.dat', options=['--force'])
    assert not cr.get_contentlocation(key, absolute=True)


def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'), commit=True)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(new_level=logging.INFO) as log, \
                swallow_outputs() as cmo:
            fs = fs_traverse(topdir, AnnexRepo(topdir),
                             recurse_directories=recursive, json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [opj(topdir, 'dir'),
                               opj(topdir, 'dir', 'subdir')]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout contains the subdirectory and its content
                assert_in('file2.txt', cmo.out)
                assert_in('dir', cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # size of dir type child in non-recursive modes should be 0 Bytes (default) as
            # dir type child's size currently has no metadata file for traverser to pick its size from
            # and would require a recursive traversal w/ write to child metadata file mode
            assert_equal(child['size']['total'],
                         {True: '6 Bytes', False: '0 Bytes'}[recursive])

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        fs = fs_traverse(topdir, AnnexRepo(topdir),
                         recurse_directories=recursive, json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        assert_equal([item for item in fs['nodes']
                      if item['name'] in ('gitdir', 'annexdir')], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by non-recursive mode
        # verifies child's metadata file being used to find its size;
        # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        # In current RF 'nodes' are stripped away during recursive traversal
        # for now... later we might reincarnate them "differently"
        # TODO!
        if False:  # recursive:
            # sub-dictionary should not include git and hidden directory info
            assert_equal([item for item in child['nodes']
                          if item['name'] in ('subgit', '.fgit')], [])
            # extract subdirectory dictionary, else fail
            subchild = [subitem for subitem in child["nodes"]
                        if subitem['name'] == 'subdir'][0]
            # extract info of file1.txt, else fail
            link = [subnode for subnode in subchild["nodes"]
                    if subnode['name'] == 'file1.txt'][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [subnode for subnode in subchild["nodes"]
                          if subnode['name'] == 'file2.txt'][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')


def test_ls_json(topdir):
    annex = AnnexRepo(topdir, create=True)
    dsj = Dataset(topdir)
    # create some file and commit it
    with open(opj(dsj.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    dsj.add(path='subdsfile.txt')
    dsj.save("Hello!", version_tag=1)

    # add a subdataset
    dsj.install('subds', source=topdir)

    subdirds = dsj.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'), commit=True)  # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'), commit=True)  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    for all_ in [True, False]:
        # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)
                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(topdir, json=state, all_=all_,
                               recursive=recursive)
                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and
                # deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if it's updated in its nodes sublist too (used by
                # web-ui json); regression test
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of the subdsfile.txt node
                subds = [item for item in dsj['nodes']
                         if item['name'] == 'subdsfile.txt'][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    # and not in topds
                    assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset metadata
                # already created to verify sub-dataset metadata being picked
                # up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [item for item in dsj['nodes']
                             if item['name'] == 'subdsfile.txt'][0]
                    assert_equal(subds['size']['total'], '3 Bytes')


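# Illustration of the metadata layout that the get_metahash()/get_metapath()
# helpers above encode: each directory's JSON summary is stored under
# .git/datalad/metadata/<md5 of its dataset-relative path>, with '/' standing
# in for the dataset root.  A standalone sketch; the helper name and printed
# paths are hypothetical:
import hashlib
from os.path import join

_meta_dir = join('.git', 'datalad', 'metadata')

def _metapath(*relpath):
    # '/' hashes the root; otherwise hash the joined relative path
    rel = join(*relpath) if relpath else '/'
    return join(_meta_dir, hashlib.md5(rel.encode('utf-8')).hexdigest())

print(_metapath())                 # metadata file for the dataset root
print(_metapath('dir', 'subdir'))  # metadata file for dir/subdir

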
def test_ls_json(topdir):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'), commit=True)  # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'), commit=True)  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    for all_ in [True, False]:
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                with swallow_logs(), swallow_outputs():
                    ds = _ls_json(topdir, json=state, all_=all_,
                                  recursive=recursive)

                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metahash = get_metahash('/')
                subds_metapath = opj(topdir, 'subds', meta_dir, subds_metahash)
                assert_equal(exists(subds_metapath),
                             (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metahash = get_metahash('/')
                ds_metapath = opj(meta_path, ds_metahash)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and
                # deleted only when recursive=True
                child_metahash = get_metahash('dir', 'subdir')
                child_metapath = opj(meta_path, child_metahash)
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    child_metahash = get_metahash(*subdir)
                    assert_equal(exists(opj(meta_path, child_metahash)), False)

                # check if it's updated in its nodes sublist too (used by
                # web-ui json); regression test
                assert_equal(ds['nodes'][0]['size']['total'],
                             ds['size']['total'])

                # check size of the subdsfile.txt node
                subds = [item for item in ds['nodes']
                         if item['name'] == 'subdsfile.txt'][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # run non-recursive dataset traversal after subdataset metadata
                # already created to verify sub-dataset metadata being picked
                # up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    ds = _ls_json(topdir, json='file', all_=False)
                    subds = [item for item in ds['nodes']
                             if item['name'] == 'subdsfile.txt'][0]
                    assert_equal(subds['size']['total'], '3 Bytes')


def test_ls_json(topdir, topurl):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.save(path='subdsfile.txt', message="Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.save('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'))  # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir', 'subgit'))  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')  # commit to git to init git repo
    git.commit()
    # annex.add doesn't add submodule, so using ds.save
    ds.save(opj('dir', 'subgit'))  # add the non-dataset git repo to annex
    ds.save('dir')  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    WitlessRunner(cwd=opj(topdir, 'dir', 'subgit')).run(
        ['git', 'update-server-info'])
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))

    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file('fromweb', topurl + '/noteventhere',
                            options=['--relaxed'])

    for all_ in [True, False]:
        # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)
                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(topdir, json=state, all_=all_,
                               recursive=recursive)
                ok_startswith(dsj['tags'], '1-')
                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and
                # deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if it's updated in its nodes sublist too (used by
                # web-ui json); regression test
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of the subdsfile.txt node
                subds = [item for item in dsj['nodes']
                         if item['name'] == 'subdsfile.txt'][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                    # and not in topds
                    assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset metadata
                # already created to verify sub-dataset metadata being picked
                # up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [item for item in dsj['nodes']
                             if item['name'] == 'subdsfile.txt'][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

                assert_equal(topds_nodes['fromweb']['size']['total'],
                             UNKNOWN_SIZE)


def test_add_archive_content(path_orig, url, repo_path):
    with chpwd(repo_path):
        # TODO we need to be able to pass path into add_archive_content
        # We could mock but I mean for the API
        assert_raises(RuntimeError, add_archive_content,
                      "nonexisting.tar.gz")  # no repo yet

        repo = AnnexRepo(repo_path, create=True)
        assert_raises(ValueError, add_archive_content, "nonexisting.tar.gz")
        # we can't add a file from outside the repo ATM
        assert_raises(FileNotInRepositoryError, add_archive_content,
                      opj(path_orig, '1.tar.gz'))

        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')],
                          options=["--pathdepth", "-1"])
            for s in range(1, 5):
                repo.add_urls([opj(url, '%du/1.tar.gz' % s)],
                              options=["--pathdepth", "-2"])
        repo.commit("added 1.tar.gz")

        key_1tar = repo.get_file_key('1.tar.gz')  # will be used in the test later

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        repo_ = add_archive_content('1.tar.gz')
        eq_(repo.path, repo_.path)
        d1_basic_checks()

        # If run again, should proceed just fine since the content is the same,
        # so no changes would be made really
        add_archive_content('1.tar.gz')

        # But that other one carries an updated file, so should fail due to overwrite
        with assert_raises(RuntimeError) as cme:
            add_archive_content(opj('1u', '1.tar.gz'), use_current_dir=True)

        # TODO: somewhat imprecise since we have two possible "already exists"
        # -- in caching and overwrite check
        assert_in("already exists", str(cme.exception))
        # but should do fine if overrides are allowed
        add_archive_content(opj('1u', '1.tar.gz'), existing='overwrite',
                            use_current_dir=True)
        add_archive_content(opj('2u', '1.tar.gz'), existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('3u', '1.tar.gz'), existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('4u', '1.tar.gz'), existing='archive-suffix',
                            use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert all(x == whereis[0] for x in whereis[1:])

    # and we should be able to reference it while under subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'), use_current_dir=True)
        d1_basic_checks()
        # or we could keep relative path and also demand to keep the archive prefix
        # while extracting under original (annex root) dir
        add_archive_content(opj(pardir, '1.tar.gz'),
                            add_archive_leading_dir=True)

    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        add_archive_content('1.tar.gz', exclude=['d'],
                            rename=['/ /_', '/^1/2'],
                            annex_options="-c annex.largefiles=exclude=*.txt",
                            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add first archive to the repo so we could test
        # named the same way but different content
        with swallow_outputs():
            repo.add_urls([opj(url, 'd1', '1.tar.gz')],
                          options=["--pathdepth", "-1"],
                          cwd=getpwd())  # invoke under current subdir
        repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz')
        d2_basic_checks()

    # In manual tests we ran into a situation where a file from an archive
    # coming from a dropped key could not be obtained in a single run.  I
    # thought this was covered by the custom remote tests, but apparently not
    # sufficiently well.
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # that what managed to not work

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=repo.path)
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if archive key was dropped and online
    # archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))
    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
    assert_equal(e.kwargs['stdout_json'][0]['success'], False)
    assert_result_values_cond(
        e.kwargs['stdout_json'], 'note',
        lambda x: '(Use --force to override this check, or adjust numcopies.)' in x)
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))


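# The whereis check above (all extracted copies sharing one location list) and
# the "same content so it would point to the same key" setup earlier both rest
# on git-annex's content addressing: identical content yields the identical
# key.  A stdlib-only illustration of that property; the MD5E-style naming is
# a simplified assumption, not git-annex's exact implementation:
import hashlib

def _toy_key(content, ext='.txt'):
    """Simplified content-addressed key: backend - size - checksum - extension."""
    return 'MD5E-s{}--{}{}'.format(
        len(content), hashlib.md5(content).hexdigest(), ext)

assert _toy_key(b'123') == _toy_key(b'123')  # same content, same key
assert _toy_key(b'123') != _toy_key(b'124')  # different content, new key

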
def test_check_dates(path):
    refdate = 1218182889

    with set_date(refdate - 1):
        ar = AnnexRepo(path, create=True)

        def tag_object(tag):
            """Return object for tag.  Do not dereference it.
            """
            # We can't use ar.get_tags because that returns the commit's hexsha,
            # not the tag's, and ar.get_hexsha is limited to commit objects.
            return ar.call_git_oneline(
                ["rev-parse", "refs/tags/{}".format(tag)], read_only=True)

        ar.add("foo")
        ar.commit("add foo")
        foo_commit = ar.get_hexsha()
        ar.commit("add foo")
        ar.tag("foo-tag", "tag before refdate")
        foo_tag = tag_object("foo-tag")
        # Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
        ar.tag("light")

    with set_date(refdate + 1):
        ar.add("bar")
        ar.commit("add bar")
        bar_commit = ar.get_hexsha()
        ar.tag("bar-tag", "tag after refdate")
        bar_tag = tag_object("bar-tag")

    with set_date(refdate + 2):
        # Drop an annexed file so that we have more blobs in the git-annex
        # branch than its current tree.
        ar.drop("bar", options=["--force"])

    results = {}
    for which in ["older", "newer"]:
        result = check_dates(ar, refdate, which=which)["objects"]
        ok_(result)
        if which == "newer":
            assert_in(bar_commit, result)
            assert_not_in(foo_commit, result)
            assert_in(bar_tag, result)
        elif which == "older":
            assert_in(foo_commit, result)
            assert_not_in(bar_commit, result)
            assert_in(foo_tag, result)
        results[which] = result

    ok_(any(x.get("filename") == "uuid.log"
            for x in results["older"].values()))

    newer_tree = check_dates(ar, refdate, annex="tree")["objects"]

    def is_annex_log_blob(entry):
        return (entry["type"] == "annex-blob"
                and entry["filename"].endswith(".log"))

    def num_logs(entries):
        return sum(map(is_annex_log_blob, entries.values()))

    # Because we dropped bar above, we should have one more blob in the
    # git-annex branch than in the current tree of the git-annex branch.
    eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)

    # Act like today is one day from the reference timestamp to check that we
    # get the same results with the one-day-back default.
    seconds_in_day = 60 * 60 * 24
    with patch('time.time', return_value=refdate + seconds_in_day):
        assert_equal(check_dates(ar, annex="tree")["objects"], newer_tree)

    # We can give a path (str) instead of a GitRepo object.
    assert_equal(check_dates(path, refdate, annex="tree")["objects"],
                 newer_tree)

    with assert_raises(ValueError):
        check_dates(ar, refdate, which="unrecognized")


def test_ls_json(topdir):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    open(opj(ds.path, 'subdsfile.txt'), 'w').write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'), commit=True)  # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'), commit=True)  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    for all_ in [True, False]:
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                with swallow_logs(), swallow_outputs():
                    ds = _ls_json(topdir, json=state, all_=all_,
                                  recursive=recursive)

                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metahash = get_metahash('/')
                subds_metapath = opj(topdir, 'subds', meta_dir, subds_metahash)
                assert_equal(exists(subds_metapath),
                             (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metahash = get_metahash('/')
                ds_metapath = opj(meta_path, ds_metahash)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and
                # deleted only when recursive=True
                child_metahash = get_metahash('dir', 'subdir')
                child_metapath = opj(meta_path, child_metahash)
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    child_metahash = get_metahash(*subdir)
                    assert_equal(exists(opj(meta_path, child_metahash)), False)

                # check if it's updated in its nodes sublist too (used by
                # web-ui json); regression test
                assert_equal(ds['nodes'][0]['size']['total'],
                             ds['size']['total'])

                # check size of the subdsfile.txt node
                subds = [item for item in ds['nodes']
                         if item['name'] == 'subdsfile.txt'][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # run non-recursive dataset traversal after subdataset metadata
                # already created to verify sub-dataset metadata being picked
                # up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    ds = _ls_json(topdir, json='file', all_=False)
                    subds = [item for item in ds['nodes']
                             if item['name'] == 'subdsfile.txt'][0]
                    assert_equal(subds['size']['total'], '3 Bytes')


def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'), commit=True)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(new_level=logging.INFO) as log, \
                swallow_outputs() as cmo:
            fs = fs_traverse(topdir, AnnexRepo(topdir),
                             recursive=recursive, json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [opj(topdir, 'dir'),
                               opj(topdir, 'dir', 'subdir')]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout contains the subdirectory and its content
                assert_in('file2.txt', cmo.out)
                assert_in('dir', cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # size of dir type child in non-recursive modes should be 0 Bytes (default) as
            # dir type child's size currently has no metadata file for traverser to pick its size from
            # and would require a recursive traversal w/ write to child metadata file mode
            assert_equal(child['size']['total'],
                         {True: '6 Bytes', False: '0 Bytes'}[recursive])

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        fs = fs_traverse(topdir, AnnexRepo(topdir),
                         recursive=recursive, json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        assert_equal([item for item in fs['nodes']
                      if item['name'] in ('gitdir', 'annexdir')], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by non-recursive mode
        # verifies child's metadata file being used to find its size;
        # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        if recursive:
            # sub-dictionary should not include git and hidden directory info
            assert_equal([item for item in child['nodes']
                          if item['name'] in ('subgit', '.fgit')], [])
            # extract subdirectory dictionary, else fail
            subchild = [subitem for subitem in child["nodes"]
                        if subitem['name'] == 'subdir'][0]
            # extract info of file1.txt, else fail
            link = [subnode for subnode in subchild["nodes"]
                    if subnode['name'] == 'file1.txt'][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [subnode for subnode in subchild["nodes"]
                          if subnode['name'] == 'file2.txt'][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')


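# The '6 Bytes' / '0 Bytes' totals asserted above come from summing file sizes
# during traversal.  A rough stdlib sketch of such bookkeeping (not
# fs_traverse itself), including the broken-symlink case that must count as
# 0 bytes on disk:
import os

def _tree_size(top):
    """Sum apparent sizes of files under `top`, skipping broken symlinks."""
    total = 0
    for root, _dirs, files in os.walk(top):
        for name in files:
            path = os.path.join(root, name)
            if os.path.exists(path):  # False for dangling symlinks
                total += os.path.getsize(path)
    return total

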
def test_check_dates(path):
    refdate = 1218182889

    with set_date(refdate - 1):
        ar = AnnexRepo(path, create=True)
        ar.add("foo")
        ar.commit("add foo")
        foo_commit = ar.get_hexsha()
        ar.commit("add foo")
        ar.tag("foo-tag", "tag before refdate")
        # We can't use ar.get_tags because that returns the commit's hexsha,
        # not the tag's, and ar.get_hexsha is limited to commit objects.
        foo_tag = ar.repo.git.rev_parse("foo-tag")
        # Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
        ar.tag("light")

    with set_date(refdate + 1):
        ar.add("bar")
        ar.commit("add bar")
        bar_commit = ar.get_hexsha()
        ar.tag("bar-tag", "tag after refdate")
        bar_tag = ar.repo.git.rev_parse("bar-tag")

    with set_date(refdate + 2):
        # Drop an annexed file so that we have more blobs in the git-annex
        # branch than its current tree.
        ar.drop("bar", options=["--force"])

    results = {}
    for which in ["older", "newer"]:
        result = check_dates(ar, refdate, which=which)["objects"]
        ok_(result)
        if which == "newer":
            assert_in(bar_commit, result)
            assert_not_in(foo_commit, result)
            assert_in(bar_tag, result)
        elif which == "older":
            assert_in(foo_commit, result)
            assert_not_in(bar_commit, result)
            assert_in(foo_tag, result)
        results[which] = result

    ok_(any(x.get("filename") == "uuid.log"
            for x in results["older"].values()))

    newer_tree = check_dates(ar, refdate, annex="tree")["objects"]

    def is_annex_log_blob(entry):
        return (entry["type"] == "annex-blob"
                and entry["filename"].endswith(".log"))

    def num_logs(entries):
        return sum(map(is_annex_log_blob, entries.values()))

    # Because we dropped bar above, we should have one more blob in the
    # git-annex branch than in the current tree of the git-annex branch.
    eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)

    # Act like today is one day from the reference timestamp to check that we
    # get the same results with the one-day-back default.
    seconds_in_day = 60 * 60 * 24
    with patch('time.time', return_value=refdate + seconds_in_day):
        assert_equal(check_dates(ar, annex="tree")["objects"], newer_tree)

    # We can give a path (str) instead of a GitRepo object.
    assert_equal(check_dates(path, refdate, annex="tree")["objects"],
                 newer_tree)

    with assert_raises(ValueError):
        check_dates(ar, refdate, which="unrecognized")


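# The patch('time.time', ...) trick above makes check_dates' "one day back
# from now" default deterministic.  A self-contained demonstration of the same
# technique using only the standard library:
import time
from unittest.mock import patch

refdate = 1218182889
seconds_in_day = 60 * 60 * 24
with patch('time.time', return_value=refdate + seconds_in_day):
    assert time.time() == refdate + seconds_in_day  # clock frozen here
assert time.time() > refdate + seconds_in_day       # real clock restored

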
def test_ls_json(topdir, topurl):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'))  # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir', 'subgit'))  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')  # commit to git to init git repo
    git.commit()
    # annex.add doesn't add submodule, so using ds.add
    ds.add(opj('dir', 'subgit'))  # add the non-dataset git repo to annex
    ds.add('dir')  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit'))
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))

    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file('fromweb', topurl + '/noteventhere',
                            options=['--relaxed'])

    for all_ in [True, False]:
        # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)
                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(topdir, json=state, all_=all_,
                               recursive=recursive)
                ok_startswith(dsj['tags'], '1-')
                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and
                # deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if it's updated in its nodes sublist too (used by
                # web-ui json); regression test
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of the subdsfile.txt node
                subds = [item for item in dsj['nodes']
                         if item['name'] == 'subdsfile.txt'][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                    # and not in topds
                    assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset metadata
                # already created to verify sub-dataset metadata being picked
                # up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [item for item in dsj['nodes']
                             if item['name'] == 'subdsfile.txt'][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

                assert_equal(topds_nodes['fromweb']['size']['total'],
                             UNKNOWN_SIZE)