def check_compress_file(ext, annex, path, name): # we base the archive name on the filename, in order to also # be able to properly test compressors where the corresponding # archive format has no capability of storing a filename # (i.e. where the archive name itself determines the filename # of the decompressed file, like .xz) archive = op.join(name, _filename + ext) compress_files([_filename], archive, path=path) assert_true(op.exists(archive)) if annex: # It should work even when file is annexed and is a symlink to the # key from datalad.support.annexrepo import AnnexRepo repo = AnnexRepo(path, init=True) repo.add(_filename) repo.commit(files=[_filename], msg="commit") dir_extracted = name + "_extracted" try: decompress_file(archive, dir_extracted) except MissingExternalDependency as exc: raise SkipTest(exc_str(exc)) _filepath = op.join(dir_extracted, _filename) ok_file_has_content(_filepath, 'content')
def test_add_archive_use_archive_dir(repo_path): repo = AnnexRepo(repo_path, create=True) with chpwd(repo_path): # Let's add first archive to the repo with default setting archive_path = opj('4u', '1.tar.gz') # check it gives informative error if archive is not already added with assert_raises(RuntimeError) as cmr: add_archive_content(archive_path) assert_re_in( "You should run ['\"]datalad save 4u\\\\1\\.tar\\.gz['\"] first" if on_windows else "You should run ['\"]datalad save 4u/1\\.tar\\.gz['\"] first", str(cmr.exception), match=False) with swallow_outputs(): repo.add(archive_path) repo.commit("added 1.tar.gz") ok_archives_caches(repo.path, 0) add_archive_content(archive_path, strip_leading_dirs=True, use_current_dir=True) ok_(not exists(opj('4u', '1 f.txt'))) ok_file_under_git(repo.path, '1 f.txt', annexed=True) ok_archives_caches(repo.path, 0) # and now let's extract under archive dir add_archive_content(archive_path, strip_leading_dirs=True) ok_file_under_git(repo.path, opj('4u', '1 f.txt'), annexed=True) ok_archives_caches(repo.path, 0) add_archive_content(opj('4u', 'sub.tar.gz')) ok_file_under_git(repo.path, opj('4u', 'sub', '2 f.txt'), annexed=True) ok_archives_caches(repo.path, 0)
def test_proxying_open_testrepobased(repo): TEST_CONTENT = "content to be annex-addurl'd" fname = 'test-annex.dat' fpath = opj(repo, fname) assert_raises(IOError, open, fpath) aio = AutomagicIO(activate=True) try: with swallow_outputs(): # now we should be able just to request to open this file with open(fpath) as f: content = f.read() eq_(content, TEST_CONTENT) finally: aio.deactivate() # and now that we have fetched it, nothing should forbid us to open it again with open(fpath) as f: eq_(f.read(), TEST_CONTENT) annex = AnnexRepo(repo, create=False) # Let's create another file deeper under the directory with the same content # so it would point to the same key, which we would drop and repeat the drill fpath2 = opj(repo, 'd1', 'd2', 'test2.dat') os.makedirs(dirname(fpath2)) with open(fpath2, 'w') as f: f.write(content) annex.add(fpath2) annex.drop(fpath2) annex.commit("added and dropped") assert_raises(IOError, open, fpath2) # Let's use context manager form with AutomagicIO() as aio: ok_(isinstance(aio, AutomagicIO)) ok_(aio.active) # swallowing output would cause trouble while testing with # DATALAD_ASSERT_NO_OPEN_FILES mode on. Reason is not 100% clear # on why underlying git-annex process would be dumping to stdout or err #with swallow_outputs(): # now we should be able just to request to open this file with open(fpath2) as f: content = f.read() eq_(content, TEST_CONTENT) annex.drop(fpath2) assert_raises(IOError, open, fpath2) # Let's use relative path with chpwd(opj(repo, 'd1')): # Let's use context manager form with AutomagicIO() as aio, \ swallow_outputs(), \ open(opj('d2', 'test2.dat')) as f: content = f.read() eq_(content, TEST_CONTENT)
def test_add_archive_content_zip(repo_path): repo = AnnexRepo(repo_path, create=True) with chpwd(repo_path): with swallow_outputs(): repo.add(["1.zip"]) repo.commit("add 1.zip") add_archive_content("1.zip") ok_file_under_git(opj(repo.path, "1", "foo"), annexed=True) ok_file_under_git(opj("1", "dir", "bar"), annexed=True) ok_archives_caches(repo.path, 0)
def test_add_archive_content_absolute_path(path): repo = AnnexRepo(opj(path, "ds"), create=True) repo.add(["1.tar.gz"]) repo.commit("1.tar.gz") abs_tar_gz = opj(path, "ds", "1.tar.gz") add_archive_content(abs_tar_gz, annex=repo) ok_file_under_git(opj(path, "ds", "1", "foo"), annexed=True) commit_msg = repo.format_commit("%B") # The commit message uses relative paths. assert_not_in(abs_tar_gz, commit_msg) assert_in("1.tar.gz", commit_msg) with assert_raises(FileNotInRepositoryError): add_archive_content(opj(path, "notds", "2.tar.gz"), annex=repo)
def test_add_archive_content_strip_leading(path_orig, url, repo_path): with chpwd(repo_path): repo = AnnexRepo(repo_path, create=True) # Let's add first archive to the repo so we could test with swallow_outputs(): repo.add_urls([opj(url, '1.tar.gz')], options=["--pathdepth", "-1"]) repo.commit("added 1.tar.gz") add_archive_content('1.tar.gz', strip_leading_dirs=True) ok_(not exists('1')) ok_file_under_git(repo.path, '1 f.txt', annexed=True) ok_file_under_git('d', '1d', annexed=True) ok_archives_caches(repo.path, 0)
def test_get_contentlocation(tdir): repo = AnnexRepo(tdir, create=True, init=True) repo.add('file.dat') repo.commit('added file.dat') key = repo.get_file_key('file.dat') cr = AnnexCustomRemote(tdir) key_path = cr.get_contentlocation(key, absolute=False) assert not isabs(key_path) key_path_abs = cr.get_contentlocation(key, absolute=True) assert isabs(key_path_abs) assert cr._contentlocations == {key: key_path} repo.drop('file.dat', options=['--force']) assert not cr.get_contentlocation(key, absolute=True)
def test_get_contentlocation(tdir=None): repo = AnnexRepo(tdir, create=True, init=True) repo.add('file.dat') repo.commit('added file.dat') # TODO contentlocation would come with eval_availability=True key = repo.get_file_annexinfo('file.dat')['key'] cr = ArchiveAnnexCustomRemote(None, path=tdir) key_path = cr.get_contentlocation(key, absolute=False) assert not isabs(key_path) key_path_abs = cr.get_contentlocation(key, absolute=True) assert isabs(key_path_abs) assert cr._contentlocations == {key: key_path} repo.drop('file.dat', options=['--force']) assert not cr.get_contentlocation(key, absolute=True)
def test_add_archive_dirs(path_orig, url, repo_path): # change to repo_path with chpwd(repo_path): # create annex repo repo = AnnexRepo(repo_path, create=True) # add archive to the repo so we could test with swallow_outputs(): repo.add_urls([opj(url, '1.tar.gz')], options=["--pathdepth", "-1"]) repo.commit("added 1.tar.gz") # test with excludes and annex options add_archive_content( '1.tar.gz', existing='archive-suffix', # Since inconsistent and seems in many cases no leading dirs to strip, keep them as provided strip_leading_dirs=True, delete=True, leading_dirs_consider=['crcns.*', '1'], leading_dirs_depth=2, use_current_dir=False, exclude='.*__MACOSX.*') # some junk penetrates if external_versions['cmd:annex'] >= '6.20170208': # should have fixed remotes eq_( repo.get_description( uuid=DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE] ), '[%s]' % ARCHIVES_SPECIAL_REMOTE) all_files = sorted(find_files('.')) target_files = { './CR24A/behaving1/1 f.txt', './CR24C/behaving3/3 f.txt', './CR24D/behaving2/2 f.txt', } eq_(set(all_files), target_files) # regression test: the subdir in MACOSX wasn't excluded and its name was getting stripped by leading_dir_len assert_false(exists( '__MACOSX')) # if stripping and exclude didn't work this fails assert_false( exists('c-1_data') ) # if exclude doesn't work then name of subdir gets stripped by leading_dir_len assert_false( exists('CR24B') ) # if exclude doesn't work but everything else works this fails
def test_check_dates(path): skip_if_no_module("dateutil") ref_ts = 1218182889 # Fri, 08 Aug 2008 04:08:09 -0400 refdate = "@{}".format(ref_ts) repo = os.path.join(path, "repo") with set_date(ref_ts + 5000): ar = AnnexRepo(repo) ar.add(".") ar.commit() # The standard renderer outputs json. with swallow_outputs() as cmo: # Set level to WARNING to avoid the progress bar when # DATALAD_TESTS_UI_BACKEND=console. with swallow_logs(new_level=logging.WARNING): check_dates([repo], reference_date=refdate, return_type="list") assert_in("report", json.loads(cmo.out).keys()) # We find the newer objects. newer = call([path], reference_date=refdate) eq_(len(newer), 1) ok_(newer[0]["report"]["objects"]) # There are no older objects to find. older = call([repo], reference_date=refdate, older=True) assert_false(older[0]["report"]["objects"]) # We can pass the date in RFC 2822 format. assert_dict_equal( newer[0], call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0]) # paths=None defaults to the current directory. with chpwd(path): assert_dict_equal( newer[0]["report"], call(paths=None, reference_date=refdate)[0]["report"]) # Only commit type is present when annex='none'. newer_noannex = call([path], reference_date=refdate, annex="none") for entry in newer_noannex[0]["report"]["objects"].values(): ok_(entry["type"] == "commit")
def test_check_dates(path=None): skip_if_no_module("dateutil") ref_ts = 1218182889 # Fri, 08 Aug 2008 04:08:09 -0400 refdate = "@{}".format(ref_ts) repo = os.path.join(path, "repo") with set_date(ref_ts + 5000): ar = AnnexRepo(repo) ar.add(".") ar.commit() # The standard renderer outputs json. with swallow_outputs() as cmo: # Set level to WARNING to avoid the progress bar when # DATALAD_TESTS_UI_BACKEND=console. with swallow_logs(new_level=logging.WARNING): check_dates([repo], reference_date=refdate, return_type="list") assert_in("report", json.loads(cmo.out).keys()) # We find the newer objects. newer = call([path], reference_date=refdate) eq_(len(newer), 1) ok_(newer[0]["report"]["objects"]) # There are no older objects to find. older = call([repo], reference_date=refdate, older=True) assert_false(older[0]["report"]["objects"]) # We can pass the date in RFC 2822 format. assert_dict_equal( newer[0], call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0]) # paths=None defaults to the current directory. with chpwd(path): assert_dict_equal( newer[0]["report"], call(paths=None, reference_date=refdate)[0]["report"]) # Only commit type is present when annex='none'. newer_noannex = call([path], reference_date=refdate, annex="none") for entry in newer_noannex[0]["report"]["objects"].values(): ok_(entry["type"] == "commit")
def put_file_under_git(path, filename=None, content=None, annexed=False): """Place file under git/annex and return used Repo """ annex, file_repo_path, filename, path, repo = _prep_file_under_git( path, filename) if content is None: content = "" with open(opj(repo.path, file_repo_path), 'w') as f_: f_.write(content) if annexed: if not isinstance(repo, AnnexRepo): repo = AnnexRepo(repo.path) repo.add(file_repo_path) else: repo.add(file_repo_path, git=True) repo.commit(_datalad_msg=True) ok_file_under_git(repo.path, file_repo_path, annexed) return repo
def test_interactions(tdir): # Just a placeholder since constructor expects a repo repo = AnnexRepo(tdir, create=True, init=True) repo.add('file.dat') repo.commit('added file.dat') for scenario in BASE_INTERACTION_SCENARIOS + [ [ ('GETAVAILABILITY', 'AVAILABILITY %s' % DEFAULT_AVAILABILITY), ('GETCOST', 'COST %d' % DEFAULT_COST), ('TRANSFER RETRIEVE somekey somefile', re.compile('TRANSFER-FAILURE RETRIEVE somekey NotImplementedError().*')), ], [ # by default we do not require any fancy init # no urls supported by default ('CLAIMURL http://example.com', 'CLAIMURL-FAILURE'), # we know that is just a single option, url, is expected so full # one would be passed ('CLAIMURL http://example.com roguearg', 'CLAIMURL-FAILURE'), ] ]: check_interaction_scenario(AnnexCustomRemote, tdir, scenario)
def check_compress_file(ext, annex, path, name): archive = name + ext compress_files([_filename], archive, path=path) assert_true(exists(archive)) if annex: # It should work even when file is annexed and is a symlink to the # key from datalad.support.annexrepo import AnnexRepo repo = AnnexRepo(path, init=True) repo.add(_filename) repo.commit(files=[_filename], msg="commit") dir_extracted = name + "_extracted" try: decompress_file(archive, dir_extracted) except MissingExternalDependency as exc: raise SkipTest(exc_str(exc)) _filepath = op.join(dir_extracted, _filename) import glob print(dir_extracted) print(glob.glob(dir_extracted + '/*')) ok_file_has_content(_filepath, 'content')
def test_add_archive_content(path_orig, url, repo_path): with chpwd(repo_path): # TODO we need to be able to pass path into add_archive_content # We could mock but I mean for the API assert_raises(RuntimeError, add_archive_content, "nonexisting.tar.gz") # no repo yet repo = AnnexRepo(repo_path, create=True) assert_raises(ValueError, add_archive_content, "nonexisting.tar.gz") # we can't add a file from outside the repo ATM assert_raises(FileNotInRepositoryError, add_archive_content, opj(path_orig, '1.tar.gz')) # Let's add first archive to the repo so we could test with swallow_outputs(): repo.add_urls([opj(url, '1.tar.gz')], options=["--pathdepth", "-1"]) for s in range(1, 5): repo.add_urls([opj(url, '%du/1.tar.gz' % s)], options=["--pathdepth", "-2"]) repo.commit("added 1.tar.gz") key_1tar = repo.get_file_key( '1.tar.gz') # will be used in the test later def d1_basic_checks(): ok_(exists('1')) ok_file_under_git('1', '1 f.txt', annexed=True) ok_file_under_git(opj('1', 'd', '1d'), annexed=True) ok_archives_caches(repo_path, 0) # and by default it just does it, everything goes to annex repo_ = add_archive_content('1.tar.gz') eq_(repo.path, repo_.path) d1_basic_checks() # If ran again, should proceed just fine since the content is the same so no changes would be made really add_archive_content('1.tar.gz') # But that other one carries updated file, so should fail due to overwrite with assert_raises(RuntimeError) as cme: add_archive_content(opj('1u', '1.tar.gz'), use_current_dir=True) # TODO: somewhat not precise since we have two possible "already exists" # -- in caching and overwrite check assert_in("already exists", str(cme.exception)) # but should do fine if overrides are allowed add_archive_content(opj('1u', '1.tar.gz'), existing='overwrite', use_current_dir=True) add_archive_content(opj('2u', '1.tar.gz'), existing='archive-suffix', use_current_dir=True) add_archive_content(opj('3u', '1.tar.gz'), existing='archive-suffix', use_current_dir=True) add_archive_content(opj('4u', '1.tar.gz'), existing='archive-suffix', use_current_dir=True) # rudimentary test assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))), ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt']) whereis = repo.whereis(glob(opj(repo_path, '1', '1*'))) # they all must be the same assert (all([x == whereis[0] for x in whereis[1:]])) # and we should be able to reference it while under subdirectory subdir = opj(repo_path, 'subdir') with chpwd(subdir, mkdir=True): add_archive_content(opj(pardir, '1.tar.gz'), use_current_dir=True) d1_basic_checks() # or we could keep relative path and also demand to keep the archive prefix # while extracting under original (annex root) dir add_archive_content(opj(pardir, '1.tar.gz'), add_archive_leading_dir=True) with chpwd(opj(repo_path, '1')): d1_basic_checks() with chpwd(repo_path): # test with excludes and renames and annex options add_archive_content('1.tar.gz', exclude=['d'], rename=['/ /_', '/^1/2'], annex_options="-c annex.largefiles=exclude=*.txt", delete=True) # no conflicts since new name ok_file_under_git('2', '1_f.txt', annexed=False) assert_false(exists(opj('2', 'd'))) assert_false(exists('1.tar.gz')) # delete was in effect # now test ability to extract within subdir with chpwd(opj(repo_path, 'd1'), mkdir=True): # Let's add first archive to the repo so we could test # named the same way but different content with swallow_outputs(): repo.add_urls([opj(url, 'd1', '1.tar.gz')], options=["--pathdepth", "-1"], cwd=getpwd()) # invoke under current subdir repo.commit("added 
1.tar.gz in d1") def d2_basic_checks(): ok_(exists('1')) ok_file_under_git('1', '2 f.txt', annexed=True) ok_file_under_git(opj('1', 'd2', '2d'), annexed=True) ok_archives_caches(repo.path, 0) add_archive_content('1.tar.gz') d2_basic_checks() # in manual tests ran into the situation of inability to obtain on a single run # a file from an archive which was coming from a dropped key. I thought it was # tested in custom remote tests, but I guess not sufficiently well enough repo.drop(opj('1', '1 f.txt')) # should be all kosher repo.get(opj('1', '1 f.txt')) ok_archives_caches(repo.path, 1, persistent=True) ok_archives_caches(repo.path, 0, persistent=False) repo.drop(opj('1', '1 f.txt')) # should be all kosher repo.drop(key_1tar, key=True) # is available from the URL -- should be kosher repo.get(opj('1', '1 f.txt')) # that what managed to not work # TODO: check if persistent archive is there for the 1.tar.gz # We should be able to drop everything since available online with swallow_outputs(): clean(dataset=repo.path) repo.drop(key_1tar, key=True) # is available from the URL -- should be kosher repo.drop(opj('1', '1 f.txt')) # should be all kosher repo.get(opj('1', '1 f.txt')) # and should be able to get it again # bug was that dropping didn't work since archive was dropped first repo.call_annex(["drop", "--all"]) # verify that we can't drop a file if archive key was dropped and online archive was removed or changed size! ;) repo.get(key_1tar, key=True) unlink(opj(path_orig, '1.tar.gz')) with assert_raises(CommandError) as e: repo.drop(key_1tar, key=True) assert_equal(e.kwargs['stdout_json'][0]['success'], False) assert_result_values_cond( e.kwargs['stdout_json'], 'note', lambda x: '(Use --force to override this check, or adjust numcopies.)' in x) assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))
def _test_AnnexDB(cls, path): filepath1 = opj(path, 'file1.txt') filep2 = opj('d', 'file2.txt') filepath2 = opj(path, filep2) annex = AnnexRepo(path, create=True) # PhysicalFileStatusesDB relies on information in annex so files # must be committed first annex.add('file1.txt') annex.commit("initial commit") db = cls(annex=annex) def set_db_status_from_file(fpath): """To test JsonFileStatusesDB, we need to keep updating the status stored""" if cls is JsonFileStatusesDB: # we need first to set the status db.set(fpath, db._get_fileattributes_status(fpath)) set_db_status_from_file('file1.txt') status1 = db.get('file1.txt') assert(status1.size) status1_ = db.get('file1.txt') assert_equal(status1, status1_) assert_false(db.is_different('file1.txt', status1)) assert_false(db.is_different('file1.txt', status1_)) # even if we add a filename specification status1_.filename = 'file1.txt' assert_false(db.is_different('file1.txt', status1_)) status1_.filename = 'different.txt' assert_false(db.is_different('file1.txt', status1_)) os.unlink(filepath1) # under annex- - we don't have unlock yet and thus can't inplace augment with open(filepath1, 'a') as f: f.write('+') # Note/TODO: fixed (realpath) path should go. Inner logic has to adapt to # dataset singletons, that don't resolve symlinks set_db_status_from_file(realpath(filepath1)) assert(db.is_different('file1.txt', status1)) # we should be able to get status of files out and inside of git set_db_status_from_file('2git') status_git1 = db.get('2git') annex.add('2git', git=True) annex.commit("added 2git") assert_equal(db.get('2git'), status_git1) # we should be able to get status of files with relative path to top dir and abs path set_db_status_from_file(filep2) status2 = db.get(filep2) # Note/TODO: fixed (realpath) path should go. Inner logic has to adapt to # dataset singletons, that don't resolve symlinks status2_full = db.get(realpath(filepath2)) assert_equal(status2, status2_full) # TODO? what about relative to curdir?? #with chpwd(opj(path, 'd')): # status2_dir = db.get('./file2.txt') # assert_equal(status2, status2_dir) # since we asked about each file we added to DB/annex -- none should be # known as "deleted" assert_equal(db.get_obsolete(), []) # Possibly save its state for persistent storage #import pdb; pdb.set_trace() db.save() # but, if we create another DB which wasn't queried yet db2 = cls(annex=annex) # all files should be returned # TODO: fixed by using realpath, but there should be a cleaner # adaption to dataset singletons, that are NOT resolving symlinks, while the # underlying repos do! assert_equal( set(db2.get_obsolete()), {opj(realpath(path), p) for p in ['file1.txt', filep2, '2git']}) # and if we query one, it shouldn't be listed as deleted any more status2_ = db2.get(filep2) assert_equal(status2, status2_) # TODO: fixed by using realpath, but there should be a cleaner # adaption to dataset singletons, that are NOT resolving symlinks, while the # underlying repos do! assert_equal( set(db2.get_obsolete()), {opj(realpath(path), p) for p in ['file1.txt', '2git']}) # and if we queried with ./ prefix, should still work db2.get(curdir + sep + 'file1.txt') # TODO: fixed by using realpath, but there should be a cleaner # adaption to dataset singletons, that are NOT resolving symlinks, while the # underlying repos do! 
assert_equal( set(db2.get_obsolete()), {opj(realpath(path), p) for p in ['2git']}) # and if we queried with a full path, should still work # TODO: fixed by using realpath, but there should be a cleaner # adaption to dataset singletons, that are NOT resolving symlinks, while the # underlying repos do! db2.get(opj(realpath(path), '2git')) assert_equal(db2.get_obsolete(), [])
def test_ls_json(topdir, topurl): annex = AnnexRepo(topdir, create=True) ds = Dataset(topdir) # create some file and commit it with open(opj(ds.path, 'subdsfile.txt'), 'w') as f: f.write('123') ds.add(path='subdsfile.txt') ds.save("Hello!", version_tag=1) # add a subdataset ds.install('subds', source=topdir) subdirds = ds.create(_path_('dir/subds2'), force=True) subdirds.add('file') git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True) # create git repo git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt')) # commit to git to init git repo git.commit() annex.add(opj(topdir, 'dir', 'subgit')) # add the non-dataset git repo to annex annex.add(opj(topdir, 'dir')) # add to annex (links) annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force']) # broken-link annex.commit() git.add('fgit.txt') # commit to git to init git repo git.commit() # annex.add doesn't add submodule, so using ds.add ds.add(opj('dir', 'subgit')) # add the non-dataset git repo to annex ds.add('dir') # add to annex (links) ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False) # broken-link # register "external" submodule by installing and uninstalling it ext_url = topurl + '/dir/subgit/.git' # need to make it installable via http Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit')) ds.install(opj('dir', 'subgit_ext'), source=ext_url) ds.uninstall(opj('dir', 'subgit_ext')) meta_dir = opj('.git', 'datalad', 'metadata') def get_metahash(*path): if not path: path = ['/'] return hashlib.md5(opj(*path).encode('utf-8')).hexdigest() def get_metapath(dspath, *path): return _path_(dspath, meta_dir, get_metahash(*path)) def get_meta(dspath, *path): with open(get_metapath(dspath, *path)) as f: return js.load(f) # Let's see that there is no crash if one of the files is available only # in relaxed URL mode, so no size could be picked up ds.repo.add_url_to_file( 'fromweb', topurl + '/noteventhere', options=['--relaxed']) for all_ in [True, False]: # recurse directories for recursive in [True, False]: for state in ['file', 'delete']: # subdataset should have its json created and deleted when # all=True else not subds_metapath = get_metapath(opj(topdir, 'subds')) exists_prior = exists(subds_metapath) #with swallow_logs(), swallow_outputs(): dsj = _ls_json( topdir, json=state, all_=all_, recursive=recursive ) ok_startswith(dsj['tags'], '1-') exists_post = exists(subds_metapath) # print("%s %s -> %s" % (state, exists_prior, exists_post)) assert_equal(exists_post, (state == 'file' and recursive)) # root should have its json file created and deleted in all cases ds_metapath = get_metapath(topdir) assert_equal(exists(ds_metapath), state == 'file') # children should have their metadata json's created and deleted only when recursive=True child_metapath = get_metapath(topdir, 'dir', 'subdir') assert_equal(exists(child_metapath), (state == 'file' and all_)) # ignored directories should not have json files created in any case for subdir in [('.hidden',), ('dir', 'subgit')]: assert_false(exists(get_metapath(topdir, *subdir))) # check if its updated in its nodes sublist too. used by web-ui json. 
regression test assert_equal(dsj['nodes'][0]['size']['total'], dsj['size']['total']) # check size of subdataset subds = [item for item in dsj['nodes'] if item['name'] == ('subdsfile.txt' or 'subds')][0] assert_equal(subds['size']['total'], '3 Bytes') # dir/subds2 must not be listed among nodes of the top dataset: topds_nodes = {x['name']: x for x in dsj['nodes']} assert_in('subds', topds_nodes) # XXX # # condition here is a bit a guesswork by yoh later on # # TODO: here and below clear destiny/interaction of all_ and recursive # assert_equal(dsj['size']['total'], # '15 Bytes' if (recursive and all_) else # ('9 Bytes' if (recursive or all_) else '3 Bytes') # ) # https://github.com/datalad/datalad/issues/1674 if state == 'file' and all_: dirj = get_meta(topdir, 'dir') dir_nodes = {x['name']: x for x in dirj['nodes']} # it should be present in the subdir meta assert_in('subds2', dir_nodes) assert_not_in('url_external', dir_nodes['subds2']) assert_in('subgit_ext', dir_nodes) assert_equal(dir_nodes['subgit_ext']['url'], ext_url) # and not in topds assert_not_in('subds2', topds_nodes) # run non-recursive dataset traversal after subdataset metadata already created # to verify sub-dataset metadata being picked up from its metadata file in such cases if state == 'file' and recursive and not all_: dsj = _ls_json(topdir, json='file', all_=False) subds = [ item for item in dsj['nodes'] if item['name'] == ('subdsfile.txt' or 'subds') ][0] assert_equal(subds['size']['total'], '3 Bytes') assert_equal( topds_nodes['fromweb']['size']['total'], UNKNOWN_SIZE )
def test_fs_traverse(topdir): # setup temp directory tree for testing annex = AnnexRepo(topdir) AnnexRepo(opj(topdir, 'annexdir'), create=True) GitRepo(opj(topdir, 'gitdir'), create=True) GitRepo(opj(topdir, 'dir', 'subgit'), create=True) annex.add(opj(topdir, 'dir')) annex.commit() annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force']) # traverse file system in recursive and non-recursive modes for recursive in [True, False]: # test fs_traverse in display mode with swallow_logs(new_level=logging.INFO) as log, swallow_outputs() as cmo: repo = AnnexRepo(topdir) fs = fs_traverse(topdir, repo, recurse_directories=recursive, json='display') if recursive: # fs_traverse logs should contain all not ignored subdirectories for subdir in [opj(topdir, 'dir'), opj(topdir, 'dir', 'subdir')]: assert_in('Directory: ' + subdir, log.out) # fs_traverse stdout contains subdirectory assert_in(('file2.txt' and 'dir'), cmo.out) # extract info of the top-level child directory child = [item for item in fs['nodes'] if item['name'] == 'dir'][0] # size of dir type child in non-recursive modes should be 0 Bytes(default) as # dir type child's size currently has no metadata file for traverser to pick its size from # and would require a recursive traversal w/ write to child metadata file mode assert_equal(child['size']['total'], {True: '6 Bytes', False: '0 Bytes'}[recursive]) repo.precommit() # to possibly stop batch process occupying the stdout for recursive in [True, False]: # run fs_traverse in write to json 'file' mode repo = AnnexRepo(topdir) fs = fs_traverse(topdir, repo, recurse_directories=recursive, json='file') # fs_traverse should return a dictionary assert_equal(isinstance(fs, dict), True) # not including git and annex folders assert_equal([item for item in fs['nodes'] if ('gitdir' or 'annexdir') == item['name']], []) # extract info of the top-level child directory child = [item for item in fs['nodes'] if item['name'] == 'dir'][0] # verify node type assert_equal(child['type'], 'dir') # same node size on running fs_traversal in recursive followed by non-recursive mode # verifies child's metadata file being used to find its size # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size) assert_equal(child['size']['total'], '6 Bytes') # verify subdirectory traversal if run in recursive mode # In current RF 'nodes' are stripped away during recursive traversal # for now... later we might reincarnate them "differently" # TODO! if False: # recursive: # sub-dictionary should not include git and hidden directory info assert_equal([item for item in child['nodes'] if ('subgit' or '.fgit') == item['name']], []) # extract subdirectory dictionary, else fail subchild = [subitem for subitem in child["nodes"] if subitem['name'] == 'subdir'][0] # extract info of file1.txts, else fail link = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file1.txt'][0] # verify node's sizes and type assert_equal(link['size']['total'], '3 Bytes') assert_equal(link['size']['ondisk'], link['size']['total']) assert_equal(link['type'], 'link') # extract info of file2.txt, else fail brokenlink = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file2.txt'][0] # verify node's sizes and type assert_equal(brokenlink['type'], 'link-broken') assert_equal(brokenlink['size']['ondisk'], '0 Bytes') assert_equal(brokenlink['size']['total'], '3 Bytes')
def test_fs_traverse(topdir): # setup temp directory tree for testing annex = AnnexRepo(topdir) AnnexRepo(opj(topdir, 'annexdir'), create=True) GitRepo(opj(topdir, 'gitdir'), create=True) GitRepo(opj(topdir, 'dir', 'subgit'), create=True) annex.add(opj(topdir, 'dir')) annex.commit() annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force']) # traverse file system in recursive and non-recursive modes for recursive in [True, False]: # test fs_traverse in display mode with swallow_logs( new_level=logging.INFO) as log, swallow_outputs() as cmo: repo = AnnexRepo(topdir) fs = fs_traverse(topdir, repo, recurse_directories=recursive, json='display') if recursive: # fs_traverse logs should contain all not ignored subdirectories for subdir in [ opj(topdir, 'dir'), opj(topdir, 'dir', 'subdir') ]: assert_in('Directory: ' + subdir, log.out) # fs_traverse stdout contains subdirectory assert_in(('file2.txt' and 'dir'), cmo.out) # extract info of the top-level child directory child = [item for item in fs['nodes'] if item['name'] == 'dir'][0] # size of dir type child in non-recursive modes should be 0 Bytes(default) as # dir type child's size currently has no metadata file for traverser to pick its size from # and would require a recursive traversal w/ write to child metadata file mode assert_equal(child['size']['total'], { True: '6 Bytes', False: '0 Bytes' }[recursive]) repo.precommit( ) # to possibly stop batch process occupying the stdout for recursive in [True, False]: # run fs_traverse in write to json 'file' mode repo = AnnexRepo(topdir) fs = fs_traverse(topdir, repo, recurse_directories=recursive, json='file') # fs_traverse should return a dictionary assert_equal(isinstance(fs, dict), True) # not including git and annex folders assert_equal([ item for item in fs['nodes'] if ('gitdir' or 'annexdir') == item['name'] ], []) # extract info of the top-level child directory child = [item for item in fs['nodes'] if item['name'] == 'dir'][0] # verify node type assert_equal(child['type'], 'dir') # same node size on running fs_traversal in recursive followed by non-recursive mode # verifies child's metadata file being used to find its size # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size) assert_equal(child['size']['total'], '6 Bytes') # verify subdirectory traversal if run in recursive mode # In current RF 'nodes' are stripped away during recursive traversal # for now... later we might reincarnate them "differently" # TODO! if False: # recursive: # sub-dictionary should not include git and hidden directory info assert_equal([ item for item in child['nodes'] if ('subgit' or '.fgit') == item['name'] ], []) # extract subdirectory dictionary, else fail subchild = [ subitem for subitem in child["nodes"] if subitem['name'] == 'subdir' ][0] # extract info of file1.txts, else fail link = [ subnode for subnode in subchild["nodes"] if subnode['name'] == 'file1.txt' ][0] # verify node's sizes and type assert_equal(link['size']['total'], '3 Bytes') assert_equal(link['size']['ondisk'], link['size']['total']) assert_equal(link['type'], 'link') # extract info of file2.txt, else fail brokenlink = [ subnode for subnode in subchild["nodes"] if subnode['name'] == 'file2.txt' ][0] # verify node's sizes and type assert_equal(brokenlink['type'], 'link-broken') assert_equal(brokenlink['size']['ondisk'], '0 Bytes') assert_equal(brokenlink['size']['total'], '3 Bytes')
def test_openfmri_pipeline1(ind, topurl, outd, clonedir): index_html = opj(ind, 'ds666', 'index.html') list( initiate_dataset(template="openfmri", dataset_name='dataladtest-ds666', path=outd, data_fields=['dataset'])({ 'dataset': 'ds666' })) repo = AnnexRepo(outd, create=False) # to be used in the checks # Since datalad 0.11.2 all .metadata/objects go under annex. # Here we have a test where we force drop all annexed content, # to mitigate that let's place all metadata under git dotdatalad_attributes_file = opj('.datalad', '.gitattributes') repo.set_gitattributes([('metadata/objects/**', { 'annex.largefiles': '(nothing)' })], dotdatalad_attributes_file) # --amend so we do not cause change in # of commits below repo.commit("gitattributes", files=dotdatalad_attributes_file, options=['--amend']) with chpwd(outd): pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl) out = run_pipeline(pipeline) eq_(len(out), 1) # Inspect the tree -- that we have all the branches branches = {'master', 'incoming', 'incoming-processed', 'git-annex'} eq_(set(repo.get_branches()), branches) # We do not have custom changes in master yet, so it just follows incoming-processed atm # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed')) # Since we did initiate_dataset -- now we have separate master! assert_not_equal(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed')) # and that one is different from incoming assert_not_equal(repo.get_hexsha('incoming'), repo.get_hexsha('incoming-processed')) t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat') ok_file_has_content(t1w_fpath_nover, "mighty load in old format") # # And now versioned files were specified! # add_to_index(index_html, content=_versioned_files) with chpwd(outd): pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl) out = run_pipeline(pipeline) eq_(len(out), 1) ok_( not exists(t1w_fpath_nover), "%s file should no longer be there if unversioned files get removed correctly" % t1w_fpath_nover) repo = AnnexRepo(outd, create=False) # to be used in the checks # Inspect the tree -- that we have all the branches branches = {'master', 'incoming', 'incoming-processed', 'git-annex'} eq_(set(repo.get_branches()), branches) # We do not have custom changes in master yet, so it just follows incoming-processed atm # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed')) # Since we did initiate_dataset -- now we have separate master! 
assert_not_equal(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed')) # and that one is different from incoming assert_not_equal(repo.get_hexsha('incoming'), repo.get_hexsha('incoming-processed')) # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches} commits_l = { b: list(_get_branch_commits(repo, b, limit='left-only')) for b in branches } # all commits out there: # dataset init, crawler init # (2 commits) # + 3*(incoming, processed, merge) # + 3*aggregate-metadata update # - 1 since now that incoming starts with master, there is one less merge # In --incremental mode there is a side effect of absent now # 2*remove of obsolete metadata object files, # see https://github.com/datalad/datalad/issues/2772 # TODO inspect by knowledgeable person and re-enable #ncommits_master = len(commits_hexsha['master']) #assert_in(ncommits_master, [13, 14]) #assert_in(len(commits_l['master']), [8, 9]) # TODO inspect by knowledgeable person and re-enable #eq_(len(commits_hexsha['incoming']), ncommits_master - 8) #eq_(len(commits_l['incoming']), ncommits_master - 8) #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5) #eq_(len(commits_l['incoming-processed']), ncommits_master - 8) # Check tags for the versions eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1']) # +1 because original "release" was assumed to be 1.0.0 repo_tags = repo.get_tags() eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1']) # Ben: The tagged ones currently are the ones with the message # '[DATALAD] dataset aggregate metadata update\n': #eq_(repo_tags[0]['hexsha'], commits_l['master'][4]) # next to the last one #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0]) # the last one def hexsha(l): return l.__class__(x.hexsha for x in l) # TODO requires additional tooling to re-enable ## Verify that we have desired tree of merges #eq_(hexsha(commits_l['incoming-processed'][0].parents), (commits_l['incoming-processed'][1], # commits_l['incoming'][0])) #eq_(hexsha(commits_l['incoming-processed'][2].parents), (commits_l['incoming-processed'][3], # also in master # commits_l['incoming'][2],)) # ben: The following two comparisons are targeting these commits: # commit "Merge branch 'incoming-processed'\n" in commits_l['master'], # parents are: # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed'] # TODO requires additional tooling to re-enable #eq_(hexsha(commits_l['master'][1].parents), (commits_l['master'][2], # commits_l['incoming-processed'][0])) #eq_(hexsha(commits_l['master'][3].parents), (commits_l['master'][4], # commits_l['incoming-processed'][1])) with chpwd(outd): eq_(set(glob('*')), {'changelog.txt', 'sub-1'}) all_files = sorted(find_files('.')) t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat') ok_file_has_content(t1w_fpath, "mighty load 1.0.1") ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False) ok_file_under_git(t1w_fpath, annexed=True) try: # this is the new way from datalad.metadata.metadata import get_ds_aggregate_db_locations ds = Dataset('.') dbloc, objbase = get_ds_aggregate_db_locations(ds) dbloc = op.relpath(dbloc, start=ds.path) except ImportError: # this stopped working in 
early 2019 versions of datalad from datalad.metadata.metadata import agginfo_relpath dbloc = agginfo_relpath target_files = { './.datalad/config', './.datalad/crawl/crawl.cfg', # no more! # './.datalad/config.ttl', './.datalad/datalad.ttl', './.datalad/crawl/statuses/incoming.json', './.datalad/crawl/versions/incoming.json', './changelog.txt', './sub-1/anat/sub-1_T1w.dat', './sub-1/beh/responses.tsv', './' + dbloc, } target_incoming_files = { '.gitattributes', # we marked default backend right in the incoming # we now base 'incoming' on master branch, so we get all those as well '.datalad/.gitattributes', '.datalad/config', '.datalad/crawl/crawl.cfg', 'changelog.txt', 'ds666.tar.gz', 'ds666-beh_R1.0.1.tar.gz', 'ds666_R1.0.0.tar.gz', 'ds666_R1.0.1.tar.gz', 'ds666_R2.0.0.tar.gz', '.datalad/crawl/statuses/incoming.json', '.datalad/crawl/versions/incoming.json' } # Ben: metadata object files may differ in their names containing some checksum-ish shit ... # TODO: Check how those names are constructed and may be at least count the number of created object files in addition to that comparison eq_( set([ f for f in all_files if not f.startswith('./.datalad/metadata/objects/') ]), target_files) # check that -beh was committed in 2nd commit in incoming, not the first one assert_not_in('ds666-beh_R1.0.1.tar.gz', repo.get_files(commits_l['incoming'][-1])) assert_in('ds666-beh_R1.0.1.tar.gz', repo.get_files(commits_l['incoming'][0])) # rerun pipeline -- make sure we are on the same in all branches! with chpwd(outd): out = run_pipeline(pipeline) eq_(len(out), 1) commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches} eq_(commits_hexsha, commits_hexsha_) # i.e. nothing new # actually we do manage to add_git 1 (README) since it is generated committed directly to git # BUT now fixed -- if not committed (was the same), should be marked as skipped # Nothing was committed so stats leaked all the way up eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5)) eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total()) # rerun pipeline when new content is available # add new revision, rerun pipeline and check that stuff was processed/added correctly add_to_index( index_html, content= '<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>') with chpwd(outd): out = run_pipeline(pipeline) all_files_updated = sorted(find_files('.')) eq_(len(out), 1) assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats()) # there is no overlays ATM, so behav would be gone since no 2.0.0 for it! target_files.remove('./sub-1/beh/responses.tsv') # Ben: metadata object files may differ in their names containing some checksum-ish shit ... # TODO: Check how those names are constructed and may be at least count the number of created object files in addition to that comparison eq_( set([ f for f in all_files_updated if not f.startswith('./.datalad/metadata/objects/') ]), target_files) # new instance so it re-reads git stuff etc # repo = AnnexRepo(outd, create=False) # to be used in the checks commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches} commits_l_ = { b: list(_get_branch_commits(repo, b, limit='left-only')) for b in branches } assert_not_equal(commits_hexsha, commits_hexsha_) eq_(out[0]['datalad_stats'], ActivityStats()) # commit happened so stats were consumed # numbers seems to be right total_stats = out[0]['datalad_stats'].get_total() # but for some reason downloaded_size fluctuates.... why? probably archiving...? 
total_stats.downloaded_size = 0 eq_( total_stats, ActivityStats( files=8, skipped=5, downloaded=1, renamed=1, urls=6, add_annex=2, # add_git=1, # README versions=['2.0.0'], merges=[['incoming', 'incoming-processed']])) check_dropall_get(repo) # Let's see if pipeline would remove files we stopped tracking remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>') with chpwd(outd): with swallow_logs(new_level=logging.WARNING) as cml: out = run_pipeline(pipeline) # since files get removed in incoming, but repreprocessed completely # incomming-processed and merged into master -- new commits will come # They shouldn't have any difference but still should be new commits assert_in("There is already a tag 2.0.0 in the repository", cml.out) eq_(len(out), 1) incoming_files = repo.get_files('incoming') target_incoming_files.remove('ds666_R1.0.0.tar.gz') eq_(set(incoming_files), target_incoming_files) commits_hexsha_removed = { b: list(_get_branch_commits(repo, b)) for b in branches } # our 'statuses' database should have recorded the change thus got a diff # which propagated through all branches for b in 'master', 'incoming-processed': # with non persistent DB we had no changes # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), []) assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json', repo.diff(b, commits_hexsha_[b][0])) dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0]) eq_(len(dincoming), 2) # 2 diff objects -- 1 file removed, 1 statuses updated eq_( set(dincoming.keys()), { repo.pathobj / '.datalad/crawl/statuses/incoming.json', repo.pathobj / 'ds666_R1.0.0.tar.gz' }) eq_(out[0]['datalad_stats'].get_total().removed, 1) assert_not_equal(commits_hexsha_, commits_hexsha_removed) # we will check if a clone would be crawling just as good from datalad.api import crawl # make a brand new clone GitRepo.clone(outd, clonedir) def _pipeline(*args, **kwargs): """Helper to mock openfmri.pipeline invocation so it looks at our 'server'""" kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False}) return ofpipeline(*args, **kwargs) with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline): output, stats = crawl( ) # we should be able to recrawl without doing anything ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
def test_ls_json(topdir, topurl): annex = AnnexRepo(topdir, create=True) ds = Dataset(topdir) # create some file and commit it with open(opj(ds.path, 'subdsfile.txt'), 'w') as f: f.write('123') ds.add(path='subdsfile.txt') ds.save("Hello!", version_tag=1) # add a subdataset ds.install('subds', source=topdir) subdirds = ds.create(_path_('dir/subds2'), force=True) subdirds.add('file') git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True) # create git repo git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt')) # commit to git to init git repo git.commit() annex.add(opj(topdir, 'dir', 'subgit')) # add the non-dataset git repo to annex annex.add(opj(topdir, 'dir')) # add to annex (links) annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force']) # broken-link annex.commit() git.add('fgit.txt') # commit to git to init git repo git.commit() # annex.add doesn't add submodule, so using ds.add ds.add(opj('dir', 'subgit')) # add the non-dataset git repo to annex ds.add('dir') # add to annex (links) ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False) # broken-link # register "external" submodule by installing and uninstalling it ext_url = topurl + '/dir/subgit/.git' # need to make it installable via http Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit')) ds.install(opj('dir', 'subgit_ext'), source=ext_url) ds.uninstall(opj('dir', 'subgit_ext')) meta_dir = opj('.git', 'datalad', 'metadata') def get_metahash(*path): if not path: path = ['/'] return hashlib.md5(opj(*path).encode('utf-8')).hexdigest() def get_metapath(dspath, *path): return _path_(dspath, meta_dir, get_metahash(*path)) def get_meta(dspath, *path): with open(get_metapath(dspath, *path)) as f: return js.load(f) # Let's see that there is no crash if one of the files is available only # in relaxed URL mode, so no size could be picked up ds.repo.add_url_to_file('fromweb', topurl + '/noteventhere', options=['--relaxed']) for all_ in [True, False]: # recurse directories for recursive in [True, False]: for state in ['file', 'delete']: # subdataset should have its json created and deleted when # all=True else not subds_metapath = get_metapath(opj(topdir, 'subds')) exists_prior = exists(subds_metapath) #with swallow_logs(), swallow_outputs(): dsj = _ls_json(topdir, json=state, all_=all_, recursive=recursive) ok_startswith(dsj['tags'], '1-') exists_post = exists(subds_metapath) # print("%s %s -> %s" % (state, exists_prior, exists_post)) assert_equal(exists_post, (state == 'file' and recursive)) # root should have its json file created and deleted in all cases ds_metapath = get_metapath(topdir) assert_equal(exists(ds_metapath), state == 'file') # children should have their metadata json's created and deleted only when recursive=True child_metapath = get_metapath(topdir, 'dir', 'subdir') assert_equal(exists(child_metapath), (state == 'file' and all_)) # ignored directories should not have json files created in any case for subdir in [('.hidden', ), ('dir', 'subgit')]: assert_false(exists(get_metapath(topdir, *subdir))) # check if its updated in its nodes sublist too. used by web-ui json. 
regression test assert_equal(dsj['nodes'][0]['size']['total'], dsj['size']['total']) # check size of subdataset subds = [ item for item in dsj['nodes'] if item['name'] == ('subdsfile.txt' or 'subds') ][0] assert_equal(subds['size']['total'], '3 Bytes') # dir/subds2 must not be listed among nodes of the top dataset: topds_nodes = {x['name']: x for x in dsj['nodes']} assert_in('subds', topds_nodes) # XXX # # condition here is a bit a guesswork by yoh later on # # TODO: here and below clear destiny/interaction of all_ and recursive # assert_equal(dsj['size']['total'], # '15 Bytes' if (recursive and all_) else # ('9 Bytes' if (recursive or all_) else '3 Bytes') # ) # https://github.com/datalad/datalad/issues/1674 if state == 'file' and all_: dirj = get_meta(topdir, 'dir') dir_nodes = {x['name']: x for x in dirj['nodes']} # it should be present in the subdir meta assert_in('subds2', dir_nodes) assert_not_in('url_external', dir_nodes['subds2']) assert_in('subgit_ext', dir_nodes) assert_equal(dir_nodes['subgit_ext']['url'], ext_url) # and not in topds assert_not_in('subds2', topds_nodes) # run non-recursive dataset traversal after subdataset metadata already created # to verify sub-dataset metadata being picked up from its metadata file in such cases if state == 'file' and recursive and not all_: dsj = _ls_json(topdir, json='file', all_=False) subds = [ item for item in dsj['nodes'] if item['name'] == ('subdsfile.txt' or 'subds') ][0] assert_equal(subds['size']['total'], '3 Bytes') assert_equal(topds_nodes['fromweb']['size']['total'], UNKNOWN_SIZE)
def test_check_dates(path): refdate = 1218182889 with set_date(refdate - 1): ar = AnnexRepo(path, create=True) ar.add("foo") ar.commit("add foo") foo_commit = ar.get_hexsha() ar.commit("add foo") ar.tag("foo-tag", "tag before refdate") # We can't use ar.get_tags because that returns the commit's hexsha, # not the tag's, and ar.get_hexsha is limited to commit objects. foo_tag = ar.repo.git.rev_parse("foo-tag") # Make a lightweight tag to make sure `tag_dates` doesn't choke on it. ar.tag("light") with set_date(refdate + 1): ar.add("bar") ar.commit("add bar") bar_commit = ar.get_hexsha() ar.tag("bar-tag", "tag after refdate") bar_tag = ar.repo.git.rev_parse("bar-tag") with set_date(refdate + 2): # Drop an annexed file so that we have more blobs in the git-annex # branch than its current tree. ar.drop("bar", options=["--force"]) results = {} for which in ["older", "newer"]: result = check_dates(ar, refdate, which=which)["objects"] ok_(result) if which == "newer": assert_in(bar_commit, result) assert_not_in(foo_commit, result) assert_in(bar_tag, result) elif which == "older": assert_in(foo_commit, result) assert_not_in(bar_commit, result) assert_in(foo_tag, result) results[which] = result ok_(any(x.get("filename") == "uuid.log" for x in results["older"].values())) newer_tree = check_dates(ar, refdate, annex="tree")["objects"] def is_annex_log_blob(entry): return (entry["type"] == "annex-blob" and entry["filename"].endswith(".log")) def num_logs(entries): return sum(map(is_annex_log_blob, entries.values())) # Because we dropped bar above, we should have one more blob in the # git-annex branch than in the current tree of the git-annex branch. eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1) # Act like today is one day from the reference timestamp to check that we # get the same results with the one-day-back default. seconds_in_day = 60 * 60 * 24 with patch('time.time', return_value=refdate + seconds_in_day): assert_equal(check_dates(ar, annex="tree")["objects"], newer_tree) # We can give a path (str) instead of a GitRepo object. assert_equal(check_dates(path, refdate, annex="tree")["objects"], newer_tree) with assert_raises(ValueError): check_dates(ar, refdate, which="unrecognized")
def test_save_amend(dspath): dspath = Path(dspath) file_in_super = dspath / 'somefile' file_in_sub = dspath / 'subds' / 'file_in_sub' # test on a hierarchy including a plain git repo: ds = Dataset(dspath).create(force=True, no_annex=True) subds = ds.create('subds', force=True) ds.save(recursive=True) assert_repo_status(ds.repo) # recursive and amend are mutually exclusive: for d in (ds, subds): assert_raises(ValueError, d.save, recursive=True, amend=True) # in an annex repo the branch we are interested in might not be the active # branch (adjusted): sub_branch = subds.repo.get_corresponding_branch() # amend in subdataset w/ new message; otherwise empty amendment: last_sha = subds.repo.get_hexsha(sub_branch) subds.save(message="new message in sub", amend=True) # we did in fact commit something: neq_(last_sha, subds.repo.get_hexsha(sub_branch)) # repo is clean: assert_repo_status(subds.repo) # message is correct: eq_( subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # actually replaced the previous commit: assert_not_in(last_sha, subds.repo.get_branch_commits_(sub_branch)) # amend modifications in subdataset w/o new message if not subds.repo.is_managed_branch(): subds.unlock('file_in_sub') file_in_sub.write_text("modified again") last_sha = subds.repo.get_hexsha(sub_branch) subds.save(amend=True) neq_(last_sha, subds.repo.get_hexsha(sub_branch)) assert_repo_status(subds.repo) # message unchanged: eq_( subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # actually replaced the previous commit: assert_not_in(last_sha, subds.repo.get_branch_commits_(sub_branch)) # save --amend with nothing to amend with: res = subds.save(amend=True) assert_result_count(res, 1) assert_result_count(res, 1, status='notneeded', action='save') # amend in superdataset w/ new message; otherwise empty amendment: last_sha = ds.repo.get_hexsha() ds.save(message="new message in super", amend=True) neq_(last_sha, ds.repo.get_hexsha()) assert_repo_status(subds.repo) eq_(ds.repo.format_commit("%B").strip(), "new message in super") assert_not_in(last_sha, ds.repo.get_branch_commits_()) # amend modifications in superdataset w/o new message file_in_super.write_text("changed content") if not subds.repo.is_managed_branch(): subds.unlock('file_in_sub') file_in_sub.write_text("modified once again") last_sha = ds.repo.get_hexsha() last_sha_sub = subds.repo.get_hexsha(sub_branch) ds.save(amend=True) neq_(last_sha, ds.repo.get_hexsha()) eq_(ds.repo.format_commit("%B").strip(), "new message in super") assert_not_in(last_sha, ds.repo.get_branch_commits_()) # we didn't mess with the subds: assert_repo_status(ds.repo, modified=["subds"]) eq_(last_sha_sub, subds.repo.get_hexsha(sub_branch)) eq_( subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # save --amend with nothing to amend with: last_sha = ds.repo.get_hexsha() res = ds.save(amend=True) assert_result_count(res, 1) assert_result_count(res, 1, status='notneeded', action='save') eq_(last_sha, ds.repo.get_hexsha()) # we didn't mess with the subds: assert_repo_status(ds.repo, modified=["subds"]) eq_(last_sha_sub, subds.repo.get_hexsha(sub_branch)) eq_( subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # amend with different identity: orig_author = ds.repo.format_commit("%an") orig_email = ds.repo.format_commit("%ae") orig_date = ds.repo.format_commit("%ad") orig_committer = ds.repo.format_commit("%cn") orig_committer_mail = ds.repo.format_commit("%ce") eq_(orig_author, orig_committer) 
eq_(orig_email, orig_committer_mail) with patch.dict( 'os.environ', { 'GIT_COMMITTER_NAME': 'Hopefully Different', 'GIT_COMMITTER_EMAIL': '*****@*****.**' }): ds.config.reload(force=True) ds.save(amend=True, message="amend with hope") # author was kept: eq_(orig_author, ds.repo.format_commit("%an")) eq_(orig_email, ds.repo.format_commit("%ae")) eq_(orig_date, ds.repo.format_commit("%ad")) # committer changed: eq_(ds.repo.format_commit("%cn"), "Hopefully Different") eq_(ds.repo.format_commit("%ce"), "*****@*****.**") # corner case: amend empty commit with no parent: rmtree(str(dspath)) # When adjusted branch is enforced by git-annex detecting a crippled FS, # git-annex produces an empty commit before switching to adjusted branch: # "commit before entering adjusted branch" # The commit by `create` would be the second one already. # Therefore go with plain annex repo and create an (empty) commit only when # not on adjusted branch: repo = AnnexRepo(dspath, create=True) if not repo.is_managed_branch(): repo.commit(msg="initial", options=['--allow-empty']) ds = Dataset(dspath) branch = ds.repo.get_corresponding_branch() or ds.repo.get_active_branch() # test pointless if we start with more than one commit eq_(len(list(ds.repo.get_branch_commits_(branch))), 1, msg="More than on commit '{}': {}".format( branch, ds.repo.call_git(['log', branch]))) last_sha = ds.repo.get_hexsha(branch) ds.save(message="new initial commit", amend=True) assert_repo_status(ds.repo) eq_(len(list(ds.repo.get_branch_commits_(branch))), 1, msg="More than on commit '{}': {}".format( branch, ds.repo.call_git(['log', branch]))) assert_not_in(last_sha, ds.repo.get_branch_commits_(branch)) eq_(ds.repo.format_commit("%B", branch).strip(), "new initial commit")
def test_check_dates(path):
    refdate = 1218182889

    with set_date(refdate - 1):
        ar = AnnexRepo(path, create=True)

        def tag_object(tag):
            """Return object for tag.  Do not dereference it.
            """
            # We can't use ar.get_tags because that returns the commit's
            # hexsha, not the tag's, and ar.get_hexsha is limited to commit
            # objects.
            return ar.call_git_oneline(
                ["rev-parse", "refs/tags/{}".format(tag)], read_only=True)

        ar.add("foo")
        ar.commit("add foo")
        foo_commit = ar.get_hexsha()
        ar.commit("add foo")
        ar.tag("foo-tag", "tag before refdate")
        foo_tag = tag_object("foo-tag")
        # Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
        ar.tag("light")

    with set_date(refdate + 1):
        ar.add("bar")
        ar.commit("add bar")
        bar_commit = ar.get_hexsha()
        ar.tag("bar-tag", "tag after refdate")
        bar_tag = tag_object("bar-tag")

    with set_date(refdate + 2):
        # Drop an annexed file so that we have more blobs in the git-annex
        # branch than its current tree.
        ar.drop("bar", options=["--force"])

    results = {}
    for which in ["older", "newer"]:
        result = check_dates(ar, refdate, which=which)["objects"]
        ok_(result)
        if which == "newer":
            assert_in(bar_commit, result)
            assert_not_in(foo_commit, result)
            assert_in(bar_tag, result)
        elif which == "older":
            assert_in(foo_commit, result)
            assert_not_in(bar_commit, result)
            assert_in(foo_tag, result)
        results[which] = result

    ok_(any(x.get("filename") == "uuid.log"
            for x in results["older"].values()))

    newer_tree = check_dates(ar, refdate, annex="tree")["objects"]

    def is_annex_log_blob(entry):
        return (entry["type"] == "annex-blob"
                and entry["filename"].endswith(".log"))

    def num_logs(entries):
        return sum(map(is_annex_log_blob, entries.values()))

    # Because we dropped bar above, we should have one more blob in the
    # git-annex branch than in the current tree of the git-annex branch.
    eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)

    # Act like today is one day from the reference timestamp to check that we
    # get the same results with the one-day-back default.
    seconds_in_day = 60 * 60 * 24
    with patch('time.time', return_value=refdate + seconds_in_day):
        assert_equal(check_dates(ar, annex="tree")["objects"], newer_tree)

    # We can give a path (str) instead of a GitRepo object.
    assert_equal(check_dates(path, refdate, annex="tree")["objects"],
                 newer_tree)

    with assert_raises(ValueError):
        check_dates(ar, refdate, which="unrecognized")
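
# test_check_dates only works because commit timestamps are pinned via
# `set_date`.  Below is a small, self-contained sketch of the mechanism such a
# helper can rely on -- an assumption for illustration, not DataLad's actual
# implementation: git honours GIT_AUTHOR_DATE / GIT_COMMITTER_DATE, so setting
# those environment variables fixes the recorded dates, which can then be read
# back with `git log`.  Assumes `git` is available on PATH.
import contextlib
import os
import subprocess
import tempfile


@contextlib.contextmanager
def pinned_git_date(epoch_seconds):
    # git accepts the raw "<unix timestamp> <offset>" form for these variables
    stamp = '{} +0000'.format(epoch_seconds)
    saved = {k: os.environ.get(k)
             for k in ('GIT_AUTHOR_DATE', 'GIT_COMMITTER_DATE')}
    os.environ['GIT_AUTHOR_DATE'] = stamp
    os.environ['GIT_COMMITTER_DATE'] = stamp
    try:
        yield
    finally:
        for k, v in saved.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v


def demo_pinned_dates():
    refdate = 1218182889
    with tempfile.TemporaryDirectory() as repo:
        subprocess.run(['git', 'init', '-q', repo], check=True)
        subprocess.run(['git', '-C', repo, 'config', 'user.name', 'Demo'],
                       check=True)
        subprocess.run(['git', '-C', repo, 'config', 'user.email',
                        'demo@example.com'], check=True)
        with pinned_git_date(refdate - 1):
            subprocess.run(['git', '-C', repo, 'commit', '--allow-empty',
                            '-m', 'before refdate'], check=True)
        with pinned_git_date(refdate + 1):
            subprocess.run(['git', '-C', repo, 'commit', '--allow-empty',
                            '-m', 'after refdate'], check=True)
        stamps = subprocess.run(
            ['git', '-C', repo, 'log', '--format=%ct', '--reverse'],
            check=True, capture_output=True, text=True).stdout.split()
        assert [int(s) for s in stamps] == [refdate - 1, refdate + 1]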
def test_interactions(tdir):
    # Just a placeholder since constructor expects a repo
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')

    class FIFO(object):
        def __init__(self, content=None, default=None):
            """
            Parameters
            ----------
            content
            default
              If defined, will be the one returned if empty.
              If not defined -- would raise an Exception
            """
            self.content = content or []
            self.default = default

        def _pop(self):
            # pop the next line; when empty, return the default (an empty line
            # is used to signal the end of the exchange) or blow up
            if self.content:
                v = self.content.pop(0)
                # allow for debug
                if v.startswith('DEBUG '):
                    # next one
                    return self._pop()
                return v
            else:
                if self.default is not None:
                    return self.default
                else:
                    raise IndexError("we are empty")

        def write(self, l):
            self.content.append(l)

        def read(self):
            return self._pop()

        def readline(self):
            return self._pop().rstrip('\n')

        def flush(self):
            pass  # working hard

    # now we should test interactions
    import re
    ERROR_ARGS = re.compile(r'^ERROR .*(missing|takes) .*\d+ .*argument')
    for scenario in [
        [],  # default of doing nothing
        [   # support of EXPORT which by default is not supported
            ('EXPORTSUPPORTED', 'EXPORTSUPPORTED-FAILURE'),
        ],
        [   # some unknown option
            ('FANCYNEWOPTION', 'UNSUPPORTED-REQUEST'),
        ],
        [   # query the COST etc., and make sure we do not fail right away on
            # an unsupported request
            ('FANCYNEWOPTION', 'UNSUPPORTED-REQUEST'),
            ('GETCOST', 'COST %d' % DEFAULT_COST),
            ('GETCOST roguearg', ERROR_ARGS),
            ('GETAVAILABILITY', 'AVAILABILITY %s' % DEFAULT_AVAILABILITY),
            # by default we do not require any fancy init
            ('INITREMOTE', 'INITREMOTE-SUCCESS'),
            # no urls supported by default
            ('CLAIMURL http://example.com', 'CLAIMURL-FAILURE'),
            # we know that only a single option (the url) is expected, so the
            # full string would be passed along
            ('CLAIMURL http://example.com roguearg', 'CLAIMURL-FAILURE'),
            # but if not enough params -- ERROR_ARGS
            ('CLAIMURL', ERROR_ARGS),
        ]
    ]:
        # The first response is always the protocol version, and a final empty
        # command signals the end of the transactions
        scenario = [(None, 'VERSION 1')] + scenario + [('', None)]
        fin, fout = FIFO(), FIFO(default='')
        for in_, out_ in scenario:
            if in_ is not None:
                fin.write(in_ + '\n')
        cr = AnnexCustomRemote(tdir, fin=fin, fout=fout)
        cr.main()
        for in_, out_ in scenario:
            if out_ is not None:
                out_read = fout.readline()
                if isinstance(out_, type(ERROR_ARGS)):
                    assert out_.match(out_read), (out_, out_read)
                else:
                    eq_(out_, out_read)
        out_read = fout.readline()
        eq_(out_read, '')  # nothing left to say
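
# The FIFO above fakes the stdin/stdout ends of git-annex's line-oriented
# special-remote protocol.  The sketch below is a toy, hypothetical handler
# (not DataLad's AnnexCustomRemote) driven the same way, to illustrate that a
# scripted exchange only needs doubles exposing readline()/write()/flush().
class _FakePipe(object):
    """Trivial in-memory stand-in with the same interface as FIFO above."""

    def __init__(self):
        self.lines = []

    def write(self, line):
        self.lines.append(line)

    def readline(self):
        return self.lines.pop(0).rstrip('\n') if self.lines else ''

    def flush(self):
        pass


class ToyRemote(object):
    """Answer GETCOST and reject everything else, until an empty line."""

    def __init__(self, fin, fout, cost=100):
        self.fin, self.fout, self.cost = fin, fout, cost

    def main(self):
        self.fout.write('VERSION 1\n')
        while True:
            line = self.fin.readline()
            if not line:  # empty line signals the end of the exchange
                break
            if line == 'GETCOST':
                self.fout.write('COST %d\n' % self.cost)
            else:
                self.fout.write('UNSUPPORTED-REQUEST\n')
        self.fout.flush()


def demo_toy_remote():
    fin, fout = _FakePipe(), _FakePipe()
    for request in ('GETCOST', 'FANCYNEWOPTION'):
        fin.write(request + '\n')
    ToyRemote(fin, fout).main()
    assert fout.readline() == 'VERSION 1'
    assert fout.readline() == 'COST 100'
    assert fout.readline() == 'UNSUPPORTED-REQUEST'
    assert fout.readline() == ''  # nothing left to say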
def __call__(path=None, force=False, description=None, dataset=None,
             no_annex=False, save=True, annex_version=None,
             annex_backend='MD5E', native_metadata_type=None,
             shared_access=None, git_opts=None, annex_opts=None,
             annex_init_opts=None, text_no_annex=None):
    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD
    if path and dataset:
        # given both a path and a dataset, the dataset path must point to an
        # installed dataset
        if not dataset.is_installed():
            msg = "No installed dataset at %s found." % dataset.path
            dsroot = get_dataset_root(dataset.path)
            if dsroot:
                msg += " If you meant to add to the %s dataset, use that path " \
                       "instead but remember that if dataset is provided, " \
                       "relative paths are relative to the top of the " \
                       "dataset." % dsroot
            raise ValueError(msg)

    # sanity check first
    if git_opts:
        lgr.warning(
            "`git_opts` argument is presently ignored, please complain!")
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify a "
                             "description for an annex repo while "
                             "declaring no annex repo.")
        if annex_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex while declaring no "
                             "annex repo.")
        if annex_init_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex init while declaring no "
                             "annex repo.")

    if not isinstance(force, bool):
        raise ValueError(
            "force should be bool, got %r. Did you mean to provide a 'path'?"
            % force)

    annotated_paths = AnnotatePaths.__call__(
        # nothing given explicitly, assume create fresh right here
        path=path if path else getpwd() if dataset is None else None,
        dataset=dataset,
        recursive=False,
        action='create',
        # we need to know whether we have to check for potential
        # subdataset collision
        force_parentds_discovery=True,
        # it is absolutely OK to have something that does not exist
        unavailable_path_status='',
        unavailable_path_msg=None,
        # if we have a dataset given that actually exists, we want to
        # fail if the requested path is not in it
        nondataset_path_status='error'
            if isinstance(dataset, Dataset) and dataset.is_installed()
            else '',
        on_failure='ignore')

    path = None
    for r in annotated_paths:
        if r['status']:
            # this is dealt with already
            yield r
            continue
        if path is not None:
            raise ValueError(
                "`create` can only handle single target path or dataset")
        path = r

    if len(annotated_paths) and path is None:
        # we got something, we complained already, done
        return

    # we know that we need to create a dataset at `path`
    assert (path is not None)

    # prep for yield
    path.update({'logger': lgr, 'type': 'dataset'})
    # just discard, we have a new story to tell
    path.pop('message', None)
    if 'parentds' in path:
        subs = Subdatasets.__call__(
            dataset=path['parentds'],
            # any known
            fulfilled=None,
            recursive=False,
            contains=path['path'],
            result_xfm='relpaths')
        if len(subs):
            path.update({
                'status': 'error',
                'message': ('collision with known subdataset %s/ in dataset %s',
                            subs[0], path['parentds'])})
            yield path
            return

    # TODO here we need a further test that if force=True, we need to look if
    # there is a superdataset (regardless of whether we want to create a
    # subdataset or not), and if that superdataset tracks anything within
    # this directory -- if so, we need to stop right here and whine, because
    # the result of creating a repo here will produce an undesired mess

    if git_opts is None:
        git_opts = {}
    if shared_access:
        # configure `git --shared` value
        git_opts['shared'] = shared_access

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = dataset \
        if isinstance(dataset, Dataset) and dataset.path == path['path'] \
        else Dataset(path['path'])

    # don't create in non-empty directory without `force`:
    if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        path.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'})
        yield path
        return

    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        GitRepo(tbds.path, url=None, create=True, git_opts=git_opts)
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        tbrepo = AnnexRepo(tbds.path, url=None, create=True,
                           backend=annex_backend,
                           version=annex_version,
                           description=description,
                           git_opts=git_opts,
                           annex_opts=annex_opts,
                           annex_init_opts=annex_init_opts)

        if text_no_annex:
            git_attributes_file = opj(tbds.path, '.gitattributes')
            with open(git_attributes_file, 'a') as f:
                f.write('* annex.largefiles=(not(mimetype=text/*))\n')
            tbrepo.add([git_attributes_file], git=True)
            tbrepo.commit("Instructed annex to add text files to git",
                          _datalad_msg=True,
                          files=[git_attributes_file])

    if native_metadata_type is not None:
        if not isinstance(native_metadata_type, list):
            native_metadata_type = [native_metadata_type]
        for nt in native_metadata_type:
            tbds.config.add('datalad.metadata.nativetype', nt)

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    if id_var in tbds.config:
        # make sure we reset this variable completely, in case of a re-create
        tbds.config.unset(id_var, where='dataset')
    tbds.config.add(
        id_var,
        tbds.id if tbds.id is not None else uuid.uuid1().urn.split(':')[-1],
        where='dataset')

    # make sure that v6 annex repos never commit content under .datalad
    with open(opj(tbds.path, '.datalad', '.gitattributes'), 'a') as gitattr:
        # TODO this will need adjusting, when annex'ed aggregate meta data
        # comes around
        gitattr.write(
            '# Text files (according to file --mime-type) are added directly to git.\n')
        gitattr.write(
            '# See http://git-annex.branchable.com/tips/largefiles/ for more info.\n')
        gitattr.write('** annex.largefiles=nothing\n')

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbds.add('.datalad', to_git=True, save=save,
             message='[DATALAD] new dataset')

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if save and isinstance(dataset, Dataset) and dataset.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.add(tbds.path, save=True,
                             return_type='generator',
                             result_filter=None, result_xfm=None,
                             on_failure='ignore'):
            yield r

    path.update({'status': 'ok'})
    yield path
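
# `create` steers what ends up in the annex by appending annex.largefiles rules
# to .gitattributes (e.g. keep text files, or everything under .datalad/,
# directly in git).  Below is a small stand-alone sketch of such an append,
# made idempotent so that repeated runs do not duplicate rules.  This is a
# hypothetical helper written for illustration, not part of DataLad's API.
import os


def ensure_gitattributes_rule(repo_path, rule, attrs_file='.gitattributes'):
    """Append `rule` to <repo_path>/<attrs_file> unless it is already there."""
    target = os.path.join(repo_path, attrs_file)
    existing = []
    if os.path.exists(target):
        with open(target) as f:
            existing = [line.rstrip('\n') for line in f]
    if rule in existing:
        return False
    with open(target, 'a') as f:
        f.write(rule + '\n')
    return True


# usage with the same rules written by `create` above (paths are illustrative):
# ensure_gitattributes_rule('/tmp/ds', '* annex.largefiles=(not(mimetype=text/*))')
# ensure_gitattributes_rule('/tmp/ds/.datalad', '** annex.largefiles=nothing')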
def _test_proxying_open(generate_load, verify_load, repo):
    annex = AnnexRepo(repo, create=True)
    fpath1 = opj(repo, "test")
    fpath2 = opj(repo, 'd1', 'd2', 'test2')

    # generate load
    fpath1 = generate_load(fpath1) or fpath1
    os.makedirs(dirname(fpath2))
    fpath2 = generate_load(fpath2) or fpath2

    annex.add([fpath1, fpath2])
    verify_load(fpath1)
    verify_load(fpath2)
    annex.commit("Added some files")

    # clone to another repo
    repo2 = repo + "_2"
    annex2 = AnnexRepo.clone(repo, repo2)

    # verify that we can't access the content in the clone yet
    fpath1_2 = fpath1.replace(repo, repo2)
    fpath2_2 = fpath2.replace(repo, repo2)

    EXPECTED_EXCEPTIONS = (IOError, OSError)
    assert_raises(EXPECTED_EXCEPTIONS, verify_load, fpath1_2)

    with AutomagicIO():
        # verify that it doesn't even try to get files which do not exist
        with patch('datalad.support.annexrepo.AnnexRepo.get') as gricm:
            # if we request an absent file
            assert_raises(EXPECTED_EXCEPTIONS, open, fpath1_2 + "_", 'r')
            # no get should be called
            assert_false(gricm.called)
        verify_load(fpath1_2)
        verify_load(fpath2_2)
        # and even if we drop it -- we still can get it no problem
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        verify_load(fpath2_2)
        assert_true(annex2.file_has_content(fpath2_2))
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        assert_true(os.path.isfile(fpath2_2))

    # In check_once mode, if we drop it, it wouldn't be considered again
    annex2.drop(fpath2_2)
    assert_false(annex2.file_has_content(fpath2_2))
    with AutomagicIO(check_once=True):
        verify_load(fpath2_2)
        assert_true(annex2.file_has_content(fpath2_2))
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        assert_false(os.path.isfile(fpath2_2))

    # if we override stdout with something not supporting fileno, as tornado
    # does, it would ruin using `get` under IPython
    # TODO: we might need to refuse any online logging in other places like that
    annex2.drop(fpath2_2)

    class StringIOfileno(StringIO):
        def fileno(self):
            raise Exception("I have no clue how to do fileno")

    with patch('sys.stdout', new_callable=StringIOfileno), \
         patch('sys.stderr', new_callable=StringIOfileno):
        with AutomagicIO():
            assert_false(annex2.file_has_content(fpath2_2))
            verify_load(fpath2_2)
            assert_true(annex2.file_has_content(fpath2_2))
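
# The StringIOfileno patching above mimics environments (e.g. IPython with
# tornado) where sys.stdout has no usable file descriptor.  Below is a sketch
# of the defensive pattern callers can use -- probe fileno() and fall back when
# it is unavailable.  `safe_fileno` is a hypothetical helper written for
# illustration, not part of DataLad's API.
import sys
from io import StringIO


def safe_fileno(stream, fallback=None):
    """Return stream.fileno() if it is usable, otherwise `fallback`."""
    try:
        return stream.fileno()
    except Exception:
        # plain StringIO raises io.UnsupportedOperation; other wrappers may
        # raise arbitrary exceptions, as the test above simulates
        return fallback


def demo_safe_fileno():
    # a regular StringIO has no underlying descriptor
    assert safe_fileno(StringIO()) is None
    # a real stdout (when not captured) typically reports descriptor 1
    print(safe_fileno(sys.stdout, fallback='no descriptor'))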