def test_optimized_cloning(path):
    # make test repo with one file and one commit
    originpath = op.join(path, 'origin')
    repo = GitRepo(originpath, create=True)
    with open(op.join(originpath, 'test'), 'w') as f:
        f.write('some')
    repo.add('test')
    repo.commit('init')
    ok_clean_git(originpath, annex=False)
    from glob import glob

    def _get_inodes(repo):
        return dict([
            (os.path.join(*o.split(os.sep)[-2:]), os.stat(o).st_ino)
            for o in glob(os.path.join(repo.repo.git_dir, 'objects', '*', '*'))
        ])

    origin_inodes = _get_inodes(repo)
    # now clone it in different ways and see what happens to the object storage
    from datalad.support.network import get_local_file_url
    clonepath = op.join(path, 'clone')
    for src in (originpath, get_local_file_url(originpath)):
        # deprecated
        assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath)
        clone = GitRepo.clone(url=src, path=clonepath, create=True)
        clone_inodes = _get_inodes(clone)
        eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src))
        rmtree(clonepath)
def test_recurse_existing(src, path):
    origin_ds = _make_dataset_hierarchy(src)

    # make sure recursion_limit works as expected across a range of depths
    for depth in range(len(origin_ds)):
        datasets = assure_list(
            install(path, source=src, recursive=True, recursion_limit=depth))
        # we expect one dataset per level
        eq_(len(datasets), depth + 1)
        rmtree(path)

    # now install all but the last two levels, no data
    root, sub1, sub2 = install(path, source=src, recursive=True,
                               recursion_limit=2)
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is False)
    sub3 = Dataset(opj(sub2.path, 'sub3'))
    ok_(not sub3.is_installed())
    # now get all content in all existing datasets, no new datasets installed
    # in the process
    files = root.get(curdir, recursive=True, recursion_limit='existing')
    eq_(len(files), 1)
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not sub3.is_installed())
    # now pull down all remaining datasets, no data
    sub3, sub4 = root.get(curdir, recursive=True, get_data=False)
    ok_(sub4.is_installed())
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is False)
    # aaannd all data
    files = root.get(curdir, recursive=True)
    eq_(len(files), 1)
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is True)
def test_get_missing(path):
    repo = GitRepo(path, create=True)
    os.makedirs(op.join(path, 'deep'))
    with open(op.join(path, 'test1'), 'w') as f:
        f.write('some')
    with open(op.join(path, 'deep', 'test2'), 'w') as f:
        f.write('some more')
    # no files tracked yet, so nothing changed
    eq_(repo.get_changed_files(), [])
    repo.add('.')
    # still no differences between worktree and staged
    eq_(repo.get_changed_files(), [])
    eq_(set(repo.get_changed_files(staged=True)),
        {'test1', op.join('deep', 'test2')})
    eq_(set(repo.get_changed_files(staged=True, diff_filter='AD')),
        {'test1', op.join('deep', 'test2')})
    eq_(repo.get_changed_files(staged=True, diff_filter='D'), [])
    repo.commit()
    eq_(repo.get_changed_files(), [])
    eq_(repo.get_changed_files(staged=True), [])
    ok_clean_git(path, annex=False)
    unlink(op.join(path, 'test1'))
    eq_(repo.get_missing_files(), ['test1'])
    rmtree(op.join(path, 'deep'))
    eq_(sorted(repo.get_missing_files()), [op.join('deep', 'test2'), 'test1'])
    # nothing is actually known to be deleted
    eq_(repo.get_deleted_files(), [])
    # do proper removal
    repo.remove(op.join(path, 'test1'))
    # no longer missing
    eq_(repo.get_missing_files(), [op.join('deep', 'test2')])
    # but deleted
    eq_(repo.get_deleted_files(), ['test1'])
def test_is_installed(src, path):
    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    AnnexRepo.clone(src, path)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())

    # We must not be able to create a new repository under a known
    # subdataset path.
    # Note: Unfortunately we would still be able to generate it under a
    # subdirectory within the submodule, e.g. `subm 1/subdir`, but that is
    # not checked here. `rev-create` will provide that protection
    # when create/rev-create merge.
    with assert_raises(PathKnownToRepositoryError):
        subds.create()

    # get the submodule
    # This would init so there is a .git file with symlink info, which,
    # as we agreed, is more pain than gain, so let's use our install,
    # which does it right; after all we are checking 'is_installed' ;)
    # from datalad.cmd import Runner
    # Runner().run(['git', 'submodule', 'update', '--init', 'subm 1'], cwd=path)
    with chpwd(path):
        get('subm 1')
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
def _extract_archive(self, path):
    # we need to extract the archive
    # TODO: extract to _tmp and then move in a single command so we
    # don't end up picking up broken pieces
    lgr.debug(
        u"Extracting {self._archive} under {path}".format(**locals()))
    if exists(path):
        lgr.debug(
            "Previously extracted (but possibly incomplete) cached archive "
            "found. Removing %s", path)
        rmtree(path)
    os.makedirs(path)
    assert (exists(path))
    # remove old stamp
    if exists(self.stamp_path):
        rmtree(self.stamp_path)
    decompress_file(self._archive, path, leading_directories=None)
    # TODO: must be optional, since we might want to use this content,
    # move it into the tree, etc.
    # lgr.debug("Adjusting permissions to R/O for the extracted content")
    # rotree(path)
    assert (exists(path))
    # create a stamp
    with open(self.stamp_path, 'wb') as f:
        f.write(ensure_bytes(self._archive))
    # assert that stamp mtime is not older than archive's directory
    assert (self.is_extracted)
def test_install_dataset_from_just_source(src_repo=None, path=None):
    src_ds = Dataset(src_repo).create(result_renderer='disabled', force=True)
    src_ds.save(['INFO.txt', 'test.dat'], to_git=True)
    src_ds.save('test-annex.dat', to_git=False)
    # equivalent repo on github:
    src_url = "https://github.com/datalad/testrepo--basic--r1.git"
    sources = [
        src_ds.path,
        get_local_file_url(src_ds.path, compatibility='git')
    ]
    if not dl_cfg.get('datalad.tests.nonetwork'):
        sources.append(src_url)

    for url in sources:
        with chpwd(path, mkdir=True):
            ds = install(source=url)

        ok_startswith(ds.path, path)
        ok_(ds.is_installed())
        ok_(GitRepo.is_valid_repo(ds.path))
        assert_repo_status(ds.path, annex=None)
        assert_in('INFO.txt', ds.repo.get_indexed_files())

        # cleanup before next iteration
        rmtree(path)
def decompress_file(archive, dir_, leading_directories='strip'):
    """Decompress `archive` into a directory `dir_`

    Parameters
    ----------
    archive: str
    dir_: str
    leading_directories: {'strip', None}
      If `strip`, and the archive contains a single leading directory under
      which all content is stored, all the content will be moved one directory
      up and that leading directory will be removed.
    """
    if not exists(dir_):
        lgr.debug("Creating directory %s to extract archive into", dir_)
        os.makedirs(dir_)

    _decompress_file(archive, dir_)

    if leading_directories == 'strip':
        _, dirs, files = next(os.walk(dir_))
        if not len(files) and len(dirs) == 1:
            # move all the content under dirs[0] up 1 level
            widow_dir = opj(dir_, dirs[0])
            lgr.debug("Moving content within %s upstairs", widow_dir)
            subdir, subdirs_, files_ = next(os.walk(opj(dir_, dirs[0])))
            for f in subdirs_ + files_:
                os.rename(opj(subdir, f), opj(dir_, f))
            # NFS might still hold it hostage, so use rmtree, which retries
            # a few times
            rmtree(widow_dir)
    elif leading_directories is None:
        pass  # really do nothing
    else:
        raise NotImplementedError("Not supported %s" % leading_directories)
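# A minimal usage sketch for decompress_file() above. The helper name
# `_demo_extract` and the use of a temporary directory are illustrative
# assumptions, not part of the module; only decompress_file() itself is real.
import tempfile

def _demo_extract(archive_path):
    # extract into a throw-away directory; with leading_directories='strip'
    # a single wrapping directory inside the archive is flattened away
    target = tempfile.mkdtemp(prefix='demo_extract_')
    decompress_file(archive_path, target, leading_directories='strip')
    return target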
def test_subsuperdataset_save(path):
    # Verify that when invoked without recursion save does not
    # cause querying of subdatasets of the subdataset
    # see https://github.com/datalad/datalad/issues/4523
    parent = Dataset(path).create()
    # Create 3 levels of subdatasets so that we can later check operation
    # with or without --dataset being specified
    sub1 = parent.create('sub1')
    sub2 = parent.create(sub1.pathobj / 'sub2')
    sub3 = parent.create(sub2.pathobj / 'sub3')
    assert_repo_status(path)
    # now we will lobotomize sub3 so git would fail if any query is performed.
    rmtree(str(sub3.pathobj / '.git' / 'objects'))

    # the call should proceed fine since neither should care about sub3
    # default is no recursion
    parent.save('sub1')
    sub1.save('sub2')
    assert_raises(CommandError, parent.save, 'sub1', recursive=True)

    # and should fail if we request saving while in the parent directory
    # but while not providing a dataset, since operation would run within
    # pointed subdataset
    with chpwd(sub1.path):
        assert_raises(CommandError, save, 'sub2')

    # but should not fail in the top level superdataset
    with chpwd(parent.path):
        save('sub1')
def test_optimized_cloning(path):
    # make test repo with one file and one commit
    originpath = op.join(path, 'origin')
    repo = GitRepo(originpath, create=True)
    with open(op.join(originpath, 'test'), 'w') as f:
        f.write('some')
    repo.add('test')
    repo.commit('init')
    ok_clean_git(originpath, annex=False)
    from glob import glob

    def _get_inodes(repo):
        return dict(
            [(os.path.join(*o.split(os.sep)[-2:]), os.stat(o).st_ino)
             for o in glob(os.path.join(repo.path, repo.get_git_dir(repo),
                                        'objects', '*', '*'))])

    origin_inodes = _get_inodes(repo)
    # now clone it in different ways and see what happens to the object storage
    from datalad.support.network import get_local_file_url
    clonepath = op.join(path, 'clone')
    for src in (originpath, get_local_file_url(originpath)):
        # deprecated
        assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath)

        clone = GitRepo.clone(url=src, path=clonepath, create=True)
        clone_inodes = _get_inodes(clone)
        eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src))
        rmtree(clonepath)
def _uninstall_dataset(ds, check, has_super, **kwargs):
    if check and ds.is_installed():
        for r in _drop_files(ds, curdir, check=True, noannex_iserror=False,
                             **kwargs):
            yield r
    # TODO: uninstall of a subdataset that has a local URL
    # (e.g. ./anything) implies it cannot be undone; decide how, and
    # whether, to check for that
    # TODO check that the relevant branches are pushed to a remote
    if ds.subdatasets(fulfilled=True):
        yield get_status_dict(
            status='error',
            ds=ds,
            message=(
                'to be uninstalled dataset %s has present subdatasets, forgot --recursive?',
                ds),
            **kwargs)
        return
    # Close any process etc. possibly associated with the underlying repo.
    # Otherwise rmtree could fail, e.g. under NFS, where files still opened
    # by such processes leave .nfs00000xxxx placeholders behind that prevent
    # rmdir (and thus rmtree) from removing the directory
    ds.close()
    if ds.is_installed():
        rmtree(ds.path)
    if has_super and not exists(ds.path):
        # recreate an empty mountpoint to make Git happier
        os.makedirs(ds.path)
    # invalidate loaded ConfigManager:
    ds._cfg = None
    yield get_status_dict(status='ok', ds=ds, **kwargs)
def test_implicit_install(src, dst):
    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.add("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.add("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.add("file3.txt")
    origin_top.save(recursive=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(IncompleteResultsError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    # but by default implicit results are not reported
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(IncompleteResultsError, ds.install,
                  source=opj('sub', 'obscure'))

    # clean up, the nasty way
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        # don't ask for the file content to make return value comparison
        # simpler
        result = get(path=opj("sub", "subsub"), get_data=False,
                     result_xfm='datasets')
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [sub, subsub])
def test_implicit_install(src, dst):
    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.save("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.save("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.save("file3.txt")
    origin_top.save(recursive=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(IncompleteResultsError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    # but by default implicit results are not reported
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(IncompleteResultsError, ds.install,
                  source=opj('sub', 'obscure'))

    # clean up, the nasty way
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        # don't ask for the file content to make return value comparison
        # simpler
        result = get(path=opj("sub", "subsub"), get_data=False,
                     result_xfm='datasets')
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [sub, subsub])
def teardown(self, exclude_metadata):
    # would make no sense if it doesn't work correctly
    # IIRC we cannot provide custom additional depends so cannot import nose
    # assert_repo_status(self.ds.path)
    status = self.ds.status()
    assert all(r['state'] == 'clean' for r in status)
    assert len(status) >= self.nfiles
    rmtree(self.temp)
def _cleanup(self):
    if not self.remove_paths:
        return  # Nothing TODO
    self.log("Cleaning up %d paths", len(self.remove_paths))
    while self.remove_paths:
        path = self.remove_paths.pop()
        if op.lexists(path):
            rmtree(path)
def test_newthings_coming_down(originpath, destpath):
    origin = GitRepo(originpath, create=True)
    create_tree(originpath, {'load.dat': 'heavy'})
    Dataset(originpath).save('load.dat')
    ds = install(source=originpath, path=destpath,
                 result_xfm='datasets', return_type='item-or-list')
    assert_is_instance(ds.repo, GitRepo)
    assert_in('origin', ds.repo.get_remotes())
    # turn origin into an annex
    origin = AnnexRepo(originpath, create=True)
    # clone doesn't know yet
    assert_false(knows_annex(ds.path))
    # but after an update it should
    # no merge, only one sibling, no parameters should be specific enough
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    assert (knows_annex(ds.path))
    # no branches appeared
    eq_(ds.repo.get_branches(), [DEFAULT_BRANCH])
    # now merge, and get an annex
    assert_result_count(ds.update(merge=True), 1, action='update',
                        status='ok', type='dataset')
    assert_in('git-annex', ds.repo.get_branches())
    assert_is_instance(ds.repo, AnnexRepo)
    # should be fully functional
    testfname = opj(ds.path, 'load.dat')
    assert_false(ds.repo.file_has_content(testfname))
    ds.get('.')
    ok_file_has_content(opj(ds.path, 'load.dat'), 'heavy')
    # check that a new tag comes down
    origin.tag('first!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[0], 'first!')

    # and now we destroy the remote annex
    origin.call_git(['config', '--remove-section', 'annex'])
    rmtree(opj(origin.path, '.git', 'annex'), chmod_files=True)
    origin.call_git(['branch', '-D', 'git-annex'])
    origin = GitRepo(originpath)
    assert_false(knows_annex(originpath))

    # and update the local clone
    # for now this should simply not fail (see gh-793), later might be
    # enhanced to a graceful downgrade
    before_branches = ds.repo.get_branches()
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(before_branches, ds.repo.get_branches())
    # annex branch got pruned
    eq_(['origin/HEAD', 'origin/' + DEFAULT_BRANCH],
        ds.repo.get_remote_branches())
    # check that a new tag comes down even if repo types mismatch
    origin.tag('second!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[-1], 'second!')
def _clone_from_any_source(sources, dest):
    # should not be the case, but we need to distinguish between a failure
    # of git-clone due to an existing target, and an unsuccessful clone
    # attempt. See below.
    existed = dest and exists(dest)
    for source_ in sources:
        try:
            lgr.debug("Retrieving a dataset from URL: "
                      "{0}".format(source_))
            with swallow_logs():
                GitRepo.clone(path=dest, url=source_, create=True)
            return source_  # do not bother with other sources if succeeded
        except GitCommandError as e:
            lgr.debug("Failed to retrieve from URL: "
                      "{0}".format(source_))
            if not existed and dest and exists(dest):
                lgr.debug("Wiping out unsuccessful clone attempt at "
                          "{}".format(dest))
                rmtree(dest)

            if source_ == sources[-1]:
                # Note: The following block is evaluated whenever we
                # fail even with the last try. Not nice, but currently
                # necessary until we get a more precise exception:
                ####################################
                # TODO: We may want to introduce a --force option to
                # overwrite the target.
                # TODO: Currently assuming that if `existed` and there is a
                # GitCommandError, the two are connected.
                # Need newer GitPython to get stderr from GitCommandError
                # (already fixed within GitPython.)
                if existed:
                    # rudimentary check for an installed dataset at target:
                    # (TODO: eventually check that it is the one this
                    # is about)
                    dest_ds = Dataset(dest)
                    if dest_ds.is_installed():
                        lgr.info("{0} appears to be installed already."
                                 "".format(dest_ds))
                        break
                    else:
                        lgr.warning("Target {0} already exists and is not "
                                    "an installed dataset. Skipped."
                                    "".format(dest))
                        # Keep original in debug output:
                        lgr.debug("Original failure:{0}"
                                  "{1}".format(linesep, exc_str(e)))
                        return None

                ##################
                # Re-raise if failed even with the last candidate
                lgr.debug("Unable to establish repository instance at "
                          "{0} from {1}"
                          "".format(dest, sources))
                raise
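# Minimal, self-contained sketch of the same retry-and-cleanup pattern as
# _clone_from_any_source() above, but using plain `git` via subprocess and
# shutil instead of datalad's GitRepo/rmtree. The function name and arguments
# are hypothetical; this is not datalad's API.
import os
import shutil
import subprocess

def clone_first_working(sources, dest):
    existed = os.path.exists(dest)
    for src in sources:
        try:
            subprocess.run(['git', 'clone', src, dest],
                           check=True, capture_output=True)
            return src  # stop at the first source that works
        except subprocess.CalledProcessError:
            # wipe a partially created clone, but only if we created dest
            if not existed and os.path.exists(dest):
                shutil.rmtree(dest)
    return None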
def test_implicit_install(src, dst):
    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.add("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.add("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.add("file3.txt")
    origin_top.save(auto_add_changes=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(InstallFailedError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(InstallFailedError, ds.install,
                  source=opj('sub', 'obscure'))

    # clean up:
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        result = get(path=opj("sub", "subsub"))
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [subsub])
def _uninstall_dataset(ds, check, has_super, **kwargs):
    cwd = Path.cwd()
    if ds.pathobj == cwd or ds.pathobj in cwd.parents:
        yield get_status_dict(
            status='error',
            ds=ds,
            message='refusing to uninstall a dataset at or above the '
                    'current working directory',
            **kwargs)
        return
    if check and ds.is_installed():
        # if the checks are on we need to make sure to exit this function
        # whenever any drop failed, because we cannot rely on the error
        # to actually cause a stop in the calling code upstairs
        bad_things_happened = False
        for r in _drop_files(
                ds, op.curdir, check=True, noannex_iserror=False, **kwargs):
            yield r
            if r['action'] == 'drop' and \
                    not r.get('status', None) in ('ok', 'notneeded'):
                bad_things_happened = True
        if bad_things_happened:
            # error reporting already happened, we can just stop here
            return

    # TODO: uninstall of a subdataset that has a local URL
    # (e.g. ./anything) implies it cannot be undone; decide how, and
    # whether, to check for that
    # TODO check that the relevant branches are pushed to a remote
    if ds.subdatasets(fulfilled=True):
        yield get_status_dict(
            status='error',
            ds=ds,
            message=(
                'to be uninstalled dataset %s has present subdatasets, forgot --recursive?',
                ds),
            **kwargs)
        return
    # Close any process etc. possibly associated with the underlying repo.
    # Otherwise rmtree could fail, e.g. under NFS, where files still opened
    # by such processes leave .nfs00000xxxx placeholders behind that prevent
    # rmdir (and thus rmtree) from removing the directory
    ds.close()
    if ds.is_installed():
        rmtree(ds.path)
    if has_super and not op.exists(ds.path):
        # recreate an empty mountpoint to make Git happier
        os.makedirs(ds.path)
    # invalidate loaded ConfigManager:
    ds._cfg = None
    yield get_status_dict(status='ok', ds=ds, **kwargs)
def test_newthings_coming_down(originpath, destpath):
    origin = GitRepo(originpath, create=True)
    create_tree(originpath, {'load.dat': 'heavy'})
    Dataset(originpath).save('load.dat')
    ds = install(
        source=originpath, path=destpath,
        result_xfm='datasets', return_type='item-or-list')
    assert_is_instance(ds.repo, GitRepo)
    assert_in('origin', ds.repo.get_remotes())
    # turn origin into an annex
    origin = AnnexRepo(originpath, create=True)
    # clone doesn't know yet
    assert_false(knows_annex(ds.path))
    # but after an update it should
    # no merge, only one sibling, no parameters should be specific enough
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    assert(knows_annex(ds.path))
    # no branches appeared
    eq_(ds.repo.get_branches(), ['master'])
    # now merge, and get an annex
    assert_result_count(ds.update(merge=True), 1, status='ok', type='dataset')
    assert_in('git-annex', ds.repo.get_branches())
    assert_is_instance(ds.repo, AnnexRepo)
    # should be fully functional
    testfname = opj(ds.path, 'load.dat')
    assert_false(ds.repo.file_has_content(testfname))
    ds.get('.')
    ok_file_has_content(opj(ds.path, 'load.dat'), 'heavy')
    # check that a new tag comes down
    origin.tag('first!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[0], 'first!')

    # and now we destroy the remote annex
    origin._git_custom_command([], ['git', 'config', '--remove-section', 'annex'])
    rmtree(opj(origin.path, '.git', 'annex'), chmod_files=True)
    origin._git_custom_command([], ['git', 'branch', '-D', 'git-annex'])
    origin = GitRepo(originpath)
    assert_false(knows_annex(originpath))

    # and update the local clone
    # for now this should simply not fail (see gh-793), later might be
    # enhanced to a graceful downgrade
    before_branches = ds.repo.get_branches()
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(before_branches, ds.repo.get_branches())
    # annex branch got pruned
    eq_(['origin/HEAD', 'origin/master'], ds.repo.get_remote_branches())
    # check that a new tag comes down even if repo types mismatch
    origin.tag('second!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[-1], 'second!')
def setup_cache(self):
    ds_path = create_test_dataset(self.dsname, spec='2/-2/-2', seed=0)[0]
    self.log("Setup cache ds path %s. CWD: %s", ds_path, getpwd())
    # Will store into a tarfile since otherwise install -r is way too slow
    # to be invoked for every benchmark
    # Store full path since apparently setup is not run in that directory
    self.tarfile = op.realpath(SampleSuperDatasetBenchmarks.tarfile)
    with tarfile.open(self.tarfile, "w") as tar:
        # F.CK -- Python tarfile can't later extract those because key dirs
        # are read-only. For now just a workaround - make it all writeable
        from datalad.utils import rotree
        rotree(self.dsname, ro=False, chmod_files=False)
        tar.add(self.dsname, recursive=True)
    rmtree(self.dsname)
def setup_cache(self):
    # creating in CWD so things get removed when ASV is done
    ds_path = create_test_dataset("testds1", spec='2/-2/-2', seed=0)[0]
    # Will store into a tarfile since otherwise install -r is way too slow
    # to be invoked for every benchmark
    tarfile_path = opj(osp.dirname(ds_path), 'testds1.tar')
    with tarfile.open(tarfile_path, "w") as tar:
        # F.CK -- Python tarfile can't later extract those because key dirs
        # are read-only. For now just a workaround - make it all writeable
        from datalad.utils import rotree
        rotree('testds1', ro=False, chmod_files=False)
        tar.add('testds1', recursive=True)
    rmtree('testds1')

    return tarfile_path
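# Hypothetical counterpart to the setup_cache() variants above: a setup step
# that re-creates a working copy from the cached tarball before each benchmark
# run. The function name and arguments are illustrative; ASV's actual hook is
# a method on the benchmark class.
import tarfile

def extract_cached_dataset(tarfile_path, workdir):
    # unpack the tarball produced by setup_cache() into the benchmark workdir
    with tarfile.open(tarfile_path) as tar:
        tar.extractall(path=workdir)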
def test_recurse_existing(src, path):
    origin_ds = _make_dataset_hierarchy(src)

    # make sure recursion_limit works as expected across a range of depths
    for depth in range(len(origin_ds)):
        res = install(path, source=src, recursive=True, recursion_limit=depth,
                      result_xfm=None, return_type='list', result_filter=None)
        # we expect one dataset per level
        assert_result_count(res, depth + 1, type='dataset', status='ok')
        rmtree(path)

    # now install all but the last two levels, no data
    root, sub1, sub2 = install(path, source=src, recursive=True,
                               recursion_limit=2, result_xfm='datasets',
                               result_filter=None)
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is False)
    sub3 = Dataset(opj(sub2.path, 'sub3'))
    ok_(not sub3.is_installed())
    # now get all content in all existing datasets, no new datasets installed
    # in the process
    files = root.get(curdir, recursive=True, recursion_limit='existing')
    assert_not_in_results(files, type='dataset', status='ok')
    assert_result_count(files, 1, type='file', status='ok')
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not sub3.is_installed())
    # now pull down all remaining datasets, no data
    sub3, sub4 = root.get(curdir, recursive=True, get_data=False,
                          result_xfm='datasets',
                          result_filter=lambda x: x['status'] == 'ok')
    ok_(sub4.is_installed())
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is False)
    # aaannd all data
    files = root.get(
        curdir, recursive=True,
        result_filter=lambda x: x['status'] == 'ok' and x['type'] == 'file')
    eq_(len(files), 1)
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is True)
def test_remove_subds(path):
    ds = create(path)
    ds.create('sub')
    ds.create(op.join('sub', 'subsub'))
    assert_repo_status(ds.path)
    assert_result_count(ds.subdatasets(), 1, path=op.join(ds.path, 'sub'))
    # all good at this point, subdataset known, dataset clean
    # now have some external force wipe out the subdatasets
    rmtree(op.join(ds.path, 'sub'))
    assert_result_count(ds.status(), 1, path=op.join(ds.path, 'sub'),
                        state='deleted')
    # a single call to save() must fix up the mess
    assert_status('ok', ds.save())
    assert_repo_status(ds.path)
def _uninstall_dataset(ds, check, has_super):
    results = []
    if check and ds.is_installed():
        results.extend(_drop_files(ds, curdir, check=True))
    # TODO: uninstall of a subdataset that has a local URL
    # (e.g. ./anything) implies it cannot be undone; decide how, and
    # whether, to check for that
    # TODO check that the relevant branches are pushed to a remote
    if ds.get_subdatasets(fulfilled=True):
        raise ValueError(
            'to be uninstalled dataset has present subdatasets, forgot --recursive?')
    if ds.is_installed():
        rmtree(ds.path)
    if has_super and not exists(ds.path):
        # recreate an empty mountpoint to make Git happier
        os.makedirs(ds.path)
    results.append(ds)
    return results
def _uninstall_dataset(ds, check, has_super, **kwargs):
    if check and ds.is_installed():
        # if the checks are on we need to make sure to exit this function
        # whenever any drop failed, because we cannot rely on the error
        # to actually cause a stop in the calling code upstairs
        bad_things_happened = False
        for r in _drop_files(
                ds, curdir, check=True, noannex_iserror=False, **kwargs):
            yield r
            if r['action'] == 'drop' and \
                    not r.get('status', None) in ('ok', 'notneeded'):
                bad_things_happened = True
        if bad_things_happened:
            # error reporting already happened, we can just stop here
            return

    # we want to use the bound dataset method
    from datalad.distribution.subdatasets import Subdatasets
    # TODO: uninstall of a subdataset that has a local URL
    # (e.g. ./anything) implies it cannot be undone; decide how, and
    # whether, to check for that
    # TODO check that the relevant branches are pushed to a remote
    if ds.subdatasets(fulfilled=True):
        yield get_status_dict(
            status='error',
            ds=ds,
            message=(
                'to be uninstalled dataset %s has present subdatasets, forgot --recursive?',
                ds),
            **kwargs)
        return
    # Close any process etc. possibly associated with the underlying repo.
    # Otherwise rmtree could fail, e.g. under NFS, where files still opened
    # by such processes leave .nfs00000xxxx placeholders behind that prevent
    # rmdir (and thus rmtree) from removing the directory
    ds.close()
    if ds.is_installed():
        rmtree(ds.path)
    if has_super and not exists(ds.path):
        # recreate an empty mountpoint to make Git happier
        os.makedirs(ds.path)
    # invalidate loaded ConfigManager:
    ds._cfg = None
    yield get_status_dict(status='ok', ds=ds, **kwargs)
def test_nested_create(path):
    # to document some more organic usage pattern
    ds = Dataset(path).create()
    ok_clean_git(ds.path)
    lvl2relpath = opj('lvl1', 'lvl2')
    lvl2path = opj(ds.path, lvl2relpath)
    os.makedirs(lvl2path)
    os.makedirs(opj(ds.path, 'lvl1', 'empty'))
    with open(opj(lvl2path, 'file'), 'w') as f:
        f.write('some')
    ok_(ds.save(auto_add_changes=True))
    # later create subdataset in a fresh dir
    subds1 = ds.create(opj('lvl1', 'subds'))
    ok_clean_git(ds.path)
    eq_(ds.get_subdatasets(), [opj('lvl1', 'subds')])
    # later create subdataset in an existing empty dir
    subds2 = ds.create(opj('lvl1', 'empty'))
    ok_clean_git(ds.path)
    # later try to wrap existing content into a new subdataset
    # but that won't work
    assert_raises(ValueError, ds.create, lvl2relpath)
    # not even with force, as doing this properly would require complicated
    # surgery
    assert_raises(CommandError, ds.create, lvl2relpath, force=True)
    # only way to make it work is to unannex the content upfront
    ds.repo._run_annex_command('unannex',
                               annex_options=[opj(lvl2relpath, 'file')])
    # nothing to save, git-annex commits the unannex itself
    ok_(not ds.save())
    # still nothing without force
    # "err='lvl1/lvl2' already exists in the index"
    assert_raises(ValueError, ds.create, lvl2relpath)
    # XXX even force doesn't help, because (I assume) GitPython doesn't update
    # its representation of the Git index properly
    assert_raises(CommandError, ds.create, lvl2relpath, force=True)
    # it is not GitPython's state that is at fault here, test with a fresh
    # dataset instance
    ds = Dataset(ds.path)
    assert_raises(CommandError, ds.create, lvl2relpath, force=True)
    # it seems we are at fault here
    rmtree(opj(lvl2path, '.git'))
    assert_raises(CommandError, ds.repo.add_submodule, lvl2relpath)
    # despite the failure:
    assert_in(lvl2relpath, ds.get_subdatasets())
def test_is_installed(src, path):
    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    AnnexRepo.clone(src, path)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())

    # We must not be able to create a new repository under a known
    # subdataset path.
    # Note: Unfortunately we would still be able to generate it under a
    # subdirectory within the submodule, e.g. `subm 1/subdir`, but that is
    # not checked here. `rev-create` will provide that protection
    # when create/rev-create merge.
    res = subds.rev_create(on_failure='ignore',
                           return_type='list',
                           result_filter=None,
                           result_xfm=None)
    assert_result_count(res, 1)
    assert_result_count(
        res, 1, status='error', path=subds.path,
        message=('collision with content in parent dataset at %s: %s',
                 ds.path, [subds.path]))
    # get the submodule
    # This would init so there is a .git file with symlink info, which,
    # as we agreed, is more pain than gain, so let's use our install,
    # which does it right; after all we are checking 'is_installed' ;)
    # from datalad.cmd import Runner
    # Runner().run(['git', 'submodule', 'update', '--init', 'subm 1'], cwd=path)
    with chpwd(path):
        get('subm 1')
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
def test_is_installed(src=None, path=None):
    ca = dict(result_renderer='disabled')
    # a remote dataset with a subdataset underneath
    origds = Dataset(src).create(**ca)
    _ = origds.create('subm 1', **ca)

    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    clone(src, path, **ca)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(ds.pathobj / 'subm 1')
    assert_false(subds.is_installed())

    # We must not be able to create a new repository under a known
    # subdataset path.
    # Note: Unfortunately we would still be able to generate it under
    # subdirectory within submodule, e.g. `subm 1/subdir` but that is
    # not checked here. `create` provides that protection though.
    res = subds.create(on_failure='ignore',
                       return_type='list',
                       result_filter=None,
                       result_xfm=None,
                       **ca)
    assert_result_count(res, 1)
    assert_result_count(
        res, 1, status='error', path=subds.path,
        message=('collision with %s (dataset) in dataset %s',
                 subds.path, ds.path))
    # get the submodule
    with chpwd(ds.path):
        get('subm 1', **ca)
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
def test_get_missing(path):
    repo = GitRepo(path, create=True)
    os.makedirs(op.join(path, 'deep'))
    with open(op.join(path, 'test1'), 'w') as f:
        f.write('some')
    with open(op.join(path, 'deep', 'test2'), 'w') as f:
        f.write('some more')
    repo.add('.')
    repo.commit()
    ok_clean_git(path, annex=False)
    os.unlink(op.join(path, 'test1'))
    eq_(repo.get_missing_files(), ['test1'])
    rmtree(op.join(path, 'deep'))
    eq_(sorted(repo.get_missing_files()), [op.join('deep', 'test2'), 'test1'])
    # nothing is actually known to be deleted
    eq_(repo.get_deleted_files(), [])
    # do proper removal
    repo.remove(op.join(path, 'test1'))
    # no longer missing
    eq_(repo.get_missing_files(), [op.join('deep', 'test2')])
    # but deleted
    eq_(repo.get_deleted_files(), ['test1'])
def test_is_installed(src, path):
    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    AnnexRepo.clone(src, path)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    subds.create()
    # get the submodule
    # This would init so there is a .git file with symlink info, which,
    # as we agreed, is more pain than gain, so let's use our install,
    # which does it right; after all we are checking 'is_installed' ;)
    # from datalad.cmd import Runner
    # Runner().run(['git', 'submodule', 'update', '--init', 'subm 1'], cwd=path)
    with chpwd(path):
        get('subm 1')
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
def _kill_dataset(ds):
    """This is a harsh internal helper: it will wipe out a dataset, no checks
    """
    # figure out whether we should be nice to a superdataset later on
    has_super = ds.get_superdataset(topmost=False, registered_only=True)
    # Close any process etc. possibly associated with the underlying repo.
    # Otherwise rmtree could fail, e.g. under NFS, where files still opened
    # by such processes leave .nfs00000xxxx placeholders behind that prevent
    # rmdir (and thus rmtree) from removing the directory
    ds.close()
    rmtree(ds.path)
    # invalidate loaded ConfigManager -- datasets are singletons!!
    ds._cfg = None
    if has_super:
        # recreate an empty mountpoint to make Git happier
        ds.pathobj.mkdir(exist_ok=True)
    yield dict(
        # keep uninstall to please the gods of a distant past
        #action='drop',
        action='uninstall',
        path=ds.path,
        type='dataset',
        status='ok',
    )
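# Illustrative sketch only, prompted by the NFS comments above: a retrying
# rmtree wrapper for the case where .nfsXXXX placeholder files linger because
# some process still holds a file open. This is not datalad.utils.rmtree.
import shutil
import time

def rmtree_with_retries(path, attempts=3, delay=0.5):
    for attempt in range(attempts):
        try:
            shutil.rmtree(path)
            return
        except OSError:
            if attempt == attempts - 1:
                raise
            # give whatever held the files open a moment to let go
            time.sleep(delay)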
def test_no_blows(cookiesdir):
    cookies = CookiesDB(op.join(cookiesdir, 'mycookies'))
    # set the cookie
    cookies['best'] = 'mine'
    assert_equal(cookies['best'], 'mine')
    """
    Somehow this manages to trigger on conda but not on debian for me

      File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/shelve.py", line 125, in __setitem__
        self.dict[key.encode(self.keyencoding)] = f.getvalue()
      File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/dbm/dumb.py", line 216, in __setitem__
        self._index[key] = self._setval(pos, val)
      File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/dbm/dumb.py", line 178, in _setval
        with _io.open(self._datfile, 'rb+') as f:
    FileNotFoundError: [Errno 2] No such file or directory: '/home/yoh/.tmp/datalad_temp_test_no_blowsalnsw_wk/mycookies.dat'

    on Debian (python 3.7.3~rc1-1) I just get a warning:
    BDB3028 /home/yoh/.tmp/datalad_temp_test_no_blows58tdg67s/mycookies.db: unable to flush: No such file or directory
    """
    try:
        rmtree(cookiesdir)
    except OSError:
        # on NFS the directory might still be open, so a .nfs* lock file would
        # prevent removal, but it shouldn't matter and .close should succeed
        pass
    cookies.close()
def postclonecfg_annexdataset(ds, reckless, description=None):
    """If ds "knows annex" -- annex init it, set into reckless etc

    Provides additional tune-up to what is possibly an annex repo, e.g.
    "enables" reckless mode, sets up description
    """
    # in any case check whether we need to annex-init the installed thing:
    if not knows_annex(ds.path):
        # not for us
        return

    # init annex when traces of a remote annex can be detected
    if reckless == 'auto':
        lgr.debug(
            "Instruct annex to hardlink content in %s from local "
            "sources, if possible (reckless)", ds.path)
        ds.config.set(
            'annex.hardlink', 'true', where='local', reload=True)

    lgr.debug("Initializing annex repo at %s", ds.path)
    # Note, that we cannot enforce annex-init via AnnexRepo().
    # If such an instance already exists, its __init__ will not be executed.
    # Therefore do quick test once we have an object and decide whether to call
    # its _init().
    #
    # Additionally, call init if we need to add a description (see #1403),
    # since AnnexRepo.__init__ can only do it with create=True
    repo = AnnexRepo(ds.path, init=True)
    if not repo.is_initialized() or description:
        repo._init(description=description)
    if reckless == 'auto' or (reckless and reckless.startswith('shared-')):
        repo.call_annex(['untrust', 'here'])
    elif reckless == 'ephemeral':
        # with ephemeral we declare 'here' as 'dead' right away, whenever
        # we symlink origin's annex, since availability from 'here' should
        # not be propagated for an ephemeral clone when we publish back to
        # origin.
        # This will cause stuff like this for a locally present annexed file:
        # % git annex whereis d1
        # whereis d1 (0 copies) failed
        # BUT this works:
        # % git annex find . --not --in here
        # % git annex find . --in here
        # d1

        # we don't want annex copy-to origin
        ds.config.set(
            'remote.origin.annex-ignore', 'true',
            where='local')

        ds.repo.set_remote_dead('here')

        if check_symlink_capability(ds.repo.dot_git / 'dl_link_test',
                                    ds.repo.dot_git / 'dl_target_test'):
            # symlink the annex to avoid needless copies in an ephemeral clone
            annex_dir = ds.repo.dot_git / 'annex'
            origin_annex_url = ds.config.get("remote.origin.url", None)
            origin_git_path = None
            if origin_annex_url:
                try:
                    # Deal with file:// scheme URLs as well as plain paths.
                    # If origin isn't local, we have nothing to do.
                    origin_git_path = Path(RI(origin_annex_url).localpath)

                    # we are local; check for a bare repo first to not mess w/
                    # the path
                    if GitRepo(origin_git_path, create=False).bare:
                        # origin is a bare repo -> use path as is
                        pass
                    elif origin_git_path.name != '.git':
                        origin_git_path /= '.git'
                except ValueError:
                    # Note, that accessing localpath on a non-local RI throws
                    # ValueError rather than resulting in an AttributeError.
                    # TODO: Warning level okay or is info level sufficient?
                    # Note, that setting annex-dead is independent of
                    # symlinking .git/annex. It might still make sense to
                    # have an ephemeral clone that doesn't propagate its avail.
                    # info. Therefore don't fail altogether.
                    lgr.warning("reckless=ephemeral mode: origin doesn't seem "
                                "local: %s\nno symlinks being used",
                                origin_annex_url)

            if origin_git_path:
                # TODO make sure that we do not delete any unique data
                rmtree(str(annex_dir)) \
                    if not annex_dir.is_symlink() else annex_dir.unlink()
                annex_dir.symlink_to(origin_git_path / 'annex',
                                     target_is_directory=True)
        else:
            # TODO: What level? Also note that annex-dead is independent of
            # symlinking .git/annex
            lgr.warning("reckless=ephemeral mode: Unable to create symlinks "
                        "on this file system.")

    srs = {True: [], False: []}  # special remotes by "autoenable" key
    remote_uuids = None  # might be necessary to discover known UUIDs

    repo_config = repo.config
    # Note: The purpose of this function is to inform the user. So if something
    # looks misconfigured, we'll warn and move on to the next item.
    for uuid, config in repo.get_special_remotes().items():
        sr_name = config.get('name', None)
        if sr_name is None:
            lgr.warning(
                'Ignoring special remote %s because it does not have a name. '
                'Known information: %s',
                uuid, config)
            continue
        sr_autoenable = config.get('autoenable', False)
        try:
            sr_autoenable = ensure_bool(sr_autoenable)
        except ValueError:
            lgr.warning(
                'Failed to process "autoenable" value %r for sibling %s in '
                'dataset %s as bool. '
                'You might need to enable it later manually and/or fix it up '
                'to avoid this message in the future.',
                sr_autoenable, sr_name, ds.path)
            continue

        # If it looks like a type=git special remote, make sure we have up to
        # date information. See gh-2897.
        if sr_autoenable and repo_config.get("remote.{}.fetch".format(sr_name)):
            try:
                repo.fetch(remote=sr_name)
            except CommandError as exc:
                lgr.warning("Failed to fetch type=git special remote %s: %s",
                            sr_name, exc_str(exc))

        # determine whether there is a registered remote with matching UUID
        if uuid:
            if remote_uuids is None:
                remote_uuids = {
                    # Check annex-config-uuid first. For sameas annex remotes,
                    # this will point to the UUID for the configuration (i.e.
                    # the key returned by get_special_remotes) rather than the
                    # shared UUID.
                    (repo_config.get('remote.%s.annex-config-uuid' % r) or
                     repo_config.get('remote.%s.annex-uuid' % r))
                    for r in repo.get_remotes()
                }
            if uuid not in remote_uuids:
                srs[sr_autoenable].append(sr_name)

    if srs[True]:
        lgr.debug(
            "configuration for %s %s added because of autoenable,"
            " but no UUIDs for them yet known for dataset %s",
            # since we are only at debug level, we could call things their
            # proper names
            single_or_plural("special remote",
                             "special remotes", len(srs[True]), True),
            ", ".join(srs[True]),
            ds.path
        )

    if srs[False]:
        # if has no auto-enable special remotes
        lgr.info(
            'access to %s %s not auto-enabled, enable with:\n'
            '\t\tdatalad siblings -d "%s" enable -s %s',
            # but since humans might read it, we better confuse them with our
            # own terms!
            single_or_plural("dataset sibling",
                             "dataset siblings", len(srs[False]), True),
            ", ".join(srs[False]),
            ds.path,
            srs[False][0] if len(srs[False]) == 1 else "SIBLING",
        )

    # we have just cloned the repo, so it has 'origin', configure any
    # reachable origin of origins
    yield from configure_origins(ds, ds)
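# Hypothetical helper distilling the "ephemeral clone" trick from
# postclonecfg_annexdataset() above: replace a clone's .git/annex directory
# with a symlink to the origin's annex so no object copies are made. Not part
# of datalad; shown only to isolate the symlinking step.
import shutil
from pathlib import Path

def symlink_annex_to_origin(clone_git_dir, origin_git_dir):
    annex_dir = Path(clone_git_dir) / 'annex'
    if annex_dir.is_symlink():
        annex_dir.unlink()
    elif annex_dir.exists():
        # make sure no unique data would be lost before doing this for real
        shutil.rmtree(str(annex_dir))
    annex_dir.symlink_to(Path(origin_git_dir) / 'annex',
                         target_is_directory=True)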
def clone_dataset(
        srcs,
        destds,
        reckless=None,
        description=None,
        result_props=None,
        cfg=None):
    """Internal helper to perform cloning without sanity checks (assumed done)

    This helper does not handle any saving of subdataset modification or adding
    in a superdataset.

    Parameters
    ----------
    srcs : list
      Any suitable clone source specifications (paths, URLs)
    destds : Dataset
      Dataset instance for the clone destination
    reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional
      Mode switch to put cloned dataset into unsafe/throw-away configurations,
      i.e. sacrifice data safety for performance or resource footprint. When
      None and `cfg` is specified, use the value of `datalad.clone.reckless`.
    description : str, optional
      Location description for the annex of the dataset clone (if there is
      any).
    result_props : dict, optional
      Default properties for any yielded result, passed on to
      get_status_dict().
    cfg : ConfigManager, optional
      Configuration for parent dataset. This will be queried instead of the
      global DataLad configuration.

    Yields
    ------
    dict
      DataLad result records
    """
    if not result_props:
        # in case the caller had no specific idea on how results should look
        # like, provide sensible defaults
        result_props = dict(
            action='install',
            logger=lgr,
            ds=destds,
        )

    if reckless is None and cfg:
        # if reckless is not explicitly given, but we operate on a
        # superdataset, query whether it has been instructed to operate
        # in a reckless mode, and inherit it for the coming clone
        reckless = cfg.get('datalad.clone.reckless', None)

    dest_path = destds.pathobj

    # decode all source candidate specifications
    candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs]

    # now expand the candidate sources with additional variants of the decoded
    # giturl, while duplicating the other properties in the additional records
    # for simplicity. The hope is to overcome a few corner cases and be more
    # robust than git clone
    candidate_sources = [
        dict(props, giturl=s) for props in candidate_sources
        for s in _get_flexible_source_candidates(props['giturl'])
    ]

    # important test! based on this, `rmtree` will happen below after a
    # failed clone
    dest_path_existed = dest_path.exists()
    if dest_path_existed and any(dest_path.iterdir()):
        if destds.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destds)
            try:
                # this will get us track_url in system native path conventions,
                # whenever it is a path (and not a URL)
                # this is needed to match it to any potentially incoming local
                # source path in the 'notneeded' test below
                track_path = str(Path(track_url))
            except Exception:
                # this should never happen, because Path() will let any
                # non-path stringification pass through unmodified, but we do
                # not want any potential crash due to pathlib behavior changes
                lgr.debug("Unexpected behavior of pathlib!")
                track_path = None
            for cand in candidate_sources:
                src = cand['giturl']
                if track_url == src \
                        or (not is_url(track_url)
                            and get_local_file_url(track_url,
                                                   compatibility='git') == src) \
                        or track_path == expanduser(src):
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destds, src),
                        **result_props)
                    return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to '
                    'clone into target path',
            **result_props)
        return

    log_progress(
        lgr.info,
        'cloneds',
        'Cloning dataset to %s', destds,
        total=len(candidate_sources),
        label='Clone attempt',
        unit=' Candidate locations',
    )
    # accumulate all error messages formatted per each url
    error_msgs = OrderedDict()
    for cand in candidate_sources:
        log_progress(
            lgr.info,
            'cloneds',
            'Attempting to clone from %s to %s', cand['giturl'], dest_path,
            update=1,
            increment=True)
        clone_opts = {}

        if cand.get('version', None):
            clone_opts['branch'] = cand['version']
        try:
            # TODO for now GitRepo.clone() cannot handle Path instances, and
            # PY35 doesn't make it happen seamlessly
            GitRepo.clone(
                path=str(dest_path),
                url=cand['giturl'],
                clone_options=clone_opts,
                create=True)

        except CommandError as e:
            e_stderr = e.stderr

            error_msgs[cand['giturl']] = e
            lgr.debug("Failed to clone from URL: %s (%s)",
                      cand['giturl'], exc_str(e))
            if dest_path.exists():
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                # TODO stringification can be removed once pathlib compatible
                # or if PY35 is no longer supported
                rmtree(str(dest_path), children_only=dest_path_existed)

            if e_stderr and 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                # cancel progress bar
                log_progress(
                    lgr.info,
                    'cloneds',
                    'Completed clone attempts for %s', destds
                )
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1).strip()
                    if re_match else "stderr: " + e_stderr,
                    **result_props)
                return
            # next candidate
            continue

        result_props['source'] = cand
        # do not bother with other sources if succeeded
        break

    log_progress(
        lgr.info,
        'cloneds',
        'Completed clone attempts for %s', destds
    )

    if not destds.is_installed():
        if len(error_msgs):
            if all(not e.stdout and not e.stderr for e in error_msgs.values()):
                # there is nothing we can learn from the actual exception,
                # the exit code is uninformative, the command is predictable
                error_msg = "Failed to clone from all attempted sources: %s"
                error_args = list(error_msgs.keys())
            else:
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were:\n- %s"
                error_args = '\n- '.join(
                    '{}\n  {}'.format(url, exc_str(exc))
                    for url, exc in error_msgs.items()
                )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            # be a case when this might happen and original error would
            # not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'successful' source was: %s"
            error_args = (destds.path, cand['giturl'])
        yield get_status_dict(
            status='error',
            message=(error_msg, error_args),
            **result_props)
        return

    if not cand.get("version"):
        postclone_check_head(destds)

    # act on --reckless=shared-...
    # must happen prior git-annex-init, where we can cheaply alter the repo
    # setup through safe re-init'ing
    if reckless and reckless.startswith('shared-'):
        lgr.debug('Reinit %s to enable shared access permissions', destds)
        destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])])

    yield from postclonecfg_annexdataset(
        destds,
        reckless,
        description)

    # perform any post-processing that needs to know details of the clone
    # source
    if result_props['source']['type'] == 'ria':
        yield from postclonecfg_ria(destds, result_props['source'])

    if reckless:
        # store the reckless setting in the dataset to make it
        # known to later clones of subdatasets via get()
        destds.config.set(
            'datalad.clone.reckless', reckless,
            where='local',
            reload=True)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **result_props)
def __call__(
        source,
        path=None,
        dataset=None,
        description=None,
        reckless=False,
        alt_sources=None):
    # TODO next ones should be there, but cannot go anywhere
    # git_opts=None,
    # git_clone_opts=None,
    # annex_opts=None,
    # annex_init_opts=None

    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    dataset = require_dataset(
        dataset, check_installed=True, purpose='cloning') \
        if dataset is not None else dataset
    refds_path = dataset.path if dataset else None

    if isinstance(source, Dataset):
        source = source.path

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        raise ValueError(
            "clone `source` and destination `path` are identical [{}]. "
            "If you are trying to add a subdataset simply use `add`".format(
                path))

    if path is not None:
        path = resolve_path(path, dataset)

    # Possibly do conversion from source into a git-friendly url
    # luckily GitRepo will undo any fancy file:/// url to make use of Git's
    # optimization for local clones....
    source_url = source
    source_ = _get_git_url_from_source(source)
    lgr.debug("Resolved clone source from '%s' to '%s'",
              source, source_)
    source = source_

    # derive target from source:
    if path is None:
        # we got nothing but a source. do something similar to git clone
        # and derive the path from the source and continue
        path = _get_installationpath_from_url(source)
        # since this is a relative `path`, resolve it:
        path = resolve_path(path, dataset)
        lgr.debug("Determined clone target path from source")
    lgr.debug("Resolved clone target path to: '%s'", path)

    # there is no other way -- my intoxicated brain tells me
    assert(path is not None)

    destination_dataset = Dataset(path)
    dest_path = path

    status_kwargs = dict(
        action='install',
        ds=destination_dataset,
        logger=lgr,
        refds=refds_path,
        source_url=source_url)

    # important test! based on this, `rmtree` will happen below after a
    # failed clone
    if exists(dest_path) and listdir(dest_path):
        if destination_dataset.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            guessed_sources = _get_flexible_source_candidates(
                source, dest_path)
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destination_dataset)
            if track_url in guessed_sources or \
                    get_local_file_url(track_url) in guessed_sources:
                yield get_status_dict(
                    status='notneeded',
                    message=("dataset %s was already cloned from '%s'",
                             destination_dataset, source),
                    **status_kwargs)
                return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to '
                    'clone into target path',
            **status_kwargs)
        return

    if dataset is not None and \
            relpath(path, start=dataset.path).startswith(pardir):
        yield get_status_dict(
            status='error',
            message=("clone target path '%s' not in specified target "
                     "dataset '%s'", path, dataset),
            **status_kwargs)
        return

    # generate candidate URLs from source argument to overcome a few corner
    # cases and hopefully be more robust than git clone
    candidate_sources = []
    # combine all given sources (incl. alternatives), maintain order
    for s in [source] + assure_list(alt_sources):
        candidate_sources.extend(_get_flexible_source_candidates(s))
    candidates_str = \
        " [%d other candidates]" % (len(candidate_sources) - 1) \
        if len(candidate_sources) > 1 \
        else ''
    lgr.info("Cloning %s%s into '%s'",
             source, candidates_str, dest_path)
    dest_path_existed = exists(dest_path)
    # accumulate all error messages formatted per each url
    error_msgs = OrderedDict()
    for isource_, source_ in enumerate(candidate_sources):
        try:
            lgr.debug("Attempting to clone %s (%d out of %d candidates) "
                      "to '%s'",
                      source_, isource_ + 1, len(candidate_sources), dest_path)
            GitRepo.clone(path=dest_path, url=source_, create=True)
            break  # do not bother with other sources if succeeded
        except GitCommandError as e:
            error_msgs[source_] = exc_str_ = exc_str(e)
            lgr.debug("Failed to clone from URL: %s (%s)",
                      source_, exc_str_)
            if exists(dest_path):
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                rmtree(dest_path, children_only=dest_path_existed)
            # Whenever progress reporting is enabled, as it is now,
            # we end up without e.stderr since it is "processed" out by
            # GitPython/our progress handler.
            e_stderr = e.stderr
            from datalad.support.gitrepo import GitPythonProgressBar
            if not e_stderr and GitPythonProgressBar._last_error_lines:
                e_stderr = os.linesep.join(
                    GitPythonProgressBar._last_error_lines)
            if 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1)
                    if re_match else "stderr: " + e_stderr,
                    **status_kwargs)
                return

    if not destination_dataset.is_installed():
        if len(error_msgs):
            error_msg = "Failed to clone from any candidate source URL. " \
                        "Encountered errors per each url were: %s"
            error_args = (error_msgs, )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            # be a case when this might happen and original error would
            # not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'successful' source was: %s"
            error_args = (destination_dataset.path, source_)
        yield get_status_dict(
            status='error',
            message=(error_msg, error_args),
            **status_kwargs)
        return

    if dataset is not None:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.save(
                dest_path,
                return_type='generator',
                result_filter=None,
                result_xfm=None,
                on_failure='ignore'):
            yield r

    _handle_possible_annex_dataset(
        destination_dataset,
        reckless,
        description=description)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **status_kwargs)
def teardown(self, tarfile_path):
    for path in [self.ds.path + '_', self.ds.path]:
        print("Cleaning up %s" % path)
        if osp.exists(path):
            rmtree(path)
def teardown(self):
    if osp.exists(self.path):
        rmtree(self.path)