def test_GitRepo_fetch(test_path, orig_path, clone_path):

    origin = GitRepo.clone(test_path, orig_path)
    clone = GitRepo.clone(orig_path, clone_path)
    filename = get_most_obscure_supported_name()

    origin.checkout("new_branch", ['-b'])
    with open(op.join(orig_path, filename), 'w') as f:
        f.write("New file.")
    origin.add(filename)
    origin.commit("new file added.")

    fetched = clone.fetch(remote='origin')
    # test FetchInfo list returned by fetch
    eq_([u'origin/' + clone.get_active_branch(), u'origin/new_branch'],
        [commit.name for commit in fetched])

    ok_clean_git(clone.path, annex=False)
    assert_in("origin/new_branch", clone.get_remote_branches())
    assert_in(filename, clone.get_files("origin/new_branch"))
    assert_false(op.exists(op.join(clone_path, filename)))  # not checked out

    # create a remote without an URL:
    origin.add_remote('not-available', 'git://example.com/not/existing')
    origin.config.unset('remote.not-available.url', where='local')

    # fetch without provided URL
    fetched = origin.fetch('not-available')
    # nothing was done, nothing returned:
    eq_([], fetched)
def test_GitRepo_pull(test_path, orig_path, clone_path):

    origin = GitRepo.clone(test_path, orig_path)
    clone = GitRepo.clone(orig_path, clone_path)
    filename = get_most_obscure_supported_name()

    with open(op.join(orig_path, filename), 'w') as f:
        f.write("New file.")
    origin.add(filename)
    origin.commit("new file added.")
    clone.pull()
    ok_(op.exists(op.join(clone_path, filename)))

    # While at it, let's test _get_remotes_having_commit a bit
    clone.add_remote("very_origin", test_path)
    clone.fetch("very_origin")
    eq_(clone._get_remotes_having_commit(clone.get_hexsha()),
        ['origin'])
    prev_commit = clone.get_hexsha('HEAD^')
    eq_(set(clone._get_remotes_having_commit(prev_commit)),
        {'origin', 'very_origin'})
def test_knows_annex(here, there):
    from datalad.support.gitrepo import GitRepo
    from datalad.support.annexrepo import AnnexRepo
    GitRepo(path=here, create=True)
    assert_false(knows_annex(here))
    AnnexRepo(path=here, create=True)
    assert_true(knows_annex(here))
    GitRepo.clone(path=there, url=here, create=True)
    assert_true(knows_annex(there))
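# For context, a minimal sketch of what a knows_annex()-style helper, as
# exercised by the test above, could look like.  This is an assumption for
# illustration (not necessarily datalad's actual implementation): a repository
# "knows" about git-annex if a 'git-annex' branch shows up among its local or
# remote branches -- which is also why a fresh clone of an annex repo already
# passes, via 'origin/git-annex'.
def _knows_annex_sketch(path):
    from datalad.support.gitrepo import GitRepo
    repo = GitRepo(path, create=False)
    branches = list(repo.get_branches()) + list(repo.get_remote_branches())
    return any(b.endswith('git-annex') for b in branches)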
def _clone_from_any_source(sources, dest):
    # should not be the case, but we need to distinguish between failure
    # of git-clone, due to existing target and an unsuccessful clone
    # attempt. See below.
    existed = dest and exists(dest)
    for source_ in sources:
        try:
            lgr.debug("Retrieving a dataset from URL: "
                      "{0}".format(source_))
            with swallow_logs():
                GitRepo.clone(path=dest, url=source_, create=True)
            return source_  # do not bother with other sources if succeeded
        except GitCommandError as e:
            lgr.debug("Failed to retrieve from URL: "
                      "{0}".format(source_))
            if not existed and dest \
                    and exists(dest):
                lgr.debug("Wiping out unsuccessful clone attempt at "
                          "{}".format(dest))
                rmtree(dest)

            if source_ == sources[-1]:
                # Note: The following block is evaluated whenever we
                # fail even with the last try. Not nice, but currently
                # necessary until we get a more precise exception:
                ####################################
                # TODO: We may want to introduce a --force option to
                # overwrite the target.
                # TODO: Currently assuming if `existed` and there is a
                # GitCommandError means that these both things are connected.
                # Need newer GitPython to get stderr from GitCommandError
                # (already fixed within GitPython.)
                if existed:
                    # rudimentary check for an installed dataset at target:
                    # (TODO: eventually check for being the one, that this
                    # is about)
                    dest_ds = Dataset(dest)
                    if dest_ds.is_installed():
                        lgr.info("{0} appears to be installed already."
                                 "".format(dest_ds))
                        break
                    else:
                        lgr.warning("Target {0} already exists and is not "
                                    "an installed dataset. Skipped."
                                    "".format(dest))
                        # Keep original in debug output:
                        lgr.debug("Original failure:{0}"
                                  "{1}".format(linesep, exc_str(e)))
                        return None
                ##################
                # Re-raise if failed even with the last candidate
                lgr.debug("Unable to establish repository instance at "
                          "{0} from {1}"
                          "".format(dest, sources))
                raise
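# A hedged usage sketch for _clone_from_any_source() above; `candidate_urls`
# and `target_dir` are made-up names for illustration.  The helper returns the
# first source that cloned successfully, returns None when the target path
# already existed (installed dataset or not), and re-raises the last
# GitCommandError when every candidate failed against a fresh target.
candidate_urls = [
    'git://example.com/ds.git',           # may be unreachable
    'https://example.com/mirror/ds.git',  # fallback mirror
]
target_dir = '/tmp/ds-clone'
used_source = _clone_from_any_source(candidate_urls, target_dir)
if used_source is not None:
    lgr.info("Cloned from %s", used_source)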
def test_GitRepo_get_remote_url(orig_path, path):

    gr = GitRepo.clone(orig_path, path)
    gr.add_remote('github', 'git://github.com/datalad/testrepo--basic--r1')
    eq_(gr.get_remote_url('origin'), orig_path)
    eq_(gr.get_remote_url('github'),
        'git://github.com/datalad/testrepo--basic--r1')
def test_GitRepo_add(src, path):

    gr = GitRepo.clone(src, path)
    filename = get_most_obscure_supported_name()
    with open(op.join(path, filename), 'w') as f:
        f.write("File to add to git")
    added = gr.add(filename)

    eq_(added, {'success': True, 'file': filename})
    assert_in(filename, gr.get_indexed_files(),
              "%s not successfully added to %s" % (filename, path))
    # uncommitted:
    ok_(gr.dirty)

    filename = "another.txt"
    with open(op.join(path, filename), 'w') as f:
        f.write("Another file to add to git")

    assert_raises(AssertionError, gr.add, filename, git=False)
    assert_raises(AssertionError, gr.add, filename, git=None)

    # include committing:
    added2 = gr.add(filename)
    gr.commit(msg="Add two files.")
    eq_(added2, {'success': True, 'file': filename})

    assert_in(filename, gr.get_indexed_files(),
              "%s not successfully added to %s" % (filename, path))
    ok_clean_git(path)
def test_optimized_cloning(path):
    # make test repo with one file and one commit
    originpath = op.join(path, 'origin')
    repo = GitRepo(originpath, create=True)
    with open(op.join(originpath, 'test'), 'w') as f:
        f.write('some')
    repo.add('test')
    repo.commit('init')
    ok_clean_git(originpath, annex=False)
    from glob import glob

    def _get_inodes(repo):
        return dict([
            (os.path.join(*o.split(os.sep)[-2:]),
             os.stat(o).st_ino)
            for o in glob(os.path.join(repo.repo.git_dir,
                                       'objects', '*', '*'))
        ])

    origin_inodes = _get_inodes(repo)
    # now clone it in different ways and see what happens to the object storage
    from datalad.support.network import get_local_file_url
    clonepath = op.join(path, 'clone')
    for src in (originpath, get_local_file_url(originpath)):
        # deprecated
        assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath)
        clone = GitRepo.clone(url=src, path=clonepath, create=True)
        # git hardlinks object files when cloning from a local path, and
        # GitRepo.clone is expected to turn file:// URLs back into plain
        # paths, so the clone should reference the very same inodes
        clone_inodes = _get_inodes(clone)
        eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src))
        rmtree(clonepath)
def test_GitRepo_remote_remove(orig_path, path):

    gr = GitRepo.clone(orig_path, path)
    gr.add_remote('github', 'git://github.com/datalad/testrepo--basic--r1')
    gr.remove_remote('github')
    out = gr.get_remotes()
    eq_(len(out), 1)
    assert_in('origin', out)
def test_GitRepo_remote_add(orig_path, path):

    gr = GitRepo.clone(orig_path, path)
    out = gr.get_remotes()
    assert_in('origin', out)
    eq_(len(out), 1)
    gr.add_remote('github', 'git://github.com/datalad/testrepo--basic--r1')
    out = gr.get_remotes()
    assert_in('origin', out)
    assert_in('github', out)
    eq_(len(out), 2)
    eq_('git://github.com/datalad/testrepo--basic--r1',
        gr.config['remote.github.url'])
def test_GitRepo_get_indexed_files(src, path):

    gr = GitRepo.clone(src, path)
    idx_list = gr.get_indexed_files()

    runner = Runner()
    out = runner(['git', 'ls-files'], cwd=path)
    out_list = list(filter(bool, out[0].split('\n')))

    for item in idx_list:
        assert_in(item, out_list,
                  "%s not found in output of git ls-files in %s"
                  % (item, path))
    for item in out_list:
        assert_in(item, idx_list,
                  "%s not found in output of get_indexed_files in %s"
                  % (item, path))
def test_GitRepo_push_n_checkout(orig_path, clone_path):

    origin = GitRepo(orig_path)
    clone = GitRepo.clone(orig_path, clone_path)
    filename = get_most_obscure_supported_name()

    with open(op.join(clone_path, filename), 'w') as f:
        f.write("New file.")
    clone.add(filename)
    clone.commit("new file added.")
    # TODO: need checkout first:
    clone.push('origin', '+master:new-branch')
    origin.checkout('new-branch')
    ok_(op.exists(op.join(orig_path, filename)))
def test_GitRepo_instance_from_clone(src, dst):

    gr = GitRepo.clone(src, dst)
    assert_is_instance(gr, GitRepo, "GitRepo was not created.")
    assert_is_instance(gr.repo, gitpy.Repo,
                       "Failed to instantiate GitPython Repo object.")
    ok_(op.exists(op.join(dst, '.git')))

    # do it again should raise GitCommandError since git will notice there's
    # already a git-repo at that path and therefore can't clone to `dst`
    # Note: Since GitRepo is now a WeakSingletonRepo, this is prevented from
    # happening atm. Disabling for now:
    # raise SkipTest("Disabled for RF: WeakSingletonRepo")
    with swallow_logs() as logs:
        assert_raises(GitCommandError, GitRepo.clone, src, dst)
def test_get_tracking_branch(o_path, c_path):

    clone = GitRepo.clone(o_path, c_path)
    # Note, that the default branch might differ even if it is always 'master'.
    # For direct mode annex repositories it would then be "annex/direct/master"
    # for example. Therefore use whatever branch is checked out by default:
    master_branch = clone.get_active_branch()
    ok_(master_branch)

    eq_(('origin', 'refs/heads/' + master_branch),
        clone.get_tracking_branch())

    clone.checkout('new_branch', ['-b'])

    eq_((None, None), clone.get_tracking_branch())

    eq_(('origin', 'refs/heads/' + master_branch),
        clone.get_tracking_branch(master_branch))
def test_GitRepo_get_files(url, path):

    gr = GitRepo.clone(url, path)

    # get the expected files via os for comparison:
    os_files = set()
    for (dirpath, dirnames, filenames) in os.walk(path):
        rel_dir = os.path.relpath(dirpath, start=path)
        if rel_dir.startswith(".git"):
            continue
        for file_ in filenames:
            file_path = os.path.normpath(op.join(rel_dir, file_))
            os_files.add(file_path)

    # get the files via GitRepo:
    local_files = set(gr.get_files())
    remote_files = set(gr.get_files(branch="origin/master"))

    eq_(local_files, set(gr.get_indexed_files()))
    eq_(local_files, remote_files)
    eq_(local_files, os_files)

    # create a different branch:
    gr.checkout('new_branch', ['-b'])
    filename = 'another_file.dat'
    with open(op.join(path, filename), 'w') as f:
        f.write("something")
    gr.add(filename)
    gr.commit("Added.")

    # now get the files again:
    local_files = set(gr.get_files())
    eq_(local_files, os_files.union({filename}))
    # retrieve remote branch again, which should not have changed:
    remote_files = set(gr.get_files(branch="origin/master"))
    eq_(remote_files, os_files)
    eq_(set([filename]), local_files.difference(remote_files))

    # switch back and query non-active branch:
    gr.checkout('master')
    local_files = set(gr.get_files())
    branch_files = set(gr.get_files(branch="new_branch"))
    eq_(set([filename]), branch_files.difference(local_files))
def clone_dataset(
        srcs,
        destds,
        reckless=None,
        description=None,
        result_props=None,
        cfg=None):
    """Internal helper to perform cloning without sanity checks (assumed done)

    This helper does not handle any saving of subdataset modification or
    adding in a superdataset.

    Parameters
    ----------
    srcs : list
      Any suitable clone source specifications (paths, URLs)
    destds : Dataset
      Dataset instance for the clone destination
    reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional
      Mode switch to put cloned dataset into unsafe/throw-away configurations,
      i.e. sacrifice data safety for performance or resource footprint.
      When None and `cfg` is specified, use the value of
      `datalad.clone.reckless`.
    description : str, optional
      Location description for the annex of the dataset clone (if there is
      any).
    result_props : dict, optional
      Default properties for any yielded result, passed on to
      get_status_dict().
    cfg : ConfigManager, optional
      Configuration for parent dataset. This will be queried instead of the
      global DataLad configuration.

    Yields
    ------
    dict
      DataLad result records
    """
    if not result_props:
        # in case the caller had no specific idea on how results should look
        # like, provide sensible defaults
        result_props = dict(
            action='install',
            logger=lgr,
            ds=destds,
        )

    if reckless is None and cfg:
        # if reckless is not explicitly given, but we operate on a
        # superdataset, query whether it has been instructed to operate
        # in a reckless mode, and inherit it for the coming clone
        reckless = cfg.get('datalad.clone.reckless', None)

    dest_path = destds.pathobj

    # decode all source candidate specifications
    candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs]

    # now expand the candidate sources with additional variants of the decoded
    # giturl, while duplicating the other properties in the additional records
    # for simplicity. The hope is to overcome a few corner cases and be more
    # robust than git clone
    candidate_sources = [
        dict(props, giturl=s) for props in candidate_sources
        for s in _get_flexible_source_candidates(props['giturl'])
    ]

    # important test! based on this `rmtree` will happen below after failed
    # clone
    dest_path_existed = dest_path.exists()
    if dest_path_existed and any(dest_path.iterdir()):
        if destds.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destds)
            try:
                # this will get us track_url in system native path conventions,
                # whenever it is a path (and not a URL)
                # this is needed to match it to any potentially incoming local
                # source path in the 'notneeded' test below
                track_path = str(Path(track_url))
            except Exception:
                # this should never happen, because Path() will let any
                # non-path stringification pass through unmodified, but we do
                # not want any potential crash due to pathlib behavior changes
                lgr.debug("Unexpected behavior of pathlib!")
                track_path = None
            for cand in candidate_sources:
                src = cand['giturl']
                if track_url == src \
                        or (not is_url(track_url)
                            and get_local_file_url(
                                track_url, compatibility='git') == src) \
                        or track_path == expanduser(src):
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destds,
                                 src),
                        **result_props)
                    return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to '
                    'clone into target path',
            **result_props)
        return

    log_progress(
        lgr.info,
        'cloneds',
        'Cloning dataset to %s', destds,
        total=len(candidate_sources),
        label='Clone attempt',
        unit=' Candidate locations',
    )
    # accumulate all error messages formatted per each url
    error_msgs = OrderedDict()
    for cand in candidate_sources:
        log_progress(
            lgr.info,
            'cloneds',
            'Attempting to clone from %s to %s', cand['giturl'], dest_path,
            update=1,
            increment=True)
        clone_opts = {}

        if cand.get('version', None):
            clone_opts['branch'] = cand['version']
        try:
            # TODO for now GitRepo.clone() cannot handle Path instances, and
            # PY35 doesn't make it happen seamlessly
            GitRepo.clone(
                path=str(dest_path),
                url=cand['giturl'],
                clone_options=clone_opts,
                create=True)

        except CommandError as e:
            e_stderr = e.stderr

            error_msgs[cand['giturl']] = e
            lgr.debug("Failed to clone from URL: %s (%s)",
                      cand['giturl'], exc_str(e))
            if dest_path.exists():
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                # TODO stringification can be removed once pathlib compatible
                # or if PY35 is no longer supported
                rmtree(str(dest_path), children_only=dest_path_existed)

            if e_stderr and 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                # cancel progress bar
                log_progress(
                    lgr.info,
                    'cloneds',
                    'Completed clone attempts for %s', destds
                )
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1).strip()
                    if re_match else "stderr: " + e_stderr,
                    **result_props)
                return
            # next candidate
            continue

        result_props['source'] = cand
        # do not bother with other sources if succeeded
        break

    log_progress(
        lgr.info,
        'cloneds',
        'Completed clone attempts for %s', destds
    )

    if not destds.is_installed():
        if len(error_msgs):
            if all(not e.stdout and not e.stderr
                   for e in error_msgs.values()):
                # there is nothing we can learn from the actual exception,
                # the exit code is uninformative, the command is predictable
                error_msg = "Failed to clone from all attempted sources: %s"
                error_args = list(error_msgs.keys())
            else:
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were:\n- %s"
                error_args = '\n- '.join(
                    '{}\n {}'.format(url, exc_str(exc))
                    for url, exc in error_msgs.items()
                )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            # be a case when this might happen and original error would
            # not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'successful' source was: %s"
            error_args = (destds.path, cand['giturl'])
        yield get_status_dict(
            status='error',
            message=(error_msg, error_args),
            **result_props)
        return

    if not cand.get("version"):
        postclone_check_head(destds)

    # act on --reckless=shared-...
    # must happen prior git-annex-init, where we can cheaply alter the repo
    # setup through safe re-init'ing
    if reckless and reckless.startswith('shared-'):
        lgr.debug('Reinit %s to enable shared access permissions', destds)
        destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])])

    yield from postclonecfg_annexdataset(
        destds,
        reckless,
        description)

    # perform any post-processing that needs to know details of the clone
    # source
    if result_props['source']['type'] == 'ria':
        yield from postclonecfg_ria(destds, result_props['source'])

    if reckless:
        # store the reckless setting in the dataset to make it
        # known to later clones of subdatasets via get()
        destds.config.set(
            'datalad.clone.reckless', reckless,
            where='local',
            reload=True)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **result_props)
def __call__(
        source,
        path=None,
        dataset=None,
        description=None,
        reckless=False,
        alt_sources=None):
    # TODO next ones should be there, but cannot go anywhere
    # git_opts=None,
    # git_clone_opts=None,
    # annex_opts=None,
    # annex_init_opts=None

    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    dataset = require_dataset(
        dataset, check_installed=True, purpose='cloning') \
        if dataset is not None else dataset
    refds_path = dataset.path if dataset else None

    if isinstance(source, Dataset):
        source = source.path

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        raise ValueError(
            "clone `source` and destination `path` are identical [{}]. "
            "If you are trying to add a subdataset simply use `add`".format(
                path))

    if path is not None:
        path = resolve_path(path, dataset)

    # Possibly do conversion from source into a git-friendly url
    # luckily GitRepo will undo any fancy file:/// url to make use of Git's
    # optimization for local clones....
    source_url = source
    source_ = _get_git_url_from_source(source)
    lgr.debug("Resolved clone source from '%s' to '%s'",
              source, source_)
    source = source_

    # derive target from source:
    if path is None:
        # we got nothing but a source. do something similar to git clone
        # and derive the path from the source and continue
        path = _get_installationpath_from_url(source)
        # since this is a relative `path`, resolve it:
        path = resolve_path(path, dataset)
        lgr.debug("Determined clone target path from source")
    lgr.debug("Resolved clone target path to: '%s'", path)

    # there is no other way -- my intoxicated brain tells me
    assert(path is not None)

    destination_dataset = Dataset(path)
    dest_path = path

    status_kwargs = dict(
        action='install',
        ds=destination_dataset,
        logger=lgr,
        refds=refds_path,
        source_url=source_url)

    # important test! based on this `rmtree` will happen below after failed
    # clone
    if exists(dest_path) and listdir(dest_path):
        if destination_dataset.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            guessed_sources = _get_flexible_source_candidates(
                source, dest_path)
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destination_dataset)
            if track_url in guessed_sources or \
                    get_local_file_url(track_url) in guessed_sources:
                yield get_status_dict(
                    status='notneeded',
                    message=("dataset %s was already cloned from '%s'",
                             destination_dataset,
                             source),
                    **status_kwargs)
                return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to '
                    'clone into target path',
            **status_kwargs)
        return

    if dataset is not None and \
            relpath(path, start=dataset.path).startswith(pardir):
        yield get_status_dict(
            status='error',
            message=("clone target path '%s' not in specified target dataset"
                     " '%s'", path, dataset),
            **status_kwargs)
        return

    # generate candidate URLs from source argument to overcome a few corner
    # cases and hopefully be more robust than git clone
    candidate_sources = []
    # combine all given sources (incl. alternatives), maintain order
    for s in [source] + assure_list(alt_sources):
        candidate_sources.extend(_get_flexible_source_candidates(s))
    candidates_str = \
        " [%d other candidates]" % (len(candidate_sources) - 1) \
        if len(candidate_sources) > 1 \
        else ''
    lgr.info("Cloning %s%s into '%s'",
             source, candidates_str, dest_path)
    dest_path_existed = exists(dest_path)
    # accumulate all error messages formatted per each url
    error_msgs = OrderedDict()
    for isource_, source_ in enumerate(candidate_sources):
        try:
            lgr.debug("Attempting to clone %s (%d out of %d candidates) "
                      "to '%s'",
                      source_, isource_ + 1, len(candidate_sources),
                      dest_path)
            GitRepo.clone(path=dest_path, url=source_, create=True)
            break  # do not bother with other sources if succeeded
        except GitCommandError as e:
            error_msgs[source_] = exc_str_ = exc_str(e)
            lgr.debug("Failed to clone from URL: %s (%s)",
                      source_, exc_str_)
            if exists(dest_path):
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                rmtree(dest_path, children_only=dest_path_existed)
            # Whenever progress reporting is enabled, as it is now,
            # we end up without e.stderr since it is "processed" out by
            # GitPython/our progress handler.
            e_stderr = e.stderr
            from datalad.support.gitrepo import GitPythonProgressBar
            if not e_stderr and GitPythonProgressBar._last_error_lines:
                e_stderr = os.linesep.join(
                    GitPythonProgressBar._last_error_lines)
            if 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1) if re_match
                    else "stderr: " + e_stderr,
                    **status_kwargs)
                return

    if not destination_dataset.is_installed():
        if len(error_msgs):
            error_msg = "Failed to clone from any candidate source URL. " \
                        "Encountered errors per each url were: %s"
            error_args = (error_msgs, )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            # be a case when this might happen and original error would
            # not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'successful' source was: %s"
            error_args = (destination_dataset.path, source_)
        yield get_status_dict(
            status='error',
            message=(error_msg, error_args),
            **status_kwargs)
        return

    if dataset is not None:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.save(
                dest_path,
                return_type='generator',
                result_filter=None,
                result_xfm=None,
                on_failure='ignore'):
            yield r

    _handle_possible_annex_dataset(
        destination_dataset,
        reckless,
        description=description)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **status_kwargs)
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content,
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes(
        [('metadata/objects/**', {'annex.largefiles': '(nothing)'})],
        dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes", files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows
    # incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed "
        "correctly" % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows
    # incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent
    # in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {b: list(_get_branch_commits(repo, b, limit='left-only'))
                 for b in branches}

    # all commits out there:
    # dataset init, crawler init
    #   (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    #   - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    #   2*remove of obsolete metadata object files,
    #     see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])
    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents),
    #    (commits_l['incoming-processed'][1],
    #     commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents),
    #    (commits_l['incoming-processed'][3],  # also in master
    #     commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in
    # commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles
    # processed: 6\n renamed: 2\n +annex: 3\nBranches merged:
    # incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents),
    #    (commits_l['master'][2],
    #     commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents),
    #    (commits_l['master'][4],
    #     commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some
    # checksum-ish shit ...
    # TODO: Check how those names are constructed and may be at least count
    # the number of created object files in addition to that comparison
    eq_(set([f for f in all_files
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first
    # one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and
    # committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as
    # skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was
    # processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version '
                '2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')

    # Ben: metadata object files may differ in their names containing some
    # checksum-ish shit ...
    # TODO: Check how those names are constructed and may be at least count
    # the number of created object files in addition to that comparison
    eq_(set([f for f in all_files_updated
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {b: list(_get_branch_commits(repo, b, limit='left-only'))
                  for b in branches}
    assert_not_equal(commits_hexsha, commits_hexsha_)
    # commit happened so stats were consumed
    eq_(out[0]['datalad_stats'], ActivityStats())
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably
    # archiving...?
    total_stats.downloaded_size = 0
    eq_(total_stats,
        ActivityStats(
            files=8, skipped=5, downloaded=1, renamed=1, urls=6,
            add_annex=2,  # add_git=1, # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely
            # into incoming-processed and merged into master -- new commits
            # will come
            # They shouldn't have any difference but still should be new
            # commits
            assert_in("There is already a tag 2.0.0 in the repository",
                      cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)

    commits_hexsha_removed = {b: list(_get_branch_commits(repo, b))
                              for b in branches}
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(len(dincoming), 2)
    eq_(set(dincoming.keys()),
        {repo.pathobj / '.datalad/crawl/statuses/incoming.json',
         repo.pathobj / 'ds666_R1.0.0.tar.gz'})
    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as good
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our
        'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        # we should be able to recrawl without doing anything
        output, stats = crawl()
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
def __call__(
        source,
        path=None,
        dataset=None,
        description=None,
        reckless=False,
        alt_sources=None):
    # TODO next ones should be there, but cannot go anywhere
    # git_opts=None,
    # git_clone_opts=None,
    # annex_opts=None,
    # annex_init_opts=None

    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    dataset = require_dataset(
        dataset, check_installed=True, purpose='cloning') \
        if dataset is not None else dataset
    refds_path = dataset.path if dataset else None

    if isinstance(source, Dataset):
        source = source.path

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        raise ValueError(
            "clone `source` and destination `path` are identical [{}]. "
            "If you are trying to add a subdataset simply use `add`".format(
                path))

    if path is not None:
        path = resolve_path(path, dataset)

    # Possibly do conversion from source into a git-friendly url
    # luckily GitRepo will undo any fancy file:/// url to make use of Git's
    # optimization for local clones....
    source_url = source
    source_ = _get_git_url_from_source(source)
    lgr.debug("Resolved clone source from '%s' to '%s'",
              source, source_)
    source = source_

    # derive target from source:
    if path is None:
        # we got nothing but a source. do something similar to git clone
        # and derive the path from the source and continue
        path = _get_installationpath_from_url(source)
        # since this is a relative `path`, resolve it:
        path = resolve_path(path, dataset)
        lgr.debug("Determined clone target path from source")
    lgr.debug("Resolved clone target path to: '%s'", path)

    # there is no other way -- my intoxicated brain tells me
    assert(path is not None)

    destination_dataset = Dataset(path)
    dest_path = path

    status_kwargs = dict(
        action='install',
        ds=destination_dataset,
        logger=lgr,
        refds=refds_path,
        source_url=source_url)

    # important test! based on this `rmtree` will happen below after failed
    # clone
    if exists(dest_path) and listdir(dest_path):
        if destination_dataset.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            guessed_sources = _get_flexible_source_candidates(
                source, dest_path)
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destination_dataset)
            if track_url in guessed_sources or \
                    get_local_file_url(track_url) in guessed_sources:
                yield get_status_dict(
                    status='notneeded',
                    message=("dataset %s was already cloned from '%s'",
                             destination_dataset,
                             source),
                    **status_kwargs)
                return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to '
                    'clone into target path',
            **status_kwargs)
        return

    if dataset is not None and \
            relpath(path, start=dataset.path).startswith(pardir):
        yield get_status_dict(
            status='error',
            message=("clone target path '%s' not in specified target dataset"
                     " '%s'", path, dataset),
            **status_kwargs)
        return

    # generate candidate URLs from source argument to overcome a few corner
    # cases and hopefully be more robust than git clone
    candidate_sources = []
    # combine all given sources (incl. alternatives), maintain order
    for s in [source] + assure_list(alt_sources):
        candidate_sources.extend(_get_flexible_source_candidates(s))
    lgr.info("Cloning %s to '%s'", source, dest_path)
    for isource_, source_ in enumerate(candidate_sources):
        try:
            lgr.debug("Attempting to clone %s (%d out of %d candidates) "
                      "to '%s'",
                      source_, isource_ + 1, len(candidate_sources),
                      dest_path)
            GitRepo.clone(path=dest_path, url=source_, create=True)
            break  # do not bother with other sources if succeeded
        except GitCommandError as e:
            lgr.debug("Failed to clone from URL: %s (%s)",
                      source_, exc_str(e))
            if exists(dest_path):
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                rmtree(dest_path)
            if 'could not create work tree' in e.stderr.lower():
                # this cannot be fixed by trying another URL
                yield get_status_dict(
                    status='error',
                    message=re.match(
                        r".*fatal: (.*)\n", e.stderr,
                        flags=re.MULTILINE | re.DOTALL).group(1),
                    **status_kwargs)
                return

    if not destination_dataset.is_installed():
        yield get_status_dict(
            status='error',
            message=("Failed to clone data from any candidate source URL: %s",
                     candidate_sources),
            **status_kwargs)
        return

    if dataset is not None:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.add(
                dest_path, save=True, ds2super=True,
                return_type='generator',
                result_filter=None,
                result_xfm=None,
                on_failure='ignore'):
            yield r

    _handle_possible_annex_dataset(
        destination_dataset,
        reckless,
        description=description)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **status_kwargs)