def clone_dataset(
        srcs,
        destds,
        reckless=None,
        description=None,
        result_props=None,
        cfg=None):
    """Internal helper to perform cloning without sanity checks (assumed done)

    This helper does not handle any saving of subdataset modification or adding
    in a superdataset.

    Each source specification is decoded and expanded into one or more
    candidate Git URLs. The candidates are tried in order until one clone
    succeeds. After a successful clone, post-clone configuration is applied
    and a final 'ok' result is yielded.

    Parameters
    ----------
    srcs : list
      Any suitable clone source specifications (paths, URLs)
    destds : Dataset
      Dataset instance for the clone destination
    reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional
      Mode switch to put cloned dataset into unsafe/throw-away configurations,
      i.e. sacrifice data safety for performance or resource footprint. When
      None and `cfg` is specified, use the value of `datalad.clone.reckless`.
    description : str, optional
      Location description for the annex of the dataset clone (if there is
      any).
    result_props : dict, optional
      Default properties for any yielded result, passed on to
      get_status_dict(). If a non-empty dict is given, it is updated in place
      with a 'source' key (the successful candidate record) once a clone
      succeeded.
    cfg : ConfigManager, optional
      Configuration for parent dataset. This will be queried instead
      of the global DataLad configuration.

    Yields
    ------
    dict
      DataLad result records. A 'notneeded' record is yielded when the
      destination was already cloned from one of the candidates. An 'error'
      record is yielded when the destination is a non-empty path, when there
      is no candidate to clone from, or when no candidate could be cloned.
      An 'ok' record is yielded after a successful clone.
    """
    if not result_props:
        # in case the caller had no specific idea on how results should look
        # like, provide sensible defaults
        result_props = dict(
            action='install',
            logger=lgr,
            ds=destds,
        )

    if reckless is None and cfg:
        # if reckless is not explicitly given, but we operate on a
        # superdataset, query whether it has been instructed to operate
        # in a reckless mode, and inherit it for the coming clone
        reckless = cfg.get('datalad.clone.reckless', None)

    dest_path = destds.pathobj

    # decode all source candidate specifications
    candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs]

    # now expand the candidate sources with additional variants of the decoded
    # giturl, while duplicating the other properties in the additional records
    # for simplicity. The hope is to overcome a few corner cases and be more
    # robust than git clone
    candidate_sources = [
        dict(props, giturl=s)
        for props in candidate_sources
        for s in _get_flexible_source_candidates(props['giturl'])
    ]

    # important test! based on this `rmtree` will happen below after failed
    # clone
    dest_path_existed = dest_path.exists()
    if dest_path_existed and any(dest_path.iterdir()):
        if destds.is_installed():
            # check if dest was cloned from the given source before:
            # compare where we would install from with where it was
            # actually installed from
            _, track_url = _get_tracking_source(destds)
            try:
                # this will get us track_url in system native path
                # conventions, whenever it is a path (and not a URL)
                # this is needed to match it to any potentially incoming
                # local source path in the 'notneeded' test below
                track_path = str(Path(track_url))
            except Exception:
                # this should never happen, because Path() will let any
                # non-path stringification pass through unmodified, but we
                # do not want any potential crash due to pathlib behavior
                # changes
                lgr.debug("Unexpected behavior of pathlib!")
                track_path = None
            for cand in candidate_sources:
                src = cand['giturl']
                if track_url == src \
                        or (not is_url(track_url)
                            and get_local_file_url(
                                track_url, compatibility='git') == src) \
                        or track_path == expanduser(src):
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destds,
                                 src),
                        **result_props)
                    return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to clone into target path',
            **result_props)
        return

    if not candidate_sources:
        # Without any candidate the loop below never runs and never binds
        # `cand`, which the code after the loop relies on. Report this as a
        # regular error result instead of crashing on an unbound variable.
        yield get_status_dict(
            status='error',
            message='no candidate sources to clone from',
            **result_props)
        return

    log_progress(
        lgr.info,
        'cloneds',
        'Cloning dataset to %s', destds,
        total=len(candidate_sources),
        label='Clone attempt',
        unit=' Candidate locations',
    )
    # accumulate the exception of each failed attempt, keyed by URL
    error_msgs = OrderedDict()
    for cand in candidate_sources:
        log_progress(
            lgr.info,
            'cloneds',
            'Attempting to clone from %s to %s', cand['giturl'], dest_path,
            update=1,
            increment=True)

        clone_opts = {}

        if cand.get('version', None):
            clone_opts['branch'] = cand['version']
        try:
            # TODO for now GitRepo.clone() cannot handle Path instances, and
            # PY35 doesn't make it happen seamlessly
            GitRepo.clone(
                path=str(dest_path),
                url=cand['giturl'],
                clone_options=clone_opts,
                create=True)

        except CommandError as e:
            e_stderr = e.stderr

            error_msgs[cand['giturl']] = e
            lgr.debug("Failed to clone from URL: %s (%s)",
                      cand['giturl'], exc_str(e))
            if dest_path.exists():
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                # TODO stringification can be removed once pathlib compatible
                # or if PY35 is no longer supported
                rmtree(str(dest_path), children_only=dest_path_existed)

            if e_stderr and 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                # cancel progress bar
                log_progress(
                    lgr.info,
                    'cloneds',
                    'Completed clone attempts for %s', destds
                )
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1).strip()
                    if re_match else "stderr: " + e_stderr,
                    **result_props)
                return
            # next candidate
            continue

        result_props['source'] = cand
        # do not bother with other sources if succeeded
        break

    log_progress(
        lgr.info,
        'cloneds',
        'Completed clone attempts for %s', destds
    )

    if not destds.is_installed():
        # The message is a tuple of a format string followed by its
        # arguments, one tuple item per placeholder (same layout as the
        # 'notneeded' message above).
        if error_msgs:
            if all(not e.stdout and not e.stderr
                   for e in error_msgs.values()):
                # there is nothing we can learn from the actual exception,
                # the exit code is uninformative, the command is predictable
                message = (
                    "Failed to clone from all attempted sources: %s",
                    list(error_msgs),
                )
            else:
                message = (
                    "Failed to clone from any candidate source URL. "
                    "Encountered errors per each url were:\n- %s",
                    '\n- '.join(
                        '{}\n {}'.format(url, exc_str(exc))
                        for url, exc in error_msgs.items()
                    ),
                )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            #      be a case when this might happen and original error would
            #      not be sufficient to troubleshoot what is going on.
            message = (
                "Awkward error -- we failed to clone properly. "
                "Although no errors were encountered, target "
                "dataset at %s seems to be not fully installed. "
                "The 'succesful' source was: %s",
                # two placeholders -> two separate arguments
                destds.path,
                cand['giturl'],
            )
        yield get_status_dict(
            status='error',
            message=message,
            **result_props)
        return

    if not cand.get("version"):
        postclone_check_head(destds)

    # act on --reckless=shared-...
    # must happen prior git-annex-init, where we can cheaply alter the repo
    # setup through safe re-init'ing
    if reckless and reckless.startswith('shared-'):
        lgr.debug('Reinit %s to enable shared access permissions', destds)
        # everything after the 'shared-' prefix is handed to git verbatim
        destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])])

    yield from postclonecfg_annexdataset(
        destds,
        reckless,
        description)

    # perform any post-processing that needs to know details of the clone
    # source
    if result_props['source']['type'] == 'ria':
        yield from postclonecfg_ria(destds, result_props['source'])

    if reckless:
        # store the reckless setting in the dataset to make it
        # known to later clones of subdatasets via get()
        destds.config.set(
            'datalad.clone.reckless', reckless,
            where='local',
            reload=True)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **result_props)
def test_is_url():
    """Check which inputs `is_url()` accepts and which it rejects."""
    # explicit schemes are recognized
    for with_scheme in (
            'file://localhost/some',
            'http://localhost',
            'ssh://me@localhost'):
        ok_(is_url(with_scheme))

    # an unknown scheme still counts as a URL; logs are swallowed for this
    # one check (presumably because it emits a log message -- TODO confirm)
    with swallow_logs():
        ok_(is_url('weired://'))

    # plain relative and absolute paths are not URLs
    for plain_path in ('relative', '/absolute'):
        nok_(is_url(plain_path))

    # an implicit ssh-style "user@host" spec is currently accepted
    ok_(is_url('like@sshlogin'))

    # empty or blank strings and non-string values are rejected
    for not_a_url in ('', ' ', 123):
        nok_(is_url(not_a_url))

    # RI instances can be passed directly
    ok_(is_url(RI('file://localhost/some')))
    nok_(is_url(RI('relative')))