Exemple #1
0
def test_get_flexible_source_candidates():
    """Exercise `_get_flexible_source_candidates` with and without a base URL."""
    candidates = _get_flexible_source_candidates

    # plain http(s) sources (dummy transport) additionally get a '/.git'
    # variant appended
    for src in ('http://e.c', 'http://e.c/s/p'):
        eq_(candidates(src), [src, src + '/.git'])

    # sources that git understands just fine must come back as the sole
    # candidate, unmodified
    verbatim_sources = (
        'http://e.c/.git',
        '/',
        'relative/path',
        'smallrelative',
        './neighbor',
        '../../look/into/parent/bedroom',
        'p:somewhere',
        'user@host:/full/path',
    )
    for src in verbatim_sources:
        eq_(candidates(src), [src])

    # (source, base_url, expected candidates)
    based_cases = (
        # a few relative ones
        ('../r', '.', ['../r']),
        ('../r', 'ssh://host/path', ['ssh://host/r']),
        ('sub', 'ssh://host/path', ['ssh://host/path/sub']),
        ('../r', 'http://e.c/p', ['http://e.c/r', 'http://e.c/r/.git']),
        ('sub', 'http://e.c/p',
         ['http://e.c/p/sub', 'http://e.c/p/sub/.git']),
        # tricky ones: the base URL already carries a '/.git' suffix
        ('sub', 'http://e.c/p/.git', ['http://e.c/p/sub/.git']),
        ('../s1/s2', 'http://e.c/p/.git', ['http://e.c/s1/s2/.git']),
        # incorrect ones will stay incorrect
        ('../s1/s2', 'http://e.c/.git', ['http://e.c/../s1/s2/.git']),
        # when source is not relative, but base_url is specified as just the
        # destination path (not really a "base url" as the name suggests),
        # then base_url is ignored
        ('http://e.c/p', '/path', ['http://e.c/p', 'http://e.c/p/.git']),
    )
    for src, base_url, expected in based_cases:
        eq_(candidates(src, base_url), expected)
Exemple #2
0
def test_get_flexible_source_candidates():
    """Verify the candidate lists produced by `_get_flexible_source_candidates`."""
    get_candidates = _get_flexible_source_candidates

    # for http and https (dummy transport) a '/.git' source is added
    for url in ('http://e.c', 'http://e.c/s/p'):
        eq_(get_candidates(url), [url, url + '/.git'])

    # for these the only candidate is the original address, since git
    # understands them just fine
    untouched = (
        'http://e.c/.git',
        '/',
        'relative/path',
        'smallrelative',
        './neighbor',
        '../../look/into/parent/bedroom',
        'p:somewhere',
        'user@host:/full/path',
    )
    for spec in untouched:
        eq_(get_candidates(spec), [spec])

    # each entry: ((source, base_url), expected candidates)
    cases_with_base = [
        # relative sources resolved against a base
        (('../r', '.'), ['../r']),
        (('../r', 'ssh://host/path'), ['ssh://host/r']),
        (('sub', 'ssh://host/path'), ['ssh://host/path/sub']),
        (('../r', 'http://e.c/p'), ['http://e.c/r', 'http://e.c/r/.git']),
        (('sub', 'http://e.c/p'),
         ['http://e.c/p/sub', 'http://e.c/p/sub/.git']),
        # tricky: base URL ends in '/.git'
        (('sub', 'http://e.c/p/.git'), ['http://e.c/p/sub/.git']),
        (('../s1/s2', 'http://e.c/p/.git'), ['http://e.c/s1/s2/.git']),
        # incorrect input stays incorrect
        (('../s1/s2', 'http://e.c/.git'), ['http://e.c/../s1/s2/.git']),
        # a non-relative source combined with a base_url that is merely a
        # destination path: the base_url must be ignored
        (('http://e.c/p', '/path'), ['http://e.c/p', 'http://e.c/p/.git']),
    ]
    for call_args, expected in cases_with_base:
        eq_(get_candidates(*call_args), expected)
Exemple #3
0
def clone_dataset(
        srcs,
        destds,
        reckless=None,
        description=None,
        result_props=None,
        cfg=None):
    """Internal helper to perform cloning without sanity checks (assumed done)

    This helper does not handle any saving of subdataset modification or adding
    in a superdataset.

    Every source specification is decoded and expanded into a list of
    candidate URLs, which are tried in order until one clone succeeds.

    Parameters
    ----------
    srcs : list
      Any suitable clone source specifications (paths, URLs)
    destds : Dataset
      Dataset instance for the clone destination
    reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional
      Mode switch to put cloned dataset into unsafe/throw-away configurations, i.e.
      sacrifice data safety for performance or resource footprint. When None
      and `cfg` is specified, use the value of `datalad.clone.reckless`.
    description : str, optional
      Location description for the annex of the dataset clone (if there is any).
    result_props : dict, optional
      Default properties for any yielded result, passed on to get_status_dict().
      Note that a given (non-empty) dict is updated in place: on a successful
      clone attempt its 'source' key is set to the record of the candidate
      that was cloned from.
    cfg : ConfigManager, optional
      Configuration for parent dataset. This will be queried instead
      of the global DataLad configuration.

    Yields
    ------
    dict
      DataLad result records
    """
    if not result_props:
        # in case the caller had no specific idea on how results should look
        # like, provide sensible defaults
        result_props = dict(
            action='install',
            logger=lgr,
            ds=destds,
        )

    if reckless is None and cfg:
        # if reckless is not explicitly given, but we operate on a
        # superdataset, query whether it has been instructed to operate
        # in a reckless mode, and inherit it for the coming clone
        reckless = cfg.get('datalad.clone.reckless', None)

    dest_path = destds.pathobj

    # decode all source candidate specifications
    candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs]

    # now expand the candidate sources with additional variants of the decoded
    # giturl, while duplicating the other properties in the additional records
    # for simplicity. The hope is to overcome a few corner cases and be more
    # robust than git clone
    candidate_sources = [
        dict(props, giturl=s) for props in candidate_sources
        for s in _get_flexible_source_candidates(props['giturl'])
    ]

    # important test! based on this `rmtree` will happen below after failed clone
    dest_path_existed = dest_path.exists()
    if dest_path_existed and any(dest_path.iterdir()):
        if destds.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            # this is where it was actually installed from
            _, track_url = _get_tracking_source(destds)
            try:
                # this will get us track_url in system native path conventions,
                # whenever it is a path (and not a URL)
                # this is needed to match it to any potentially incoming local
                # source path in the 'notneeded' test below
                track_path = str(Path(track_url))
            except Exception:
                # this should never happen, because Path() will let any non-path stringification
                # pass through unmodified, but we do not want any potential crash due to
                # pathlib behavior changes
                lgr.debug("Unexpected behavior of pathlib!")
                track_path = None
            for cand in candidate_sources:
                src = cand['giturl']
                if track_url == src \
                        or (not is_url(track_url)
                            and get_local_file_url(track_url, compatibility='git') == src) \
                        or track_path == expanduser(src):
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destds,
                                 src),
                        **result_props)
                    return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to clone into target path',
            **result_props)
        return

    log_progress(
        lgr.info,
        'cloneds',
        'Cloning dataset to %s', destds,
        total=len(candidate_sources),
        label='Clone attempt',
        unit=' Candidate locations',
    )
    error_msgs = OrderedDict()  # accumulate all error messages formatted per each url
    # stays None when there is not a single candidate to try, so that the
    # error reporting below does not trip over an unbound name
    cand = None
    for cand in candidate_sources:
        log_progress(
            lgr.info,
            'cloneds',
            'Attempting to clone from %s to %s', cand['giturl'], dest_path,
            update=1,
            increment=True)

        clone_opts = {}

        if cand.get('version', None):
            clone_opts['branch'] = cand['version']
        try:
            # TODO for now GitRepo.clone() cannot handle Path instances, and PY35
            # doesn't make it happen seemlessly
            GitRepo.clone(
                path=str(dest_path),
                url=cand['giturl'],
                clone_options=clone_opts,
                create=True)

        except CommandError as e:
            e_stderr = e.stderr

            error_msgs[cand['giturl']] = e
            lgr.debug("Failed to clone from URL: %s (%s)",
                      cand['giturl'], exc_str(e))
            if dest_path.exists():
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                # TODO stringification can be removed once patlib compatible
                # or if PY35 is no longer supported
                rmtree(str(dest_path), children_only=dest_path_existed)

            if e_stderr and 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                # cancel progress bar
                log_progress(
                    lgr.info,
                    'cloneds',
                    'Completed clone attempts for %s', destds
                )
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1).strip()
                    if re_match else "stderr: " + e_stderr,
                    **result_props)
                return
            # next candidate
            continue

        result_props['source'] = cand
        # do not bother with other sources if succeeded
        break

    log_progress(
        lgr.info,
        'cloneds',
        'Completed clone attempts for %s', destds
    )

    if not destds.is_installed():
        # `error_args` is always a tuple with exactly one item per `%s`
        # placeholder in `error_msg`, so that the message below is a flat
        # (template, arg, ...) tuple like everywhere else in this function
        if error_msgs:
            if all(not e.stdout and not e.stderr for e in error_msgs.values()):
                # there is nothing we can learn from the actual exception,
                # the exit code is uninformative, the command is predictable
                error_msg = "Failed to clone from all attempted sources: %s"
                error_args = (list(error_msgs.keys()),)
            else:
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were:\n- %s"
                error_args = ('\n- '.join(
                    '{}\n  {}'.format(url, exc_str(exc))
                    for url, exc in error_msgs.items()
                ),)
        elif cand is None:
            # nothing was attempted at all (e.g. empty `srcs`): there is
            # neither an error nor a 'successful' source to report on
            error_msg = "Cannot clone dataset to %s: " \
                        "no source candidates were provided"
            error_args = (destds.path,)
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            #      be a case when this might happen and original error would
            #      not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'succesful' source was: %s"
            error_args = (destds.path, cand['giturl'])
        yield get_status_dict(
            status='error',
            message=(error_msg,) + error_args,
            **result_props)
        return

    if not cand.get("version"):
        postclone_check_head(destds)

    # act on --reckless=shared-...
    # must happen prior git-annex-init, where we can cheaply alter the repo
    # setup through safe re-init'ing
    if reckless and reckless.startswith('shared-'):
        lgr.debug('Reinit %s to enable shared access permissions', destds)
        # reckless[7:] is whatever follows the 'shared-' prefix
        destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])])

    yield from postclonecfg_annexdataset(
        destds,
        reckless,
        description)

    # perform any post-processing that needs to know details of the clone
    # source
    if result_props['source']['type'] == 'ria':
        yield from postclonecfg_ria(destds, result_props['source'])

    if reckless:
        # store the reckless setting in the dataset to make it
        # known to later clones of subdatasets via get()
        destds.config.set(
            'datalad.clone.reckless', reckless,
            where='local',
            reload=True)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **result_props)
Exemple #4
0
def _get_flexible_source_candidates_for_submodule(ds, sm):
    """Assemble candidate locations from where to clone a submodule

    Each candidate carries a cost; the returned list is sorted by ascending
    cost, i.e. candidates with lower cost values come first. The following
    candidates are generated:

    - For any considered superdataset remote (the tracking remote of the
      current branch, plus any remote reported to have the commit that last
      modified the submodule) with a URL: that URL with the submodule path
      appended to it. There can be more than one candidate (cost 500).

    - For each of these remotes: the URL recorded in `.gitmodules`, expanded
      with the remote URL passed as the base. There can be more than one
      candidate (cost 600).

    - The URL recorded in `.gitmodules`, expanded with the path of the
      superdataset passed as the base, except when the result is the
      submodule location itself (cost 900).

    Additional candidate URLs are generated from templates specified as
    configuration variables with the pattern

      `datalad.get.subdataset-source-candidate-<name>`

    where `name` is an arbitrary identifier. If `name` starts with three digits
    (e.g. '400myserver') these are interpreted as the cost of the candidate
    and are stripped from the reported name. Without such a prefix a default
    cost of 700 is used.

    A template string is rendered with `str.format()` and may reference any
    `gitmodule_*` property of the submodule record (with the `gitmodule_`
    prefix removed). For convenience, `datalad-id` is made available under the
    shortened name `id`. Additionally, the URL of every remote considered
    above is available as `remoteurl-<name>`, where `name` is the remote name.

    Lastly, candidates with a duplicate URL are stripped; of each set of
    duplicates the first one in the cost-sorted list is kept.

    Parameters
    ----------
    ds : Dataset
      Parent dataset of to-be-installed subdataset.
    sm : dict
      Submodule record as produced by `subdatasets()`.

    Returns
    -------
    list of dict
      Where each dict has keys 'cost' (int), 'name' (str), 'url' (str);
      candidates generated from configuration additionally have
      'from_config' set to True.
      Names are not unique and either derived from the name of the respective
      remote, template configuration variable, or 'local'.
    """
    repo = ds.repo
    gitmodule_url = sm.get('gitmodule_url', None)
    rel_sm_path = op.relpath(sm['path'], start=sm['parentds'])

    candidates = []

    # start out with the tracking remote of the current branch (if any) ...
    tracking_remote, _ = repo.get_tracking_branch()
    remotes = [tracking_remote] if tracking_remote else []

    # ... followed by any remote that has the commit in which the submodule
    # was last modified
    # (ideally preference should also be given to remotes that have the same
    # branch checked out)
    last_commit = repo.get_last_commit_hexsha(rel_sm_path)
    if last_commit:
        remotes.extend(_get_remotes_having_commit(repo, last_commit))

    # properties that URL templates can refer to: all 'gitmodule_*' items of
    # the submodule record, with that 10-character prefix removed
    template_props = {}
    for key, value in sm.items():
        if key.startswith('gitmodule_'):
            template_props[key[10:].replace('datalad-id', 'id')] = value

    for remote in unique(remotes):
        remote_url = repo.get_remote_url(remote, push=False)
        if not remote_url:
            continue

        # make remotes and their URLs available to template rendering
        template_props['remoteurl-{}'.format(remote)] = remote_url

        # the submodule path portion might need quoting (e.g. for spaces)
        # when it gets appended to a proper URL
        if isinstance(RI(remote_url), URL):
            path_in_remote = urlquote(rel_sm_path)
        else:
            path_in_remote = rel_sm_path

        # attempt: submodule checkout at parent remote URL
        # (alternate suffixes are tested by `clone` anyways)
        for url in _get_flexible_source_candidates(
                path_in_remote, remote_url, alternate_suffix=False):
            candidates.append({'cost': 500, 'name': remote, 'url': url})

        # attempt: provided (configured?) submodule URL
        # TODO: consider supporting DataLadRI here?  or would confuse
        #  git and we wouldn't want that (i.e. not allow pure git clone
        #  --recursive)
        if gitmodule_url:
            for url in _get_flexible_source_candidates(
                    gitmodule_url, remote_url, alternate_suffix=False):
                candidates.append({'cost': 600, 'name': remote, 'url': url})

    # candidates from configured URL templates
    has_cost_prefix = re.compile('[0-9][0-9][0-9].*')
    config_prefix = 'datalad.get.subdataset-source-candidate-'
    templates = [
        (var[len(config_prefix):], repo.config[var])
        for var in repo.config.keys()
        if var.startswith(config_prefix)
    ]
    for label, template in templates:
        # no "flexible_source_candidates" here: this is configuration that
        # can be made arbitrarily precise from the outside, additional
        # guesswork can only make it slower
        url = template.format(**template_props)
        if has_cost_prefix.match(label) is None:
            # no cost given in the configuration, go with the default
            cost, name = 700, label
        else:
            cost, name = int(label[:3]), label[3:]
        candidates.append(
            {'cost': cost, 'name': name, 'url': url, 'from_config': True})

    # the actual configured gitmodule URL, relative to the parent dataset
    if gitmodule_url:
        for url in _get_flexible_source_candidates(
                gitmodule_url, ds.path, alternate_suffix=False):
            # avoid inclusion of submodule location itself
            if url != sm['path']:
                candidates.append({'cost': 900, 'name': 'local', 'url': url})

    # order by cost, thereby allowing a candidate provided by configuration
    # to purposefully sort before or after automatically generated ones
    candidates.sort(key=lambda cand: cand['cost'])
    # take out any duplicate source candidates;
    # unique() takes out the duplicates at the tail end
    return unique(candidates, lambda cand: cand['url'])
Exemple #5
0
def _get_flexible_source_candidates_for_submodule(ds, sm):
    """Assemble candidates from where to install a submodule

    Even if a URL for submodule is provided explicitly -- first tries urls under
    parent's module tracking branch remote.

    Additional candidate URLs can be generated based on templates specified as
    configuration variables with the pattern

      `datalad.get.subdataset-source-candidate-<name>`

    where `name` is an arbitrary identifier. Templates are rendered once,
    after all candidate remotes were inspected. Hence they are also honored
    when the parent dataset has no usable remote, and they can refer to the
    URL of any candidate remote.

    A template string assigned to such a variable can utilize the Python format
    mini language and may reference a number of properties that are inferred
    from the parent dataset's knowledge about the target subdataset. Properties
    include any submodule property specified in the respective .gitmodules
    record. For convenience, an existing `datalad-id` record is made available
    under the shortened name `id`.

    Additionally, the URL of any candidate remote (the tracking remote of the
    current branch and any remote that contains the respective submodule
    commit) is available as `remoteurl-<name>` properties, where `name`
    is the configured remote name.

    Candidates with a duplicate URL are stripped, only the first occurrence
    is kept.

    Parameters
    ----------
    ds : Dataset
      Parent dataset of to-be-installed subdataset.
    sm : dict
      Submodule record as produced by `subdatasets()`.

    Returns
    -------
    list of tuples
      Where each tuples consists of a name and a URL. Names are not unique
      and either derived from the name of the respective remote, from the
      template configuration variable (the variable name with the leading
      'datalad.get.' removed, i.e. 'subdataset-source-candidate-<name>'),
      or 'local' for the candidate URL that was obtained from the .gitmodule
      record relative to the parent dataset path.
    """
    # short cuts
    ds_repo = ds.repo
    sm_url = sm.get('gitmodule_url', None)
    sm_path = op.relpath(sm['path'], start=sm['parentds'])

    clone_urls = []

    # CANDIDATE: tracking remote of the current branch
    tracking_remote, _ = ds_repo.get_tracking_branch()
    candidate_remotes = [tracking_remote] if tracking_remote else []

    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    last_commit = ds_repo.get_last_commit_hexsha(sm_path)
    if last_commit:
        # CANDIDATE: any remote that has the commit when the submodule was
        # last modified

        # ideally should also give preference to the remotes which have
        # the same branch checked out I guess
        candidate_remotes += list(
            _get_remotes_having_commit(ds_repo, last_commit))

    # prepare a dict to generate URL candidates from templates
    # (k[10:] strips the 'gitmodule_' prefix)
    sm_candidate_props = {
        k[10:].replace('datalad-id', 'id'): v
        for k, v in sm.items() if k.startswith('gitmodule_')
    }

    for remote in unique(candidate_remotes):
        remote_url = ds_repo.get_remote_url(remote, push=False)

        # Directly on parent's ds url
        if not remote_url:
            continue

        # make remotes and their URLs available to template rendering
        sm_candidate_props['remoteurl-{}'.format(remote)] = remote_url
        # attempt: submodule checkout at parent remote URL
        # We might need to quote sm_path portion, e.g. for spaces etc
        if isinstance(RI(remote_url), URL):
            sm_path_url = urlquote(sm_path)
        else:
            sm_path_url = sm_path

        clone_urls.extend(
            (remote, url) for url in _get_flexible_source_candidates(
                # alternate suffixes are tested by `clone` anyways
                sm_path_url,
                remote_url,
                alternate_suffix=False))

        # attempt: provided (configured?) submodule URL
        # TODO: consider supporting DataLadRI here?  or would confuse
        #  git and we wouldn't want that (i.e. not allow pure git clone
        #  --recursive)
        if sm_url:
            clone_urls.extend(
                (remote, url) for url in _get_flexible_source_candidates(
                    sm_url, remote_url, alternate_suffix=False))

    # CANDIDATE: URLs rendered from configured templates.
    # This must happen outside of the loop over the remotes: otherwise the
    # templates would be ignored whenever there is no remote (with a URL),
    # they would be rendered redundantly for each remote, and a template
    # referring to the URL of a remote that has not been visited yet would
    # fail to render.
    for name, tmpl in [
        # c[12:] strips the leading 'datalad.get.' only, the remainder
        # ('subdataset-source-candidate-<name>') labels the candidate
        (c[12:], ds_repo.config[c]) for c in ds_repo.config.keys()
        if c.startswith('datalad.get.subdataset-source-candidate-')
    ]:
        url = tmpl.format(**sm_candidate_props)
        # we don't want "flexible_source_candidates" here, this is
        # configuration that can be made arbitrarily precise from the
        # outside. Additional guesswork can only make it slower
        clone_urls.append((name, url))

    # CANDIDATE: the actual configured gitmodule URL
    if sm_url:
        clone_urls.extend(('local', url)
                          for url in _get_flexible_source_candidates(
                              sm_url, ds.path, alternate_suffix=False)
                          # avoid inclusion of submodule location itself
                          if url != sm['path'])

    # strip candidates with duplicate URLs
    return unique(clone_urls, lambda x: x[1])