Example #1
def test_verify_ria_url():
    # unsupported protocol
    assert_raises(ValueError, verify_ria_url, 'ria+ftp://localhost/tmp/this',
                  {})
    # bunch of cases that should work
    cases = {
        'ria+file:///tmp/this': (None, '/tmp/this'),
        # no normalization
        'ria+file:///tmp/this/': (None, '/tmp/this/'),
        # with hosts
        'ria+ssh://localhost/tmp/this': ('ssh://localhost', '/tmp/this'),
        'ria+http://localhost/tmp/this': ('http://localhost', '/tmp/this'),
        'ria+https://localhost/tmp/this': ('https://localhost', '/tmp/this'),
        # with username
        'ria+ssh://humbug@localhost/tmp/this':
        ('ssh://humbug@localhost', '/tmp/this'),
        # with port
        'ria+ssh://humbug@localhost:2222/tmp/this':
        ('ssh://humbug@localhost:2222', '/tmp/this'),
        'ria+ssh://localhost:2200/tmp/this':
        ('ssh://localhost:2200', '/tmp/this'),
        # with password (masked in this example)
        'ria+https://*****:*****@localhost:8080/tmp/this':
        ('https://*****:*****@localhost:8080', '/tmp/this'),
        # document a strange (MIH thinks undesirable), but pre-existing
        # behavior: an 'ssh example.com' would end up in the user HOME,
        # not in '/'
        'ria+ssh://example.com': ('ssh://example.com', '/')
    }
    for i, o in cases.items():
        # we are not testing the URL rewriting here
        assert_equal(o, verify_ria_url(i, {})[:2])
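The cases above double as a specification of the return contract. A minimal usage sketch (the import path is an assumption and has moved between datalad versions; adjust to yours):

# Minimal usage sketch; the import path is an assumption -- verify_ria_url
# has lived in different modules across datalad versions.
from datalad.distributed.ora_remote import verify_ria_url

host, path = verify_ria_url('ria+ssh://humbug@localhost:2222/tmp/this', {})[:2]
assert host == 'ssh://humbug@localhost:2222'
assert path == '/tmp/this'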
Example #2
    def _verify_config(self, gitdir, fail_noid=True):
        # try loading all needed info from (git) config
        name = self.annex.getconfig('name')
        if not name:
            raise RIARemoteError(
                "Cannot determine special remote name, got: {}".format(
                    repr(name)))
        # get store url:
        self.ria_store_url = self.annex.getconfig('url')
        if self.ria_store_url:
            # support URL rewrite without talking to a DataLad ConfigManager,
            # to avoid the extra import cost -- this runs as a special
            # remote, not a "real" datalad process
            url_cfgs = dict()
            url_cfgs_raw = _get_gitcfg(gitdir, "^url.*", regex=True)
            if url_cfgs_raw:
                for line in url_cfgs_raw.splitlines():
                    k, v = line.split()
                    url_cfgs[k] = v
            self.storage_host, self.store_base_path, self.ria_store_url = \
                verify_ria_url(self.ria_store_url, url_cfgs)

        # TODO duplicates call to `git-config` after RIA url rewrite
        self._load_cfg(gitdir, name)

        # for now still accept the configs, if no ria-URL is known:
        if not self.ria_store_url:
            if not self.store_base_path:
                self.store_base_path = self.annex.getconfig('base-path')
            if not self.store_base_path:
                raise RIARemoteError(
                    "No remote base path configured. "
                    "Specify `base-path` setting.")

        self.store_base_path = Path(self.store_base_path)
        if not self.store_base_path.is_absolute():
            raise RIARemoteError(
                'Non-absolute object tree base path configuration: %s'
                '' % str(self.store_base_path))

        # for now still accept the configs, if no ria-URL is known:
        if not self.ria_store_url:
            # Note: Special value '0' is replaced by None only after checking
            # the repository's annex config. This is to uniformly handle '0'
            # and None later on, but let a user's config '0' overrule what's
            # stored by git-annex.
            if not self.storage_host:
                self.storage_host = self.annex.getconfig('ssh-host')
            elif self.storage_host == '0':
                self.storage_host = None

        # go look for an ID
        self.archive_id = self.annex.getconfig('archive-id')
        if fail_noid and not self.archive_id:
            raise RIARemoteError(
                "No archive ID configured. This should not happen.")

        # TODO: This should prob. not be done! Would only have an effect if
        #       force-write was committed to annex-special-remote-config and
        #       this is likely a bad idea.
        if not self.force_write:
            self.force_write = self.annex.getconfig('force-write')
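The `url_cfgs` loop parses the raw output of `git config --get-regexp '^url.*'` into a plain dict so `verify_ria_url` can apply URL rewrites. A standalone sketch of just that step (the sample config lines are hypothetical):

# Standalone sketch of the url_cfgs parsing above. The sample input mimics
# hypothetical `git config --get-regexp '^url.*'` output: one "<key> <value>"
# pair per line.
url_cfgs_raw = (
    "url.ria+ssh://compute.example.org/store.insteadof ria+ssh://store\n"
    "url.https://mirror.example.org/.insteadof https://origin.example.org/"
)
url_cfgs = dict()
for line in url_cfgs_raw.splitlines():
    # note: values containing whitespace would break this simple split
    k, v = line.split()
    url_cfgs[k] = v
assert url_cfgs["url.ria+ssh://compute.example.org/store.insteadof"] == \
    "ria+ssh://store"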
Example #3
    def _verify_config(self, gitdir, fail_noid=True):
        # try loading all needed info from (git) config
        name = self.annex.getconfig('name')
        if not name:
            name = self.annex.getconfig('sameas-name')
        if not name:
            raise RIARemoteError(
                "Cannot determine special remote name, got: {}".format(
                    repr(name)))
        # get store url(s):
        self.ria_store_url = self.annex.getconfig('url')
        self.ria_store_pushurl = self.annex.getconfig('push-url')
        # Support URL rewrite without talking to a DataLad ConfigManager,
        # because of the additional import cost otherwise. Remember that this
        # is a special remote, not a "real" datalad process.
        url_cfgs = dict()
        url_cfgs_raw = _get_gitcfg(gitdir, "^url.*", regex=True)
        if url_cfgs_raw:
            for line in url_cfgs_raw.splitlines():
                k, v = line.split()
                url_cfgs[k] = v

        if self.ria_store_url:
            self.storage_host, self.store_base_path, self.ria_store_url = \
                verify_ria_url(self.ria_store_url, url_cfgs)

        else:
            # for now still accept the configs, if no ria-URL is known, but
            # issue deprecation warning:
            host = _get_gitcfg(gitdir,
                               'annex.ora-remote.{}.ssh-host'.format(name)) or \
                   self.annex.getconfig('ssh-host')
            # Note: Special value '0' is replaced by None only after checking
            # the repository's annex config. This is to uniformly handle '0' and
            # None later on, but let a user's config '0' overrule what's
            # stored by git-annex.
            self.storage_host = None if host == '0' else host

            path = _get_gitcfg(gitdir,
                               'annex.ora-remote.{}.base-path'.format(name)) or \
                   self.annex.getconfig('base-path')
            self.store_base_path = path.strip() if path else path

            if path or host:
                self.message(
                    "WARNING: base-path + ssh-host configs are "
                    "deprecated and won't be considered in the future."
                    " Use 'git annex enableremote {} "
                    "url=<RIA-URL-TO-STORE>' to store a ria+<scheme>:"
                    "//... URL in the special remote's config."
                    "".format(name))

        if not self.store_base_path:
            raise RIARemoteError(
                "No base path configured for RIA store. Specify a proper "
                "ria+<scheme>://... URL.")

        # the base path is ultimately derived from a URL, always treat as POSIX
        self.store_base_path = PurePosixPath(self.store_base_path)
        if not self.store_base_path.is_absolute():
            raise RIARemoteError(
                'Non-absolute object tree base path configuration: %s'
                '' % str(self.store_base_path))

        if self.ria_store_pushurl:
            if self.ria_store_pushurl.startswith("ria+http"):
                raise RIARemoteError("Invalid push-url: {}. Pushing over HTTP "
                                     "not implemented."
                                     "".format(self.ria_store_pushurl))
            self.storage_host_push, self.store_base_path_push, \
                self.ria_store_pushurl = verify_ria_url(self.ria_store_pushurl,
                                                        url_cfgs)

        # TODO duplicates call to `git-config` after RIA url rewrite
        self._load_cfg(gitdir, name)

        # go look for an ID
        self.archive_id = self.annex.getconfig('archive-id')
        if fail_noid and not self.archive_id:
            raise RIARemoteError(
                "No archive ID configured. This should not happen.")

        # TODO: This should prob. not be done! Would only have an effect if
        #       force-write was committed to annex-special-remote-config and
        #       this is likely a bad idea.
        if not self.force_write:
            self.force_write = self.annex.getconfig('force-write')
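Note the switch from `Path` (Example #2) to `PurePosixPath` here: the base path comes out of a URL, so it must be interpreted as POSIX even on Windows, where the same string would not count as absolute for a native path class. A small illustration:

# Why PurePosixPath: a URL-derived path must be treated as POSIX regardless
# of the local platform; pure path classes make that explicit and portable.
from pathlib import PurePosixPath, PureWindowsPath

p = '/tmp/this'
assert PurePosixPath(p).is_absolute()        # True on any platform
assert not PureWindowsPath(p).is_absolute()  # no drive letter -> not absolute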
Example #4
def _create_sibling_ria(
        ds,
        url,
        name,
        storage_sibling,
        storage_name,
        existing,
        shared,
        group,
        post_update_hook,
        trust_level,
        res_kwargs):
    # be safe across datasets
    res_kwargs = res_kwargs.copy()
    # update dataset
    res_kwargs['ds'] = ds

    if not isinstance(ds.repo, AnnexRepo):
        # No point in dealing with a special remote when there's no annex.
        # Note that in recursive invocations this might only apply to some of
        # the datasets. Therefore deal with it here rather than one level up.
        lgr.debug("No annex at %s. Ignoring special remote options.", ds.path)
        storage_sibling = False
        storage_name = None

    # parse target URL
    try:
        ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(
            status='error',
            message=str(e),
            **res_kwargs
        )
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config
    )['giturl']
    # determine layout locations; go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure whether we are supposed to skip this very dataset
    if existing == 'skip' and (
            name in ds_siblings or (
                storage_name and storage_name in ds_siblings)):
        yield get_status_dict(
            status='notneeded',
            message="Skipped on existing sibling",
            **res_kwargs
        )
        # if we skip here, nothing else can change that decision further
        # down
        return

    # figure whether we need to skip or error due to an existing target repo
    # before we try to init a special remote.
    if ssh_host:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(
            ssh_host,
            use_remote_annex_bundle=False)
        ssh.open()

    if existing in ['skip', 'error']:
        config_path = repo_path / 'config'
        # No .git -- if it's an existing repo in a RIA store it should be a
        # bare repo.
        # Theoretically we could have additional checks for whether we have
        # an empty repo dir or a non-bare repo or whatever else.
        if ssh_host:
            try:
                ssh('[ -e {p} ]'.format(p=quote_cmdlinearg(str(config_path))))
                exists = True
            except CommandError:
                exists = False
        else:
            exists = config_path.exists()

        if exists:
            if existing == 'skip':
                # 1. not rendered by default
                # 2. message doesn't show up in ultimate result
                #    record as shown by -f json_pp
                yield get_status_dict(
                    status='notneeded',
                    message="Skipped on existing remote "
                            "directory {}".format(repo_path),
                    **res_kwargs
                )
                return
            else:  # existing == 'error'
                yield get_status_dict(
                    status='error',
                    message="remote directory {} already "
                            "exists.".format(repo_path),
                    **res_kwargs
                )
                return

    if storage_sibling == 'only':
        lgr.info("create storage sibling '{}' ...".format(name))
    else:
        lgr.info("create sibling{} '{}'{} ...".format(
            's' if storage_name else '',
            name,
            " and '{}'".format(storage_name) if storage_name else '',
        ))
    create_ds_in_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                       base_path, ds.id, '2', '1')
    if storage_sibling:
        # we are using the main `name` if the only thing we are creating
        # is the storage sibling
        srname = name if storage_sibling == 'only' else storage_name

        lgr.debug('init special remote {}'.format(srname))
        special_remote_options = [
            'type=external',
            'externaltype=ora',
            'encryption=none',
            'autoenable=true',
            'url={}'.format(url)]
        try:
            ds.repo.init_remote(
                srname,
                options=special_remote_options)
        except CommandError as e:
            if existing == 'reconfigure' \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.",
                    srname)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                ds.repo.call_annex([
                    'enableremote',
                    srname] + special_remote_options)
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s"
                    % (e.stdout, e.stderr),
                    **res_kwargs
                )
                return

        if trust_level:
            ds.repo.call_annex([trust_level, srname])
        # get uuid for use in bare repo's config
        uuid = ds.config.get("remote.{}.annex-uuid".format(srname))

    if storage_sibling == 'only':
        # we can stop here, the rest of the function is about setting up
        # the git remote part of the sibling
        yield get_status_dict(
            status='ok',
            **res_kwargs,
        )
        return

    # 2. create a bare repository in-store:

    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'

    if group:
        chgrp_cmd = "chgrp -R {} {}".format(
            quote_cmdlinearg(str(group)),
            quote_cmdlinearg(str(repo_path)))

    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(
                quote_cmdlinearg(shared)) if shared else ''
        ))

        if storage_sibling:
            # write the special remote's uuid into the git config, so clone
            # can tell which one it is supposed to be and enable it even with
            # a fallback URL
            ssh("cd {rootdir} && git config datalad.ora-remote.uuid {uuid}"
                "".format(rootdir=quote_cmdlinearg(str(repo_path)),
                          uuid=uuid))

        if post_update_hook:
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))

        if group:
            # Either the repository existed before or a new directory was
            # created for it; set its group to the desired one, if provided,
            # with the same chgrp command
            ssh(chgrp_cmd)
    else:
        gr = GitRepo(repo_path, create=True, bare=True,
                     shared=shared if shared else None)
        if storage_sibling:
            # write the special remote's uuid into the git config, so clone
            # can tell which one it is supposed to be and enable it even with
            # a fallback URL
            gr.config.add("datalad.ora-remote.uuid", uuid, where='local')

        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # TODO: do we need a cwd here?
            # chgrp_cmd is a single shell command string, so run it through a
            # shell; cwd should be a plain path, not a shell-quoted one
            subprocess.run(chgrp_cmd, cwd=ds.path, shell=True)

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into dirhash
    # lower annex/object tree instead of mixed, since it's a bare
    # repo. This in turn would be an issue, if we want to pack the
    # entire thing into an archive. Special remote will then not be
    # able to access content in the "wrong" place within the archive
    lgr.debug("set up git remote")
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing == 'reconfigure'
    ds.config.set(
        "remote.{}.annex-ignore".format(name),
        value="true",
        where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url
        if ssh_host
        else str(repo_path),
        recursive=False,
        # Note that this should be None if storage_sibling was not set
        publish_depends=storage_name,
        result_renderer=None,
        # Note that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True
    )

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )
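For orientation, `get_layout_locations(1, base_path, ds.id)` resolves where the dataset's bare repo lives inside the store. A hedged sketch of the v1 rule (`repo_path_v1` is a hypothetical helper; the three-character split is an assumption based on RIA's documented store layout):

# Hypothetical re-implementation of the v1 repo-path rule, for illustration
# only; the actual helper is datalad's get_layout_locations. The rule (first
# three characters of the dataset id form one directory level) is an
# assumption based on RIA's documented store layout.
from pathlib import Path

def repo_path_v1(base_path, ds_id):
    return Path(base_path) / ds_id[:3] / ds_id[3:]

print(repo_path_v1('/data/store', '8938de76-0302-11ea-a124-f0d5bf7b5561'))
# -> /data/store/893/8de76-0302-11ea-a124-f0d5bf7b5561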
Example #5
    def __call__(url,
                 name,
                 dataset=None,
                 storage_name=None,
                 post_update_hook=False,
                 shared=None,
                 group=None,
                 storage_sibling=True,
                 existing='error',
                 trust_level=None,
                 recursive=False,
                 recursion_limit=None,
                 disable_storage__=None,
                 ):
        if disable_storage__ is not None:
            import warnings
            warnings.warn("datalad-create-sibling-ria --no-storage-sibling "
                          "is deprecated, use --storage-sibling off instead.",
                          DeprecationWarning)
            # recode to new setup
            disable_storage__ = None
            storage_sibling = False

        if storage_sibling == 'only' and storage_name:
            lgr.warning(
                "Sibling name will be used for storage sibling in "
                "storage-sibling-only mode, but a storage sibling name "
                "was provided"
            )

        ds = require_dataset(
            dataset, check_installed=True, purpose='create sibling RIA')
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        # parse target URL
        try:
            ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
        except ValueError as e:
            yield get_status_dict(
                status='error',
                message=str(e),
                **res_kwargs
            )
            return

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError(
                "Repository at {} is not a DataLad dataset, "
                "run 'datalad create [--force]' first.".format(ds.path))

        if not storage_sibling and storage_name:
            lgr.warning(
                "Storage sibling setup disabled, but a storage sibling name "
                "was provided"
            )

        if storage_sibling and not storage_name:
            storage_name = "{}-storage".format(name)

        if storage_sibling and name == storage_name:
            # leads to unresolvable, circular dependency with publish-depends
            raise ValueError("sibling names must not be equal")

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since setting up a misconfiguration (particularly
        # of special remotes) only to fail in a subdataset later on can be
        # quite painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            # in recursive mode this check could take a substantial amount of
            # time: employ a progress bar (or rather a counter, because we
            # don't know the total in advance)
            pbar_id = 'check-siblings-{}'.format(id(ds))
            log_progress(
                lgr.info, pbar_id,
                'Start checking pre-existing sibling configuration %s', ds,
                label='Query siblings',
                unit=' Siblings',
            )
            # even if we have to fail, let's report all conflicting siblings
            # in subdatasets
            failed = False
            for r in ds.siblings(result_renderer=None,
                                 recursive=recursive,
                                 recursion_limit=recursion_limit):
                log_progress(
                    lgr.info, pbar_id,
                    'Discovered sibling %s in dataset at %s',
                    r['name'], r['path'],
                    update=1,
                    increment=True)
                if r['type'] != 'sibling' or r['status'] != 'ok':
                    # this is an internal status query that has no consequence
                    # for the outside world. Be silent unless something useful
                    # can be said
                    #yield r
                    continue
                if r['name'] == name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
                if storage_name and r['name'] == storage_name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(storage_name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
            log_progress(
                lgr.info, pbar_id,
                'Finished checking pre-existing sibling configuration %s', ds,
            )
            if failed:
                return

        # TODO: - URL parsing + store creation needs to be RF'ed based on
        #         command abstractions
        #       - more generally consider store creation a dedicated command or
        #         option
        # Note: URL parsing is done twice ATM (for top-level ds). This can't be
        # reduced to single instance, since rewriting url based on config could
        # be different for subdatasets.

        create_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                     Path(base_path),
                     '1')

        yield from _create_sibling_ria(
            ds,
            url,
            name,
            storage_sibling,
            storage_name,
            existing,
            shared,
            group,
            post_update_hook,
            trust_level,
            res_kwargs)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.

            for subds in ds.subdatasets(fulfilled=True,
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        result_xfm='datasets'):
                yield from _create_sibling_ria(
                    subds,
                    url,
                    name,
                    storage_sibling,
                    storage_name,
                    existing,
                    shared,
                    group,
                    post_update_hook,
                    trust_level,
                    res_kwargs)
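A hedged call-side sketch of this command through datalad's Python API (the `Dataset.create_sibling_ria` binding follows datalad's convention of exposing commands as dataset methods; the store URL and sibling name are examples):

# Call-side sketch; assumes the command is bound as a Dataset method per
# datalad's convention. The store URL and sibling name are examples.
from datalad.api import Dataset

ds = Dataset('/path/to/dataset')
ds.create_sibling_ria(
    'ria+ssh://storage.example.org/data/store',
    name='ria',            # git sibling; a 'ria-storage' sibling is implied
    existing='error',      # fail early if either sibling name already exists
    recursive=False,
)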
Example #6
    def __call__(
            url,
            name,
            *,  # note that `name` is required but not posarg in CLI
            dataset=None,
            storage_name=None,
            alias=None,
            post_update_hook=False,
            shared=None,
            group=None,
            storage_sibling=True,
            existing='error',
            new_store_ok=False,
            trust_level=None,
            recursive=False,
            recursion_limit=None,
            disable_storage__=None,
            push_url=None):
        if disable_storage__ is not None:
            import warnings
            warnings.warn(
                "datalad-create-sibling-ria --no-storage-sibling "
                "is deprecated, use --storage-sibling off instead.",
                DeprecationWarning)
            # recode to new setup
            disable_storage__ = None
            storage_sibling = False

        if storage_sibling == 'only' and storage_name:
            lgr.warning(
                "Sibling name will be used for storage sibling in "
                "storage-sibling-only mode, but a storage sibling name "
                "was provided")

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='create RIA sibling(s)')
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        # parse target URL
        # Note: URL parsing is done twice ATM (for top-level ds). This can't be
        # reduced to single instance, since rewriting url based on config could
        # be different for subdatasets.
        try:
            ssh_host, base_path, rewritten_url = \
                verify_ria_url(push_url if push_url else url, ds.config)
        except ValueError as e:
            yield get_status_dict(status='error', message=str(e), **res_kwargs)
            return

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError("Repository at {} is not a DataLad dataset, "
                               "run 'datalad create [--force]' first.".format(
                                   ds.path))

        if not storage_sibling and storage_name:
            lgr.warning(
                "Storage sibling setup disabled, but a storage sibling name "
                "was provided")

        if storage_sibling and not storage_name:
            storage_name = "{}-storage".format(name)

        if storage_sibling and name == storage_name:
            # leads to unresolvable, circular dependency with publish-depends
            raise ValueError("sibling names must not be equal")

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since setting up a misconfiguration (particularly
        # of special remotes) only to fail in a subdataset later on can be
        # quite painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            failed = False
            for dpath, sname in _yield_ds_w_matching_siblings(
                    ds, (name, storage_name),
                    recursive=recursive,
                    recursion_limit=recursion_limit):
                res = get_status_dict(
                    status='error',
                    message=(
                        "a sibling %r is already configured in dataset %r",
                        sname, dpath),
                    type='sibling',
                    name=sname,
                    ds=ds,
                    **res_kwargs,
                )
                failed = True
                yield res
            if failed:
                return
        # TODO: - URL parsing + store creation needs to be RF'ed based on
        #         command abstractions
        #       - more generally consider store creation a dedicated command or
        #         option

        io = SSHRemoteIO(ssh_host) if ssh_host else LocalIO()
        try:
            # determine the existence of a store by trying to read its layout.
            # Because this raises a FileNotFoundError if non-existent, we need
            # to catch it
            io.read_file(Path(base_path) / 'ria-layout-version')
        except (FileNotFoundError, RIARemoteError,
                RemoteCommandFailedError) as e:
            if not new_store_ok:
                # we're instructed to only act in case of an existing RIA store
                res = get_status_dict(status='error',
                                      message="No store found at '{}'. Forgot "
                                      "--new-store-ok ?".format(
                                          Path(base_path)),
                                      **res_kwargs)
                yield res
                return

        log_progress(
            lgr.info,
            'create-sibling-ria',
            'Creating a new RIA store at %s',
            Path(base_path),
        )
        create_store(io, Path(base_path), '1')

        yield from _create_sibling_ria(ds, url, push_url, name,
                                       storage_sibling, storage_name, alias,
                                       existing, shared, group,
                                       post_update_hook, trust_level,
                                       res_kwargs)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.

            for subds in ds.subdatasets(state='present',
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        return_type='generator',
                                        result_renderer='disabled',
                                        result_xfm='datasets'):
                yield from _create_sibling_ria(
                    subds,
                    url,
                    push_url,
                    name,
                    storage_sibling,
                    storage_name,
                    None,  # subdatasets can't have the same alias as the parent
                    existing,
                    shared,
                    group,
                    post_update_hook,
                    trust_level,
                    res_kwargs)
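The later signature adds `push_url`, `alias`, and `new_store_ok`. A hedged call-side sketch exercising those (same Dataset-method assumption as above; all values are examples):

# Call-side sketch for the extended signature; same Dataset-method binding
# assumption as above, all values are examples. Note the asymmetric setup:
# reads go over HTTP(S), pushes over SSH (push over HTTP is not implemented).
from datalad.api import Dataset

ds = Dataset('/path/to/dataset')
ds.create_sibling_ria(
    'ria+https://store.example.org/data/store',          # read access
    name='ria',
    push_url='ria+ssh://store.example.org/data/store',   # write access
    alias='myds',          # human-readable alias recorded in the store
    new_store_ok=True,     # allow initializing a brand-new store
    existing='error',
)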