def test_verify_ria_url():
    """Test `verify_ria_url` host/path splitting across supported schemes.

    Fix: the middle of the `cases` dict was garbled (two entries fused into
    a syntactically invalid `key: (value: (...))` construct by credential
    redaction); restored the port-only and password-bearing cases.
    """
    # unsupported protocol
    assert_raises(ValueError, verify_ria_url, 'ria+ftp://localhost/tmp/this', {})
    # bunch of cases that should work
    cases = {
        'ria+file:///tmp/this': (None, '/tmp/this'),
        # no normalization
        'ria+file:///tmp/this/': (None, '/tmp/this/'),
        # with hosts
        'ria+ssh://localhost/tmp/this': ('ssh://localhost', '/tmp/this'),
        'ria+http://localhost/tmp/this': ('http://localhost', '/tmp/this'),
        'ria+https://localhost/tmp/this': ('https://localhost', '/tmp/this'),
        # with username
        'ria+ssh://humbug@localhost/tmp/this':
            ('ssh://humbug@localhost', '/tmp/this'),
        # with port
        'ria+ssh://humbug@localhost:2222/tmp/this':
            ('ssh://humbug@localhost:2222', '/tmp/this'),
        'ria+ssh://localhost:2200/tmp/this':
            ('ssh://localhost:2200', '/tmp/this'),
        # with password
        'ria+https://humbug:1234@localhost:8080/tmp/this':
            ('https://humbug:1234@localhost:8080', '/tmp/this'),
        # document a strange (MIH thinks undesirable), but pre-existing
        # behavior an 'ssh example.com' would end up in the user HOME,
        # not in '/'
        'ria+ssh://example.com': ('ssh://example.com', '/')
    }
    for i, o in cases.items():
        # we are not testing the URL rewriting here
        assert_equal(o, verify_ria_url(i, {})[:2])
def _verify_config(self, gitdir, fail_noid=True):
    """Collect this special remote's settings from git/git-annex config.

    Populates ``self.ria_store_url``, ``self.store_base_path``,
    ``self.storage_host``, ``self.archive_id`` and ``self.force_write``
    from the annex special-remote config (with legacy fallbacks when no
    RIA URL is configured).

    Parameters
    ----------
    gitdir : str
        Path to the repository's git directory (passed to `_get_gitcfg`).
    fail_noid : bool
        When True, raise if no ``archive-id`` is configured.

    Raises
    ------
    RIARemoteError
        If the remote name cannot be determined, no base path is
        configured, the base path is not absolute, or (with `fail_noid`)
        no archive ID is present.
    """
    # everything below is driven by the special remote's own config
    name = self.annex.getconfig('name')
    if not name:
        raise RIARemoteError(
            "Cannot determine special remote name, got: {}".format(
                repr(name)))

    # a configured ria+... URL is the canonical source of host + path
    self.ria_store_url = self.annex.getconfig('url')
    if self.ria_store_url:
        # URL rewriting is supported without a DataLad ConfigManager by
        # reading the relevant 'url.*' settings straight from git-config
        rewrite_cfg = {}
        raw_cfg = _get_gitcfg(gitdir, "^url.*", regex=True)
        if raw_cfg:
            for cfg_line in raw_cfg.splitlines():
                cfg_key, cfg_val = cfg_line.split()
                rewrite_cfg[cfg_key] = cfg_val
        self.storage_host, self.store_base_path, self.ria_store_url = \
            verify_ria_url(self.ria_store_url, rewrite_cfg)

    # TODO duplicates call to `git-config` after RIA url rewrite
    self._load_cfg(gitdir, name)

    # legacy configs remain acceptable when no ria-URL is known
    if not self.ria_store_url:
        self.store_base_path = \
            self.store_base_path or self.annex.getconfig('base-path')
        if not self.store_base_path:
            raise RIARemoteError(
                "No remote base path configured. "
                "Specify `base-path` setting.")

    # regardless of origin (URL or legacy config), normalize to Path and
    # insist on an absolute object-tree base
    self.store_base_path = Path(self.store_base_path)
    if not self.store_base_path.is_absolute():
        raise RIARemoteError(
            'Non-absolute object tree base path configuration: %s'
            % str(self.store_base_path))

    # legacy configs remain acceptable when no ria-URL is known
    if not self.ria_store_url:
        # Special value '0' turns into None only after the repository's
        # annex config was consulted: '0' and None are treated uniformly
        # later on, but a user's explicit '0' overrules what git-annex
        # has stored.
        if self.storage_host == '0':
            self.storage_host = None
        elif not self.storage_host:
            self.storage_host = self.annex.getconfig('ssh-host')

    # go look for an ID
    self.archive_id = self.annex.getconfig('archive-id')
    if fail_noid and not self.archive_id:
        raise RIARemoteError(
            "No archive ID configured. This should not happen.")

    # TODO: This should prob. not be done! Would only have an effect if
    # force-write was committed annex-special-remote-config and this
    # is likely a bad idea.
    if not self.force_write:
        self.force_write = self.annex.getconfig('force-write')
def _verify_config(self, gitdir, fail_noid=True):
    """Collect this special remote's settings from git/git-annex config.

    Populates ``self.ria_store_url``/``self.ria_store_pushurl`` (and the
    host/base-path attributes derived from them), ``self.archive_id`` and
    ``self.force_write``. Legacy ``ssh-host``/``base-path`` configs are
    still honored (with a deprecation warning) when no RIA URL is set.

    Parameters
    ----------
    gitdir : str
        Path to the repository's git directory (passed to `_get_gitcfg`).
    fail_noid : bool
        When True, raise if no ``archive-id`` is configured.

    Raises
    ------
    RIARemoteError
        On missing remote name, missing/non-absolute base path, an HTTP
        push-url, or (with `fail_noid`) a missing archive ID.
    """
    # try loading all needed info from (git) config
    name = self.annex.getconfig('name')
    if not name:
        # a remote set up via `initremote --sameas` carries its identity here
        name = self.annex.getconfig('sameas-name')
    if not name:
        raise RIARemoteError(
            "Cannot determine special remote name, got: {}".format(
                repr(name)))
    # get store url(s):
    self.ria_store_url = self.annex.getconfig('url')
    self.ria_store_pushurl = self.annex.getconfig('push-url')
    # Support URL rewrite without talking to a DataLad ConfigManager,
    # because of additional import cost otherwise. Remember that this is a
    # special remote not a "real" datalad process.
    url_cfgs = dict()
    url_cfgs_raw = _get_gitcfg(gitdir, "^url.*", regex=True)
    if url_cfgs_raw:
        for line in url_cfgs_raw.splitlines():
            k, v = line.split()
            url_cfgs[k] = v
    if self.ria_store_url:
        self.storage_host, self.store_base_path, self.ria_store_url = \
            verify_ria_url(self.ria_store_url, url_cfgs)
    else:
        # for now still accept the configs, if no ria-URL is known, but
        # issue deprecation warning:
        # per-remote git config takes precedence over the annex-stored value
        host = _get_gitcfg(gitdir,
                           'annex.ora-remote.{}.ssh-host'.format(name)) or \
            self.annex.getconfig('ssh-host')
        # Note: Special value '0' is replaced by None only after checking
        # the repository's annex config. This is to uniformly handle '0' and
        # None later on, but let a user's config '0' overrule what's
        # stored by git-annex.
        self.storage_host = None if host == '0' else host
        path = _get_gitcfg(gitdir,
                           'annex.ora-remote.{}.base-path'.format(name)) or \
            self.annex.getconfig('base-path')
        self.store_base_path = path.strip() if path else path
        if path or host:
            self.message(
                "WARNING: base-path + ssh-host configs are "
                "deprecated and won't be considered in the future."
                " Use 'git annex enableremote {} "
                "url=<RIA-URL-TO-STORE>' to store a ria+<scheme>:"
                "//... URL in the special remote's config."
                "".format(name))

    if not self.store_base_path:
        raise RIARemoteError(
            "No base path configured for RIA store. Specify a proper "
            "ria+<scheme>://... URL.")

    # the base path is ultimately derived from a URL, always treat as POSIX
    self.store_base_path = PurePosixPath(self.store_base_path)
    if not self.store_base_path.is_absolute():
        raise RIARemoteError(
            'Non-absolute object tree base path configuration: %s'
            '' % str(self.store_base_path))

    if self.ria_store_pushurl:
        if self.ria_store_pushurl.startswith("ria+http"):
            raise RIARemoteError("Invalid push-url: {}. Pushing over HTTP "
                                 "not implemented."
                                 "".format(self.ria_store_pushurl))
        self.storage_host_push, self.store_base_path_push, \
            self.ria_store_pushurl = verify_ria_url(self.ria_store_pushurl,
                                                    url_cfgs)

    # TODO duplicates call to `git-config` after RIA url rewrite
    self._load_cfg(gitdir, name)

    # go look for an ID
    self.archive_id = self.annex.getconfig('archive-id')
    if fail_noid and not self.archive_id:
        raise RIARemoteError(
            "No archive ID configured. This should not happen.")

    # TODO: This should prob. not be done! Would only have an effect if
    # force-write was committed annex-special-remote-config and this
    # is likely a bad idea.
    if not self.force_write:
        self.force_write = self.annex.getconfig('force-write')
def _create_sibling_ria(
        ds,
        url,
        name,
        storage_sibling,
        storage_name,
        existing,
        shared,
        group,
        post_update_hook,
        trust_level,
        res_kwargs):
    """Create RIA siblings (git remote and/or ORA special remote) for one dataset.

    Generator yielding DataLad result records. Initializes the dataset
    layout in the RIA store, optionally sets up an 'ora' special remote,
    creates a bare repository in-store (locally or via SSH), and finally
    configures a git sibling pointing at it.

    Parameters
    ----------
    ds : Dataset
        Dataset to create siblings for.
    url : str
        ria+... URL of the target store.
    name : str
        Name of the git sibling.
    storage_sibling : bool or 'only'
        Whether to set up the special remote; 'only' skips the git sibling.
    storage_name : str or None
        Name for the storage sibling.
    existing : {'skip', 'error', 'reconfigure'}
        How to react to pre-existing siblings/repos.
    shared, group, post_update_hook, trust_level :
        Passed through to repo creation / annex configuration.
    res_kwargs : dict
        Common result-record fields (copied; 'ds' is overwritten per call).
    """
    # be safe across datasets
    res_kwargs = res_kwargs.copy()
    # update dataset
    res_kwargs['ds'] = ds

    if not isinstance(ds.repo, AnnexRepo):
        # No point in dealing with a special remote when there's no annex.
        # Note, that in recursive invocations this might only apply to some of
        # the datasets. Therefore dealing with it here rather than one level
        # up.
        lgr.debug("No annex at %s. Ignoring special remote options.", ds.path)
        storage_sibling = False
        storage_name = None

    # parse target URL
    try:
        ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(
            status='error',
            message=str(e),
            **res_kwargs
        )
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config
    )['giturl']
    # determine layout locations; go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure whether we are supposed to skip this very dataset
    if existing == 'skip' and (
            name in ds_siblings or (
                storage_name and storage_name in ds_siblings)):
        yield get_status_dict(
            status='notneeded',
            message="Skipped on existing sibling",
            **res_kwargs
        )
        # if we skip here, nothing else can change that decision further
        # down
        return

    # figure whether we need to skip or error due an existing target repo
    # before we try to init a special remote.
    if ssh_host:
        # import deferred: only needed when the store is remote
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(
            ssh_host,
            use_remote_annex_bundle=False)
        ssh.open()

    if existing in ['skip', 'error']:
        config_path = repo_path / 'config'
        # No .git -- if it's an existing repo in a RIA store it should be a
        # bare repo.
        # Theoretically we could have additional checks for whether we have
        # an empty repo dir or a non-bare repo or whatever else.
        if ssh_host:
            # probe existence remotely via a shell test on the config file
            try:
                ssh('[ -e {p} ]'.format(p=quote_cmdlinearg(str(config_path))))
                exists = True
            except CommandError:
                exists = False
        else:
            exists = config_path.exists()

        if exists:
            if existing == 'skip':
                # 1. not rendered by default
                # 2. message doesn't show up in ultimate result
                #    record as shown by -f json_pp
                yield get_status_dict(
                    status='notneeded',
                    message="Skipped on existing remote "
                            "directory {}".format(repo_path),
                    **res_kwargs
                )
                return
            else:  # existing == 'error'
                yield get_status_dict(
                    status='error',
                    message="remote directory {} already "
                            "exists.".format(repo_path),
                    **res_kwargs
                )
                return

    if storage_sibling == 'only':
        lgr.info("create storage sibling '{}' ...".format(name))
    else:
        lgr.info("create sibling{} '{}'{} ...".format(
            's' if storage_name else '',
            name,
            " and '{}'".format(storage_name) if storage_name else '',
        ))
    # initialize the dataset's directory layout in the store (v2 dataset
    # tree within a v1 store layout)
    create_ds_in_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                       base_path, ds.id, '2', '1')
    if storage_sibling:
        # we are using the main `name`, if the only thing we are creating
        # is the storage sibling
        srname = name if storage_sibling == 'only' else storage_name

        lgr.debug('init special remote {}'.format(srname))
        special_remote_options = [
            'type=external',
            'externaltype=ora',
            'encryption=none',
            'autoenable=true',
            'url={}'.format(url)]
        try:
            ds.repo.init_remote(
                srname,
                options=special_remote_options)
        except CommandError as e:
            if existing == 'reconfigure' \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.",
                    srname)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                ds.repo.call_annex([
                    'enableremote',
                    srname] + special_remote_options)
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s"
                    % (e.stdout, e.stderr),
                    **res_kwargs
                )
                return

        if trust_level:
            # trust_level is the git-annex subcommand name (e.g. 'trust')
            ds.repo.call_annex([trust_level, srname])
        # get uuid for use in bare repo's config
        uuid = ds.config.get("remote.{}.annex-uuid".format(srname))

    if storage_sibling == 'only':
        # we can stop here, the rest of the function is about setting up
        # the git remote part of the sibling
        yield get_status_dict(
            status='ok',
            **res_kwargs,
        )
        return

    # 2. create a bare repository in-store:

    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'

    if group:
        # built once, executed later in either the SSH or the local branch
        chgrp_cmd = "chgrp -R {} {}".format(
            quote_cmdlinearg(str(group)),
            quote_cmdlinearg(str(repo_path)))

    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(
                quote_cmdlinearg(shared)) if shared else ''
        ))

        if storage_sibling:
            # write special remote's uuid into git-config, so clone can
            # tell which one it is supposed to be and enable it even with
            # a fallback URL
            ssh("cd {rootdir} && git config datalad.ora-remote.uuid {uuid}"
                "".format(rootdir=quote_cmdlinearg(str(repo_path)),
                          uuid=uuid))

        if post_update_hook:
            # enable the hook by renaming git's shipped sample file
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))

        if group:
            # Either repository existed before or a new directory was
            # created for it, set its group to a desired one if was
            # provided with the same chgrp
            ssh(chgrp_cmd)
    else:
        gr = GitRepo(repo_path, create=True, bare=True,
                     shared=shared if shared else None)
        if storage_sibling:
            # write special remote's uuid into git-config, so clone can
            # tell which one it is supposed to be and enable it even with
            # a fallback URL
            gr.config.add("datalad.ora-remote.uuid", uuid, where='local')
        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # TODO; do we need a cwd here?
            # NOTE(review): chgrp_cmd is a single shell string; subprocess.run
            # without shell=True treats it as a program name, and quoting the
            # cwd path looks suspect — confirm this branch actually works.
            subprocess.run(chgrp_cmd, cwd=quote_cmdlinearg(ds.path))

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into dirhash lower
    # annex/object tree instead of mixed, since it's a bare repo. This in
    # turn would be an issue, if we want to pack the entire thing into an
    # archive. Special remote will then not be able to access content in
    # the "wrong" place within the archive
    lgr.debug("set up git remote")
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing == 'reconfigure'
    ds.config.set(
        "remote.{}.annex-ignore".format(name),
        value="true",
        where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url if ssh_host else str(repo_path),
        recursive=False,
        # Note, that this should be None if storage_sibling was not set
        publish_depends=storage_name,
        result_renderer=None,
        # Note, that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True
    )

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )
def __call__(url,
             name,
             dataset=None,
             storage_name=None,
             post_update_hook=False,
             shared=None,
             group=None,
             storage_sibling=True,
             existing='error',
             trust_level=None,
             recursive=False,
             recursion_limit=None,
             disable_storage__=None,
             ):
    """Entry point of `create-sibling-ria` (generator of result records).

    Validates arguments, optionally checks all (sub)datasets for
    conflicting sibling names upfront, creates the RIA store, and then
    delegates per-dataset sibling creation to `_create_sibling_ria`.
    `disable_storage__` is a deprecated alias for ``storage_sibling=False``.
    """
    if disable_storage__ is not None:
        import warnings
        warnings.warn("datalad-create-sibling-ria --no-storage-sibling "
                      "is deprecated, use --storage-sibling off instead.",
                      DeprecationWarning)
        # recode to new setup
        disable_storage__ = None
        storage_sibling = False

    if storage_sibling == 'only' and storage_name:
        lgr.warning(
            "Sibling name will be used for storage sibling in "
            "storage-sibling-only mode, but a storage sibling name "
            "was provided"
        )

    ds = require_dataset(
        dataset, check_installed=True, purpose='create sibling RIA')
    res_kwargs = dict(
        ds=ds,
        action="create-sibling-ria",
        logger=lgr,
    )

    # parse target URL
    try:
        ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(
            status='error',
            message=str(e),
            **res_kwargs
        )
        return

    if ds.repo.get_hexsha() is None or ds.id is None:
        raise RuntimeError(
            "Repository at {} is not a DataLad dataset, "
            "run 'datalad create [--force]' first.".format(ds.path))

    if not storage_sibling and storage_name:
        lgr.warning(
            "Storage sibling setup disabled, but a storage sibling name "
            "was provided"
        )

    if storage_sibling and not storage_name:
        storage_name = "{}-storage".format(name)

    if storage_sibling and name == storage_name:
        # leads to unresolvable, circular dependency with publish-depends
        raise ValueError("sibling names must not be equal")

    if not isinstance(url, str):
        raise TypeError("url is not a string, but %s" % type(url))

    # Query existing siblings upfront in order to fail early on
    # existing=='error', since misconfiguration (particularly of special
    # remotes) only to fail in a subdataset later on with that config, can
    # be quite painful.
    # TODO: messages - this is "create-sibling". Don't confuse existence of
    #       local remotes with existence of the actual remote sibling
    #       in wording
    if existing == 'error':
        # in recursive mode this check could take a substantial amount of
        # time: employ a progress bar (or rather a counter, because we
        # don't know the total in advance)
        pbar_id = 'check-siblings-{}'.format(id(ds))
        log_progress(
            lgr.info, pbar_id,
            'Start checking pre-existing sibling configuration %s', ds,
            label='Query siblings',
            unit=' Siblings',
        )
        # even if we have to fail, let's report all conflicting siblings
        # in subdatasets
        failed = False
        for r in ds.siblings(result_renderer=None,
                             recursive=recursive,
                             recursion_limit=recursion_limit):
            log_progress(
                lgr.info, pbar_id,
                'Discovered sibling %s in dataset at %s',
                r['name'], r['path'],
                update=1,
                increment=True)
            if not r['type'] == 'sibling' or r['status'] != 'ok':
                # this is an internal status query that has not consequence
                # for the outside world. Be silent unless something useful
                # can be said
                #yield r
                continue
            if r['name'] == name:
                res = get_status_dict(
                    status='error',
                    message="a sibling '{}' is already configured in "
                    "dataset {}".format(name, r['path']),
                    **res_kwargs,
                )
                failed = True
                yield res
                continue
            if storage_name and r['name'] == storage_name:
                res = get_status_dict(
                    status='error',
                    message="a sibling '{}' is already configured in "
                    "dataset {}".format(storage_name, r['path']),
                    **res_kwargs,
                )
                failed = True
                yield res
                continue
        log_progress(
            lgr.info, pbar_id,
            'Finished checking pre-existing sibling configuration %s', ds,
        )
        if failed:
            return

    # TODO: - URL parsing + store creation needs to be RF'ed based on
    #         command abstractions
    #       - more generally consider store creation a dedicated command or
    #         option
    # Note: URL parsing is done twice ATM (for top-level ds). This can't be
    # reduced to single instance, since rewriting url based on config could
    # be different for subdatasets.
    create_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                 Path(base_path),
                 '1')

    yield from _create_sibling_ria(
        ds,
        url,
        name,
        storage_sibling,
        storage_name,
        existing,
        shared,
        group,
        post_update_hook,
        trust_level,
        res_kwargs)

    if recursive:
        # Note: subdatasets can be treated independently, so go full
        # recursion when querying for them and _no_recursion with the
        # actual call. Theoretically this can be parallelized.
        for subds in ds.subdatasets(fulfilled=True,
                                    recursive=True,
                                    recursion_limit=recursion_limit,
                                    result_xfm='datasets'):
            yield from _create_sibling_ria(
                subds,
                url,
                name,
                storage_sibling,
                storage_name,
                existing,
                shared,
                group,
                post_update_hook,
                trust_level,
                res_kwargs)
def __call__(url,
             name,
             *,  # note that `name` is required but not posarg in CLI
             dataset=None,
             storage_name=None,
             alias=None,
             post_update_hook=False,
             shared=None,
             group=None,
             storage_sibling=True,
             existing='error',
             new_store_ok=False,
             trust_level=None,
             recursive=False,
             recursion_limit=None,
             disable_storage__=None,
             push_url=None):
    """Entry point of `create-sibling-ria` (generator of result records).

    Validates arguments, checks for conflicting sibling names, probes the
    target RIA store (creating it only with ``new_store_ok``), and then
    delegates per-dataset sibling creation to `_create_sibling_ria`.
    A `push_url`, when given, takes precedence for store access.
    `disable_storage__` is a deprecated alias for ``storage_sibling=False``.
    """
    if disable_storage__ is not None:
        import warnings
        warnings.warn(
            "datalad-create-sibling-ria --no-storage-sibling "
            "is deprecated, use --storage-sibling off instead.",
            DeprecationWarning)
        # recode to new setup
        disable_storage__ = None
        storage_sibling = False

    if storage_sibling == 'only' and storage_name:
        lgr.warning(
            "Sibling name will be used for storage sibling in "
            "storage-sibling-only mode, but a storage sibling name "
            "was provided")

    ds = require_dataset(dataset, check_installed=True,
                         purpose='create RIA sibling(s)')
    res_kwargs = dict(
        ds=ds,
        action="create-sibling-ria",
        logger=lgr,
    )

    # parse target URL
    # Note: URL parsing is done twice ATM (for top-level ds). This can't be
    # reduced to single instance, since rewriting url based on config could
    # be different for subdatasets.
    try:
        ssh_host, base_path, rewritten_url = \
            verify_ria_url(push_url if push_url else url, ds.config)
    except ValueError as e:
        yield get_status_dict(status='error', message=str(e), **res_kwargs)
        return

    if ds.repo.get_hexsha() is None or ds.id is None:
        raise RuntimeError("Repository at {} is not a DataLad dataset, "
                           "run 'datalad create [--force]' first.".format(
                               ds.path))

    if not storage_sibling and storage_name:
        lgr.warning(
            "Storage sibling setup disabled, but a storage sibling name "
            "was provided")

    if storage_sibling and not storage_name:
        storage_name = "{}-storage".format(name)

    if storage_sibling and name == storage_name:
        # leads to unresolvable, circular dependency with publish-depends
        raise ValueError("sibling names must not be equal")

    if not isinstance(url, str):
        raise TypeError("url is not a string, but %s" % type(url))

    # Query existing siblings upfront in order to fail early on
    # existing=='error', since misconfiguration (particularly of special
    # remotes) only to fail in a subdataset later on with that config, can
    # be quite painful.
    # TODO: messages - this is "create-sibling". Don't confuse existence of
    #       local remotes with existence of the actual remote sibling
    #       in wording
    if existing == 'error':
        failed = False
        for dpath, sname in _yield_ds_w_matching_siblings(
                ds,
                (name, storage_name),
                recursive=recursive,
                recursion_limit=recursion_limit):
            res = get_status_dict(
                status='error',
                message=(
                    "a sibling %r is already configured in dataset %r",
                    sname, dpath),
                type='sibling',
                name=sname,
                ds=ds,
                **res_kwargs,
            )
            failed = True
            yield res
        if failed:
            return

    # TODO: - URL parsing + store creation needs to be RF'ed based on
    #         command abstractions
    #       - more generally consider store creation a dedicated command or
    #         option
    io = SSHRemoteIO(ssh_host) if ssh_host else LocalIO()
    try:
        # determine the existence of a store by trying to read its layout.
        # Because this raises a FileNotFound error if non-existent, we need
        # to catch it
        # NOTE(review): `e` is bound but unused below — confirm intentional
        io.read_file(Path(base_path) / 'ria-layout-version')
    except (FileNotFoundError, RIARemoteError,
            RemoteCommandFailedError) as e:
        if not new_store_ok:
            # we're instructed to only act in case of an existing RIA store
            res = get_status_dict(status='error',
                                  message="No store found at '{}'. Forgot "
                                          "--new-store-ok ?".format(
                                              Path(base_path)),
                                  **res_kwargs)
            yield res
            return
        # store is absent and we are allowed to create it
        log_progress(
            lgr.info, 'create-sibling-ria',
            'Creating a new RIA store at %s', Path(base_path),
        )
        create_store(io, Path(base_path), '1')

    yield from _create_sibling_ria(ds,
                                   url,
                                   push_url,
                                   name,
                                   storage_sibling,
                                   storage_name,
                                   alias,
                                   existing,
                                   shared,
                                   group,
                                   post_update_hook,
                                   trust_level,
                                   res_kwargs)

    if recursive:
        # Note: subdatasets can be treated independently, so go full
        # recursion when querying for them and _no_recursion with the
        # actual call. Theoretically this can be parallelized.
        for subds in ds.subdatasets(state='present',
                                    recursive=True,
                                    recursion_limit=recursion_limit,
                                    return_type='generator',
                                    result_renderer='disabled',
                                    result_xfm='datasets'):
            yield from _create_sibling_ria(
                subds,
                url,
                push_url,
                name,
                storage_sibling,
                storage_name,
                None,  # subdatasets can't have the same alias as the parent
                existing,
                shared,
                group,
                post_update_hook,
                trust_level,
                res_kwargs)