def _check_and_update_remote_server_info(ds, remote):
    """Trigger ``git update-server-info`` on a published-over-http remote.

    If the sibling `remote` of dataset `ds` has an http(s) fetch url and an
    ssh ``annexurl`` configured, connect via ssh and run
    ``git update-server-info`` in the remote repository so that dumb-http
    clients see the updated refs.  Apparently this is also done by the
    post-update hook installed by create_sibling, but here more checks and
    preparation are needed.

    Parameters
    ----------
    ds : Dataset
      Dataset whose git config is consulted for the remote's urls.
    remote : str
      Name of the sibling/remote to inspect.

    Returns
    -------
    bool
      True if the hook was triggered over ssh, False otherwise.
    """
    remote_url = ds.repo.config.get('remote.%s.url' % remote, None)
    if remote_url:
        remote_url = RI(remote_url)
        if isinstance(remote_url, URL) and remote_url.scheme in (
                'http', 'https'):
            remote_annexurl = ds.repo.config.get(
                'remote.%s.annexurl' % remote, None)
            if remote_annexurl:
                remote_annexurl_ri = RI(remote_annexurl)
                if is_ssh(remote_annexurl_ri):
                    ssh = ssh_manager.get_connection(remote_annexurl_ri)
                    ssh('git -C {} update-server-info'.format(
                        sh_quote(remote_annexurl_ri.path)))
                    return True
                else:
                    # BUG FIX: previous message claimed "no annexurl defined",
                    # but this branch runs exactly when annexurl IS defined,
                    # just not an ssh one
                    lgr.debug(
                        "There is an annexurl defined but it is not ssh: %s, "
                        "dunno if "
                        "we could/should do anything",
                        remote_annexurl
                    )
    return False
def __call__(login, cmd, port=None, no_stdin=False): lgr.debug("sshrun invoked: %r %r %r %r", login, cmd, port, no_stdin) # Perspective workarounds for git-annex invocation, see # https://github.com/datalad/datalad/issues/1456#issuecomment-292641319 if cmd.startswith("'") and cmd.endswith("'"): lgr.debug( "Detected additional level of quotations in %r so performing " "shlex split", cmd ) # there is an additional layer of quotes # Let's strip them off using shlex import shlex cmd_ = shlex.split(cmd) if len(cmd_) != 1: raise RuntimeError( "Obtained more or less than a single argument upon shlex " "split: %s" % repr(cmd_)) cmd = cmd_[0] sshurl = 'ssh://{}{}'.format( login, ':{}'.format(port) if port else '') ssh = ssh_manager.get_connection(sshurl) # TODO: /dev/null on windows ;) or may be could be just None? stdin_ = open('/dev/null', 'r') if no_stdin else sys.stdin try: out, err = ssh(cmd, stdin=stdin_, log_output=False) finally: if no_stdin: stdin_.close() os.write(1, out.encode('UTF-8')) os.write(2, err.encode('UTF-8'))
def __call__(login, cmd, port=None, no_stdin=False):
    """Execute `cmd` on remote host `login` over a pooled SSH connection.

    Remote stdout and stderr are mirrored to this process' own
    stdout/stderr file descriptors.
    """
    lgr.debug("sshrun invoked: %r %r %r %r", login, cmd, port, no_stdin)
    # Perspective workarounds for git-annex invocation, see
    # https://github.com/datalad/datalad/issues/1456#issuecomment-292641319
    has_extra_quoting = cmd.startswith("'") and cmd.endswith("'")
    if has_extra_quoting:
        lgr.debug(
            "Detected additional level of quotations in %r so performing "
            "shlex split", cmd)
        # strip the surplus quoting layer via shell-style tokenization
        import shlex
        tokens = shlex.split(cmd)
        if len(tokens) != 1:
            raise RuntimeError(
                "Obtained more or less than a single argument upon shlex "
                "split: %s" % repr(tokens))
        cmd = tokens[0]
    port_spec = ':{}'.format(port) if port else ''
    sshurl = 'ssh://{}{}'.format(login, port_spec)
    connection = ssh_manager.get_connection(sshurl)
    # TODO: /dev/null on windows ;) or may be could be just None?
    if no_stdin:
        stdin_stream = open('/dev/null', 'r')
    else:
        stdin_stream = sys.stdin
    try:
        out, err = connection(cmd, stdin=stdin_stream, log_output=False)
    finally:
        # close only the file we opened; leave sys.stdin alone
        if no_stdin:
            stdin_stream.close()
    os.write(1, out.encode('UTF-8'))
    os.write(2, err.encode('UTF-8'))
def test_replace_and_relative_sshpath(src_path, dst_path):
    """Integration test: create_sibling with a relative ssh path, then
    replace an existing sibling and verify publishing and the
    post-update hook keep working (gh-1653, gh-1656, gh-1658).

    Requires a working ssh connection to localhost; statements are
    strictly order-dependent (each step mutates local and remote state).
    """
    # We need to come up with the path relative to our current home directory
    # https://github.com/datalad/datalad/issues/1653
    # but because we override HOME the HOME on the remote end would be
    # different even though a localhost. So we need to query it
    from datalad import ssh_manager
    ssh = ssh_manager.get_connection('localhost')
    remote_home, err = ssh('pwd')
    assert not err
    remote_home = remote_home.rstrip('\n')
    dst_relpath = os.path.relpath(dst_path, remote_home)
    url = 'localhost:%s' % dst_relpath
    ds = Dataset(src_path).create()
    create_tree(ds.path, {'sub.dat': 'lots of data'})
    ds.add('sub.dat')
    ds.create_sibling(url, ui=True)
    published = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published, 1, path=opj(ds.path, 'sub.dat'))
    # verify that hook runs and there is nothing in stderr
    # since it exits with 0 exit even if there was a problem
    out, err = Runner(cwd=opj(dst_path, '.git'))(_path_('hooks/post-update'))
    assert_false(out)
    assert_false(err)

    # Verify that we could replace and publish no problem
    # https://github.com/datalad/datalad/issues/1656
    # Strangely it spits outs IncompleteResultsError exception atm... so just
    # checking that it fails somehow
    res = ds.create_sibling(url, on_failure='ignore')
    assert_status('error', res)
    assert_in('already configured', res[0]['message'][0])
    # "Settings" such as UI do not persist, so we specify it again
    # for the test below depending on it
    ds.create_sibling(url, existing='replace', ui=True)
    published2 = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published2, 1, path=opj(ds.path, 'sub.dat'))

    # and one more test since in above test it would not puke ATM but just
    # not even try to copy since it assumes that file is already there
    create_tree(ds.path, {'sub2.dat': 'more data'})
    ds.add('sub2.dat')
    # we publish just git
    published3 = ds.publish(to='localhost', transfer_data='none')
    assert_result_count(published3, 0, path=opj(ds.path, 'sub2.dat'))
    # now publish "with" data, which should also trigger the hook!
    # https://github.com/datalad/datalad/issues/1658
    from glob import glob
    from datalad.consts import WEB_META_LOG
    logs_prior = glob(_path_(dst_path, WEB_META_LOG, '*'))
    published4 = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published4, 1, path=opj(ds.path, 'sub2.dat'))
    logs_post = glob(_path_(dst_path, WEB_META_LOG, '*'))
    # the hook must have produced exactly one new metadata log
    eq_(len(logs_post), len(logs_prior) + 1)

    assert_postupdate_hooks(dst_path)
def test_replace_and_relative_sshpath(src_path, dst_path):
    """Integration test: create_sibling with a relative ssh path, replace
    an existing sibling, and verify publish + post-update hook behavior
    (gh-1653, gh-1656, gh-1658).  Uses `ds.save` (newer API variant).

    Requires ssh access to localhost; steps are order-dependent.
    """
    # We need to come up with the path relative to our current home directory
    # https://github.com/datalad/datalad/issues/1653
    # but because we override HOME the HOME on the remote end would be
    # different even though a localhost. So we need to query it
    from datalad import ssh_manager
    ssh = ssh_manager.get_connection('localhost')
    remote_home, err = ssh('pwd')
    assert not err
    remote_home = remote_home.rstrip('\n')
    dst_relpath = os.path.relpath(dst_path, remote_home)
    url = 'localhost:%s' % dst_relpath
    ds = Dataset(src_path).create()
    create_tree(ds.path, {'sub.dat': 'lots of data'})
    ds.save('sub.dat')
    ds.create_sibling(url, ui=True)
    published = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published, 1, path=opj(ds.path, 'sub.dat'))
    # verify that hook runs and there is nothing in stderr
    # since it exits with 0 exit even if there was a problem
    out, err = Runner(cwd=opj(dst_path, '.git'))(_path_('hooks/post-update'))
    assert_false(out)
    assert_false(err)

    # Verify that we could replace and publish no problem
    # https://github.com/datalad/datalad/issues/1656
    # Strangely it spits outs IncompleteResultsError exception atm... so just
    # checking that it fails somehow
    res = ds.create_sibling(url, on_failure='ignore')
    assert_status('error', res)
    assert_in('already configured', res[0]['message'][0])
    # "Settings" such as UI do not persist, so we specify it again
    # for the test below depending on it
    ds.create_sibling(url, existing='replace', ui=True)
    published2 = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published2, 1, path=opj(ds.path, 'sub.dat'))

    # and one more test since in above test it would not puke ATM but just
    # not even try to copy since it assumes that file is already there
    create_tree(ds.path, {'sub2.dat': 'more data'})
    ds.save('sub2.dat')
    # we publish just git
    published3 = ds.publish(to='localhost', transfer_data='none')
    assert_result_count(published3, 0, path=opj(ds.path, 'sub2.dat'))
    # now publish "with" data, which should also trigger the hook!
    # https://github.com/datalad/datalad/issues/1658
    from glob import glob
    from datalad.consts import WEB_META_LOG
    logs_prior = glob(_path_(dst_path, WEB_META_LOG, '*'))
    published4 = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published4, 1, path=opj(ds.path, 'sub2.dat'))
    logs_post = glob(_path_(dst_path, WEB_META_LOG, '*'))
    # exactly one new metadata log entry must have been produced
    eq_(len(logs_post), len(logs_prior) + 1)

    assert_postupdate_hooks(dst_path)
def __call__(login, cmd, *, port=None, ipv4=False, ipv6=False,
             options=None, no_stdin=False):
    """Run `cmd` on remote host `login` through a pooled SSH connection.

    Remote stdout/stderr bytes are forwarded to this process' own
    stdout/stderr file descriptors.  `ipv4` and `ipv6` are mutually
    exclusive and force the corresponding IP protocol version.
    """
    lgr.debug(
        "sshrun invoked: login=%r, cmd=%r, port=%r, options=%r, "
        "ipv4=%r, ipv6=%r, no_stdin=%r",
        login, cmd, port, options, ipv4, ipv6, no_stdin)
    # Perspective workarounds for git-annex invocation, see
    # https://github.com/datalad/datalad/issues/1456#issuecomment-292641319
    wrapped_in_quotes = cmd.startswith("'") and cmd.endswith("'")
    if wrapped_in_quotes:
        lgr.debug(
            "Detected additional level of quotations in %r so performing "
            "command line splitting", cmd)
        # drop the surplus quoting layer by re-splitting the command line
        tokens = split_cmdline(cmd)
        if len(tokens) != 1:
            raise RuntimeError(
                "Obtained more or less than a single argument after "
                "command line splitting: %s" % repr(tokens))
        cmd = tokens[0]
    port_spec = ':{}'.format(port) if port else ''
    sshurl = 'ssh://{}{}'.format(login, port_spec)

    if ipv4 and ipv6:
        raise ValueError("Cannot force both IPv4 and IPv6")
    # pick the forced protocol version, or None for "no preference"
    force_ip = 4 if ipv4 else (6 if ipv6 else None)

    connection = ssh_manager.get_connection(sshurl, force_ip=force_ip)

    # use an empty temp file as stdin if none shall be connected
    stdin_src = tempfile.TemporaryFile() if no_stdin else sys.stdin
    try:
        out, err = connection(
            cmd, stdin=stdin_src, log_output=False, options=options)
    finally:
        # close only the temp file we created; never close sys.stdin
        if no_stdin:
            stdin_src.close()
    os.write(1, out.encode('UTF-8'))
    os.write(2, err.encode('UTF-8'))
def __call__(sshurl, target=None, target_dir=None,
             target_url=None, target_pushurl=None,
             dataset=None, recursive=False,
             existing='error', shared=False, ui=False,
             as_common_datasrc=None,
             publish_by_default=None,
             publish_depends=None):
    """Create sibling repositories on a remote host via SSH.

    For the given dataset (and, with ``recursive``, its subdatasets)
    create a repository on the host addressed by `sshurl`, configure it
    (push-to-checked-out, post-update hook, optional web UI), run the
    post-update hooks depth-first, and finally register the new siblings
    locally via ``AddSibling`` when `target` is given.

    `existing` controls what happens when the remote directory is already
    present: 'error' (raise), 'skip', 'replace' (remove and recreate), or
    'reconfigure' (keep repo, redo configuration).

    NOTE(review): statement order here mutates remote state step by step;
    each step assumes the previous ones succeeded.
    """
    if sshurl is None:
        raise ValueError("""insufficient information for target creation
        (needs at least a dataset and a SSH URL).""")

    if target is None and (target_url is not None or
                           target_pushurl is not None):
        raise ValueError("""insufficient information for adding the target
        as a sibling (needs at least a name)""")

    # shortcut
    ds = require_dataset(dataset, check_installed=True,
                         purpose='creating a sibling')

    assert(ds is not None and sshurl is not None and ds.repo is not None)

    # determine target parameters:
    sshri = RI(sshurl)

    if not isinstance(sshri, SSHRI) \
            and not (isinstance(sshri, URL) and sshri.scheme == 'ssh'):
        raise ValueError("Unsupported SSH URL: '{0}', use ssh://host/path or host:path syntax".format(sshurl))

    if target_dir is None:
        # default the remote directory to the path portion of the URL
        if sshri.path:
            target_dir = sshri.path
        else:
            target_dir = '.'

    # TODO: centralize and generalize template symbol handling
    # "%NAME" in target_dir means each dataset gets its own templated dir;
    # otherwise the local subdataset layout is mirrored remotely
    replicate_local_structure = False
    if "%NAME" not in target_dir:
        replicate_local_structure = True

    # collect datasets to use:
    datasets = dict()
    datasets[basename(ds.path)] = ds
    if recursive:
        for subds in ds.get_subdatasets(recursive=True):
            sub_path = opj(ds.path, subds)
            # TODO: when enhancing Dataset/*Repo classes and therefore
            # adapt to moved code, make proper distinction between name and
            # path of a submodule, which are technically different. This
            # probably will become important on windows as well as whenever
            # we want to allow for moved worktrees.
            datasets[basename(ds.path) + '/' + subds] = \
                Dataset(sub_path)

    # request ssh connection:
    not_supported_on_windows("TODO")
    lgr.info("Connecting ...")
    ssh = ssh_manager.get_connection(sshurl)
    ssh.open()

    # flag to check if at dataset_root
    at_root = True

    # loop over all datasets, ordered from top to bottom to make test
    # below valid (existing directories would cause the machinery to halt)
    # But we need to run post-update hook in depth-first fashion, so
    # would only collect first and then run (see gh #790)
    remote_repos_to_run_hook_for = []
    for current_dspath in \
            sorted(datasets.keys(), key=lambda x: x.count('/')):
        current_ds = datasets[current_dspath]
        if not current_ds.is_installed():
            lgr.info("Skipping %s since not installed locally",
                     current_dspath)
            continue
        if not replicate_local_structure:
            # substitute the dataset name into the %NAME template
            path = target_dir.replace("%NAME",
                                      current_dspath.replace("/", "-"))
        else:
            # TODO: opj depends on local platform, not the remote one.
            # check how to deal with it. Does windows ssh server accept
            # posix paths? vice versa? Should planned SSH class provide
            # tools for this issue?
            path = normpath(opj(target_dir,
                                relpath(datasets[current_dspath].path,
                                        start=ds.path)))

        lgr.info("Creating target dataset {0} at {1}".format(
            current_dspath, path))
        # Must be set to True only if exists and existing='reconfigure'
        # otherwise we might skip actions if we say existing='reconfigure'
        # but it did not even exist before
        only_reconfigure = False
        if path != '.':
            # check if target exists
            # TODO: Is this condition valid for != '.' only?
            path_exists = True
            try:
                out, err = ssh(["ls", path])
            except CommandError as e:
                if "No such file or directory" in e.stderr and \
                        path in e.stderr:
                    path_exists = False
                else:
                    raise  # It's an unexpected failure here

            if path_exists:
                if existing == 'error':
                    raise RuntimeError(
                        "Target directory %s already exists." % path)
                elif existing == 'skip':
                    continue
                elif existing == 'replace':
                    # enable write permissions to allow removing dir
                    ssh(["chmod", "+r+w", "-R", path])
                    # remove target at path
                    ssh(["rm", "-rf", path])
                    # if we succeeded in removing it
                    path_exists = False
                elif existing == 'reconfigure':
                    only_reconfigure = True
                else:
                    raise ValueError(
                        "Do not know how to handle existing=%s"
                        % repr(existing))

            if not path_exists:
                try:
                    ssh(["mkdir", "-p", path])
                except CommandError as e:
                    lgr.error("Remotely creating target directory failed at "
                              "%s.\nError: %s" % (path, exc_str(e)))
                    continue

        # don't (re-)initialize dataset if existing == reconfigure
        if not only_reconfigure:
            # init git and possibly annex repo
            if not CreateSibling.init_remote_repo(
                    path, ssh, shared, datasets[current_dspath],
                    description=target_url):
                continue

        # check git version on remote end
        lgr.info("Adjusting remote git configuration")
        remote_git_version = CreateSibling.get_remote_git_version(ssh)
        if remote_git_version and remote_git_version >= "2.4":
            # allow for pushing to checked out branch
            try:
                ssh(["git", "-C", path] +
                    ["config", "receive.denyCurrentBranch", "updateInstead"])
            except CommandError as e:
                lgr.error("git config failed at remote location %s.\n"
                          "You will not be able to push to checked out "
                          "branch. Error: %s", path, exc_str(e))
        else:
            lgr.error("Git version >= 2.4 needed to configure remote."
                      " Version detected on server: %s\nSkipping configuration"
                      " of receive.denyCurrentBranch - you will not be able to"
                      " publish updates to this repository. Upgrade your git"
                      " and run with --existing=reconfigure"
                      % remote_git_version)

        # enable metadata refresh on dataset updates to publication server
        lgr.info("Enabling git post-update hook ...")
        try:
            CreateSibling.create_postupdate_hook(
                path, ssh, datasets[current_dspath])
        except CommandError as e:
            lgr.error("Failed to add json creation command to post update "
                      "hook.\nError: %s" % exc_str(e))

        # publish web-interface to root dataset on publication server
        if at_root and ui:
            lgr.info("Uploading web interface to %s" % path)
            at_root = False
            try:
                CreateSibling.upload_web_interface(path, ssh, shared, ui)
            except CommandError as e:
                lgr.error("Failed to push web interface to the remote "
                          "datalad repository.\nError: %s" % exc_str(e))

        remote_repos_to_run_hook_for.append(path)

    # in reverse order would be depth first
    lgr.debug("Running post-update hooks in all created siblings")
    for path in remote_repos_to_run_hook_for[::-1]:
        # Trigger the hook
        try:
            ssh(
                ["cd '" + _path_(path, ".git") + "' && hooks/post-update"],
                wrap_args=False  # we wrapped here manually
            )
        except CommandError as e:
            lgr.error("Failed to run post-update hook under path %s. "
                      "Error: %s" % (path, exc_str(e)))

    if target:
        # add the sibling(s):
        lgr.debug("Adding the siblings")
        if target_url is None:
            target_url = sshurl
        if target_pushurl is None and sshurl != target_url:
            target_pushurl = sshurl
        AddSibling()(dataset=ds,
                     name=target,
                     url=target_url,
                     pushurl=target_pushurl,
                     recursive=recursive,
                     fetch=True,
                     force=existing in {'replace'},
                     as_common_datasrc=as_common_datasrc,
                     publish_by_default=publish_by_default,
                     publish_depends=publish_depends)
def __call__(sshurl, name=None, target_dir=None,
             target_url=None, target_pushurl=None,
             dataset=None,
             recursive=False,
             recursion_limit=None,
             existing='error',
             shared=None,
             group=None,
             ui=False,
             as_common_datasrc=None,
             publish_by_default=None,
             publish_depends=None,
             annex_wanted=None, annex_group=None, annex_groupwanted=None,
             inherit=False,
             since=None):
    """Create sibling repositories via SSH, yielding result records.

    Generator variant of create_sibling: annotates the dataset (and
    subdatasets) to process, creates/configures a remote repository for
    each via ``_create_dataset_sibling``, optionally uploads the web UI
    for the root dataset, and finally runs post-update hooks depth-first.
    Each processed dataset yields one result record dict.

    With ``inherit=True`` and no `sshurl`, the url is derived from the
    same-named sibling of the superdataset.

    NOTE(review): order of remote operations matters; do not reorder.
    """
    #
    # nothing without a base dataset
    #
    ds = require_dataset(dataset, check_installed=True,
                         purpose='creating a sibling')
    refds_path = ds.path
    #
    # all checks that are possible before we start parsing the dataset
    #
    # possibly use sshurl to get the name in case if not specified
    if not sshurl:
        if not inherit:
            raise InsufficientArgumentsError(
                "needs at least an SSH URL, if no inherit option"
            )
        if name is None:
            raise ValueError(
                "Neither SSH URL, nor the name of sibling to inherit from "
                "was specified"
            )
        # It might well be that we already have this remote setup
        try:
            sshurl = CreateSibling._get_remote_url(ds, name)
        except Exception as exc:
            lgr.debug('%s does not know about url for %s: %s',
                      ds, name, exc_str(exc))
    elif inherit:
        raise ValueError(
            "For now, for clarity not allowing specifying a custom sshurl "
            "while inheriting settings"
        )
        # may be could be safely dropped -- still WiP

    if not sshurl:
        # TODO: may be more back up before _prep?
        super_ds = ds.get_superdataset()
        if not super_ds:
            raise ValueError(
                "Could not determine super dataset for %s to inherit URL"
                % ds
            )
        super_url = CreateSibling._get_remote_url(super_ds, name)
        # for now assuming hierarchical setup
        # (TODO: to be able to destinguish between the two, probably
        # needs storing datalad.*.target_dir to have %RELNAME in there)
        sshurl = slash_join(super_url, relpath(ds.path, super_ds.path))

    # check the login URL
    sshri = RI(sshurl)
    if not is_ssh(sshri):
        raise ValueError(
            "Unsupported SSH URL: '{0}', "
            "use ssh://host/path or host:path syntax".format(sshurl))

    if not name:
        # use the hostname as default remote name
        name = sshri.hostname
        lgr.debug(
            "No sibling name given, use URL hostname '%s' as sibling name",
            name)

    if since == '':
        # consider creating siblings only since the point of
        # the last update
        # XXX here we assume one to one mapping of names from local branches
        # to the remote
        active_branch = ds.repo.get_active_branch()
        since = '%s/%s' % (name, active_branch)

    #
    # parse the base dataset to find all subdatasets that need processing
    #
    to_process = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            # only a single path!
            path=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='create_sibling',
            # both next should not happen anyways
            unavailable_path_status='impossible',
            nondataset_path_status='error',
            modified=since,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) != 'dataset' \
                or ap.get('state', None) == 'absent':
            # this can happen when there is `since`, but we have no
            # use for anything but datasets here
            continue
        checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
            if ap.get('state', None) != 'absent' \
            else []
        if publish_depends:
            # make sure dependencies are valid
            # TODO: inherit -- we might want to automagically create
            # those dependents as well???
            unknown_deps = set(
                assure_list(publish_depends)).difference(checkds_remotes)
            if unknown_deps:
                ap['status'] = 'error'
                ap['message'] = (
                    'unknown sibling(s) specified as publication dependency: %s',
                    unknown_deps)
                yield ap
                continue
        if name in checkds_remotes and existing in ('error', 'skip'):
            ap['status'] = 'error' if existing == 'error' else 'notneeded'
            ap['message'] = (
                "sibling '%s' already configured (specify alternative name, or force "
                "reconfiguration via --existing",
                name)
            yield ap
            continue
        to_process.append(ap)

    if not to_process:
        # we ruled out all possibilities
        # TODO wait for gh-1218 and make better return values
        lgr.info("No datasets qualify for sibling creation. "
                 "Consider different settings for --existing "
                 "or --since if this is unexpected")
        return

    if target_dir is None:
        # default the remote directory to the path portion of the URL
        if sshri.path:
            target_dir = sshri.path
        else:
            target_dir = '.'

    # TODO: centralize and generalize template symbol handling
    replicate_local_structure = "%RELNAME" not in target_dir

    # request ssh connection:
    lgr.info("Connecting ...")
    assert(sshurl is not None)  # delayed anal verification
    ssh = ssh_manager.get_connection(sshurl)
    if not ssh.get_annex_version():
        raise MissingExternalDependency(
            'git-annex',
            msg='on the remote system')

    #
    # all checks done and we have a connection, now do something
    #

    # loop over all datasets, ordered from top to bottom to make test
    # below valid (existing directories would cause the machinery to halt)
    # But we need to run post-update hook in depth-first fashion, so
    # would only collect first and then run (see gh #790)
    yielded = set()
    remote_repos_to_run_hook_for = []
    for currentds_ap in \
            sorted(to_process, key=lambda x: x['path'].count('/')):
        current_ds = Dataset(currentds_ap['path'])

        path = _create_dataset_sibling(
            name,
            current_ds,
            ds.path,
            ssh,
            replicate_local_structure,
            sshri,
            target_dir,
            target_url,
            target_pushurl,
            existing,
            shared,
            group,
            publish_depends,
            publish_by_default,
            ui,
            as_common_datasrc,
            annex_wanted,
            annex_group,
            annex_groupwanted,
            inherit
        )
        if not path:
            # nothing new was created
            # TODO is 'notneeded' appropriate in this case?
            currentds_ap['status'] = 'notneeded'
            # TODO explain status in 'message'
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        remote_repos_to_run_hook_for.append((path, currentds_ap))

        # publish web-interface to root dataset on publication server
        if current_ds.path == ds.path and ui:
            lgr.info("Uploading web interface to %s" % path)
            try:
                CreateSibling.upload_web_interface(path, ssh, shared, ui)
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to push web interface to the remote datalad repository (%s)",
                    exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue

    # in reverse order would be depth first
    lgr.info("Running post-update hooks in all created siblings")
    # TODO: add progressbar
    for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
        # Trigger the hook
        lgr.debug("Running hook for %s (if exists and executable)", path)
        try:
            ssh("cd {} "
                "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                "".format(sh_quote(_path_(path, ".git"))))
        except CommandError as e:
            currentds_ap['status'] = 'error'
            currentds_ap['message'] = (
                "failed to run post-update hook under remote path %s (%s)",
                path, exc_str(e))
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        if not currentds_ap['path'] in yielded:
            # if we were silent until now everything is just splendid
            currentds_ap['status'] = 'ok'
            yield currentds_ap
def _create_sibling_ria(
        ds,
        url,
        name,
        ria_remote,
        ria_remote_name,
        existing,
        shared,
        group,
        post_update_hook,
        res_kwargs):
    """Create a RIA store sibling for dataset `ds`, yielding result records.

    Parses the ``ria+`` target `url` into an (optional) ssh host and a
    store base path, optionally initializes a RIA special remote, creates
    the bare repository in the store (locally or over ssh), optionally
    enables the post-update hook and adjusts the group, and finally
    configures a git remote `name` pointing at the store.

    `existing` handling: 'skip' yields 'notneeded'; 'error' and
    'reconfigure' yield 'error' on an existing remote directory;
    'replace' removes and recreates it.
    """
    # be safe across datasets
    res_kwargs = res_kwargs.copy()

    # parse target URL
    try:
        ssh_host, base_path = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(
            status='error',
            message=str(e),
            **res_kwargs
        )
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config
    )['giturl']
    # go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure whether we are supposed to skip this very dataset
    if existing == 'skip' and (
            name in ds_siblings or (
                ria_remote_name and ria_remote_name in ds_siblings)):
        yield get_status_dict(
            status='notneeded',
            message="Skipped on existing sibling",
            **res_kwargs
        )
        # if we skip here, nothing else can change that decision further
        # down
        return

    # we might learn that some processing (remote repo creation is
    # not desired)
    skip = False

    lgr.info("create sibling{} '{}'{} ...".format(
        's' if ria_remote_name else '',
        name,
        " and '{}'".format(ria_remote_name) if ria_remote_name else '',
    ))
    if ssh_host:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(
            ssh_host,
            use_remote_annex_bundle=False)
        ssh.open()

    # determine layout locations
    if ria_remote:
        lgr.debug('init special remote {}'.format(ria_remote_name))
        ria_remote_options = ['type=external',
                              'externaltype=ria',
                              'encryption=none',
                              'autoenable=true',
                              'url={}'.format(url)]
        try:
            ds.repo.init_remote(
                ria_remote_name,
                options=ria_remote_options)
        except CommandError as e:
            if existing in ['replace', 'reconfigure'] \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.",
                    ria_remote_name)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                cmd = [
                    'git',
                    'annex',
                    'enableremote',
                    ria_remote_name] + ria_remote_options
                # BUG FIX: `cwd` must be a plain filesystem path;
                # shell-quoting it (as before) yields a non-existent
                # directory whenever the path needs quoting
                subprocess.run(cmd, cwd=ds.repo.path)
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s"
                    % (e.stdout, e.stderr),
                    **res_kwargs
                )
                return

        # 1. create remote object store:
        # Note: All it actually takes is to trigger the special
        # remote's `prepare` method once.
        # ATM trying to achieve that by invoking a minimal fsck.
        # TODO: - It's probably faster to actually talk to the special
        #         remote (i.e. pretending to be annex and use
        #         the protocol to send PREPARE)
        #       - Alternatively we can create the remote directory and
        #         ria version file directly, but this means
        #         code duplication that then needs to be kept in sync
        #         with ria-remote implementation.
        #       - this leads to the third option: Have that creation
        #         routine importable and callable from
        #         ria-remote package without the need to actually
        #         instantiate a RIARemote object
        lgr.debug("initializing object store")
        ds.repo.fsck(
            remote=ria_remote_name,
            fast=True,
            annex_options=['--exclude=*/*'])
    else:
        # with no special remote we currently need to create the
        # required directories
        # TODO: This should be cleaner once we have access to the
        #       special remote's RemoteIO classes without
        #       talking via annex
        if ssh_host:
            try:
                stdout, stderr = ssh('test -e {repo}'.format(
                    repo=quote_cmdlinearg(str(repo_path))))
                exists = True
            except CommandError:
                exists = False
            if exists:
                if existing == 'skip':
                    # 1. not rendered by default
                    # 2. message doesn't show up in ultimate result
                    #    record as shown by -f json_pp
                    yield get_status_dict(
                        status='notneeded',
                        message="Skipped on existing remote "
                                "directory {}".format(repo_path),
                        **res_kwargs
                    )
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                                "exists.".format(repo_path),
                        **res_kwargs
                    )
                    return
                elif existing == 'replace':
                    ssh('chmod u+w -R {}'.format(
                        quote_cmdlinearg(str(repo_path))))
                    ssh('rm -rf {}'.format(quote_cmdlinearg(str(repo_path))))
            if not skip:
                ssh('mkdir -p {}'.format(quote_cmdlinearg(str(repo_path))))
        else:
            if repo_path.exists():
                if existing == 'skip':
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                                "exists.".format(repo_path),
                        **res_kwargs
                    )
                    return
                elif existing == 'replace':
                    rmtree(repo_path)
            if not skip:
                repo_path.mkdir(parents=True)

    # Note, that this could have changed since last tested due to existing
    # remote dir
    if skip:
        return

    # 2. create a bare repository in-store:
    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'
    if group:
        # shell command string for the remote (ssh) invocation
        chgrp_cmd = "chgrp -R {} {}".format(
            quote_cmdlinearg(str(group)),
            quote_cmdlinearg(str(repo_path)))
    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(
                quote_cmdlinearg(shared)) if shared else ''
        ))
        if post_update_hook:
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))
        if group:
            # Either repository existed before or a new directory was
            # created for it, set its group to a desired one if was
            # provided with the same chgrp
            ssh(chgrp_cmd)
    else:
        GitRepo(repo_path, create=True, bare=True,
                shared=" --shared='{}'".format(
                    quote_cmdlinearg(shared)) if shared else None)
        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # BUG FIX: previously the whole shell string `chgrp_cmd` was
            # passed to subprocess.run without shell=True (fails with
            # FileNotFoundError), and `cwd` was shell-quoted (wrong path).
            # Run locally with an argv list and a plain cwd instead.
            subprocess.run(
                ['chgrp', '-R', str(group), str(repo_path)],
                cwd=ds.path)

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into default
    # annex/object tree instead of directory type tree with dirhash
    # lower. This in turn would be an issue, if we want to pack the
    # entire thing into an archive. Special remote will then not be
    # able to access content in the "wrong" place within the archive
    lgr.debug("set up git remote")
    # TODO:
    # - This sibings call results in "[WARNING] Failed to determine
    #   if datastore carries annex."
    #   (see https://github.com/datalad/datalad/issues/4028)
    #   => for now have annex-ignore configured before. Evtl. Allow
    #      configure/add to include that option
    # - additionally there's
    #   https://github.com/datalad/datalad/issues/3989,
    #   where datalad-siblings might hang forever
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing in ['replace', 'reconfigure']
    ds.config.set(
        "remote.{}.annex-ignore".format(name),
        value="true",
        where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url if ssh_host else str(repo_path),
        recursive=False,
        # Note, that this should be None if ria_remote was not set
        publish_depends=ria_remote_name,
        result_renderer=None,
        # Note, that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True)

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )
def check_replace_and_relative_sshpath(use_ssh, src_path, dst_path):
    """Integration check: create a sibling via relative ssh path or local
    path, then replace it (interactively confirmed) and verify publishing
    and the post-update hook (gh-1653, gh-1656, gh-1658).

    Parameters
    ----------
    use_ssh : bool
      If True, target the 'datalad-test' ssh host; otherwise use a plain
      local path sibling named 'local'.
    src_path, dst_path : str
      Local source dataset path and (local or remote) target path.
    """
    # We need to come up with the path relative to our current home directory
    # https://github.com/datalad/datalad/issues/1653
    # but because we override HOME the HOME on the remote end would be
    # different even though a datalad-test. So we need to query it
    if use_ssh:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection('datalad-test')
        remote_home, err = ssh('pwd')
        remote_home = remote_home.rstrip('\n')
        dst_relpath = os.path.relpath(dst_path, remote_home)
        url = 'datalad-test:%s' % dst_relpath
        sibname = 'datalad-test'
    else:
        url = dst_path
        sibname = 'local'
    ds = Dataset(src_path).create()
    create_tree(ds.path, {'sub.dat': 'lots of data'})
    ds.save('sub.dat')
    try:
        res = ds.create_sibling(url, ui=have_webui())
    except UnicodeDecodeError:
        if sys.version_info < (3, 7):
            # observed test failing on ubuntu 18.04 with python 3.6
            # (reproduced in conda env locally with python 3.6.10 when LANG=C
            # We will just skip this tricky one
            raise SkipTest("Known failure")
        raise
    assert_in_results(res, action="create_sibling", sibling_name=sibname)
    published = ds.publish(to=sibname, transfer_data='all')
    assert_result_count(published, 1, path=opj(ds.path, 'sub.dat'))
    if have_webui():
        # verify that hook runs and there is nothing in stderr
        # since it exits with 0 exit even if there was a problem
        out = Runner(cwd=opj(dst_path, '.git')).run(
            [_path_('hooks/post-update')],
            protocol=StdOutErrCapture)
        assert_false(out['stdout'])
        assert_false(out['stderr'])

    # Verify that we could replace and publish no problem
    # https://github.com/datalad/datalad/issues/1656
    # Strangely it spits outs IncompleteResultsError exception atm... so just
    # checking that it fails somehow
    res = ds.create_sibling(url, on_failure='ignore')
    assert_status('error', res)
    assert_in('already configured', res[0]['message'][0])
    # "Settings" such as UI do not persist, so we specify it again
    # for the test below depending on it
    with assert_raises(RuntimeError):
        # but we cannot replace in non-interactive mode
        ds.create_sibling(url, existing='replace', ui=have_webui())

    # We don't have context manager like @with_testsui, so
    @with_testsui(responses=["yes"])
    def interactive_create_sibling():
        # replacement must be confirmed interactively
        ds.create_sibling(url, existing='replace', ui=have_webui())
    interactive_create_sibling()

    published2 = ds.publish(to=sibname, transfer_data='all')
    assert_result_count(published2, 1, path=opj(ds.path, 'sub.dat'))

    # and one more test since in above test it would not puke ATM but just
    # not even try to copy since it assumes that file is already there
    create_tree(ds.path, {'sub2.dat': 'more data'})
    ds.save('sub2.dat')
    # we publish just git
    published3 = ds.publish(to=sibname, transfer_data='none')
    assert_result_count(published3, 0, path=opj(ds.path, 'sub2.dat'))

    if not have_webui():
        # the remaining checks exercise the webui metadata hook only
        return

    # now publish "with" data, which should also trigger the hook!
    # https://github.com/datalad/datalad/issues/1658
    from glob import glob
    from datalad.consts import WEB_META_LOG
    logs_prior = glob(_path_(dst_path, WEB_META_LOG, '*'))
    published4 = ds.publish(to=sibname, transfer_data='all')
    assert_result_count(published4, 1, path=opj(ds.path, 'sub2.dat'))
    logs_post = glob(_path_(dst_path, WEB_META_LOG, '*'))
    # exactly one new metadata log entry must have been produced
    eq_(len(logs_post), len(logs_prior) + 1)

    assert_postupdate_hooks(dst_path)
def __call__(sshurl,
             *,
             name=None,
             target_dir=None,
             target_url=None,
             target_pushurl=None,
             dataset=None,
             recursive=False,
             recursion_limit=None,
             existing='error',
             shared=None,
             group=None,
             ui=False,
             as_common_datasrc=None,
             publish_by_default=None,
             publish_depends=None,
             annex_wanted=None,
             annex_group=None,
             annex_groupwanted=None,
             inherit=False,
             since=None):
    """Create sibling repo(s) for a dataset (and optionally its subdatasets).

    Generator yielding result dicts (``action='create_sibling'``) per
    processed dataset.  ``sshurl`` may be an ssh URL/host:path or a plain
    local path; with ``inherit=True`` and no ``sshurl``, the URL is derived
    from the sibling configuration of the superdataset.

    Raises
    ------
    ValueError
        For invalid argument combinations or unsupported URLs.
    InsufficientArgumentsError
        If neither ``sshurl`` nor ``inherit`` is given.
    RuntimeError
        If ``ui=True`` but the `datalad_deprecated` extension is missing.
    MissingExternalDependency
        If git-annex is not available on the target machine.
    """
    if ui:
        # the webui has been moved to the deprecated extension
        try:
            from datalad_deprecated.sibling_webui \
                import upload_web_interface
        except Exception as e:
            # we could just test for ModuleNotFoundError (which should be
            # all that would happen with PY3.6+), but be a little more robust
            # and use the pattern from duecredit
            if type(e).__name__ not in ('ImportError', 'ModuleNotFoundError'):
                lgr.error(
                    "Failed to import datalad_deprecated.sibling_webui "
                    "due to %s", str(e))
            raise RuntimeError(
                "The DataLad web UI has been moved to an extension "
                "package. Please install the Python package "
                "`datalad_deprecated` to be able to deploy it.")
    # push uses '^' to annotate the previous pushed committish, and None for default
    # behavior. '' was/is (to be deprecated) used in `publish` and 'create-sibling'.
    # Alert user about the mistake
    if since == '':
        # deprecation was added prior 0.16.0
        import warnings
        warnings.warn("'since' should point to commitish or use '^'.",
                      DeprecationWarning)
        since = '^'
    #
    # nothing without a base dataset
    #
    ds = require_dataset(dataset, check_installed=True,
                         purpose='create sibling(s)')
    refds_path = ds.path
    #
    # all checks that are possible before we start parsing the dataset
    #
    if since and not recursive:
        raise ValueError("The use of 'since' requires 'recursive'")
    # possibly use sshurl to get the name in case if not specified
    if not sshurl:
        if not inherit:
            raise InsufficientArgumentsError(
                "needs at least an SSH URL, if no inherit option")
        if name is None:
            raise ValueError(
                "Neither SSH URL, nor the name of sibling to inherit from "
                "was specified")
        # It might well be that we already have this remote setup
        try:
            sshurl = CreateSibling._get_remote_url(ds, name)
        except Exception as exc:
            # best-effort lookup: fall through to superdataset inheritance
            ce = CapturedException(exc)
            lgr.debug('%s does not know about url for %s: %s', ds, name, ce)
    elif inherit:
        raise ValueError(
            "For now, for clarity not allowing specifying a custom sshurl "
            "while inheriting settings")
        # may be could be safely dropped -- still WiP

    if not sshurl:
        # TODO: may be more back up before _prep?
        super_ds = ds.get_superdataset()
        if not super_ds:
            raise ValueError(
                "Could not determine super dataset for %s to inherit URL"
                % ds)
        super_url = CreateSibling._get_remote_url(super_ds, name)
        # for now assuming hierarchical setup
        # (TODO: to be able to distinguish between the two, probably
        # needs storing datalad.*.target_dir to have %RELNAME in there)
        sshurl = slash_join(super_url, relpath(refds_path, super_ds.path))

    # check the login URL
    sibling_ri = RI(sshurl)
    ssh_sibling = is_ssh(sibling_ri)
    if not (ssh_sibling or isinstance(sibling_ri, PathRI)):
        raise ValueError(
            "Unsupported SSH URL or path: '{0}', "
            "use ssh://host/path, host:path or path syntax".format(sshurl))

    if not name:
        # default sibling name: remote hostname for ssh, 'local' otherwise
        name = sibling_ri.hostname if ssh_sibling else "local"
        lgr.info(
            "No sibling name given. Using %s'%s' as sibling name",
            "URL hostname " if ssh_sibling else "",
            name)
    if since == '^':
        # consider creating siblings only since the point of
        # the last update
        # XXX here we assume one to one mapping of names from local branches
        # to the remote
        active_branch = ds.repo.get_active_branch()
        since = '%s/%s' % (name, active_branch)

    to_process = []
    if recursive:
        #
        # parse the base dataset to find all subdatasets that need processing
        #
        cand_ds = [
            Dataset(r['path'])
            for r in diff_dataset(
                ds,
                fr=since,
                to='HEAD',
                # w/o False we might not follow into new subdatasets
                # which do not have that remote yet setup,
                # see https://github.com/datalad/datalad/issues/6596
                constant_refs=False,
                # save cycles, we are only looking for datasets
                annex=None,
                untracked='no',
                recursive=True,
                datasets_only=True,
            )
            # not installed subdatasets would be 'clean' so we would skip them
            if r.get('type') == 'dataset' and r.get('state', None) != 'clean'
        ]
        if not since:
            # not only subdatasets
            cand_ds = [ds] + cand_ds
    else:
        # only the current ds
        cand_ds = [ds]
    # check remotes setup()
    for d in cand_ds:
        d_repo = d.repo
        if d_repo is None:
            # dataset not installed locally -- nothing to configure
            continue
        checkds_remotes = d.repo.get_remotes()
        res = dict(
            action='create_sibling',
            path=d.path,
            type='dataset',
        )
        if publish_depends:
            # make sure dependencies are valid
            # TODO: inherit -- we might want to automagically create
            # those dependents as well???
            unknown_deps = set(
                ensure_list(publish_depends)).difference(checkds_remotes)
            if unknown_deps:
                yield dict(
                    res,
                    status='error',
                    message=('unknown sibling(s) specified as publication '
                             'dependency: %s', unknown_deps),
                )
                continue
        if name in checkds_remotes and existing in ('error', 'skip'):
            # the sibling exists already; honor --existing policy
            yield dict(
                res,
                sibling_name=name,
                status='error' if existing == 'error' else 'notneeded',
                message=(
                    "sibling '%s' already configured (specify alternative "
                    "name, or force reconfiguration via --existing",
                    name),
            )
            continue
        to_process.append(res)

    if not to_process:
        # we ruled out all possibilities
        # TODO wait for gh-1218 and make better return values
        lgr.info("No datasets qualify for sibling creation. "
                 "Consider different settings for --existing "
                 "or --since if this is unexpected")
        return

    if ssh_sibling:
        # request ssh connection:
        lgr.info("Connecting ...")
        shell = ssh_manager.get_connection(sshurl)
    else:
        # local target: run commands through a subprocess-runner adapter
        shell = _RunnerAdapter()
        sibling_ri.path = str(resolve_path(sibling_ri.path, dataset))
        if target_dir:
            target_dir = opj(sibling_ri.path, target_dir)

    if target_dir is None:
        if sibling_ri.path:
            target_dir = sibling_ri.path
        else:
            target_dir = '.'

    # TODO: centralize and generalize template symbol handling
    # a %RELNAME placeholder in target_dir disables mirroring the local
    # dataset hierarchy on the remote
    replicate_local_structure = "%RELNAME" not in target_dir

    if not shell.get_annex_version():
        raise MissingExternalDependency(
            'git-annex',
            msg="It's required on the {} machine to create a sibling"
                .format('remote' if ssh_sibling else 'local'))

    #
    # all checks done and we have a connection, now do something
    #

    # loop over all datasets, ordered from top to bottom to make test
    # below valid (existing directories would cause the machinery to halt)
    # But we need to run post-update hook in depth-first fashion, so
    # would only collect first and then run (see gh #790)
    yielded = set()
    remote_repos_to_run_hook_for = []
    # NOTE: ordering by '/'-count approximates top-down hierarchy order;
    # assumes paths use '/' separators -- TODO confirm on Windows
    for currentds_ap in \
            sorted(to_process, key=lambda x: x['path'].count('/')):
        current_ds = Dataset(currentds_ap['path'])

        path = _create_dataset_sibling(
            name,
            current_ds,
            refds_path,
            shell,
            replicate_local_structure,
            sibling_ri,
            target_dir,
            target_url,
            target_pushurl,
            existing,
            shared,
            group,
            publish_depends,
            publish_by_default,
            ui,
            as_common_datasrc,
            annex_wanted,
            annex_group,
            annex_groupwanted,
            inherit
        )
        currentds_ap["sibling_name"] = name
        if not path:
            # nothing new was created
            # TODO is 'notneeded' appropriate in this case?
            currentds_ap['status'] = 'notneeded'
            # TODO explain status in 'message'
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        remote_repos_to_run_hook_for.append((path, currentds_ap))

        # publish web-interface to root dataset on publication server
        if current_ds.path == refds_path and ui:
            # import is safe here: availability was verified at the top
            from datalad_deprecated.sibling_webui import upload_web_interface
            lgr.info("Uploading web interface to %s", path)
            try:
                upload_web_interface(path, shell, shared, ui)
            except CommandError as e:
                ce = CapturedException(e)
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to push web interface to the remote datalad repository (%s)",
                    ce)
                currentds_ap['exception'] = ce
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue

    # in reverse order would be depth first
    lgr.info("Running post-update hooks in all created siblings")
    # TODO: add progressbar
    for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
        # Trigger the hook
        lgr.debug("Running hook for %s (if exists and executable)", path)
        try:
            # run the hook only if present and executable; '|| true' keeps
            # a missing hook from being reported as a command failure
            shell(
                "cd {} "
                "&& ( [ -x hooks/post-update ] && hooks/post-update || true )"
                "".format(sh_quote(_path_(path, ".git"))))
        except CommandError as e:
            ce = CapturedException(e)
            currentds_ap['status'] = 'error'
            currentds_ap['message'] = (
                "failed to run post-update hook under remote path %s (%s)",
                path, ce)
            currentds_ap['exception'] = ce
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        if not currentds_ap['path'] in yielded:
            # if we were silent until now everything is just splendid
            currentds_ap['status'] = 'ok'
            yield currentds_ap