def test_create_push_url(detection_path=None, ds_path=None, store_path=None):
    # Check that `create_sibling_ria(..., push_url=...)` configures a sibling
    # whose regular URL is the file-based store while pushing goes through
    # the ria+ssh push-url. SSH usage is detected via a marker file that is
    # touched on every call of `SSHManager.get_connection`.
    store_path = Path(store_path)
    ds_path = Path(ds_path)
    detection_path = Path(detection_path)

    ds = Dataset(ds_path).create(force=True)
    ds.save()

    # patch SSHConnection to signal it was used:
    from datalad.support.sshconnector import SSHManager

    def detector(func, marker):
        # wrap `func` so that each invocation leaves a trace at `marker`
        @wraps(func)
        def _signalling(*args, **kwargs):
            marker.touch()
            return func(*args, **kwargs)
        return _signalling

    url = "ria+{}".format(store_path.as_uri())
    push_url = "ria+ssh://datalad-test{}".format(store_path.as_posix())
    assert not detection_path.exists()

    traced_get_connection = detector(SSHManager.get_connection,
                                     detection_path)
    with patch('datalad.support.sshconnector.SSHManager.get_connection',
               new=traced_get_connection):

        ds.create_sibling_ria(url, "datastore", push_url=push_url,
                              new_store_ok=True)
        # used ssh_manager despite file-url hence used push-url (ria+ssh):
        assert detection_path.exists()

        # correct config in special remote:
        special_remotes = ds.repo.get_special_remotes()
        storage_uuid = ds.siblings(name='datastore-storage')[0]['annex-uuid']
        sr_cfg = special_remotes[storage_uuid]
        eq_(sr_cfg['url'], url)
        eq_(sr_cfg['push-url'], push_url)

        # git remote based on url (local path):
        ds_in_store = (store_path / ds.id[:3] / ds.id[3:]).as_posix()
        eq_(ds.config.get("remote.datastore.url"), ds_in_store)
        eq_(ds.config.get("remote.datastore.pushurl"),
            "ssh://datalad-test{}".format(ds_in_store))

        # git-push uses SSH:
        detection_path.unlink()
        ds.push('.', to="datastore", data='nothing')
        assert detection_path.exists()

        # data push
        # Note, that here the patching has no effect, since the special
        # remote is running in a subprocess of git-annex. Hence we can't
        # detect SSH usage really. However, ORA remote is tested elsewhere -
        # if it succeeds all should be good wrt `create-sibling-ria`.
        ds.repo.call_annex(['copy', '.', '--to', 'datastore-storage'])
def test_bundle_invariance(path):
    # A remote command must work the same way regardless of whether the
    # connection is asked to prefer a remote git-annex bundle or not.
    remote_url = 'ssh://localhost'
    ssh_manager = SSHManager()
    marker = Path(path) / 'dummy'
    for use_bundle in (True, False):
        assert_false(marker.exists())
        connection = ssh_manager.get_connection(
            remote_url, use_remote_annex_bundle=use_bundle)
        # `cd .>FILE` creates an empty FILE through shell redirection
        connection('cd .>{}'.format(str(marker)))
        ok_(marker.exists())
        # reset for the next round
        marker.unlink()
def test_bundle_invariance(path):
    # A remote command must work the same way regardless of whether the
    # connection is asked to prefer a remote git-annex bundle or not.
    remote_url = 'ssh://localhost'
    manager = SSHManager()
    testfile = Path(path) / 'dummy'
    for flag in (True, False):
        assert_false(testfile.exists())
        ssh = manager.get_connection(remote_url, use_remote_annex_bundle=flag)
        # `cd .>FILE` creates an empty FILE through shell redirection.
        # Plain `str` is used for the path conversion; no Python 2
        # compatibility shim is needed.
        ssh('cd .>{}'.format(str(testfile)))
        ok_(testfile.exists())
        # reset for the next round
        testfile.unlink()
def compress_files(files, archive, path=None, overwrite=True):
    """Compress `files` into an `archive` file

    Parameters
    ----------
    files : list of str
    archive : str
    path : str
      Alternative directory under which compressor will be invoked, to e.g.
      take into account relative paths of files and/or archive
    overwrite : bool
      Whether to allow overwriting the target archive file if one already
      exists

    Raises
    ------
    ValueError
      If the target archive already exists and `overwrite` is false.
    """
    apath = Path(archive)
    # The compressor runs with `path` as its working directory, hence a
    # relative `archive` refers to a location underneath `path`. Look for
    # (and remove) a pre-existing archive at that very location, not relative
    # to the CWD of this process. For an absolute `archive` the join below
    # yields `archive` itself.
    existing = Path(path) / apath if path is not None else apath
    if existing.exists():
        if not overwrite:
            raise ValueError(
                'Target archive {} already exists and overwrite is forbidden'.
                format(apath))
        existing.unlink()

    runner = Runner(cwd=path)
    suffixes = _normalize_fname_suffixes(apath.suffixes)
    if len(suffixes) > 1 and suffixes[-2] == '.tar':
        # two-stage: build a tar stream first, and pipe it into a second 7z
        # call that applies the compression
        cmd = '7z u .tar -so -- {} | 7z u -si -- {}'.format(
            join_cmdline(files),
            quote_cmdlinearg(str(apath)),
        )
    else:
        # list() to also support tuples or other sequences of file names
        cmd = ['7z', 'u', str(apath), '--'] + list(files)
    runner.run(cmd, protocol=KillOutput)
def _store_new(self, url=None, authentication_type=None,
               authenticator_class=None, url_re=None, name=None,
               credential_name=None, credential_type=None, level='user'):
    """Write a provider/credential config file, reload, and return the provider.

    Note
    ----
    This is the non-interactive counterpart of `enter_new`. It is kept
    non-public for now, pending further refactoring.

    Parameters
    ----------
    url, authentication_type, authenticator_class, url_re, name, \
credential_name, credential_type
      Values used to fill the configuration template. All of them are
      required (must be non-empty); `name` additionally determines the name
      of the written file (``<name>.cfg``), and `url` is used to look up the
      provider after reloading.
    level: str
      Where to store the config. Choices: 'user' (default), 'ds', 'site'

    Returns
    -------
    Provider
      The stored `Provider` as reported by reload

    Raises
    ------
    ValueError
      If any argument is missing, or `level` is not a supported choice.
    """
    # There is no user to confirm anything in this non-interactive routine,
    # so everything has to be given explicitly.
    required = (
        url, authentication_type, authenticator_class, url_re, name,
        credential_name, credential_type,
    )
    if not all(required):
        raise ValueError("All arguments must be specified")

    if level not in ('user', 'ds', 'site'):
        raise ValueError("'level' must be one of 'user', 'ds', 'site'")

    providers_dir = Path(self._get_providers_dirs()[level])
    if not providers_dir.exists():
        providers_dir.mkdir(parents=True, exist_ok=True)
    filepath = providers_dir / f"{name}.cfg"

    # NOTE: the template is filled from the local namespace of this function;
    # the names of the arguments and locals above are therefore part of the
    # contract with `_CONFIG_TEMPLATE`.
    cfg = self._CONFIG_TEMPLATE.format(**locals())
    filepath.write_bytes(cfg.encode('utf-8'))

    self.reload()
    return self.get_provider(url)
def test_asyncio_forked(temp):
    # Run a command through the runner in both a parent and a forked child
    # process.
    # temp will be used to communicate from child either it succeeded or not
    temp = Path(temp)
    runner = Runner()
    import os
    try:
        pid = os.fork()
    except Exception as exc:
        # .fork availability is "Unix", and there are cases where it is
        # "not supported" so we will just skip if no forking is possible.
        # Only regular exceptions are turned into a skip, so that e.g. a
        # KeyboardInterrupt still interrupts the test run.
        raise SkipTest(f"Cannot fork: {exc}")
    # if does not fail (in original or in a fork) -- we are good
    if sys.version_info < (3, 8) and pid != 0:
        # for some reason it is crucial to sleep a little (but 0.001 is not
        # enough) in the master process with older pythons or it takes
        # forever to make the child run
        sleep(0.1)
    try:
        runner.run([sys.executable, '--version'], protocol=StdOutCapture)
        if pid == 0:
            temp.write_text("I rule")
    except Exception:
        # the child reports its failure via the file; interrupts and
        # interpreter exit requests are deliberately not swallowed here
        if pid == 0:
            temp.write_text("I suck")
    if pid != 0:
        # parent: look after the child
        t0 = time()
        try:
            # both possible messages are 6 characters long
            while not temp.exists() or temp.stat().st_size < 6:
                if time() - t0 > 5:
                    raise AssertionError(
                        "Child process did not create a file we expected!")
                # do not spin at full speed while waiting for the child
                sleep(0.01)
        finally:
            # kill the child
            os.kill(pid, signal.SIGTERM)
        # see if it was a good one
        eq_(temp.read_text(), "I rule")
    else:
        # sleep enough so parent just kills me the kid before I continue
        # doing bad deeds
        sleep(10)
def test_no_storage(store1=None, store2=None, ds_path=None):
    # Create RIA siblings without a storage sibling in two stores, and make
    # sure that no annex object store is set up and no data gets copied.
    store_urls = [
        'ria+' + get_local_file_url(store) for store in (store1, store2)
    ]

    ds = Dataset(ds_path).create(force=True)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    # Siblings are added one store at a time; after each step the full set
    # of known siblings is verified.
    # NOTE(review): both calls disable the storage sibling in the same way
    # (`storage_sibling=False`); no deprecated spelling is exercised here.
    expected_names = {'here'}
    for sibling_name, store_url in zip(('datastore1', 'datastore2'),
                                       store_urls):
        res = ds.create_sibling_ria(store_url, sibling_name,
                                    storage_sibling=False, new_store_ok=True)
        assert_result_count(res, 1, status='ok', action='create-sibling-ria')
        expected_names.add(sibling_name)
        eq_(expected_names,
            {s['name'] for s in ds.siblings(result_renderer='disabled')})

    # no annex/object dir should be created when there is no special remote
    # to use it.
    for store in (store1, store2):
        objects_dir = \
            Path(store) / ds.id[:3] / ds.id[3:] / 'annex' / 'objects'
        assert_false(objects_dir.exists())

    # smoke test that we can push to it
    res = ds.push(to='datastore1')
    assert_status('ok', res)
    # but nothing was copied, because there is no storage sibling
    assert_result_count(res, 0, action='copy')
def test_datalad_credential_helper(path=None):
    # Round trip of credentials between git (via git-credential-datalad) and
    # DataLad's provider/credential system.
    ds = Dataset(path).create()

    # tell git to use git-credential-datalad
    ds.config.add('credential.helper', 'datalad', scope='local')
    ds.config.add('datalad.credentials.githelper.noninteractive', 'true',
                  scope='global')

    from datalad.downloaders.providers import Providers

    url1 = "https://datalad-test.org/some"
    url2 = "https://datalad-test.org/other"
    provider_name = "datalad-test.org"
    # test-only credentials; what is stored below must match what all the
    # subsequent assertions expect to read back
    user = 'dl-user'
    pwd = 'dl-pwd'

    # `Providers` code is old and only considers a dataset root based on PWD
    # for config lookup. contextmanager below can be removed once the
    # provider/credential system is redesigned.
    with chpwd(ds.path):

        gitcred = GitCredentialInterface(url=url1, repo=ds)

        # There's nothing set up yet, helper should return empty
        gitcred.fill()
        eq_(gitcred['username'], '')
        eq_(gitcred['password'], '')

        # store new credentials
        # Note, that `Providers.enter_new()` currently uses user-level config
        # files for storage only. TODO: make that an option!
        # To not mess with existing ones, fail if it already exists:
        cfg_file = Path(Providers._get_providers_dirs()['user']) \
            / f"{provider_name}.cfg"
        assert_false(cfg_file.exists())

        # Make sure we clean up
        from datalad.tests import _TEMP_PATHS_GENERATED
        _TEMP_PATHS_GENERATED.append(str(cfg_file))

        # Give credentials to git and ask it to store them:
        gitcred = GitCredentialInterface(url=url1, username=user,
                                         password=pwd, repo=ds)
        gitcred.approve()

        assert_true(cfg_file.exists())
        providers = Providers.from_config_files()
        p1 = providers.get_provider(url=url1, only_nondefault=True)
        assert_is_instance(p1.credential, UserPassword)
        eq_(p1.credential.get('user'), user)
        eq_(p1.credential.get('password'), pwd)

        # default regex should be host only, so matching url2, too
        p2 = providers.get_provider(url=url2, only_nondefault=True)
        # inspect the provider found for url2 (not p1 again)
        assert_is_instance(p2.credential, UserPassword)
        eq_(p2.credential.get('user'), user)
        eq_(p2.credential.get('password'), pwd)

        # git, too, should now find it for both URLs
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], user)
        eq_(gitcred['password'], pwd)

        gitcred = GitCredentialInterface(url=url2, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], user)
        eq_(gitcred['password'], pwd)

        # Rejection must not currently lead to deleting anything, since we
        # would delete too broadly.
        gitcred.reject()
        assert_true(cfg_file.exists())
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], user)
        eq_(gitcred['password'], pwd)

        dlcred = UserPassword(name=provider_name)
        eq_(dlcred.get('user'), user)
        eq_(dlcred.get('password'), pwd)
class SSHConnection(object):
    """Representation of a (shared) ssh connection.
    """

    def __init__(self, ctrl_path, sshri, identity_file=None,
                 use_remote_annex_bundle=True, force_ip=False):
        """Create a connection handler

        The actual opening of the connection is performed on-demand.

        Parameters
        ----------
        ctrl_path: str
          path to SSH controlmaster
        sshri: SSHRI
          SSH resource identifier (contains all connection-relevant info),
          or another resource identifier that can be converted into an SSHRI.
        identity_file : str or None
          Value to pass to ssh's -i option.
        use_remote_annex_bundle : bool
          If set, look for a git-annex installation on the remote and
          prefer its binaries in the search path (i.e. prefer a bundled
          Git over a system package).
        force_ip : {False, 4, 6}
          Force the use of IPv4 or IPv6 addresses with -4 or -6.

        Raises
        ------
        ValueError
          If `sshri` is not recognized as an SSH resource identifier.
        """
        self._runner = None

        from datalad.support.network import SSHRI, is_ssh
        if not is_ssh(sshri):
            raise ValueError(
                "Non-SSH resource identifiers are not supported for SSH "
                "connections: {}".format(sshri))
        # only keep the connection-relevant fields of the identifier
        self.sshri = SSHRI(**{k: v for k, v in sshri.fields.items()
                              if k in ('username', 'hostname', 'port')})
        # on windows cmd args lists are always converted into a string using
        # appropriate quoting rules, on other platforms args lists are passed
        # directly and we need to take care of quoting ourselves
        ctrlpath_arg = "ControlPath={}".format(
            ctrl_path if on_windows else sh_quote(str(ctrl_path)))
        self._ssh_args = ["-o", ctrlpath_arg]
        self.ctrl_path = Path(ctrl_path)
        if self.sshri.port:
            self._ssh_args += ['-p', '{}'.format(self.sshri.port)]

        if force_ip:
            self._ssh_args.append("-{}".format(force_ip))
        self._identity_file = identity_file
        self._use_remote_annex_bundle = use_remote_annex_bundle
        # essential properties of the remote system
        self._remote_props = {}
        self._opened_by_us = False

    def __call__(self, cmd, options=None, stdin=None, log_output=True):
        """Executes a command on the remote.

        It is the caller's responsibility to properly quote commands
        for remote execution (e.g. filename with spaces or other special
        characters). Use the `sh_quote()` from the module for this purpose.

        Parameters
        ----------
        cmd: str
          command to run on the remote
        options : list of str, optional
          Additional options to pass to the `-o` flag of `ssh`. Note: Many
          (probably most) of the available configuration options should not be
          set here because they can critically change the properties of the
          connection. This exists to allow options like SendEnv to be set.
        stdin : optional
          Passed on as `stdin` to the runner executing the ssh call.
        log_output : bool, optional
          If true, the call is run with the `StdOutErrCapture` protocol,
          otherwise with `NoCapture`.

        Returns
        -------
        tuple of str
          stdout, stderr of the command run.
        """

        # XXX: check for open socket once
        #      and provide roll back if fails to run and was not explicitly
        #      checked first
        # MIH: this would mean that we would have to distinguish failure
        #      of a payload command from failure of SSH itself. SSH however,
        #      only distinguishes success and failure of the entire operation
        #      Increase in fragility from introspection makes a potential
        #      performance benefit a questionable improvement.
        # make sure we have an open connection, will test if action is needed
        # by itself
        self.open()

        # locate annex and set the bundled vs. system Git machinery in motion
        if self._use_remote_annex_bundle:
            remote_annex_installdir = self.get_annex_installdir()
            if remote_annex_installdir:
                # make sure to use the bundled git version if any exists
                cmd = '{}; {}'.format(
                    'export "PATH={}:$PATH"'.format(remote_annex_installdir),
                    cmd)

        # build SSH call, feed remote command as a single last argument
        # whatever it contains will go to the remote machine for execution
        # we cannot perform any sort of escaping, because it will limit
        # what we can do on the remote, e.g. concatenate commands with '&&'
        ssh_cmd = ["ssh"] + self._ssh_args
        for opt in options or []:
            ssh_cmd.extend(["-o", opt])

        ssh_cmd += [self.sshri.as_str()] \
            + [cmd]

        # TODO: pass expect parameters from above?
        # Hard to explain to toplevel users ... So for now, just set True
        out = self.runner.run(
            ssh_cmd,
            protocol=StdOutErrCapture if log_output else NoCapture,
            stdin=stdin)
        return out['stdout'], out['stderr']

    @property
    def runner(self):
        """Runner used to execute ssh/scp calls (created on first access)"""
        if self._runner is None:
            self._runner = WitlessRunner()
        return self._runner

    def is_open(self):
        """Check whether the ControlMaster of this connection is running

        Returns
        -------
        bool
          False if the control path does not exist, or if the check command
          failed with exit code 255 (in which case `close()` is called too);
          True if the check command succeeded.
        """
        if not self.ctrl_path.exists():
            lgr.log(
                5,
                "Not opening %s for checking since %s does not exist",
                self, self.ctrl_path
            )
            return False
        # check whether controlmaster is still running:
        cmd = ["ssh", "-O", "check"] + self._ssh_args + [self.sshri.as_str()]
        lgr.debug("Checking %s by calling %s", self, cmd)
        try:
            # expect_stderr since ssh would announce to stderr
            # "Master is running" and that is normal, not worthy warning about
            # etc -- we are doing the check here for successful operation
            with tempfile.TemporaryFile() as tempf:
                self.runner.run(
                    cmd,
                    # do not leak output
                    protocol=StdOutErrCapture,
                    stdin=tempf)
            res = True
        except CommandError as e:
            if e.code != 255:
                # this is not a normal SSH error, whine ...
                raise
            # SSH died and left socket behind, or server closed connection
            self.close()
            res = False
        lgr.debug(
            "Check of %s has %s",
            self,
            {True: 'succeeded', False: 'failed'}[res])
        return res

    def open(self):
        """Opens the connection.

        In other words: Creates the SSH ControlMaster to be used by this
        connection, if it is not there already.

        Returns
        -------
        bool
          True when SSH reports success opening the connection, False when
          a ControlMaster for an open connection already exists.

        Raises
        ------
        ConnectionOpenFailedError
          When starting the SSH ControlMaster process failed.
        """
        # the socket should vanish almost instantly when the connection closes
        # sending explicit 'check' commands to the control master is expensive
        # (needs tempfile to shield stdin, Runner overhead, etc...)
        # as we do not use any advanced features (forwarding, stopping the
        # master without exiting) it should be relatively safe to just perform
        # the much cheaper check of an existing control path
        if self.ctrl_path.exists():
            return False

        # set control options
        ctrl_options = ["-fN",
                        "-o", "ControlMaster=auto",
                        "-o", "ControlPersist=15m"] + self._ssh_args
        if self._identity_file:
            ctrl_options.extend(["-i", self._identity_file])
        # create ssh control master command
        cmd = ["ssh"] + ctrl_options + [self.sshri.as_str()]

        # start control master:
        lgr.debug("Opening %s by calling %s", self, cmd)
        proc = Popen(cmd)
        stdout, stderr = proc.communicate(input="\n")  # why the f.. this is necessary?

        # wait till the command exits, connection is conclusively
        # open or not at this point
        exit_code = proc.wait()

        if exit_code != 0:
            raise ConnectionOpenFailedError(
                cmd,
                'Failed to open SSH connection (could not start ControlMaster process)',
                exit_code,
                stdout,
                stderr,
            )
        self._opened_by_us = True
        return True

    def close(self):
        """Closes the connection.

        Nothing is done if the connection was not opened by this instance.
        """
        if not self._opened_by_us:
            lgr.debug("Not closing %s since was not opened by itself", self)
            return
        # stop controlmaster:
        cmd = ["ssh", "-O", "stop"] + self._ssh_args + [self.sshri.as_str()]
        lgr.debug("Closing %s by calling %s", self, cmd)
        try:
            self.runner.run(cmd, protocol=StdOutErrCapture)
        except CommandError as e:
            lgr.debug("Failed to run close command")
            if self.ctrl_path.exists():
                lgr.debug("Removing existing control path %s", self.ctrl_path)
                # socket need to go in any case
                self.ctrl_path.unlink()
            if e.code != 255:
                # not a "normal" SSH error
                raise

    def _get_scp_command_spec(self, recursive, preserve_attrs):
        """Internal helper for SCP interface methods"""
        # Convert ssh's port flag (-p) to scp's (-P).
        scp_options = ["-P" if x == "-p" else x for x in self._ssh_args]
        # add recursive, preserve_attributes flag if recursive, preserve_attrs
        # set and create scp command
        scp_options += ["-r"] if recursive else []
        scp_options += ["-p"] if preserve_attrs else []
        return ["scp"] + scp_options

    def put(self, source, destination, recursive=False, preserve_attrs=False):
        """Copies source file/folder to destination on the remote.

        Note: this method performs escaping of filenames to an extent that
        moderately weird ones should work (spaces, quotes, pipes, other
        characters with special shell meaning), but more complicated cases
        might require appropriate external preprocessing of filenames.

        Parameters
        ----------
        source : str or list
          file/folder path(s) to copy from on local
        destination : str
          file/folder path to copy to on remote
        recursive : bool
          flag to enable recursive copying of given sources
        preserve_attrs : bool
          preserve modification times, access times, and modes from the
          original file

        Returns
        -------
        tuple of str
          stdout, stderr of the copy operation.
        """
        # make sure we have an open connection, will test if action is needed
        # by itself
        self.open()
        scp_cmd = self._get_scp_command_spec(recursive, preserve_attrs)
        # add source filepath(s) to scp command
        scp_cmd += ensure_list(source)
        # add destination path
        scp_cmd += ['%s:%s' % (
            self.sshri.hostname,
            _quote_filename_for_scp(destination),
        )]
        out = self.runner.run(scp_cmd, protocol=StdOutErrCapture)
        return out['stdout'], out['stderr']

    def get(self, source, destination, recursive=False, preserve_attrs=False):
        """Copies source file/folder from remote to a local destination.

        Note: this method performs escaping of filenames to an extent that
        moderately weird ones should work (spaces, quotes, pipes, other
        characters with special shell meaning), but more complicated cases
        might require appropriate external preprocessing of filenames.

        Parameters
        ----------
        source : str or list
          file/folder path(s) to copy from the remote host
        destination : str
          file/folder path to copy to on the local host
        recursive : bool
          flag to enable recursive copying of given sources
        preserve_attrs : bool
          preserve modification times, access times, and modes from the
          original file

        Returns
        -------
        tuple of str
          stdout, stderr of the copy operation.
        """
        # make sure we have an open connection, will test if action is needed
        # by itself
        self.open()
        scp_cmd = self._get_scp_command_spec(recursive, preserve_attrs)
        # add source filepath(s) to scp command, prefixed with the remote host
        scp_cmd += ["%s:%s" % (self.sshri.hostname, _quote_filename_for_scp(s))
                    for s in ensure_list(source)]
        # add destination path
        scp_cmd += [destination]
        out = self.runner.run(scp_cmd, protocol=StdOutErrCapture)
        return out['stdout'], out['stderr']

    def get_annex_installdir(self):
        """Return the directory of the remote git-annex-shell, if any

        The result (None, if it could not be determined) is cached.
        """
        key = 'installdir:annex'
        if key in self._remote_props:
            return self._remote_props[key]
        annex_install_dir = None
        # already set here to avoid any sort of recursion until we know
        # more
        self._remote_props[key] = annex_install_dir
        try:
            with tempfile.TemporaryFile() as tempf:
                # TODO does not work on windows
                annex_install_dir = self(
                    # use sh -e to be able to fail at each stage of the process
                    "sh -e -c 'dirname $(readlink -f $(which git-annex-shell))'"
                    , stdin=tempf
                )[0].strip()
        except CommandError as e:
            lgr.debug('Failed to locate remote git-annex installation: %s',
                      exc_str(e))
        self._remote_props[key] = annex_install_dir
        return annex_install_dir

    def get_annex_version(self):
        """Return the version of the remote git-annex, or None

        The result is cached. None is reported when the version could not be
        queried, or the output of the fallback query could not be parsed.
        """
        key = 'cmd:annex'
        if key in self._remote_props:
            return self._remote_props[key]
        try:
            # modern annex versions
            version = self('git annex version --raw')[0]
        except CommandError:
            # either no annex, or old version
            try:
                # fall back on method that could work with older installations
                out, err = self('git annex version')
            except CommandError as e:
                lgr.debug('Failed to determine remote git-annex version: %s',
                          exc_str(e))
                version = None
            else:
                # the version is taken from the second colon-separated field
                # of the first line; report "unknown" rather than crash
                # with an IndexError on any other kind of output
                fields = out.split('\n')[0].split(':')
                version = fields[1].strip() if len(fields) > 1 else None
        self._remote_props[key] = version
        return version

    def get_git_version(self):
        """Return the version of the remote Git, or None

        The result is cached. None is reported when the version could not be
        queried, or the command output could not be parsed.
        """
        key = 'cmd:git'
        if key in self._remote_props:
            return self._remote_props[key]
        git_version = None
        try:
            out = self('git version')[0]
        except CommandError as e:
            lgr.debug('Failed to determine Git version: %s',
                      exc_str(e))
        else:
            # the version is the third whitespace-separated field; any
            # other output shape leaves the version undetermined instead of
            # raising an IndexError
            fields = out.split()
            if len(fields) > 2:
                git_version = fields[2]
        self._remote_props[key] = git_version
        return git_version
def _test_create_store(host, base_path, ds_path, clone_path):
    # Create RIA siblings for a dataset hierarchy (non-recursively first,
    # then recursively), verify the store content by cloning from it, and
    # finally exercise the `trust_level` parameter.

    def sibling_names(dataset):
        # names of all siblings known to `dataset`
        return {s['name'] for s in dataset.siblings(result_renderer=None)}

    store_url = "ria+ssh://test-store:"

    ds = Dataset(ds_path).create(force=True)
    subds = ds.create('sub', force=True)
    subds2 = ds.create('sub2', force=True, annex=False)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    # don't specify special remote. By default should be git-remote +
    # "-storage"
    res = ds.create_sibling_ria(store_url, "datastore")
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_(len(res), 1)

    # remotes exist, but only in super
    eq_({'datastore', 'datastore-storage', 'here'}, sibling_names(ds))
    eq_({'here'}, sibling_names(subds))
    eq_({'here'}, sibling_names(subds2))

    # TODO: post-update hook was enabled

    # check bare repo:
    git_config = Path(base_path) / ds.id[:3] / ds.id[3:] / 'config'
    assert git_config.exists()
    content = git_config.read_text()
    assert_in("[datalad \"ora-remote\"]", content)
    super_uuid = ds.config.get(
        "remote.{}.annex-uuid".format('datastore-storage'))
    assert_in("uuid = {}".format(super_uuid), content)

    # implicit test of success by ria-installing from store:
    ds.publish(to="datastore", transfer_data='all')
    with chpwd(clone_path):
        if host:
            # note, we are not using the "test-store"-label here
            clone_url = 'ria+ssh://{}{}#{}'.format(host, base_path, ds.id)
        else:
            # TODO: Whenever ria+file supports special remote config (label),
            # change here:
            clone_url = 'ria+file://{}#{}'.format(base_path, ds.id)
        clone(clone_url, path='test_install')

        installed_ds = Dataset(op.join(clone_path, 'test_install'))
        assert installed_ds.is_installed()
        assert_repo_status(installed_ds.repo)
        eq_(installed_ds.id, ds.id)
        annexed_file = op.join('ds', 'file1.txt')
        assert_in(annexed_file, installed_ds.repo.get_annexed_files())
        assert_result_count(
            installed_ds.get(op.join('ds', 'file1.txt')),
            1,
            status='ok',
            action='get',
            path=op.join(installed_ds.path, 'ds', 'file1.txt'))

    # now, again but recursive.
    res = ds.create_sibling_ria(store_url, "datastore",
                                recursive=True, existing='reconfigure')
    eq_(len(res), 3)
    for dataset in (ds, subds, subds2):
        assert_result_count(res, 1, path=str(dataset.pathobj),
                            status='ok', action="create-sibling-ria")

    # remotes now exist in super and sub
    eq_({'datastore', 'datastore-storage', 'here'}, sibling_names(ds))
    eq_({'datastore', 'datastore-storage', 'here'}, sibling_names(subds))
    # but no special remote in plain git subdataset:
    eq_({'datastore', 'here'}, sibling_names(subds2))

    # for testing trust_level parameter, redo for each label:
    for trust in ['trust', 'semitrust', 'untrust']:
        ds.create_sibling_ria(store_url,
                              "datastore",
                              existing='reconfigure',
                              trust_level=trust)
        res = ds.repo.repo_info()
        descriptions = [
            r['description']
            for r in res['{}ed repositories'.format(trust)]
        ]
        assert_in('[datastore-storage]', descriptions)
def restructure_ukb2bids(ds, subid, unrecognized_dir, base_path=None):
    """Perform the necessary renames to restructure to BIDS

    This is a generator yielding one result record per processed file.

    Parameters
    ----------
    ds : Dataset
      DataLad dataset instance to restructure. The checked-out
      branch is taken as the subject of restructuring.
    subid : str
      Participant ID
    unrecognized_dir : str or None
      Name of a directory to put all unrecognized files into. The given
      value is used to populate the 'unrecogdir' substitution label in
      `ukb2bids_map`. If None, unrecognized files will not be moved.
      The directory will be placed inside the respective session directory.
    base_path : Path-like
      Base path to determine relative path names of any file for BIDS
      mapping

    Yields
    ------
    dict
      Result record with status 'ok' (file was renamed; the new location is
      reported as 'bids_path'), or 'impossible' (no mapping available).
    """
    # prep for yield
    res = dict(
        action='ukb_bidsify',
        type='file',
        logger=lgr,
        refds=ds.path,
    )

    # loop over all known files
    for fp in ds.status(
            path=base_path,
            annex=None,
            untracked='all',
            eval_subdataset_state='no',
            report_filetype='raw',
            return_type='generator',
            result_renderer=None):
        path = Path(fp['path'])
        if not path.exists():
            lgr.debug('Skip mapping %s, no longer exists (likely moved before)',
                      path)
            continue
        rp_parts = list(path.relative_to(base_path or ds.pathobj).parts)
        # instance number will serve as BIDS session
        try:
            session = rp_parts[0].split('_')[1]
        except IndexError:
            # ignore anything that doesn't look like a UKB data record
            continue
        # pull out instance number from the top-level component, because the
        # matching is uniform and agnostic of instances
        rp_parts[0] = '_'.join(rp_parts[0].split('_')[::2])
        fname = Path(rp_parts[-1])
        # split the file name into stem and (all) suffixes. A name without
        # any suffix must be kept intact (`name[:-0]` would be empty)
        suffixes = ''.join(fname.suffixes)
        stem = fname.name[:-len(suffixes)] if suffixes else fname.name
        # build a list of candidate mapping to try, and suffixes to reappend
        # upon a successful match
        cands = [
            # full thing
            (str(Path(*rp_parts)), ''),
            # without suffix(es)
            (str(Path(*rp_parts[:-1], stem)), suffixes),
        ]
        # all intermediate path components, longest match first. The
        # remainder is kept as a Path to flag that it must be appended as a
        # path component, rather than as a file name suffix
        cands += reversed([
            (str(Path(*rp_parts[:i + 1])), Path(*rp_parts[i + 1:]))
            for i in range(len(rp_parts) - 1)
        ])
        target_path = None
        for pattern, suffix in cands:
            target_path = ukb2bids.get(pattern, None)
            if target_path is None:
                continue
            # append suffix
            if isinstance(suffix, Path):
                target_path = opj(str(target_path), str(suffix))
            else:
                target_path = target_path + suffix
            # apply substitutions
            target_path = target_path.format(
                subj=subid,
                session='ses-{}'.format(session),
                unrecogdir='@@UNRECOG@@'
                if unrecognized_dir is None
                else Path('ses-{}'.format(session)) / unrecognized_dir,
            )
            break
        if target_path is None or '@@UNRECOG@@' in target_path:
            yield dict(
                res,
                path=fp['path'],
                status='impossible',
                message='No BIDS file name mapping available',
            )
            continue
        full_sourcepath = Path(fp['path'])
        full_targetpath = ds.pathobj / target_path
        if full_targetpath.exists():
            lgr.info('Overwriting %s', str(target_path))
            # remove the existing file at the target location (`target_path`
            # itself is a plain string)
            full_targetpath.unlink()
        else:
            # ensure target directory
            full_targetpath.parent.mkdir(parents=True, exist_ok=True)
        full_sourcepath.rename(full_targetpath)
        # delete empty source directories
        for p in full_sourcepath.parents:
            try:
                p.rmdir()
            except OSError:
                lgr.debug(
                    "Not removing non-empty parent directory of %s",
                    fp['path'])
                break
        yield dict(
            res,
            path=fp['path'],
            bids_path=str(full_targetpath),
            status='ok',
        )
class MultiplexSSHConnection(BaseSSHConnection):
    """SSH connection that is shared via an SSH ControlMaster socket.
    """

    def __init__(self, ctrl_path, sshri, **kwargs):
        """Set up a connection handler

        Nothing is opened here; the connection is established on demand.

        Parameters
        ----------
        ctrl_path: str
          path to SSH controlmaster
        sshri: SSHRI
          SSH resource identifier (contains all connection-relevant info),
          or another resource identifier that can be converted into an SSHRI.
        **kwargs
          Pass on to BaseSSHConnection
        """
        super().__init__(sshri, **kwargs)

        # on windows cmd args lists are always converted into a string using
        # appropriate quoting rules, on other platforms args lists are passed
        # directly and we need to take care of quoting ourselves
        if on_windows:
            ctrl_path_value = ctrl_path
        else:
            ctrl_path_value = sh_quote(str(ctrl_path))
        self._ssh_args += ["-o", "ControlPath={}".format(ctrl_path_value)]
        self._ssh_open_args += [
            "-fN",
            "-o", "ControlMaster=auto",
            "-o", "ControlPersist=15m",
        ]
        self.ctrl_path = Path(ctrl_path)
        self._opened_by_us = False
        # used by @fasteners.locked
        thread_lock = threading.Lock()
        process_lock = fasteners.process_lock.InterProcessLock(
            self.ctrl_path.with_suffix('.lck'))
        self._lock = [thread_lock, process_lock]

    def __call__(self, cmd, options=None, stdin=None, log_output=True):
        """Run `cmd` via `_exec_ssh()`, after making sure the connection is open
        """
        # XXX: check for open socket once
        #      and provide roll back if fails to run and was not explicitly
        #      checked first
        # MIH: this would mean that we would have to distinguish failure
        #      of a payload command from failure of SSH itself. SSH however,
        #      only distinguishes success and failure of the entire operation
        #      Increase in fragility from introspection makes a potential
        #      performance benefit a questionable improvement.
        # make sure we have an open connection, will test if action is needed
        # by itself
        self.open()
        base_cmd = [self.ssh_executable] + self._ssh_args
        return self._exec_ssh(
            base_cmd,
            cmd,
            options=options,
            stdin=stdin,
            log_output=log_output)

    def _assemble_multiplex_ssh_cmd(self, additional_arguments):
        """Build an ssh call: executable, given arguments, ssh args, target"""
        cmd = [self.ssh_executable]
        cmd = cmd + additional_arguments
        cmd = cmd + self._ssh_args
        return cmd + [self.sshri.as_str()]

    def is_open(self):
        """Check whether the ControlMaster of this connection is running

        Returns
        -------
        bool
          False if the control path does not exist, or if the check command
          failed with exit code 255 (in which case `close()` is called too);
          True if the check command succeeded.
        """
        if not self.ctrl_path.exists():
            lgr.log(
                5,
                "Not opening %s for checking since %s does not exist",
                self, self.ctrl_path
            )
            return False
        # check whether controlmaster is still running:
        check_cmd = self._assemble_multiplex_ssh_cmd(["-O", "check"])
        lgr.debug("Checking %s by calling %s", self, check_cmd)
        try:
            # expect_stderr since ssh would announce to stderr
            # "Master is running" and that is normal, not worthy warning about
            # etc -- we are doing the check here for successful operation
            with tempfile.TemporaryFile() as empty_stdin:
                self.runner.run(
                    check_cmd,
                    # do not leak output
                    protocol=StdOutErrCapture,
                    stdin=empty_stdin)
        except CommandError as exc:
            if exc.code != 255:
                # this is not a normal SSH error, whine ...
                raise exc
            # SSH died and left socket behind, or server closed connection
            self.close()
            alive = False
        else:
            alive = True
        lgr.debug(
            "Check of %s has %s",
            self,
            'succeeded' if alive else 'failed')
        return alive

    @fasteners.locked
    def open(self):
        """Opens the connection.

        In other words: Creates the SSH ControlMaster to be used by this
        connection, if it is not there already.

        Returns
        -------
        bool
          True when SSH reports success opening the connection, False when
          a ControlMaster for an open connection already exists.

        Raises
        ------
        ConnectionOpenFailedError
          When starting the SSH ControlMaster process failed.
        """
        # the socket should vanish almost instantly when the connection closes
        # sending explicit 'check' commands to the control master is expensive
        # (needs tempfile to shield stdin, Runner overhead, etc...)
        # as we do not use any advanced features (forwarding, stopping the
        # master without exiting) it should be relatively safe to just perform
        # the much cheaper check of an existing control path
        if self.ctrl_path.exists():
            return False

        # create ssh control master command
        master_cmd = self._assemble_multiplex_ssh_cmd(self._ssh_open_args)

        # start control master:
        lgr.debug("Opening %s by calling %s", self, master_cmd)
        # The following call is exempt from bandit's security checks because
        # we/the user control the content of 'cmd'.
        master = Popen(master_cmd)  # nosec
        stdout, stderr = master.communicate(input="\n")  # why the f.. this is necessary?

        # wait till the command exits, connection is conclusively
        # open or not at this point
        exit_code = master.wait()
        if exit_code != 0:
            raise ConnectionOpenFailedError(
                master_cmd,
                'Failed to open SSH connection (could not start ControlMaster process)',
                exit_code,
                stdout,
                stderr,
            )

        self._opened_by_us = True
        return True

    def close(self):
        """Stop the ControlMaster, if it was started by this instance"""
        if not self._opened_by_us:
            lgr.debug("Not closing %s since was not opened by itself", self)
            return
        # stop controlmaster:
        stop_cmd = self._assemble_multiplex_ssh_cmd(["-O", "stop"])
        lgr.debug("Closing %s by calling %s", self, stop_cmd)
        try:
            self.runner.run(stop_cmd, protocol=StdOutErrCapture)
        except CommandError as exc:
            lgr.debug("Failed to run close command")
            if self.ctrl_path.exists():
                lgr.debug("Removing existing control path %s", self.ctrl_path)
                # socket need to go in any case
                self.ctrl_path.unlink()
            if exc.code != 255:
                # not a "normal" SSH error
                raise exc
def restructure_ukb2bids(ds, subid, unrecognized_dir, base_path=None,
                         session=None):
    """Perform the necessary renames to restructure to BIDS

    This is a generator yielding one result record per processed file.

    Parameters
    ----------
    ds : Dataset
      DataLad dataset instance to restructure. The checked-out
      branch is taken as the subject of restructuring.
    subid : str
      Participant ID
    unrecognized_dir : str or None
      Name of a directory to put all unrecognized files into. The given
      value is used to populate the 'unrecogdir' substitution label in
      `ukb2bids_map`. If None, unrecognized files will not be moved.
    base_path : Path-like
      Base path to determine relative path names of any file for BIDS
      mapping
    session : str
      Session label for BIDS mapping

    Yields
    ------
    dict
      Result record with status 'ok' (file was renamed; the new location is
      reported as 'bids_path'), 'impossible' (no mapping available), or
      'error' (the target path already exists).
    """
    # prep for yield
    res = dict(
        action='ukb_bidsify',
        type='file',
        logger=lgr,
        refds=ds.path,
    )

    # loop over all known files
    for fp in ds.status(
            path=base_path,
            annex=None,
            untracked='no',
            eval_subdataset_state='no',
            report_filetype='raw',
            return_type='generator',
            result_renderer=None):
        path = Path(fp['path'])
        if not path.exists():
            lgr.debug('Skip mapping %s, no longer exists (likely moved before)',
                      path)
            continue
        relpath = path.relative_to(base_path or ds.pathobj)
        rp_parts = relpath.parts
        if not rp_parts or rp_parts[0].startswith(('.git', '.datalad')):
            # ignore internal data structures (and the base path itself,
            # which has no path components to map)
            continue
        # split the file name into stem and (all) suffixes. A name without
        # any suffix must be kept intact (`name[:-0]` would be empty)
        suffixes = ''.join(relpath.suffixes)
        stem = relpath.name[:-len(suffixes)] if suffixes else relpath.name
        # build a list of candidate mapping to try, and suffixes to reappend
        # upon a successful match
        cands = [
            # full thing
            (str(relpath), ''),
            # without suffix(es)
            (str(Path(relpath.parent, stem)), suffixes),
        ]
        # all intermediate path components, longest match first. The
        # remainder is a Path to flag that it must be appended as a path
        # component, rather than as a file name suffix
        cands += reversed([
            (str(Path(*rp_parts[:i + 1])), Path(*rp_parts[i + 1:]))
            for i in range(len(rp_parts) - 1)
        ])
        target_path = None
        for pattern, suffix in cands:
            target_path = ukb2bids.get(pattern, None)
            if target_path is None:
                continue
            # append suffix
            if isinstance(suffix, Path):
                target_path = opj(str(target_path), str(suffix))
            else:
                target_path = target_path + suffix
            # apply substitutions
            target_path = target_path.format(
                subj=subid,
                session='ses-{}'.format(session) if session else '',
                unrecogdir='@@UNRECOG@@'
                if unrecognized_dir is None else unrecognized_dir,
            )
            break
        if target_path is None or '@@UNRECOG@@' in target_path:
            yield dict(
                res,
                path=fp['path'],
                status='impossible',
                message='No BIDS file name mapping available',
            )
            continue
        full_sourcepath = Path(fp['path'])
        full_targetpath = ds.pathobj / target_path
        if full_targetpath.exists():
            # never overwrite, report the conflict instead
            yield dict(
                res,
                path=fp['path'],
                status='error',
                message=('Target path %s already exists (naming conflict?)',
                         target_path)
            )
            continue
        # ensure target directory
        full_targetpath.parent.mkdir(parents=True, exist_ok=True)
        full_sourcepath.rename(full_targetpath)
        # delete empty source directories
        for p in full_sourcepath.parents:
            try:
                p.rmdir()
            except OSError:
                lgr.debug(
                    "Not removing non-empty parent directory of %s",
                    fp['path'])
                break
        yield dict(
            res,
            path=fp['path'],
            bids_path=str(full_targetpath),
            status='ok',
        )