def test_create_push_url(detection_path=None, ds_path=None, store_path=None):
    """Check that `create_sibling_ria` honors a separate `push_url`.

    The store is addressed via a local file URL for reading and via an SSH
    URL for pushing.  `SSHManager.get_connection` is wrapped so that every
    call touches `detection_path`, which serves as evidence that the SSH
    code path was taken.
    """
    store_path = Path(store_path)
    ds_path = Path(ds_path)
    detection_path = Path(detection_path)

    ds = Dataset(ds_path).create(force=True)
    ds.save()

    # instrument SSHManager.get_connection to signal that it was used:
    from datalad.support.sshconnector import SSHManager

    def _touch_on_call(func, marker):
        """Return `func` wrapped such that `marker` is touched on each call"""
        @wraps(func)
        def _touching(*args, **kwargs):
            marker.touch()
            return func(*args, **kwargs)

        return _touching

    url = f"ria+{store_path.as_uri()}"
    push_url = f"ria+ssh://datalad-test{store_path.as_posix()}"
    assert not detection_path.exists()

    instrumented = _touch_on_call(SSHManager.get_connection, detection_path)
    with patch('datalad.support.sshconnector.SSHManager.get_connection',
               new=instrumented):

        ds.create_sibling_ria(
            url, "datastore", push_url=push_url, new_store_ok=True)
        # the SSH manager was used despite `url` being a file URL, hence
        # the push-url (ria+ssh) must have been employed:
        assert detection_path.exists()

        # special remote carries both URLs in its config:
        special_remotes = ds.repo.get_special_remotes()
        storage_uuid = ds.siblings(name='datastore-storage')[0]['annex-uuid']
        sr_cfg = special_remotes[storage_uuid]
        eq_(sr_cfg['url'], url)
        eq_(sr_cfg['push-url'], push_url)

        # the git remote reads from the local path and pushes via SSH:
        eq_(ds.config.get("remote.datastore.url"),
            (store_path / ds.id[:3] / ds.id[3:]).as_posix())
        ds_in_store = (store_path / ds.id[:3] / ds.id[3:]).as_posix()
        eq_(ds.config.get("remote.datastore.pushurl"),
            f"ssh://datalad-test{ds_in_store}")

        # a git-push must go through SSH again:
        detection_path.unlink()
        ds.push('.', to="datastore", data='nothing')
        assert detection_path.exists()

        # data push
        # The patch has no effect here, because the special remote runs in
        # a subprocess of git-annex, so SSH usage cannot be detected.
        # The ORA remote is tested elsewhere; if this call succeeds, things
        # should be fine as far as `create-sibling-ria` is concerned.
        ds.repo.call_annex(['copy', '.', '--to', 'datastore-storage'])
Beispiel #2
0
def test_bundle_invariance(path):
    """Run the same remote command with and without the annex bundle.

    For either setting of `use_remote_annex_bundle` the command executed
    through the connection must create the file `dummy` under `path`.
    """
    remote_url = 'ssh://localhost'
    ssh_manager = SSHManager()
    marker = Path(path) / 'dummy'
    # the redirection of this command's output creates the marker file
    create_marker = 'cd .>{}'.format(str(marker))
    for use_bundle in (True, False):
        assert_false(marker.exists())
        connection = ssh_manager.get_connection(
            remote_url, use_remote_annex_bundle=use_bundle)
        connection(create_marker)
        ok_(marker.exists())
        # reset for the next round
        marker.unlink()
Beispiel #3
0
def test_bundle_invariance(path):
    """Run the same remote command with and without the annex bundle.

    Regardless of the `use_remote_annex_bundle` flag, executing the command
    via the connection has to produce the file `dummy` below `path`.
    """
    remote_url = 'ssh://localhost'
    connections = SSHManager()
    target = Path(path) / 'dummy'
    for bundle_flag in (True, False):
        assert_false(target.exists())
        run_remote = connections.get_connection(
            remote_url, use_remote_annex_bundle=bundle_flag)
        # redirecting the output of this command creates the target file
        remote_cmd = 'cd .>{}'.format(text_type(target))
        run_remote(remote_cmd)
        ok_(target.exists())
        # clean up, so the next iteration starts from scratch
        target.unlink()
Beispiel #4
0
def compress_files(files, archive, path=None, overwrite=True):
    """Compress `files` into an `archive` file

    Parameters
    ----------
    files : list of str
    archive : str
    path : str
      Alternative directory under which compressor will be invoked, to e.g.
      take into account relative paths of files and/or archive
    overwrite : bool
      Whether to allow overwriting the target archive file if one already exists

    Raises
    ------
    ValueError
      If the target archive already exists and `overwrite` is False.
    """
    runner = Runner(cwd=path)
    apath = Path(archive)
    # The compressor runs with `path` as its working directory, so a relative
    # `archive` refers to a location underneath `path`, not underneath the
    # working directory of this process. Resolve it accordingly for the
    # existence check -- otherwise an unrelated file could get removed, and
    # the actual target could be missed. Joining with an absolute `apath`
    # yields `apath` itself.
    target = (Path(path) / apath) if path else apath
    if target.exists():
        if overwrite:
            target.unlink()
        else:
            raise ValueError(
                'Target archive {} already exists and overwrite is forbidden'.
                format(apath))
    suffixes = _normalize_fname_suffixes(apath.suffixes)
    if len(suffixes) > 1 and suffixes[-2] == '.tar':
        # two-stage pipeline: build a tar stream, and compress that stream
        # into the target archive
        cmd = '7z u .tar -so -- {} | 7z u -si -- {}'.format(
            join_cmdline(files),
            quote_cmdlinearg(str(apath)),
        )
    else:
        cmd = ['7z', 'u', str(apath), '--'] + files
    runner.run(cmd, protocol=KillOutput)
Beispiel #5
0
    def _store_new(self,
                   url=None,
                   authentication_type=None,
                   authenticator_class=None,
                   url_re=None,
                   name=None,
                   credential_name=None,
                   credential_type=None,
                   level='user'):
        """Stores a provider and credential config and reloads afterwards.

        Note
        ----
        non-interactive version of `enter_new`.
        For now non-public, pending further refactoring

        Parameters
        ----------
        url, authentication_type, authenticator_class, url_re, name, credential_name, credential_type:
          Values made available to `_CONFIG_TEMPLATE` for rendering the
          config file. All of them are required (must not be empty or None).
          `name` also determines the file name (`<name>.cfg`), and `url` is
          used to look up the stored provider after reloading.
        level: str
          Where to store the config. Choices: 'user' (default), 'ds', 'site'

        Returns
        -------
        Provider
          The stored `Provider` as reported by reload

        Raises
        ------
        ValueError
          If any argument is missing/empty, or `level` is not one of the
          supported choices.
        """

        # We don't ask user for confirmation, so for this non-interactive
        # routine require everything to be explicitly specified.
        if any(not a for a in [
                url, authentication_type, authenticator_class, url_re, name,
                credential_name, credential_type
        ]):
            raise ValueError("All arguments must be specified")

        if level not in ['user', 'ds', 'site']:
            raise ValueError("'level' must be one of 'user', 'ds', 'site'")

        providers_dir = Path(self._get_providers_dirs()[level])
        if not providers_dir.exists():
            providers_dir.mkdir(parents=True, exist_ok=True)
        filepath = providers_dir / f"{name}.cfg"
        # NOTE: the template is filled from *all* local names of this
        # function (arguments as well as `providers_dir` and `filepath`).
        # Renaming any local variable can therefore break the rendering.
        cfg = self._CONFIG_TEMPLATE.format(**locals())
        filepath.write_bytes(cfg.encode('utf-8'))
        # re-read the config files, so the new provider becomes known
        self.reload()
        return self.get_provider(url)
Beispiel #6
0
def test_asyncio_forked(temp):
    """Verify that `Runner.run` works in a process and in its fork.

    The process forks, and both parent and child run a trivial command.
    The child reports its outcome by writing either "I rule" or "I suck"
    to the file `temp`. The parent waits up to 5 seconds for that file,
    terminates the child, and checks the reported outcome.
    """
    # temp will be used to communicate from child either it succeeded or not
    temp = Path(temp)
    runner = Runner()
    import os
    try:
        pid = os.fork()
    except BaseException as exc:
        # .fork availability is "Unix", and there are cases where it is "not supported"
        # so we will just skip if no forking is possible
        raise SkipTest(f"Cannot fork: {exc}")
    # if does not fail (in original or in a fork) -- we are good
    if sys.version_info < (3, 8) and pid != 0:
        # for some reason it is crucial to sleep a little (but 0.001 is not enough)
        # in the master process with older pythons or it takes forever to make the child run
        sleep(0.1)
    try:
        runner.run([sys.executable, '--version'], protocol=StdOutCapture)
        if pid == 0:
            temp.write_text("I rule")
    # NOTE(review): this bare except also silences a failure of the run in
    # the parent process, where nothing is recorded -- confirm this is intended
    except:
        if pid == 0:
            temp.write_text("I suck")
    if pid != 0:
        # parent: look after the child
        t0 = time()
        try:
            # wait until the child has written its complete verdict (both
            # possible messages are 6 characters long).
            # NOTE(review): this is a busy-wait without any sleep
            while not temp.exists() or temp.stat().st_size < 6:
                if time() - t0 > 5:
                    raise AssertionError(
                        "Child process did not create a file we expected!")
        finally:
            # kill the child
            os.kill(pid, signal.SIGTERM)
        # see if it was a good one
        eq_(temp.read_text(), "I rule")
    else:
        # child: sleep long enough for the parent to kill this process,
        # before it could continue and do bad deeds
        sleep(10)
def test_no_storage(store1=None, store2=None, ds_path=None):
    """Create RIA siblings without a storage sibling and push to one.

    With `storage_sibling=False` only the plain git remote is expected to be
    set up, no `annex/objects` directory should appear in the store, and a
    push must succeed without copying any annexed content.
    """
    store1_url, store2_url = (
        'ria+' + get_local_file_url(store) for store in (store1, store2))

    def _sibling_names():
        """Names of all siblings currently known to the dataset"""
        return {
            sibling['name']
            for sibling in ds.siblings(result_renderer='disabled')
        }

    ds = Dataset(ds_path).create(force=True)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    first = ds.create_sibling_ria(
        store1_url, "datastore1", storage_sibling=False, new_store_ok=True)
    assert_result_count(first, 1, status='ok', action='create-sibling-ria')
    eq_({'datastore1', 'here'}, _sibling_names())

    # deprecated way of disabling storage still works
    # NOTE(review): this call uses the very same `storage_sibling=False` as
    # the one above -- confirm whether a deprecated parameter was meant here
    second = ds.create_sibling_ria(
        store2_url, "datastore2", storage_sibling=False, new_store_ok=True)
    assert_result_count(second, 1, status='ok', action='create-sibling-ria')
    eq_({'datastore2', 'datastore1', 'here'}, _sibling_names())

    # without a special remote to use it, no annex/objects directory should
    # have been created in either store
    for store in (store1, store2):
        annex_objects = Path(store) / ds.id[:3] / ds.id[3:] / 'annex' / 'objects'
        assert_false(annex_objects.exists())

    # smoke test that we can push to it
    pushed = ds.push(to='datastore1')
    assert_status('ok', pushed)
    # but nothing was copied, because there is no storage sibling
    assert_result_count(pushed, 0, action='copy')
Beispiel #8
0
def test_datalad_credential_helper(path=None):
    """Round-trip credentials through git's `datalad` credential helper.

    Credentials approved via `GitCredentialInterface` must end up in a
    user-level provider config file, be reported by `Providers` for any URL
    on the same host, be returned to git on a subsequent `fill()`, and
    survive a `reject()`.
    """

    ds = Dataset(path).create()

    # tell git to use git-credential-datalad
    ds.config.add('credential.helper', 'datalad', scope='local')
    ds.config.add('datalad.credentials.githelper.noninteractive',
                  'true',
                  scope='global')

    from datalad.downloaders.providers import Providers

    url1 = "https://datalad-test.org/some"
    url2 = "https://datalad-test.org/other"
    provider_name = "datalad-test.org"

    # `Providers` code is old and only considers a dataset root based on PWD
    # for config lookup. contextmanager below can be removed once the
    # provider/credential system is redesigned.
    with chpwd(ds.path):

        gitcred = GitCredentialInterface(url=url1, repo=ds)

        # There's nothing set up yet, helper should return empty
        gitcred.fill()
        eq_(gitcred['username'], '')
        eq_(gitcred['password'], '')

        # store new credentials
        # Note, that `Providers.enter_new()` currently uses user-level config
        # files for storage only. TODO: make that an option!
        # To not mess with existing ones, fail if it already exists:

        cfg_file = Path(Providers._get_providers_dirs()['user']) \
                   / f"{provider_name}.cfg"
        assert_false(cfg_file.exists())

        # Make sure we clean up
        from datalad.tests import _TEMP_PATHS_GENERATED
        _TEMP_PATHS_GENERATED.append(str(cfg_file))

        # Give credentials to git and ask it to store them.
        # These must be the very values all checks below expect to get back.
        gitcred = GitCredentialInterface(url=url1,
                                         username="dl-user",
                                         password="dl-pwd",
                                         repo=ds)
        gitcred.approve()

        assert_true(cfg_file.exists())
        providers = Providers.from_config_files()
        p1 = providers.get_provider(url=url1, only_nondefault=True)
        assert_is_instance(p1.credential, UserPassword)
        eq_(p1.credential.get('user'), 'dl-user')
        eq_(p1.credential.get('password'), 'dl-pwd')

        # default regex should be host only, so matching url2, too
        # (inspect the provider found for url2, not the one for url1 again)
        p2 = providers.get_provider(url=url2, only_nondefault=True)
        assert_is_instance(p2.credential, UserPassword)
        eq_(p2.credential.get('user'), 'dl-user')
        eq_(p2.credential.get('password'), 'dl-pwd')

        # git, too, should now find it for both URLs
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')

        gitcred = GitCredentialInterface(url=url2, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')

        # Rejection must not currently lead to deleting anything, since we would
        # delete too broadly.
        gitcred.reject()
        assert_true(cfg_file.exists())
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')
        # the credential itself is still available under the provider name
        dlcred = UserPassword(name=provider_name)
        eq_(dlcred.get('user'), 'dl-user')
        eq_(dlcred.get('password'), 'dl-pwd')
class SSHConnection(object):
    """Representation of a (shared) ssh connection.

    All `ssh` and `scp` calls issued by an instance carry the same
    `ControlPath` option, i.e. they refer to one SSH ControlMaster socket.
    """
    def __init__(self,
                 ctrl_path,
                 sshri,
                 identity_file=None,
                 use_remote_annex_bundle=True,
                 force_ip=False):
        """Create a connection handler

        The actual opening of the connection is performed on-demand.

        Parameters
        ----------
        ctrl_path: str
          path to SSH controlmaster
        sshri: SSHRI
          SSH resource identifier (contains all connection-relevant info),
          or another resource identifier that can be converted into an SSHRI.
        identity_file : str or None
          Value to pass to ssh's -i option.
        use_remote_annex_bundle : bool
          If set, look for a git-annex installation on the remote and
          prefer its binaries in the search path (i.e. prefer a bundled
          Git over a system package).
        force_ip : {False, 4, 6}
           Force the use of IPv4 or IPv6 addresses with -4 or -6.

        Raises
        ------
        ValueError
          If `sshri` is not recognized as an SSH resource identifier.
        """
        self._runner = None

        from datalad.support.network import SSHRI, is_ssh
        if not is_ssh(sshri):
            raise ValueError(
                "Non-SSH resource identifiers are not supported for SSH "
                "connections: {}".format(sshri))
        # only keep the properties that identify the connection itself
        self.sshri = SSHRI(
            **{
                k: v
                for k, v in sshri.fields.items()
                if k in ('username', 'hostname', 'port')
            })
        # on windows cmd args lists are always converted into a string using appropriate
        # quoting rules, on other platforms args lists are passed directly and we need
        # to take care of quoting ourselves
        ctrlpath_arg = "ControlPath={}".format(
            ctrl_path if on_windows else sh_quote(str(ctrl_path)))
        self._ssh_args = ["-o", ctrlpath_arg]
        self.ctrl_path = Path(ctrl_path)
        if self.sshri.port:
            self._ssh_args += ['-p', '{}'.format(self.sshri.port)]

        if force_ip:
            self._ssh_args.append("-{}".format(force_ip))
        self._identity_file = identity_file
        self._use_remote_annex_bundle = use_remote_annex_bundle

        # essential properties of the remote system
        self._remote_props = {}
        self._opened_by_us = False

    def __call__(self, cmd, options=None, stdin=None, log_output=True):
        """Executes a command on the remote.

        It is the caller's responsibility to properly quote commands
        for remote execution (e.g. filename with spaces or other special
        characters). Use the `sh_quote()` from the module for this purpose.

        Parameters
        ----------
        cmd: str
          command to run on the remote
        options : list of str, optional
          Additional options to pass to the `-o` flag of `ssh`. Note: Many
          (probably most) of the available configuration options should not be
          set here because they can critically change the properties of the
          connection. This exists to allow options like SendEnv to be set.
        stdin : optional
          Passed on as `stdin` to the runner executing the `ssh` call.
        log_output : bool, optional
          If true, the call is executed with the `StdOutErrCapture`
          protocol, otherwise with `NoCapture`.

        Returns
        -------
        tuple of str
          stdout, stderr of the command run.
        """

        # XXX: check for open socket once
        #      and provide roll back if fails to run and was not explicitly
        #      checked first
        # MIH: this would mean that we would have to distinguish failure
        #      of a payload command from failure of SSH itself. SSH however,
        #      only distinguishes success and failure of the entire operation
        #      Increase in fragility from introspection makes a potential
        #      performance benefit a questionable improvement.
        # make sure we have an open connection, will test if action is needed
        # by itself
        self.open()

        # locate annex and set the bundled vs. system Git machinery in motion
        if self._use_remote_annex_bundle:
            remote_annex_installdir = self.get_annex_installdir()
            if remote_annex_installdir:
                # make sure to use the bundled git version if any exists
                cmd = '{}; {}'.format(
                    'export "PATH={}:$PATH"'.format(remote_annex_installdir),
                    cmd)

        # build SSH call, feed remote command as a single last argument
        # whatever it contains will go to the remote machine for execution
        # we cannot perform any sort of escaping, because it will limit
        # what we can do on the remote, e.g. concatenate commands with '&&'
        ssh_cmd = ["ssh"] + self._ssh_args
        for opt in options or []:
            ssh_cmd.extend(["-o", opt])

        ssh_cmd += [self.sshri.as_str()] \
            + [cmd]

        # TODO: pass expect parameters from above?
        # Hard to explain to toplevel users ... So for now, just set True
        out = self.runner.run(
            ssh_cmd,
            protocol=StdOutErrCapture if log_output else NoCapture,
            stdin=stdin)
        return out['stdout'], out['stderr']

    @property
    def runner(self):
        """Runner used for all local command executions (created on demand)"""
        if self._runner is None:
            self._runner = WitlessRunner()
        return self._runner

    def is_open(self):
        """Check whether the ControlMaster of this connection is running.

        If the check command fails with exit code 255, `close()` is called
        before reporting the connection as not open.

        Returns
        -------
        bool

        Raises
        ------
        CommandError
          If the check command fails with an exit code other than 255.
        """
        if not self.ctrl_path.exists():
            lgr.log(5, "Not opening %s for checking since %s does not exist",
                    self, self.ctrl_path)
            return False
        # check whether controlmaster is still running:
        cmd = ["ssh", "-O", "check"] + self._ssh_args + [self.sshri.as_str()]
        lgr.debug("Checking %s by calling %s", self, cmd)
        try:
            # expect_stderr since ssh would announce to stderr
            # "Master is running" and that is normal, not worthy warning about
            # etc -- we are doing the check here for successful operation
            with tempfile.TemporaryFile() as tempf:
                self.runner.run(
                    cmd,
                    # do not leak output
                    protocol=StdOutErrCapture,
                    stdin=tempf)
            res = True
        except CommandError as e:
            if e.code != 255:
                # this is not a normal SSH error, whine ...
                raise
            # SSH died and left socket behind, or server closed connection
            self.close()
            res = False
        lgr.debug("Check of %s has %s", self, {
            True: 'succeeded',
            False: 'failed'
        }[res])
        return res

    def open(self):
        """Opens the connection.

        In other words: Creates the SSH ControlMaster to be used by this
        connection, if it is not there already.

        Returns
        -------
        bool
          True when SSH reports success opening the connection, False when
          a ControlMaster for an open connection already exists.

        Raises
        ------
        ConnectionOpenFailedError
          When starting the SSH ControlMaster process failed.
        """
        # the socket should vanish almost instantly when the connection closes
        # sending explicit 'check' commands to the control master is expensive
        # (needs tempfile to shield stdin, Runner overhead, etc...)
        # as we do not use any advanced features (forwarding, stopping the
        # master without exiting) it should be relatively safe to just perform
        # the much cheaper check of an existing control path
        if self.ctrl_path.exists():
            return False

        # set control options
        ctrl_options = [
            "-fN", "-o", "ControlMaster=auto", "-o", "ControlPersist=15m"
        ] + self._ssh_args
        if self._identity_file:
            ctrl_options.extend(["-i", self._identity_file])
        # create ssh control master command
        cmd = ["ssh"] + ctrl_options + [self.sshri.as_str()]

        # start control master:
        lgr.debug("Opening %s by calling %s", self, cmd)
        proc = Popen(cmd)
        # NOTE(review): no pipes are requested from Popen() above, hence
        # `stdout` and `stderr` are None here, and presumably `input` is
        # not delivered to the process either -- unclear why it is passed
        stdout, stderr = proc.communicate(
            input="\n")

        # wait till the command exits, connection is conclusively
        # open or not at this point
        exit_code = proc.wait()

        if exit_code != 0:
            raise ConnectionOpenFailedError(
                cmd,
                'Failed to open SSH connection (could not start ControlMaster process)',
                exit_code,
                stdout,
                stderr,
            )
        self._opened_by_us = True
        return True

    def close(self):
        """Closes the connection.

        Nothing is done if the connection was not opened by this instance.
        If stopping the ControlMaster fails, a left-behind control path is
        removed.

        Raises
        ------
        CommandError
          If the stop command fails with an exit code other than 255.
        """
        if not self._opened_by_us:
            lgr.debug("Not closing %s since was not opened by itself", self)
            return
        # stop controlmaster:
        cmd = ["ssh", "-O", "stop"] + self._ssh_args + [self.sshri.as_str()]
        lgr.debug("Closing %s by calling %s", self, cmd)
        try:
            self.runner.run(cmd, protocol=StdOutErrCapture)
        except CommandError as e:
            lgr.debug("Failed to run close command")
            if self.ctrl_path.exists():
                lgr.debug("Removing existing control path %s", self.ctrl_path)
                # socket need to go in any case
                self.ctrl_path.unlink()
            if e.code != 255:
                # not a "normal" SSH error
                raise

    def _get_scp_command_spec(self, recursive, preserve_attrs):
        """Internal helper for SCP interface methods

        Returns the leading part of an `scp` command (list of str): the
        executable followed by the connection options, and `-r`/`-p` when
        `recursive`/`preserve_attrs` are set.
        """
        # Convert ssh's port flag (-p) to scp's (-P).
        scp_options = ["-P" if x == "-p" else x for x in self._ssh_args]
        # append the flags for recursive operation and for preserving
        # attributes, if requested
        scp_options += ["-r"] if recursive else []
        scp_options += ["-p"] if preserve_attrs else []
        return ["scp"] + scp_options

    def put(self, source, destination, recursive=False, preserve_attrs=False):
        """Copies source file/folder to destination on the remote.

        Note: this method performs escaping of filenames to an extent that
        moderately weird ones should work (spaces, quotes, pipes, other
        characters with special shell meaning), but more complicated cases
        might require appropriate external preprocessing of filenames.

        Parameters
        ----------
        source : str or list
          file/folder path(s) to copy from on local
        destination : str
          file/folder path to copy to on remote
        recursive : bool
          flag to enable recursive copying of given sources
        preserve_attrs : bool
          preserve modification times, access times, and modes from the
          original file

        Returns
        -------
        tuple of str
          stdout, stderr of the copy operation.
        """
        # make sure we have an open connection, will test if action is needed
        # by itself
        self.open()
        scp_cmd = self._get_scp_command_spec(recursive, preserve_attrs)
        # add source filepath(s) to scp command
        scp_cmd += ensure_list(source)
        # add destination path
        scp_cmd += [
            '%s:%s' % (
                self.sshri.hostname,
                _quote_filename_for_scp(destination),
            )
        ]
        out = self.runner.run(scp_cmd, protocol=StdOutErrCapture)
        return out['stdout'], out['stderr']

    def get(self, source, destination, recursive=False, preserve_attrs=False):
        """Copies source file/folder from remote to a local destination.

        Note: this method performs escaping of filenames to an extent that
        moderately weird ones should work (spaces, quotes, pipes, other
        characters with special shell meaning), but more complicated cases
        might require appropriate external preprocessing of filenames.

        Parameters
        ----------
        source : str or list
          file/folder path(s) to copy from the remote host
        destination : str
          file/folder path to copy to on the local host
        recursive : bool
          flag to enable recursive copying of given sources
        preserve_attrs : bool
          preserve modification times, access times, and modes from the
          original file

        Returns
        -------
        tuple of str
          stdout, stderr of the copy operation.
        """
        # make sure we have an open connection, will test if action is needed
        # by itself
        self.open()
        scp_cmd = self._get_scp_command_spec(recursive, preserve_attrs)
        # add source filepath(s) to scp command, prefixed with the remote host
        scp_cmd += [
            "%s:%s" % (self.sshri.hostname, _quote_filename_for_scp(s))
            for s in ensure_list(source)
        ]
        # add destination path
        scp_cmd += [destination]
        out = self.runner.run(scp_cmd, protocol=StdOutErrCapture)
        return out['stdout'], out['stderr']

    def get_annex_installdir(self):
        """Return the directory of the remote `git-annex-shell`, if any.

        The result is determined once and cached afterwards.

        Returns
        -------
        str or None
          None, if the remote command for locating the installation failed.
        """
        key = 'installdir:annex'
        if key in self._remote_props:
            return self._remote_props[key]
        annex_install_dir = None
        # already set here to avoid any sort of recursion until we know
        # more
        self._remote_props[key] = annex_install_dir
        try:
            with tempfile.TemporaryFile() as tempf:
                # TODO does not work on windows
                annex_install_dir = self(
                    # use sh -e to be able to fail at each stage of the process
                    "sh -e -c 'dirname $(readlink -f $(which git-annex-shell))'",
                    stdin=tempf)[0].strip()
        except CommandError as e:
            lgr.debug('Failed to locate remote git-annex installation: %s',
                      exc_str(e))
        self._remote_props[key] = annex_install_dir
        return annex_install_dir

    def get_annex_version(self):
        """Return the git-annex version reported by the remote.

        The result is determined once and cached afterwards.

        Returns
        -------
        str or None
          None, if the version could not be determined.
        """
        key = 'cmd:annex'
        if key in self._remote_props:
            return self._remote_props[key]
        try:
            # modern annex versions
            version = self('git annex version --raw')[0]
        except CommandError:
            # either no annex, or old version
            try:
                # fall back on method that could work with older installations
                out, err = self('git annex version')
                version = out.split('\n')[0].split(':')[1].strip()
            except (CommandError, IndexError) as e:
                # IndexError: output does not have the expected
                # '<label>: <version>' form -- treat like a failed command
                lgr.debug('Failed to determine remote git-annex version: %s',
                          exc_str(e))
                version = None
        self._remote_props[key] = version
        return version

    def get_git_version(self):
        """Return the Git version reported by the remote.

        The result is determined once and cached afterwards.

        Returns
        -------
        str or None
          None, if the version could not be determined.
        """
        key = 'cmd:git'
        if key in self._remote_props:
            return self._remote_props[key]
        git_version = None
        try:
            # the version is taken from the third word of the output
            git_version = self('git version')[0].split()[2]
        except (CommandError, IndexError) as e:
            # IndexError: output has fewer than three words -- treat like
            # a failed command
            lgr.debug('Failed to determine Git version: %s', exc_str(e))
        self._remote_props[key] = git_version
        return git_version
Beispiel #10
0
def _test_create_store(host, base_path, ds_path, clone_path):
    """Exercise `create_sibling_ria` against the store labeled "test-store".

    Covers non-recursive and recursive creation, the sibling setup in the
    superdataset and its subdatasets, the config written to the bare
    repository in the store, cloning from the store, and the `trust_level`
    parameter.
    """
    store_url = "ria+ssh://test-store:"
    with_storage = {'datastore', 'datastore-storage', 'here'}

    def _sibling_names(dataset):
        """Names of all siblings of `dataset`"""
        return {s['name'] for s in dataset.siblings(result_renderer=None)}

    ds = Dataset(ds_path).create(force=True)

    subds = ds.create('sub', force=True)
    subds2 = ds.create('sub2', force=True, annex=False)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    # don't specify special remote. By default should be git-remote + "-storage"
    res = ds.create_sibling_ria(store_url, "datastore")
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_(len(res), 1)

    # remotes exist, but only in super
    eq_(with_storage, _sibling_names(ds))
    eq_({'here'}, _sibling_names(subds))
    eq_({'here'}, _sibling_names(subds2))

    # TODO: post-update hook was enabled

    # check bare repo:
    git_config = Path(base_path) / ds.id[:3] / ds.id[3:] / 'config'
    assert git_config.exists()
    content = git_config.read_text()
    assert_in("[datalad \"ora-remote\"]", content)
    super_uuid = ds.config.get("remote.datastore-storage.annex-uuid")
    assert_in("uuid = {}".format(super_uuid), content)

    # implicit test of success by ria-installing from store:
    ds.publish(to="datastore", transfer_data='all')
    with chpwd(clone_path):
        if host:
            # note, we are not using the "test-store"-label here
            source = 'ria+ssh://{}{}#{}'.format(host, base_path, ds.id)
        else:
            # TODO: Whenever ria+file supports special remote config (label),
            # change here:
            source = 'ria+file://{}#{}'.format(base_path, ds.id)
        clone(source, path='test_install')
        installed_ds = Dataset(op.join(clone_path, 'test_install'))
        assert installed_ds.is_installed()
        assert_repo_status(installed_ds.repo)
        eq_(installed_ds.id, ds.id)
        assert_in(op.join('ds', 'file1.txt'),
                  installed_ds.repo.get_annexed_files())
        get_results = installed_ds.get(op.join('ds', 'file1.txt'))
        assert_result_count(
            get_results,
            1,
            status='ok',
            action='get',
            path=op.join(installed_ds.path, 'ds', 'file1.txt'))

    # now, again but recursive.
    res = ds.create_sibling_ria(store_url, "datastore",
                                recursive=True, existing='reconfigure')
    eq_(len(res), 3)
    # exactly one successful result per dataset
    for dataset in (ds, subds, subds2):
        assert_result_count(res, 1, path=str(dataset.pathobj),
                            status='ok', action="create-sibling-ria")

    # remotes now exist in super and sub
    eq_(with_storage, _sibling_names(ds))
    eq_(with_storage, _sibling_names(subds))
    # but no special remote in plain git subdataset:
    eq_({'datastore', 'here'}, _sibling_names(subds2))

    # for testing trust_level parameter, redo for each label:
    for trust in ['trust', 'semitrust', 'untrust']:
        ds.create_sibling_ria(store_url,
                              "datastore",
                              existing='reconfigure',
                              trust_level=trust)
        info = ds.repo.repo_info()
        descriptions = [
            repo_record['description']
            for repo_record in info['{}ed repositories'.format(trust)]
        ]
        assert_in('[datastore-storage]', descriptions)
Beispiel #11
0
def restructure_ukb2bids(ds, subid, unrecognized_dir, base_path=None):
    """Perform the necessary renames to restructure to BIDS

    This is a generator. Every file reported by `ds.status()` is matched
    against the `ukb2bids` mapping and, on a match, renamed to its BIDS
    location inside the dataset. Source directories that end up empty are
    removed afterwards.

    Parameters
    ----------
    ds : Dataset
      DataLad dataset instance to restructure. The checked-out branch
      is taken as the subject of restructuring.
    subid : str
      Participant ID
    unrecognized_dir : str or None
      Name of a directory to put all unrecognized files into. The given
      value is used to populate the 'unrecogdir' substitution label
      in the `ukb2bids` mapping. If None, unrecognized files will not be
      moved. The directory will be placed inside the respective session
      directory.
    base_path : Path-like
      Base path to determine relative path names of any file for BIDS
      mapping

    Yields
    ------
    dict
      One result record per considered file. `status` is 'ok' (with the
      new location in `bids_path`) after a successful rename, or
      'impossible' when no mapping is available for the file.
    """
    # prep for yield
    res = dict(
        action='ukb_bidsify',
        type='file',
        logger=lgr,
        refds=ds.path,
    )

    # loop over all known files
    for fp in ds.status(
            path=base_path,
            annex=None,
            untracked='all',
            eval_subdataset_state='no',
            report_filetype='raw',
            return_type='generator',
            result_renderer=None):
        full_sourcepath = Path(fp['path'])
        if not full_sourcepath.exists():
            lgr.debug('Skip mapping %s, no longer exists (likely moved before)',
                      full_sourcepath)
            continue
        rp_parts = list(
            full_sourcepath.relative_to(base_path or ds.pathobj).parts)
        # instance number will serve as BIDS session
        try:
            session = rp_parts[0].split('_')[1]
        except IndexError:
            # ignore anything that doesn't look like a UKB data record
            continue
        # pull out instance number from the top-level component, because the matching
        # is uniform and agnostic of instances
        rp_parts[0] = '_'.join(rp_parts[0].split('_')[::2])
        fname = Path(rp_parts[-1])
        suffixes = ''.join(fname.suffixes)
        # Slice with an explicit end index: `name[:-0]` would be the empty
        # string for a file without any suffix and turn this candidate into
        # the parent directory.
        stem = fname.name[:len(fname.name) - len(suffixes)]
        # build a list of candidate mapping to try, and suffixes to reappend
        # upon a successful match
        cands = [
            # full thing
            (str(Path(*rp_parts)), ''),
            # without suffix(es)
            (str(Path(*rp_parts[:-1], stem)), suffixes),
        ]
        # all intermediate path components; the remainder is kept as a Path
        # so that it gets joined as a sub-path (with a separator) below,
        # rather than glued onto the mapped name like a file suffix
        cands += reversed([
            (str(Path(*rp_parts[:i + 1])),
             Path(*rp_parts[i + 1:]))
            for i in range(len(rp_parts) - 1)
        ])
        target_path = None
        for pattern, suffix in cands:
            target_path = ukb2bids.get(pattern, None)
            if target_path is not None:
                # append suffix
                if isinstance(suffix, Path):
                    target_path = opj(str(target_path), suffix)
                else:
                    target_path = target_path + suffix
                # apply substitutions
                target_path = target_path.format(
                    subj=subid,
                    session='ses-{}'.format(session),
                    unrecogdir='@@UNRECOG@@'
                    if unrecognized_dir is None
                    else Path('ses-{}'.format(session)) / unrecognized_dir,
                )
                break
        # the placeholder survives only when the mapping wants to use the
        # directory for unrecognized files, but none was given
        if target_path is None or '@@UNRECOG@@' in target_path:
            yield dict(
                res,
                path=fp['path'],
                status='impossible',
                message='No BIDS file name mapping available',
            )
            continue
        full_targetpath = ds.pathobj / target_path
        if full_targetpath.exists():
            lgr.info('Overwriting %s', str(target_path))
            # `target_path` is a plain str; the file to remove is the
            # absolute target location
            full_targetpath.unlink()
        else:
            # ensure target directory
            full_targetpath.parent.mkdir(parents=True, exist_ok=True)
        full_sourcepath.rename(full_targetpath)
        # delete empty source directories, walking upwards until the first
        # directory that cannot be removed
        for p in full_sourcepath.parents:
            try:
                p.rmdir()
            except OSError:
                lgr.debug(
                    "Not removing non-empty parent directory of %s",
                    fp['path'])
                break
        yield dict(
            res,
            path=fp['path'],
            bids_path=str(full_targetpath),
            status='ok',
        )
Beispiel #12
0
class MultiplexSSHConnection(BaseSSHConnection):
    """Handle for an SSH connection that is shared via an SSH ControlMaster.
    """
    def __init__(self, ctrl_path, sshri, **kwargs):
        """Set up the connection handler, without connecting yet

        The connection itself is only established on demand (see `open()`).

        Parameters
        ----------
        ctrl_path: str
          Location of the SSH ControlMaster socket
        sshri: SSHRI
          SSH resource identifier holding all connection-relevant
          information, or any other resource identifier that can be
          converted into an SSHRI.
        **kwargs
          Forwarded to BaseSSHConnection
        """
        super().__init__(sshri, **kwargs)

        # On Windows an argument list is always turned into a single string
        # with appropriate quoting applied. Everywhere else the list is
        # passed on verbatim, hence quoting is our own responsibility.
        if on_windows:
            ctrl_path_value = ctrl_path
        else:
            ctrl_path_value = sh_quote(str(ctrl_path))
        self._ssh_args += ["-o", "ControlPath={}".format(ctrl_path_value)]
        self._ssh_open_args += [
            "-fN",
            "-o", "ControlMaster=auto",
            "-o", "ControlPersist=15m",
        ]
        self.ctrl_path = Path(ctrl_path)
        self._opened_by_us = False
        # consumed by @fasteners.locked
        thread_lock = threading.Lock()
        process_lock = fasteners.process_lock.InterProcessLock(
            self.ctrl_path.with_suffix('.lck'))
        self._lock = [thread_lock, process_lock]

    def __call__(self, cmd, options=None, stdin=None, log_output=True):
        """Run `cmd` through the shared connection

        The connection is opened first, if that is still necessary. All
        arguments are handed over to `_exec_ssh()`, whose result is returned.
        """
        # XXX: check for open socket once
        #      and provide roll back if fails to run and was not explicitly
        #      checked first
        # MIH: this would require telling a failing payload command apart
        #      from a failure of SSH itself. SSH, however, reports merely
        #      success or failure of the operation as a whole. The added
        #      fragility of such introspection makes a potential performance
        #      gain a questionable improvement.
        # open() figures out by itself whether anything needs to be done
        self.open()

        base_cmd = [self.ssh_executable] + self._ssh_args
        return self._exec_ssh(
            base_cmd,
            cmd,
            options=options,
            stdin=stdin,
            log_output=log_output,
        )

    def _assemble_multiplex_ssh_cmd(self, additional_arguments):
        """Return the ssh command line for this connection as a list

        Layout: executable, `additional_arguments`, the connection's standard
        ssh arguments, and the string form of the SSHRI last.
        """
        head = [self.ssh_executable] + additional_arguments
        return head + self._ssh_args + [self.sshri.as_str()]

    def is_open(self):
        """Report whether the ControlMaster of this connection is running

        Returns False right away if the control path does not exist.
        Otherwise a 'check' control command is sent. When that fails with
        exit code 255, `close()` is called and False is returned; any other
        failure is re-raised as the CommandError it came with.
        """
        if not self.ctrl_path.exists():
            lgr.log(5, "Not opening %s for checking since %s does not exist",
                    self, self.ctrl_path)
            return False
        # ask the controlmaster whether it is still alive
        check_cmd = self._assemble_multiplex_ssh_cmd(["-O", "check"])

        lgr.debug("Checking %s by calling %s", self, check_cmd)
        try:
            # ssh announces "Master is running" on stderr. That is perfectly
            # normal here, hence all output is captured and not leaked.
            with tempfile.TemporaryFile() as stdin_shield:
                self.runner.run(
                    check_cmd,
                    protocol=StdOutErrCapture,
                    stdin=stdin_shield)
        except CommandError as e:
            if e.code != 255:
                # this is not a normal SSH error, whine ...
                raise e
            # SSH died and left socket behind, or server closed connection
            self.close()
            alive = False
        else:
            alive = True
        lgr.debug("Check of %s has %s", self,
                  'succeeded' if alive else 'failed')
        return alive

    @fasteners.locked
    def open(self):
        """Opens the connection.

        That is, the SSH ControlMaster to be used by this connection is
        started, unless its control path is present already.

        Returns
        -------
        bool
          True when SSH reports success opening the connection, False when
          the control path already exists (taken as the sign of an existing
          ControlMaster) and nothing was done.

        Raises
        ------
        ConnectionOpenFailedError
          When starting the SSH ControlMaster process failed.
        """
        # The socket is expected to vanish almost instantly once the
        # connection closes. Explicit 'check' commands to the control master
        # are expensive (tempfile to shield stdin, Runner overhead, ...).
        # Given that no advanced features are in use (forwarding, stopping
        # the master without exiting), merely testing for the control path
        # should be a reasonably safe and far cheaper alternative.
        if self.ctrl_path.exists():
            return False

        # command that brings up the control master
        open_cmd = self._assemble_multiplex_ssh_cmd(self._ssh_open_args)

        lgr.debug("Opening %s by calling %s", self, open_cmd)
        # Exempt from bandit's security checks: the content of the command
        # is under our/the user's control.
        master = Popen(open_cmd)  # nosec
        # NOTE: it is not understood why feeding a newline is needed here
        out, err = master.communicate(input="\n")

        # once the command has exited it is conclusively known whether the
        # connection is open or not
        returncode = master.wait()

        if returncode == 0:
            self._opened_by_us = True
            return True

        raise ConnectionOpenFailedError(
            open_cmd,
            'Failed to open SSH connection (could not start ControlMaster process)',
            returncode,
            out,
            err,
        )

    def close(self):
        """Stop the ControlMaster, if it was started by this instance

        Does nothing for a connection that was not opened by this instance.
        If the 'stop' control command fails, a still existing control path
        is removed. A failure with an exit code other than 255 is re-raised
        after that.
        """
        if not self._opened_by_us:
            lgr.debug("Not closing %s since was not opened by itself", self)
            return
        # tell the controlmaster to stop
        stop_cmd = self._assemble_multiplex_ssh_cmd(["-O", "stop"])
        lgr.debug("Closing %s by calling %s", self, stop_cmd)
        try:
            self.runner.run(stop_cmd, protocol=StdOutErrCapture)
        except CommandError as e:
            lgr.debug("Failed to run close command")
            socket_path = self.ctrl_path
            if socket_path.exists():
                lgr.debug("Removing existing control path %s", socket_path)
                # the socket has to go in any case
                socket_path.unlink()
            if e.code != 255:
                # not a "normal" SSH error
                raise e
Beispiel #13
0
def restructure_ukb2bids(ds, subid, unrecognized_dir, base_path=None,
                         session=None):
    """Perform the necessary renames to restructure to BIDS

    This is a generator. Every tracked file reported by `ds.status()` is
    matched against the `ukb2bids` mapping and, on a match, renamed to its
    BIDS location inside the dataset. Existing targets are never
    overwritten. Source directories that end up empty are removed
    afterwards.

    Parameters
    ----------
    ds : Dataset
      DataLad dataset instance to restructure. The checked-out branch
      is taken as the subject of restructuring.
    subid : str
      Participant ID
    unrecognized_dir : str or None
      Name of a directory to put all unrecognized files into. The given
      value is used to populate the 'unrecogdir' substitution label
      in the `ukb2bids` mapping. If None, unrecognized files will not be
      moved.
    base_path : Path-like
      Base path to determine relative path names of any file for BIDS
      mapping
    session : str
      Session label for BIDS mapping. If not given (or empty), the
      'session' substitution label is replaced by an empty string.

    Yields
    ------
    dict
      One result record per considered file. `status` is 'ok' (with the
      new location in `bids_path`) after a successful rename, 'impossible'
      when no mapping is available, or 'error' when the target path
      already exists.
    """
    # prep for yield
    res = dict(
        action='ukb_bidsify',
        type='file',
        logger=lgr,
        refds=ds.path,
    )

    # loop over all known files
    for fp in ds.status(
            path=base_path,
            annex=None,
            untracked='no',
            eval_subdataset_state='no',
            report_filetype='raw',
            return_type='generator',
            result_renderer=None):
        full_sourcepath = Path(fp['path'])
        if not full_sourcepath.exists():
            lgr.debug('Skip mapping %s, no longer exists (likely moved before)',
                      full_sourcepath)
            continue
        relpath = full_sourcepath.relative_to(base_path or ds.pathobj)
        rp_parts = relpath.parts
        if not rp_parts:
            # the reported path is the base path itself, nothing to map
            continue
        if rp_parts[0].startswith(('.git', '.datalad')):
            # ignore internal data structures
            continue
        suffixes = ''.join(relpath.suffixes)
        # Slice with an explicit end index: `name[:-0]` would be the empty
        # string for a file without any suffix and turn this candidate into
        # the parent directory.
        stem = relpath.name[:len(relpath.name) - len(suffixes)]
        # build a list of candidate mapping to try, and suffixes to reappend
        # upon a successful match
        cands = [
            # full thing
            (str(relpath), ''),
            # without suffix(es)
            (str(relpath.parent / stem), suffixes),
        ]
        # all intermediate path components; a Path remainder marks a
        # sub-path that must be joined with a separator (see below)
        cands += reversed([
            (str(Path(*rp_parts[:i + 1])),
             Path(*rp_parts[i + 1:]))
            for i in range(len(rp_parts) - 1)
        ])
        target_path = None
        for pattern, suffix in cands:
            target_path = ukb2bids.get(pattern, None)
            if target_path is not None:
                # append suffix
                if isinstance(suffix, Path):
                    target_path = opj(str(target_path), suffix)
                else:
                    target_path = target_path + suffix
                # apply substitutions
                target_path = target_path.format(
                    subj=subid,
                    session='ses-{}'.format(session) if session else '',
                    unrecogdir='@@UNRECOG@@'
                    if unrecognized_dir is None else unrecognized_dir,
                )
                break
        # the placeholder survives only when the mapping wants to use the
        # directory for unrecognized files, but none was given
        if target_path is None or '@@UNRECOG@@' in target_path:
            yield dict(
                res,
                path=fp['path'],
                status='impossible',
                message='No BIDS file name mapping available',
            )
            continue
        full_targetpath = ds.pathobj / target_path
        if full_targetpath.exists():
            # never overwrite, report the conflict instead
            yield dict(
                res,
                path=fp['path'],
                status='error',
                message=('Target path %s already exists (naming conflict?)',
                         target_path)
            )
            continue
        # ensure target directory
        full_targetpath.parent.mkdir(parents=True, exist_ok=True)
        full_sourcepath.rename(full_targetpath)
        # delete empty source directories, walking upwards until the first
        # directory that cannot be removed
        for p in full_sourcepath.parents:
            try:
                p.rmdir()
            except OSError:
                lgr.debug(
                    "Not removing non-empty parent directory of %s",
                    fp['path'])
                break
        yield dict(
            res,
            path=fp['path'],
            bids_path=str(full_targetpath),
            status='ok',
        )