Exemple #1
0
    def __init__(self, host, buffer_size=DEFAULT_BUFFER_SIZE):
        """Open an SSH connection to `host` and start a remote shell on it

        Parameters
        ----------
        host : str
          SSH-accessible host(name) to perform remote IO operations
          on.
        buffer_size : int, optional
          Stored as `self.buffer_size`. A false value (e.g. `None` or 0)
          selects `DEFAULT_BUFFER_SIZE`.

        Raises
        ------
        ConnectionError
          If the remote shell closes its stdout before the login messages
          could be consumed completely.
        """

        from datalad.support.sshconnector import SSHManager
        # connection manager, kept as an attribute of this object
        self.sshmanager = SSHManager()
        # the connection to the remote; it is opened right away, since its
        # arguments are re-used below to start the remote shell
        self.ssh = self.sshmanager.get_connection(
            host,
            use_remote_annex_bundle=False,
        )
        self.ssh.open()
        # open a remote shell
        cmd = ['ssh'] + self.ssh._ssh_args + [self.ssh.sshri.as_str()]
        self.shell = subprocess.Popen(cmd, stderr=subprocess.DEVNULL, stdout=subprocess.PIPE, stdin=subprocess.PIPE)
        # swallow login message(s): have the shell echo a marker and discard
        # everything that arrives on stdout before it
        self.shell.stdin.write(b"echo RIA-REMOTE-LOGIN-END\n")
        self.shell.stdin.flush()
        while True:
            line = self.shell.stdout.readline()
            if line == b"RIA-REMOTE-LOGIN-END\n":
                break
            if not line:
                # readline() returns b"" at EOF only, i.e. the shell is gone.
                # Without this check the loop would never terminate.
                raise ConnectionError(
                    "Remote shell on {} closed unexpectedly during "
                    "login".format(host))
        # TODO: Same for stderr?

        # make sure default is used when None was passed, too.
        self.buffer_size = buffer_size if buffer_size else DEFAULT_BUFFER_SIZE
Exemple #2
0
def setup_storage_tree(dataset, ssh_host, repo_path):
    """Create the dataset's storage directory and make it a bare repository

    1. trigger creation of the dataset's directory at the remote end
    2. make it a bare repository

    Parameters
    ----------
    dataset
      Object whose `path` attribute is used as working directory for the
      `git annex fsck` call.
    ssh_host : str or None
      Host to run `git init --bare` on via SSH. If empty or None, `repo_path`
      is assumed to be on the local file system.
    repo_path
      Directory in which `git init --bare` is executed.

    Raises
    ------
    subprocess.CalledProcessError
      If the local `git init --bare` call exits non-zero.
    """

    # Note: All it actually takes is to trigger the special remote's `prepare` method once.
    # ATM trying to achieve that by invoking a minimal fsck.
    # TODO: - It's probably faster to actually talk to the special remote (i.e. pretending to be annex and use the
    #       protocol to send PREPARE)
    #       - Alternatively we can create the remote directory and ria version file directly, but this means code
    #       duplication that then needs to be kept in sync with ria-remote implementation.
    #       - this leads to the third option: Have that creation routine importable and callable from ria-remote package
    #       without the need to actually instantiate a RIARemote object
    print("Initializing INM7 storage for this dataset")
    cmd = [
        'git', 'annex', 'fsck', '--from=inm7-storage', '--fast',
        '--exclude=*/*'
    ]
    # Note: the exit status of this call is not checked
    subprocess.run(cmd, cwd=text_type(dataset.path))

    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no SSH host is specified
    if ssh_host:
        from shlex import quote

        sshmanager = SSHManager()
        ssh = sshmanager.get_connection(ssh_host,
                                        use_remote_annex_bundle=False)
        ssh.open()
        # The command is a shell command line at the remote end, hence the
        # path must be quoted to survive whitespace and shell metacharacters
        ssh('cd {} && git init --bare'.format(quote(text_type(repo_path))))
    else:
        cmd = ['git', 'init', '--bare']
        subprocess.run(cmd, cwd=text_type(repo_path), check=True)
Exemple #3
0
class SSHRemoteIO(IOBase):
    """IO operation if the object tree is SSH-accessible

    A single remote shell is started via `ssh` on instantiation. Commands are
    written to the shell's stdin and their output is read (blocking) from its
    stdout. Marker lines printed after a command signal the end of its output
    and whether or not it succeeded.

    It doesn't even think about a windows server.
    """

    # output markers to detect possible command failure as well as end of output
    # from a particular command:
    REMOTE_CMD_FAIL = "ora-remote: end - fail"
    REMOTE_CMD_OK = "ora-remote: end - ok"

    def __init__(self, host, buffer_size=DEFAULT_BUFFER_SIZE):
        """
        Parameters
        ----------
        host : str
          SSH-accessible host(name) to perform remote IO operations
          on.
        buffer_size : int, optional
          Maximum number of bytes to read at once when downloading. A false
          value (e.g. `None` or 0) selects `DEFAULT_BUFFER_SIZE`.

        Raises
        ------
        RIARemoteError
          If the remote shell closes its stdout before the login messages
          could be consumed completely.
        """

        from datalad.support.sshconnector import SSHManager
        # connection manager; kept around, since close() shuts it down
        self.sshmanager = SSHManager()
        # the connection to the remote; it is opened right away, since its
        # arguments are re-used below to start the remote shell
        self.ssh = self.sshmanager.get_connection(
            host,
            use_remote_annex_bundle=False,
        )
        self.ssh.open()
        # open a remote shell
        cmd = ['ssh'] + self.ssh._ssh_args + [self.ssh.sshri.as_str()]
        self.shell = subprocess.Popen(cmd,
                                      stderr=subprocess.DEVNULL,
                                      stdout=subprocess.PIPE,
                                      stdin=subprocess.PIPE)
        # swallow login message(s): have the shell echo a marker and discard
        # everything that arrives on stdout before it
        self.shell.stdin.write(b"echo RIA-REMOTE-LOGIN-END\n")
        self.shell.stdin.flush()
        while True:
            line = self.shell.stdout.readline()
            if line == b"RIA-REMOTE-LOGIN-END\n":
                break
            if not line:
                # readline() returns b"" at EOF only, i.e. the shell is gone.
                # Without this check the loop would never terminate.
                raise RIARemoteError(
                    "Remote shell on {} closed unexpectedly during "
                    "login".format(host))
        # TODO: Same for stderr?

        # make sure default is used when None was passed, too.
        self.buffer_size = buffer_size if buffer_size else DEFAULT_BUFFER_SIZE

    def close(self):
        """Exit the remote shell and close the connection manager

        The shell is asked to `exit` and given half a second to comply,
        before it is terminated. The connection manager is closed in any
        case.
        """
        try:
            # try exiting shell clean first
            try:
                self.shell.stdin.write(b"exit\n")
                self.shell.stdin.flush()
            except BrokenPipeError:
                # the shell is gone already, nothing left to ask for
                pass
            try:
                self.shell.wait(timeout=0.5)
            except subprocess.TimeoutExpired:
                # wait() signals a timeout by raising, not by returning None.
                # be more brutal if it doesn't work
                # TODO: Theoretically terminate() can raise if not successful.
                #       How to deal with that?
                self.shell.terminate()
        finally:
            self.sshmanager.close()

    def _append_end_markers(self, cmd):
        """Append end markers to remote command

        The returned command line prints `REMOTE_CMD_OK` on a line of its own
        if `cmd` succeeded and `REMOTE_CMD_FAIL` otherwise. It ends with a
        newline, i.e. it is ready to be written to the remote shell.
        """

        return cmd + " && printf '%s\\n' {} || printf '%s\\n' {}\n".format(
            sh_quote(self.REMOTE_CMD_OK), sh_quote(self.REMOTE_CMD_FAIL))

    def _get_download_size_from_key(self, key):
        """Get the size of an annex object file from its key

        Note, that this is not necessarily the size of the annexed file, but
        possibly only a chunk of it.

        Parameter
        ---------
        key: str
          annex key of the file

        Returns
        -------
        int or None
          size in bytes; None if the key doesn't provide a (valid) size field

        Raises
        ------
        RIARemoteError
          If the chunk fields of the key are incomplete or not usable.
        """
        # TODO: datalad's AnnexRepo.get_size_from_key() is not correct/not
        #       fitting. Incorporate the wisdom there, too.
        #       We prob. don't want to actually move this method there, since
        #       AnnexRepo would be quite an expensive import. Startup time for
        #       special remote matters.
        # TODO: this method can be more compact. we don't need particularly
        #       elaborated error distinction

        # see: https://git-annex.branchable.com/internals/key_format/
        key_parts = key.split('--')
        key_fields = key_parts[0].split('-')

        s = S = C = None

        for field in key_fields[1:]:  # note: first has to be backend -> ignore
            if field.startswith('s'):
                # size of the annexed file content:
                s = int(field[1:]) if field[1:].isdigit() else None
            elif field.startswith('S'):
                # we have a chunk and that's the chunksize:
                S = int(field[1:]) if field[1:].isdigit() else None
            elif field.startswith('C'):
                # we have a chunk, this is its number:
                C = int(field[1:]) if field[1:].isdigit() else None

        if s is None:
            return None
        elif S is None and C is None:
            return s
        elif S and C:
            # integer division: going through float (`int(s / S)`) loses
            # precision for very large sizes
            if C <= s // S:
                # a full chunk
                return S
            else:
                # the remainder after all full chunks
                return s % S
        else:
            raise RIARemoteError("invalid key: {}".format(key))

    def _run(self, cmd, no_output=True, check=False):
        """Run a command in the remote shell and return its output

        Parameters
        ----------
        cmd : str
          command line to execute; end markers are appended automatically
        no_output : bool
          if True, any output of the command (apart from the end marker) is
          considered an error
        check : bool
          if True, raise if the command reports failure

        Returns
        -------
        str
          everything the command wrote to stdout (end marker excluded)

        Raises
        ------
        RemoteCommandFailedError
          If `check` is True and the failure marker was received.
        RIARemoteError
          If `no_output` is True but the command produced output, or if the
          remote shell closed its stdout before an end marker was received.
        """

        # TODO: we might want to redirect stderr to stdout here (or have
        #       additional end marker in stderr) otherwise we can't empty stderr
        #       to be ready for next command. We also can't read stderr for
        #       better error messages (RemoteError) without making sure there's
        #       something to read in any case (it's blocking!).
        #       However, if we are sure stderr can only ever happen if we would
        #       raise RemoteError anyway, it might be okay.
        call = self._append_end_markers(cmd)
        self.shell.stdin.write(call.encode())
        self.shell.stdin.flush()

        lines = []
        while True:
            line = self.shell.stdout.readline().decode()
            if not line:
                # EOF: the shell is gone and no marker will ever arrive.
                # Without this check the loop would never terminate.
                raise RIARemoteError(
                    "Remote shell closed unexpectedly while running: "
                    "{}".format(cmd))
            lines.append(line)
            if line == self.REMOTE_CMD_OK + '\n':
                # end reading
                break
            elif line == self.REMOTE_CMD_FAIL + '\n':
                if check:
                    raise RemoteCommandFailedError(
                        "{cmd} failed: {msg}".format(cmd=cmd,
                                                     msg="".join(lines[:-1])))
                else:
                    break
        if no_output and len(lines) > 1:
            raise RIARemoteError("{}: {}".format(call, "".join(lines)))
        return "".join(lines[:-1])

    def _receive_to_file(self, dst, size, progress_cb):
        """Write the next `size` bytes from the remote shell's stdout to `dst`

        `progress_cb` is called with the total number of bytes received so
        far after each chunk.

        Raises
        ------
        RIARemoteError
          If the remote shell closes its stdout before `size` bytes arrived.
        """
        with open(dst, 'wb') as target_file:
            bytes_received = 0
            while bytes_received < size:
                # TODO: some additional abortion criteria? check stderr in
                #       addition?
                # never ask for more than what is still expected, so nothing
                # beyond the requested content gets consumed
                c = self.shell.stdout.read1(
                    min(self.buffer_size, size - bytes_received))
                if not c:
                    # read1() returns b"" at EOF only; looping on would never
                    # terminate
                    raise RIARemoteError(
                        "Remote shell closed unexpectedly after {} of {} "
                        "bytes".format(bytes_received, size))
                bytes_received += len(c)
                target_file.write(c)
                progress_cb(bytes_received)

    def mkdir(self, path):
        """Create directory `path` at the remote end, including parents"""
        self._run('mkdir -p {}'.format(sh_quote(str(path))))

    def put(self, src, dst, progress_cb):
        """Upload `src` to `dst` via the SSH connection

        `progress_cb` is accepted, but not used.
        """
        self.ssh.put(str(src), str(dst))

    def get(self, src, dst, progress_cb):
        """Download the annex object `src` to the local path `dst`

        The expected size is derived from the annex key, which is taken to be
        the basename of `src`. If the key doesn't tell the size, the file is
        copied via `self.ssh.get()` instead and `progress_cb` is not called.

        Raises
        ------
        RIARemoteError
          If `src` doesn't exist or the remote shell closes prematurely.
        RemoteError
          If no size can be derived from an invalid key.
        """

        # Note, that as we are in blocking mode, we can't easily fail on the
        # actual get (that is 'cat').
        # Therefore check beforehand.
        if not self.exists(src):
            raise RIARemoteError("annex object {src} does not exist."
                                 "".format(src=src))

        # TODO: see get_from_archive()

        # Determine the size before talking to the shell: if we have to fall
        # back to SCP, nothing must have been sent to the shell, because its
        # unread output would be taken for the output of the next command.
        from os.path import basename
        key = basename(str(src))
        try:
            size = self._get_download_size_from_key(key)
        except RemoteError as e:
            raise RemoteError("src: {}".format(str(src)) + str(e)) from e

        if size is None:
            # rely on SCP for now
            self.ssh.get(str(src), str(dst))
            return

        # TODO: Currently we will hang forever if the file isn't readable and
        #       it's supposed size is bigger than whatever cat spits out on
        #       stdout. This is because we don't notice that cat has exited
        #       non-zero. We could have end marker on stderr instead, but then
        #       we need to empty stderr beforehand to not act upon output from
        #       earlier calls. This is a problem with blocking reading, since we
        #       need to make sure there's actually something to read in any
        #       case.
        cmd = 'cat {}'.format(sh_quote(str(src)))
        self.shell.stdin.write(cmd.encode())
        self.shell.stdin.write(b"\n")
        self.shell.stdin.flush()

        self._receive_to_file(dst, size, progress_cb)

    def rename(self, src, dst):
        """Move `src` to `dst` at the remote end"""
        self._run('mv {} {}'.format(sh_quote(str(src)), sh_quote(str(dst))))

    def remove(self, path):
        """Remove the file `path` at the remote end"""
        self._run('rm {}'.format(sh_quote(str(path))))

    def remove_dir(self, path):
        """Remove the directory `path` at the remote end using `rmdir`"""
        self._run('rmdir {}'.format(sh_quote(str(path))))

    def exists(self, path):
        """Return whether `path` exists at the remote end"""
        try:
            self._run('test -e {}'.format(sh_quote(str(path))), check=True)
            return True
        except RemoteCommandFailedError:
            return False

    def in_archive(self, archive_path, file_path):
        """Return whether `file_path` shows up in 7z's listing of the archive

        Returns False if the archive itself doesn't exist.
        """

        if not self.exists(archive_path):
            return False

        loc = str(file_path)
        # query 7z for the specific object location, keeps the output
        # lean, even for big archives
        cmd = '7z l {} {}'.format(sh_quote(str(archive_path)), sh_quote(loc))

        # Note: Currently relies on file_path not showing up in case of failure
        # including non-existent archive. If need be could be more sophisticated
        # and called with check=True + catch RemoteCommandFailedError
        out = self._run(cmd, no_output=False, check=False)

        return loc in out

    def get_from_archive(self, archive, src, dst, progress_cb):
        """Extract `src` from the remote `archive` into the local path `dst`

        The expected size is derived from the annex key, which is taken to be
        the basename of `src`.

        Raises
        ------
        RIARemoteError
          If the archive doesn't exist, the size can't be derived from the
          key, or the remote shell closes prematurely.
        """

        # Note, that as we are in blocking mode, we can't easily fail on the
        # actual get (that is 'cat'). Therefore check beforehand.
        if not self.exists(archive):
            raise RIARemoteError("archive {arc} does not exist."
                                 "".format(arc=archive))

        # TODO: - size needs double-check and some robustness
        #       - can we assume src to be a posixpath?

        # Determine the size before talking to the shell: without it we can't
        # tell when the content is complete and must not request it at all.
        from os.path import basename
        size = self._get_download_size_from_key(basename(str(src)))
        if size is None:
            raise RIARemoteError(
                "Cannot determine size of {src} in archive {arc} from its "
                "key".format(src=src, arc=archive))

        # TODO: We probably need to check exitcode on stderr (via marker). If
        #       archive or content is missing we will otherwise hang forever
        #       waiting for stdout to fill `size`.

        cmd = '7z x -so {} {}\n'.format(sh_quote(str(archive)),
                                        sh_quote(str(src)))
        self.shell.stdin.write(cmd.encode())
        self.shell.stdin.flush()

        self._receive_to_file(dst, size, progress_cb)

    def read_file(self, file_path):
        """Return the content of the remote file `file_path` as a string

        Raises
        ------
        RIARemoteError
          If the remote `cat` call fails.
        """

        cmd = "cat  {}".format(sh_quote(str(file_path)))
        try:
            out = self._run(cmd, no_output=False, check=True)
        except RemoteCommandFailedError:
            raise RIARemoteError("Could not read {}".format(str(file_path)))

        return out

    def write_file(self, file_path, content, mode='w'):
        """Write `content` to the remote file `file_path`

        Parameters
        ----------
        file_path
          path of the file at the remote end
        content : str
          what to write; a trailing newline is added if missing
        mode : {'w', 'a'}
          'w' overwrites the file, 'a' appends to it

        Raises
        ------
        ValueError
          If `mode` is neither 'w' nor 'a'.
        RIARemoteError
          If the remote command fails.
        """

        # translate the mode into the respective shell redirection
        if mode == 'w':
            mode = ">"
        elif mode == 'a':
            mode = ">>"
        else:
            raise ValueError("Unknown mode '{}'".format(mode))
        if not content.endswith('\n'):
            content += '\n'

        cmd = "printf '%s' {} {} {}".format(sh_quote(content), mode,
                                            sh_quote(str(file_path)))
        try:
            self._run(cmd, check=True)
        except RemoteCommandFailedError:
            raise RIARemoteError("Could not write to {}".format(
                str(file_path)))

    def get_7z(self):
        """Return whether calling `7z` at the remote end succeeds"""
        # TODO: To not rely on availability in PATH we might want to use `which`
        #       (`where` on windows) and get the actual path to 7z to re-use in
        #       in_archive() and get().
        #       Note: `command -v XXX` or `type` might be cross-platform
        #       solution!
        #       However, for availability probing only, it would be sufficient
        #       to just call 7z and see whether it returns zero.

        try:
            self._run("7z", check=True, no_output=False)
            return True
        except RemoteCommandFailedError:
            return False