Example #1
def _check_and_update_remote_server_info(ds, remote):
    # if we managed to copy to an "http" url, we should try to trigger the
    # git update-server-info hook on the remote if an ssh annexurl was defined
    # for it. Apparently we do that already in create_sibling, but here
    # we need more checks and preparation
    remote_url = ds.repo.config.get('remote.%s.url' % remote, None)
    if remote_url:
        remote_url = RI(remote_url)
        if isinstance(remote_url, URL) \
                and remote_url.scheme in ('http', 'https'):
            remote_annexurl = ds.repo.config.get('remote.%s.annexurl' % remote,
                                                 None)
            if remote_annexurl:
                remote_annexurl_ri = RI(remote_annexurl)
                if is_ssh(remote_annexurl_ri):
                    ssh = ssh_manager.get_connection(remote_annexurl_ri)
                    ssh('git -C {} update-server-info'.format(
                        sh_quote(remote_annexurl_ri.path)))
                    return True
                else:
                    lgr.debug(
                        "annexurl %s is defined but is not an ssh URL; "
                        "not sure if we could/should do anything",
                        remote_annexurl
                    )
    return False
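
The pattern Example #1 builds on deserves a standalone illustration: ssh_manager.get_connection() returns a cached, reusable connection object that is called like a function and returns the remote command's (stdout, stderr). A minimal sketch, assuming a reachable host with key-based SSH authentication; the host name and remote path are hypothetical placeholders, and the standard library's shlex.quote stands in for the sh_quote helper used above:

import shlex
from datalad import ssh_manager

# obtain (or reuse) a connection to the remote host
ssh = ssh_manager.get_connection('example.com')  # hypothetical host
# run a command remotely; returns a (stdout, stderr) pair
out, err = ssh('git -C {} update-server-info'.format(shlex.quote('/srv/repo')))
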
Example #3
    def __call__(login, cmd, port=None, no_stdin=False):
        lgr.debug("sshrun invoked: %r %r %r %r", login, cmd, port, no_stdin)
        # Perspective workarounds for git-annex invocation, see
        # https://github.com/datalad/datalad/issues/1456#issuecomment-292641319

        if cmd.startswith("'") and cmd.endswith("'"):
            lgr.debug(
                "Detected additional level of quotations in %r so performing "
                "shlex split", cmd)
            # there is an additional layer of quotes
            # Let's strip them off using shlex
            import shlex
            cmd_ = shlex.split(cmd)
            if len(cmd_) != 1:
                raise RuntimeError(
                    "Obtained more or less than a single argument upon shlex "
                    "split: %s" % repr(cmd_))
            cmd = cmd_[0]
        sshurl = 'ssh://{}{}'.format(login, ':{}'.format(port) if port else '')
        ssh = ssh_manager.get_connection(sshurl)
        # TODO: /dev/null on windows ;)  or maybe it could just be None?
        stdin_ = open('/dev/null', 'r') if no_stdin else sys.stdin
        try:
            out, err = ssh(cmd, stdin=stdin_, log_output=False)
        finally:
            if no_stdin:
                stdin_.close()
        os.write(1, out.encode('UTF-8'))
        os.write(2, err.encode('UTF-8'))
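
The sshrun implementation above follows a compact idiom: assemble an ssh:// URL with an optional port, fetch a pooled connection for it, and execute the command with log_output=False so stdout and stderr come back verbatim for relaying. A condensed sketch under the same assumptions, with hypothetical login and port values:

import sys
from datalad import ssh_manager

login, port, cmd = 'user@example.com', 2222, 'whoami'  # placeholders
sshurl = 'ssh://{}{}'.format(login, ':{}'.format(port) if port else '')
ssh = ssh_manager.get_connection(sshurl)
out, err = ssh(cmd, stdin=sys.stdin, log_output=False)
sys.stdout.write(out)
sys.stderr.write(err)
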
Example #4
def test_replace_and_relative_sshpath(src_path, dst_path):
    # We need to come up with the path relative to our current home directory
    # https://github.com/datalad/datalad/issues/1653
    # but because we override HOME, the HOME on the remote end would be
    # different even though it is localhost. So we need to query it
    from datalad import ssh_manager
    ssh = ssh_manager.get_connection('localhost')
    remote_home, err = ssh('pwd')
    assert not err
    remote_home = remote_home.rstrip('\n')
    dst_relpath = os.path.relpath(dst_path, remote_home)
    url = 'localhost:%s' % dst_relpath
    ds = Dataset(src_path).create()
    create_tree(ds.path, {'sub.dat': 'lots of data'})
    ds.add('sub.dat')
    ds.create_sibling(url, ui=True)
    published = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published, 1, path=opj(ds.path, 'sub.dat'))
    # verify that the hook runs and there is nothing in stderr,
    # since it exits with exit code 0 even if there was a problem
    out, err = Runner(cwd=opj(dst_path, '.git'))(_path_('hooks/post-update'))
    assert_false(out)
    assert_false(err)

    # Verify that we could replace and publish no problem
    # https://github.com/datalad/datalad/issues/1656
    # Strangely it spits out an IncompleteResultsError exception atm... so just
    # checking that it fails somehow
    res = ds.create_sibling(url, on_failure='ignore')
    assert_status('error', res)
    assert_in('already configured', res[0]['message'][0])
    # "Settings" such as UI do not persist, so we specify it again
    # for the test below depending on it
    ds.create_sibling(url, existing='replace', ui=True)
    published2 = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published2, 1, path=opj(ds.path, 'sub.dat'))

    # and one more test, since the above would not puke ATM but would just
    # not even try to copy, since it assumes the file is already there
    create_tree(ds.path, {'sub2.dat': 'more data'})
    ds.add('sub2.dat')
    published3 = ds.publish(to='localhost',
                            transfer_data='none')  # we publish just git
    assert_result_count(published3, 0, path=opj(ds.path, 'sub2.dat'))
    # now publish "with" data, which should also trigger the hook!
    # https://github.com/datalad/datalad/issues/1658
    from glob import glob
    from datalad.consts import WEB_META_LOG
    logs_prior = glob(_path_(dst_path, WEB_META_LOG, '*'))
    published4 = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published4, 1, path=opj(ds.path, 'sub2.dat'))
    logs_post = glob(_path_(dst_path, WEB_META_LOG, '*'))
    eq_(len(logs_post), len(logs_prior) + 1)

    assert_postupdate_hooks(dst_path)
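
The opening of this test isolates a useful trick: because the connection executes commands in a remote login shell, a plain 'pwd' reveals the remote HOME, from which a destination path relative to it can be computed. A sketch of just that step; the destination path is a hypothetical placeholder:

import os
from datalad import ssh_manager

ssh = ssh_manager.get_connection('localhost')
remote_home, err = ssh('pwd')
remote_home = remote_home.rstrip('\n')
dst_relpath = os.path.relpath('/tmp/dst', remote_home)  # placeholder path
url = 'localhost:%s' % dst_relpath  # scp-style URL for create_sibling
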
Example #5
def test_replace_and_relative_sshpath(src_path, dst_path):
    # We need to come up with the path relative to our current home directory
    # https://github.com/datalad/datalad/issues/1653
    # but because we override HOME, the HOME on the remote end would be
    # different even though it is localhost. So we need to query it
    from datalad import ssh_manager
    ssh = ssh_manager.get_connection('localhost')
    remote_home, err = ssh('pwd')
    assert not err
    remote_home = remote_home.rstrip('\n')
    dst_relpath = os.path.relpath(dst_path, remote_home)
    url = 'localhost:%s' % dst_relpath
    ds = Dataset(src_path).create()
    create_tree(ds.path, {'sub.dat': 'lots of data'})
    ds.save('sub.dat')
    ds.create_sibling(url, ui=True)
    published = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published, 1, path=opj(ds.path, 'sub.dat'))
    # verify that the hook runs and there is nothing in stderr,
    # since it exits with exit code 0 even if there was a problem
    out, err = Runner(cwd=opj(dst_path, '.git'))(_path_('hooks/post-update'))
    assert_false(out)
    assert_false(err)

    # Verify that we could replace and publish no problem
    # https://github.com/datalad/datalad/issues/1656
    # Strangely it spits out an IncompleteResultsError exception atm... so just
    # checking that it fails somehow
    res = ds.create_sibling(url, on_failure='ignore')
    assert_status('error', res)
    assert_in('already configured', res[0]['message'][0])
    # "Settings" such as UI do not persist, so we specify it again
    # for the test below depending on it
    ds.create_sibling(url, existing='replace', ui=True)
    published2 = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published2, 1, path=opj(ds.path, 'sub.dat'))

    # and one more test, since the above would not puke ATM but would just
    # not even try to copy, since it assumes the file is already there
    create_tree(ds.path, {'sub2.dat': 'more data'})
    ds.save('sub2.dat')
    published3 = ds.publish(to='localhost', transfer_data='none')  # we publish just git
    assert_result_count(published3, 0, path=opj(ds.path, 'sub2.dat'))
    # now publish "with" data, which should also trigger the hook!
    # https://github.com/datalad/datalad/issues/1658
    from glob import glob
    from datalad.consts import WEB_META_LOG
    logs_prior = glob(_path_(dst_path, WEB_META_LOG, '*'))
    published4 = ds.publish(to='localhost', transfer_data='all')
    assert_result_count(published4, 1, path=opj(ds.path, 'sub2.dat'))
    logs_post = glob(_path_(dst_path, WEB_META_LOG, '*'))
    eq_(len(logs_post), len(logs_prior) + 1)

    assert_postupdate_hooks(dst_path)
Example #6
    def __call__(login,
                 cmd,
                 *,
                 port=None,
                 ipv4=False,
                 ipv6=False,
                 options=None,
                 no_stdin=False):
        lgr.debug(
            "sshrun invoked: login=%r, cmd=%r, port=%r, options=%r, "
            "ipv4=%r, ipv6=%r, no_stdin=%r", login, cmd, port, options, ipv4,
            ipv6, no_stdin)
        # Perspective workarounds for git-annex invocation, see
        # https://github.com/datalad/datalad/issues/1456#issuecomment-292641319

        if cmd.startswith("'") and cmd.endswith("'"):
            lgr.debug(
                "Detected additional level of quotations in %r so performing "
                "command line splitting", cmd)
            # there is an additional layer of quotes
            # Let's strip them off by splitting the command
            cmd_ = split_cmdline(cmd)
            if len(cmd_) != 1:
                raise RuntimeError(
                    "Obtained more or less than a single argument after "
                    "command line splitting: %s" % repr(cmd_))
            cmd = cmd_[0]
        sshurl = 'ssh://{}{}'.format(login, ':{}'.format(port) if port else '')

        if ipv4 and ipv6:
            raise ValueError("Cannot force both IPv4 and IPv6")
        elif ipv4:
            force_ip = 4
        elif ipv6:
            force_ip = 6
        else:
            force_ip = None

        ssh = ssh_manager.get_connection(sshurl, force_ip=force_ip)
        # use an empty temp file as stdin if none shall be connected
        stdin_ = tempfile.TemporaryFile() if no_stdin else sys.stdin
        try:
            out, err = ssh(cmd,
                           stdin=stdin_,
                           log_output=False,
                           options=options)
        finally:
            if no_stdin:
                stdin_.close()
        os.write(1, out.encode('UTF-8'))
        os.write(2, err.encode('UTF-8'))
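
Example #6 extends the earlier sshrun variants with IP-version pinning: get_connection() accepts a force_ip argument, derived above from mutually exclusive ipv4/ipv6 flags. A small sketch with a hypothetical host:

from datalad import ssh_manager

# pin the connection to IPv4; the code above also passes 6 or None
ssh = ssh_manager.get_connection('ssh://example.com', force_ip=4)
out, err = ssh('hostname', log_output=False)
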
Example #7
    def __call__(sshurl, target=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None, recursive=False,
                 existing='error', shared=False, ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None):

        if sshurl is None:
            raise ValueError("""insufficient information for target creation
            (needs at least a dataset and an SSH URL).""")

        if target is None and (target_url is not None or
                               target_pushurl is not None):
            raise ValueError("""insufficient information for adding the target
            as a sibling (needs at least a name)""")

        # shortcut
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')

        assert(ds is not None and sshurl is not None and ds.repo is not None)

        # determine target parameters:
        sshri = RI(sshurl)

        if not isinstance(sshri, SSHRI) \
                and not (isinstance(sshri, URL) and sshri.scheme == 'ssh'):
            raise ValueError(
                "Unsupported SSH URL: '{0}', "
                "use ssh://host/path or host:path syntax".format(sshurl))

        if target_dir is None:
            if sshri.path:
                target_dir = sshri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = False
        if "%NAME" not in target_dir:
            replicate_local_structure = True

        # collect datasets to use:
        datasets = dict()
        datasets[basename(ds.path)] = ds
        if recursive:
            for subds in ds.get_subdatasets(recursive=True):
                sub_path = opj(ds.path, subds)
                # TODO: when enhancing Dataset/*Repo classes and therefore
                # adapt to moved code, make proper distinction between name and
                # path of a submodule, which are technically different. This
                # probably will become important on windows as well as whenever
                # we want to allow for moved worktrees.
                datasets[basename(ds.path) + '/' + subds] = \
                    Dataset(sub_path)

        # request ssh connection:
        not_supported_on_windows("TODO")
        lgr.info("Connecting ...")
        ssh = ssh_manager.get_connection(sshurl)
        ssh.open()

        # flag to check if at dataset_root
        at_root = True

        # loop over all datasets, ordered from top to bottom, to make the test
        # below valid (existing directories would cause the machinery to halt).
        # But we need to run the post-update hook in depth-first fashion, so we
        # only collect first and then run (see gh #790)
        remote_repos_to_run_hook_for = []
        for current_dspath in \
                sorted(datasets.keys(), key=lambda x: x.count('/')):
            current_ds = datasets[current_dspath]
            if not current_ds.is_installed():
                lgr.info("Skipping %s since not installed locally", current_dspath)
                continue
            if not replicate_local_structure:
                path = target_dir.replace("%NAME",
                                          current_dspath.replace("/", "-"))
            else:
                # TODO: opj depends on local platform, not the remote one.
                # check how to deal with it. Does windows ssh server accept
                # posix paths? vice versa? Should planned SSH class provide
                # tools for this issue?
                path = normpath(opj(target_dir,
                                    relpath(datasets[current_dspath].path,
                                            start=ds.path)))

            lgr.info("Creating target dataset {0} at {1}".format(current_dspath, path))
            # Must be set to True only if the target exists and
            # existing='reconfigure'; otherwise we might skip actions if we
            # say existing='reconfigure' but the target did not even exist before
            only_reconfigure = False
            if path != '.':
                # check if target exists
                # TODO: Is this condition valid for != '.' only?
                path_exists = True
                try:
                    out, err = ssh(["ls", path])
                except CommandError as e:
                    if "No such file or directory" in e.stderr and \
                            path in e.stderr:
                        path_exists = False
                    else:
                        raise  # It's an unexpected failure here

                if path_exists:
                    if existing == 'error':
                        raise RuntimeError("Target directory %s already exists." % path)
                    elif existing == 'skip':
                        continue
                    elif existing == 'replace':
                        ssh(["chmod", "+r+w", "-R", path])  # enable write permissions to allow removing dir
                        ssh(["rm", "-rf", path])            # remove target at path
                        path_exists = False                 # if we succeeded in removing it
                    elif existing == 'reconfigure':
                        only_reconfigure = True
                    else:
                        raise ValueError("Do not know how to handle existing=%s" % repr(existing))

                if not path_exists:
                    try:
                        ssh(["mkdir", "-p", path])
                    except CommandError as e:
                        lgr.error("Remotely creating target directory failed at "
                                  "%s.\nError: %s" % (path, exc_str(e)))
                        continue

            # don't (re-)initialize dataset if existing == reconfigure
            if not only_reconfigure:
                # init git and possibly annex repo
                if not CreateSibling.init_remote_repo(
                        path, ssh, shared, datasets[current_dspath],
                        description=target_url):
                    continue

            # check git version on remote end
            lgr.info("Adjusting remote git configuration")
            remote_git_version = CreateSibling.get_remote_git_version(ssh)
            if remote_git_version and remote_git_version >= "2.4":
                # allow for pushing to checked out branch
                try:
                    ssh(["git", "-C", path] +
                        ["config", "receive.denyCurrentBranch", "updateInstead"])
                except CommandError as e:
                    lgr.error("git config failed at remote location %s.\n"
                              "You will not be able to push to checked out "
                              "branch. Error: %s", path, exc_str(e))
            else:
                lgr.error("Git version >= 2.4 needed to configure remote."
                          " Version detected on server: %s\nSkipping configuration"
                          " of receive.denyCurrentBranch - you will not be able to"
                          " publish updates to this repository. Upgrade your git"
                          " and run with --existing=reconfigure"
                          % remote_git_version)

            # enable metadata refresh on dataset updates to publication server
            lgr.info("Enabling git post-update hook ...")
            try:
                CreateSibling.create_postupdate_hook(
                    path, ssh, datasets[current_dspath])
            except CommandError as e:
                lgr.error("Failed to add json creation command to post update "
                          "hook.\nError: %s" % exc_str(e))

            # publish web-interface to root dataset on publication server
            if at_root and ui:
                lgr.info("Uploading web interface to %s" % path)
                at_root = False
                try:
                    CreateSibling.upload_web_interface(path, ssh, shared, ui)
                except CommandError as e:
                    lgr.error("Failed to push web interface to the remote "
                              "datalad repository.\nError: %s" % exc_str(e))

            remote_repos_to_run_hook_for.append(path)

        # going in reverse order yields depth-first traversal
        lgr.debug("Running post-update hooks in all created siblings")
        for path in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            try:
                ssh(
                    ["cd '" + _path_(path, ".git") + "' && hooks/post-update"],
                    wrap_args=False  # we wrapped here manually
                )
            except CommandError as e:
                lgr.error("Failed to run post-update hook under path %s. "
                          "Error: %s" % (path, exc_str(e)))

        if target:
            # add the sibling(s):
            lgr.debug("Adding the siblings")
            if target_url is None:
                target_url = sshurl
            if target_pushurl is None and sshurl != target_url:
                target_pushurl = sshurl
            AddSibling()(dataset=ds,
                         name=target,
                         url=target_url,
                         pushurl=target_pushurl,
                         recursive=recursive,
                         fetch=True,
                         force=existing in {'replace'},
                         as_common_datasrc=as_common_datasrc,
                         publish_by_default=publish_by_default,
                         publish_depends=publish_depends)
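
Example #7 issues remote commands both as argument lists (e.g. ssh(["ls", path])) and as pre-quoted strings with wrap_args=False, and it distinguishes a missing target from a genuine failure by inspecting the stderr captured on CommandError. A sketch of that existence probe; the CommandError import path (datalad.support.exceptions) plus the host and path are assumptions:

from datalad import ssh_manager
from datalad.support.exceptions import CommandError  # assumed import path

ssh = ssh_manager.get_connection('example.com')  # hypothetical host
path = '/srv/target'                             # hypothetical path
try:
    ssh(["ls", path])
    path_exists = True
except CommandError as e:
    # only a missing directory counts as "does not exist"; re-raise the rest
    if "No such file or directory" in e.stderr and path in e.stderr:
        path_exists = False
    else:
        raise
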
Example #8
    def __call__(sshurl, name=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None, annex_group=None, annex_groupwanted=None,
                 inherit=False,
                 since=None):
        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #

        # possibly use sshurl to get the name in case it was not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL, if no inherit option"
                )
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified"
                )
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc))
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings"
            )
            # may be could be safely dropped -- still WiP

        if not sshurl:
            # TODO: may be more back up before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL"
                    % ds
                )
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
            # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(ds.path, super_ds.path))

        # check the login URL
        sshri = RI(sshurl)
        if not is_ssh(sshri):
            raise ValueError(
                "Unsupported SSH URL: '{0}', "
                "use ssh://host/path or host:path syntax".format(sshurl))

        if not name:
            # use the hostname as default remote name
            name = sshri.hostname
            lgr.debug(
                "No sibling name given, use URL hostname '%s' as sibling name",
                name)

        if since == '':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume one to one mapping of names from local branches
            # to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        #
        # parse the base dataset to find all subdatasets that need processing
        #
        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                # only a single path!
                path=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='create_sibling',
                # neither of the next two should happen anyway
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified=since,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent':
                # this can happen when there is `since`, but we have no
                # use for anything but datasets here
                continue
            checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
                if ap.get('state', None) != 'absent' \
                else []
            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependents as well???
                unknown_deps = set(assure_list(publish_depends)).difference(checkds_remotes)
                if unknown_deps:
                    ap['status'] = 'error'
                    ap['message'] = (
                        'unknown sibling(s) specified as publication dependency: %s',
                        unknown_deps)
                    yield ap
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                ap['status'] = 'error' if existing == 'error' else 'notneeded'
                ap['message'] = (
                    "sibling '%s' already configured (specify alternative name, or force "
                    "reconfiguration via --existing",
                    name)
                yield ap
                continue
            to_process.append(ap)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if target_dir is None:
            if sshri.path:
                target_dir = sshri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%RELNAME" not in target_dir

        # request ssh connection:
        lgr.info("Connecting ...")
        assert(sshurl is not None)  # delayed paranoid verification
        ssh = ssh_manager.get_connection(sshurl)
        if not ssh.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg='on the remote system')

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom, to make the test
        # below valid (existing directories would cause the machinery to halt).
        # But we need to run the post-update hook in depth-first fashion, so we
        # only collect first and then run (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name,
                current_ds,
                ds.path,
                ssh,
                replicate_local_structure,
                sshri,
                target_dir,
                target_url,
                target_pushurl,
                existing,
                shared,
                group,
                publish_depends,
                publish_by_default,
                ui,
                as_common_datasrc,
                annex_wanted,
                annex_group,
                annex_groupwanted,
                inherit
            )
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == ds.path and ui:
                lgr.info("Uploading web interface to %s" % path)
                try:
                    CreateSibling.upload_web_interface(path, ssh, shared, ui)
                except CommandError as e:
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        exc_str(e))
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # going in reverse order yields depth-first traversal
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                ssh("cd {} "
                    "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                    "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if not currentds_ap['path'] in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap
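
Example #8 adds a precondition its predecessor lacked: before doing any work it probes the remote end for git-annex through the connection itself, with get_annex_version() serving as both probe and version report. A sketch of that guard; the MissingExternalDependency import path (datalad.support.exceptions) and the URL are assumptions:

from datalad import ssh_manager
from datalad.support.exceptions import MissingExternalDependency  # assumed path

ssh = ssh_manager.get_connection('ssh://example.com/store')  # hypothetical URL
if not ssh.get_annex_version():
    raise MissingExternalDependency('git-annex', msg='on the remote system')
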
Example #9
def _create_sibling_ria(ds, url, name, ria_remote, ria_remote_name, existing,
                        shared, group, post_update_hook, res_kwargs):
    # be safe across datasets
    res_kwargs = res_kwargs.copy()

    # parse target URL
    try:
        ssh_host, base_path = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(status='error', message=str(e), **res_kwargs)
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config)['giturl']
    # go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure whether we are supposed to skip this very dataset
    if existing == 'skip' and (name in ds_siblings or
                               (ria_remote_name
                                and ria_remote_name in ds_siblings)):
        yield get_status_dict(status='notneeded',
                              message="Skipped on existing sibling",
                              **res_kwargs)
        # if we skip here, nothing else can change that decision further
        # down
        return

    # we might learn that some processing (remote repo creation) is
    # not desired
    skip = False

    lgr.info("create sibling{} '{}'{} ...".format(
        's' if ria_remote_name else '',
        name,
        " and '{}'".format(ria_remote_name) if ria_remote_name else '',
    ))
    if ssh_host:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(ssh_host,
                                         use_remote_annex_bundle=False)
        ssh.open()

    # determine layout locations
    if ria_remote:
        lgr.debug('init special remote {}'.format(ria_remote_name))
        ria_remote_options = [
            'type=external', 'externaltype=ria', 'encryption=none',
            'autoenable=true', 'url={}'.format(url)
        ]
        try:
            ds.repo.init_remote(ria_remote_name, options=ria_remote_options)
        except CommandError as e:
            if existing in ['replace', 'reconfigure'] \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.", ria_remote_name)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                cmd = ['git', 'annex', 'enableremote', ria_remote_name
                       ] + ria_remote_options
                subprocess.run(cmd, cwd=quote_cmdlinearg(ds.repo.path))
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s" %
                    (e.stdout, e.stderr),
                    **res_kwargs)
                return

        # 1. create remote object store:
        # Note: All it actually takes is to trigger the special
        # remote's `prepare` method once.
        # ATM trying to achieve that by invoking a minimal fsck.
        # TODO: - It's probably faster to actually talk to the special
        #         remote (i.e. pretending to be annex and use
        #         the protocol to send PREPARE)
        #       - Alternatively we can create the remote directory and
        #         ria version file directly, but this means
        #         code duplication that then needs to be kept in sync
        #         with ria-remote implementation.
        #       - this leads to the third option: Have that creation
        #         routine importable and callable from
        #         ria-remote package without the need to actually
        #         instantiate a RIARemote object
        lgr.debug("initializing object store")
        ds.repo.fsck(remote=ria_remote_name,
                     fast=True,
                     annex_options=['--exclude=*/*'])
    else:
        # with no special remote we currently need to create the
        # required directories
        # TODO: This should be cleaner once we have access to the
        #       special remote's RemoteIO classes without
        #       talking via annex
        if ssh_host:
            try:
                stdout, stderr = ssh('test -e {repo}'.format(
                    repo=quote_cmdlinearg(str(repo_path))))
                exists = True
            except CommandError as e:
                exists = False
            if exists:
                if existing == 'skip':
                    # 1. not rendered by default
                    # 2. message doesn't show up in ultimate result
                    #    record as shown by -f json_pp
                    yield get_status_dict(status='notneeded',
                                          message="Skipped on existing remote "
                                          "directory {}".format(repo_path),
                                          **res_kwargs)
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                        "exists.".format(repo_path),
                        **res_kwargs)
                    return
                elif existing == 'replace':
                    ssh('chmod u+w -R {}'.format(
                        quote_cmdlinearg(str(repo_path))))
                    ssh('rm -rf {}'.format(quote_cmdlinearg(str(repo_path))))
            if not skip:
                ssh('mkdir -p {}'.format(quote_cmdlinearg(str(repo_path))))
        else:
            if repo_path.exists():
                if existing == 'skip':
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                        "exists.".format(repo_path),
                        **res_kwargs)
                    return
                elif existing == 'replace':
                    rmtree(repo_path)
            if not skip:
                repo_path.mkdir(parents=True)

    # Note that this could have changed since last tested due to an existing
    # remote dir
    if skip:
        return

    # 2. create a bare repository in-store:

    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'

    if group:
        chgrp_cmd = "chgrp -R {} {}".format(quote_cmdlinearg(str(group)),
                                            quote_cmdlinearg(str(repo_path)))

    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(quote_cmdlinearg(shared))
            if shared else ''))
        if post_update_hook:
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))

        if group:
            # Either the repository existed before or a new directory was
            # created for it; set its group to the desired one, if provided,
            # with the same chgrp command
            ssh(chgrp_cmd)
    else:
        GitRepo(repo_path,
                create=True,
                bare=True,
                shared=" --shared='{}'".format(quote_cmdlinearg(shared))
                if shared else None)
        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # TODO: do we need a cwd here?
            subprocess.run(chgrp_cmd, cwd=quote_cmdlinearg(ds.path))

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into default
    # annex/object tree instead of directory type tree with dirhash
    # lower. This in turn would be an issue, if we want to pack the
    # entire thing into an archive. Special remote will then not be
    # able to access content in the "wrong" place within the archive
    lgr.debug("set up git remote")
    # TODO:
    # - This siblings call results in "[WARNING] Failed to determine
    #   if datastore carries annex."
    #   (see https://github.com/datalad/datalad/issues/4028)
    #   => for now have annex-ignore configured before. Evtl. Allow
    #      configure/add to include that option
    #      - additionally there's
    #        https://github.com/datalad/datalad/issues/3989,
    #        where datalad-siblings might hang forever
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing in ['replace', 'reconfigure']
    ds.config.set("remote.{}.annex-ignore".format(name),
                  value="true",
                  where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url if ssh_host else str(repo_path),
        recursive=False,
        # Note that this should be None if ria_remote was not set
        publish_depends=ria_remote_name,
        result_renderer=None,
        # Note that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True)

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )
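
_create_sibling_ria exercises two connection options not seen elsewhere in this listing: it opens the master connection eagerly via ssh.open() and passes use_remote_annex_bundle=False, since a RIA store only needs plain shell access rather than a remote git-annex bundle. A sketch with a hypothetical host and store path:

from datalad import ssh_manager

ssh = ssh_manager.get_connection('example.com', use_remote_annex_bundle=False)
ssh.open()  # establish the connection now rather than on the first command
out, err = ssh('mkdir -p /srv/ria-store')  # hypothetical store path
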
Example #10
def check_replace_and_relative_sshpath(use_ssh, src_path, dst_path):
    # We need to come up with the path relative to our current home directory
    # https://github.com/datalad/datalad/issues/1653
    # but because we override HOME, the HOME on the remote end would be
    # different even though it is the datalad-test host. So we need to query it
    if use_ssh:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection('datalad-test')
        remote_home, err = ssh('pwd')
        remote_home = remote_home.rstrip('\n')
        dst_relpath = os.path.relpath(dst_path, remote_home)
        url = 'datalad-test:%s' % dst_relpath
        sibname = 'datalad-test'
    else:
        url = dst_path
        sibname = 'local'

    ds = Dataset(src_path).create()
    create_tree(ds.path, {'sub.dat': 'lots of data'})
    ds.save('sub.dat')
    try:
        res = ds.create_sibling(url, ui=have_webui())
    except UnicodeDecodeError:
        if sys.version_info < (3, 7):
            # observed test failing on ubuntu 18.04 with python 3.6
            # (reproduced in a conda env locally with python 3.6.10 when
            # LANG=C). We will just skip this tricky one
            raise SkipTest("Known failure")
        raise
    assert_in_results(res, action="create_sibling", sibling_name=sibname)
    published = ds.publish(to=sibname, transfer_data='all')
    assert_result_count(published, 1, path=opj(ds.path, 'sub.dat'))
    if have_webui():
        # verify that the hook runs and there is nothing in stderr,
        # since it exits with exit code 0 even if there was a problem
        out = Runner(cwd=opj(dst_path, '.git')).run(
            [_path_('hooks/post-update')], protocol=StdOutErrCapture)
        assert_false(out['stdout'])
        assert_false(out['stderr'])

    # Verify that we could replace and publish no problem
    # https://github.com/datalad/datalad/issues/1656
    # Strangely it spits out an IncompleteResultsError exception atm... so just
    # checking that it fails somehow
    res = ds.create_sibling(url, on_failure='ignore')
    assert_status('error', res)
    assert_in('already configured', res[0]['message'][0])
    # "Settings" such as UI do not persist, so we specify it again
    # for the test below depending on it
    with assert_raises(RuntimeError):
        # but we cannot replace in non-interactive mode
        ds.create_sibling(url, existing='replace', ui=have_webui())

    # We don't have a context manager like @with_testsui, so
    @with_testsui(responses=["yes"])
    def interactive_create_sibling():
        ds.create_sibling(url, existing='replace', ui=have_webui())

    interactive_create_sibling()

    published2 = ds.publish(to=sibname, transfer_data='all')
    assert_result_count(published2, 1, path=opj(ds.path, 'sub.dat'))

    # and one more test, since the above would not puke ATM but would just
    # not even try to copy, since it assumes the file is already there
    create_tree(ds.path, {'sub2.dat': 'more data'})
    ds.save('sub2.dat')
    published3 = ds.publish(to=sibname,
                            transfer_data='none')  # we publish just git
    assert_result_count(published3, 0, path=opj(ds.path, 'sub2.dat'))

    if not have_webui():
        return

    # now publish "with" data, which should also trigger the hook!
    # https://github.com/datalad/datalad/issues/1658
    from glob import glob
    from datalad.consts import WEB_META_LOG
    logs_prior = glob(_path_(dst_path, WEB_META_LOG, '*'))
    published4 = ds.publish(to=sibname, transfer_data='all')
    assert_result_count(published4, 1, path=opj(ds.path, 'sub2.dat'))
    logs_post = glob(_path_(dst_path, WEB_META_LOG, '*'))
    eq_(len(logs_post), len(logs_prior) + 1)

    assert_postupdate_hooks(dst_path)
Example #11
    def __call__(sshurl,
                 *,
                 name=None,
                 target_dir=None,
                 target_url=None,
                 target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None,
                 annex_group=None,
                 annex_groupwanted=None,
                 inherit=False,
                 since=None):
        if ui:
            # the webui has been moved to the deprecated extension
            try:
                from datalad_deprecated.sibling_webui \
                    import upload_web_interface
            except Exception as e:
                # we could just test for ModuleNotFoundError (which should be
                # all that would happen with PY3.6+), but be a little more
                # robust and use the pattern from duecredit
                if type(e).__name__ not in ('ImportError',
                                            'ModuleNotFoundError'):
                    lgr.error(
                        "Failed to import datalad_deprecated.sibling_webui "
                        "due to %s", str(e))
                raise RuntimeError(
                    "The DataLad web UI has been moved to an extension "
                    "package. Please install the Python package "
                    "`datalad_deprecated` to be able to deploy it.")

        # push uses '^' to annotate the previously pushed committish, and None
        # for the default behavior. '' was/is (to be deprecated) used in
        # `publish` and 'create-sibling'. Alert the user about the mistake
        if since == '':
            # deprecation was added prior 0.16.0
            import warnings
            warnings.warn("'since' should point to commitish or use '^'.",
                          DeprecationWarning)
            since = '^'

        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='create sibling(s)')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #
        if since and not recursive:
            raise ValueError("The use of 'since' requires 'recursive'")
        # possibly use sshurl to get the name in case it was not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL, if no inherit option")
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified")
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                ce = CapturedException(exc)
                lgr.debug('%s does not know about url for %s: %s', ds, name,
                          ce)
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings")
            # may be could be safely dropped -- still WiP

        if not sshurl:
            # TODO: may be more back up before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL" %
                    ds)
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
            # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(refds_path, super_ds.path))

        # check the login URL
        sibling_ri = RI(sshurl)
        ssh_sibling = is_ssh(sibling_ri)
        if not (ssh_sibling or isinstance(sibling_ri, PathRI)):
            raise ValueError(
                "Unsupported SSH URL or path: '{0}', "
                "use ssh://host/path, host:path or path syntax".format(sshurl))

        if not name:
            name = sibling_ri.hostname if ssh_sibling else "local"
            lgr.info("No sibling name given. Using %s'%s' as sibling name",
                     "URL hostname " if ssh_sibling else "", name)
        if since == '^':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume one to one mapping of names from local branches
            # to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        to_process = []
        if recursive:
            #
            # parse the base dataset to find all subdatasets that need processing
            #
            cand_ds = [
                Dataset(r['path']) for r in diff_dataset(
                    ds,
                    fr=since,
                    to='HEAD',
                    # w/o False we might not follow into new subdatasets
                    # which do not have that remote yet setup,
                    # see https://github.com/datalad/datalad/issues/6596
                    constant_refs=False,
                    # save cycles, we are only looking for datasets
                    annex=None,
                    untracked='no',
                    recursive=True,
                    datasets_only=True,
                )
                # not installed subdatasets would be 'clean' so we would skip them
                if r.get('type') == 'dataset'
                and r.get('state', None) != 'clean'
            ]
            if not since:
                # not only subdatasets
                cand_ds = [ds] + cand_ds
        else:
            # only the current ds
            cand_ds = [ds]
        # check the remotes' setup
        for d in cand_ds:
            d_repo = d.repo
            if d_repo is None:
                continue
            checkds_remotes = d.repo.get_remotes()
            res = dict(
                action='create_sibling',
                path=d.path,
                type='dataset',
            )

            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependents as well???
                unknown_deps = set(
                    ensure_list(publish_depends)).difference(checkds_remotes)
                if unknown_deps:
                    yield dict(
                        res,
                        status='error',
                        message=('unknown sibling(s) specified as publication '
                                 'dependency: %s', unknown_deps),
                    )
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                yield dict(
                    res,
                    sibling_name=name,
                    status='error' if existing == 'error' else 'notneeded',
                    message=(
                        "sibling '%s' already configured (specify alternative "
                        "name, or force reconfiguration via --existing", name),
                )
                continue
            to_process.append(res)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if ssh_sibling:
            # request ssh connection:
            lgr.info("Connecting ...")
            shell = ssh_manager.get_connection(sshurl)
        else:
            shell = _RunnerAdapter()
            sibling_ri.path = str(resolve_path(sibling_ri.path, dataset))
            if target_dir:
                target_dir = opj(sibling_ri.path, target_dir)

        if target_dir is None:
            if sibling_ri.path:
                target_dir = sibling_ri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%RELNAME" not in target_dir

        if not shell.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg="It's required on the {} machine to create a sibling".
                format('remote' if ssh_sibling else 'local'))

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom, to make the test
        # below valid (existing directories would cause the machinery to halt).
        # But we need to run the post-update hook in depth-first fashion, so we
        # only collect first and then run (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name, current_ds, refds_path, shell, replicate_local_structure,
                sibling_ri, target_dir, target_url, target_pushurl, existing,
                shared, group, publish_depends, publish_by_default, ui,
                as_common_datasrc, annex_wanted, annex_group,
                annex_groupwanted, inherit)
            currentds_ap["sibling_name"] = name
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == refds_path and ui:
                from datalad_deprecated.sibling_webui import upload_web_interface
                lgr.info("Uploading web interface to %s", path)
                try:
                    upload_web_interface(path, shell, shared, ui)
                except CommandError as e:
                    ce = CapturedException(e)
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        ce)
                    currentds_ap['exception'] = ce
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # going in reverse order yields depth-first traversal
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                shell(
                    "cd {} "
                    "&& ( [ -x hooks/post-update ] && hooks/post-update || true )"
                    "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                ce = CapturedException(e)
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, ce)
                currentds_ap['exception'] = ce
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if not currentds_ap['path'] in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap
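
Example #11 finishes by triggering each sibling's post-update hook behind a shell guard, so a missing or non-executable hook is not treated as an error. The same one-liner as a standalone sketch; the host and path are placeholders, and the standard library's shlex.quote stands in for the sh_quote helper used above:

import shlex
from datalad import ssh_manager

ssh = ssh_manager.get_connection('example.com')  # hypothetical host
repo_git = '/srv/target/.git'                    # hypothetical path
ssh("cd {} && ( [ -x hooks/post-update ] && hooks/post-update || true )"
    .format(shlex.quote(repo_git)))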