Example #1
def _check_and_update_remote_server_info(ds, remote):
    # if we managed to copy to an "http" url we should try to trigger the git
    # update-server-info hook on the remote if there was an ssh annexurl
    # defined for it. Apparently we already do that for create_sibling ones,
    # but here we need more checks and preparation
    remote_url = ds.repo.config.get('remote.%s.url' % remote, None)
    if remote_url:
        remote_url = RI(remote_url)
        if isinstance(remote_url,
                      URL) and remote_url.scheme in ('http', 'https'):
            remote_annexurl = ds.repo.config.get('remote.%s.annexurl' % remote,
                                                 None)
            if remote_annexurl:
                remote_annexurl_ri = RI(remote_annexurl)
                if is_ssh(remote_annexurl_ri):
                    ssh = ssh_manager.get_connection(remote_annexurl_ri)
                    ssh('git -C {} update-server-info'.format(
                        sh_quote(remote_annexurl_ri.path)))
                    return True
                else:
                    lgr.debug(
                        "There is an annexurl defined but it is not ssh: %s; "
                        "unsure whether we could/should do anything",
                        remote_annexurl)
    return False
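The branching above hinges on how RI classifies a string: http(s) URLs parse into URL instances, while scp-style targets are what is_ssh recognizes. A minimal sketch of that classification, using hypothetical addresses:

from datalad.support.network import RI, URL, is_ssh

ri = RI('https://example.com/ds')           # parses into a URL instance
assert isinstance(ri, URL) and ri.scheme == 'https'

annex_ri = RI('user@example.com:/srv/ds')   # scp-style ssh target
assert is_ssh(annex_ri)                     # qualifies for the update-server-info path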
Example #2
def _check_ri(ri, cls, exact_str=True, localpath=None, **fields):
    """just a helper to carry out few checks on urls"""
    with swallow_logs(new_level=logging.DEBUG) as cml:
        ri_ = cls(**fields)
        murl = RI(ri)
        eq_(murl.__class__, cls)  # not just a subclass
        eq_(murl, ri_)
        if isinstance(ri, str):
            eq_(str(RI(ri)), ri)
        eq_(eval(repr(ri_)), ri)  # repr leads back to identical ri_
        eq_(ri, ri_)  # just in case ;)  the above should fail first if something is wrong
        if not exact_str:
            assert_in('Parsed version of', cml.out)
    (eq_ if exact_str else neq_)(ri, str(ri_))  # that we can reconstruct it EXACTLY on our examples
    # and that we have access to all those fields
    nok_(set(fields).difference(set(cls._FIELDS)))
    for f, v in fields.items():
        eq_(getattr(ri_, f), v)

    if localpath:
        eq_(ri_.localpath, localpath)
        old_localpath = ri_.localpath  # for a test below
    else:
        # if not given -- must be a remote url, should raise exception
        with assert_raises(ValueError):
            ri_.localpath

    # do changes in the path persist?
    old_str = str(ri_)
    ri_.path = newpath = opj(ri_.path, 'sub')
    eq_(ri_.path, newpath)
    neq_(str(ri_), old_str)
    if localpath:
        eq_(ri_.localpath, opj(old_localpath, 'sub'))
Example #3
def _get_flexible_source_candidates(src, base_url=None, alternate_suffix=True):
    """Get candidates to try cloning from.

    Primarily to mitigate the problem that git doesn't append /.git
    while cloning from non-bare repos over the dumb protocol (http*).  Also to
    simplify creation of URLs whenever a base URL and a relative path within
    it are provided

    Parameters
    ----------
    src : string or RI
      Full or relative (then considered within base_url if provided) path
    base_url : string or RI, optional
    alternate_suffix : bool
      Whether to generate URL candidates with and without '/.git' suffixes.

    Returns
    -------
    candidates : list of str
      List of RIs (path, url, ssh targets) to try to install from
    """
    candidates = []

    ri = RI(src)
    if isinstance(ri, PathRI) and not isabs(ri.path) and base_url:
        ri = RI(base_url)
        if ri.path.endswith('/.git'):
            base_path = ri.path[:-5]
            base_suffix = '.git'
        else:
            base_path = ri.path
            base_suffix = ''
        if isinstance(ri, PathRI):
            # this is a path, so stay native
            ri.path = normpath(opj(base_path, src, base_suffix))
        else:
            # we are handling a URL, use POSIX path conventions
            ri.path = posixpath.normpath(
                posixpath.join(base_path, src, base_suffix))

    src = str(ri)

    candidates.append(src)
    if alternate_suffix and isinstance(ri, URL):
        if ri.scheme in {'http', 'https'}:
            # additionally try to consider .git:
            if not src.rstrip('/').endswith('/.git'):
                candidates.append(
                    '{0}/.git'.format(src.rstrip('/')))

    # TODO:
    # We need to provide some error msg with InstallFailedError, since now
    # it just swallows everything.
    # yoh: not sure if this comment applies here, but could be still applicable
    # outside

    return candidates
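The candidate expansion is easiest to see on concrete calls. A hypothetical trace, read off the code above (addresses assumed):

# absolute http URL: the bare URL plus a '/.git' alternative
_get_flexible_source_candidates('http://example.com/ds')
# -> ['http://example.com/ds', 'http://example.com/ds/.git']

# relative path resolved against a local base: no suffix games for plain paths
_get_flexible_source_candidates('sub', base_url='/data/super')
# -> ['/data/super/sub']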
Example #4
def _get_flexible_source_candidates(src, base_url=None):
    """Get candidates to try cloning from.

    Primarily to mitigate the problem that git doesn't append /.git
    while cloning from non-bare repos over the dumb protocol (http*).  Also to
    simplify creation of URLs whenever a base URL and a relative path within
    it are provided

    Parameters
    ----------
    src : string or RI
      Full or relative (then considered within base_url if provided) path
    base_url : string or RI, optional

    Returns
    -------
    candidates : list of str
      List of RIs (path, url, ssh targets) to try to install from
    """
    candidates = []

    ri = RI(src)
    if isinstance(ri, PathRI) and not isabs(ri.path) and base_url:
        ri = RI(base_url)
        if ri.path.endswith('/.git'):
            base_path = ri.path[:-5]
            base_suffix = '.git'
        else:
            base_path = ri.path
            base_suffix = ''
        ri.path = normpath(opj(base_path, src, base_suffix))

    src = str(ri)

    candidates.append(src)
    if isinstance(ri, URL):
        if ri.scheme in {'http', 'https'}:
            # additionally try to consider .git:
            if not src.rstrip('/').endswith('/.git'):
                candidates.append('{0}/.git'.format(src.rstrip('/')))

    # TODO:
    # We need to provide some error msg with InstallFailedError, since now
    # it just swallows everything.
    # yoh: not sure if this comment applies here, but could be still applicable
    # outside

    return candidates
Example #5
def resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any explicit path (absolute or relative) is returned as an absolute path.
    In case of an explicit relative path, the current working directory is
    used as a reference. Any non-explicit relative path is resolved against
    a dataset location, i.e. considered relative to the location of the
    dataset. If no dataset is provided, the current working directory is
    used.

    Returns
    -------
    Absolute path
    """
    # first make sure it's actually a valid path:
    from datalad.support.network import PathRI
    if not isinstance(RI(path), PathRI):
        raise ValueError("%s is not a valid path" % path)

    path = expandpath(path, force_absolute=False)
    if is_explicit_path(path):
        # normalize path consistently between two (explicit and implicit) cases
        return dlabspath(path, norm=True)

    # no dataset given, use CWD as reference
    # note: abspath would disregard symlink in CWD
    top_path = getpwd() \
        if ds is None else ds.path if isinstance(ds, Dataset) else ds
    return normpath(opj(top_path, path))
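A short usage sketch of the two resolution modes, with a hypothetical dataset location:

from datalad.api import Dataset

ds = Dataset('/tmp/myds')          # hypothetical dataset
resolve_path('./file.txt')         # explicit relative -> absolute under CWD
resolve_path('file.txt', ds=ds)    # implicit relative -> '/tmp/myds/file.txt'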
Example #6
    def _flyweight_id_from_args(cls, *args, **kwargs):

        if args:
            # to a certain degree we need to simulate an actual call to __init__
            # and make sure the passed arguments fit:
            # TODO: Figure out whether there is a cleaner way to do this in a
            # generic fashion
            assert('path' not in kwargs)
            path = args[0]
            args = args[1:]
        elif 'path' in kwargs:
            path = kwargs.pop('path')
        else:
            raise TypeError("__init__() requires argument `path`")

        if path is None:
            lgr.debug("path is None. args: %s, kwargs: %s", args, kwargs)
            raise ValueError("path must not be None")

        # Custom handling for a few special abbreviations if defined by the class
        path_ = cls._flyweight_preproc_path(path)

        # mirror what is happening in __init__
        if isinstance(path, ut.PurePath):
            path = str(path)

        # Sanity check for argument `path`:
        # raise if we cannot deal with `path` at all or
        # if it is not a local thing:
        localpath = RI(path_).localpath

        path_postproc = cls._flyweight_postproc_path(localpath)

        kwargs['path'] = path_postproc
        return path_postproc, args, kwargs
Example #7
def configure_origins(cfgds, probeds, label=None):
    """Configure any discoverable local dataset 'origin' sibling as a remote

    Parameters
    ----------
    cfgds : Dataset
      Dataset to receive the remote configurations
    probeds : Dataset
      Dataset to start looking for 'origin' remotes. May be identical with
      `cfgds`.
    label : int, optional
      Each discovered 'origin' will be configured as a remote under the name
      'origin-<label>'. If no label is given, '1' will be used by default,
      given that there is typically an 'origin' remote already.
    """
    if label is None:
        label = 1
    # let's look at the URL for that remote and see if it is a local
    # dataset
    origin_url = probeds.config.get('remote.origin.url')
    if not origin_url:
        # no origin, nothing to do
        return
    if not cfgds.config.obtain(
            'datalad.install.inherit-local-origin',
            default=True):
        # no inheritance wanted
        return
    if not isinstance(RI(origin_url), PathRI):
        # not local path
        return

    # no need to reconfigure original/direct origin again
    if cfgds != probeds:
        # prevent duplicates
        known_remote_urls = set(
            cfgds.config.get(r + '.url', None)
            for r in cfgds.config.sections()
            if r.startswith('remote.')
        )
        if origin_url not in known_remote_urls:
            yield from cfgds.siblings(
                'configure',
                # no chance for conflict, can only be the second configured
                # remote
                name='origin-{}'.format(label),
                url=origin_url,
                # fetch to get all annex info
                fetch=True,
                result_renderer='disabled',
                on_failure='ignore',
            )
    # and dive deeper
    # given the clone source is a local dataset, we can have a
    # cheap look at it, and configure its own 'origin' as a remote
    # (if there is any), and benefit from additional annex availability
    yield from configure_origins(
        cfgds,
        Dataset(probeds.pathobj / origin_url),
        label=label + 1)
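Since configure_origins is a generator of result records, callers have to drain it for the configuration to happen. A minimal sketch, assuming two hypothetical Dataset instances where cfg_ds was cloned from probe_ds:

for res in configure_origins(cfg_ds, probe_ds):
    # result records come from `siblings`; inspect e.g. status and name
    print(res.get('status'), res.get('name'))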
Example #8
    def get_connection(self, url):
        """Get a singleton, representing a shared ssh connection to `url`

        Parameters
        ----------
        url: str
          ssh url

        Returns
        -------
        SSHConnection
        """
        # parse url:
        from datalad.support.network import RI, is_ssh
        sshri = RI(url)

        if not is_ssh(sshri):
            raise ValueError(
                "Unsupported SSH URL: '{0}', use ssh://host/path or host:path syntax"
                .format(url))

        # determine control master:
        ctrl_path = "%s/%s" % (self.socket_dir, sshri.hostname)
        if sshri.port:
            ctrl_path += ":%s" % sshri.port

        # do we know it already?
        if ctrl_path in self._connections:
            return self._connections[ctrl_path]
        else:
            c = SSHConnection(ctrl_path, sshri.hostname)
            self._connections[ctrl_path] = c
            return c
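Typical use of the manager, as in Example #1: obtain the shared connection and run a command over it. A sketch assuming the global ssh_manager instance that datalad exposes, with a hypothetical host and path:

from datalad import ssh_manager

ssh = ssh_manager.get_connection('ssh://user@example.com/srv/repo')
out, err = ssh('git -C /srv/repo update-server-info')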
Example #9
def _get_flexible_source_candidates_for_submodule(ds, sm_path, sm_url=None):
    """Retrieve candidates from where to install the submodule

    Even if a url for the submodule is provided explicitly -- urls under the
    parent's tracking-branch remote are tried first.
    """
    clone_urls = []

    # should be our first candidate
    tracking_remote, tracking_branch = ds.repo.get_tracking_branch()
    candidate_remotes = [tracking_remote] if tracking_remote else []

    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    try:
        last_commit = next(ds.repo._get_files_history(sm_path)).hexsha
        # ideally should also give preference to the remotes which have
        # the same branch checked out I guess
        candidate_remotes += list(ds.repo._get_remotes_having_commit(last_commit))
    except StopIteration:
        # no commit for it known yet, ... oh well
        pass

    for remote in unique(candidate_remotes):
        remote_url = ds.repo.get_remote_url(remote, push=False)

        # Directly on parent's ds url
        if remote_url:
            # attempt: submodule checkout at parent remote URL
            # We might need to quote sm_path portion, e.g. for spaces etc
            if isinstance(RI(remote_url), URL):
                sm_path_url = urlquote(sm_path)
            else:
                sm_path_url = sm_path

            clone_urls.extend(
                _get_flexible_source_candidates(
                    # alternate suffixes are tested by `clone` anyways
                    sm_path_url, remote_url, alternate_suffix=False))

            # attempt: provided (configured?) submodule URL
            # TODO: consider supporting DataLadRI here?  or would confuse
            #  git and we wouldn't want that (i.e. not allow pure git clone
            #  --recursive)
            if sm_url:
                clone_urls += _get_flexible_source_candidates(
                    sm_url,
                    remote_url,
                    alternate_suffix=False
                )

    # Do based on the ds.path as the last resort
    if sm_url:
        clone_urls += _get_flexible_source_candidates(
            sm_url,
            ds.path,
            alternate_suffix=False)

    return unique(clone_urls)
Example #10
def _get_git_url_from_source(source):
    """Return URL for cloning associated with a source specification

    For now just resolves DataLadRIs
    """
    # TODO: Probably RF this into RI.as_git_url(), that would be overridden
    # by subclasses or sth. like that
    if not isinstance(source, RI):
        source_ri = RI(source)
    else:
        source_ri = source
    if isinstance(source_ri, DataLadRI):
        # we have got our DataLadRI as the source, so expand it
        source = source_ri.as_git_url()
    else:
        source = str(source_ri)
    return source
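A hedged trace of the expansion (the exact top URL a DataLadRI expands to is an assumption here; non-DataLadRI sources pass through unchanged):

_get_git_url_from_source('///labs/myproj')
# -> e.g. 'http://datasets.datalad.org/labs/myproj' (assumed default top URL)
_get_git_url_from_source('https://example.com/repo')
# -> 'https://example.com/repo'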
Example #11
def _get_git_url_from_source(source):
    """Return URL for cloning associated with a source specification

    For now just resolves DataLadRIs
    """
    # TODO: Probably RF this into RI.as_git_url(), that would be overridden
    # by subclasses or sth. like that
    if not isinstance(source, RI):
        source_ri = RI(source)
    else:
        source_ri = source
    if isinstance(source_ri, DataLadRI):
        # we have got our DataLadRI as the source, so expand it
        source = source_ri.as_git_url()
    else:
        source = str(source_ri)
    return source
Example #12
    def get_connection(self,
                       url,
                       use_remote_annex_bundle=True,
                       force_ip=False):
        """Get a singleton, representing a shared ssh connection to `url`

        Parameters
        ----------
        url: str
          ssh url
        use_remote_annex_bundle : bool, optional
          Whether to use the git bundled with a git-annex installation on the
          remote end, if any
        force_ip : {False, 4, 6}
          Force the use of IPv4 or IPv6 addresses.

        Returns
        -------
        SSHConnection
        """
        # parse url:
        from datalad.support.network import RI, is_ssh
        if isinstance(url, RI):
            sshri = url
        else:
            if ':' not in url and '/' not in url:
                # it is just a hostname
                lgr.debug("Assuming %r is just a hostname for ssh connection",
                          url)
                url += ':'
            sshri = RI(url)

        if not is_ssh(sshri):
            raise ValueError("Unsupported SSH URL: '{0}', use "
                             "ssh://host/path or host:path syntax".format(url))

        from datalad import cfg
        identity_file = cfg.get("datalad.ssh.identityfile")

        conhash = get_connection_hash(
            sshri.hostname,
            port=sshri.port,
            identity_file=identity_file or "",
            username=sshri.username,
            bundled=use_remote_annex_bundle,
            force_ip=force_ip,
        )
        # determine control master:
        ctrl_path = self.socket_dir / conhash

        # do we know it already?
        if ctrl_path in self._connections:
            return self._connections[ctrl_path]
        else:
            c = SSHConnection(ctrl_path,
                              sshri,
                              identity_file=identity_file,
                              use_remote_annex_bundle=use_remote_annex_bundle,
                              force_ip=force_ip)
            self._connections[ctrl_path] = c
            return c
Example #13
def test_is_url():
    ok_(is_url('file://localhost/some'))
    ok_(is_url('http://localhost'))
    ok_(is_url('ssh://me@localhost'))
    # in the current understanding it is indeed a url, but an implicit 'ssh' one
    # (implicit=True), not just a useless scheme=weired with a hope to point to
    # a netloc
    with swallow_logs():
        ok_(is_url('weired://'))
    nok_(is_url('relative'))
    nok_(is_url('/absolute'))
    ok_(is_url('like@sshlogin'))  # actually we do allow ssh:implicit urls ATM
    nok_(is_url(''))
    nok_(is_url(' '))
    nok_(is_url(123))  # stuff of other types wouldn't be considered a URL

    # we can pass RI instance directly
    ok_(is_url(RI('file://localhost/some')))
    nok_(is_url(RI('relative')))
Example #14
def test_is_ssh():

    ssh_locators = [
        "ssh://host", "ssh://host/some/where", "user@host:path/sp1",
        "user@host:/absolute/path/sp1", "host:path/sp1",
        "host:/absolute/path/sp1", "user@host"
    ]
    for ri in ssh_locators:
        ok_(is_ssh(ri), "not considered ssh (string): %s" % ri)
        ok_(is_ssh(RI(ri)), "not considered ssh (RI): %s" % ri)

    non_ssh_locators = [
        "file://path/to", "/abs/path", "../rel/path", "http://example.com",
        "git://host/user/proj", "s3://bucket/save/?key=891"
    ]
    for ri in non_ssh_locators:
        ok_(not is_ssh(ri), "considered ssh (string): %s" % ri)
        ok_(not is_ssh(RI(ri)), "considered ssh (RI): %s" % ri)
Example #15
def _add_remote(ds, name, known_remotes, url, pushurl, fetch, description,
                as_common_datasrc, publish_depends, publish_by_default,
                annex_wanted, annex_required, annex_group, annex_groupwanted,
                inherit, get_annex_info, **res_kwargs):
    # TODO: allow for no url if 'inherit' and deduce from the super ds
    #       create-sibling already does it -- generalize/use
    #  Actually we could even inherit/deduce the name from the super by checking
    #  which remote it is actively tracking in the current branch... but maybe
    #  that would be too much magic

    # it seems that the only difference is that `add` should fail if a remote
    # already exists
    if (url is None and pushurl is None):
        raise InsufficientArgumentsError(
            """insufficient information to add a sibling
            (needs at least a dataset, and any URL).""")
    if url is None:
        url = pushurl

    if not name:
        urlri = RI(url)
        # use the hostname as default remote name
        name = urlri.hostname
        lgr.debug(
            "No sibling name given, use URL hostname '%s' as sibling name",
            name)

    if not name:
        raise InsufficientArgumentsError("no sibling name given")
    if name in known_remotes:
        yield get_status_dict(
            action='add-sibling',
            status='error',
            path=ds.path,
            type='sibling',
            name=name,
            message=("sibling is already known: %s, use `configure` instead?",
                     name),
            **res_kwargs)
        return
    # this remote is fresh: make it known
    # just minimalistic name and URL, the rest is coming from `configure`
    ds.repo.add_remote(name, url)
    known_remotes.append(name)
    # always copy signature from above to avoid bugs
    for r in _configure_remote(ds, name, known_remotes, url, pushurl, fetch,
                               description, as_common_datasrc, publish_depends,
                               publish_by_default, annex_wanted,
                               annex_required, annex_group, annex_groupwanted,
                               inherit, get_annex_info, **res_kwargs):
        if r['action'] == 'configure-sibling':
            r['action'] = 'add-sibling'
        yield r
Example #16
    def _flyweight_id_from_args(cls, *args, **kwargs):

        if args:
            # to a certain degree we need to simulate an actual call to __init__
            # and make sure the passed arguments fit:
            # TODO: Figure out whether there is a cleaner way to do this in a
            # generic fashion
            assert('path' not in kwargs)
            path = args[0]
            args = args[1:]
        elif 'path' in kwargs:
            path = kwargs.pop('path')
        else:
            raise TypeError("__init__() requires argument `path`")

        if path is None:
            raise AttributeError

        # mirror what is happening in __init__
        if isinstance(path, ut.PurePath):
            path = text_type(path)

        # Custom handling for a few special abbreviations
        path_ = path
        if path == '^':
            # get the topmost dataset from current location. Note that 'zsh'
            # might have its ideas on what to do with ^, so better use as -d^
            path_ = Dataset(curdir).get_superdataset(topmost=True).path
        elif path == '///':
            # TODO: logic/UI on installing a default dataset could move here
            # from search?
            path_ = cfg.obtain('datalad.locations.default-dataset')
        if path != path_:
            lgr.debug("Resolved dataset alias %r to path %r", path, path_)

        # Sanity check for argument `path`:
        # raise if we cannot deal with `path` at all or
        # if it is not a local thing:
        path_ = RI(path_).localpath

        # we want an absolute path, but no resolved symlinks
        if not isabs(path_):
            path_ = opj(getpwd(), path_)

        # use canonical paths only:
        path_ = normpath(path_)
        kwargs['path'] = path_
        return path_, args, kwargs
Example #17
def _import_dicom_tarball(target_ds, tarball, filename):

    # TODO: doesn't work for updates yet:
    # - branches are expected to not exist yet
    target_ds.repo.checkout('incoming', options=['-b'])
    target_ds.repo.init_remote(
        ARCHIVES_SPECIAL_REMOTE,
        options=[
            'encryption=none', 'type=external',
            'externaltype=%s' % ARCHIVES_SPECIAL_REMOTE, 'autoenable=true',
            'uuid={}'.format(
                DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE])
        ])

    if isinstance(RI(tarball), PathRI):
        shutil.copy2(tarball, op.join(target_ds.path, filename))
        target_ds.repo.add(filename)

    else:
        target_ds.repo.add_url_to_file(file_=filename,
                                       url=tarball,
                                       batch=False)

    target_ds.repo.commit(msg="Retrieved %s" % tarball)
    target_ds.repo.checkout('incoming-processed', options=['--orphan'])
    if target_ds.repo.dirty:
        target_ds.repo.remove('.', r=True, f=True)

    target_ds.repo.merge('incoming',
                         options=["-s", "ours", "--no-commit"],
                         expect_stderr=True)
    target_ds.repo._git_custom_command([], "git read-tree -m -u incoming")

    from datalad.coreapi import add_archive_content
    # TODO: Reconsider value of --existing
    add_archive_content(archive=filename,
                        annex=target_ds.repo,
                        existing='archive-suffix',
                        delete=True,
                        commit=False,
                        allow_dirty=True)

    target_ds.repo.commit(msg="Extracted %s" % tarball)
    target_ds.repo.checkout('master')
    target_ds.repo.merge('incoming-processed', options=["--allow-unrelated"])
Example #18
def _get_flexible_source_candidates(src, base_url=None, alternate_suffix=True):
    """Get candidates to try cloning from.

    Primarily to mitigate the problem that git doesn't append /.git
    while cloning from non-bare repos over the dumb protocol (http*).  Also to
    simplify creation of URLs whenever a base URL and a relative path within
    it are provided

    Parameters
    ----------
    src : string or RI
      Full or relative (then considered within base_url if provided) path
    base_url : string or RI, optional
    alternate_suffix : bool
      Whether to generate URL candidates with and without '/.git' suffixes.

    Returns
    -------
    candidates : list of str
      List of RIs (path, url, ssh targets) to try to install from
    """
    candidates = []

    ri = RI(src)
    if isinstance(ri, PathRI) and not isabs(ri.path) and base_url:
        ri = RI(base_url)
        if ri.path.endswith('/.git'):
            base_path = ri.path[:-5]
            base_suffix = '.git'
        else:
            base_path = ri.path
            base_suffix = ''
        if isinstance(ri, PathRI):
            # this is a path, so stay native
            ri.path = normpath(opj(base_path, src, base_suffix))
        else:
            # we are handling a URL, use POSIX path conventions
            ri.path = posixpath.normpath(
                posixpath.join(base_path, src, base_suffix))

    src = str(ri)

    candidates.append(src)
    if alternate_suffix and isinstance(ri, URL):
        if ri.scheme in {'http', 'https'}:
            # additionally try to consider .git:
            if not src.rstrip('/').endswith('/.git'):
                candidates.append(
                    '{0}/.git'.format(src.rstrip('/')))

    return candidates
Example #19
def test_get_multiple_files(path, url, ds_dir):
    from os import listdir
    from datalad.support.network import RI

    file_list = [f for f in listdir(path) if not f.startswith('.')]

    # prepare urls (parsing with RI serves as a sanity check; result unused):
    [RI(url + f) for f in file_list]

    # prepare origin
    origin = Dataset(path).create(force=True)
    origin.add(file_list)
    origin.save("initial")

    ds = install(ds_dir,
                 source=path,
                 result_xfm='datasets',
                 return_type='item-or-list')

    # no content present:
    ok_(not any(ds.repo.file_has_content(file_list)))

    # get two plus an invalid one:
    result = ds.get(['file1.txt', 'file2.txt', 'not_existing.txt'],
                    on_failure='ignore')
    assert_status('impossible', [result[0]])
    assert_status(['ok', 'notneeded'], result[1:])
    # explicitly given not existing file was skipped:
    # (see test_get_invalid_call)
    eq_(set([basename(item.get('path')) for item in result[1:]]),
        {'file1.txt', 'file2.txt'})
    ok_(all(ds.repo.file_has_content(['file1.txt', 'file2.txt'])))

    # get all of them:
    result = ds.get(curdir)
    # there were two files left to get:
    eq_(
        set([
            basename(item.get('path')) for item in result
            if item['type'] == 'file'
        ]), {'file3.txt', 'file4.txt'})
    ok_(all(ds.repo.file_has_content(file_list)))
Example #20
    def _get_ds_remote_shared_setting(ds, name, ssh):
        """Figure out setting of sharedrepository for dataset's `name` remote"""
        shared = None
        try:
            current_super_url = CreateSibling._get_remote_url(ds, name)
            current_super_ri = RI(current_super_url)
            out, err = ssh('git -C {} config --get core.sharedrepository'.format(
                # TODO -- we might need to expanduser taking .user into account
                # but then it must be done also on remote side
                sh_quote(current_super_ri.path)))
            shared = out.strip()
            if err:
                lgr.warning("Got stderr while calling ssh: %s", err)
        except CommandError as e:
            lgr.debug(
                "Could not figure out remote shared setting of %s for %s due "
                "to %s", ds, name, exc_str(e))
            # could well be ok if e.g. not shared
            # TODO: maybe a more detailed analysis?
        return shared
Example #21
    def _prep_connection_args(self, url):
        # parse url:
        from datalad.support.network import RI, is_ssh
        if isinstance(url, RI):
            sshri = url
        else:
            if ':' not in url and '/' not in url:
                # it is just a hostname
                lgr.debug("Assuming %r is just a hostname for ssh connection",
                          url)
                url += ':'
            sshri = RI(url)

        if not is_ssh(sshri):
            raise ValueError("Unsupported SSH URL: '{0}', use "
                             "ssh://host/path or host:path syntax".format(url))

        from datalad import cfg
        identity_file = cfg.get("datalad.ssh.identityfile")
        return sshri, identity_file
Example #22
    def get_connection(self, url):
        """Get a singleton, representing a shared ssh connection to `url`

        Parameters
        ----------
        url: str
          ssh url

        Returns
        -------
        SSHConnection
        """
        # parse url:
        from datalad.support.network import RI, is_ssh
        if isinstance(url, RI):
            sshri = url
        else:
            if ':' not in url and '/' not in url:
                # it is just a hostname
                lgr.debug("Assuming %r is just a hostname for ssh connection",
                          url)
                url += ':'
            sshri = RI(url)

        if not is_ssh(sshri):
            raise ValueError("Unsupported SSH URL: '{0}', use "
                             "ssh://host/path or host:path syntax".format(url))

        conhash = get_connection_hash(sshri.hostname,
                                      port=sshri.port,
                                      username=sshri.username)
        # determine control master:
        ctrl_path = "%s/%s" % (self.socket_dir, conhash)

        # do we know it already?
        if ctrl_path in self._connections:
            return self._connections[ctrl_path]
        else:
            c = SSHConnection(ctrl_path, sshri)
            self._connections[ctrl_path] = c
            return c
Example #23
def _get_installationpath_from_url(url):
    """Returns a relative path derived from the trailing end of a URL

    This can be used to determine an installation path of a Dataset
    from a URL, analogous to what `git clone` does.
    """
    ri = RI(url)
    if isinstance(ri, (URL, DataLadRI)):  # decode only if URL
        path = ri.path.rstrip('/')
        path = urlunquote(path) if path else ri.hostname
        if '/' in path:
            path = path.split('/')
            if path[-1] == '.git':
                path = path[-2]
            else:
                path = path[-1]
    else:
        path = Path(url).parts[-1]
    if path.endswith('.git'):
        path = path[:-4]
    return path
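Traced on hypothetical URLs, the function keeps the trailing path component and strips any .git remnant:

_get_installationpath_from_url('https://example.com/collections/myds.git')
# -> 'myds'
_get_installationpath_from_url('https://example.com/myds/.git')
# -> 'myds'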
Example #24
    def _run_on_ds_ssh_remote(ds, name, ssh, cmd):
        """Given a dataset, and name of the remote, run command via ssh

        Parameters
        ----------
        cmd: str
          Will be .format()'ed given the `path` to the dataset on remote

        Returns
        -------
        out

        Raises
        ------
        CommandError
        """
        remote_url = CreateSibling._get_remote_url(ds, name)
        remote_ri = RI(remote_url)
        out, err = ssh(cmd.format(path=sh_quote(remote_ri.path)))
        if err:
            lgr.warning("Got stderr while calling ssh: %s", err)
        return out
Example #25
def _test_url_quote_path(cls, clskwargs, target_url):
    path = '/ "\';a&b&cd `| '
    if not (cls is PathRI):
        clskwargs['hostname'] = hostname = 'example.com'
    url = cls(path=path, **clskwargs)
    eq_(url.path, path)
    if 'hostname' in clskwargs:
        eq_(url.hostname, hostname)
    # all nasty symbols should be quoted
    url_str = str(url)
    eq_(url_str, target_url)
    # no side-effects:
    eq_(url.path, path)
    if 'hostname' in clskwargs:
        eq_(url.hostname, hostname)

    # and figured out and unquoted
    url_ = RI(url_str)
    ok_(isinstance(url_, cls))
    eq_(url_.path, path)
    if 'hostname' in clskwargs:
        eq_(url.hostname, hostname)
Example #26
def _get_flexible_source_candidates_for_submodule(ds, sm_path, sm_url=None):
    """Retrieve candidates from where to install the submodule

    Even if a url for the submodule is provided explicitly -- urls under the
    parent's tracking-branch remote are tried first.
    """
    clone_urls = []
    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    remote_name, remote_url = _get_tracking_source(ds)

    # Directly on parent's ds url
    if remote_url:
        # attempt: submodule checkout at parent remote URL
        # We might need to quote sm_path portion, e.g. for spaces etc
        if isinstance(RI(remote_url), URL):
            sm_path_url = urlquote(sm_path)
        else:
            sm_path_url = sm_path

        clone_urls.extend(
            _get_flexible_source_candidates(
                # alternate suffixes are tested by `clone` anyways
                sm_path_url,
                remote_url,
                alternate_suffix=False))

    # attempt: provided (configured?) submodule URL
    # TODO: consider supporting DataLadRI here?  or would confuse
    #  git and we wouldn't want that (i.e. not allow pure git clone
    #  --recursive)
    if sm_url:
        clone_urls += _get_flexible_source_candidates(
            sm_url,
            remote_url if remote_url else ds.path,
            alternate_suffix=False)

    return unique(clone_urls)
Example #27
def test_get_multiple_files(path, url, ds_dir):
    from os import listdir
    from datalad.support.network import RI

    file_list = [f for f in listdir(path) if not f.startswith('.')]

    # prepare urls:
    urls = [RI(url + f) for f in file_list]

    # prepare origin
    origin = Dataset(path).create(force=True)
    origin.add(file_list)
    origin.save("initial")

    ds = install(ds_dir, source=path)

    # no content present:
    ok_(not any(ds.repo.file_has_content(file_list)))

    # get two plus an invalid one:
    with assert_raises(IncompleteResultsError) as cme:
        ds.get(['file1.txt', 'file2.txt', 'not_existing.txt'])
    result = cme.exception.results
    # explicitly given not existing file was skipped:
    # (see test_get_invalid_call)
    eq_(set([item.get('file') for item in result]),
        {'file1.txt', 'file2.txt'})
    ok_(all([x['success'] is True
             for x in result if x['file'] in ['file1.txt', 'file2.txt']]))
    ok_(all(ds.repo.file_has_content(['file1.txt', 'file2.txt'])))

    # get all of them:
    result = ds.get(curdir)
    # there were two files left to get:
    eq_(set([item.get('file') for item in result]),
        {'file3.txt', 'file4.txt'})
    ok_(all(ds.repo.file_has_content(file_list)))
Example #28
def postclonecfg_annexdataset(ds, reckless, description=None):
    """If ds "knows annex" -- annex init it, set into reckless etc

    Provides additional tune-up to what is possibly an annex repo, e.g.
    "enables" reckless mode, sets up description
    """
    # in any case check whether we need to annex-init the installed thing:
    if not knows_annex(ds.path):
        # not for us
        return

    # init annex when traces of a remote annex can be detected
    if reckless == 'auto':
        lgr.debug(
            "Instruct annex to hardlink content in %s from local "
            "sources, if possible (reckless)", ds.path)
        ds.config.set(
            'annex.hardlink', 'true', where='local', reload=True)

    lgr.debug("Initializing annex repo at %s", ds.path)
    # Note, that we cannot enforce annex-init via AnnexRepo().
    # If such an instance already exists, its __init__ will not be executed.
    # Therefore do quick test once we have an object and decide whether to call
    # its _init().
    #
    # Additionally, call init if we need to add a description (see #1403),
    # since AnnexRepo.__init__ can only do it with create=True
    repo = AnnexRepo(ds.path, init=True)
    if not repo.is_initialized() or description:
        repo._init(description=description)
    if reckless == 'auto' or (reckless and reckless.startswith('shared-')):
        repo.call_annex(['untrust', 'here'])

    elif reckless == 'ephemeral':
        # with ephemeral we declare 'here' as 'dead' right away, whenever
        # we symlink origin's annex, since availability from 'here' should
        # not be propagated for an ephemeral clone when we publish back to
        # origin.
        # This will cause stuff like this for a locally present annexed file:
        # % git annex whereis d1
        # whereis d1 (0 copies) failed
        # BUT this works:
        # % git annex find . --not --in here
        # % git annex find . --in here
        # d1

        # we don't want annex copy-to origin
        ds.config.set(
            'remote.origin.annex-ignore', 'true',
            where='local')

        ds.repo.set_remote_dead('here')

        if check_symlink_capability(ds.repo.dot_git / 'dl_link_test',
                                    ds.repo.dot_git / 'dl_target_test'):
            # symlink the annex to avoid needless copies in an ephemeral clone
            annex_dir = ds.repo.dot_git / 'annex'
            origin_annex_url = ds.config.get("remote.origin.url", None)
            origin_git_path = None
            if origin_annex_url:
                try:
                    # Deal with file:// scheme URLs as well as plain paths.
                    # If origin isn't local, we have nothing to do.
                    origin_git_path = Path(RI(origin_annex_url).localpath)

                    # we are local; check for a bare repo first to not mess w/
                    # the path
                    if GitRepo(origin_git_path, create=False).bare:
                        # origin is a bare repo -> use path as is
                        pass
                    elif origin_git_path.name != '.git':
                        origin_git_path /= '.git'
                except ValueError:
                    # Note, that accessing localpath on a non-local RI throws
                    # ValueError rather than resulting in an AttributeError.
                    # TODO: Warning level okay or is info level sufficient?
                    # Note, that setting annex-dead is independent of
                    # symlinking .git/annex. It might still make sense to
                    # have an ephemeral clone that doesn't propagate its avail.
                    # info. Therefore don't fail altogether.
                    lgr.warning("reckless=ephemeral mode: origin doesn't seem "
                                "local: %s\nno symlinks being used",
                                origin_annex_url)
            if origin_git_path:
                # TODO make sure that we do not delete any unique data
                rmtree(str(annex_dir)) \
                    if not annex_dir.is_symlink() else annex_dir.unlink()
                annex_dir.symlink_to(origin_git_path / 'annex',
                                     target_is_directory=True)
        else:
            # TODO: What level? + note that annex-dead is independent
            lgr.warning("reckless=ephemeral mode: Unable to create symlinks on "
                        "this file system.")

    srs = {True: [], False: []}  # special remotes by "autoenable" key
    remote_uuids = None  # might be necessary to discover known UUIDs

    repo_config = repo.config
    # Note: The purpose of this function is to inform the user. So if something
    # looks misconfigured, we'll warn and move on to the next item.
    for uuid, config in repo.get_special_remotes().items():
        sr_name = config.get('name', None)
        if sr_name is None:
            lgr.warning(
                'Ignoring special remote %s because it does not have a name. '
                'Known information: %s',
                uuid, config)
            continue
        sr_autoenable = config.get('autoenable', False)
        try:
            sr_autoenable = ensure_bool(sr_autoenable)
        except ValueError:
            lgr.warning(
                'Failed to process "autoenable" value %r for sibling %s in '
                'dataset %s as bool. '
                'You might need to enable it later manually and/or fix it up '
                'to avoid this message in the future.',
                sr_autoenable, sr_name, ds.path)
            continue

        # If it looks like a type=git special remote, make sure we have up to
        # date information. See gh-2897.
        if sr_autoenable and repo_config.get("remote.{}.fetch".format(sr_name)):
            try:
                repo.fetch(remote=sr_name)
            except CommandError as exc:
                lgr.warning("Failed to fetch type=git special remote %s: %s",
                            sr_name, exc_str(exc))

        # determine whether there is a registered remote with matching UUID
        if uuid:
            if remote_uuids is None:
                remote_uuids = {
                    # Check annex-config-uuid first. For sameas annex remotes,
                    # this will point to the UUID for the configuration (i.e.
                    # the key returned by get_special_remotes) rather than the
                    # shared UUID.
                    (repo_config.get('remote.%s.annex-config-uuid' % r) or
                     repo_config.get('remote.%s.annex-uuid' % r))
                    for r in repo.get_remotes()
                }
            if uuid not in remote_uuids:
                srs[sr_autoenable].append(sr_name)

    if srs[True]:
        lgr.debug(
            "configuration for %s %s added because of autoenable,"
            " but no UUIDs for them yet known for dataset %s",
            # since we are only at debug level, we could call things their
            # proper names
            single_or_plural("special remote",
                             "special remotes", len(srs[True]), True),
            ", ".join(srs[True]),
            ds.path
        )

    if srs[False]:
        # if has no auto-enable special remotes
        lgr.info(
            'access to %s %s not auto-enabled, enable with:\n'
            '\t\tdatalad siblings -d "%s" enable -s %s',
            # but since humans might read it, we better confuse them with our
            # own terms!
            single_or_plural("dataset sibling",
                             "dataset siblings", len(srs[False]), True),
            ", ".join(srs[False]),
            ds.path,
            srs[False][0] if len(srs[False]) == 1 else "SIBLING",
        )

    # we have just cloned the repo, so it has 'origin', configure any
    # reachable origin of origins
    yield from configure_origins(ds, ds)
Example #29
def decode_source_spec(spec, cfg=None):
    """Decode information from a clone source specification

    Parameters
    ----------
    spec : str
      Any supported clone source specification
    cfg : ConfigManager, optional
      Configuration will be queried from the instance (i.e. from a particular
      dataset). If None is given, the global DataLad configuration will be
      queried.

    Returns
    -------
    dict
      The value of each decoded property is stored under its own key in this
      dict. By default the following keys are returned: 'type', a specification
      type label {'giturl', 'dataladri', 'ria'}; 'source', the original
      source specification; 'giturl', a URL for the source that is a suitable
      source argument for git-clone; 'version', a version identifier, if
      present (None otherwise); 'default_destpath', a relative path that can
      be used as a clone destination.
    """
    if cfg is None:
        from datalad import cfg
    # standard property dict composition
    props = dict(
        source=spec,
        version=None,
    )

    # Git never gets to see these URLs, so let's manually apply any
    # rewrite configuration Git might know about.
    # Note: We need to rewrite before parsing, otherwise parsing might go wrong.
    # This is particularly true for insteadOf labels replacing even the URL
    # scheme.
    spec = cfg.rewrite_url(spec)
    # common starting point is a RI instance, support for accepting an RI
    # instance is kept for backward-compatibility reasons
    source_ri = RI(spec) if not isinstance(spec, RI) else spec

    # scenario switch, each case must set 'giturl' at the very minimum
    if isinstance(source_ri, DataLadRI):
        # we have got our DataLadRI as the source, so expand it
        props['type'] = 'dataladri'
        props['giturl'] = source_ri.as_git_url()
    elif isinstance(source_ri, URL) and source_ri.scheme.startswith('ria+'):
        # parse a RIA URI
        dsid, version = source_ri.fragment.split('@', maxsplit=1) \
            if '@' in source_ri.fragment else (source_ri.fragment, None)
        uuid_regex = r'[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}'
        if re.match(uuid_regex, dsid):
            trace = '{}/{}'.format(dsid[:3], dsid[3:])
            default_destpath = dsid
        elif dsid.startswith('~'):
            trace = 'alias/{}'.format(dsid[1:])
            default_destpath = dsid[1:]
        else:
            raise ValueError(
                'RIA URI not recognized, no valid dataset ID or other supported '
                'scheme: {}'.format(spec))
        # now we cancel the fragment in the original URL, but keep everything else
        # in order to be able to support the various combinations of ports, paths,
        # and everything else
        source_ri.fragment = ''
        # strip the custom protocol and go with standard one
        source_ri.scheme = source_ri.scheme[4:]
        # take any existing path, and add trace to dataset within the store
        source_ri.path = '{urlpath}{urldelim}{trace}'.format(
            urlpath=source_ri.path if source_ri.path else '',
            urldelim='' if not source_ri.path or source_ri.path.endswith('/') else '/',
            trace=trace,
        )
        props.update(
            type='ria',
            giturl=str(source_ri),
            version=version,
            default_destpath=default_destpath,
        )
    else:
        # let's assume that anything else is a URI that Git can handle
        props['type'] = 'giturl'
        # use original input verbatim
        props['giturl'] = spec

    if 'default_destpath' not in props:
        # if we still have no good idea on where a dataset could be cloned to if no
        # path was given, do something similar to git clone and derive the path from
        # the source
        props['default_destpath'] = _get_installationpath_from_url(props['giturl'])

    return props
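Worked on a hypothetical RIA URI with an alias and a version fragment (store address assumed), the decoding would yield:

decode_source_spec('ria+ssh://store.example.com/store#~myds@v1.0')
# -> {'source': 'ria+ssh://store.example.com/store#~myds@v1.0',
#     'version': 'v1.0',
#     'type': 'ria',
#     'giturl': 'ssh://store.example.com/store/alias/myds',
#     'default_destpath': 'myds'}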
Example #30
def _configure_remote(ds, name, known_remotes, url, pushurl, fetch,
                      description, as_common_datasrc, publish_depends,
                      publish_by_default, annex_wanted, annex_required,
                      annex_group, annex_groupwanted, inherit, get_annex_info,
                      **res_kwargs):
    result_props = dict(action='configure-sibling',
                        path=ds.path,
                        type='sibling',
                        name=name,
                        **res_kwargs)
    if name is None:
        result_props['status'] = 'error'
        result_props['message'] = 'need sibling `name` for configuration'
        yield result_props
        return

    if name != 'here':
        # do all configure steps that are not meaningful for the 'here' sibling
        # AKA the local repo
        if name not in known_remotes:
            # this remote is fresh: make it known
            # just minimalistic name and URL, the rest is coming from `configure`
            ds.repo.add_remote(name, url)
            known_remotes.append(name)
        elif url:
            # not new, override URL if given
            ds.repo.set_remote_url(name, url)

        # make sure we have a configured fetch expression at this point
        fetchvar = 'remote.{}.fetch'.format(name)
        if fetchvar not in ds.repo.config:
            # place default fetch refspec in config
            # same as `git remote add` would have added
            ds.repo.config.add(fetchvar,
                               '+refs/heads/*:refs/remotes/{}/*'.format(name),
                               where='local')

        if pushurl:
            ds.repo.set_remote_url(name, pushurl, push=True)

        if publish_depends:
            # Check if all `deps` remotes are known to the `repo`
            unknown_deps = set(
                assure_list(publish_depends)).difference(known_remotes)
            if unknown_deps:
                result_props['status'] = 'error'
                result_props['message'] = (
                    'unknown sibling(s) specified as publication dependency: %s',
                    unknown_deps)
                yield result_props
                return

        # define config var name for potential publication dependencies
        depvar = 'remote.{}.datalad-publish-depends'.format(name)
        # and default pushes
        dfltvar = "remote.{}.push".format(name)

        if fetch:
            # fetch the remote so we are up to date
            for r in Update.__call__(dataset=res_kwargs['refds'],
                                     path=[dict(path=ds.path, type='dataset')],
                                     sibling=name,
                                     merge=False,
                                     recursive=False,
                                     on_failure='ignore',
                                     return_type='generator',
                                     result_xfm=None):
                # fixup refds
                r.update(res_kwargs)
                yield r

        if inherit:
            # Adjust variables which we should inherit
            delayed_super = _DelayedSuper(ds.repo)
            publish_depends = _inherit_config_var(delayed_super, depvar,
                                                  publish_depends)
            publish_by_default = _inherit_config_var(delayed_super, dfltvar,
                                                     publish_by_default)
            # Copy relevant annex settings for the sibling.
            # Makes sense only if current AND super are annexes, so this
            # effectively forbids having a pure git super
            if isinstance(ds.repo, AnnexRepo) and \
                    isinstance(delayed_super.repo, AnnexRepo):
                if annex_wanted is None:
                    annex_wanted = _inherit_annex_var(delayed_super, name,
                                                      'wanted')
                if annex_required is None:
                    annex_required = _inherit_annex_var(
                        delayed_super, name, 'required')
                if annex_group is None:
                    # I think it might be worth inheriting group regardless of
                    # what the value is
                    #if annex_wanted in {'groupwanted', 'standard'}:
                    annex_group = _inherit_annex_var(delayed_super, name,
                                                     'group')
                if annex_wanted == 'groupwanted' and annex_groupwanted is None:
                    # we better have a value for the expression for that group
                    annex_groupwanted = _inherit_annex_var(
                        delayed_super, name, 'groupwanted')

        if publish_depends:
            if depvar in ds.config:
                # config vars are incremental, so make sure we start from
                # scratch
                ds.config.unset(depvar, where='local', reload=False)
            for d in assure_list(publish_depends):
                lgr.info('Configure additional publication dependency on "%s"',
                         d)
                ds.config.add(depvar, d, where='local', reload=False)
            ds.config.reload()

        if publish_by_default:
            if dfltvar in ds.config:
                ds.config.unset(dfltvar, where='local', reload=False)
            for refspec in assure_list(publish_by_default):
                lgr.info(
                    'Configure additional default publication refspec "%s"',
                    refspec)
                ds.config.add(dfltvar, refspec, 'local')
            ds.config.reload()

        assert isinstance(ds.repo, GitRepo)  # just against silly code
        if isinstance(ds.repo, AnnexRepo):
            # we need to check if added sibling an annex, and try to enable it
            # another part of the fix for #463 and #432
            try:
                if not ds.config.obtain('remote.{}.annex-ignore'.format(name),
                                        default=False,
                                        valtype=EnsureBool(),
                                        store=False):
                    ds.repo.enable_remote(name)
            except CommandError as exc:
                # TODO yield
                # this is unlikely to ever happen, now done for AnnexRepo instances
                # only
                lgr.info("Failed to enable annex remote %s, "
                         "could be a pure git", name)
                lgr.debug("Exception was: %s", exc_str(exc))
            if as_common_datasrc:
                ri = RI(url)
                if isinstance(ri, URL) and ri.scheme in ('http', 'https'):
                    # XXX what if there is already a special remote
                    # of this name? Above check for remotes ignores special
                    # remotes. we need to `git annex dead REMOTE` on reconfigure
                    # before we can init a new one
                    # XXX except it is not enough

                    # make special remote of type=git (see #335)
                    ds.repo._run_annex_command('initremote',
                                               annex_options=[
                                                   as_common_datasrc,
                                                   'type=git',
                                                   'location={}'.format(url),
                                                   'autoenable=true'
                                               ])
                else:
                    yield dict(
                        status='impossible',
                        name=name,
                        message='cannot configure as a common data source, '
                        'URL protocol is not http or https',
                        **result_props)
    #
    # place configure steps that also work for 'here' below
    #
    if isinstance(ds.repo, AnnexRepo):
        for prop, var in (('wanted', annex_wanted),
                          ('required', annex_required), ('group',
                                                         annex_group)):
            if var is not None:
                ds.repo.set_preferred_content(prop, var,
                                              '.' if name == 'here' else name)
        if annex_groupwanted:
            ds.repo.set_groupwanted(annex_group, annex_groupwanted)

    if description:
        if not isinstance(ds.repo, AnnexRepo):
            result_props['status'] = 'impossible'
            result_props[
                'message'] = 'cannot set description of a plain Git repository'
            yield result_props
            return
        ds.repo._run_annex_command('describe',
                                   annex_options=[name, description])

    # report all we know at once
    info = list(
        _query_remotes(ds, name, known_remotes,
                       get_annex_info=get_annex_info))[0]
    info.update(dict(status='ok', **result_props))
    yield info
Example #31
    def __call__(sshurl, name=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None, annex_group=None, annex_groupwanted=None,
                 inherit=False,
                 since=None):
        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #

        # possibly use sshurl to get the name in case if not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL, if no inherit option"
                )
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified"
                )
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc))
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings"
            )
            # may be could be safely dropped -- still WiP

        if not sshurl:
            # TODO: maybe move this further up, before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL"
                    % ds
                )
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
            # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(ds.path, super_ds.path))

        # check the login URL
        sshri = RI(sshurl)
        if not is_ssh(sshri):
            raise ValueError(
                "Unsupported SSH URL: '{0}', "
                "use ssh://host/path or host:path syntax".format(sshurl))

        if not name:
            # use the hostname as default remote name
            name = sshri.hostname
            lgr.debug(
                "No sibling name given, use URL hostname '%s' as sibling name",
                name)

        if since == '':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume a one-to-one mapping of names from local
            # branches to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        #
        # parse the base dataset to find all subdatasets that need processing
        #
        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                # only a single path!
                path=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='create_sibling',
                # neither of the next two should happen anyway
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified=since,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent':
                # this can happen when there is `since`, but we have no
                # use for anything but datasets here
                continue
            checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
                if ap.get('state', None) != 'absent' \
                else []
            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependencies as well?
                unknown_deps = set(assure_list(publish_depends)).difference(checkds_remotes)
                if unknown_deps:
                    ap['status'] = 'error'
                    ap['message'] = (
                        'unknown sibling(s) specified as publication dependency: %s',
                        unknown_deps)
                    yield ap
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                ap['status'] = 'error' if existing == 'error' else 'notneeded'
                ap['message'] = (
                    "sibling '%s' already configured (specify an alternative "
                    "name, or force reconfiguration via --existing)",
                    name)
                yield ap
                continue
            to_process.append(ap)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if target_dir is None:
            if sshri.path:
                target_dir = sshri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
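        # if target_dir carries the %RELNAME placeholder, each (sub)dataset
        # gets its own expansion of that template; without it, the local
        # dataset hierarchy is mirrored under target_dir on the remote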
        replicate_local_structure = "%RELNAME" not in target_dir

        # request ssh connection:
        lgr.info("Connecting ...")
        assert sshurl is not None  # delayed sanity check
        ssh = ssh_manager.get_connection(sshurl)
        if not ssh.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg='on the remote system')

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom, to make the test
        # below valid (existing directories would cause the machinery to halt).
        # But we need to run the post-update hook in depth-first fashion, so
        # we only collect here and run afterwards (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name,
                current_ds,
                ds.path,
                ssh,
                replicate_local_structure,
                sshri,
                target_dir,
                target_url,
                target_pushurl,
                existing,
                shared,
                group,
                publish_depends,
                publish_by_default,
                ui,
                as_common_datasrc,
                annex_wanted,
                annex_group,
                annex_groupwanted,
                inherit
            )
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == ds.path and ui:
                lgr.info("Uploading web interface to %s" % path)
                try:
                    CreateSibling.upload_web_interface(path, ssh, shared, ui)
                except CommandError as e:
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        exc_str(e))
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # running in reverse order gives depth-first (children before parents)
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                ssh("cd {} "
                    "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                    "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if currentds_ap['path'] not in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap
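
The ordering in the loop above is the interesting bit: siblings are created
top-down (paths sorted by depth, so parent directories already exist on the
remote), while the collected post-update hooks are run over the same list in
reverse, i.e. depth-first, children before parents. A minimal sketch of that
pattern, with `create` and `run_hook` as hypothetical stand-ins for the
creation and hook steps:

def process_depth_ordered(dataset_paths, create, run_hook):
    """Create siblings top-down, then trigger hooks bottom-up."""
    # shallower paths first: parents are created before their subdatasets
    ordered = sorted(dataset_paths, key=lambda p: p.count('/'))
    created = [p for p in ordered if create(p)]
    # reversed creation order == children before parents, so a parent's
    # hook (e.g. regenerating a web listing) already sees finished children
    for p in created[::-1]:
        run_hook(p)
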
Example #32
def test_add_source(path, url, ds_dir):
    raise SkipTest('functionality is not supported ATM')
    from os import listdir
    from datalad.support.network import RI

    urls = [RI(url + f) for f in listdir(path)]
    ds = Dataset(ds_dir).create()
    eq_(len(ds.repo.get_annexed_files()), 0)

    # add a remote source to git => fail:
    assert_raises(NotImplementedError, ds.add, source=urls[0], to_git=True)
    # annex add a remote source:
    ds.add(source=urls[0])
    eq_(len(ds.repo.get_annexed_files()), 1)

    # add two remote sources and give them local names:
    ds.add(path=['local1.dat', 'local2.dat'], source=urls[1:3])
    annexed = ds.repo.get_annexed_files()
    eq_(len(annexed), 3)
    assert_in('local1.dat', annexed)
    assert_in('local2.dat', annexed)

    # add a second source for one of them
    ds.add(path='local1.dat', source=urls[3])
    eq_(len(annexed), 3)
    whereis_dict = ds.repo.whereis('local1.dat', output='full')
    reg_urls = [whereis_dict[uuid]['urls'] for uuid in whereis_dict
                if not whereis_dict[uuid]['here']]
    eq_(len(reg_urls), 1)  # one remote for 'local1.dat', that is not "here"
    eq_({str(urls[1]), str(urls[3])},
        set(reg_urls[0]))

    # just to be sure compare to 'local2.dat':
    whereis_dict = ds.repo.whereis('local2.dat', output='full')
    reg_urls = [whereis_dict[uuid]['urls'] for uuid in whereis_dict
                if not whereis_dict[uuid]['here']]
    eq_(len(reg_urls), 1)  # one remote for 'local2.dat', that is not "here"
    eq_([urls[2]], reg_urls[0])

    # provide more paths than sources:
    # report failure on non-existing 'local4.dat':
    result = ds.add(path=['local3.dat', 'local4.dat'], source=urls[4])
    ok_(all([r['success'] is False and r['note'] == 'not found'
             for r in result if r['file'] == 'local4.dat']))

    with open(opj(ds.path, 'local4.dat'), 'w') as f:
        f.write('local4 content')

    ds.add(path=['local3.dat', 'local4.dat'], source=urls[4])
    annexed = ds.repo.get_annexed_files()
    eq_(len(annexed), 5)
    assert_in('local3.dat', annexed)
    assert_in('local4.dat', annexed)

    # 'local3.dat' has a remote source
    whereis_dict = ds.repo.whereis('local3.dat', output='full')
    reg_urls = [whereis_dict[uuid]['urls'] for uuid in whereis_dict
                if not whereis_dict[uuid]['here']]
    eq_(len(reg_urls), 1)  # one remote for 'local3.dat', that is not "here"
    eq_([urls[4]], reg_urls[0])

    # 'local4.dat' has no remote source
    whereis_dict = ds.repo.whereis('local4.dat', output='full')
    reg_urls = [whereis_dict[uuid]['urls'] for uuid in whereis_dict
                if not whereis_dict[uuid]['here']]
    eq_(len(reg_urls), 0)

    # provide more sources than paths:
    ds.add('local5.dat', source=urls[5:])
    annexed = ds.repo.get_annexed_files()
    assert_in('local5.dat', annexed)
    eq_(len(annexed), 5 + len(urls[5:]))

    # Note: 'local4.dat' didn't come from a URL,
    # but 'local1.dat' consumes two URLs
    eq_(len(annexed), len(urls))
    # all files annexed (-2 for '.git' and '.datalad'):
    eq_(len(annexed), len(listdir(ds.path)) - 2)
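
The repeated whereis post-processing in this test could be captured in a
small helper. A sketch assuming the dict shape produced by
whereis(..., output='full'), i.e. remote UUIDs mapping to records with
'here' and 'urls' keys (the helper name is illustrative):

def remote_urls(whereis_full):
    """Collect registered URLs from all remotes that are not 'here'."""
    return [url
            for record in whereis_full.values()
            if not record['here']
            for url in record['urls']]

# e.g. for 'local1.dat' above one would then expect:
# eq_({str(urls[1]), str(urls[3])}, set(remote_urls(whereis_dict)))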