コード例 #1
ファイル: test_gitrepo.py プロジェクト: hanke/datalad
def test_GitRepo_get_toppath(repo, tempdir, repo2):
    """Exercise ``GitRepo.get_toppath`` on a repository root, on a path
    nested below a repository, and on a directory outside of any repository.
    """
    # Repository root: with follow_up=False the result is compared against
    # the realpath, by default against the path exactly as it was passed in.
    resolved_repo = op.realpath(repo)
    eq_(GitRepo.get_toppath(repo, follow_up=False), resolved_repo)
    eq_(GitRepo.get_toppath(repo), repo)

    # A path two levels below a repository created here must report that
    # repository as its top.
    GitRepo(repo2, create=True)
    resolved_repo2 = op.realpath(repo2)
    deep_path = op.join(repo2, "d1", "d2")
    eq_(GitRepo.get_toppath(deep_path, follow_up=False), resolved_repo2)
    eq_(GitRepo.get_toppath(deep_path), repo2)

    # A directory that is not under git yields None.
    eq_(GitRepo.get_toppath(tempdir), None)
コード例 #2
ファイル: test_gitrepo.py プロジェクト: akeshavan/datalad
def test_GitRepo_get_toppath(repo, tempdir, repo2):
    """Verify the top path reported by ``GitRepo.get_toppath``.

    Covers a repository root, a path nested inside a repository, and a
    directory that does not belong to any repository.
    """

    def check_top(query_path, expected_root):
        # follow_up=False is compared with the realpath of the root, the
        # default call with the root exactly as given.
        real_root = op.realpath(expected_root)
        eq_(GitRepo.get_toppath(query_path, follow_up=False), real_root)
        eq_(GitRepo.get_toppath(query_path), expected_root)

    check_top(repo, repo)

    # Create a second repository and query a path nested inside it
    GitRepo(repo2, create=True)
    check_top(op.join(repo2, "d1", "d2"), repo2)

    # Outside of any git repository the answer is None
    eq_(GitRepo.get_toppath(tempdir), None)
コード例 #3
ファイル: dataset.py プロジェクト: silky/datalad
    def get_superdataset(self, datalad_only=False, topmost=False):
        """Get the dataset's superdataset

        Walks up from ``self.path`` one parent directory at a time and asks
        ``GitRepo.get_toppath`` for the repository containing that parent.

        Parameters
        ----------
        datalad_only : bool, optional
          Either to consider only "datalad datasets" (with non-None
          id), or (if False, which is default) - any git repository
        topmost : bool, optional
          Return the topmost super-dataset. Might then be the current one.

        Returns
        -------
        Dataset or None
          None if no superdataset was found (and `topmost` is False).
        """
        # NOTE(review): the excerpt this was restored from had lost the
        # docstring terminator and the three loop-exit statements; the
        # `break`s below follow the adjacent comments -- confirm upstream.

        # TODO: return only if self is subdataset of the superdataset
        #       (meaning: registered as submodule)?
        path = self.path
        # with `topmost` the dataset itself is the fallback answer
        sds_path = path if topmost else None
        while path:
            # normalize the path after adding .. so we guaranteed to not
            # follow into original directory if path itself is a symlink
            par_path = normpath(opj(path, pardir))
            sds_path_ = GitRepo.get_toppath(par_path)
            if sds_path_ is None:
                # no more parents, use previous found
                break

            if datalad_only:
                # test if current git is actually a dataset?
                sds = Dataset(sds_path_)
                # can't use ATM since we just autogenerate and ID, see
                # https://github.com/datalad/datalad/issues/986
                # if not sds.id:
                if not sds.config.get('datalad.dataset.id', None):
                    # plain git repository, not a datalad dataset: stop here
                    # and keep whatever was found before
                    break

            # That was a good candidate
            sds_path = sds_path_
            path = par_path
            if not topmost:
                # no looping
                break

        if sds_path is None:
            # None was found
            return None

        # No postprocessing now should be necessary since get_toppath
        # tries its best to not resolve symlinks now

        return Dataset(sds_path)
コード例 #4
ファイル: dataset.py プロジェクト: silky/datalad
def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument,
    or based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is
      properly installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      The resolved dataset.

    Raises
    ------
    NoDatasetArgumentFound
      If `dataset` is None and no dataset is found from the working
      directory.
    ValueError
      If `check_installed` is set and the dataset is not installed.
    """
    # NOTE(review): restored from a truncated excerpt (docstring terminator
    # and the tail of the final error message were missing); the message
    # tail below is an assumption -- confirm against upstream.
    if dataset is not None and not isinstance(dataset, Dataset):
        dataset = Dataset(dataset)

    if dataset is None:  # possible scenario of cmdline calls
        dspath = GitRepo.get_toppath(getpwd())
        if not dspath:
            raise NoDatasetArgumentFound("No dataset found")
        dataset = Dataset(dspath)

    assert (dataset is not None)
    lgr.debug("Resolved dataset{0}: {1}".format(
        ' for {}'.format(purpose) if purpose else '', dataset))

    if check_installed and not dataset.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(dataset.path))

    return dataset
コード例 #5
    def __call__(url, dataset=None, recursive=False):
        # Set the "url" option of every section reported by the module
        # parser of the dataset's repository to `url`, with "%NAME" replaced
        # by the submodule name ("/" turned into "-").
        #
        # url: URL template, may contain the "%NAME" placeholder
        # dataset: a Dataset, a value accepted by Dataset(), or None to use
        #   the dataset found at or above the process working directory
        # recursive: if true, also process the repositories of everything
        #   returned by ds.get_dataset_handles(recursive=True)
        # Raises ValueError if no dataset is found or it is not installed.
        #
        # NOTE(review): restored from a truncated excerpt; the tail of the
        # "No installed dataset" message is an assumption -- confirm upstream.

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the current working directory of the process:
        if ds is None:
            # try to find a dataset at or above PWD:
            dspath = GitRepo.get_toppath(getpwd())
            if dspath is None:
                raise ValueError("No dataset found at %s." % getpwd())
            ds = Dataset(dspath)
        assert (ds is not None)

        if not ds.is_installed():
            raise ValueError("No installed dataset found at "
                             "{0}.".format(ds.path))
        assert (ds.repo is not None)

        repos_to_update = [ds.repo]
        if recursive:
            repos_to_update += [
                GitRepo(opj(ds.path, sub_path))
                for sub_path in ds.get_dataset_handles(recursive=True)
            ]

        for handle_repo in repos_to_update:
            parser = get_module_parser(handle_repo)
            for submodule_section in parser.sections():
                # presumably strips the leading 'submodule "' (11 chars) and
                # the trailing '"' from the section header -- TODO confirm
                submodule_name = submodule_section[11:-1]
                parser.set_value(
                    submodule_section, "url",
                    url.replace("%NAME", submodule_name.replace("/", "-")))

        return  # TODO: return value?
コード例 #6
    def __call__(url, dataset=None, recursive=False):
        # Rewrite the "url" option of each section of the module parser of
        # the dataset's repository (and, with `recursive`, of the
        # repositories of its dataset handles) from the template `url`;
        # "%NAME" is replaced by the submodule name with "/" turned into "-".
        #
        # Raises ValueError if no dataset is found at or above the working
        # directory, or if the dataset is not installed.
        #
        # NOTE(review): restored from a truncated excerpt; the tail of the
        # "No installed dataset" message is an assumption -- confirm upstream.

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the current working directory of the process:
        if ds is None:
            # try to find a dataset at or above PWD:
            dspath = GitRepo.get_toppath(getpwd())
            if dspath is None:
                raise ValueError("No dataset found at %s." % getpwd())
            ds = Dataset(dspath)
        assert(ds is not None)

        if not ds.is_installed():
            raise ValueError("No installed dataset found at "
                             "{0}.".format(ds.path))
        assert(ds.repo is not None)

        repos_to_update = [ds.repo]
        if recursive:
            repos_to_update += [GitRepo(opj(ds.path, sub_path))
                                for sub_path in
                                ds.get_dataset_handles(recursive=True)]

        for handle_repo in repos_to_update:
            parser = get_module_parser(handle_repo)
            for submodule_section in parser.sections():
                # presumably drops the 'submodule "' prefix (11 chars) and
                # the closing '"' of the section header -- TODO confirm
                submodule_name = submodule_section[11:-1]
                parser.set_value(submodule_section, "url",
                                 url.replace("%NAME",
                                             submodule_name.replace("/", "-")))

        return  # TODO: return value?
コード例 #7
    def __call__(dataset=None,
        # Adds a git remote called `name` to the dataset's repository and,
        # with `recursive`, to the repositories of its dataset handles.
        # Returns the `successfully_added` list.
        # NOTE(review): the rest of the signature is missing from this
        # excerpt; the body reads `name`, `url`, `pushurl`, `recursive` and
        # `force`, which are presumably the remaining parameters.

        # TODO: Detect malformed URL and fail?

        if name is None or (url is None and pushurl is None):
            raise ValueError("""insufficient information to add a sibling
                (needs at least a dataset, a name and an URL).""")
        # fall back to the push URL if no (fetch) URL was given
        if url is None:
            url = pushurl

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if ds is None:
            # try to find a dataset at or above CWD
            dspath = GitRepo.get_toppath(abspath(getpwd()))
            if dspath is None:
                raise ValueError("No dataset found at or above {0}.".format(
                # NOTE(review): the argument/closing of this call is missing
                # from the excerpt
            ds = Dataset(dspath)
            lgr.debug("Resolved dataset for target creation: {0}".format(ds))

        assert (ds is not None and name is not None and url is not None)

        if not ds.is_installed():
            raise ValueError("Dataset {0} is not installed yet.".format(ds))
        assert (ds.repo is not None)

        # map "<dataset basename>[/<subdataset>]" -> {'repo': ..., 'url': ...,
        # 'pushurl': ...}; 'url'/'pushurl' are filled in further below
        ds_basename = basename(ds.path)
        repos = {ds_basename: {'repo': ds.repo}}
        if recursive:
            for subds in ds.get_dataset_handles(recursive=True):
                sub_path = opj(ds.path, subds)
                repos[ds_basename + '/' + subds] = {
                    #                repos[subds] = {
                    'repo': GitRepo(sub_path, create=False)
                # NOTE(review): the closing brace of this dict is missing
                # from the excerpt

        # Note: This is copied from create_publication_target_sshwebserver
        # as it is the same logic as for its target_dir.
        # TODO: centralize and generalize template symbol handling
        # TODO: Check pushurl for template symbols too. Probably raise if only
        #       one of them uses such symbols

        # without a "%NAME" placeholder the URL is extended per subdataset
        # (see the _urljoin calls below)
        replicate_local_structure = False
        if "%NAME" not in url:
            replicate_local_structure = True

        for repo in repos:
            if not replicate_local_structure:
                repos[repo]['url'] = url.replace("%NAME",
                                                 repo.replace("/", "-"))
                if pushurl:
                    repos[repo]['pushurl'] = pushurl.replace(
                        "%NAME", repo.replace("/", "-"))
                # NOTE(review): an `else:` presumably belongs here (lost in
                # the excerpt); as shown the assignments below would
                # overwrite the templated values
                repos[repo]['url'] = url
                if pushurl:
                    repos[repo]['pushurl'] = pushurl

                if repo != ds_basename:
                    # append the subdataset part of the key (everything after
                    # "<ds_basename>/") to the URL
                    repos[repo]['url'] = _urljoin(repos[repo]['url'],
                                                  repo[len(ds_basename) + 1:])
                    if pushurl:
                        # NOTE(review): the first argument of this _urljoin
                        # call appears to be missing from the excerpt
                        repos[repo]['pushurl'] = _urljoin(
                            repo[len(ds_basename) + 1:])

        # collect existing remotes:
        already_existing = list()
        conflicting = list()
        for repo in repos:
            if name in repos[repo]['repo'].git_get_remotes():
                lgr.debug("""Remote '{0}' already exists
                          in '{1}'.""".format(name, repo))

                existing_url = repos[repo]['repo'].git_get_remote_url(name)
                existing_pushurl = \
                    repos[repo]['repo'].git_get_remote_url(name, push=True)

                # URLs are compared ignoring trailing slashes
                if repos[repo]['url'].rstrip('/') != existing_url.rstrip('/') \
                        or (pushurl and existing_pushurl and
                            repos[repo]['pushurl'].rstrip('/') !=
                                    existing_pushurl.rstrip('/')) \
                        or (pushurl and not existing_pushurl):
                # NOTE(review): the body of this `if` is missing from the
                # excerpt; nothing shown here ever fills `already_existing`
                # or `conflicting`

        if not force and conflicting:
            raise RuntimeError("Sibling '{0}' already exists with conflicting"
                               " URL for {1} dataset(s). {2}".format(
                                   name, len(conflicting), conflicting))

        runner = Runner()
        successfully_added = list()
        for repo in repos:
            if repo in already_existing:
                if repo not in conflicting:
                    lgr.debug("Skipping {0}. Nothing to do.".format(repo))
                    # NOTE(review): a `continue` presumably followed here
                # rewrite url
                cmd = ["git", "remote", "set-url", name, repos[repo]['url']]
                runner.run(cmd, cwd=repos[repo]['repo'].path)
                # NOTE(review): an `else:` presumably preceded the "add"
                # below; as shown both commands run for existing remotes
                # add the remote
                cmd = ["git", "remote", "add", name, repos[repo]['url']]
                runner.run(cmd, cwd=repos[repo]['repo'].path)
            if pushurl:
                cmd = [
                    "git", "remote", "set-url", "--push", name,
                # NOTE(review): the push URL argument and the closing bracket
                # are missing from the excerpt
                runner.run(cmd, cwd=repos[repo]['repo'].path)

        # NOTE(review): nothing shown here appends to `successfully_added`
        return successfully_added
コード例 #8
    def __call__(dataset=None, path=None, source=None, recursive=False,
        # Installation entry point: resolves the target dataset from
        # `dataset`/`path`/`source`, obtains its repository if necessary and
        # then installs `path` into it, dispatching on the exceptions
        # FileInGitError / FileNotInAnnexError / IOError below.
        # NOTE(review): the rest of the signature is missing from this
        # excerpt (the body also reads `add_data_to_git`), and several
        # `try:` / `else:` / call-opening lines were lost as well; the notes
        # below mark the places that are visible from the remaining code.
        lgr.debug("Installation attempt started")
        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        if isinstance(path, list):
            if not len(path):
                # normalize value to expected state when nothing was provided
                path = None
            elif len(path) == 1:
                # we can simply continue with the function as called with a
                # single argument
                path = path[0]
                # NOTE(review): an `else:` presumably preceded the lines
                # below; the arguments of Install.__call__ are truncated
                lgr.debug("Installation of multiple targets was requested: {0}".format(path))
                return [Install.__call__(
                        recursive=recursive) for p in path]

        # resolve the target location against the provided dataset
        if path is not None:
            # make sure it is not a URL, `resolve_path` cannot handle that
            if is_url(path):
                # NOTE(review): a `try:` is missing here in the excerpt
                    path = get_local_path_from_url(path)
                    path = resolve_path(path, ds)
                except ValueError:
                    # URL doesn't point to a local something
                # NOTE(review): the handler body and an `else:` are missing
                path = resolve_path(path, ds)

        # any `path` argument that point to something local now resolved and
        # is no longer a URL

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved target location (that is now guaranteed to
        # be specified, but only if path isn't a URL (anymore) -> special case,
        # handles below
        if ds is None and path is not None and not is_url(path):
            # try to find a dataset at or above the installation target
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)

        if ds is None and source is None and path is not None:
            # no dataset, no source
            # this could be a shortcut install call, where the first
            # arg identifies the source
            if is_url(path) or os.path.exists(path):
                # we have an actual URL -> this should be the source
                # OR
                # it is not a URL, but it exists locally
                # NOTE(review): the opening of the logging call that owns
                # the two string lines below is missing
                    "Single argument given to install and no dataset found. "
                    "Assuming the argument identifies a source location.")
                source = path
                path = None

        lgr.debug("Resolved installation target: {0}".format(path))

        if ds is None and path is None and source is not None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            # NOTE(review): opening and closing of the logging call around
            # the following message are missing
                "Neither dataset not target installation path provided. "
                "Assuming installation of a remote dataset. "
                "Deriving destination path from given source {0}".format(
            ds = Dataset(_installationpath_from_url(source))

        if not path and ds is None:
            # no dataset, no target location, nothing to do
            raise InsufficientArgumentsError(
                "insufficient information for installation (needs at "
                "least a dataset or an installation path")

        assert(ds is not None)

        lgr.debug("Resolved target dataset for installation: {0}".format(ds))

        vcs = ds.repo
        if vcs is None:
            # TODO check that a "ds.path" actually points to a TOPDIR
            # should be the case already, but maybe nevertheless check
            # NOTE(review): a `try:` is missing here in the excerpt
                with swallow_logs():
                    vcs = Install._get_new_vcs(ds, source, vcs)
            except GitCommandError:
                lgr.debug("Cannot retrieve from URL: {0}".format(source))
                # maybe source URL was missing a '/.git'
                if source and not source.rstrip('/').endswith('/.git'):
                    source = '{0}/.git'.format(source.rstrip('/'))
                    lgr.debug("Attempt to retrieve from URL: {0}".format(source))
                    vcs = Install._get_new_vcs(ds, source, vcs)
                    # NOTE(review): the line below presumably belonged to an
                    # `else:` branch that was lost
                    lgr.debug("Unable to establish repository instance at: {0}".format(ds.path))

        assert(ds.repo)  # is automagically re-evaluated in the .repo property

        runner = Runner()

        if path is None or path == ds.path:
            # if the goal was to install this dataset, we are done,
            # except for 'recursive'.

            # TODO: For now 'recursive' means just submodules.
            # See --with-data vs. -- recursive and figure it out
            if recursive:
                for sm in ds.repo.get_submodules():
                    # NOTE(review): the callee name of this call is missing
                        ds, sm.path, sm.url, recursive=recursive)
            return ds

        # at this point this dataset is "installed", now we can test whether to
        # install something into the dataset

        # needed by the logic below

        # express the destination path relative to the root of this dataset
        relativepath = relpath(path, start=ds.path)
        # NOTE(review): this tests `path`, not the just-computed
        # `relativepath` -- looks unintended, confirm
        if path.startswith(pardir):
            raise ValueError("installation path outside dataset")

        # NOTE(review): the opening of the logging call is missing
            "Resolved installation target relative to dataset {0}: {1}".format(
                ds, relativepath))

        # this dataset must already know everything necessary
        # FLOW GUIDE
        # at this point we know nothing about the
        # installation targether
        # NOTE(review): a `try:` is missing here; the `except` clauses
        # further down belong to it
            # it is simplest to let annex tell us what we are dealing with
            lgr.debug("Trying to fetch file %s using annex", relativepath)
            if not isinstance(vcs, AnnexRepo):
                assert(isinstance(vcs, GitRepo))
                # FLOW GUIDE
                # this is not an annex repo, but we raise exceptions
                # to be able to treat them alike in the special case handling
                # below
                if not exists(path):
                    raise IOError("path doesn't exist yet, might need special handling")
                elif relativepath in vcs.get_indexed_files():
                    # relativepath is in git
                    raise FileInGitError("We need to handle it as known to git")
                    # NOTE(review): an `else:` presumably preceded the raise
                    # below; as shown it is unreachable
                    raise FileNotInAnnexError("We don't have yet annex repo here")
            if vcs.get_file_key(relativepath):
                # FLOW GUIDE EXIT POINT
                # this is an annex'ed file -> get it
                # TODO implement `copy --from` using `source`
                # TODO fail if `source` is something strange
                # NOTE(review): no call that fetches the file is visible here
                # return the absolute path to the installed file
                return path

        except FileInGitError:
            # FLOW GUIDE
            # `path` is either
            # - a  file already checked into Git
            # - known submodule
            lgr.log(5, "FileInGitError logic")
            if source is not None:
                raise FileInGitError("File %s is already in git. Specifying source (%s) makes no sense"
                                     % (path, source))
            # file is checked into git directly -> nothing to do
            # OR this is a submodule of this dataset
            submodule = [sm for sm in ds.repo.get_submodules()
                         if sm.path == relativepath]
            if not len(submodule):
                # FLOW GUIDE EXIT POINT
                # this is a file in Git and no submodule, just return its path
                lgr.debug("Don't act, data already present in Git")
                return path
            elif len(submodule) > 1:
                raise RuntimeError(
                    "more than one submodule registered at the same path?")
            submodule = submodule[0]

            # we are dealing with a known submodule (i.e. `source`
            # doesn't matter) -> check it out
            lgr.debug("Install subdataset at: {0}".format(submodule.path))
            subds = _install_subds_from_flexible_source(
                ds, submodule.path, submodule.url, recursive=recursive)
            return subds

        except FileNotInAnnexError:
            # FLOW GUIDE
            # `path` is either
            # - content of a subdataset
            # - an untracked file in this dataset
            # - an entire untracked/unknown existing subdataset
            lgr.log(5, "FileNotInAnnexError logic")
            subds = get_containing_subdataset(ds, relativepath)
            if ds.path != subds.path:
                # FLOW GUIDE EXIT POINT
                # target path belongs to a known subdataset, hand
                # installation over to it
                # NOTE(review): the remaining arguments of this call are
                # missing from the excerpt
                return subds.install(
                    path=relpath(path, start=subds.path),

            # FLOW GUIDE
            # this must be an untracked/existing something, so either
            # - a file
            # - a directory
            # - an entire repository
            if exists(opj(path, '.git')):
                # FLOW GUIDE EXIT POINT
                # this is an existing repo and must be in-place turned into
                # a submodule of this dataset
                return _install_subds_inplace(
                    ds, path, relativepath, source, runner)

            # - untracked file or directory in this dataset
            if isdir(path) and not recursive:
                # this is a directory and we want --recursive for it
                raise ValueError(
                    "installation of a directory requires the `recursive` flag")

            # few sanity checks
            if source and abspath(source) != path:
                # NOTE(review): the message uses '{0}' for both target and
                # source and passes (source, path), so `source` is printed
                # twice and `path` never; the closing ')' is also absent
                raise ValueError(
                    "installation target already exists, but `source` points to "
                    "another location (target: '{0}', source: '{0}'".format(
                        source, path))

            if not add_data_to_git and not (isinstance(vcs, AnnexRepo)):
                raise RuntimeError(
                    "Trying to install file(s) into a dataset "
                    "with a plain Git repository. First initialize annex, or "
                    "provide override flag.")

            # switch `add` procedure between Git and Git-annex according to flag
            # NOTE(review): the branches below lost their `else:` lines (and
            # possibly the git-add call); as shown each assignment to
            # `added_files` overwrites the previous one
            if add_data_to_git:
                added_files = resolve_path(relativepath, ds)
                # do a blunt `annex add`
                added_files = vcs.annex_add(relativepath)
                # return just the paths of the installed components
                if isinstance(added_files, list):
                    added_files = [resolve_path(i['file'], ds) for i in added_files]
                    added_files = resolve_path(added_files['file'], ds)
            if added_files:
                return added_files
                return None

        except IOError:
            # FLOW GUIDE
            # more complicated special cases -- `path` is either
            # - a file/subdataset in a not yet initialized but known
            #   submodule
            # - an entire untracked/unknown existing subdataset
            # - non-existing content that should be installed from `source`
            lgr.log(5, "IOError logic")
            # we can end up here in two cases ATM
            if (exists(path) or islink(path)) or source is None:
                # FLOW GUIDE
                # - target exists but this dataset's VCS rejects it,
                #   so it should be part of a subdataset
                # or
                # - target doesn't exist, but no source is given, so
                #   it could be a handle that is actually contained in
                #   a not yet installed subdataset
                subds = get_containing_subdataset(ds, relativepath)
                if ds.path != subds.path:
                    # FLOW GUIDE
                    # target path belongs to a subdataset, hand installation
                    # over to it
                    if not subds.is_installed():
                        # FLOW GUIDE
                        # we are dealing with a target in a not yet
                        # available but known subdataset -> install it first
                        ds.install(subds.path, recursive=recursive)
                    # NOTE(review): the remaining arguments of this call are
                    # missing from the excerpt
                    return subds.install(
                        path=relpath(path, start=subds.path),

                # FLOW GUIDE EXIT POINT
                raise InsufficientArgumentsError(
                    "insufficient information for installation: the "
                    "installation target {0} doesn't exists, isn't a "
                    "known handle of dataset {1}, and no `source` "
                    "information was provided.".format(path, ds))

            if not source:
                # FLOW GUIDE EXIT POINT
                raise InsufficientArgumentsError(
                    "insufficient information for installation: the "
                    "installation target {0} doesn't exists, isn't a "
                    "known handle of dataset {1}, and no `source` "
                    "information was provided.".format(path, ds))

            source_path = expandpath(source)
            if exists(source_path):
                # FLOW GUIDE EXIT POINT
                # this could be
                # - local file
                # - local directory
                # - repository outside the dataset
                # we only want to support the last case of locally cloning
                # a repo -- fail otherwise
                if exists(opj(source_path, '.git')):
                    return _install_subds_from_flexible_source(
                        ds, relativepath, source_path, recursive)

                raise ValueError(
                    "installing individual local files or directories is not "
                    "supported, copy/move them into the dataset first")

            # FLOW GUIDE
            # `source` is non-local, it could be:
            #   - repository
            #   - file
            # we have no further evidence, hence we need to try
            # NOTE(review): a `try:` is missing here in the excerpt
                # FLOW GUIDE EXIT POINT
                # assume it is a dataset
                return _install_subds_from_flexible_source(
                    ds, relativepath, source, recursive)
            except CommandError:
                # FLOW GUIDE EXIT POINT
                # apparently not a repo, assume it is a file url
                vcs.annex_addurl_to_file(relativepath, source)
                return path
コード例 #9
    def __call__(sshurl,
        # Creates target repositories on a remote host reached via ssh (one
        # per dataset in `datasets`), configures them through a shared ssh
        # control master connection and, if `target` is given, registers the
        # result as a sibling via AddSibling.
        # NOTE(review): the rest of the signature is missing from this
        # excerpt (the body reads `target`, `target_dir`, `target_url`,
        # `target_pushurl`, `dataset`, `recursive`, `existing`, `shared`),
        # and many `try:` / `else:` / call lines were lost; the notes below
        # mark the places that are visible from the remaining code.

        if sshurl is None:
            raise ValueError("""insufficient information for target creation
            (needs at least a dataset and a SSH URL).""")

        if target is None and (target_url is not None
                               or target_pushurl is not None):
            raise ValueError("""insufficient information for adding the target
            as a sibling (needs at least a name)""")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if ds is None:
            # try to find a dataset at or above CWD
            dspath = GitRepo.get_toppath(abspath(getpwd()))
            if dspath is None:
                raise ValueError("""No dataset found
                                 at or above {0}.""".format(getpwd()))
            ds = Dataset(dspath)
            lgr.debug("Resolved dataset for target creation: {0}".format(ds))
        assert (ds is not None and sshurl is not None)

        if not ds.is_installed():
            raise ValueError(
                """Dataset {0} is not installed yet.""".format(ds))
        assert (ds.repo is not None)

        # determine target parameters:
        parsed_target = urlparse(sshurl)
        host_name = parsed_target.netloc

        # TODO: Sufficient to fail on this condition?
        if not parsed_target.netloc:
            raise ValueError("Malformed URL: {0}".format(sshurl))

        if target_dir is None:
            if parsed_target.path:
                target_dir = parsed_target.path
                # NOTE(review): an `else:` presumably preceded the line
                # below; as shown it always overrides the URL path
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = False
        if "%NAME" not in target_dir:
            replicate_local_structure = True

        # collect datasets to use:
        datasets = dict()
        datasets[basename(ds.path)] = ds
        if recursive:
            for subds in ds.get_dataset_handles(recursive=True):
                sub_path = opj(ds.path, subds)
                # TODO: when enhancing Dataset/*Repo classes and therefore
                # adapt to moved code, make proper distinction between name and
                # path of a submodule, which are technically different. This
                # probably will become important on windows as well as whenever
                # we want to allow for moved worktrees.
                # NOTE(review): the right-hand side of the assignment below
                # is missing from the excerpt
                datasets[basename(ds.path) + '/' + subds] = \

        # setup SSH Connection:
        # TODO: Make the entire setup a helper to use it when pushing via
        # publish?

        # - build control master:
        from datalad.utils import assure_dir
        from os import geteuid  # Linux specific import
        var_run_user_datalad = "/var/run/user/%s/datalad" % geteuid()
        # NOTE(review): `assure_dir` is imported but no call creating
        # `var_run_user_datalad` is visible in this excerpt
        control_path = "%s/%s" % (var_run_user_datalad, host_name)
        control_path += ":%s" % parsed_target.port if parsed_target.port else ""

        # - start control master:
        # NOTE(review): the command is a shell string run with shell=True and
        # embeds `host_name` taken from the caller-supplied URL; the other
        # ssh invocations below use argument lists instead
        cmd = "ssh -o ControlMaster=yes -o \"ControlPath=%s\" " \
              "-o ControlPersist=yes %s exit" % (control_path, host_name)
        lgr.debug("Try starting control master by calling:\n%s" % cmd)
        import subprocess
        proc = subprocess.Popen(cmd, shell=True)
        proc.communicate(input="\n")  # why the f.. this is necessary?

        runner = Runner()
        # every remote command below is sent through the control master socket
        ssh_cmd = ["ssh", "-S", control_path, host_name]

        lgr.info("Creating target datasets ...")
        for current_dataset in datasets:
            if not replicate_local_structure:
                path = target_dir.replace("%NAME",
                                          current_dataset.replace("/", "-"))
                # NOTE(review): an `else:` presumably preceded the following
                # lines; the normpath( call is truncated
                # TODO: opj depends on local platform, not the remote one.
                # check how to deal with it. Does windows ssh server accept
                # posix paths? vice versa? Should planned SSH class provide
                # tools for this issue?
                path = normpath(

            if path != '.':
                # check if target exists
                # TODO: Is this condition valid for != '.' only?
                path_exists = True
                cmd = ssh_cmd + ["ls", path]
                # NOTE(review): a `try:` is missing and the runner.run( call
                # below is truncated
                    out, err = runner.run(cmd,
                except CommandError as e:
                    # a failing `ls` that names the path in its error output
                    # is taken to mean "does not exist"
                    if "No such file or directory" in e.stderr and \
                                    path in e.stderr:
                        path_exists = False
                        # NOTE(review): an `else:` presumably preceded the
                        # re-raise below
                        raise  # It's an unexpected failure here

                if path_exists:
                    if existing == 'raise':
                        raise RuntimeError(
                            "Target directory %s already exists." % path)
                    # NOTE(review): the bodies of the 'skip' and 'replace'
                    # branches, an `else:` and the tail of the ValueError
                    # call are missing from the excerpt
                    elif existing == 'skip':
                    elif existing == 'replace':
                        raise ValueError(
                            "Do not know how to hand existing=%s" %

                cmd = ssh_cmd + ["mkdir", "-p", path]
                # NOTE(review): the `try:` and the runner.run call are
                # missing from the excerpt
                except CommandError as e:
                    lgr.error("Remotely creating target directory failed at "
                              "%s.\nError: %s" % (path, str(e)))

            # init git repo
            cmd = ssh_cmd + ["git", "-C", path, "init"]
            if shared:
                cmd.append("--shared=%s" % shared)
            # NOTE(review): the `try:` and the runner.run call are missing
            except CommandError as e:
                lgr.error("Remotely initializing git repository failed at %s."
                          "\nError: %s\nSkipping ..." % (path, str(e)))

            # check git version on remote end:
            cmd = ssh_cmd + ["git", "version"]
            # NOTE(review): a `try:` is missing here in the excerpt
                out, err = runner.run(cmd)
                # NOTE(review): str.lstrip takes a set of characters, not a
                # prefix; and the comparison below is a string comparison,
                # under which e.g. "2.10" sorts before "2.4"
                git_version = out.lstrip("git version").strip()
                lgr.debug("Detected git version on server: %s" % git_version)
                if git_version < "2.4":
                    lgr.error("Git version >= 2.4 needed to configure remote."
                              " Version detected on server: %s\nSkipping ..." %

            except CommandError as e:
                lgr.warning("Failed to determine git version on remote.\n"
                            "Error: {0}\nTrying to configure anyway "

            # allow for pushing to checked out branch
            cmd = ssh_cmd + [
                "git", "-C", path, "config", "receive.denyCurrentBranch",
            # NOTE(review): the config value, the closing bracket, the `try:`
            # and the runner.run call are missing from the excerpt
            except CommandError as e:
                lgr.warning("git config failed at remote location %s.\n"
                            "You will not be able to push to checked out "
                            "branch." % path)

            # enable post-update hook:
            # NOTE(review): the command name that takes the two hook paths
            # and the closing bracket are missing; the paths are built with
            # the local platform's opj
            cmd = ssh_cmd + [
                opj(path, ".git/hooks/post-update.sample"),
                opj(path, ".git/hooks/post-update")
            except CommandError as e:
                lgr.error("Failed to enable post update hook.\n"
                          "Error: %s" % e.message)

            # initially update server info "manually":
            cmd = ssh_cmd + ["git", "-C", path, "update-server-info"]
            # NOTE(review): the `try:` and the runner.run call are missing
            except CommandError as e:
                lgr.error("Failed to update server info.\n"
                          "Error: %s" % e.message)

        # stop controlmaster (close ssh connection):
        cmd = ["ssh", "-O", "stop", "-S", control_path, host_name]
        out, err = runner.run(cmd, expect_stderr=True)

        if target:
            # add the sibling(s):
            # both URLs default to the ssh URL used for creation
            if target_url is None:
                target_url = sshurl
            if target_pushurl is None:
                target_pushurl = sshurl
            # NOTE(review): the remaining arguments of this call are missing
            # from the excerpt; `force` is true only for existing='replace'
            result_adding = AddSibling()(dataset=ds,
                                         force=existing in {'replace'})
コード例 #10
    def __call__(sshurl, target=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None, recursive=False,
                 existing='raise', shared=False):
        """Create and configure git repositories for a dataset via SSH.

        For the dataset (and, with `recursive`, each of its subdatasets) a
        directory is created on the host named in `sshurl`, a git repository
        is initialized there and configured so that it can be pushed to.
        If `target` is given, the location is afterwards registered as a
        sibling via `AddSibling`.

        NOTE(review): this block was repaired from a copy in which several
        lines (``try:``/``else:`` headers, ``continue`` statements and
        continuation lines of calls) were missing. Statements marked
        "restored" below were filled in from context and should be
        confirmed against the upstream implementation.

        Parameters
        ----------
        sshurl : str
          SSH URL of the server. Must contain a network location; its path
          is used as `target_dir` when that is not given.
        target : str, optional
          Name of the sibling to register. Required if `target_url` or
          `target_pushurl` is given.
        target_dir : str, optional
          Remote directory. If it contains "%NAME", that symbol is replaced
          by the dataset name (with "/" turned into "-"); otherwise the
          local directory structure is replicated underneath it.
        target_url, target_pushurl : str, optional
          URLs to register for the sibling; both default to `sshurl`.
        dataset : Dataset or str, optional
          Dataset to operate on. If None, a dataset at or above the current
          working directory is used.
        recursive : bool, optional
          Also create targets for all subdatasets.
        existing : {'raise', 'skip', 'replace'}, optional
          What to do if the remote directory already exists.
        shared : optional
          If true-ish, passed to ``git init`` as ``--shared=<shared>``.

        Raises
        ------
        ValueError
          On missing/malformed arguments, if no installed dataset is
          found, or on an unknown `existing` value.
        RuntimeError
          If a target directory exists and `existing` is 'raise'.
        """
        # function-scope imports, like the other local imports of this block
        import re
        import subprocess
        from os import geteuid  # Linux specific import
        from os.path import relpath
        from datalad.utils import assure_dir

        if sshurl is None:
            raise ValueError("""insufficient information for target creation
            (needs at least a dataset and a SSH URL).""")

        if target is None and (target_url is not None
                               or target_pushurl is not None):
            raise ValueError("""insufficient information for adding the target
            as a sibling (needs at least a name)""")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if ds is None:
            # try to find a dataset at or above CWD
            dspath = GitRepo.get_toppath(abspath(getpwd()))
            if dspath is None:
                raise ValueError("""No dataset found
                                 at or above {0}.""".format(getpwd()))
            ds = Dataset(dspath)
            lgr.debug("Resolved dataset for target creation: {0}".format(ds))
        assert(ds is not None and sshurl is not None)

        if not ds.is_installed():
            raise ValueError("""Dataset {0} is not installed yet.""".format(ds))
        assert(ds.repo is not None)

        # determine target parameters:
        parsed_target = urlparse(sshurl)
        host_name = parsed_target.netloc

        # TODO: Sufficient to fail on this condition?
        if not parsed_target.netloc:
            raise ValueError("Malformed URL: {0}".format(sshurl))

        if target_dir is None:
            if parsed_target.path:
                target_dir = parsed_target.path
            else:  # restored
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        # Without the template symbol the local layout is mirrored remotely.
        replicate_local_structure = "%NAME" not in target_dir

        # collect datasets to use (name -> Dataset):
        datasets = dict()
        datasets[basename(ds.path)] = ds
        if recursive:
            for subds in ds.get_dataset_handles(recursive=True):
                sub_path = opj(ds.path, subds)
                # TODO: when enhancing Dataset/*Repo classes and therefore
                # adapt to moved code, make proper distinction between name and
                # path of a submodule, which are technically different. This
                # probably will become important on windows as well as whenever
                # we want to allow for moved worktrees.
                # restored: right-hand side was missing
                datasets[basename(ds.path) + '/' + subds] = Dataset(sub_path)

        # setup SSH Connection:
        # TODO: Make the entire setup a helper to use it when pushing via
        # publish?

        # - build control master:
        var_run_user_datalad = "/var/run/user/%s/datalad" % geteuid()
        # restored: `assure_dir` was imported but never called; presumably it
        # makes sure the directory for the control socket exists
        assure_dir(var_run_user_datalad)
        control_path = "%s/%s" % (var_run_user_datalad, host_name)
        control_path += ":%s" % parsed_target.port if parsed_target.port else ""

        # - start control master:
        cmd = "ssh -o ControlMaster=yes -o \"ControlPath=%s\" " \
              "-o ControlPersist=yes %s exit" % (control_path, host_name)
        lgr.debug("Try starting control master by calling:\n%s" % cmd)
        # NOTE(review): the command line is built by string interpolation and
        # run through a shell; `host_name` comes straight from `sshurl`.
        proc = subprocess.Popen(cmd, shell=True)
        proc.communicate(input="\n")  # why the f.. this is necessary?

        runner = Runner()
        ssh_cmd = ["ssh", "-S", control_path, host_name]

        lgr.info("Creating target datasets ...")
        for current_dataset in datasets:
            if not replicate_local_structure:
                path = target_dir.replace("%NAME",
                                          current_dataset.replace("/", "-"))
            else:  # restored
                # TODO: opj depends on local platform, not the remote one.
                # check how to deal with it. Does windows ssh server accept
                # posix paths? vice versa? Should planned SSH class provide
                # tools for this issue?
                # restored: second argument was missing; the dataset's path
                # relative to the top-level dataset is assumed
                path = normpath(opj(target_dir,
                                    relpath(datasets[current_dataset].path,
                                            start=ds.path)))

            if path != '.':
                # check if target exists
                # TODO: Is this condition valid for != '.' only?
                path_exists = True
                cmd = ssh_cmd + ["ls", path]
                try:  # restored
                    # restored: second keyword argument was missing
                    out, err = runner.run(cmd, expect_fail=True,
                                          expect_stderr=True)
                except CommandError as e:
                    if "No such file or directory" in e.stderr and \
                            path in e.stderr:
                        path_exists = False
                    else:  # restored
                        raise  # It's an unexpected failure here

                if path_exists:
                    if existing == 'raise':
                        raise RuntimeError(
                            "Target directory %s already exists." % path)
                    elif existing == 'skip':
                        continue  # restored
                    elif existing == 'replace':
                        pass  # restored: go on and re-initialize in place
                    else:  # restored
                        raise ValueError(
                            "Do not know how to handle existing=%s" %
                            repr(existing))

                cmd = ssh_cmd + ["mkdir", "-p", path]
                try:  # restored
                    runner.run(cmd)
                except CommandError as e:
                    lgr.error("Remotely creating target directory failed at "
                              "%s.\nError: %s" % (path, str(e)))
                    # restored: nothing below can work without the directory
                    continue

            # init git repo
            cmd = ssh_cmd + ["git", "-C", path, "init"]
            if shared:
                cmd.append("--shared=%s" % shared)
            try:  # restored
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Remotely initializing git repository failed at %s."
                          "\nError: %s\nSkipping ..." % (path, str(e)))
                continue  # restored, as announced by the message

            # check git version on remote end:
            cmd = ssh_cmd + ["git", "version"]
            try:  # restored
                out, err = runner.run(cmd)
            except CommandError as e:
                # restored: end of the message and its argument were missing
                lgr.warning("Failed to determine git version on remote.\n"
                            "Error: {0}\nTrying to configure anyway "
                            "...".format(str(e)))
            else:
                # Remove the literal prefix; str.lstrip() would strip a
                # *set of characters* rather than the prefix.
                git_version = out.strip()
                if git_version.startswith("git version"):
                    git_version = git_version[len("git version"):].strip()
                lgr.debug("Detected git version on server: %s" % git_version)
                # Compare numerically: as strings "2.10" sorts before "2.4".
                version_numbers = tuple(
                    int(number)
                    for number in re.findall(r"\d+", git_version)[:2])
                if version_numbers < (2, 4):
                    lgr.error("Git version >= 2.4 needed to configure remote."
                              " Version detected on server: %s\nSkipping ..."
                              % git_version)
                    continue  # restored, as announced by the message

            # allow for pushing to checked out branch
            # restored: the configured value was missing; "updateInstead" is
            # assumed since it is the setting that requires git >= 2.4
            cmd = ssh_cmd + ["git", "-C", path, "config",
                             "receive.denyCurrentBranch", "updateInstead"]
            try:  # restored
                runner.run(cmd)
            except CommandError as e:
                lgr.warning("git config failed at remote location %s.\n"
                            "You will not be able to push to checked out "
                            "branch." % path)

            # enable post-update hook:
            cmd = ssh_cmd + ["mv", opj(path, ".git/hooks/post-update.sample"),
                             opj(path, ".git/hooks/post-update")]
            try:  # restored
                runner.run(cmd)
            except CommandError as e:
                # NOTE(review): whether the original skipped the remaining
                # step on this failure is not visible; here we go on.
                lgr.error("Failed to enable post update hook.\n"
                          "Error: %s" % str(e))

            # initially update server info "manually":
            cmd = ssh_cmd + ["git", "-C", path, "update-server-info"]
            try:  # restored
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Failed to update server info.\n"
                          "Error: %s" % str(e))

        # stop controlmaster (close ssh connection):
        cmd = ["ssh", "-O", "stop", "-S", control_path, host_name]
        out, err = runner.run(cmd, expect_stderr=True)

        if target:
            # add the sibling(s):
            if target_url is None:
                target_url = sshurl
            if target_pushurl is None:
                target_pushurl = sshurl
            # restored: only the first and last keyword arguments were
            # visible; the others are assumed to be passed through
            result_adding = AddSibling()(dataset=ds,
                                         name=target,
                                         url=target_url,
                                         pushurl=target_pushurl,
                                         recursive=recursive,
                                         force=existing in {'replace'})
コード例 #11
ファイル: add_sibling.py プロジェクト: glalteva/datalad
    def __call__(dataset=None, name=None, url=None,
                 pushurl=None, recursive=False, force=False):
        """Add (or update) a sibling, i.e. a git remote, of a dataset.

        NOTE(review): this block was repaired from a copy in which several
        lines (closing brackets, an ``else:`` header, ``continue`` and the
        bookkeeping ``append`` calls) were missing. Statements marked
        "restored" below were filled in from context and should be
        confirmed against the upstream implementation.

        Parameters
        ----------
        dataset : Dataset or str, optional
          Dataset to operate on. If None, a dataset at or above the current
          working directory is used.
        name : str
          Name of the remote to add.
        url : str, optional
          URL of the remote; defaults to `pushurl`. If it contains "%NAME",
          the symbol is replaced by the dataset name (with "/" turned into
          "-"); otherwise the path of a subdataset relative to the top-level
          dataset is appended to the URL.
        pushurl : str, optional
          Separate push URL, handled like `url`.
        recursive : bool, optional
          Also add the remote to all subdatasets.
        force : bool, optional
          Overwrite the URL(s) of already existing remotes of that name
          instead of raising.

        Returns
        -------
        list
          Names of the datasets in which the remote was added or rewritten.

        Raises
        ------
        ValueError
          On insufficient arguments or if no installed dataset is found.
        RuntimeError
          If a remote `name` exists with different URLs and `force` is
          not set.
        """
        # TODO: Detect malformed URL and fail?

        if name is None or (url is None and pushurl is None):
            raise ValueError("""insufficient information to add a sibling
                (needs at least a dataset, a name and an URL).""")
        if url is None:
            url = pushurl

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if ds is None:
            # try to find a dataset at or above CWD
            dspath = GitRepo.get_toppath(abspath(getpwd()))
            if dspath is None:
                raise ValueError(
                        "No dataset found at or above {0}.".format(getpwd()))
            ds = Dataset(dspath)
            lgr.debug("Resolved dataset for target creation: {0}".format(ds))

        assert(ds is not None and name is not None and url is not None)

        if not ds.is_installed():
            raise ValueError("Dataset {0} is not installed yet.".format(ds))
        assert(ds.repo is not None)

        # dataset name -> {'repo': ..., 'url': ..., 'pushurl': ...}
        ds_basename = basename(ds.path)
        repos = {
            ds_basename: {'repo': ds.repo}
        }
        if recursive:
            for subds in ds.get_dataset_handles(recursive=True):
                sub_path = opj(ds.path, subds)
                repos[ds_basename + '/' + subds] = {
                    'repo': GitRepo(sub_path, create=False)
                }

        # Note: This is copied from create_publication_target_sshwebserver
        # as it is the same logic as for its target_dir.
        # TODO: centralize and generalize template symbol handling
        # TODO: Check pushurl for template symbols too. Probably raise if only
        #       one of them uses such symbols

        replicate_local_structure = "%NAME" not in url

        for repo in repos:
            if not replicate_local_structure:
                flat_name = repo.replace("/", "-")
                repos[repo]['url'] = url.replace("%NAME", flat_name)
                if pushurl:
                    # restored: replacement argument was missing; assumed to
                    # be the same as for `url`
                    repos[repo]['pushurl'] = pushurl.replace("%NAME",
                                                             flat_name)
            else:  # restored
                repos[repo]['url'] = url
                if pushurl:
                    repos[repo]['pushurl'] = pushurl

                if repo != ds_basename:
                    # subdataset: append its path relative to the top-level
                    # dataset (the name without the "<ds_basename>/" prefix)
                    sub_part = repo[len(ds_basename)+1:]
                    repos[repo]['url'] = _urljoin(repos[repo]['url'],
                                                  sub_part)
                    if pushurl:
                        repos[repo]['pushurl'] = _urljoin(
                            repos[repo]['pushurl'], sub_part)

        # collect existing remotes:
        already_existing = list()
        conflicting = list()
        for repo in repos:
            if name in repos[repo]['repo'].git_get_remotes():
                already_existing.append(repo)  # restored
                lgr.debug("""Remote '{0}' already exists
                          in '{1}'.""".format(name, repo))

                existing_url = repos[repo]['repo'].git_get_remote_url(name)
                existing_pushurl = \
                    repos[repo]['repo'].git_get_remote_url(name, push=True)

                # URLs are compared ignoring trailing slashes
                if repos[repo]['url'].rstrip('/') != existing_url.rstrip('/') \
                        or (pushurl and existing_pushurl and
                            repos[repo]['pushurl'].rstrip('/') !=
                            existing_pushurl.rstrip('/')) \
                        or (pushurl and not existing_pushurl):
                    conflicting.append(repo)  # restored

        if not force and conflicting:
            raise RuntimeError("Sibling '{0}' already exists with conflicting"
                               " URL for {1} dataset(s). {2}".format(
                                   name, len(conflicting), conflicting))

        runner = Runner()
        successfully_added = list()
        for repo in repos:
            if repo in already_existing:
                if repo not in conflicting:
                    lgr.debug("Skipping {0}. Nothing to do.".format(repo))
                    continue  # restored
                # rewrite url
                cmd = ["git", "remote", "set-url", name, repos[repo]['url']]
                runner.run(cmd, cwd=repos[repo]['repo'].path)
            else:  # restored
                # add the remote
                cmd = ["git", "remote", "add", name, repos[repo]['url']]
                runner.run(cmd, cwd=repos[repo]['repo'].path)
            if pushurl:
                # restored: the URL argument was missing
                cmd = ["git", "remote", "set-url", "--push", name,
                       repos[repo]['pushurl']]
                runner.run(cmd, cwd=repos[repo]['repo'].path)
            successfully_added.append(repo)  # restored

        return successfully_added
コード例 #12
ファイル: publish.py プロジェクト: glalteva/datalad
    # Publish a dataset, a subdataset, or a single annexed file to an already
    # known sibling (git remote).
    #
    # As far as visible here: the dataset is resolved from `dataset`/`path`,
    # the destination remote is taken from `dest` or from the active branch's
    # tracking configuration, the active branch is pushed with `git push`
    # (plus the git-annex branch for annex repositories), data is optionally
    # copied with `git annex copy --to`, and subdatasets are published
    # recursively. Returns the dataset, a list of results, the published
    # path, or None when nothing could be published.
    #
    # NOTE(review): this copy of the function is incomplete -- the parameter
    # list, several `try:`/`else:` headers and parts of some calls are
    # missing (see the notes below). The names `dataset`, `dest`, `path`,
    # `recursive` and `with_data` are used in the body and are presumably
    # parameters; confirm against the upstream source before relying on it.
    def __call__(
            # Note: add remote currently disabled in publish
            # dest_url=None, dest_pushurl=None,

        # Note: add remote currently disabled in publish
        # if dest is None and (dest_url is not None
        #                        or dest_pushurl is not None):
        #     raise ValueError("""insufficient information for adding the
        #     destination as a sibling (needs at least a name)""")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if not path:
            path = curdir

        # NOTE(review): for a list of paths a list of per-path results is
        # returned; the call expression at the start of the list
        # comprehension is missing from this copy.
        elif isinstance(path, list):
            return [
                    # Note: add remote currently disabled in publish
                    # dest_url=dest_url,
                    # dest_pushurl=dest_pushurl,
                    recursive=recursive) for p in path

        # resolve the location against the provided dataset
        if path is not None:
            path = resolve_path(path, ds)

        lgr.info("Publishing {0}".format(path))

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved location (that is now guaranteed to
        # be specified)
        if ds is None:
            # try to find a dataset at or above the location
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)
        lgr.debug("Resolved dataset for publication: {0}".format(ds))
        assert (ds is not None)

        # it might still be about a subdataset of ds:
        if path is not None:
            relativepath = relpath(path, start=ds.path)
            subds = get_containing_subdataset(ds, relativepath)
            if subds.path != ds.path:
                # path belongs to a subdataset; hand it over
                lgr.debug("Hand over to submodule %s" % subds.path)
                # NOTE(review): the remaining arguments and the closing
                # parenthesis of this call are missing from this copy.
                return subds.publish(
                    path=relpath(path, start=subds.path),
                    # Note: add remote currently disabled in publish
                    # dest_url=dest_url,
                    # dest_pushurl=dest_pushurl,

        # now, we know, we have to operate on ds. So, ds needs to be installed,
        # since we cannot publish anything from a not installed dataset,
        # can we?
        # (But may be just the existence of ds.repo is important here.)
        # NOTE(review): the end of the error message below is missing.
        if not ds.is_installed():
            raise ValueError("No installed dataset found at "
        assert (ds.repo is not None)

        # TODO: For now we can deal with a sibling(remote) name given by `dest`
        # only. Figure out, when to allow for passing a local path or URL
        # directly and what to do in that case.

        # Note: we need an upstream remote, if there's none given. We could
        # wait for git push to complain, but we need to explicitly figure it
        # out for pushing annex branch anyway and we might as well fail right
        # here.

        # keep original dest in case it's None for passing to recursive calls:
        dest_resolved = dest
        if dest is None:
            # check for tracking branch's remote:
            # NOTE(review): the `try:` header and the callee of the command
            # below are missing; what remains queries
            # `git config --get branch.<active branch>.remote`. Exit code 1
            # with empty stdout is treated as "no remote configured".
                std_out, std_err = \
                                                ["git", "config", "--get", "branch.{active_branch}.remote".format(active_branch=ds.repo.git_get_active_branch())],
            except CommandError as e:
                if e.code == 1 and e.stdout == "":
                    std_out = None
            # NOTE(review): an `else:` header before the `raise` below
            # appears to be missing.
            if std_out:
                dest_resolved = std_out.strip()
                # we have no remote given and no upstream => fail
                raise RuntimeError("No known default target for "
                                   "publication and none given.")

        # upstream branch needed for update (merge) and subsequent push,
        # in case there is no.
        set_upstream = False
        # NOTE(review): the `try:` header and most of the git-config call
        # below are missing from this copy.
            # Note: tracking branch actually defined by entry "merge"
            # PLUS entry "remote"
            std_out, std_err = \
                                            ["git", "config", "--get",
        except CommandError as e:
            if e.code == 1 and e.stdout == "":
                # no tracking branch yet:
                set_upstream = True

        # is `dest` an already known remote?
        if dest_resolved not in ds.repo.git_get_remotes():
            # unknown remote
            raise ValueError("No sibling '%s' found." % dest_resolved)

            # Note: add remote currently disabled in publish
            # if dest_url is None:
            #     raise ValueError("No sibling '%s' found. Provide `dest-url`"
            #                      " to register it." % dest_resolved)
            # lgr.info("Sibling %s unknown. Registering ...")
            # # Fill in URL-Template:
            # remote_url = dest_url.replace("%NAME", basename(ds.path))
            # # TODO: handle_name.replace("/", "-")) instead of basename()
            # #       - figure it out ;)
            # #       - either a datasets needs to discover superdatasets in
            # #         order to get it's relative path to provide a name
            # #       - or: We need a different approach on the templates
            # # Add the remote
            # ds.repo.git_remote_add(dest_resolved, remote_url)
            # if dest_pushurl:
            #     # Fill in template:
            #     remote_url_push = \
            #         dest_pushurl.replace("%NAME", basename(ds.path))
            #     # TODO: Different way of replacing %NAME; See above
            #     # Modify push url:
            #     ds.repo._git_custom_command('',
            #                                 ["git", "remote",
            #                                  "set-url",
            #                                  "--push", dest_resolved,
            #                                  remote_url_push])
            # lgr.info("Added sibling '%s'." % dest)
            # lgr.debug("Added remote '%s':\n %s (fetch)\n%s (push)." %
            #           (dest_resolved, remote_url,
            #            remote_url_push if dest_pushurl else remote_url))
        # Note: add remote currently disabled in publish
        # else:
        #     # known remote: parameters dest-url-* currently invalid.
        #     # This may change to adapt the existing remote.
        #     if dest_url:
        #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
        #                     "Ignoring dest-url %s." %
        #                     (dest_resolved, ds.path, dest_url))
        #     if dest_pushurl:
        #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
        #                     "Ignoring dest-pushurl %s." %
        #                     (dest_resolved, ds.path, dest_pushurl))

        # Figure out, what to publish
        if path is None or path == ds.path:
            # => publish the dataset itself
            # push local state:
            # TODO: Rework git_push in GitRepo
            cmd = ['git', 'push']
            # NOTE(review): the body of the `if` below is missing; presumably
            # it adds an option that sets the upstream branch -- confirm.
            if set_upstream:
                # no upstream branch yet
            cmd += [dest_resolved, ds.repo.git_get_active_branch()]
            ds.repo._git_custom_command('', cmd)
            # push annex branch:
            if isinstance(ds.repo, AnnexRepo):
                ds.repo.git_push("%s +git-annex:git-annex" % dest_resolved)

            # TODO: if with_data is a shell pattern, we get a list, when called
            # from shell, right?
            # => adapt the following and check constraints to allow for that
            if with_data:
                ds.repo._git_custom_command('', ["git", "annex", "copy"] +
                                            with_data +
                                            ["--to", dest_resolved])

            if recursive and ds.get_dataset_handles() != []:
                results = [ds]
                # Note: add remote currently disabled in publish
                # modify URL templates:
                # if dest_url:
                #     dest_url = dest_url.replace('%NAME', basename(ds.path) + '-%NAME')
                # if dest_pushurl:
                #     dest_pushurl = dest_pushurl.replace('%NAME', basename(ds.path) + '-%NAME')
                # NOTE(review): the statement collecting each subdataset's
                # result and the arguments of `publish(...)` are missing.
                for subds in ds.get_dataset_handles():
                        Dataset(opj(ds.path, subds)).publish(
                            # Note: use `dest` instead of `dest_resolved` in case
                            # dest was None, so subdatasets would use their default
                            # as well
                            # Note: add remote currently disabled in publish
                            # dest_url=dest_url,
                            # dest_pushurl=dest_pushurl,
                return results

            return ds

        elif exists(path):
            # At this point `path` is not referencing a (sub)dataset.
            # An annexed file is the only thing left, that `path` might be
            # validly pointing to. Anything else we can't handle currently.
            # NOTE(review): the `try:` header, the start of the copy command
            # and the body of the `except` clause are missing below.
            if isinstance(ds.repo, AnnexRepo):
                    if ds.repo.get_file_key(relativepath):
                        # file is in annex, publish it
                                           '--to=%s' % dest_resolved])
                        return path
                except (FileInGitError, FileNotInAnnexError):
            # `path` can't be published
            lgr.warning("Don't know how to publish %s." % path)
            return None

            # NOTE(review): an `else:` header for the branch below appears to
            # be missing; as written these lines follow a `return`.
            # nothing to publish found
            lgr.warning("Nothing to publish found at %s." % path)
            return None
コード例 #13
ファイル: update.py プロジェクト: glalteva/datalad
    # Update a dataset (and, with `recursive`, its subdatasets) from its
    # remote(s): fetch, optionally also fetch the git-annex branch, and with
    # `merge` run `git pull` followed by `git annex merge` for annexes.
    #
    # NOTE(review): this copy of the function is incomplete -- the end of the
    # parameter list is missing (`reobtain_data` is used below but not
    # declared in the visible part), as are a `try:` header and parts of
    # several statements. See the notes below and confirm against the
    # upstream source.
    def __call__(name=None, dataset=None,
                 merge=False, recursive=False, fetch_all=False,
        # TODO: Is there an 'update filehandle' similar to install and publish?
        # What does it mean?

        if reobtain_data:
            # TODO: properly define, what to do
            raise NotImplementedError("TODO: Option '--reobtain-data' not "
                                      "implemented yet.")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the current working directory of the process:
        if ds is None:
            # try to find a dataset at or above PWD:
            dspath = GitRepo.get_toppath(getpwd())
            if dspath is None:
                raise ValueError("No dataset found at %s." % getpwd())
            ds = Dataset(dspath)
        assert(ds is not None)

        # NOTE(review): the end of the error message below is missing.
        if not ds.is_installed():
            raise ValueError("No installed dataset found at "
        assert(ds.repo is not None)

        # Repositories to process: the dataset's own repository plus, with
        # `recursive`, one GitRepo per subdataset path.
        # NOTE(review): the iterable of the comprehension below is missing.
        repos_to_update = [ds.repo]
        if recursive:
            repos_to_update += [GitRepo(opj(ds.path, sub_path))
                                for sub_path in

        for repo in repos_to_update:
            # get all remotes:
            remotes = repo.git_get_remotes()
            # NOTE(review): the message announces skipping, but no `continue`
            # is visible after the warning in this copy.
            if name and name not in remotes:
                lgr.warning("'%s' not known to dataset %s.\nSkipping" %
                            (name, repo.path))

            # Currently '--merge' works for single remote only:
            # TODO: - condition still incomplete
            #       - We can merge if a remote was given or there is a
            #         tracking branch
            #       - we also can fetch all remotes independently on whether or
            #         not we merge a certain remote
            if not name and len(remotes) > 1 and merge:
                lgr.debug("Found multiple remotes:\n%s" % remotes)
                raise NotImplementedError("No merge strategy for multiple "
                                          "remotes implemented yet.")
            lgr.info("Updating handle '%s' ..." % repo.path)

            # fetch remote(s):
            repo.git_fetch(name if name else '',
                           "--all" if fetch_all else '')

            # if it is an annex and there is a tracking branch, and we didn't
            # fetch the entire remote anyway, explicitly fetch git-annex
            # branch:
            # TODO: Is this logic correct? Shouldn't we fetch git-annex from
            # `name` if there is any (or if there is no tracking branch but we
            # have a `name`?
            if knows_annex(repo.path) and not fetch_all:
                # check for tracking branch's remote:
                # NOTE(review): the `try:` header and most of the git-config
                # call are missing below. Exit code 1 with empty stdout is
                # treated as "no tracking remote".
                    std_out, std_err = \
                        ["git", "config", "--get",
                except CommandError as e:
                    if e.code == 1 and e.stdout == "":
                        std_out = None
                if std_out:  # we have a "tracking remote"
                    repo.git_fetch("%s git-annex" % std_out.strip())

            # merge:
            if merge:
                lgr.info("Applying changes from tracking branch...")
                cmd_list = ["git", "pull"]
                # NOTE(review): the body of the `if` below consists of
                # comments only in this copy; the statement extending
                # `cmd_list` is missing.
                if name:
                    # branch needed, if not default remote
                    # => TODO: use default remote/tracking branch to compare
                    #          (see above, where git-annex is fetched)
                    # => TODO: allow for passing a branch
                    # (or more general refspec?)
                    # For now, just use the same name

                out, err = repo._git_custom_command('', cmd_list)
                if knows_annex(repo.path):
                    # annex-apply:
                    lgr.info("Updating annex ...")
                    out, err = repo._git_custom_command('', ["git", "annex", "merge"])
コード例 #14
ファイル: get.py プロジェクト: silky/datalad
    # Get content for the given paths, installing subdatasets on the way
    # where that is needed to reach a path.
    #
    # As far as visible here: paths are normalized and sorted into datasets,
    # unavailable paths trigger installation of the necessary subdatasets,
    # with `recursive` further subdatasets are installed underneath, and the
    # collected content is finally handed to git-annex. Raises
    # InsufficientArgumentsError without any path and IncompleteResultsError
    # when some paths remain unavailable.
    #
    # NOTE(review): this copy of the function is incomplete -- the parameter
    # list, several call expressions, `continue`/`else:` lines and logging
    # calls are missing (see the notes below). The names `path`, `dataset`,
    # `recursive`, `recursion_limit`, `reckless` and `_return_datasets` are
    # used in the body and are presumably parameters; confirm upstream.
    def __call__(
        # internal -- instead of returning 'get'ed items, return final
        # content_by_ds, unavailable_paths.  To be used by the call from
        # Install.__call__ and done so to avoid creating another reusable
        # function which would need to duplicate all this heavy list of
        # kwargs
        # 1. turn all input paths into absolute paths
        # 2. Sort the world into existing handles and the rest
        # 3. Try locate missing handles (obtain subdatasets along the way)
        # 4. Expand into subdatasets with recursion enables (potentially
        #    obtain even more subdatasets
        # 5. Shoot info of which handles to get in each subdataset to,
        #    git-annex, once at the very end

        # TODO: consider allowing an empty `path` argument, as with other commands,
        # to indicate CWD
        resolved_paths, dataset_path = get_normalized_path_arguments(
            path, dataset, default=None)
        if not resolved_paths:
            raise InsufficientArgumentsError(
                "`get` needs at least one path as argument")

        # sort paths into the respective datasets
        dir_lookup = {}
        # NOTE(review): the call producing the three values below and the
        # start of the following logging call are missing from this copy.
        content_by_ds, unavailable_paths, nondataset_paths = \
            "Found %i existing dataset(s) to get content in "
            "and %d unavailable paths", len(content_by_ds),
        # IMPORTANT NOTE re `content_by_ds`
        # each key is a subdataset that we need to get something in
        # if the value[0] is the subdataset's path, we want all of it
        # if the value[0] == curdir, we just installed it as part of
        # resolving file handles and we did not say anything but "give
        # me the dataset handle"

        # explore the unknown
        for path in sorted(unavailable_paths):
            # how close can we get?
            dspath = GitRepo.get_toppath(path)
            # NOTE(review): the body of the `if` below (presumably skipping
            # this path) is missing from this copy.
            if dspath is None:
                # nothing we can do for this path
            ds = Dataset(dspath)
            # must always yield a dataset -- we sorted out the ones outside
            # any dataset at the very top
            assert ds.is_installed()
            # now actually obtain whatever is necessary to get to this path
            containing_ds = install_necessary_subdatasets(ds, path, reckless)
            # NOTE(review): the logging call around the message below and an
            # `else:` header before the second assignment are missing.
            if containing_ds.path != ds.path:
                    "Installed %s to fulfill request for content for "
                    "path %s", containing_ds, path)
                # mark resulting dataset as auto-installed
                if containing_ds.path == path:
                    # we had to get the entire dataset, not something within
                    # mark that it just appeared
                    content_by_ds[path] = [curdir]
                    # we need to get content within
                    content_by_ds[path] = [path]

        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for subdspath in sorted(content_by_ds.keys()):
                for content_path in content_by_ds[subdspath]:
                    # NOTE(review): the body of the `if` below, the logging
                    # call around the following message, and most arguments
                    # of `_recursive_install_subds_underneath` are missing.
                    if not isdir(content_path):
                        # a non-directory cannot have content underneath
                    subds = Dataset(subdspath)
                        "Obtaining %s %s recursively", subds,
                        ("underneath %s" %
                         content_path if subds.path != content_path else ""))
                    cbysubds = _recursive_install_subds_underneath(
                        # `content_path` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        # protect against magic marker misinterpretation
                        # only relevant for _get, hence replace here
                        start=content_path if content_path != curdir else None)
                    # gets file content for all freshly installed subdatasets

        ## we have now done everything we could to obtain whatever subdataset
        ## to get something on the file system for previously unavailable paths
        ## check and sort one last
        # NOTE(review): the call on the right-hand side below is missing.
        content_by_ds, unavailable_paths, nondataset_paths2 = \

        # NOTE(review): the argument of the warning below is missing.
        if nondataset_paths:
            lgr.warning("ignored paths that do not belong to any dataset: %s",

        if unavailable_paths:
            lgr.warning('ignored non-existing paths: %s', unavailable_paths)

        # hand over to git-annex
        # NOTE(review): the expression building `results` is missing, as are
        # the argument of `sorted(` and an `else:` header before the final
        # `return` below.
        results = list(
        # ??? should we in _return_datasets case just return both content_by_ds
        # and unavailable_paths may be so we provide consistent across runs output
        # and then issue outside similar IncompleteResultsError?
        if unavailable_paths:  # and likely other error flags
            if _return_datasets:
                results = sorted(
            raise IncompleteResultsError(results, failed=unavailable_paths)
            return sorted(content_by_ds) if _return_datasets else results
コード例 #15
ファイル: uninstall.py プロジェクト: glalteva/datalad
    def __call__(dataset=None, path=None, data_only=True, recursive=False):
        # Resolve the dataset and target path of an uninstall request and act
        # on whatever `path` points to.  As far as the visible code shows,
        # whole datasets, submodules and directories are not handled yet
        # (NotImplementedError); an annexed file with `data_only` set yields
        # `path`; a file tracked directly in git goes through
        # `ds.repo.git_remove()`; paths inside a subdataset are delegated to
        # that subdataset's `uninstall()`.
        #
        # Raises ValueError when neither dataset nor path is given, when the
        # dataset is not installed, or when the path cannot be uninstalled.
        #
        # NOTE(review): this snippet appears to have lost lines (several
        # statements are unterminated and `try:`/`else:` keywords are absent).
        # The gaps are marked below; restore them from the upstream project
        # before relying on this code.

        # Note: copy logic from install to resolve dataset and path:
        # shortcut
        ds = dataset

        # accept anything `Dataset()` can be constructed from
        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        if not path:
            if ds is None:
                # no dataset, no target location, nothing to do
                raise ValueError(
                    "insufficient information for uninstallation (needs at "
                    "least a dataset or a path")
        elif isinstance(path, list):
            # TODO: not sure. might be possible to deal with that list directly
            # one call per list element; results are returned as a list
            # NOTE(review): only `recursive=` is visible in the call below and
            # `p` is never passed -- the leading arguments seem to be missing.
            return [Uninstall.__call__(
                    recursive=recursive) for p in path]

        # resolve the target location against the provided dataset
        if path is not None:
            path = resolve_path(path, ds)

        lgr.debug("Resolved uninstallation target: {0}".format(path))

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved target location (that is now guaranteed to
        # be specified
        if ds is None:
            # try to find a dataset at or above the installation target
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)
        assert(ds is not None)

        lgr.debug("Resolved target dataset for uninstallation: {0}".format(ds))

        if not ds.is_installed():
            if not path or path == ds.path:
                # we want to uninstall the dataset itself, which is not
                # installed => nothing to do
                # TODO: consider `data` option! is_installed currently only
                # checks for a repository
                # NOTE(review): the `lgr.info(` call below is unterminated;
                # the rest of the message and, judging by the comments that
                # follow, an `else:` line appear to be missing.
                lgr.info("Dataset {0} not installed. Nothing to "
                # we want to uninstall something from a not installed dataset
                # Doesn't make sense, does it? => fail
                raise ValueError("Dataset {0} is not installed.".format(ds.path))

        assert(ds.repo is not None)

        if not path or path == ds.path:
            # uninstall the dataset `ds`
            # TODO: what to consider?
            #   - whether it is a submodule of another dataset
            #   - `data_only` ?
            #   - `recursive`
            #   - what to return in what case (data_only)?
            raise NotImplementedError("TODO: Uninstall dataset %s" % ds.path)

        # needed by the logic below
        # NOTE(review): no statement follows the comment above; a precondition
        # check seems to be missing here.

        # express the destination path relative to the root of this dataset
        relativepath = relpath(path, start=ds.path)
        # NOTE(review): this tests `path`, not the `relativepath` computed on
        # the previous line; presumably `relativepath` was intended, since a
        # resolved `path` is unlikely to start with `pardir` -- confirm.
        if path.startswith(pardir):
            raise ValueError("uninstallation path outside dataset")

        # NOTE(review): the two lines below are the argument of a call whose
        # opening (presumably a logger call) is missing.
            "Resolved uninstallation target relative to dataset {0}: {1}".format(
                ds, relativepath))

        # figure out, what path actually is pointing to:
        if not exists(path):
            # nothing there, nothing to uninstall
            # NOTE(review): as written, execution continues after this log
            # message; an early `return` may be missing -- confirm.
            lgr.info("Nothing found to uninstall at %s" % path)

        if relativepath in ds.get_dataset_handles(recursive=True):
            # it's a submodule
            # --recursive required or implied?
            raise NotImplementedError("TODO: uninstall submodule %s from "
                                      "dataset %s" % (relativepath, ds.path))

        if isdir(path):
            # don't know what to do yet
            # in git vs. untracked?
            # recursive?
            raise NotImplementedError("TODO: uninstall directory %s from "
                                      "dataset %s" % (path, ds.path))

        # we know, it's an existing file
        # NOTE(review): `_file_in_git` and `_untracked_or_within_submodule` are
        # only assigned inside some of the branches below and have no visible
        # initialisation, yet both are read unconditionally further down.
        if isinstance(ds.repo, AnnexRepo):
            # NOTE(review): the `try:` line and the call it guards are missing
            # before the `except` clauses below.
            except FileInGitError:
                # file directly in git
                _file_in_git = True

            except FileNotInAnnexError:
                # either an untracked file in this dataset, or something that
                # also actually exists in the file system but could be part of
                # a subdataset
                _untracked_or_within_submodule = True

            # it's an annexed file
            # NOTE(review): `return` directly followed by `raise` at the same
            # indentation -- presumably an `else:` (and possibly the statement
            # that drops the content) is missing.
            if data_only:
                return path
                raise NotImplementedError("TODO: fully uninstall file %s "
                                          "(annex) from dataset %s" %
                                          (path, ds.path))
            # NOTE(review): the comment below sits one level deeper than the
            # code around it; presumably an `else:` for the AnnexRepo check,
            # and another between the two flag assignments, are missing.
            # plain git repo
            if relativepath in ds.repo.get_indexed_files():
                # file directly in git
                _file_in_git = True
                # either an untracked file in this dataset, or something that
                # also actually exists in the file system but could be part of
                # a subdataset
                _untracked_or_within_submodule = True

        if _file_in_git:
            # NOTE(review): `raise` directly followed by `return` at the same
            # indentation -- presumably an `else:` is missing in between.
            if data_only:
                raise ValueError("%s is not a file handle. Removing its "
                                 "data only doesn't make sense." % path)
                return ds.repo.git_remove([relativepath])

        elif _untracked_or_within_submodule:
            subds = get_containing_subdataset(ds, relativepath)
            if ds.path != subds.path:
                # target path belongs to a subdataset, hand uninstallation
                # over to it
                # NOTE(review): the call below is unterminated; its remaining
                # arguments and closing parenthesis are missing.
                return subds.uninstall(
                    path=relpath(path, start=subds.path),

            # this must be an untracked/existing something
            # it wasn't installed, so we cannot uninstall it
            raise ValueError("Cannot uninstall %s" % path)
コード例 #16
def get_paths_by_dataset(paths,
    # NOTE(review): the signature is truncated after `paths,`.  The body and
    # docstring refer to `recursive`, `recursion_limit`, `out` and
    # `dir_lookup`, so those parameters presumably followed -- confirm their
    # order and defaults against the upstream project.
    """Sort a list of paths per dataset they are contained in.

    Any paths that are not part of a dataset, or presently unavailable are

    paths : sequence
      A sequence of path specifications to sort.
    recursive : bool
      Flag whether to report subdatasets under any of the given paths
    recursion_limit :
      Depth constraint for recursion. See `Dataset.get_subdatasets()` for more
    out : dict or None
      By default a new output dictionary is created, however, an existing one
      can be provided via this argument to enable incremental processing.
    dir_lookup : dict or None
      Optional lookup cache that maps paths to previously determined datasets.
      This can speed up repeated processing.

    Tuple(dict, list, list)
      Dict of `existing dataset path`: `path` mappings, the list of currently
      non-existing paths (possibly matching currently uninstalled datasets),
      and any paths that are not part of any dataset

    # NOTE(review): the docstring above is never closed and its section
    # headings are absent; the closing triple quote presumably belongs
    # directly above this note.
    # sort paths into the respective datasets
    if dir_lookup is None:
        dir_lookup = {}
    if out is None:
        out = {}
    # paths that don't exist (yet)
    unavailable_paths = []
    nondataset_paths = []
    for path in paths:
        if not lexists(path):
            # not there yet, impossible to say which ds it will actually
            # be in, if any
            # NOTE(review): this `if` has no body.  Nothing visible ever adds
            # to `unavailable_paths`, so presumably the path is recorded there
            # and the loop continues -- confirm.
        # the path exists in some shape or form
        # NOTE(review): `d` is assigned twice in a row below; presumably an
        # `else:` is missing before the second comment.
        if isdir(path):
            # this could contain all types of additional content
            d = path
            # for everything else we are interested in the container
            d = dirname(path)
            if not d:
                d = curdir
        # this could be `None` if there is no git repo
        # NOTE(review): the default argument of `.get()` is evaluated before
        # the lookup, so `GitRepo.get_toppath(d)` runs even on a cache hit.
        dspath = dir_lookup.get(d, GitRepo.get_toppath(d))
        dir_lookup[d] = dspath
        # NOTE(review): the `if not dspath:` below has no body.  Nothing
        # visible ever adds to `nondataset_paths`, so presumably the path is
        # recorded there and the loop continues -- confirm.
        if not dspath:
        if isdir(path):
            ds = Dataset(dspath)
            # we need to doublecheck that this is not a subdataset mount
            # point, in which case get_toppath() would point to the parent
            smpath = ds.get_containing_subdataset(path, recursion_limit=1).path
            if smpath != dspath:
                # fix entry
                dir_lookup[d] = smpath
                # submodule still needs to be obtained
                # NOTE(review): no statement follows the comment above; the
                # handling of this case seems to be missing.
            if recursive:
                # make sure we get everything relevant in all _checked out_
                # subdatasets, obtaining of previously unavailable subdataset
                # else done elsewhere
                # NOTE(review): the call below is unterminated; its remaining
                # arguments and closing parenthesis are missing.
                subs = ds.get_subdatasets(fulfilled=True,
                for sub in subs:
                    subdspath = opj(dspath, sub)
                    if subdspath.startswith(_with_sep(path)):
                        # this subdatasets is underneath the search path
                        # we want it all
                        # be careful to not overwrite anything, in case
                        # this subdataset has been processed before
                        out[subdspath] = out.get(subdspath, [subdspath])
        # append the path to the list kept for its dataset
        out[dspath] = out.get(dspath, []) + [path]
    return out, unavailable_paths, nondataset_paths
コード例 #17
ファイル: uninstall.py プロジェクト: glalteva/datalad
    def __call__(dataset=None, path=None, data_only=True, recursive=False):
        # Work out which dataset and which path an uninstall request refers
        # to, then dispatch on what the path is.  In the visible code, dataset,
        # submodule and directory targets end in NotImplementedError; an
        # annexed file with `data_only` set returns `path`; a file tracked in
        # git is handed to `ds.repo.git_remove()`; a path owned by a
        # subdataset is forwarded to that subdataset's `uninstall()`.
        #
        # ValueError is raised for missing dataset/path information, for a
        # dataset that is not installed and for paths that cannot be
        # uninstalled.
        #
        # NOTE(review): lines have evidently been dropped from this snippet
        # (dangling arguments, `except` without `try:`, missing `else:`).
        # Each gap is flagged in place; the code cannot run until they are
        # restored from the upstream project.

        # Note: copy logic from install to resolve dataset and path:
        # shortcut
        ds = dataset

        # wrap plain values into a Dataset instance
        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        if not path:
            if ds is None:
                # no dataset, no target location, nothing to do
                raise ValueError(
                    "insufficient information for uninstallation (needs at "
                    "least a dataset or a path")
        elif isinstance(path, list):
            # TODO: not sure. might be possible to deal with that list directly
            # NOTE(review): the list comprehension below has lost its callee,
            # its leading arguments and its closing bracket; `p` is unused in
            # what remains.
            return [
                                   recursive=recursive) for p in path

        # resolve the target location against the provided dataset
        if path is not None:
            path = resolve_path(path, ds)

        lgr.debug("Resolved uninstallation target: {0}".format(path))

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved target location (that is now guaranteed to
        # be specified
        if ds is None:
            # try to find a dataset at or above the installation target
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)
        assert (ds is not None)

        lgr.debug("Resolved target dataset for uninstallation: {0}".format(ds))

        if not ds.is_installed():
            if not path or path == ds.path:
                # we want to uninstall the dataset itself, which is not
                # installed => nothing to do
                # TODO: consider `data` option! is_installed currently only
                # checks for a repository
                # NOTE(review): the `lgr.info(` call below is cut off, and the
                # `raise` further down is cut off as well; going by the
                # comments, an `else:` separating the two cases is also gone.
                lgr.info("Dataset {0} not installed. Nothing to "
                # we want to uninstall something from a not installed dataset
                # Doesn't make sense, does it? => fail
                raise ValueError("Dataset {0} is not installed.".format(

        assert (ds.repo is not None)

        if not path or path == ds.path:
            # uninstall the dataset `ds`
            # TODO: what to consider?
            #   - whether it is a submodule of another dataset
            #   - `data_only` ?
            #   - `recursive`
            #   - what to return in what case (data_only)?
            raise NotImplementedError("TODO: Uninstall dataset %s" % ds.path)

        # needed by the logic below
        assert (isabs(path))

        # express the destination path relative to the root of this dataset
        relativepath = relpath(path, start=ds.path)
        # NOTE(review): `path` was just asserted to be absolute, so checking
        # it against `pardir` looks ineffective; presumably `relativepath`
        # was meant here -- confirm.
        if path.startswith(pardir):
            raise ValueError("uninstallation path outside dataset")

        # NOTE(review): the opening of the call that takes the message below
        # (presumably a logger call) is missing.
            "Resolved uninstallation target relative to dataset {0}: {1}".
            format(ds, relativepath))

        # figure out, what path actually is pointing to:
        if not exists(path):
            # nothing there, nothing to uninstall
            # NOTE(review): nothing stops execution after this message; an
            # early `return` may have been lost -- confirm.
            lgr.info("Nothing found to uninstall at %s" % path)

        if relativepath in ds.get_dataset_handles(recursive=True):
            # it's a submodule
            # --recursive required or implied?
            raise NotImplementedError("TODO: uninstall submodule %s from "
                                      "dataset %s" % (relativepath, ds.path))

        if isdir(path):
            # don't know what to do yet
            # in git vs. untracked?
            # recursive?
            raise NotImplementedError("TODO: uninstall directory %s from "
                                      "dataset %s" % (path, ds.path))

        # we know, it's an existing file
        # NOTE(review): the flags `_file_in_git` and
        # `_untracked_or_within_submodule` are set only in some branches below
        # and never initialised in the visible code, but both are read later.
        if isinstance(ds.repo, AnnexRepo):
            # NOTE(review): `try:` and the guarded call are missing before the
            # `except` clauses below.
            except FileInGitError:
                # file directly in git
                _file_in_git = True

            except FileNotInAnnexError:
                # either an untracked file in this dataset, or something that
                # also actually exists in the file system but could be part of
                # a subdataset
                _untracked_or_within_submodule = True

            # it's an annexed file
            # NOTE(review): the `raise` after `return path` is unreachable as
            # written; presumably an `else:` (and possibly the statement that
            # drops the content) is missing.
            if data_only:
                return path
                raise NotImplementedError("TODO: fully uninstall file %s "
                                          "(annex) from dataset %s" %
                                          (path, ds.path))
            # NOTE(review): indentation of the comment below suggests a lost
            # `else:` for the AnnexRepo check; a second `else:` presumably
            # separated the two flag assignments.
            # plain git repo
            if relativepath in ds.repo.get_indexed_files():
                # file directly in git
                _file_in_git = True
                # either an untracked file in this dataset, or something that
                # also actually exists in the file system but could be part of
                # a subdataset
                _untracked_or_within_submodule = True

        if _file_in_git:
            # NOTE(review): the `return` after the `raise` is unreachable as
            # written; presumably an `else:` is missing.
            if data_only:
                raise ValueError("%s is not a file handle. Removing its "
                                 "data only doesn't make sense." % path)
                return ds.repo.git_remove([relativepath])

        elif _untracked_or_within_submodule:
            subds = get_containing_subdataset(ds, relativepath)
            if ds.path != subds.path:
                # target path belongs to a subdataset, hand uninstallation
                # over to it
                # NOTE(review): the call below is cut off after its first
                # argument.
                return subds.uninstall(path=relpath(path, start=subds.path),

            # this must be an untracked/existing something
            # it wasn't installed, so we cannot uninstall it
            raise ValueError("Cannot uninstall %s" % path)
コード例 #18
ファイル: publish.py プロジェクト: glalteva/datalad
    def __call__(dataset=None, dest=None, path=None,
                 # Note: add remote currently disabled in publish
                 # dest_url=None, dest_pushurl=None,
                 with_data=None, recursive=False):
        # Publish a dataset, or a single annexed file inside it, to the
        # sibling (git remote) named by `dest`.  With `dest=None` the remote
        # configured for the active branch is looked up via `git config`.
        # Paths that belong to a subdataset are handed over to that
        # subdataset's `publish()`.
        #
        # Returns, as far as the visible code shows: the dataset (or a
        # `results` list when recursing), the published file path, or None
        # when nothing could be published.  Raises ValueError for an
        # uninstalled dataset or unknown sibling, RuntimeError when no default
        # target can be determined.
        #
        # NOTE(review): this snippet has lost lines (missing `try:`, call
        # heads, loop body and `else:` branches).  The gaps are flagged in
        # place and must be restored from the upstream project.

        # Note: add remote currently disabled in publish
        # if dest is None and (dest_url is not None
        #                        or dest_pushurl is not None):
        #     raise ValueError("""insufficient information for adding the
        #     destination as a sibling (needs at least a name)""")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if not path:
            path = curdir

        elif isinstance(path, list):
            # NOTE(review): only `recursive=` is visible in the call below and
            # `p` is never passed -- the leading arguments seem to be missing.
            return [Publish.__call__(
                    # Note: add remote currently disabled in publish
                    # dest_url=dest_url,
                    # dest_pushurl=dest_pushurl,
                    recursive=recursive) for p in path]

        # resolve the location against the provided dataset
        if path is not None:
            path = resolve_path(path, ds)

        lgr.info("Publishing {0}".format(path))

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved location (that is now guaranteed to
        # be specified
        if ds is None:
            # try to find a dataset at or above the location
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)
        lgr.debug("Resolved dataset for publication: {0}".format(ds))
        assert(ds is not None)

        # it might still be about a subdataset of ds:
        if path is not None:
            relativepath = relpath(path, start=ds.path)
            subds = get_containing_subdataset(ds, relativepath)
            # NOTE(review): the body below is indented two levels instead of
            # one, and the `subds.publish(` call is unterminated (remaining
            # arguments and closing parenthesis are missing).
            if subds.path != ds.path:
                    # path belongs to a subdataset; hand it over
                    lgr.debug("Hand over to submodule %s" % subds.path)
                    return subds.publish(dest=dest,
                                         path=relpath(path, start=subds.path),
                                         # Note: add remote currently disabled in publish
                                         # dest_url=dest_url,
                                         # dest_pushurl=dest_pushurl,

        # now, we know, we have to operate on ds. So, ds needs to be installed,
        # since we cannot publish anything from a not installed dataset,
        # can we?
        # (But may be just the existence of ds.repo is important here.)
        if not ds.is_installed():
            # NOTE(review): the `raise` below is unterminated; the rest of the
            # message is missing.
            raise ValueError("No installed dataset found at "
        assert(ds.repo is not None)

        # TODO: For now we can deal with a sibling(remote) name given by `dest`
        # only. Figure out, when to allow for passing a local path or URL
        # directly and what to do in that case.

        # Note: we need an upstream remote, if there's none given. We could
        # wait for git push to complain, but we need to explicitly figure it
        # out for pushing annex branch anyway and we might as well fail right
        # here.

        # keep original dest in case it's None for passing to recursive calls:
        dest_resolved = dest
        if dest is None:
            # check for tracking branch's remote:
            # NOTE(review): the `try:` line and the head of the call that
            # receives the argument list below are missing.
                std_out, std_err = \
                                                ["git", "config", "--get", "branch.{active_branch}.remote".format(active_branch=ds.repo.git_get_active_branch())],
            except CommandError as e:
                # exit code 1 with empty output is treated as "no value found"
                if e.code == 1 and e.stdout == "":
                    std_out = None
            # NOTE(review): assignment directly followed by `raise` at the
            # same indentation -- presumably an `else:` is missing in between.
            if std_out:
                dest_resolved = std_out.strip()
                # we have no remote given and no upstream => fail
                raise RuntimeError("No known default target for "
                                   "publication and none given.")

        # upstream branch needed for update (merge) and subsequent push,
        # in case there is no.
        set_upstream = False
        # NOTE(review): the `try:` line is missing and the call below is
        # incomplete (its head and remaining arguments are not visible).
            # Note: tracking branch actually defined by entry "merge"
            # PLUS entry "remote"
            std_out, std_err = \
                                            ["git", "config", "--get",
        except CommandError as e:
            if e.code == 1 and e.stdout == "":
                # no tracking branch yet:
                set_upstream = True

        # is `dest` an already known remote?
        if dest_resolved not in ds.repo.git_get_remotes():
            # unknown remote
            raise ValueError("No sibling '%s' found." % dest_resolved)

            # Note: add remote currently disabled in publish
            # if dest_url is None:
            #     raise ValueError("No sibling '%s' found. Provide `dest-url`"
            #                      " to register it." % dest_resolved)
            # lgr.info("Sibling %s unknown. Registering ...")
            # # Fill in URL-Template:
            # remote_url = dest_url.replace("%NAME", basename(ds.path))
            # # TODO: handle_name.replace("/", "-")) instead of basename()
            # #       - figure it out ;)
            # #       - either a datasets needs to discover superdatasets in
            # #         order to get it's relative path to provide a name
            # #       - or: We need a different approach on the templates
            # # Add the remote
            # ds.repo.git_remote_add(dest_resolved, remote_url)
            # if dest_pushurl:
            #     # Fill in template:
            #     remote_url_push = \
            #         dest_pushurl.replace("%NAME", basename(ds.path))
            #     # TODO: Different way of replacing %NAME; See above
            #     # Modify push url:
            #     ds.repo._git_custom_command('',
            #                                 ["git", "remote",
            #                                  "set-url",
            #                                  "--push", dest_resolved,
            #                                  remote_url_push])
            # lgr.info("Added sibling '%s'." % dest)
            # lgr.debug("Added remote '%s':\n %s (fetch)\n%s (push)." %
            #           (dest_resolved, remote_url,
            #            remote_url_push if dest_pushurl else remote_url))
        # Note: add remote currently disabled in publish
        # else:
        #     # known remote: parameters dest-url-* currently invalid.
        #     # This may change to adapt the existing remote.
        #     if dest_url:
        #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
        #                     "Ignoring dest-url %s." %
        #                     (dest_resolved, ds.path, dest_url))
        #     if dest_pushurl:
        #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
        #                     "Ignoring dest-pushurl %s." %
        #                     (dest_resolved, ds.path, dest_pushurl))

        # Figure out, what to publish
        if path is None or path == ds.path:
            # => publish the dataset itself
            # push local state:
            # TODO: Rework git_push in GitRepo
            cmd = ['git', 'push']
            # NOTE(review): the `if set_upstream:` below has no body;
            # presumably it added an upstream-setting option to `cmd` --
            # confirm.
            if set_upstream:
                # no upstream branch yet
            cmd += [dest_resolved, ds.repo.git_get_active_branch()]
            ds.repo._git_custom_command('', cmd)
            # push annex branch:
            if isinstance(ds.repo, AnnexRepo):
                ds.repo.git_push("%s +git-annex:git-annex" % dest_resolved)

            # TODO: if with_data is a shell pattern, we get a list, when called
            # from shell, right?
            # => adapt the following and check constraints to allow for that
            # the list concatenation below requires `with_data` to be a list
            if with_data:
                ds.repo._git_custom_command('', ["git", "annex", "copy"] +
                                            with_data + ["--to", dest_resolved])

            if recursive and ds.get_dataset_handles() != []:
                results = [ds]
                # Note: add remote currently disabled in publish
                # modify URL templates:
                # if dest_url:
                #     dest_url = dest_url.replace('%NAME', basename(ds.path) + '-%NAME')
                # if dest_pushurl:
                #     dest_pushurl = dest_pushurl.replace('%NAME', basename(ds.path) + '-%NAME')
                # NOTE(review): the loop below has no statements, only
                # comments; the per-subdataset publish call that presumably
                # fed `results` is missing.
                for subds in ds.get_dataset_handles():
                        # Note: use `dest` instead of `dest_resolved` in case
                        # dest was None, so subdatasets would use their default
                        # as well
                        # Note: add remote currently disabled in publish
                        # dest_url=dest_url,
                        # dest_pushurl=dest_pushurl,
                return results

            return ds

        elif exists(path):
            # At this point `path` is not referencing a (sub)dataset.
            # An annexed file is the only thing left, that `path` might be
            # validly pointing to. Anything else we can't handle currently.
            # NOTE(review): the `try:` line, the head of the copy call and the
            # body of the `except` clause are missing below.
            if isinstance(ds.repo, AnnexRepo):
                    if ds.repo.get_file_key(relativepath):
                        # file is in annex, publish it
                                                                  '--to=%s' % dest_resolved])
                        return path
                except (FileInGitError, FileNotInAnnexError):
            # `path` can't be published
            lgr.warning("Don't know how to publish %s." % path)
            return None

            # NOTE(review): the two lines below are unreachable as written;
            # presumably they formed a final `else:` branch of the
            # `if ... elif exists(path):` chain.
            # nothing to publish found
            lgr.warning("Nothing to publish found at %s." % path)
            return None