def test_GitRepo_get_toppath(repo, tempdir, repo2):
    reporeal = op.realpath(repo)
    eq_(GitRepo.get_toppath(repo, follow_up=False), reporeal)
    eq_(GitRepo.get_toppath(repo), repo)

    # Generate some nested directory
    GitRepo(repo2, create=True)
    repo2real = op.realpath(repo2)
    nested = op.join(repo2, "d1", "d2")
    os.makedirs(nested)
    eq_(GitRepo.get_toppath(nested, follow_up=False), repo2real)
    eq_(GitRepo.get_toppath(nested), repo2)

    # and if not under git, should return None
    eq_(GitRepo.get_toppath(tempdir), None)

def get_superdataset(self, datalad_only=False, topmost=False):
    """Get the dataset's superdataset

    Parameters
    ----------
    datalad_only : bool, optional
      Whether to consider only "datalad datasets" (with a non-None id),
      or (if False, which is the default) any git repository.
    topmost : bool, optional
      Return the topmost superdataset. Might then be the current one.

    Returns
    -------
    Dataset or None
    """
    # TODO: return only if self is a subdataset of the superdataset
    # (meaning: registered as a submodule)?
    path = self.path
    sds_path = path if topmost else None
    while path:
        # normalize the path after adding .. so we are guaranteed not to
        # follow into the original directory if path itself is a symlink
        par_path = normpath(opj(path, pardir))
        sds_path_ = GitRepo.get_toppath(par_path)
        if sds_path_ is None:
            # no more parents, use the previous find
            break
        if datalad_only:
            # test whether the current git is actually a dataset
            sds = Dataset(sds_path_)
            # can't use sds.id ATM since we just autogenerate an ID, see
            # https://github.com/datalad/datalad/issues/986
            # if not sds.id:
            if not sds.config.get('datalad.dataset.id', None):
                break
        # That was a good candidate
        sds_path = sds_path_
        path = par_path
        if not topmost:
            # no looping
            break
    if sds_path is None:
        # nothing was found
        return None
    # No postprocessing should be necessary now, since get_toppath
    # tries its best not to resolve symlinks
    return Dataset(sds_path)

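# Minimal usage sketch for get_superdataset() (not from the original
# source; the layout and paths below are hypothetical). The method walks
# upward from the dataset's own path, one parent at a time, via
# GitRepo.get_toppath().

# Assume /tmp/super is a git repository containing a registered
# subdataset at /tmp/super/sub:
sub = Dataset('/tmp/super/sub')
# nearest enclosing repository (any git repo by default):
sup = sub.get_superdataset()                      # -> Dataset('/tmp/super')
# walk all the way up; may return the starting dataset itself
# if nothing sits above it:
top = sub.get_superdataset(topmost=True)
# only consider repositories that carry a datalad dataset id:
dl_sup = sub.get_superdataset(datalad_only=True)
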
def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument, or
    based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is properly
      installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      Or raises an exception (InsufficientArgumentsError).
    """
    if dataset is not None and not isinstance(dataset, Dataset):
        dataset = Dataset(dataset)

    if dataset is None:  # possible scenario of cmdline calls
        dspath = GitRepo.get_toppath(getpwd())
        if not dspath:
            raise NoDatasetArgumentFound("No dataset found")
        dataset = Dataset(dspath)

    assert dataset is not None
    lgr.debug("Resolved dataset{0}: {1}".format(
        ' for {}'.format(purpose) if purpose else '',
        dataset))

    if check_installed and not dataset.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(dataset.path))

    return dataset

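# Usage sketch for require_dataset() (hypothetical paths; illustrates the
# call patterns implied by the docstring above, not taken from the
# original source):
ds = require_dataset('/data/myds', purpose='status reporting')
# With dataset=None the process working directory is searched upward;
# NoDatasetArgumentFound is raised if no repository encloses it:
ds = require_dataset(None, purpose='status reporting')
# Skip the installation check, e.g. for a dataset that is about to be
# created at that location:
ds = require_dataset('/data/newds', check_installed=False)
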
def __call__(url, dataset=None, recursive=False):

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)

    # if we have no dataset given, figure out which one we need to operate
    # on, based on the current working directory of the process:
    if ds is None:
        # try to find a dataset at or above PWD:
        dspath = GitRepo.get_toppath(getpwd())
        if dspath is None:
            raise ValueError("No dataset found at %s." % getpwd())
        ds = Dataset(dspath)
    assert ds is not None

    if not ds.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(ds.path))
    assert ds.repo is not None

    repos_to_update = [ds.repo]
    if recursive:
        repos_to_update += [GitRepo(opj(ds.path, sub_path))
                            for sub_path in
                            ds.get_dataset_handles(recursive=True)]

    for handle_repo in repos_to_update:
        parser = get_module_parser(handle_repo)
        for submodule_section in parser.sections():
            submodule_name = submodule_section[11:-1]
            parser.set_value(
                submodule_section,
                "url",
                url.replace("%NAME",
                            submodule_name.replace("/", "-")))

    return  # TODO: return value?

def __call__(dataset=None, name=None, url=None,
             pushurl=None, recursive=False, force=False):

    # TODO: Detect malformed URL and fail?

    if name is None or (url is None and pushurl is None):
        raise ValueError("insufficient information to add a sibling "
                         "(needs at least a dataset, a name, and a URL).")
    if url is None:
        url = pushurl

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)
    if ds is None:
        # try to find a dataset at or above CWD
        dspath = GitRepo.get_toppath(abspath(getpwd()))
        if dspath is None:
            raise ValueError("No dataset found at or above {0}.".format(
                getpwd()))
        ds = Dataset(dspath)
        lgr.debug("Resolved dataset for target creation: {0}".format(ds))

    assert ds is not None and name is not None and url is not None

    if not ds.is_installed():
        raise ValueError("Dataset {0} is not installed yet.".format(ds))
    assert ds.repo is not None

    ds_basename = basename(ds.path)
    repos = {ds_basename: {'repo': ds.repo}}
    if recursive:
        for subds in ds.get_dataset_handles(recursive=True):
            sub_path = opj(ds.path, subds)
            repos[ds_basename + '/' + subds] = {
                # repos[subds] = {
                'repo': GitRepo(sub_path, create=False)
            }

    # Note: This is copied from create_publication_target_sshwebserver
    # as it is the same logic as for its target_dir.
    # TODO: centralize and generalize template symbol handling
    # TODO: Check pushurl for template symbols too. Probably raise if only
    #       one of them uses such symbols

    replicate_local_structure = False
    if "%NAME" not in url:
        replicate_local_structure = True

    for repo in repos:
        if not replicate_local_structure:
            repos[repo]['url'] = url.replace("%NAME",
                                             repo.replace("/", "-"))
            if pushurl:
                repos[repo]['pushurl'] = pushurl.replace(
                    "%NAME", repo.replace("/", "-"))
        else:
            repos[repo]['url'] = url
            if pushurl:
                repos[repo]['pushurl'] = pushurl

            if repo != ds_basename:
                repos[repo]['url'] = _urljoin(repos[repo]['url'],
                                              repo[len(ds_basename) + 1:])
                if pushurl:
                    repos[repo]['pushurl'] = _urljoin(
                        repos[repo]['pushurl'],
                        repo[len(ds_basename) + 1:])

    # collect existing remotes:
    already_existing = list()
    conflicting = list()
    for repo in repos:
        if name in repos[repo]['repo'].git_get_remotes():
            already_existing.append(repo)
            lgr.debug("Remote '{0}' already exists "
                      "in '{1}'.".format(name, repo))

            existing_url = repos[repo]['repo'].git_get_remote_url(name)
            existing_pushurl = \
                repos[repo]['repo'].git_get_remote_url(name, push=True)

            if repos[repo]['url'].rstrip('/') != existing_url.rstrip('/') \
                    or (pushurl and existing_pushurl and
                        repos[repo]['pushurl'].rstrip('/') !=
                        existing_pushurl.rstrip('/')) \
                    or (pushurl and not existing_pushurl):
                conflicting.append(repo)

    if not force and conflicting:
        raise RuntimeError("Sibling '{0}' already exists with conflicting"
                           " URL for {1} dataset(s). {2}".format(
                               name, len(conflicting), conflicting))

    runner = Runner()
    successfully_added = list()
    for repo in repos:
        if repo in already_existing:
            if repo not in conflicting:
                lgr.debug("Skipping {0}. Nothing to do.".format(repo))
                continue
            # rewrite url
            cmd = ["git", "remote", "set-url", name, repos[repo]['url']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        else:
            # add the remote
            cmd = ["git", "remote", "add", name, repos[repo]['url']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        if pushurl:
            cmd = ["git", "remote", "set-url", "--push", name,
                   repos[repo]['pushurl']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        successfully_added.append(repo)

    return successfully_added

def __call__(dataset=None, path=None, source=None, recursive=False,
             add_data_to_git=False):
    lgr.debug("Installation attempt started")
    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)

    if isinstance(path, list):
        if not len(path):
            # normalize value to expected state when nothing was provided
            path = None
        elif len(path) == 1:
            # we can simply continue with the function as called with a
            # single argument
            path = path[0]
        else:
            lgr.debug("Installation of multiple targets was requested: "
                      "{0}".format(path))
            return [Install.__call__(
                    dataset=ds,
                    path=p,
                    source=source,
                    recursive=recursive) for p in path]

    # resolve the target location against the provided dataset
    if path is not None:
        # make sure it is not a URL, `resolve_path` cannot handle that
        if is_url(path):
            try:
                path = get_local_path_from_url(path)
                path = resolve_path(path, ds)
            except ValueError:
                # URL doesn't point to a local something
                pass
        else:
            path = resolve_path(path, ds)

    # any `path` argument that points to something local is now resolved
    # and no longer a URL

    # if we have no dataset given, figure out which one we need to operate
    # on, based on the resolved target location (that is now guaranteed to
    # be specified, but only if path isn't a URL (anymore) -> special
    # case, handled below
    if ds is None and path is not None and not is_url(path):
        # try to find a dataset at or above the installation target
        dspath = GitRepo.get_toppath(abspath(path))
        if dspath is None:
            # no top-level dataset found, use path as such
            dspath = path
        ds = Dataset(dspath)

    if ds is None and source is None and path is not None:
        # no dataset, no source
        # this could be a shortcut install call, where the first
        # arg identifies the source
        if is_url(path) or os.path.exists(path):
            # we have an actual URL -> this should be the source
            # OR
            # it is not a URL, but it exists locally
            lgr.debug(
                "Single argument given to install and no dataset found. "
                "Assuming the argument identifies a source location.")
            source = path
            path = None

    lgr.debug("Resolved installation target: {0}".format(path))

    if ds is None and path is None and source is not None:
        # we got nothing but a source. do something similar to git clone
        # and derive the path from the source and continue
        lgr.debug(
            "Neither dataset nor target installation path provided. "
            "Assuming installation of a remote dataset. "
            "Deriving destination path from given source {0}".format(
                source))
        ds = Dataset(_installationpath_from_url(source))

    if not path and ds is None:
        # no dataset, no target location, nothing to do
        raise InsufficientArgumentsError(
            "insufficient information for installation (needs at "
            "least a dataset or an installation path)")

    assert ds is not None

    lgr.debug("Resolved target dataset for installation: {0}".format(ds))

    vcs = ds.repo
    if vcs is None:
        # TODO check that a "ds.path" actually points to a TOPDIR
        # should be the case already, but maybe nevertheless check
        try:
            with swallow_logs():
                vcs = Install._get_new_vcs(ds, source, vcs)
        except GitCommandError:
            lgr.debug("Cannot retrieve from URL: {0}".format(source))
            # maybe source URL was missing a '/.git'
            if source and not source.rstrip('/').endswith('/.git'):
                source = '{0}/.git'.format(source.rstrip('/'))
                lgr.debug("Attempt to retrieve from URL: "
                          "{0}".format(source))
                vcs = Install._get_new_vcs(ds, source, vcs)
            else:
                lgr.debug("Unable to establish repository instance at: "
                          "{0}".format(ds.path))
                raise

    assert ds.repo  # is automagically re-evaluated in the .repo property

    runner = Runner()

    if path is None or path == ds.path:
        # if the goal was to install this dataset, we are done,
        # except for 'recursive'.
        # TODO: For now 'recursive' means just submodules.
        #       See --with-data vs. --recursive and figure it out
        if recursive:
            for sm in ds.repo.get_submodules():
                _install_subds_from_flexible_source(
                    ds, sm.path, sm.url, recursive=recursive)
        return ds

    # at this point this dataset is "installed", now we can test whether
    # to install something into the dataset

    # needed by the logic below
    assert isabs(path)

    # express the destination path relative to the root of this dataset
    relativepath = relpath(path, start=ds.path)
    if path.startswith(pardir):
        raise ValueError("installation path outside dataset")

    lgr.debug(
        "Resolved installation target relative to dataset {0}: {1}".format(
            ds, relativepath))

    # this dataset must already know everything necessary
    ###################################################
    # FLOW GUIDE
    #
    # at this point we know nothing about the
    # installation target
    ###################################################
    try:
        # it is simplest to let annex tell us what we are dealing with
        lgr.debug("Trying to fetch file %s using annex", relativepath)
        if not isinstance(vcs, AnnexRepo):
            assert isinstance(vcs, GitRepo)
            # FLOW GUIDE
            # this is not an annex repo, but we raise exceptions
            # to be able to treat them alike in the special case handling
            # below
            if not exists(path):
                raise IOError("path doesn't exist yet, might need "
                              "special handling")
            elif relativepath in vcs.get_indexed_files():
                # relativepath is in git
                raise FileInGitError("We need to handle it as known "
                                     "to git")
            else:
                raise FileNotInAnnexError("We don't have an annex repo "
                                          "here yet")
        if vcs.get_file_key(relativepath):
            # FLOW GUIDE EXIT POINT
            # this is an annex'ed file -> get it
            # TODO implement `copy --from` using `source`
            # TODO fail if `source` is something strange
            vcs.annex_get(relativepath)
            # return the absolute path to the installed file
            return path

    except FileInGitError:
        ###################################################
        # FLOW GUIDE
        #
        # `path` is either
        # - a file already checked into Git
        # - a known submodule
        ###################################################
        lgr.log(5, "FileInGitError logic")
        if source is not None:
            raise FileInGitError("File %s is already in git. Specifying "
                                 "source (%s) makes no sense"
                                 % (path, source))
        # file is checked into git directly -> nothing to do
        # OR this is a submodule of this dataset
        submodule = [sm for sm in ds.repo.get_submodules()
                     if sm.path == relativepath]
        if not len(submodule):
            # FLOW GUIDE EXIT POINT
            # this is a file in Git and no submodule, just return its path
            lgr.debug("Don't act, data already present in Git")
            return path
        elif len(submodule) > 1:
            raise RuntimeError(
                "more than one submodule registered at the same path?")
        submodule = submodule[0]

        # FLOW GUIDE EXIT POINT
        # we are dealing with a known submodule (i.e. `source`
        # doesn't matter) -> check it out
        lgr.debug("Install subdataset at: {0}".format(submodule.path))
        subds = _install_subds_from_flexible_source(
            ds, submodule.path, submodule.url, recursive=recursive)
        return subds

    except FileNotInAnnexError:
        ###################################################
        # FLOW GUIDE
        #
        # `path` is either
        # - content of a subdataset
        # - an untracked file in this dataset
        # - an entire untracked/unknown existing subdataset
        ###################################################
        lgr.log(5, "FileNotInAnnexError logic")
        subds = get_containing_subdataset(ds, relativepath)
        if ds.path != subds.path:
            # FLOW GUIDE EXIT POINT
            # target path belongs to a known subdataset, hand
            # installation over to it
            return subds.install(
                path=relpath(path, start=subds.path),
                source=source,
                recursive=recursive,
                add_data_to_git=add_data_to_git)

        # FLOW GUIDE
        # this must be an untracked/existing something, so either
        # - a file
        # - a directory
        # - an entire repository
        if exists(opj(path, '.git')):
            # FLOW GUIDE EXIT POINT
            # this is an existing repo and must be in-place turned into
            # a submodule of this dataset
            return _install_subds_inplace(
                ds, path, relativepath, source, runner)

        # FLOW GUIDE EXIT POINT
        # - untracked file or directory in this dataset
        if isdir(path) and not recursive:
            # this is a directory and we want --recursive for it
            raise ValueError(
                "installation of a directory requires the `recursive` "
                "flag")

        # few sanity checks
        if source and abspath(source) != path:
            raise ValueError(
                "installation target already exists, but `source` points "
                "to another location (target: '{0}', source: "
                "'{1}')".format(path, source))

        if not add_data_to_git and not isinstance(vcs, AnnexRepo):
            raise RuntimeError(
                "Trying to install file(s) into a dataset "
                "with a plain Git repository. First initialize annex, or "
                "provide override flag.")

        # switch `add` procedure between Git and Git-annex according to
        # flag
        if add_data_to_git:
            vcs.git_add(relativepath)
            added_files = resolve_path(relativepath, ds)
        else:
            # do a blunt `annex add`
            added_files = vcs.annex_add(relativepath)
            # return just the paths of the installed components
            if isinstance(added_files, list):
                added_files = [resolve_path(i['file'], ds)
                               for i in added_files]
            else:
                added_files = resolve_path(added_files['file'], ds)
        if added_files:
            return added_files
        else:
            return None

    except IOError:
        ###################################################
        # FLOW GUIDE
        #
        # more complicated special cases -- `path` is either
        # - a file/subdataset in a not yet initialized but known
        #   submodule
        # - an entire untracked/unknown existing subdataset
        # - non-existing content that should be installed from `source`
        ###################################################
        lgr.log(5, "IOError logic")
        # we can end up here in two cases ATM
        if (exists(path) or islink(path)) or source is None:
            # FLOW GUIDE
            # - target exists but this dataset's VCS rejects it,
            #   so it should be part of a subdataset
            # or
            # - target doesn't exist, but no source is given, so
            #   it could be a handle that is actually contained in
            #   a not yet installed subdataset
            subds = get_containing_subdataset(ds, relativepath)
            if ds.path != subds.path:
                # FLOW GUIDE
                # target path belongs to a subdataset, hand installation
                # over to it
                if not subds.is_installed():
                    # FLOW GUIDE
                    # we are dealing with a target in a not yet
                    # available but known subdataset -> install it first
                    ds.install(subds.path, recursive=recursive)
                return subds.install(
                    path=relpath(path, start=subds.path),
                    source=source,
                    recursive=recursive,
                    add_data_to_git=add_data_to_git)

            # FLOW GUIDE EXIT POINT
            raise InsufficientArgumentsError(
                "insufficient information for installation: the "
                "installation target {0} doesn't exist, isn't a "
                "known handle of dataset {1}, and no `source` "
                "information was provided.".format(path, ds))

        if not source:
            # FLOW GUIDE EXIT POINT
            raise InsufficientArgumentsError(
                "insufficient information for installation: the "
                "installation target {0} doesn't exist, isn't a "
                "known handle of dataset {1}, and no `source` "
                "information was provided.".format(path, ds))

        source_path = expandpath(source)
        if exists(source_path):
            # FLOW GUIDE EXIT POINT
            # this could be
            # - a local file
            # - a local directory
            # - a repository outside the dataset
            # we only want to support the last case of locally cloning
            # a repo -- fail otherwise
            if exists(opj(source_path, '.git')):
                return _install_subds_from_flexible_source(
                    ds, relativepath, source_path, recursive)

            raise ValueError(
                "installing individual local files or directories is not "
                "supported, copy/move them into the dataset first")

        # FLOW GUIDE
        # `source` is non-local, it could be:
        # - a repository
        # - a file
        # we have no further evidence, hence we need to try
        try:
            # FLOW GUIDE EXIT POINT
            # assume it is a dataset
            return _install_subds_from_flexible_source(
                ds, relativepath, source, recursive)
        except CommandError:
            # FLOW GUIDE EXIT POINT
            # apparently not a repo, assume it is a file url
            vcs.annex_addurl_to_file(relativepath, source)
            return path

def __call__(sshurl, target=None, target_dir=None,
             target_url=None, target_pushurl=None,
             dataset=None, recursive=False,
             existing='raise', shared=False):

    if sshurl is None:
        raise ValueError("insufficient information for target creation "
                         "(needs at least a dataset and an SSH URL).")

    if target is None and (target_url is not None
                           or target_pushurl is not None):
        raise ValueError("insufficient information for adding the target "
                         "as a sibling (needs at least a name)")

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)
    if ds is None:
        # try to find a dataset at or above CWD
        dspath = GitRepo.get_toppath(abspath(getpwd()))
        if dspath is None:
            raise ValueError("No dataset found at or above {0}.".format(
                getpwd()))
        ds = Dataset(dspath)
        lgr.debug("Resolved dataset for target creation: {0}".format(ds))

    assert ds is not None and sshurl is not None

    if not ds.is_installed():
        raise ValueError("Dataset {0} is not installed yet.".format(ds))
    assert ds.repo is not None

    # determine target parameters:
    parsed_target = urlparse(sshurl)
    host_name = parsed_target.netloc

    # TODO: Sufficient to fail on this condition?
    if not parsed_target.netloc:
        raise ValueError("Malformed URL: {0}".format(sshurl))

    if target_dir is None:
        if parsed_target.path:
            target_dir = parsed_target.path
        else:
            target_dir = '.'

    # TODO: centralize and generalize template symbol handling
    replicate_local_structure = False
    if "%NAME" not in target_dir:
        replicate_local_structure = True

    # collect datasets to use:
    datasets = dict()
    datasets[basename(ds.path)] = ds
    if recursive:
        for subds in ds.get_dataset_handles(recursive=True):
            sub_path = opj(ds.path, subds)
            # TODO: when enhancing Dataset/*Repo classes and therefore
            #       adapting to moved code, make a proper distinction
            #       between name and path of a submodule, which are
            #       technically different. This probably will become
            #       important on windows as well as whenever we want to
            #       allow for moved worktrees.
            datasets[basename(ds.path) + '/' + subds] = \
                Dataset(sub_path)

    # setup SSH Connection:
    # TODO: Make the entire setup a helper to use it when pushing via
    #       publish?

    # - build control master:
    from datalad.utils import assure_dir
    not_supported_on_windows("TODO")
    from os import geteuid  # Linux specific import
    var_run_user_datalad = "/var/run/user/%s/datalad" % geteuid()
    assure_dir(var_run_user_datalad)
    control_path = "%s/%s" % (var_run_user_datalad, host_name)
    control_path += ":%s" % parsed_target.port if parsed_target.port else ""

    # - start control master:
    cmd = "ssh -o ControlMaster=yes -o \"ControlPath=%s\" " \
          "-o ControlPersist=yes %s exit" % (control_path, host_name)
    lgr.debug("Try starting control master by calling:\n%s" % cmd)
    import subprocess
    proc = subprocess.Popen(cmd, shell=True)
    proc.communicate(input="\n")  # why the f.. this is necessary?

    runner = Runner()
    ssh_cmd = ["ssh", "-S", control_path, host_name]

    lgr.info("Creating target datasets ...")
    for current_dataset in datasets:
        if not replicate_local_structure:
            path = target_dir.replace("%NAME",
                                      current_dataset.replace("/", "-"))
        else:
            # TODO: opj depends on the local platform, not the remote one.
            #       Check how to deal with it. Does a windows ssh server
            #       accept posix paths? vice versa? Should the planned SSH
            #       class provide tools for this issue?
            path = normpath(opj(target_dir,
                                relpath(datasets[current_dataset].path,
                                        start=ds.path)))

        if path != '.':
            # check if target exists
            # TODO: Is this condition valid for != '.' only?
            path_exists = True
            cmd = ssh_cmd + ["ls", path]
            try:
                out, err = runner.run(cmd, expect_fail=True,
                                      expect_stderr=True)
            except CommandError as e:
                if "No such file or directory" in e.stderr and \
                        path in e.stderr:
                    path_exists = False
                else:
                    raise  # It's an unexpected failure here

            if path_exists:
                if existing == 'raise':
                    raise RuntimeError(
                        "Target directory %s already exists." % path)
                elif existing == 'skip':
                    continue
                elif existing == 'replace':
                    pass
                else:
                    raise ValueError("Do not know how to handle "
                                     "existing=%s" % repr(existing))

            cmd = ssh_cmd + ["mkdir", "-p", path]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Remotely creating target directory failed at "
                          "%s.\nError: %s" % (path, str(e)))
                continue

        # init git repo
        cmd = ssh_cmd + ["git", "-C", path, "init"]
        if shared:
            cmd.append("--shared=%s" % shared)
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.error("Remotely initializing git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, str(e)))
            continue

        # check git version on remote end:
        cmd = ssh_cmd + ["git", "version"]
        try:
            out, err = runner.run(cmd)
            git_version = out.lstrip("git version").strip()
            lgr.debug("Detected git version on server: %s" % git_version)
            if git_version < "2.4":
                lgr.error("Git version >= 2.4 needed to configure remote."
                          " Version detected on server: %s\nSkipping ..."
                          % git_version)
                continue
        except CommandError as e:
            lgr.warning("Failed to determine git version on remote.\n"
                        "Error: {0}\nTrying to configure anyway "
                        "...".format(e.message))

        # allow for pushing to checked out branch
        cmd = ssh_cmd + ["git", "-C", path, "config",
                         "receive.denyCurrentBranch", "updateInstead"]
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.warning("git config failed at remote location %s.\n"
                        "You will not be able to push to checked out "
                        "branch." % path)

        # enable post-update hook:
        cmd = ssh_cmd + ["mv",
                         opj(path, ".git/hooks/post-update.sample"),
                         opj(path, ".git/hooks/post-update")]
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.error("Failed to enable post update hook.\n"
                      "Error: %s" % e.message)

        # initially update server info "manually":
        cmd = ssh_cmd + ["git", "-C", path, "update-server-info"]
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.error("Failed to update server info.\n"
                      "Error: %s" % e.message)

    # stop controlmaster (close ssh connection):
    cmd = ["ssh", "-O", "stop", "-S", control_path, host_name]
    out, err = runner.run(cmd, expect_stderr=True)

    if target:
        # add the sibling(s):
        if target_url is None:
            target_url = sshurl
        if target_pushurl is None:
            target_pushurl = sshurl
        result_adding = AddSibling()(dataset=ds,
                                     name=target,
                                     url=target_url,
                                     pushurl=target_pushurl,
                                     recursive=recursive,
                                     force=existing in {'replace'})

def __call__(
        dataset=None,
        dest=None,
        path=None,
        # Note: add remote currently disabled in publish
        # dest_url=None,
        # dest_pushurl=None,
        with_data=None,
        recursive=False):

    # Note: add remote currently disabled in publish
    # if dest is None and (dest_url is not None
    #                      or dest_pushurl is not None):
    #     raise ValueError("""insufficient information for adding the
    #                      destination as a sibling (needs at least a
    #                      name)""")

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)

    if not path:
        path = curdir
    elif isinstance(path, list):
        return [Publish.__call__(
                dataset=ds,
                dest=dest,
                path=p,
                # Note: add remote currently disabled in publish
                # dest_url=dest_url,
                # dest_pushurl=dest_pushurl,
                with_data=with_data,
                recursive=recursive) for p in path]

    # resolve the location against the provided dataset
    if path is not None:
        path = resolve_path(path, ds)

    lgr.info("Publishing {0}".format(path))

    # if we have no dataset given, figure out which one we need to operate
    # on, based on the resolved location (that is now guaranteed to
    # be specified)
    if ds is None:
        # try to find a dataset at or above the location
        dspath = GitRepo.get_toppath(abspath(path))
        if dspath is None:
            # no top-level dataset found, use path as such
            dspath = path
        ds = Dataset(dspath)
    lgr.debug("Resolved dataset for publication: {0}".format(ds))
    assert ds is not None

    # it might still be about a subdataset of ds:
    if path is not None:
        relativepath = relpath(path, start=ds.path)
        subds = get_containing_subdataset(ds, relativepath)
        if subds.path != ds.path:
            # path belongs to a subdataset; hand it over
            lgr.debug("Hand over to submodule %s" % subds.path)
            return subds.publish(
                dest=dest,
                path=relpath(path, start=subds.path),
                # Note: add remote currently disabled in publish
                # dest_url=dest_url,
                # dest_pushurl=dest_pushurl,
                with_data=with_data,
                recursive=recursive)

    # now we know we have to operate on ds. So ds needs to be installed,
    # since we cannot publish anything from a not installed dataset,
    # can we?
    # (But maybe just the existence of ds.repo is important here.)
    if not ds.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(ds.path))
    assert ds.repo is not None

    # TODO: For now we can deal with a sibling(remote) name given by
    #       `dest` only. Figure out when to allow for passing a local path
    #       or URL directly and what to do in that case.

    # Note: we need an upstream remote, if there's none given. We could
    # wait for git push to complain, but we need to explicitly figure it
    # out for pushing the annex branch anyway, and we might as well fail
    # right here.

    # keep original dest in case it's None, for passing to recursive
    # calls:
    dest_resolved = dest
    if dest is None:
        # check for tracking branch's remote:
        try:
            std_out, std_err = \
                ds.repo._git_custom_command(
                    '',
                    ["git", "config", "--get",
                     "branch.{active_branch}.remote".format(
                         active_branch=ds.repo.git_get_active_branch())],
                    expect_fail=True)
        except CommandError as e:
            if e.code == 1 and e.stdout == "":
                std_out = None
            else:
                raise
        if std_out:
            dest_resolved = std_out.strip()
        else:
            # we have no remote given and no upstream => fail
            raise RuntimeError("No known default target for "
                               "publication and none given.")

    # upstream branch is needed for update (merge) and subsequent push,
    # in case there is none yet.
    set_upstream = False
    try:
        # Note: tracking branch is actually defined by entry "merge"
        #       PLUS entry "remote"
        std_out, std_err = \
            ds.repo._git_custom_command(
                '',
                ["git", "config", "--get",
                 "branch.{active_branch}.merge".format(
                     active_branch=ds.repo.git_get_active_branch())],
                expect_fail=True)
    except CommandError as e:
        if e.code == 1 and e.stdout == "":
            # no tracking branch yet:
            set_upstream = True
        else:
            raise

    # is `dest` an already known remote?
    if dest_resolved not in ds.repo.git_get_remotes():
        # unknown remote
        raise ValueError("No sibling '%s' found." % dest_resolved)

        # Note: add remote currently disabled in publish
        # if dest_url is None:
        #     raise ValueError("No sibling '%s' found. Provide `dest-url`"
        #                      " to register it." % dest_resolved)
        # lgr.info("Sibling %s unknown. Registering ...")
        #
        # # Fill in URL-Template:
        # remote_url = dest_url.replace("%NAME", basename(ds.path))
        # # TODO: handle_name.replace("/", "-")) instead of basename()
        # #       - figure it out ;)
        # #       - either a dataset needs to discover superdatasets in
        # #         order to get its relative path to provide a name
        # #       - or: We need a different approach on the templates
        #
        # # Add the remote
        # ds.repo.git_remote_add(dest_resolved, remote_url)
        # if dest_pushurl:
        #     # Fill in template:
        #     remote_url_push = \
        #         dest_pushurl.replace("%NAME", basename(ds.path))
        #     # TODO: Different way of replacing %NAME; See above
        #
        #     # Modify push url:
        #     ds.repo._git_custom_command('',
        #                                 ["git", "remote",
        #                                  "set-url",
        #                                  "--push", dest_resolved,
        #                                  remote_url_push])
        # lgr.info("Added sibling '%s'." % dest)
        # lgr.debug("Added remote '%s':\n %s (fetch)\n%s (push)." %
        #           (dest_resolved, remote_url,
        #            remote_url_push if dest_pushurl else remote_url))
    # Note: add remote currently disabled in publish
    # else:
    #     # known remote: parameters dest-url-* currently invalid.
    #     # This may change to adapt the existing remote.
    #     if dest_url:
    #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
    #                     "Ignoring dest-url %s." %
    #                     (dest_resolved, ds.path, dest_url))
    #     if dest_pushurl:
    #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
    #                     "Ignoring dest-pushurl %s." %
    #                     (dest_resolved, ds.path, dest_pushurl))

    # Figure out what to publish
    if path is None or path == ds.path:
        # => publish the dataset itself
        # push local state:
        # TODO: Rework git_push in GitRepo
        cmd = ['git', 'push']
        if set_upstream:
            # no upstream branch yet
            cmd.append("--set-upstream")
        cmd += [dest_resolved, ds.repo.git_get_active_branch()]
        ds.repo._git_custom_command('', cmd)
        # push annex branch:
        if isinstance(ds.repo, AnnexRepo):
            ds.repo.git_push("%s +git-annex:git-annex" % dest_resolved)

        # TODO: if with_data is a shell pattern, we get a list when called
        #       from shell, right?
        #       => adapt the following and check constraints to allow for
        #       that
        if with_data:
            ds.repo._git_custom_command('',
                                        ["git", "annex", "copy"] +
                                        with_data +
                                        ["--to", dest_resolved])

        if recursive and ds.get_dataset_handles() != []:
            results = [ds]
            # Note: add remote currently disabled in publish
            # # modify URL templates:
            # if dest_url:
            #     dest_url = dest_url.replace(
            #         '%NAME', basename(ds.path) + '-%NAME')
            # if dest_pushurl:
            #     dest_pushurl = dest_pushurl.replace(
            #         '%NAME', basename(ds.path) + '-%NAME')
            for subds in ds.get_dataset_handles():
                results.append(
                    Dataset(opj(ds.path, subds)).publish(
                        dest=dest,
                        # Note: use `dest` instead of `dest_resolved` in
                        # case dest was None, so subdatasets would use
                        # their default as well
                        # Note: add remote currently disabled in publish
                        # dest_url=dest_url,
                        # dest_pushurl=dest_pushurl,
                        with_data=with_data,
                        recursive=recursive))
            return results

        return ds

    elif exists(path):
        # At this point `path` is not referencing a (sub)dataset.
        # An annexed file is the only thing left that `path` might be
        # validly pointing to. Anything else we can't handle currently.
        if isinstance(ds.repo, AnnexRepo):
            try:
                if ds.repo.get_file_key(relativepath):
                    # file is in annex, publish it
                    ds.repo._run_annex_command(
                        'copy',
                        annex_options=[path, '--to=%s' % dest_resolved])
                    return path
            except (FileInGitError, FileNotInAnnexError):
                pass
        # `path` can't be published
        lgr.warning("Don't know how to publish %s." % path)
        return None

    else:
        # nothing to publish found
        lgr.warning("Nothing to publish found at %s." % path)
        return None

def __call__(name=None, dataset=None,
             merge=False, recursive=False, fetch_all=False,
             reobtain_data=False):
    """
    """
    # TODO: Is there an 'update filehandle' similar to install and
    # publish? What does it mean?

    if reobtain_data:
        # TODO: properly define what to do
        raise NotImplementedError("TODO: Option '--reobtain-data' not "
                                  "implemented yet.")

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)

    # if we have no dataset given, figure out which one we need to operate
    # on, based on the current working directory of the process:
    if ds is None:
        # try to find a dataset at or above PWD:
        dspath = GitRepo.get_toppath(getpwd())
        if dspath is None:
            raise ValueError("No dataset found at %s." % getpwd())
        ds = Dataset(dspath)
    assert ds is not None

    if not ds.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(ds.path))
    assert ds.repo is not None

    repos_to_update = [ds.repo]
    if recursive:
        repos_to_update += [GitRepo(opj(ds.path, sub_path))
                            for sub_path in
                            ds.get_dataset_handles(recursive=True)]

    for repo in repos_to_update:
        # get all remotes:
        remotes = repo.git_get_remotes()
        if name and name not in remotes:
            lgr.warning("'%s' not known to dataset %s.\nSkipping" %
                        (name, repo.path))
            continue

        # Currently '--merge' works for a single remote only:
        # TODO: - condition still incomplete
        #       - We can merge if a remote was given or there is a
        #         tracking branch
        #       - we also can fetch all remotes independently of whether
        #         or not we merge a certain remote
        if not name and len(remotes) > 1 and merge:
            lgr.debug("Found multiple remotes:\n%s" % remotes)
            raise NotImplementedError("No merge strategy for multiple "
                                      "remotes implemented yet.")

        lgr.info("Updating handle '%s' ..." % repo.path)

        # fetch remote(s):
        repo.git_fetch(name if name else '',
                       "--all" if fetch_all else '')

        # if it is an annex and there is a tracking branch, and we didn't
        # fetch the entire remote anyway, explicitly fetch the git-annex
        # branch:
        # TODO: Is this logic correct? Shouldn't we fetch git-annex from
        # `name` if there is any (or if there is no tracking branch but we
        # have a `name`)?
        if knows_annex(repo.path) and not fetch_all:
            # check for tracking branch's remote:
            try:
                std_out, std_err = \
                    repo._git_custom_command(
                        '',
                        ["git", "config", "--get",
                         "branch.{active_branch}.remote".format(
                             active_branch=repo.git_get_active_branch())])
            except CommandError as e:
                if e.code == 1 and e.stdout == "":
                    std_out = None
                else:
                    raise
            if std_out:  # we have a "tracking remote"
                repo.git_fetch("%s git-annex" % std_out.strip())

        # merge:
        if merge:
            lgr.info("Applying changes from tracking branch...")
            cmd_list = ["git", "pull"]
            if name:
                cmd_list.append(name)
                # branch needed, if not the default remote
                # => TODO: use default remote/tracking branch to compare
                #          (see above, where git-annex is fetched)
                # => TODO: allow for passing a branch
                #          (or more general refspec?)
                #          For now, just use the same name
                cmd_list.append(repo.git_get_active_branch())

            out, err = repo._git_custom_command('', cmd_list)
            lgr.info(out)
            if knows_annex(repo.path):
                # annex-apply:
                lgr.info("Updating annex ...")
                out, err = repo._git_custom_command(
                    '', ["git", "annex", "merge"])
                lgr.info(out)

def __call__(
        path,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        reckless=False,
        git_opts=None,
        annex_opts=None,
        annex_get_opts=None,
        jobs=None,
        verbose=False,
        # internal -- instead of returning 'get'ed items, return final
        # content_by_ds, unavailable_paths. To be used by the call from
        # Install.__call__ and done so to avoid creating another reusable
        # function which would need to duplicate all this heavy list of
        # kwargs
        _return_datasets=False):
    # IMPLEMENTATION CONCEPT:
    #
    # 1. turn all input paths into absolute paths
    # 2. Sort the world into existing handles and the rest
    # 3. Try to locate missing handles (obtain subdatasets along the way)
    # 4. Expand into subdatasets with recursion enabled (potentially
    #    obtain even more subdatasets)
    # 5. Shoot the info of which handles to get in each subdataset to
    #    git-annex, once at the very end

    # TODO: consider allowing an empty `path` argument, as with other
    # commands, to indicate CWD
    resolved_paths, dataset_path = get_normalized_path_arguments(
        path, dataset, default=None)
    if not resolved_paths:
        raise InsufficientArgumentsError(
            "`get` needs at least one path as argument")

    # sort paths into the respective datasets
    dir_lookup = {}
    content_by_ds, unavailable_paths, nondataset_paths = \
        get_paths_by_dataset(resolved_paths,
                             recursive=recursive,
                             recursion_limit=recursion_limit,
                             dir_lookup=dir_lookup)
    lgr.debug(
        "Found %i existing dataset(s) to get content in "
        "and %d unavailable paths",
        len(content_by_ds), len(unavailable_paths))
    # IMPORTANT NOTE re `content_by_ds`
    # each key is a subdataset that we need to get something in
    # if the value[0] is the subdataset's path, we want all of it
    # if the value[0] == curdir, we just installed it as part of
    # resolving file handles and we did not say anything but "give
    # me the dataset handle"

    # explore the unknown
    for path in sorted(unavailable_paths):
        # how close can we get?
        dspath = GitRepo.get_toppath(path)
        if dspath is None:
            # nothing we can do for this path
            continue
        ds = Dataset(dspath)
        # must always yield a dataset -- we sorted out the ones outside
        # any dataset at the very top
        assert ds.is_installed()
        # now actually obtain whatever is necessary to get to this path
        containing_ds = install_necessary_subdatasets(ds, path, reckless)
        if containing_ds.path != ds.path:
            lgr.debug(
                "Installed %s to fulfill request for content for "
                "path %s", containing_ds, path)
            # mark the resulting dataset as auto-installed
            if containing_ds.path == path:
                # we had to get the entire dataset, not something within
                # mark that it just appeared
                content_by_ds[path] = [curdir]
            else:
                # we need to get content within
                content_by_ds[path] = [path]

    if recursive and not recursion_limit == 'existing':
        # obtain any subdatasets underneath the paths given inside the
        # subdatasets that we know already exist
        # unless we do not want recursion into not-yet-installed datasets
        for subdspath in sorted(content_by_ds.keys()):
            for content_path in content_by_ds[subdspath]:
                if not isdir(content_path):
                    # a non-directory cannot have content underneath
                    continue
                subds = Dataset(subdspath)
                lgr.info(
                    "Obtaining %s %s recursively",
                    subds,
                    ("underneath %s" % content_path
                     if subds.path != content_path
                     else ""))
                cbysubds = _recursive_install_subds_underneath(
                    subds,
                    # `content_path` was explicitly given as input
                    # we count recursions from the input, hence we
                    # can start with the full number
                    recursion_limit,
                    reckless,
                    # protect against magic marker misinterpretation
                    # only relevant for _get, hence replace here
                    start=content_path if content_path != curdir
                    else None)
                # gets file content for all freshly installed subdatasets
                content_by_ds.update(cbysubds)

    ## we have now done everything we could to obtain whatever subdataset
    ## to get something on the file system for previously unavailable
    ## paths -- check and sort one last time
    content_by_ds, unavailable_paths, nondataset_paths2 = \
        get_paths_by_dataset(unavailable_paths,
                             recursive=recursive,
                             recursion_limit=recursion_limit,
                             out=content_by_ds,
                             dir_lookup=dir_lookup)
    nondataset_paths.extend(nondataset_paths2)
    if nondataset_paths:
        lgr.warning("ignored paths that do not belong to any dataset: %s",
                    nondataset_paths)
    if unavailable_paths:
        lgr.warning('ignored non-existing paths: %s', unavailable_paths)

    # hand over to git-annex
    results = list(chain.from_iterable(
        _get(content_by_ds,
             refpath=dataset_path,
             source=source,
             jobs=jobs,
             get_data=get_data)))
    # ??? should we, in the _return_datasets case, just return both
    # content_by_ds and unavailable_paths, so we provide output that is
    # consistent across runs, and then issue a similar
    # IncompleteResultsError outside?
    if unavailable_paths:  # and likely other error flags
        if _return_datasets:
            results = sorted(
                set(content_by_ds).difference(unavailable_paths))
        raise IncompleteResultsError(results, failed=unavailable_paths)
    else:
        return sorted(content_by_ds) if _return_datasets else results

def __call__(dataset=None, path=None, data_only=True, recursive=False):

    # Note: copy logic from install to resolve dataset and path:

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)

    if not path:
        if ds is None:
            # no dataset, no target location, nothing to do
            raise ValueError(
                "insufficient information for uninstallation (needs at "
                "least a dataset or a path)")
    elif isinstance(path, list):
        # TODO: not sure. might be possible to deal with that list
        # directly
        return [Uninstall.__call__(
                dataset=ds,
                path=p,
                data_only=data_only,
                recursive=recursive) for p in path]

    # resolve the target location against the provided dataset
    if path is not None:
        path = resolve_path(path, ds)

    lgr.debug("Resolved uninstallation target: {0}".format(path))

    # if we have no dataset given, figure out which one we need to operate
    # on, based on the resolved target location (that is now guaranteed to
    # be specified)
    if ds is None:
        # try to find a dataset at or above the installation target
        dspath = GitRepo.get_toppath(abspath(path))
        if dspath is None:
            # no top-level dataset found, use path as such
            dspath = path
        ds = Dataset(dspath)
    assert ds is not None

    lgr.debug("Resolved target dataset for uninstallation: "
              "{0}".format(ds))

    if not ds.is_installed():
        if not path or path == ds.path:
            # we want to uninstall the dataset itself, which is not
            # installed => nothing to do
            # TODO: consider `data` option! is_installed currently only
            # checks for a repository
            lgr.info("Dataset {0} not installed. Nothing to "
                     "do.".format(ds.path))
            return
        else:
            # we want to uninstall something from a not installed dataset
            # Doesn't make sense, does it? => fail
            raise ValueError("Dataset {0} is not "
                             "installed.".format(ds.path))

    assert ds.repo is not None

    if not path or path == ds.path:
        # uninstall the dataset `ds`
        # TODO: what to consider?
        #       - whether it is a submodule of another dataset
        #       - `data_only` ?
        #       - `recursive`
        #       - what to return in what case (data_only)?
        raise NotImplementedError("TODO: Uninstall dataset %s" % ds.path)

    # needed by the logic below
    assert isabs(path)

    # express the destination path relative to the root of this dataset
    relativepath = relpath(path, start=ds.path)
    if path.startswith(pardir):
        raise ValueError("uninstallation path outside dataset")

    lgr.debug("Resolved uninstallation target relative to dataset "
              "{0}: {1}".format(ds, relativepath))

    # figure out what path is actually pointing to:
    if not exists(path):
        # nothing there, nothing to uninstall
        lgr.info("Nothing found to uninstall at %s" % path)
        return

    if relativepath in ds.get_dataset_handles(recursive=True):
        # it's a submodule
        # --recursive required or implied?
        raise NotImplementedError("TODO: uninstall submodule %s from "
                                  "dataset %s" % (relativepath, ds.path))

    if isdir(path):
        # don't know what to do yet
        # in git vs. untracked?
        # recursive?
        raise NotImplementedError("TODO: uninstall directory %s from "
                                  "dataset %s" % (path, ds.path))

    # we know it's an existing file
    # flags set by the inspection below; initialized here so the checks
    # further down are always well-defined
    _file_in_git = False
    _untracked_or_within_submodule = False
    if isinstance(ds.repo, AnnexRepo):
        try:
            ds.repo.get_file_key(relativepath)
        except FileInGitError:
            # file directly in git
            _file_in_git = True
        except FileNotInAnnexError:
            # either an untracked file in this dataset, or something that
            # also actually exists in the file system but could be part of
            # a subdataset
            _untracked_or_within_submodule = True
        else:
            # it's an annexed file
            if data_only:
                ds.repo.annex_drop([path])
                return path
            else:
                raise NotImplementedError("TODO: fully uninstall file %s "
                                          "(annex) from dataset %s"
                                          % (path, ds.path))
    else:
        # plain git repo
        if relativepath in ds.repo.get_indexed_files():
            # file directly in git
            _file_in_git = True
        else:
            # either an untracked file in this dataset, or something that
            # also actually exists in the file system but could be part of
            # a subdataset
            _untracked_or_within_submodule = True

    if _file_in_git:
        if data_only:
            raise ValueError("%s is not a file handle. Removing its "
                             "data only doesn't make sense." % path)
        else:
            return ds.repo.git_remove([relativepath])

    elif _untracked_or_within_submodule:
        subds = get_containing_subdataset(ds, relativepath)
        if ds.path != subds.path:
            # target path belongs to a subdataset, hand uninstallation
            # over to it
            return subds.uninstall(
                path=relpath(path, start=subds.path),
                data_only=data_only,
                recursive=recursive)

        # this must be an untracked/existing something
        # it wasn't installed, so we cannot uninstall it
        raise ValueError("Cannot uninstall %s" % path)

def get_paths_by_dataset(paths, recursive=False, recursion_limit=None,
                         out=None, dir_lookup=None):
    """Sort a list of paths per dataset they are contained in.

    Any paths that are not part of a dataset, or presently unavailable,
    are reported.

    Parameters
    ----------
    paths : sequence
      A sequence of path specifications to sort.
    recursive : bool
      Flag whether to report subdatasets under any of the given paths.
    recursion_limit :
      Depth constraint for recursion. See `Dataset.get_subdatasets()` for
      more information.
    out : dict or None
      By default a new output dictionary is created, however an existing
      one can be provided via this argument to enable incremental
      processing.
    dir_lookup : dict or None
      Optional lookup cache that maps paths to previously determined
      datasets. This can speed up repeated processing.

    Returns
    -------
    Tuple(dict, list, list)
      Dict of `existing dataset path`: `path` mappings, the list of
      currently non-existing paths (possibly matching currently
      uninstalled datasets), and any paths that are not part of any
      dataset.
    """
    # sort paths into the respective datasets
    if dir_lookup is None:
        dir_lookup = {}
    if out is None:
        out = {}
    # paths that don't exist (yet)
    unavailable_paths = []
    nondataset_paths = []
    for path in paths:
        if not lexists(path):
            # not there yet, impossible to say which ds it will actually
            # be in, if any
            unavailable_paths.append(path)
            continue
        # the path exists in some shape or form
        if isdir(path):
            # this could contain all types of additional content
            d = path
        else:
            # for everything else we are interested in the container
            d = dirname(path)
            if not d:
                d = curdir
        # this could be `None` if there is no git repo
        dspath = dir_lookup.get(d, GitRepo.get_toppath(d))
        dir_lookup[d] = dspath
        if not dspath:
            nondataset_paths.append(path)
            continue
        if isdir(path):
            ds = Dataset(dspath)
            # we need to double-check that this is not a subdataset mount
            # point, in which case get_toppath() would point to the parent
            smpath = ds.get_containing_subdataset(
                path, recursion_limit=1).path
            if smpath != dspath:
                # fix entry
                dir_lookup[d] = smpath
                # submodule still needs to be obtained
                unavailable_paths.append(path)
                continue
            if recursive:
                # make sure we get everything relevant in all _checked
                # out_ subdatasets; obtaining of previously unavailable
                # subdatasets is done elsewhere
                subs = ds.get_subdatasets(fulfilled=True,
                                          recursive=recursive,
                                          recursion_limit=recursion_limit)
                for sub in subs:
                    subdspath = opj(dspath, sub)
                    if subdspath.startswith(_with_sep(path)):
                        # this subdataset is underneath the search path
                        # we want all of it
                        # be careful not to overwrite anything, in case
                        # this subdataset has been processed before
                        out[subdspath] = out.get(subdspath, [subdspath])
        out[dspath] = out.get(dspath, []) + [path]
    return out, unavailable_paths, nondataset_paths

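# Usage sketch for get_paths_by_dataset() (hypothetical layout, not from
# the original source): assume /ds is an installed dataset with a
# checked-out subdataset at /ds/sub1, while /scratch/x.txt lives outside
# any git repository.
paths = ['/ds/file1.dat', '/ds/sub1/file2.dat', '/scratch/x.txt']
by_ds, unavailable, nondataset = get_paths_by_dataset(paths)
# by_ds -> {'/ds': ['/ds/file1.dat'],
#           '/ds/sub1': ['/ds/sub1/file2.dat']}
# nondataset -> ['/scratch/x.txt'] (outside any repository)
# unavailable -> [] (all example paths exist on the filesystem)
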
def __call__(dataset=None, path=None, data_only=True, recursive=False):

    # Note: copy logic from install to resolve dataset and path:

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)

    if not path:
        if ds is None:
            # no dataset, no target location, nothing to do
            raise ValueError(
                "insufficient information for uninstallation (needs at "
                "least a dataset or a path)")
    elif isinstance(path, list):
        # TODO: not sure. might be possible to deal with that list directly
        return [Uninstall.__call__(
                dataset=ds,
                path=p,
                data_only=data_only,
                recursive=recursive) for p in path]

    # resolve the target location against the provided dataset
    if path is not None:
        path = resolve_path(path, ds)

    lgr.debug("Resolved uninstallation target: {0}".format(path))

    # if we have no dataset given, figure out which one we need to operate
    # on, based on the resolved target location (that is now guaranteed to
    # be specified)
    if ds is None:
        # try to find a dataset at or above the installation target
        dspath = GitRepo.get_toppath(abspath(path))
        if dspath is None:
            # no top-level dataset found, use path as such
            dspath = path
        ds = Dataset(dspath)
    assert (ds is not None)

    lgr.debug("Resolved target dataset for uninstallation: {0}".format(ds))

    if not ds.is_installed():
        if not path or path == ds.path:
            # we want to uninstall the dataset itself, which is not
            # installed => nothing to do
            # TODO: consider `data` option! is_installed currently only
            # checks for a repository
            lgr.info("Dataset {0} not installed. Nothing to "
                     "do.".format(ds.path))
            return
        else:
            # we want to uninstall something from a not installed dataset.
            # Doesn't make sense, does it? => fail
            raise ValueError("Dataset {0} is not installed.".format(
                ds.path))

    assert (ds.repo is not None)

    if not path or path == ds.path:
        # uninstall the dataset `ds`
        # TODO: what to consider?
        #   - whether it is a submodule of another dataset
        #   - `data_only` ?
        #   - `recursive`
        #   - what to return in what case (data_only)?
        raise NotImplementedError("TODO: Uninstall dataset %s" % ds.path)

    # needed by the logic below
    assert (isabs(path))

    # express the destination path relative to the root of this dataset
    relativepath = relpath(path, start=ds.path)
    # Note: the original checked `path` here, which is absolute and could
    # never start with pardir; `relativepath` is the intended subject
    if relativepath.startswith(pardir):
        raise ValueError("uninstallation path outside dataset")

    lgr.debug(
        "Resolved uninstallation target relative to dataset {0}: {1}".format(
            ds, relativepath))

    # figure out what the path is actually pointing to:
    if not exists(path):
        # nothing there, nothing to uninstall
        lgr.info("Nothing found to uninstall at %s" % path)
        return

    if relativepath in ds.get_dataset_handles(recursive=True):
        # it's a submodule
        # --recursive required or implied?
        raise NotImplementedError("TODO: uninstall submodule %s from "
                                  "dataset %s" % (relativepath, ds.path))

    if isdir(path):
        # don't know what to do yet
        # in git vs. untracked?
        # recursive?
raise NotImplementedError("TODO: uninstall directory %s from " "dataset %s" % (path, ds.path)) # we know, it's an existing file if isinstance(ds.repo, AnnexRepo): try: ds.repo.get_file_key(relativepath) except FileInGitError: # file directly in git _file_in_git = True except FileNotInAnnexError: # either an untracked file in this dataset, or something that # also actually exists in the file system but could be part of # a subdataset _untracked_or_within_submodule = True # it's an annexed file if data_only: ds.repo.annex_drop([path]) return path else: raise NotImplementedError("TODO: fully uninstall file %s " "(annex) from dataset %s" % (path, ds.path)) else: # plain git repo if relativepath in ds.repo.get_indexed_files(): # file directly in git _file_in_git = True else: # either an untracked file in this dataset, or something that # also actually exists in the file system but could be part of # a subdataset _untracked_or_within_submodule = True if _file_in_git: if data_only: raise ValueError("%s is not a file handle. Removing its " "data only doesn't make sense." % path) else: return ds.repo.git_remove([relativepath]) elif _untracked_or_within_submodule: subds = get_containing_subdataset(ds, relativepath) if ds.path != subds.path: # target path belongs to a subdataset, hand uninstallation # over to it return subds.uninstall(path=relpath(path, start=subds.path), data_only=data_only, recursive=recursive) # this must be an untracked/existing something # it wasn't installed, so we cannot uninstall it raise ValueError("Cannot uninstall %s" % path)
def __call__(dataset=None, dest=None, path=None,
             # Note: add remote currently disabled in publish
             # dest_url=None, dest_pushurl=None,
             with_data=None, recursive=False):

    # Note: add remote currently disabled in publish
    # if dest is None and (dest_url is not None
    #                      or dest_pushurl is not None):
    #     raise ValueError("""insufficient information for adding the
    #     destination as a sibling (needs at least a name)""")

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)
    if not path:
        path = curdir
    elif isinstance(path, list):
        return [Publish.__call__(
                dataset=ds,
                dest=dest,
                path=p,
                # Note: add remote currently disabled in publish
                # dest_url=dest_url,
                # dest_pushurl=dest_pushurl,
                with_data=with_data,
                recursive=recursive) for p in path]

    # resolve the location against the provided dataset
    if path is not None:
        path = resolve_path(path, ds)

    lgr.info("Publishing {0}".format(path))

    # if we have no dataset given, figure out which one we need to operate
    # on, based on the resolved location (that is now guaranteed to
    # be specified)
    if ds is None:
        # try to find a dataset at or above the location
        dspath = GitRepo.get_toppath(abspath(path))
        if dspath is None:
            # no top-level dataset found, use path as such
            dspath = path
        ds = Dataset(dspath)
    lgr.debug("Resolved dataset for publication: {0}".format(ds))
    assert(ds is not None)

    # it might still be about a subdataset of ds:
    if path is not None:
        relativepath = relpath(path, start=ds.path)
        subds = get_containing_subdataset(ds, relativepath)
        if subds.path != ds.path:
            # path belongs to a subdataset; hand it over
            lgr.debug("Hand over to submodule %s" % subds.path)
            return subds.publish(
                dest=dest,
                path=relpath(path, start=subds.path),
                # Note: add remote currently disabled in publish
                # dest_url=dest_url,
                # dest_pushurl=dest_pushurl,
                with_data=with_data,
                recursive=recursive)

    # now we know we have to operate on ds. So ds needs to be installed,
    # since we cannot publish anything from a not installed dataset,
    # can we?
    # (But maybe just the existence of ds.repo is important here.)
    if not ds.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(ds.path))
    assert(ds.repo is not None)

    # TODO: For now we can deal with a sibling (remote) name given by
    # `dest` only. Figure out when to allow for passing a local path or
    # URL directly and what to do in that case.

    # Note: we need an upstream remote, if there's none given. We could
    # wait for git push to complain, but we need to explicitly figure it
    # out for pushing the annex branch anyway, so we might as well fail
    # right here.

    # keep original dest in case it's None, for passing to recursive calls:
    dest_resolved = dest
    if dest is None:
        # check for tracking branch's remote:
        try:
            std_out, std_err = ds.repo._git_custom_command(
                '',
                ["git", "config", "--get",
                 "branch.{active_branch}.remote".format(
                     active_branch=ds.repo.git_get_active_branch())],
                expect_fail=True)
        except CommandError as e:
            if e.code == 1 and e.stdout == "":
                std_out = None
            else:
                raise
        if std_out:
            dest_resolved = std_out.strip()
        else:
            # we have no remote given and no upstream => fail
            raise RuntimeError("No known default target for "
                               "publication and none given.")

    # upstream branch needed for update (merge) and subsequent push,
    # in case there is none.
    set_upstream = False
    try:
        # Note: a tracking branch is actually defined by the entry
        # "merge" PLUS the entry "remote"
        std_out, std_err = ds.repo._git_custom_command(
            '',
            ["git", "config", "--get",
             "branch.{active_branch}.merge".format(
                 active_branch=ds.repo.git_get_active_branch())],
            expect_fail=True)
    except CommandError as e:
        if e.code == 1 and e.stdout == "":
            # no tracking branch yet:
            set_upstream = True
        else:
            raise

    # is `dest` an already known remote?
    if dest_resolved not in ds.repo.git_get_remotes():
        # unknown remote
        raise ValueError("No sibling '%s' found." % dest_resolved)
        # Note: add remote currently disabled in publish
        # if dest_url is None:
        #     raise ValueError("No sibling '%s' found. Provide `dest-url`"
        #                      " to register it." % dest_resolved)
        # lgr.info("Sibling %s unknown. Registering ...")
        #
        # # Fill in URL-Template:
        # remote_url = dest_url.replace("%NAME", basename(ds.path))
        # # TODO: handle_name.replace("/", "-")) instead of basename()
        # #       - figure it out ;)
        # #       - either a dataset needs to discover superdatasets in
        # #         order to get its relative path to provide a name
        # #       - or: We need a different approach on the templates
        #
        # # Add the remote
        # ds.repo.git_remote_add(dest_resolved, remote_url)
        # if dest_pushurl:
        #     # Fill in template:
        #     remote_url_push = \
        #         dest_pushurl.replace("%NAME", basename(ds.path))
        #     # TODO: Different way of replacing %NAME; See above
        #
        #     # Modify push url:
        #     ds.repo._git_custom_command(
        #         '',
        #         ["git", "remote", "set-url", "--push", dest_resolved,
        #          remote_url_push])
        # lgr.info("Added sibling '%s'." % dest)
        # lgr.debug("Added remote '%s':\n %s (fetch)\n%s (push)." %
        #           (dest_resolved, remote_url,
        #            remote_url_push if dest_pushurl else remote_url))
    # Note: add remote currently disabled in publish
    # else:
    #     # known remote: parameters dest-url-* currently invalid.
    #     # This may change to adapt the existing remote.
    #     if dest_url:
    #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
    #                     "Ignoring dest-url %s." %
    #                     (dest_resolved, ds.path, dest_url))
    #     if dest_pushurl:
    #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
    #                     "Ignoring dest-pushurl %s." %
    #                     (dest_resolved, ds.path, dest_pushurl))

    # Figure out what to publish
    if path is None or path == ds.path:
        # => publish the dataset itself
        # push local state:
        # TODO: Rework git_push in GitRepo
        cmd = ['git', 'push']
        if set_upstream:
            # no upstream branch yet
            cmd.append("--set-upstream")
        cmd += [dest_resolved, ds.repo.git_get_active_branch()]
        ds.repo._git_custom_command('', cmd)

        # push annex branch:
        if isinstance(ds.repo, AnnexRepo):
            ds.repo.git_push("%s +git-annex:git-annex" % dest_resolved)

        # TODO: if with_data is a shell pattern, we get a list, when
        # called from shell, right?
        # => adapt the following and check constraints to allow for that
        if with_data:
            ds.repo._git_custom_command(
                '', ["git", "annex", "copy"] + with_data +
                ["--to", dest_resolved])

        if recursive and ds.get_dataset_handles() != []:
            results = [ds]
            # Note: add remote currently disabled in publish
            # # modify URL templates:
            # if dest_url:
            #     dest_url = dest_url.replace(
            #         '%NAME', basename(ds.path) + '-%NAME')
            # if dest_pushurl:
            #     dest_pushurl = dest_pushurl.replace(
            #         '%NAME', basename(ds.path) + '-%NAME')
            for subds in ds.get_dataset_handles():
                results.append(Dataset(opj(ds.path, subds)).publish(
                    dest=dest,
                    # Note: use `dest` instead of `dest_resolved` in case
                    # dest was None, so subdatasets would use their
                    # default as well
                    # Note: add remote currently disabled in publish
                    # dest_url=dest_url,
                    # dest_pushurl=dest_pushurl,
                    with_data=with_data,
                    recursive=recursive))
            return results

        return ds

    elif exists(path):
        # At this point `path` is not referencing a (sub)dataset.
        # An annexed file is the only thing left that `path` might be
        # validly pointing to. Anything else we can't handle currently.
        if isinstance(ds.repo, AnnexRepo):
            try:
                if ds.repo.get_file_key(relativepath):
                    # file is in annex, publish it
                    ds.repo._run_annex_command(
                        'copy',
                        annex_options=[path, '--to=%s' % dest_resolved])
                    return path
            except (FileInGitError, FileNotInAnnexError):
                pass
        # `path` can't be published
        lgr.warning("Don't know how to publish %s." % path)
        return None

    else:
        # nothing to publish found
        lgr.warning("Nothing to publish found at %s." % path)
        return None
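# A minimal, hypothetical usage sketch for the publish entry point above
# (dataset path and sibling name are made up; the sibling must already be
# a configured git remote, otherwise the ValueError above is raised):
#
#   Publish.__call__(dataset='/tmp/ds', dest='target', recursive=True)
#   # -> pushes the active branch (and the git-annex branch for annex
#   #    repos) to 'target', then publishes all registered subdatasets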