def test_crawl_api_recursive(get_subdatasets_, run_pipeline_,
                             load_pipeline_from_config_,
                             get_repo_pipeline_script_path_,
                             get_lofilename_, chpwd_, tdir):
    pwd = getpwd()
    with chpwd(tdir):
        output, stats = crawl(recursive=True)
    assert_equal(pwd, getpwd())
    if external_versions['mock'] < '1.0.1':
        raise SkipTest(
            "needs a more recent mock which throws exceptions in side_effects")
    assert_equal(output, [[]] * 4 + [None])  # for now output is just a list of outputs
    assert_equal(
        stats,
        ActivityStats(datasets_crawled=5, datasets_crawl_failed=1))
    # nothing was done but we got it crawled
    chpwd_.assert_has_calls(
        [
            call(None),
            call('path1'),
            call('path1/path1_1'),
            call('path2'),
        ],
        any_order=True)
    assert_equal(
        list(find_files('.*', tdir, exclude_vcs=False)),
        [_path_(tdir, 'some.log')])  # no files were generated besides the log
def test_getpwd_basic():
    pwd = getpwd()
    ok_(isabs(pwd))
    eq_(os.getcwd(), abspath(pwd))

    # that we do not chdir anywhere if None provided
    with patch('os.chdir') as oschdir:
        with chpwd(None):
            eq_(getpwd(), pwd)
        assert_false(oschdir.called)
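# Illustrative usage sketch (not part of the test suite above): it assumes
# getpwd()/chpwd() are importable from datalad.utils, as in the tests.
# chpwd() used as a context manager restores the previous working directory
# on exit, while getpwd() preserves a symlinked $PWD instead of resolving it.
import tempfile
from datalad.utils import chpwd, getpwd

def _demo_chpwd_roundtrip():
    before = getpwd()
    with chpwd(tempfile.gettempdir()):
        inside = getpwd()          # now the temporary directory
    assert getpwd() == before      # previous working directory restored
    return inside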
def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument, or
    based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is
      properly installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      If a dataset could be determined.

    Raises
    ------
    NoDatasetFound
      If no dataset could be determined.
    """
    if dataset is not None and not isinstance(dataset, Dataset):
        dataset = Dataset(dataset)

    if dataset is None:  # possible scenario of cmdline calls
        dspath = get_dataset_root(getpwd())
        if not dspath:
            raise NoDatasetFound(
                "No dataset found at '{}'{}. Specify a dataset to work with "
                "by providing its path via the `dataset` option, "
                "or change the current working directory to be in a "
                "dataset.".format(
                    getpwd(),
                    " for the purpose {!r}".format(purpose) if purpose else ''))
        dataset = Dataset(dspath)

    assert (dataset is not None)
    lgr.debug(u"Resolved dataset%s: %s",
              u' to {}'.format(purpose) if purpose else '',
              dataset.path)

    if check_installed and not dataset.is_installed():
        raise NoDatasetFound(f"No installed dataset found at {dataset.path}")

    return dataset
def test_getpwd_symlink(tdir):
    sdir = opj(tdir, 's1')
    pwd_orig = getpwd()
    Path(sdir).symlink_to(Path('.'))
    s1dir = opj(sdir, 's1')
    s2dir = opj(sdir, 's2')
    try:
        chpwd(sdir)
        pwd = getpwd()
        eq_(pwd, sdir)
        chpwd('s1')
        eq_(getpwd(), s1dir)
        chpwd('.')
        eq_(getpwd(), s1dir)
        chpwd('..')
        eq_(getpwd(), sdir)
    finally:
        chpwd(pwd_orig)

    # test context handler way of use
    with chpwd(s1dir):
        eq_(getpwd(), s1dir)
    eq_(getpwd(), pwd_orig)

    assert_false(exists(s2dir))
    with assert_raises(OSError):
        with chpwd(s2dir):
            pass
    with chpwd(s2dir, mkdir=True):
        ok_(exists(s2dir))
        eq_(getpwd(), s2dir)
def _test_assert_Xwd_unchanged(func):
    orig_cwd = os.getcwd()
    orig_pwd = getpwd()

    @assert_cwd_unchanged
    def do_chdir():
        func(os.pardir)

    with assert_raises(AssertionError) as cm:
        do_chdir()

    eq_(orig_cwd, os.getcwd(),
        "assert_cwd_unchanged didn't return us back to cwd %s" % orig_cwd)
    eq_(orig_pwd, getpwd(),
        "assert_cwd_unchanged didn't return us back to pwd %s" % orig_pwd)
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info

    if hasattr(pl, 'dist'):
        dist = pl.dist()
    else:
        # Python 3.8 removed .dist but recommended "distro" is slow, so we
        # try it only if needed
        try:
            import distro
            dist = distro.linux_distribution(full_distribution_name=False)
        except ImportError:
            lgr.info(
                "Please install 'distro' package to obtain distribution information"
            )
            dist = tuple()
        except Exception as exc:
            lgr.warning(
                "No distribution information will be provided since 'distro' "
                "fails to import/run: %s", exc_str(exc)
            )
            dist = tuple()

    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([_t2s(dist),
                                  _t2s(pl.mac_ver()),
                                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
    }
def test_install_subds_from_another_remote(topdir):
    # https://github.com/datalad/datalad/issues/1905
    from datalad.support.network import PathRI
    with chpwd(topdir):
        origin_ = 'origin'
        clone1_ = 'clone1'
        clone2_ = 'clone2'

        origin = create(origin_, no_annex=True)
        clone1 = install(source=origin, path=clone1_)
        # print("Initial clone")
        clone1.create_sibling('ssh://localhost%s/%s'
                              % (PathRI(getpwd()).posixpath, clone2_),
                              name=clone2_)
        # print("Creating clone2")
        clone1.publish(to=clone2_)

        clone2 = Dataset(clone2_)
        # print("Initiating subdataset")
        clone2.create('subds1')

        # print("Updating")
        clone1.update(merge=True, sibling=clone2_)
        # print("Installing within updated dataset -- should be able to install from clone2")
        clone1.install('subds1')
def path_is_under(values, path=None):
    """Whether a given path is a subdirectory of any of the given test values

    Parameters
    ----------
    values : sequence or dict
      Paths to be tested against. This can be a dictionary in which case
      all values from all keys will be tested against.
    path : path or None
      Test path. If None is given, the process' working directory is used.

    Returns
    -------
    bool
    """
    if path is None:
        from datalad.utils import getpwd
        path = getpwd()
    if isinstance(values, dict):
        values = chain(*values.values())
    for p in values:
        rpath = relpath(p, start=path)
        if rpath == curdir \
                or rpath == pardir \
                or set(psplit(rpath)) == {pardir}:
            # first match is enough
            return True
    return False
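# Hypothetical usage sketch for path_is_under() (not part of the original
# code); it assumes path_is_under() and getpwd() are importable from
# datalad.utils and uses POSIX-style paths purely for illustration.
from datalad.utils import getpwd, path_is_under

def _demo_path_is_under():
    # the working directory trivially lies under (equals) itself -> True
    assert path_is_under([getpwd()])
    # '/tmp' is not located under '/nonexistent' -> False
    assert not path_is_under(['/nonexistent'], path='/tmp')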
def resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any explicit path (absolute or relative) is returned as an absolute path.
    In case of an explicit relative path, the current working directory is
    used as a reference. Any non-explicit relative path is resolved against
    a dataset location, i.e. considered relative to the location of the
    dataset. If no dataset is provided, the current working directory is
    used.

    Returns
    -------
    Absolute path
    """
    # first make sure it's actually a valid path:
    from datalad.support.network import PathRI
    if not isinstance(RI(path), PathRI):
        raise ValueError("%s is not a valid path" % path)

    path = expandpath(path, force_absolute=False)
    if is_explicit_path(path):
        # normalize path consistently between two (explicit and implicit) cases
        return dlabspath(path, norm=True)

    # no dataset given, use CWD as reference
    # note: abspath would disregard symlink in CWD
    top_path = getpwd() \
        if ds is None else ds.path if isinstance(ds, Dataset) else ds
    return normpath(opj(top_path, path))
def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info
    from datalad.utils import get_linux_distribution
    try:
        dist = get_linux_distribution()
    except Exception as exc:
        ce = CapturedException(exc)
        lgr.warning("Failed to get distribution information: %s", ce)
        dist = tuple()

    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([_t2s(dist),
                                  _t2s(pl.mac_ver()),
                                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
        'filesystem': {
            l: _get_fs_type(l, p) for l, p in [
                ('CWD', Path.cwd()),
                ('TMP', Path(tempfile.gettempdir())),
                ('HOME', Path.home())]
        }
    }
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info
    from datalad.utils import get_linux_distribution
    try:
        dist = get_linux_distribution()
    except Exception as exc:
        lgr.warning("Failed to get distribution information: %s", exc_str(exc))
        dist = tuple()

    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([_t2s(dist),
                                  _t2s(pl.mac_ver()),
                                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
    }
def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import os
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length
def get_command_pwds(dataset):
    """Return the current directory for the dataset.

    Parameters
    ----------
    dataset : Dataset

    Returns
    -------
    A tuple, where the first item is the absolute path of the pwd and the
    second is the pwd relative to the dataset's path.
    """
    # Follow path resolution logic described in gh-3435.
    if isinstance(dataset, Dataset):
        # Paths relative to dataset.
        pwd = dataset.path
        rel_pwd = op.curdir
    else:
        # Paths relative to current directory.
        pwd = getpwd()
        # Pass pwd to get_dataset_root instead of os.path.curdir to handle
        # repos whose leading paths have a symlinked directory (see the
        # TMPDIR="/var/tmp/sym link" test case).
        if not dataset:
            dataset = get_dataset_root(pwd)

        if dataset:
            rel_pwd = relpath(pwd, dataset)
        else:
            rel_pwd = pwd  # and leave handling to caller
    return pwd, rel_pwd
def get_command_pwds(dataset):
    """Return the directory for the command.

    Parameters
    ----------
    dataset : Dataset

    Returns
    -------
    A tuple, where the first item is the absolute path of the pwd and the
    second is the pwd relative to the dataset's path.
    """
    if dataset:
        pwd = dataset.path
        rel_pwd = curdir
    else:
        # act on the whole dataset if nothing else was specified
        # Follow our generic semantic that if dataset is specified,
        # paths are relative to it, if not -- relative to pwd
        pwd = getpwd()
        # Pass pwd to get_dataset_root instead of os.path.curdir to handle
        # repos whose leading paths have a symlinked directory (see the
        # TMPDIR="/var/tmp/sym link" test case).
        dataset = get_dataset_root(pwd)
        if dataset:
            rel_pwd = relpath(pwd, dataset)
        else:
            rel_pwd = pwd  # and leave handling on deciding either we
                           # deal with it or crash to checks below
    return pwd, rel_pwd
def get_command_pwds(dataset):
    """Return the directory for the command.

    Parameters
    ----------
    dataset : Dataset

    Returns
    -------
    A tuple, where the first item is the absolute path of the pwd and the
    second is the pwd relative to the dataset's path.
    """
    if dataset:
        pwd = dataset.path
        rel_pwd = curdir
    else:
        # act on the whole dataset if nothing else was specified
        dataset = get_dataset_root(curdir)
        # Follow our generic semantic that if dataset is specified,
        # paths are relative to it, if not -- relative to pwd
        pwd = getpwd()
        if dataset:
            rel_pwd = relpath(pwd, dataset)
        else:
            rel_pwd = pwd  # and leave handling on deciding either we
                           # deal with it or crash to checks below
    return pwd, rel_pwd
def _test_assert_Xwd_unchanged_ok_chdir(func):
    # Test that we are not masking out other "more important" exceptions
    orig_cwd = os.getcwd()
    orig_pwd = getpwd()

    @assert_cwd_unchanged(ok_to_chdir=True)
    def do_chdir_value_error():
        func(os.pardir)
        return "a value"

    with swallow_logs() as cml:
        eq_(do_chdir_value_error(), "a value")
        eq_(orig_cwd, os.getcwd(),
            "assert_cwd_unchanged didn't return us back to cwd %s" % orig_cwd)
        eq_(orig_pwd, getpwd(),
            "assert_cwd_unchanged didn't return us back to cwd %s" % orig_pwd)
        assert_not_in("Mitigating and changing back", cml.out)
def test_run_under_dir(d):
    orig_pwd = getpwd()
    orig_cwd = os.getcwd()

    @run_under_dir(d)
    def f(arg, kwarg=None):
        eq_(arg, 1)
        eq_(kwarg, 2)
        eq_(getpwd(), d)

    f(1, 2)
    eq_(getpwd(), orig_pwd)
    eq_(os.getcwd(), orig_cwd)

    # and if fails
    assert_raises(AssertionError, f, 1, 3)
    eq_(getpwd(), orig_pwd)
    eq_(os.getcwd(), orig_cwd)
def test_normalize_path(git_path):
    gr = GitRepo(git_path)

    # cwd is currently outside the repo, so any relative path
    # should be interpreted as relative to `annex_path`
    assert_raises(FileNotInRepositoryError, _normalize_path, gr.path, getpwd())

    result = _normalize_path(gr.path, "testfile")
    eq_(result, "testfile", "_normalize_path() returned %s" % result)

    # result = _normalize_path(gr.path, op.join('.', 'testfile'))
    # eq_(result, "testfile", "_normalize_path() returned %s" % result)
    #
    # result = _normalize_path(gr.path, op.join('testdir', '..', 'testfile'))
    # eq_(result, "testfile", "_normalize_path() returned %s" % result)
    # Note: By now, normpath within normalize_paths() is disabled, therefore
    # disable these tests.

    result = _normalize_path(gr.path, op.join('testdir', 'testfile'))
    eq_(result, op.join("testdir", "testfile"), "_normalize_path() returned %s" % result)

    result = _normalize_path(gr.path, op.join(git_path, "testfile"))
    eq_(result, "testfile", "_normalize_path() returned %s" % result)

    # now we are inside, so
    # OLD PHILOSOPHY: relative paths are relative to cwd and have
    # to be converted to be relative to annex_path
    # NEW PHILOSOPHY: still relative to repo! unless starts with . (curdir) or .. (pardir)
    with chpwd(op.join(git_path, 'd1', 'd2')):

        result = _normalize_path(gr.path, "testfile")
        eq_(result, 'testfile', "_normalize_path() returned %s" % result)

        # if not joined as directory name but just a prefix to the filename, should
        # behave correctly
        for d in (op.curdir, op.pardir):
            result = _normalize_path(gr.path, d + "testfile")
            eq_(result, d + 'testfile', "_normalize_path() returned %s" % result)

        result = _normalize_path(gr.path, op.join(op.curdir, "testfile"))
        eq_(result, op.join('d1', 'd2', 'testfile'),
            "_normalize_path() returned %s" % result)

        result = _normalize_path(gr.path, op.join(op.pardir, 'testfile'))
        eq_(result, op.join('d1', 'testfile'),
            "_normalize_path() returned %s" % result)

        assert_raises(FileNotInRepositoryError, _normalize_path,
                      gr.path, op.join(git_path, '..', 'outside'))

        result = _normalize_path(gr.path, op.join(git_path, 'd1', 'testfile'))
        eq_(result, op.join('d1', 'testfile'),
            "_normalize_path() returned %s" % result)
def test_rev_resolve_path(path):
    if op.realpath(path) != path:
        raise SkipTest("Test assumptions require non-symlinked parent paths")
    # initially ran into on OSX https://github.com/datalad/datalad/issues/2406
    opath = op.join(path, "origin")
    os.makedirs(opath)
    if not on_windows:
        lpath = op.join(path, "linked")
        os.symlink('origin', lpath)

    ds_global = Dataset(path)
    # path resolution of absolute paths is not influenced by symlinks
    # ignore the linked path on windows, it is not a symlink in the POSIX sense
    for d in (opath,) if on_windows else (opath, lpath):
        ds_local = Dataset(d)
        # no symlink resolution
        eq_(str(rev_resolve_path(d)), d)
        with chpwd(d):
            # be aware: knows about cwd, but this CWD has symlinks resolved
            eq_(str(rev_resolve_path(d).cwd()), opath)
            # using pathlib's `resolve()` will resolve any
            # symlinks
            # also resolve `opath`, as on old windows systems the path might
            # come in crippled (e.g. C:\Users\MIKE~1/...)
            # and comparison would fails unjustified
            eq_(rev_resolve_path('.').resolve(), ut.Path(opath).resolve())
            # no norming, but absolute paths, without resolving links
            eq_(rev_resolve_path('.'), ut.Path(d))
            eq_(str(rev_resolve_path('.')), d)

            eq_(str(rev_resolve_path(op.join(os.curdir, 'bu'), ds=ds_global)),
                op.join(d, 'bu'))
            eq_(str(rev_resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
                op.join(ds_global.path, 'bu'))

        # resolve against a dataset
        eq_(str(rev_resolve_path('bu', ds=ds_local)), op.join(d, 'bu'))
        eq_(str(rev_resolve_path('bu', ds=ds_global)), op.join(path, 'bu'))
        # but paths outside the dataset are left untouched
        eq_(str(rev_resolve_path(op.join(os.curdir, 'bu'), ds=ds_global)),
            op.join(getpwd(), 'bu'))
        eq_(str(rev_resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
            op.normpath(op.join(getpwd(), os.pardir, 'bu')))
def test_GitRepo_files_decorator():

    class testclass(object):
        def __init__(self):
            self.path = opj('some', 'where')

        # TODO
        # yoh: logic is alien to me below why to have two since both look identical!
        @normalize_paths
        def decorated_many(self, files):
            return files

        @normalize_paths
        def decorated_one(self, file_):
            return file_

    test_instance = testclass()

    # When a single file passed -- single path returned
    obscure_filename = get_most_obscure_supported_name()
    file_to_test = opj(test_instance.path, 'deep', obscure_filename)
    # file doesn't exist
    eq_(test_instance.decorated_one(file_to_test),
        _normalize_path(test_instance.path, file_to_test))
    eq_(test_instance.decorated_one(file_to_test),
        _normalize_path(test_instance.path, file_to_test))

    file_to_test = obscure_filename
    eq_(test_instance.decorated_many(file_to_test),
        _normalize_path(test_instance.path, file_to_test))
    eq_(test_instance.decorated_one(file_to_test),
        _normalize_path(test_instance.path, file_to_test))

    file_to_test = opj(obscure_filename, 'beyond', 'obscure')
    eq_(test_instance.decorated_many(file_to_test),
        _normalize_path(test_instance.path, file_to_test))

    file_to_test = opj(getpwd(), 'somewhere', 'else', obscure_filename)
    assert_raises(FileNotInRepositoryError, test_instance.decorated_many,
                  file_to_test)

    # If a list passed -- list returned
    files_to_test = ['now', opj('a list', 'of'), 'paths']
    expect = []
    for item in files_to_test:
        expect.append(_normalize_path(test_instance.path, item))
    eq_(test_instance.decorated_many(files_to_test), expect)
    eq_(test_instance.decorated_many(''), [])

    assert_raises(ValueError, test_instance.decorated_many, 1)
    assert_raises(ValueError, test_instance.decorated_one, 1)
def test_GitRepo_files_decorator():

    class testclass(object):
        def __init__(self):
            self.path = op.join('some', 'where')

        # TODO
        # yoh: logic is alien to me below why to have two since both look identical!
        @normalize_paths
        def decorated_many(self, files):
            return files

        @normalize_paths
        def decorated_one(self, file_):
            return file_

    test_instance = testclass()

    # When a single file passed -- single path returned
    obscure_filename = get_most_obscure_supported_name()
    file_to_test = op.join(test_instance.path, 'deep', obscure_filename)
    # file doesn't exist
    eq_(test_instance.decorated_one(file_to_test),
        _normalize_path(test_instance.path, file_to_test))
    eq_(test_instance.decorated_one(file_to_test),
        _normalize_path(test_instance.path, file_to_test))

    file_to_test = obscure_filename
    eq_(test_instance.decorated_many(file_to_test),
        _normalize_path(test_instance.path, file_to_test))
    eq_(test_instance.decorated_one(file_to_test),
        _normalize_path(test_instance.path, file_to_test))

    file_to_test = op.join(obscure_filename, 'beyond', 'obscure')
    eq_(test_instance.decorated_many(file_to_test),
        _normalize_path(test_instance.path, file_to_test))

    file_to_test = op.join(getpwd(), 'somewhere', 'else', obscure_filename)
    assert_raises(FileNotInRepositoryError, test_instance.decorated_many,
                  file_to_test)

    # If a list passed -- list returned
    files_to_test = ['now', op.join('a list', 'of'), 'paths']
    expect = []
    for item in files_to_test:
        expect.append(_normalize_path(test_instance.path, item))
    eq_(test_instance.decorated_many(files_to_test), expect)
    eq_(test_instance.decorated_many(''), [])

    assert_raises(ValueError, test_instance.decorated_many, 1)
    assert_raises(ValueError, test_instance.decorated_one, 1)
def test_getpwd_change_mode(tdir):
    from datalad import utils
    if utils._pwd_mode != 'PWD':
        raise SkipTest("Makes sense to be tested only in PWD mode, "
                       "but we seems to be beyond that already")
    # The evil plain chdir call
    os.chdir(tdir)
    # Just testing the logic of switching to cwd mode and issuing a warning
    with swallow_logs(new_level=logging.DEBUG) as cml:
        pwd = getpwd()
        eq_(pwd, str(Path(pwd).resolve()))  # might have symlinks, thus realpath
        assert_in("symlinks in the paths will be resolved", cml.out)
    eq_(utils._pwd_mode, 'cwd')
def __call__(url, dataset=None, recursive=False):
    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)

    # if we have no dataset given, figure out which one we need to operate
    # on, based on the current working directory of the process:
    if ds is None:
        # try to find a dataset at or above PWD:
        dspath = GitRepo.get_toppath(getpwd())
        if dspath is None:
            raise ValueError("No dataset found at %s." % getpwd())
        ds = Dataset(dspath)
    assert (ds is not None)

    if not ds.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(ds.path))
    assert (ds.repo is not None)

    repos_to_update = [ds.repo]
    if recursive:
        repos_to_update += [
            GitRepo(opj(ds.path, sub_path))
            for sub_path in ds.get_dataset_handles(recursive=True)
        ]
    for handle_repo in repos_to_update:
        parser = get_module_parser(handle_repo)
        for submodule_section in parser.sections():
            submodule_name = submodule_section[11:-1]
            parser.set_value(
                submodule_section,
                "url",
                url.replace("%NAME", submodule_name.replace("/", "-")))

    return  # TODO: return value?
def __init__(self, patterns, pwd=None, expand=False):
    self.pwd = pwd or getpwd()
    self._expand = expand
    if patterns is None:
        self._maybe_dot = []
        self._paths = {"patterns": []}
    else:
        patterns, dots = partition(patterns, lambda i: i.strip() == ".")
        self._maybe_dot = ["."] if list(dots) else []
        self._paths = {
            "patterns": [relpath(p, start=pwd) if isabs(p) else p
                         for p in patterns]}
def __call__(url, dataset=None, recursive=False):
    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)

    # if we have no dataset given, figure out which one we need to operate
    # on, based on the current working directory of the process:
    if ds is None:
        # try to find a dataset at or above PWD:
        dspath = GitRepo.get_toppath(getpwd())
        if dspath is None:
            raise ValueError("No dataset found at %s." % getpwd())
        ds = Dataset(dspath)
    assert(ds is not None)

    if not ds.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(ds.path))
    assert(ds.repo is not None)

    repos_to_update = [ds.repo]
    if recursive:
        repos_to_update += [GitRepo(opj(ds.path, sub_path))
                            for sub_path in
                            ds.get_dataset_handles(recursive=True)]
    for handle_repo in repos_to_update:
        parser = get_module_parser(handle_repo)
        for submodule_section in parser.sections():
            submodule_name = submodule_section[11:-1]
            parser.set_value(submodule_section,
                             "url",
                             url.replace("%NAME",
                                         submodule_name.replace("/", "-")))

    return  # TODO: return value?
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info
    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([_t2s(pl.dist()),
                                  _t2s(pl.mac_ver()),
                                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
    }
def setup_cache(self):
    ds_path = create_test_dataset(self.dsname, spec='2/-2/-2', seed=0)[0]
    self.log("Setup cache ds path %s. CWD: %s", ds_path, getpwd())
    # Will store into a tarfile since otherwise install -r is way too slow
    # to be invoked for every benchmark
    # Store full path since apparently setup is not ran in that directory
    self.tarfile = op.realpath(SampleSuperDatasetBenchmarks.tarfile)
    with tarfile.open(self.tarfile, "w") as tar:
        # F.CK -- Python tarfile can't later extract those because key dirs are
        # read-only. For now just a workaround - make it all writeable
        from datalad.utils import rotree
        rotree(self.dsname, ro=False, chmod_files=False)
        tar.add(self.dsname, recursive=True)
    rmtree(self.dsname)
def __init__(self, patterns, pwd=None, expand=False):
    self.pwd = pwd or getpwd()
    self._expand = expand
    if patterns is None:
        self._maybe_dot = []
        self._paths = {"patterns": [], "sub_patterns": {}}
    else:
        patterns = list(map(assure_unicode, patterns))
        patterns, dots = partition(patterns, lambda i: i.strip() == ".")
        self._maybe_dot = ["."] if list(dots) else []
        self._paths = {
            "patterns": [op.relpath(p, start=pwd) if op.isabs(p) else p
                         for p in patterns],
            "sub_patterns": {}}
def _flyweight_id_from_args(cls, *args, **kwargs):
    if args:
        # to a certain degree we need to simulate an actual call to __init__
        # and make sure, passed arguments are fitting:
        # TODO: Figure out, whether there is a cleaner way to do this in a
        # generic fashion
        assert('path' not in kwargs)
        path = args[0]
        args = args[1:]
    elif 'path' in kwargs:
        path = kwargs.pop('path')
    else:
        raise TypeError("__init__() requires argument `path`")

    if path is None:
        raise AttributeError

    # mirror what is happening in __init__
    if isinstance(path, ut.PurePath):
        path = text_type(path)

    # Custom handling for few special abbreviations
    path_ = path
    if path == '^':
        # get the topmost dataset from current location. Note that 'zsh'
        # might have its ideas on what to do with ^, so better use as -d^
        path_ = Dataset(curdir).get_superdataset(topmost=True).path
    elif path == '///':
        # TODO: logic/UI on installing a default dataset could move here
        # from search?
        path_ = cfg.obtain('datalad.locations.default-dataset')
    if path != path_:
        lgr.debug("Resolved dataset alias %r to path %r", path, path_)

    # Sanity check for argument `path`:
    # raise if we cannot deal with `path` at all or
    # if it is not a local thing:
    path_ = RI(path_).localpath

    # we want an absolute path, but no resolved symlinks
    if not isabs(path_):
        path_ = opj(getpwd(), path_)
    # use canonical paths only:
    path_ = normpath(path_)
    kwargs['path'] = path_
    return path_, args, kwargs
def setup(self):
    self.log("Setup ran in %s, existing paths: %s", getpwd(), glob('*'))

    tempdir = tempfile.mkdtemp(**get_tempfile_kwargs({}, prefix="bm"))
    self.remove_paths.append(tempdir)
    with tarfile.open(self.tarfile) as tar:
        tar.extractall(tempdir)

    # TODO -- remove this abomination after https://github.com/datalad/datalad/issues/1512 is fixed
    epath = op.join(tempdir, 'testds1')
    epath_unique = epath + str(self.__class__.ds_count)
    os.rename(epath, epath_unique)
    self.__class__.ds_count += 1
    self.ds = Dataset(epath_unique)
    self.repo = self.ds.repo
    self.log("Finished setup for %s", tempdir)
def __init__(self, patterns, pwd=None, expand=False):
    self.pwd = pwd or getpwd()
    self._expand = expand
    if patterns is None:
        self._maybe_dot = []
        self._patterns = []
    else:
        patterns = list(map(ensure_unicode, patterns))
        patterns, dots = partition(patterns, lambda i: i.strip() == ".")
        self._maybe_dot = ["."] if list(dots) else []
        self._patterns = [
            op.relpath(p, start=pwd) if op.isabs(p) else p
            for p in patterns
        ]
    self._cache = {}
def setup(self):
    self.log("Setup ran in %s, existing paths: %s", getpwd(), glob('*'))

    tempdir = tempfile.mkdtemp(
        **get_tempfile_kwargs({}, prefix="bm")
    )
    self.remove_paths.append(tempdir)
    with tarfile.open(self.tarfile) as tar:
        tar.extractall(tempdir)

    # TODO -- remove this abomination after https://github.com/datalad/datalad/issues/1512 is fixed
    epath = op.join(tempdir, 'testds1')
    epath_unique = epath + str(self.__class__.ds_count)
    os.rename(epath, epath_unique)
    self.__class__.ds_count += 1
    self.ds = Dataset(epath_unique)
    self.repo = self.ds.repo
    self.log("Finished setup for %s", tempdir)
def setup_cache(self):
    ds_path = create_test_dataset(
        self.dsname
        , spec='2/-2/-2'
        , seed=0
    )[0]
    self.log("Setup cache ds path %s. CWD: %s", ds_path, getpwd())
    # Will store into a tarfile since otherwise install -r is way too slow
    # to be invoked for every benchmark
    # Store full path since apparently setup is not ran in that directory
    self.tarfile = op.realpath(SampleSuperDatasetBenchmarks.tarfile)
    with tarfile.open(self.tarfile, "w") as tar:
        # F.CK -- Python tarfile can't later extract those because key dirs are
        # read-only. For now just a workaround - make it all writeable
        from datalad.utils import rotree
        rotree(self.dsname, ro=False, chmod_files=False)
        tar.add(self.dsname, recursive=True)
    rmtree(self.dsname)
def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument, or
    based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is
      properly installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      Or raises an exception (InsufficientArgumentsError).
    """
    if dataset is not None and not isinstance(dataset, Dataset):
        dataset = Dataset(dataset)

    if dataset is None:  # possible scenario of cmdline calls
        dspath = get_dataset_root(getpwd())
        if not dspath:
            raise NoDatasetArgumentFound("No dataset found")
        dataset = Dataset(dspath)

    assert(dataset is not None)
    lgr.debug(u"Resolved dataset{0}: {1}".format(
        ' for {}'.format(purpose) if purpose else '',
        dataset))

    if check_installed and not dataset.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(dataset.path))

    return dataset
def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument, or
    based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is
      properly installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      Or raises an exception (InsufficientArgumentsError).
    """
    if dataset is not None and not isinstance(dataset, Dataset):
        dataset = Dataset(dataset)

    if dataset is None:  # possible scenario of cmdline calls
        dspath = get_dataset_root(getpwd())
        if not dspath:
            raise NoDatasetArgumentFound("No dataset found")
        dataset = Dataset(dspath)

    assert(dataset is not None)
    lgr.debug("Resolved dataset{0}: {1}".format(
        ' for {}'.format(purpose) if purpose else '',
        dataset))

    if check_installed and not dataset.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(dataset.path))

    return dataset
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info

    if hasattr(pl, 'dist'):
        dist = pl.dist()
    else:
        # Python 3.8 removed .dist but recommended "distro" is slow, so we
        # try it only if needed
        try:
            import distro
            dist = distro.linux_distribution(full_distribution_name=False)
        except ImportError:
            lgr.info(
                "Please install 'distro' package to obtain distribution information"
            )
            dist = tuple()
        except Exception as exc:
            lgr.warning(
                "No distribution information will be provided since 'distro' "
                "fails to import/run: %s", exc_str(exc))
            dist = tuple()

    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([_t2s(dist),
                                  _t2s(pl.mac_ver()),
                                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
    }
def test_resolve_path(somedir):
    abs_path = abspath(somedir)  # just to be sure
    rel_path = "some"
    expl_path_cur = opj(os.curdir, rel_path)
    expl_path_par = opj(os.pardir, rel_path)

    eq_(resolve_path(abs_path), abs_path)

    current = getpwd()
    # no Dataset => resolve using cwd:
    eq_(resolve_path(abs_path), abs_path)
    eq_(resolve_path(rel_path), opj(current, rel_path))
    eq_(resolve_path(expl_path_cur), normpath(opj(current, expl_path_cur)))
    eq_(resolve_path(expl_path_par), normpath(opj(current, expl_path_par)))

    # now use a Dataset as reference:
    ds = Dataset(abs_path)
    eq_(resolve_path(abs_path, ds), abs_path)
    eq_(resolve_path(rel_path, ds), opj(abs_path, rel_path))
    eq_(resolve_path(expl_path_cur, ds), normpath(opj(current, expl_path_cur)))
    eq_(resolve_path(expl_path_par, ds), normpath(opj(current, expl_path_par)))
def resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any explicit path (absolute or relative) is returned as an absolute path.
    In case of an explicit relative path, the current working directory is
    used as a reference. Any non-explicit relative path is resolved against
    a dataset location, i.e. considered relative to the location of the
    dataset. If no dataset is provided, the current working directory is
    used.

    Returns
    -------
    Absolute path
    """
    path = expandpath(path, force_absolute=False)
    # TODO: normpath?!
    if is_explicit_path(path):
        return abspath(path)
    # no dataset given, use CWD as reference
    # note: abspath would disregard symlink in CWD
    top_path = getpwd() \
        if ds is None else ds.path if isinstance(ds, Dataset) else ds
    return normpath(opj(top_path, path))
def __call__(
        path=None,
        initopts=None,
        force=False,
        description=None,
        dataset=None,
        no_annex=False,
        fake_dates=False,
        cfg_proc=None
):
    refds_path = dataset.path if hasattr(dataset, 'path') else dataset

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    # sanity check first
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")

    if path:
        path = rev_resolve_path(path, dataset)

    path = path if path \
        else getpwd() if dataset is None \
        else refds_path

    # we know that we need to create a dataset at `path`
    assert(path is not None)

    # prep for yield
    res = dict(action='create', path=text_type(path),
               logger=lgr, type='dataset',
               refds=refds_path)

    refds = None
    if refds_path and refds_path != path:
        refds = require_dataset(
            refds_path, check_installed=True,
            purpose='creating a subdataset')

        path_inrefds = path_under_rev_dataset(refds, path)
        if path_inrefds is None:
            yield dict(
                res,
                status='error',
                message=(
                    "dataset containing given paths is not underneath "
                    "the reference dataset %s: %s",
                    dataset, text_type(path)),
            )
            return

    # try to locate an immediate parent dataset
    # we want to know this (irrespective of whether we plan on adding
    # this new dataset to a parent) in order to avoid conflicts with
    # a potentially absent/uninstalled subdataset of the parent
    # in this location
    # it will cost some filesystem traversal though...
    parentds_path = rev_get_dataset_root(
        op.normpath(op.join(text_type(path), os.pardir)))
    if parentds_path:
        prepo = GitRepo(parentds_path)
        parentds_path = ut.Path(parentds_path)
        # we cannot get away with a simple
        # GitRepo.get_content_info(), as we need to detect
        # uninstalled/added subdatasets too
        check_path = ut.Path(path)
        pstatus = prepo.status(
            untracked='no',
            # limit query to target path for a potentially massive speed-up
            paths=[check_path.relative_to(parentds_path)])
        if any(
                check_path == p or check_path in p.parents
                for p in pstatus):
            # redo the check in a slower fashion, it is already broken
            # let's take our time for a proper error message
            conflict = [
                p for p in pstatus
                if check_path == p or check_path in p.parents]
            res.update({
                'status': 'error',
                'message': (
                    'collision with content in parent dataset at %s: %s',
                    text_type(parentds_path),
                    [text_type(c) for c in conflict])})
            yield res
            return
        # another set of check to see whether the target path is pointing
        # into a known subdataset that is not around ATM
        subds_status = {
            parentds_path / k.relative_to(prepo.path)
            for k, v in iteritems(pstatus)
            if v.get('type', None) == 'dataset'}
        check_paths = [check_path]
        check_paths.extend(check_path.parents)
        if any(p in subds_status for p in check_paths):
            conflict = [p for p in check_paths if p in subds_status]
            res.update({
                'status': 'error',
                'message': (
                    'collision with %s (dataset) in dataset %s',
                    text_type(conflict[0]),
                    text_type(parentds_path))})
            yield res
            return

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = dataset if isinstance(dataset, Dataset) and \
        dataset.path == path else Dataset(text_type(path))

    # don't create in non-empty directory without `force`:
    if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        res.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'})
        yield res
        return

    # stuff that we create and want to have tracked with git (not annex)
    add_to_git = {}

    if initopts is not None and isinstance(initopts, list):
        initopts = {'_from_cmdline_': initopts}

    # create and configure desired repository
    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        tbrepo = GitRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            git_opts=initopts,
            fake_dates=fake_dates)
        # place a .noannex file to indicate annex to leave this repo alone
        stamp_path = ut.Path(tbrepo.path) / '.noannex'
        stamp_path.touch()
        add_to_git[stamp_path] = {
            'type': 'file',
            'state': 'untracked'}
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        tbrepo = AnnexRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            # do not set backend here, to avoid a dedicated commit
            backend=None,
            # None causes version to be taken from config
            version=None,
            description=description,
            git_opts=initopts,
            fake_dates=fake_dates
        )
        # set the annex backend in .gitattributes as a staged change
        tbrepo.set_default_backend(
            cfg.obtain('datalad.repo.backend'),
            persistent=True, commit=False)
        add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
            'type': 'file',
            'state': 'added'}
        # make sure that v6 annex repos never commit content under .datalad
        attrs_cfg = (
            ('config', 'annex.largefiles', 'nothing'),
            ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
            ('metadata/objects/**', 'annex.largefiles',
             '({})'.format(cfg.obtain(
                 'datalad.metadata.create-aggregate-annex-limit'))))
        attrs = tbds.repo.get_gitattributes(
            [op.join('.datalad', i[0]) for i in attrs_cfg])
        set_attrs = []
        for p, k, v in attrs_cfg:
            if not attrs.get(
                    op.join('.datalad', p), {}).get(k, None) == v:
                set_attrs.append((p, {k: v}))
        if set_attrs:
            tbds.repo.set_gitattributes(
                set_attrs,
                attrfile=op.join('.datalad', '.gitattributes'))

    # prevent git annex from ever annexing .git* stuff (gh-1597)
    attrs = tbds.repo.get_gitattributes('.git')
    if not attrs.get('.git', {}).get(
            'annex.largefiles', None) == 'nothing':
        tbds.repo.set_gitattributes([
            ('**/.git*', {'annex.largefiles': 'nothing'})])
        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
            'type': 'file',
            'state': 'untracked'}

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    # Note, that Dataset property `id` will change when we unset the
    # respective config. Therefore store it before:
    tbds_id = tbds.id
    if id_var in tbds.config:
        # make sure we reset this variable completely, in case of a
        # re-create
        tbds.config.unset(id_var, where='dataset')

    if _seed is None:
        # just the standard way
        uuid_id = uuid.uuid1().urn.split(':')[-1]
    else:
        # Let's generate preseeded ones
        uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
    tbds.config.add(
        id_var,
        tbds_id if tbds_id is not None else uuid_id,
        where='dataset',
        reload=False)

    # make config overrides permanent in the repo config
    # this is similar to what `annex init` does
    # we are only doing this for config overrides and do not expose
    # a dedicated argument, because it is sufficient for the cmdline
    # and unnecessary for the Python API (there could simply be a
    # subsequent ds.config.add() call)
    for k, v in iteritems(tbds.config.overrides):
        tbds.config.add(k, v, where='local', reload=False)
    # all config manipulation is done -> full reload
    tbds.config.reload()

    # must use the repo.pathobj as this will have resolved symlinks
    add_to_git[tbds.repo.pathobj / '.datalad'] = {
        'type': 'directory',
        'state': 'untracked'}

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbds.repo.save(
        message='[DATALAD] new dataset',
        git=True,
        # we have to supply our own custom status, as the repo does
        # not have a single commit yet and there is no HEAD reference
        # TODO make `GitRepo.status()` robust to this state.
        _status=add_to_git,
    )

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if isinstance(dataset, Dataset) and dataset.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.save(
                path=tbds.path,
        ):
            yield r

    res.update({'status': 'ok'})
    yield res

    for cfg_proc_ in cfg_proc or []:
        for r in tbds.run_procedure('cfg_' + cfg_proc_):
            yield r
def rev_resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any path is returned as an absolute path. If, and only if, a dataset
    object instance is given as `ds`, relative paths are interpreted as
    relative to the given dataset. In all other cases, relative paths are
    treated as relative to the current working directory.

    Note however, that this function is not able to resolve arbitrarily
    obfuscated path specifications. All operations are purely lexical, and no
    actual path resolution against the filesystem content is performed.
    Consequently, common relative path arguments like '../something' (relative
    to PWD) can be handled properly, but things like 'down/../under' cannot,
    as resolving this path properly depends on the actual target of any
    (potential) symlink leading up to '..'.

    Parameters
    ----------
    path : str or PathLike or list
      Platform-specific path specification. Multiple path specifications can
      be given as a list
    ds : Dataset or None
      Dataset instance to resolve relative paths against.

    Returns
    -------
    `pathlib.Path` object or list(Path)
      When a list was given as input a list is returned, a Path instance
      otherwise.
    """
    got_ds_instance = isinstance(ds, Dataset)
    if ds is not None and not got_ds_instance:
        ds = require_dataset(
            ds, check_installed=False, purpose='path resolution')
    out = []
    for p in assure_list(path):
        if ds is None or not got_ds_instance:
            # no dataset at all or no instance provided -> CWD is always the reference
            # nothing needs to be done here. Path-conversion and absolutification
            # are done next
            pass
        # we have a given datasets instance
        elif not Path(p).is_absolute():
            # we have a dataset and no abspath nor an explicit relative path ->
            # resolve it against the dataset
            p = ds.pathobj / p

        p = ut.Path(p)

        # make sure we return an absolute path, but without actually
        # resolving anything
        if not p.is_absolute():
            # in general it is almost impossible to use resolve() when
            # we can have symlinks in the root path of a dataset
            # (that we don't want to resolve here), symlinks to annex'ed
            # files (that we never want to resolve), and other within-repo
            # symlinks that we (sometimes) want to resolve (i.e. symlinked
            # paths for addressing content vs adding content)
            # CONCEPT: do the minimal thing to catch most real-world inputs
            # ASSUMPTION: the only sane relative path input that needs
            # handling and can be handled are upward references like
            # '../../some/that', whereas stuff like 'down/../someotherdown'
            # are intellectual exercises
            # ALGORITHM: match any number of leading '..' path components
            # and shorten the PWD by that number
            # NOT using ut.Path.cwd(), because it has symlinks resolved!!
            pwd_parts = ut.Path(getpwd()).parts
            path_parts = p.parts
            leading_parents = 0
            for pp in p.parts:
                if pp == op.pardir:
                    leading_parents += 1
                    path_parts = path_parts[1:]
                elif pp == op.curdir:
                    # we want to discard that, but without stripping
                    # a corresponding parent
                    path_parts = path_parts[1:]
                else:
                    break
            p = ut.Path(
                op.join(
                    *(pwd_parts[:-leading_parents if leading_parents else None]
                      + path_parts)))
        # note that we will not "normpath()" the result, check the
        # pathlib docs for why this is the only sane choice in the
        # face of the possibility of symlinks in the path
        out.append(p)
    return out[0] if isinstance(path, (string_types, PurePath)) else out
def __call__(name=None, dataset=None,
             merge=False, recursive=False, fetch_all=False,
             reobtain_data=False):
    """
    """
    # TODO: Is there an 'update filehandle' similar to install and publish?
    # What does it mean?

    if reobtain_data:
        # TODO: properly define, what to do
        raise NotImplementedError("TODO: Option '--reobtain-data' not "
                                  "implemented yet.")

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)

    # if we have no dataset given, figure out which one we need to operate
    # on, based on the current working directory of the process:
    if ds is None:
        # try to find a dataset at or above PWD:
        dspath = GitRepo.get_toppath(getpwd())
        if dspath is None:
            raise ValueError("No dataset found at %s." % getpwd())
        ds = Dataset(dspath)
    assert(ds is not None)

    if not ds.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(ds.path))
    assert(ds.repo is not None)

    repos_to_update = [ds.repo]
    if recursive:
        repos_to_update += [GitRepo(opj(ds.path, sub_path))
                            for sub_path in
                            ds.get_dataset_handles(recursive=True)]

    for repo in repos_to_update:
        # get all remotes:
        remotes = repo.git_get_remotes()
        if name and name not in remotes:
            lgr.warning("'%s' not known to dataset %s.\nSkipping" %
                        (name, repo.path))
            continue

        # Currently '--merge' works for single remote only:
        # TODO: - condition still incomplete
        #       - We can merge if a remote was given or there is a
        #         tracking branch
        #       - we also can fetch all remotes independently on whether or
        #         not we merge a certain remote
        if not name and len(remotes) > 1 and merge:
            lgr.debug("Found multiple remotes:\n%s" % remotes)
            raise NotImplementedError("No merge strategy for multiple "
                                      "remotes implemented yet.")
        lgr.info("Updating handle '%s' ..." % repo.path)

        # fetch remote(s):
        repo.git_fetch(name if name else '',
                       "--all" if fetch_all else '')

        # if it is an annex and there is a tracking branch, and we didn't
        # fetch the entire remote anyway, explicitly fetch git-annex
        # branch:
        # TODO: Is this logic correct? Shouldn't we fetch git-annex from
        # `name` if there is any (or if there is no tracking branch but we
        # have a `name`?
        if knows_annex(repo.path) and not fetch_all:
            # check for tracking branch's remote:
            try:
                std_out, std_err = \
                    repo._git_custom_command(
                        '',
                        ["git", "config", "--get",
                         "branch.{active_branch}.remote".format(
                             active_branch=repo.git_get_active_branch())])
            except CommandError as e:
                if e.code == 1 and e.stdout == "":
                    std_out = None
                else:
                    raise
            if std_out:  # we have a "tracking remote"
                repo.git_fetch("%s git-annex" % std_out.strip())

        # merge:
        if merge:
            lgr.info("Applying changes from tracking branch...")
            cmd_list = ["git", "pull"]
            if name:
                cmd_list.append(name)
                # branch needed, if not default remote
                # => TODO: use default remote/tracking branch to compare
                #          (see above, where git-annex is fetched)
                # => TODO: allow for passing a branch
                #          (or more general refspec?)
                # For now, just use the same name
                cmd_list.append(repo.git_get_active_branch())

            out, err = repo._git_custom_command('', cmd_list)
            lgr.info(out)
            if knows_annex(repo.path):
                # annex-apply:
                lgr.info("Updating annex ...")
                out, err = repo._git_custom_command(
                    '', ["git", "annex", "merge"])
                lgr.info(out)
def __call__(sshurl, target=None, target_dir=None,
             target_url=None, target_pushurl=None,
             dataset=None, recursive=False,
             existing='raise', shared=False):

    if sshurl is None:
        raise ValueError("""insufficient information for target creation
        (needs at least a dataset and a SSH URL).""")

    if target is None and (target_url is not None
                           or target_pushurl is not None):
        raise ValueError("""insufficient information for adding the target
        as a sibling (needs at least a name)""")

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)
    if ds is None:
        # try to find a dataset at or above CWD
        dspath = GitRepo.get_toppath(abspath(getpwd()))
        if dspath is None:
            raise ValueError(
                """No dataset found at or above {0}.""".format(getpwd()))
        ds = Dataset(dspath)
        lgr.debug("Resolved dataset for target creation: {0}".format(ds))
    assert(ds is not None and sshurl is not None)

    if not ds.is_installed():
        raise ValueError("""Dataset {0} is not installed yet.""".format(ds))
    assert(ds.repo is not None)

    # determine target parameters:
    parsed_target = urlparse(sshurl)
    host_name = parsed_target.netloc

    # TODO: Sufficient to fail on this condition?
    if not parsed_target.netloc:
        raise ValueError("Malformed URL: {0}".format(sshurl))

    if target_dir is None:
        if parsed_target.path:
            target_dir = parsed_target.path
        else:
            target_dir = '.'

    # TODO: centralize and generalize template symbol handling
    replicate_local_structure = False
    if "%NAME" not in target_dir:
        replicate_local_structure = True

    # collect datasets to use:
    datasets = dict()
    datasets[basename(ds.path)] = ds
    if recursive:
        for subds in ds.get_dataset_handles(recursive=True):
            sub_path = opj(ds.path, subds)
            # TODO: when enhancing Dataset/*Repo classes and therefore
            # adapt to moved code, make proper distinction between name and
            # path of a submodule, which are technically different. This
            # probably will become important on windows as well as whenever
            # we want to allow for moved worktrees.
            datasets[basename(ds.path) + '/' + subds] = \
                Dataset(sub_path)

    # setup SSH Connection:
    # TODO: Make the entire setup a helper to use it when pushing via
    # publish?

    # - build control master:
    from datalad.utils import assure_dir
    not_supported_on_windows("TODO")
    from os import geteuid  # Linux specific import
    var_run_user_datalad = "/var/run/user/%s/datalad" % geteuid()
    assure_dir(var_run_user_datalad)
    control_path = "%s/%s" % (var_run_user_datalad, host_name)
    control_path += ":%s" % parsed_target.port if parsed_target.port else ""

    # - start control master:
    cmd = "ssh -o ControlMaster=yes -o \"ControlPath=%s\" " \
          "-o ControlPersist=yes %s exit" % (control_path, host_name)
    lgr.debug("Try starting control master by calling:\n%s" % cmd)
    import subprocess
    proc = subprocess.Popen(cmd, shell=True)
    proc.communicate(input="\n")  # why the f.. this is necessary?

    runner = Runner()
    ssh_cmd = ["ssh", "-S", control_path, host_name]

    lgr.info("Creating target datasets ...")
    for current_dataset in datasets:
        if not replicate_local_structure:
            path = target_dir.replace("%NAME",
                                      current_dataset.replace("/", "-"))
        else:
            # TODO: opj depends on local platform, not the remote one.
            # check how to deal with it. Does windows ssh server accept
            # posix paths? vice versa? Should planned SSH class provide
            # tools for this issue?
            path = normpath(opj(target_dir,
                                relpath(datasets[current_dataset].path,
                                        start=ds.path)))

        if path != '.':
            # check if target exists
            # TODO: Is this condition valid for != '.' only?
            path_exists = True
            cmd = ssh_cmd + ["ls", path]
            try:
                out, err = runner.run(cmd, expect_fail=True,
                                      expect_stderr=True)
            except CommandError as e:
                if "No such file or directory" in e.stderr and \
                        path in e.stderr:
                    path_exists = False
                else:
                    raise  # It's an unexpected failure here

            if path_exists:
                if existing == 'raise':
                    raise RuntimeError(
                        "Target directory %s already exists." % path)
                elif existing == 'skip':
                    continue
                elif existing == 'replace':
                    pass
                else:
                    raise ValueError("Do not know how to hand existing=%s" %
                                     repr(existing))

            cmd = ssh_cmd + ["mkdir", "-p", path]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Remotely creating target directory failed at "
                          "%s.\nError: %s" % (path, str(e)))
                continue

        # init git repo
        cmd = ssh_cmd + ["git", "-C", path, "init"]
        if shared:
            cmd.append("--shared=%s" % shared)
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.error("Remotely initializing git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, str(e)))
            continue

        # check git version on remote end:
        cmd = ssh_cmd + ["git", "version"]
        try:
            out, err = runner.run(cmd)
            git_version = out.lstrip("git version").strip()
            lgr.debug("Detected git version on server: %s" % git_version)
            if git_version < "2.4":
                lgr.error("Git version >= 2.4 needed to configure remote."
                          " Version detected on server: %s\nSkipping ..."
                          % git_version)
                continue
        except CommandError as e:
            lgr.warning(
                "Failed to determine git version on remote.\n"
                "Error: {0}\nTrying to configure anyway "
                "...".format(e.message))

        # allow for pushing to checked out branch
        cmd = ssh_cmd + ["git", "-C", path, "config",
                         "receive.denyCurrentBranch",
                         "updateInstead"]
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.warning("git config failed at remote location %s.\n"
                        "You will not be able to push to checked out "
                        "branch." % path)

        # enable post-update hook:
        cmd = ssh_cmd + ["mv",
                         opj(path, ".git/hooks/post-update.sample"),
                         opj(path, ".git/hooks/post-update")]
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.error("Failed to enable post update hook.\n"
                      "Error: %s" % e.message)

        # initially update server info "manually":
        cmd = ssh_cmd + ["git", "-C", path, "update-server-info"]
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.error("Failed to update server info.\n"
                      "Error: %s" % e.message)

    # stop controlmaster (close ssh connection):
    cmd = ["ssh", "-O", "stop", "-S", control_path, host_name]
    out, err = runner.run(cmd, expect_stderr=True)

    if target:
        # add the sibling(s):
        if target_url is None:
            target_url = sshurl
        if target_pushurl is None:
            target_pushurl = sshurl
        result_adding = AddSibling()(dataset=ds,
                                     name=target,
                                     url=target_url,
                                     pushurl=target_pushurl,
                                     recursive=recursive,
                                     force=existing in {'replace'})
def __call__(dataset=None, name=None, url=None,
             pushurl=None, recursive=False, force=False):

    # TODO: Detect malformed URL and fail?

    if name is None or (url is None and pushurl is None):
        raise ValueError("""insufficient information to add a sibling
            (needs at least a dataset, a name and an URL).""")
    if url is None:
        url = pushurl

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)
    if ds is None:
        # try to find a dataset at or above CWD
        dspath = GitRepo.get_toppath(abspath(getpwd()))
        if dspath is None:
            raise ValueError(
                "No dataset found at or above {0}.".format(getpwd()))
        ds = Dataset(dspath)
        lgr.debug("Resolved dataset for target creation: {0}".format(ds))

    assert(ds is not None and name is not None and url is not None)

    if not ds.is_installed():
        raise ValueError("Dataset {0} is not installed yet.".format(ds))
    assert(ds.repo is not None)

    ds_basename = basename(ds.path)
    repos = {
        ds_basename: {'repo': ds.repo}
    }
    if recursive:
        for subds in ds.get_dataset_handles(recursive=True):
            sub_path = opj(ds.path, subds)
            repos[ds_basename + '/' + subds] = {
                # repos[subds] = {
                'repo': GitRepo(sub_path, create=False)
            }

    # Note: This is copied from create_publication_target_sshwebserver
    # as it is the same logic as for its target_dir.
    # TODO: centralize and generalize template symbol handling
    # TODO: Check pushurl for template symbols too. Probably raise if only
    #       one of them uses such symbols

    replicate_local_structure = False
    if "%NAME" not in url:
        replicate_local_structure = True

    for repo in repos:
        if not replicate_local_structure:
            repos[repo]['url'] = url.replace("%NAME",
                                             repo.replace("/", "-"))
            if pushurl:
                repos[repo]['pushurl'] = pushurl.replace("%NAME",
                                                         repo.replace("/", "-"))
        else:
            repos[repo]['url'] = url
            if pushurl:
                repos[repo]['pushurl'] = pushurl

            if repo != ds_basename:
                repos[repo]['url'] = _urljoin(repos[repo]['url'],
                                              repo[len(ds_basename) + 1:])
                if pushurl:
                    repos[repo]['pushurl'] = _urljoin(
                        repos[repo]['pushurl'],
                        repo[len(ds_basename) + 1:])

    # collect existing remotes:
    already_existing = list()
    conflicting = list()
    for repo in repos:
        if name in repos[repo]['repo'].git_get_remotes():
            already_existing.append(repo)
            lgr.debug("""Remote '{0}' already exists
                      in '{1}'.""".format(name, repo))

            existing_url = repos[repo]['repo'].git_get_remote_url(name)
            existing_pushurl = \
                repos[repo]['repo'].git_get_remote_url(name, push=True)

            if repos[repo]['url'].rstrip('/') != existing_url.rstrip('/') \
                    or (pushurl and existing_pushurl and
                        repos[repo]['pushurl'].rstrip('/') !=
                        existing_pushurl.rstrip('/')) \
                    or (pushurl and not existing_pushurl):
                conflicting.append(repo)

    if not force and conflicting:
        raise RuntimeError("Sibling '{0}' already exists with conflicting"
                           " URL for {1} dataset(s). {2}".format(
                               name, len(conflicting), conflicting))

    runner = Runner()
    successfully_added = list()
    for repo in repos:
        if repo in already_existing:
            if repo not in conflicting:
                lgr.debug("Skipping {0}. Nothing to do.".format(repo))
                continue
            # rewrite url
            cmd = ["git", "remote", "set-url", name, repos[repo]['url']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        else:
            # add the remote
            cmd = ["git", "remote", "add", name, repos[repo]['url']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        if pushurl:
            cmd = ["git", "remote", "set-url", "--push", name,
                   repos[repo]['pushurl']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        successfully_added.append(repo)

    return successfully_added
def __call__(
        path=None,
        force=False,
        description=None,
        dataset=None,
        no_annex=False,
        save=True,
        annex_version=None,
        annex_backend='MD5E',
        native_metadata_type=None,
        shared_access=None,
        git_opts=None,
        annex_opts=None,
        annex_init_opts=None):

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    # sanity check first
    if git_opts:
        lgr.warning(
            "`git_opts` argument is presently ignored, please complain!")
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")
        if annex_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex and declaring no "
                             "annex repo.")
        if annex_init_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex init and declaring no "
                             "annex repo.")

    if not isinstance(force, bool):
        raise ValueError("force should be bool, got %r. "
                         "Did you mean to provide a 'path'?" % force)

    # straight from input arg, no messing around before this
    if path is None:
        if dataset is None:
            # nothing given explicity, assume create fresh right here
            path = getpwd()
        else:
            # no path, but dataset -> create that dataset
            path = dataset.path
    else:
        # resolve the path against a potential dataset
        path = resolve_path(path, ds=dataset)

    # we know that we need to create a dataset at `path`
    assert(path is not None)

    if git_opts is None:
        git_opts = {}
    if shared_access:
        # configure `git --shared` value
        git_opts['shared'] = shared_access

    # check for sane subdataset path
    real_targetpath = with_pathsep(realpath(path))  # realpath OK
    if dataset is not None:
        # make sure we get to an expected state
        if not real_targetpath.startswith(  # realpath OK
                with_pathsep(realpath(dataset.path))):  # realpath OK
            raise ValueError("path {} outside {}".format(path, dataset))

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = dataset if dataset is not None and dataset.path == path \
        else Dataset(path)

    # don't create in non-empty directory without `force`:
    if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        raise ValueError("Cannot create dataset in directory %s "
                         "(not empty). Use option 'force' in order to "
                         "ignore this and enforce creation." % tbds.path)

    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        GitRepo(
            tbds.path,
            url=None,
            create=True,
            git_opts=git_opts)
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        AnnexRepo(
            tbds.path,
            url=None,
            create=True,
            backend=annex_backend,
            version=annex_version,
            description=description,
            git_opts=git_opts,
            annex_opts=annex_opts,
            annex_init_opts=annex_init_opts)

    if native_metadata_type is not None:
        if not isinstance(native_metadata_type, list):
            native_metadata_type = [native_metadata_type]
        for nt in native_metadata_type:
            tbds.config.add('datalad.metadata.nativetype', nt)

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    if id_var in tbds.config:
        # make sure we reset this variable completely, in case of a re-create
        tbds.config.unset(id_var, where='dataset')
    tbds.config.add(
        id_var,
        tbds.id if tbds.id is not None else uuid.uuid1().urn.split(':')[-1],
        where='dataset')

    # make sure that v6 annex repos never commit content under .datalad
    with open(opj(tbds.path, '.datalad', '.gitattributes'), 'a') as gitattr:
        # TODO this will need adjusting, when annex'ed aggregate meta data
        # comes around
        gitattr.write('** annex.largefiles=nothing\n')

    # save everything
    tbds.add('.datalad', to_git=True, save=False)

    if save:
        save_dataset(
            tbds,
            paths=['.datalad'],
            message='[DATALAD] new dataset')

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if dataset is not None and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            dataset.add(tbds.path, save=save, ds2super=True)

    return tbds
# If there is a bundled git, make sure GitPython uses it too:
from datalad.cmd import GitRunner
GitRunner._check_git_path()
if GitRunner._GIT_PATH:
    import os
    os.environ['GIT_PYTHON_GIT_EXECUTABLE'] = \
        os.path.join(GitRunner._GIT_PATH, 'git')

from .config import ConfigManager
cfg = ConfigManager()

from .log import lgr
from datalad.utils import get_encoding_info, get_envvars_info, getpwd

# To analyze/initiate our decision making on what current directory to return
getpwd()

lgr.log(5, "Instantiating ssh manager")
from .support.sshconnector import SSHManager
ssh_manager = SSHManager()
atexit.register(ssh_manager.close, allow_fail=False)
atexit.register(lgr.log, 5, "Exiting")

from .version import __version__


def test(module='datalad', verbose=False, nocapture=False, pdb=False, stop=False):
    """A helper to run datalad's tests.  Requires nose
    """
    argv = []  # module]
    # could make it 'smarter' but decided to be explicit so later we could