Example #1
def test_crawl_api_recursive(get_subdatasets_, run_pipeline_,
                             load_pipeline_from_config_,
                             get_repo_pipeline_script_path_, get_lofilename_,
                             chpwd_, tdir):
    pwd = getpwd()
    with chpwd(tdir):
        output, stats = crawl(recursive=True)
    assert_equal(pwd, getpwd())
    if external_versions['mock'] < '1.0.1':
        raise SkipTest(
            "needs a more recent mock which throws exceptions in side_effects")
    assert_equal(output,
                 [[]] * 4 + [None])  # for now output is just a list of outputs
    assert_equal(
        stats, ActivityStats(
            datasets_crawled=5,
            datasets_crawl_failed=1))  # nothing was done but we got it crawled
    chpwd_.assert_has_calls([
        call(None),
        call('path1'),
        call('path1/path1_1'),
        call('path2'),
    ],
                            any_order=True)
    assert_equal(
        list(find_files('.*', tdir, exclude_vcs=False)),
        [_path_(tdir, 'some.log')])  # no files were generated besides the log
Example #2
def test_getpwd_basic():
    pwd = getpwd()
    ok_(isabs(pwd))
    eq_(os.getcwd(), abspath(pwd))

    # that we do not chdir anywhere if None provided
    with patch('os.chdir') as oschdir:
        with chpwd(None):
            eq_(getpwd(), pwd)
        assert_false(oschdir.called)
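A minimal sketch of the distinction this test relies on: getpwd() reports the logical working directory (symlinks preserved), while os.getcwd() reports the physical one. The import of getpwd from datalad.utils is shown elsewhere in these examples; chpwd living in the same module is an assumption here, and '/tmp' is only an illustrative target.

import os
from datalad.utils import getpwd, chpwd  # chpwd's module is assumed

# On macOS '/tmp' is typically a symlink to '/private/tmp', which makes
# the logical/physical difference visible.
with chpwd('/tmp'):
    print(getpwd())     # logical path, e.g. '/tmp'
    print(os.getcwd())  # physical path, e.g. '/private/tmp'
# the previous working directory is restored on exit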
Example #3
def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument,
    or based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is
      properly installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      If a dataset could be determined.

    Raises
    ------
    NoDatasetFound
      If no dataset could be determined.
    """
    if dataset is not None and not isinstance(dataset, Dataset):
        dataset = Dataset(dataset)

    if dataset is None:  # possible scenario of cmdline calls
        dspath = get_dataset_root(getpwd())
        if not dspath:
            raise NoDatasetFound(
                "No dataset found at '{}'{}.  Specify a dataset to work with "
                "by providing its path via the `dataset` option, "
                "or change the current working directory to be in a "
                "dataset.".format(
                    getpwd(), " for the purpose {!r}".format(purpose)
                    if purpose else ''))
        dataset = Dataset(dspath)

    assert (dataset is not None)
    lgr.debug(u"Resolved dataset%s: %s",
              u' to {}'.format(purpose) if purpose else '', dataset.path)

    if check_installed and not dataset.is_installed():
        raise NoDatasetFound(f"No installed dataset found at {dataset.path}")

    return dataset
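A short usage sketch, assuming require_dataset is importable from datalad.distribution.dataset (the module path is not shown in the snippet) and that the process runs inside an installed dataset:

from datalad.distribution.dataset import require_dataset  # assumed import path

# Resolve the dataset enclosing the current working directory, or raise
# NoDatasetFound with a message that mentions the stated purpose.
ds = require_dataset(None, check_installed=True, purpose="status reporting")
print(ds.path)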
Example #4
def test_getpwd_symlink(tdir):
    sdir = opj(tdir, 's1')
    pwd_orig = getpwd()
    Path(sdir).symlink_to(Path('.'))
    s1dir = opj(sdir, 's1')
    s2dir = opj(sdir, 's2')
    try:
        chpwd(sdir)
        pwd = getpwd()
        eq_(pwd, sdir)
        chpwd('s1')
        eq_(getpwd(), s1dir)
        chpwd('.')
        eq_(getpwd(), s1dir)
        chpwd('..')
        eq_(getpwd(), sdir)
    finally:
        chpwd(pwd_orig)

    # test context handler way of use
    with chpwd(s1dir):
        eq_(getpwd(), s1dir)
    eq_(getpwd(), pwd_orig)

    assert_false(exists(s2dir))
    with assert_raises(OSError):
        with chpwd(s2dir):
            pass
    with chpwd(s2dir, mkdir=True):
        ok_(exists(s2dir))
        eq_(getpwd(), s2dir)
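The context-manager form with mkdir, condensed into a sketch (the directory name is hypothetical):

with chpwd('/tmp/fresh-dir', mkdir=True):   # directory is created if missing
    assert getpwd() == '/tmp/fresh-dir'
# on exit the previous working directory is restored, as the test above verifies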
Example #5
def _test_assert_Xwd_unchanged(func):
    orig_cwd = os.getcwd()
    orig_pwd = getpwd()

    @assert_cwd_unchanged
    def do_chdir():
        func(os.pardir)

    with assert_raises(AssertionError) as cm:
        do_chdir()

    eq_(orig_cwd, os.getcwd(),
        "assert_cwd_unchanged didn't return us back to cwd %s" % orig_cwd)
    eq_(orig_pwd, getpwd(),
        "assert_cwd_unchanged didn't return us back to pwd %s" % orig_pwd)
Example #6
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info

    if hasattr(pl, 'dist'):
        dist = pl.dist()
    else:
        # Python 3.8 removed .dist but recommended "distro" is slow, so we
        # try it only if needed
        try:
            import distro
            dist = distro.linux_distribution(full_distribution_name=False)
        except ImportError:
            lgr.info(
                "Please install 'distro' package to obtain distribution information"
            )
            dist = tuple()
        except Exception as exc:
            lgr.warning(
                "No distribution information will be provided since 'distro' "
                "fails to import/run: %s", exc_str(exc)
            )
            dist = tuple()

    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([_t2s(dist),
                                  _t2s(pl.mac_ver()),
                                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
    }
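Roughly the shape of the returned record on a Linux machine; every value below is hypothetical and will differ per system:

{
    'type': 'posix',
    'name': 'Linux',
    'release': '5.15.0-86-generic',
    'version': '#96-Ubuntu SMP Wed ...',
    'distribution': 'ubuntu 22.04 jammy',
    'max_path_length': 282,
    'encoding': {'default': 'utf-8'},   # keys/values depend on get_encoding_info()
}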
Example #7
def test_install_subds_from_another_remote(topdir):
    # https://github.com/datalad/datalad/issues/1905
    from datalad.support.network import PathRI
    with chpwd(topdir):
        origin_ = 'origin'
        clone1_ = 'clone1'
        clone2_ = 'clone2'

        origin = create(origin_, no_annex=True)
        clone1 = install(source=origin, path=clone1_)
        # print("Initial clone")
        clone1.create_sibling('ssh://localhost%s/%s' %
                              (PathRI(getpwd()).posixpath, clone2_),
                              name=clone2_)

        # print("Creating clone2")
        clone1.publish(to=clone2_)
        clone2 = Dataset(clone2_)
        # print("Initiating subdataset")
        clone2.create('subds1')

        # print("Updating")
        clone1.update(merge=True, sibling=clone2_)
        # print("Installing within updated dataset -- should be able to install from clone2")
        clone1.install('subds1')
Example #8
def path_is_under(values, path=None):
    """Whether a given path is a subdirectory of any of the given test values

    Parameters
    ----------
    values : sequence or dict
      Paths to be tested against. This can be a dictionary in which case
      all values from all keys will be tested against.
    path : path or None
      Test path. If None is given, the process' working directory is
      used.

    Returns
    -------
    bool
    """
    if path is None:
        from datalad.utils import getpwd
        path = getpwd()
    if isinstance(values, dict):
        values = chain(*values.values())
    for p in values:
        rpath = relpath(p, start=path)
        if rpath == curdir \
                or rpath == pardir \
                or set(psplit(rpath)) == {pardir}:
            # first match is enough
            return True
    return False
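The containment test in a nutshell. Paths are illustrative; assume the process working directory is /data/project:

path_is_under(['/data/project'])        # True: relpath is '.'
path_is_under(['/data'])                # True: relpath is '..' (pardir components only)
path_is_under(['/data/project/sub'])    # False: the value lies below, not above, the test path
path_is_under({'k': ['/elsewhere']})    # False: dict values are flattened and tested the same way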
Example #9
def resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any explicit path (absolute or relative) is returned as an absolute path.
    In case of an explicit relative path, the current working directory is
    used as a reference. Any non-explicit relative path is resolved against
    a dataset location, i.e. considered relative to the location of the
    dataset. If no dataset is provided, the current working directory is
    used.

    Returns
    -------
    Absolute path
    """
    # first make sure it's actually a valid path:
    from datalad.support.network import PathRI
    if not isinstance(RI(path), PathRI):
        raise ValueError("%s is not a valid path" % path)

    path = expandpath(path, force_absolute=False)
    if is_explicit_path(path):
        # normalize path consistently between two (explicit and implicit) cases
        return dlabspath(path, norm=True)

    # no dataset given, use CWD as reference
    # note: abspath would disregard symlink in CWD
    top_path = getpwd() \
        if ds is None else ds.path if isinstance(ds, Dataset) else ds
    return normpath(opj(top_path, path))
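Illustrative behaviour of the branches above (paths hypothetical; <CWD> stands for the logical working directory from getpwd()):

resolve_path('/abs/file')                   # '/abs/file'            (explicit, already absolute)
resolve_path('./sub/file', ds='/data/ds')   # '<CWD>/sub/file'       (explicit relative: CWD wins)
resolve_path('sub/file', ds='/data/ds')     # '/data/ds/sub/file'    (implicit: dataset-relative)
resolve_path('sub/file')                    # '<CWD>/sub/file'       (no dataset: CWD again)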
Example #10
def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length
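A quick usage sketch: probe how long a file path can get under the current working directory. The helper writes and removes throw-away files prefixed dl<random> and stops at the first failure; the printed number is hypothetical.

maxlen = get_max_path_length()   # defaults to getpwd() as the probe location
print(maxlen)                    # e.g. 282; None if even the shortest probe fails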
Example #11
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info
    from datalad.utils import get_linux_distribution
    try:
        dist = get_linux_distribution()
    except Exception as exc:
        ce = CapturedException(exc)
        lgr.warning("Failed to get distribution information: %s", ce)
        dist = tuple()

    return {
        'type':
        os.name,
        'name':
        pl.system(),
        'release':
        pl.release(),
        'version':
        pl.version(),
        'distribution':
        ' '.join([_t2s(dist),
                  _t2s(pl.mac_ver()),
                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length':
        get_max_path_length(getpwd()),
        'encoding':
        get_encoding_info(),
        'filesystem': {
            l: _get_fs_type(l, p)
            for l, p in [('CWD',
                          Path.cwd()), ('TMP', Path(tempfile.gettempdir())
                                        ), ('HOME', Path.home())]
        }
    }
Example #12
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info
    from datalad.utils import get_linux_distribution
    try:
        dist = get_linux_distribution()
    except Exception as exc:
        lgr.warning("Failed to get distribution information: %s", exc_str(exc))
        dist = tuple()

    return {
        'type':
        os.name,
        'name':
        pl.system(),
        'release':
        pl.release(),
        'version':
        pl.version(),
        'distribution':
        ' '.join([_t2s(dist),
                  _t2s(pl.mac_ver()),
                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length':
        get_max_path_length(getpwd()),
        'encoding':
        get_encoding_info(),
    }
Example #13
def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import os
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length
Example #14
def get_command_pwds(dataset):
    """Return the current directory for the dataset.

    Parameters
    ----------
    dataset : Dataset

    Returns
    -------
    A tuple, where the first item is the absolute path of the pwd and the
    second is the pwd relative to the dataset's path.
    """
    # Follow path resolution logic described in gh-3435.
    if isinstance(dataset, Dataset):  # Paths relative to dataset.
        pwd = dataset.path
        rel_pwd = op.curdir
    else:  # Paths relative to current directory.
        pwd = getpwd()
        # Pass pwd to get_dataset_root instead of os.path.curdir to handle
        # repos whose leading paths have a symlinked directory (see the
        # TMPDIR="/var/tmp/sym link" test case).
        if not dataset:
            dataset = get_dataset_root(pwd)

        if dataset:
            rel_pwd = relpath(pwd, dataset)
        else:
            rel_pwd = pwd  # and leave handling to caller
    return pwd, rel_pwd
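The two branches in a sketch (dataset path hypothetical):

pwd, rel_pwd = get_command_pwds(Dataset('/data/ds'))
# -> ('/data/ds', '.'): paths are taken relative to the dataset

pwd, rel_pwd = get_command_pwds(None)
# -> (getpwd(), <pwd relative to the enclosing dataset root>), or
#    (getpwd(), getpwd()) if no dataset root is found, leaving it to the caller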
Example #15
def resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any explicit path (absolute or relative) is returned as an absolute path.
    In case of an explicit relative path, the current working directory is
    used as a reference. Any non-explicit relative path is resolved against
    a dataset location, i.e. considered relative to the location of the
    dataset. If no dataset is provided, the current working directory is
    used.

    Returns
    -------
    Absolute path
    """
    # first make sure it's actually a valid path:
    from datalad.support.network import PathRI
    if not isinstance(RI(path), PathRI):
        raise ValueError("%s is not a valid path" % path)

    path = expandpath(path, force_absolute=False)
    if is_explicit_path(path):
        # normalize path consistently between two (explicit and implicit) cases
        return dlabspath(path, norm=True)

    # no dataset given, use CWD as reference
    # note: abspath would disregard symlink in CWD
    top_path = getpwd() \
        if ds is None else ds.path if isinstance(ds, Dataset) else ds
    return normpath(opj(top_path, path))
Example #16
def path_is_under(values, path=None):
    """Whether a given path is a subdirectory of any of the given test values

    Parameters
    ----------
    values : sequence or dict
      Paths to be tested against. This can be a dictionary in which case
      all values from all keys will be tested against.
    path : path or None
      Test path. If None is given, the process' working directory is
      used.

    Returns
    -------
    bool
    """
    if path is None:
        from datalad.utils import getpwd
        path = getpwd()
    if isinstance(values, dict):
        values = chain(*values.values())
    for p in values:
        rpath = relpath(p, start=path)
        if rpath == curdir \
                or rpath == pardir \
                or set(psplit(rpath)) == {pardir}:
            # first match is enough
            return True
    return False
Example #17
def get_command_pwds(dataset):
    """Return the directory for the command.

    Parameters
    ----------
    dataset : Dataset

    Returns
    -------
    A tuple, where the first item is the absolute path of the pwd and the
    second is the pwd relative to the dataset's path.
    """
    if dataset:
        pwd = dataset.path
        rel_pwd = curdir
    else:
        # act on the whole dataset if nothing else was specified

        # Follow our generic semantic that if dataset is specified,
        # paths are relative to it, if not -- relative to pwd
        pwd = getpwd()
        # Pass pwd to get_dataset_root instead of os.path.curdir to handle
        # repos whose leading paths have a symlinked directory (see the
        # TMPDIR="/var/tmp/sym link" test case).
        dataset = get_dataset_root(pwd)

        if dataset:
            rel_pwd = relpath(pwd, dataset)
        else:
            rel_pwd = pwd  # and leave it to the checks below to decide
                           # whether to deal with it or to crash
    return pwd, rel_pwd
Example #18
def get_command_pwds(dataset):
    """Return the directory for the command.

    Parameters
    ----------
    dataset : Dataset

    Returns
    -------
    A tuple, where the first item is the absolute path of the pwd and the
    second is the pwd relative to the dataset's path.
    """
    if dataset:
        pwd = dataset.path
        rel_pwd = curdir
    else:
        # act on the whole dataset if nothing else was specified
        dataset = get_dataset_root(curdir)
        # Follow our generic semantic that if dataset is specified,
        # paths are relative to it, if not -- relative to pwd
        pwd = getpwd()
        if dataset:
            rel_pwd = relpath(pwd, dataset)
        else:
            rel_pwd = pwd  # and leave it to the checks below to decide
                           # whether to deal with it or to crash
    return pwd, rel_pwd
Example #19
def _test_assert_Xwd_unchanged_ok_chdir(func):
    # Test that we are not masking out other "more important" exceptions

    orig_cwd = os.getcwd()
    orig_pwd = getpwd()

    @assert_cwd_unchanged(ok_to_chdir=True)
    def do_chdir_value_error():
        func(os.pardir)
        return "a value"

    with swallow_logs() as cml:
        eq_(do_chdir_value_error(), "a value")
        eq_(orig_cwd, os.getcwd(),
            "assert_cwd_unchanged didn't return us back to cwd %s" % orig_cwd)
        eq_(orig_pwd, getpwd(),
            "assert_cwd_unchanged didn't return us back to cwd %s" % orig_pwd)
        assert_not_in("Mitigating and changing back", cml.out)
Example #20
def test_run_under_dir(d):
    orig_pwd = getpwd()
    orig_cwd = os.getcwd()

    @run_under_dir(d)
    def f(arg, kwarg=None):
        eq_(arg, 1)
        eq_(kwarg, 2)
        eq_(getpwd(), d)

    f(1, 2)
    eq_(getpwd(), orig_pwd)
    eq_(os.getcwd(), orig_cwd)

    # and if fails
    assert_raises(AssertionError, f, 1, 3)
    eq_(getpwd(), orig_pwd)
    eq_(os.getcwd(), orig_cwd)
Example #21
def test_normalize_path(git_path):

    gr = GitRepo(git_path)

    # cwd is currently outside the repo, so any relative path
    # should be interpreted as relative to `annex_path`
    assert_raises(FileNotInRepositoryError, _normalize_path, gr.path, getpwd())

    result = _normalize_path(gr.path, "testfile")
    eq_(result, "testfile", "_normalize_path() returned %s" % result)

    # result = _normalize_path(gr.path, op.join('.', 'testfile'))
    # eq_(result, "testfile", "_normalize_path() returned %s" % result)
    #
    # result = _normalize_path(gr.path, op.join('testdir', '..', 'testfile'))
    # eq_(result, "testfile", "_normalize_path() returned %s" % result)
    # Note: By now, normpath within normalize_paths() is disabled, therefore
    # disable these tests.

    result = _normalize_path(gr.path, op.join('testdir', 'testfile'))
    eq_(result, op.join("testdir", "testfile"),
        "_normalize_path() returned %s" % result)

    result = _normalize_path(gr.path, op.join(git_path, "testfile"))
    eq_(result, "testfile", "_normalize_path() returned %s" % result)

    # now we are inside, so
    # OLD PHILOSOPHY: relative paths are relative to cwd and have
    # to be converted to be relative to annex_path
    # NEW PHILOSOPHY: still relative to repo! unless starts with . (curdir) or .. (pardir)
    with chpwd(op.join(git_path, 'd1', 'd2')):

        result = _normalize_path(gr.path, "testfile")
        eq_(result, 'testfile', "_normalize_path() returned %s" % result)

        # if not joined as directory name but just a prefix to the filename, should
        # behave correctly
        for d in (op.curdir, op.pardir):
            result = _normalize_path(gr.path, d + "testfile")
            eq_(result, d + 'testfile',
                "_normalize_path() returned %s" % result)

        result = _normalize_path(gr.path, op.join(op.curdir, "testfile"))
        eq_(result, op.join('d1', 'd2', 'testfile'),
            "_normalize_path() returned %s" % result)

        result = _normalize_path(gr.path, op.join(op.pardir, 'testfile'))
        eq_(result, op.join('d1', 'testfile'),
            "_normalize_path() returned %s" % result)

        assert_raises(FileNotInRepositoryError, _normalize_path, gr.path,
                      op.join(git_path, '..', 'outside'))

        result = _normalize_path(gr.path, op.join(git_path, 'd1', 'testfile'))
        eq_(result, op.join('d1', 'testfile'),
            "_normalize_path() returned %s" % result)
Example #22
def test_rev_resolve_path(path):
    if op.realpath(path) != path:
        raise SkipTest("Test assumptions require non-symlinked parent paths")
    # initially ran into on OSX https://github.com/datalad/datalad/issues/2406
    opath = op.join(path, "origin")
    os.makedirs(opath)
    if not on_windows:
        lpath = op.join(path, "linked")
        os.symlink('origin', lpath)

    ds_global = Dataset(path)
    # path resolution of absolute paths is not influenced by symlinks
    # ignore the linked path on windows, it is not a symlink in the POSIX sense
    for d in (opath, ) if on_windows else (opath, lpath):
        ds_local = Dataset(d)
        # no symlink resolution
        eq_(str(rev_resolve_path(d)), d)
        with chpwd(d):
            # be aware: knows about cwd, but this CWD has symlinks resolved
            eq_(str(rev_resolve_path(d).cwd()), opath)
            # using pathlib's `resolve()` will resolve any
            # symlinks
            # also resolve `opath`, as on old windows systems the path might
            # come in crippled (e.g. C:\Users\MIKE~1/...)
            # and the comparison would otherwise fail spuriously
            eq_(rev_resolve_path('.').resolve(), ut.Path(opath).resolve())
            # no norming, but absolute paths, without resolving links
            eq_(rev_resolve_path('.'), ut.Path(d))
            eq_(str(rev_resolve_path('.')), d)

            eq_(str(rev_resolve_path(op.join(os.curdir, 'bu'), ds=ds_global)),
                op.join(d, 'bu'))
            eq_(str(rev_resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
                op.join(ds_global.path, 'bu'))

        # resolve against a dataset
        eq_(str(rev_resolve_path('bu', ds=ds_local)), op.join(d, 'bu'))
        eq_(str(rev_resolve_path('bu', ds=ds_global)), op.join(path, 'bu'))
        # but paths outside the dataset are left untouched
        eq_(str(rev_resolve_path(op.join(os.curdir, 'bu'), ds=ds_global)),
            op.join(getpwd(), 'bu'))
        eq_(str(rev_resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
            op.normpath(op.join(getpwd(), os.pardir, 'bu')))
Example #23
def test_GitRepo_files_decorator():

    class testclass(object):
        def __init__(self):
            self.path = opj('some', 'where')

        # TODO
        # yoh:  logic is alien to me below why to have two since both look identical!
        @normalize_paths
        def decorated_many(self, files):
            return files

        @normalize_paths
        def decorated_one(self, file_):
            return file_

    test_instance = testclass()

    # When a single file passed -- single path returned
    obscure_filename = get_most_obscure_supported_name()
    file_to_test = opj(test_instance.path, 'deep', obscure_filename)
    # file doesn't exist
    eq_(test_instance.decorated_one(file_to_test),
                 _normalize_path(test_instance.path, file_to_test))
    eq_(test_instance.decorated_one(file_to_test),
                 _normalize_path(test_instance.path, file_to_test))

    file_to_test = obscure_filename
    eq_(test_instance.decorated_many(file_to_test),
                 _normalize_path(test_instance.path, file_to_test))
    eq_(test_instance.decorated_one(file_to_test),
                 _normalize_path(test_instance.path, file_to_test))


    file_to_test = opj(obscure_filename, 'beyond', 'obscure')
    eq_(test_instance.decorated_many(file_to_test),
                 _normalize_path(test_instance.path, file_to_test))

    file_to_test = opj(getpwd(), 'somewhere', 'else', obscure_filename)
    assert_raises(FileNotInRepositoryError, test_instance.decorated_many,
                  file_to_test)

    # If a list passed -- list returned
    files_to_test = ['now', opj('a list', 'of'), 'paths']
    expect = []
    for item in files_to_test:
        expect.append(_normalize_path(test_instance.path, item))
    eq_(test_instance.decorated_many(files_to_test), expect)

    eq_(test_instance.decorated_many(''), [])

    assert_raises(ValueError, test_instance.decorated_many, 1)
    assert_raises(ValueError, test_instance.decorated_one, 1)
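What the decorator gives callers, condensed from the assertions above (testclass as defined in the test; paths illustrative):

inst = testclass()
inst.decorated_one('file.txt')               # single path in, single normalized path out
inst.decorated_many(['a', opj('b', 'c')])    # list in, list of normalized paths out
inst.decorated_many('')                      # -> []
inst.decorated_many(1)                       # raises ValueError: neither a path nor a list of paths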
Example #24
def test_GitRepo_files_decorator():

    class testclass(object):
        def __init__(self):
            self.path = op.join('some', 'where')

        # TODO
        # yoh:  logic is alien to me below why to have two since both look identical!
        @normalize_paths
        def decorated_many(self, files):
            return files

        @normalize_paths
        def decorated_one(self, file_):
            return file_

    test_instance = testclass()

    # When a single file passed -- single path returned
    obscure_filename = get_most_obscure_supported_name()
    file_to_test = op.join(test_instance.path, 'deep', obscure_filename)
    # file doesn't exist
    eq_(test_instance.decorated_one(file_to_test),
                 _normalize_path(test_instance.path, file_to_test))
    eq_(test_instance.decorated_one(file_to_test),
                 _normalize_path(test_instance.path, file_to_test))

    file_to_test = obscure_filename
    eq_(test_instance.decorated_many(file_to_test),
                 _normalize_path(test_instance.path, file_to_test))
    eq_(test_instance.decorated_one(file_to_test),
                 _normalize_path(test_instance.path, file_to_test))


    file_to_test = op.join(obscure_filename, 'beyond', 'obscure')
    eq_(test_instance.decorated_many(file_to_test),
                 _normalize_path(test_instance.path, file_to_test))

    file_to_test = op.join(getpwd(), 'somewhere', 'else', obscure_filename)
    assert_raises(FileNotInRepositoryError, test_instance.decorated_many,
                  file_to_test)

    # If a list passed -- list returned
    files_to_test = ['now', op.join('a list', 'of'), 'paths']
    expect = []
    for item in files_to_test:
        expect.append(_normalize_path(test_instance.path, item))
    eq_(test_instance.decorated_many(files_to_test), expect)

    eq_(test_instance.decorated_many(''), [])

    assert_raises(ValueError, test_instance.decorated_many, 1)
    assert_raises(ValueError, test_instance.decorated_one, 1)
Example #25
def test_getpwd_change_mode(tdir):
    from datalad import utils
    if utils._pwd_mode != 'PWD':
        raise SkipTest("Makes sense to be tested only in PWD mode, "
                       "but we seems to be beyond that already")
    # The evil plain chdir call
    os.chdir(tdir)
    # Just testing the logic of switching to cwd mode and issuing a warning
    with swallow_logs(new_level=logging.DEBUG) as cml:
        pwd = getpwd()
        eq_(pwd, str(Path(pwd).resolve()))  # might have symlinks, thus realpath
    assert_in("symlinks in the paths will be resolved", cml.out)
    eq_(utils._pwd_mode, 'cwd')
Example #26
    def __call__(url, dataset=None, recursive=False):

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the current working directory of the process:
        if ds is None:
            # try to find a dataset at or above PWD:
            dspath = GitRepo.get_toppath(getpwd())
            if dspath is None:
                raise ValueError("No dataset found at %s." % getpwd())
            ds = Dataset(dspath)
        assert (ds is not None)

        if not ds.is_installed():
            raise ValueError("No installed dataset found at "
                             "{0}.".format(ds.path))
        assert (ds.repo is not None)

        repos_to_update = [ds.repo]
        if recursive:
            repos_to_update += [
                GitRepo(opj(ds.path, sub_path))
                for sub_path in ds.get_dataset_handles(recursive=True)
            ]

        for handle_repo in repos_to_update:
            parser = get_module_parser(handle_repo)
            for submodule_section in parser.sections():
                submodule_name = submodule_section[11:-1]
                parser.set_value(
                    submodule_section, "url",
                    url.replace("%NAME", submodule_name.replace("/", "-")))

        return  # TODO: return value?
Example #27
    def __init__(self, patterns, pwd=None, expand=False):
        self.pwd = pwd or getpwd()
        self._expand = expand

        if patterns is None:
            self._maybe_dot = []
            self._paths = {"patterns": []}
        else:
            patterns, dots = partition(patterns, lambda i: i.strip() == ".")
            self._maybe_dot = ["."] if list(dots) else []
            self._paths = {
                "patterns": [relpath(p, start=pwd) if isabs(p) else p
                             for p in patterns]}
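A sketch of the normalisation this constructor performs. The class name GlobbedPaths is an assumption (the snippet shows only the method), and the paths are hypothetical:

gp = GlobbedPaths(['.', '/data/ds/sub/*.dat', 'code/*.py'], pwd='/data/ds')
gp._maybe_dot   # ['.']: a bare '.' is remembered separately
gp._paths       # {'patterns': ['sub/*.dat', 'code/*.py']}: absolute patterns rewritten relative to pwd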
Example #28
    def __call__(url, dataset=None, recursive=False):

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the current working directory of the process:
        if ds is None:
            # try to find a dataset at or above PWD:
            dspath = GitRepo.get_toppath(getpwd())
            if dspath is None:
                raise ValueError("No dataset found at %s." % getpwd())
            ds = Dataset(dspath)
        assert(ds is not None)

        if not ds.is_installed():
            raise ValueError("No installed dataset found at "
                             "{0}.".format(ds.path))
        assert(ds.repo is not None)

        repos_to_update = [ds.repo]
        if recursive:
            repos_to_update += [GitRepo(opj(ds.path, sub_path))
                                for sub_path in
                                ds.get_dataset_handles(recursive=True)]

        for handle_repo in repos_to_update:
            parser = get_module_parser(handle_repo)
            for submodule_section in parser.sections():
                submodule_name = submodule_section[11:-1]
                parser.set_value(submodule_section, "url",
                                 url.replace("%NAME",
                                             submodule_name.replace("/", "-")))

        return  # TODO: return value?
Example #29
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info
    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([_t2s(pl.dist()),
                                  _t2s(pl.mac_ver()),
                                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
    }
Example #30
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info
    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([_t2s(pl.dist()),
                                  _t2s(pl.mac_ver()),
                                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
    }
Example #31
def test_normalize_path(git_path):

    gr = GitRepo(git_path)

    # cwd is currently outside the repo, so any relative path
    # should be interpreted as relative to `annex_path`
    assert_raises(FileNotInRepositoryError, _normalize_path, gr.path, getpwd())

    result = _normalize_path(gr.path, "testfile")
    eq_(result, "testfile", "_normalize_path() returned %s" % result)

    # result = _normalize_path(gr.path, op.join('.', 'testfile'))
    # eq_(result, "testfile", "_normalize_path() returned %s" % result)
    #
    # result = _normalize_path(gr.path, op.join('testdir', '..', 'testfile'))
    # eq_(result, "testfile", "_normalize_path() returned %s" % result)
    # Note: By now, normpath within normalize_paths() is disabled, therefore
    # disable these tests.

    result = _normalize_path(gr.path, op.join('testdir', 'testfile'))
    eq_(result, op.join("testdir", "testfile"), "_normalize_path() returned %s" % result)

    result = _normalize_path(gr.path, op.join(git_path, "testfile"))
    eq_(result, "testfile", "_normalize_path() returned %s" % result)

    # now we are inside, so
    # OLD PHILOSOPHY: relative paths are relative to cwd and have
    # to be converted to be relative to annex_path
    # NEW PHILOSOPHY: still relative to repo! unless starts with . (curdir) or .. (pardir)
    with chpwd(op.join(git_path, 'd1', 'd2')):

        result = _normalize_path(gr.path, "testfile")
        eq_(result, 'testfile', "_normalize_path() returned %s" % result)

        # if not joined as directory name but just a prefix to the filename, should
        # behave correctly
        for d in (op.curdir, op.pardir):
            result = _normalize_path(gr.path, d + "testfile")
            eq_(result, d + 'testfile', "_normalize_path() returned %s" % result)

        result = _normalize_path(gr.path, op.join(op.curdir, "testfile"))
        eq_(result, op.join('d1', 'd2', 'testfile'), "_normalize_path() returned %s" % result)

        result = _normalize_path(gr.path, op.join(op.pardir, 'testfile'))
        eq_(result, op.join('d1', 'testfile'), "_normalize_path() returned %s" % result)

        assert_raises(FileNotInRepositoryError, _normalize_path, gr.path, op.join(git_path, '..', 'outside'))

        result = _normalize_path(gr.path, op.join(git_path, 'd1', 'testfile'))
        eq_(result, op.join('d1', 'testfile'), "_normalize_path() returned %s" % result)
Example #32
 def setup_cache(self):
     ds_path = create_test_dataset(self.dsname, spec='2/-2/-2', seed=0)[0]
     self.log("Setup cache ds path %s. CWD: %s", ds_path, getpwd())
     # Will store into a tarfile since otherwise install -r is way too slow
     # to be invoked for every benchmark
     # Store full path since apparently setup is not run in that directory
     self.tarfile = op.realpath(SampleSuperDatasetBenchmarks.tarfile)
     with tarfile.open(self.tarfile, "w") as tar:
         # F.CK -- Python tarfile can't later extract those because key dirs are
         # read-only.  For now just a workaround - make it all writeable
         from datalad.utils import rotree
         rotree(self.dsname, ro=False, chmod_files=False)
         tar.add(self.dsname, recursive=True)
     rmtree(self.dsname)
Example #33
    def __init__(self, patterns, pwd=None, expand=False):
        self.pwd = pwd or getpwd()
        self._expand = expand

        if patterns is None:
            self._maybe_dot = []
            self._paths = {"patterns": [], "sub_patterns": {}}
        else:
            patterns = list(map(assure_unicode, patterns))
            patterns, dots = partition(patterns, lambda i: i.strip() == ".")
            self._maybe_dot = ["."] if list(dots) else []
            self._paths = {
                "patterns": [op.relpath(p, start=pwd) if op.isabs(p) else p
                             for p in patterns],
                "sub_patterns": {}}
Example #34
    def _flyweight_id_from_args(cls, *args, **kwargs):

        if args:
            # to a certain degree we need to simulate an actual call to __init__
            # and make sure the passed arguments fit:
            # TODO: Figure out whether there is a cleaner way to do this in a
            # generic fashion
            assert('path' not in kwargs)
            path = args[0]
            args = args[1:]
        elif 'path' in kwargs:
            path = kwargs.pop('path')
        else:
            raise TypeError("__init__() requires argument `path`")

        if path is None:
            raise AttributeError

        # mirror what is happening in __init__
        if isinstance(path, ut.PurePath):
            path = text_type(path)

        # Custom handling for few special abbreviations
        path_ = path
        if path == '^':
            # get the topmost dataset from current location. Note that 'zsh'
            # might have its ideas on what to do with ^, so better use as -d^
            path_ = Dataset(curdir).get_superdataset(topmost=True).path
        elif path == '///':
            # TODO: logic/UI on installing a default dataset could move here
            # from search?
            path_ = cfg.obtain('datalad.locations.default-dataset')
        if path != path_:
            lgr.debug("Resolved dataset alias %r to path %r", path, path_)

        # Sanity check for argument `path`:
        # raise if we cannot deal with `path` at all or
        # if it is not a local thing:
        path_ = RI(path_).localpath

        # we want an absolute path, but no resolved symlinks
        if not isabs(path_):
            path_ = opj(getpwd(), path_)

        # use canonical paths only:
        path_ = normpath(path_)
        kwargs['path'] = path_
        return path_, args, kwargs
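Assuming this classmethod belongs to the Dataset flyweight, as the surrounding code suggests, the alias handling amounts to the following; the results depend on the local configuration and are only indicative:

Dataset('^')        # path of the topmost superdataset enclosing the current directory
Dataset('///')      # whatever cfg.obtain('datalad.locations.default-dataset') points to
Dataset('rel/ds')   # keyed by normpath(opj(getpwd(), 'rel/ds')), symlinks in PWD kept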
Example #35
    def _flyweight_id_from_args(cls, *args, **kwargs):

        if args:
            # to a certain degree we need to simulate an actual call to __init__
            # and make sure the passed arguments fit:
            # TODO: Figure out whether there is a cleaner way to do this in a
            # generic fashion
            assert('path' not in kwargs)
            path = args[0]
            args = args[1:]
        elif 'path' in kwargs:
            path = kwargs.pop('path')
        else:
            raise TypeError("__init__() requires argument `path`")

        if path is None:
            raise AttributeError

        # mirror what is happening in __init__
        if isinstance(path, ut.PurePath):
            path = text_type(path)

        # Custom handling for few special abbreviations
        path_ = path
        if path == '^':
            # get the topmost dataset from current location. Note that 'zsh'
            # might have its ideas on what to do with ^, so better use as -d^
            path_ = Dataset(curdir).get_superdataset(topmost=True).path
        elif path == '///':
            # TODO: logic/UI on installing a default dataset could move here
            # from search?
            path_ = cfg.obtain('datalad.locations.default-dataset')
        if path != path_:
            lgr.debug("Resolved dataset alias %r to path %r", path, path_)

        # Sanity check for argument `path`:
        # raise if we cannot deal with `path` at all or
        # if it is not a local thing:
        path_ = RI(path_).localpath

        # we want an absolute path, but no resolved symlinks
        if not isabs(path_):
            path_ = opj(getpwd(), path_)

        # use canonical paths only:
        path_ = normpath(path_)
        kwargs['path'] = path_
        return path_, args, kwargs
Example #36
    def setup(self):
        self.log("Setup ran in %s, existing paths: %s", getpwd(), glob('*'))

        tempdir = tempfile.mkdtemp(**get_tempfile_kwargs({}, prefix="bm"))
        self.remove_paths.append(tempdir)
        with tarfile.open(self.tarfile) as tar:
            tar.extractall(tempdir)

        # TODO -- remove this abomination after https://github.com/datalad/datalad/issues/1512 is fixed
        epath = op.join(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        self.repo = self.ds.repo
        self.log("Finished setup for %s", tempdir)
Example #37
    def __init__(self, patterns, pwd=None, expand=False):
        self.pwd = pwd or getpwd()
        self._expand = expand

        if patterns is None:
            self._maybe_dot = []
            self._patterns = []
        else:
            patterns = list(map(ensure_unicode, patterns))
            patterns, dots = partition(patterns, lambda i: i.strip() == ".")
            self._maybe_dot = ["."] if list(dots) else []
            self._patterns = [
                op.relpath(p, start=pwd) if op.isabs(p) else p
                for p in patterns
            ]
        self._cache = {}
Example #38
    def setup(self):
        self.log("Setup ran in %s, existing paths: %s", getpwd(), glob('*'))

        tempdir = tempfile.mkdtemp(
            **get_tempfile_kwargs({}, prefix="bm")
        )
        self.remove_paths.append(tempdir)
        with tarfile.open(self.tarfile) as tar:
            tar.extractall(tempdir)

        # TODO -- remove this abomination after https://github.com/datalad/datalad/issues/1512 is fixed
        epath = op.join(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        self.repo = self.ds.repo
        self.log("Finished setup for %s", tempdir)
Example #39
 def setup_cache(self):
     ds_path = create_test_dataset(
         self.dsname
         , spec='2/-2/-2'
         , seed=0
     )[0]
     self.log("Setup cache ds path %s. CWD: %s", ds_path, getpwd())
     # Will store into a tarfile since otherwise install -r is way too slow
     # to be invoked for every benchmark
     # Store full path since apparently setup is not run in that directory
     self.tarfile = op.realpath(SampleSuperDatasetBenchmarks.tarfile)
     with tarfile.open(self.tarfile, "w") as tar:
         # F.CK -- Python tarfile can't later extract those because key dirs are
         # read-only.  For now just a workaround - make it all writeable
         from datalad.utils import rotree
         rotree(self.dsname, ro=False, chmod_files=False)
         tar.add(self.dsname, recursive=True)
     rmtree(self.dsname)
Example #40
def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument,
    or based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is
      properly installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      Or raises an exception (InsufficientArgumentsError).
    """
    if dataset is not None and not isinstance(dataset, Dataset):
        dataset = Dataset(dataset)

    if dataset is None:  # possible scenario of cmdline calls
        dspath = get_dataset_root(getpwd())
        if not dspath:
            raise NoDatasetArgumentFound("No dataset found")
        dataset = Dataset(dspath)

    assert(dataset is not None)
    lgr.debug(u"Resolved dataset{0}: {1}".format(
        ' for {}'.format(purpose) if purpose else '',
        dataset))

    if check_installed and not dataset.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(dataset.path))

    return dataset
Example #41
def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument,
    or based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is
      properly installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      Or raises an exception (InsufficientArgumentsError).
    """
    if dataset is not None and not isinstance(dataset, Dataset):
        dataset = Dataset(dataset)

    if dataset is None:  # possible scenario of cmdline calls
        dspath = get_dataset_root(getpwd())
        if not dspath:
            raise NoDatasetArgumentFound("No dataset found")
        dataset = Dataset(dspath)

    assert(dataset is not None)
    lgr.debug("Resolved dataset{0}: {1}".format(
        ' for {}'.format(purpose) if purpose else '',
        dataset))

    if check_installed and not dataset.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(dataset.path))

    return dataset
Example #42
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info

    if hasattr(pl, 'dist'):
        dist = pl.dist()
    else:
        # Python 3.8 removed .dist but recommended "distro" is slow, so we
        # try it only if needed
        try:
            import distro
            dist = distro.linux_distribution(full_distribution_name=False)
        except ImportError:
            lgr.info(
                "Please install 'distro' package to obtain distribution information"
            )
            dist = tuple()
        except Exception as exc:
            lgr.warning(
                "No distribution information will be provided since 'distro' "
                "fails to import/run: %s", exc_str(exc))
            dist = tuple()

    return {
        'type':
        os.name,
        'name':
        pl.system(),
        'release':
        pl.release(),
        'version':
        pl.version(),
        'distribution':
        ' '.join([_t2s(dist),
                  _t2s(pl.mac_ver()),
                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length':
        get_max_path_length(getpwd()),
        'encoding':
        get_encoding_info(),
    }
Example #43
def test_resolve_path(somedir):

    abs_path = abspath(somedir)  # just to be sure
    rel_path = "some"
    expl_path_cur = opj(os.curdir, rel_path)
    expl_path_par = opj(os.pardir, rel_path)

    eq_(resolve_path(abs_path), abs_path)

    current = getpwd()
    # no Dataset => resolve using cwd:
    eq_(resolve_path(abs_path), abs_path)
    eq_(resolve_path(rel_path), opj(current, rel_path))
    eq_(resolve_path(expl_path_cur), normpath(opj(current, expl_path_cur)))
    eq_(resolve_path(expl_path_par), normpath(opj(current, expl_path_par)))

    # now use a Dataset as reference:
    ds = Dataset(abs_path)
    eq_(resolve_path(abs_path, ds), abs_path)
    eq_(resolve_path(rel_path, ds), opj(abs_path, rel_path))
    eq_(resolve_path(expl_path_cur, ds), normpath(opj(current, expl_path_cur)))
    eq_(resolve_path(expl_path_par, ds), normpath(opj(current, expl_path_par)))
Example #44
def test_resolve_path(somedir):

    abs_path = abspath(somedir)  # just to be sure
    rel_path = "some"
    expl_path_cur = opj(os.curdir, rel_path)
    expl_path_par = opj(os.pardir, rel_path)

    eq_(resolve_path(abs_path), abs_path)

    current = getpwd()
    # no Dataset => resolve using cwd:
    eq_(resolve_path(abs_path), abs_path)
    eq_(resolve_path(rel_path), opj(current, rel_path))
    eq_(resolve_path(expl_path_cur), normpath(opj(current, expl_path_cur)))
    eq_(resolve_path(expl_path_par), normpath(opj(current, expl_path_par)))

    # now use a Dataset as reference:
    ds = Dataset(abs_path)
    eq_(resolve_path(abs_path, ds), abs_path)
    eq_(resolve_path(rel_path, ds), opj(abs_path, rel_path))
    eq_(resolve_path(expl_path_cur, ds), normpath(opj(current, expl_path_cur)))
    eq_(resolve_path(expl_path_par, ds), normpath(opj(current, expl_path_par)))
Example #45
def test_install_subds_from_another_remote(topdir):
    # https://github.com/datalad/datalad/issues/1905
    from datalad.support.network import PathRI
    with chpwd(topdir):
        origin_ = 'origin'
        clone1_ = 'clone1'
        clone2_ = 'clone2'

        origin = create(origin_, no_annex=True)
        clone1 = install(source=origin, path=clone1_)
        # print("Initial clone")
        clone1.create_sibling('ssh://localhost%s/%s' % (PathRI(getpwd()).posixpath, clone2_), name=clone2_)

        # print("Creating clone2")
        clone1.publish(to=clone2_)
        clone2 = Dataset(clone2_)
        # print("Initiating subdataset")
        clone2.create('subds1')

        # print("Updating")
        clone1.update(merge=True, sibling=clone2_)
        # print("Installing within updated dataset -- should be able to install from clone2")
        clone1.install('subds1')
Example #46
def resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any explicit path (absolute or relative) is returned as an absolute path.
    In case of an explicit relative path, the current working directory is
    used as a reference. Any non-explicit relative path is resolved against
    a dataset location, i.e. considered relative to the location of the
    dataset. If no dataset is provided, the current working directory is
    used.

    Returns
    -------
    Absolute path
    """
    path = expandpath(path, force_absolute=False)
    # TODO: normpath?!
    if is_explicit_path(path):
        return abspath(path)
    # no dataset given, use CWD as reference
    # note: abspath would disregard symlink in CWD
    top_path = getpwd() \
        if ds is None else ds.path if isinstance(ds, Dataset) else ds
    return normpath(opj(top_path, path))
Example #47
    def __call__(
            path=None,
            initopts=None,
            force=False,
            description=None,
            dataset=None,
            no_annex=False,
            fake_dates=False,
            cfg_proc=None
    ):
        refds_path = dataset.path if hasattr(dataset, 'path') else dataset

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if path:
            path = rev_resolve_path(path, dataset)

        path = path if path \
            else getpwd() if dataset is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        # prep for yield
        res = dict(action='create', path=text_type(path),
                   logger=lgr, type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != path:
            refds = require_dataset(
                refds_path, check_installed=True,
                purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        dataset, text_type(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = rev_get_dataset_root(
            op.normpath(op.join(text_type(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if any(
                    check_path == p or check_path in p.parents
                    for p in pstatus):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        text_type(parentds_path),
                        [text_type(c) for c in conflict])})
                yield res
                return
            # another set of checks to see whether the target path is pointing
            # into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in iteritems(pstatus)
                if v.get('type', None) == 'dataset'}
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        text_type(conflict[0]),
                        text_type(parentds_path))})
                yield res
                return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and \
            dataset.path == path else Dataset(text_type(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`force` option to ignore'})
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                git_opts=initopts,
                fake_dates=fake_dates)
            # place a .noannex file to indicate annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {
                'type': 'file',
                'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates
            )
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(
                cfg.obtain('datalad.repo.backend'),
                persistent=True, commit=False)
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'}
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (
                ('config', 'annex.largefiles', 'nothing'),
                ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
                ('metadata/objects/**', 'annex.largefiles',
                 '({})'.format(cfg.obtain(
                     'datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbds.repo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(
                        op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbds.repo.set_gitattributes(
                    set_attrs,
                    attrfile=op.join('.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbds.repo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get(
                    'annex.largefiles', None) == 'nothing':
                tbds.repo.set_gitattributes([
                    ('**/.git*', {'annex.largefiles': 'nothing'})])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'}
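            # Roughly, the attribute files then declare the following
            # (illustrative rendering; the aggregate limit comes from config):
            #   .gitattributes:           **/.git* annex.largefiles=nothing
            #   .datalad/.gitattributes:  config annex.largefiles=nothing
            #                             metadata/aggregate* annex.largefiles=nothing
            #                             metadata/objects/** annex.largefiles=(<limit>)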

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note that the Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(
            id_var,
            tbds_id if tbds_id is not None else uuid_id,
            where='dataset',
            reload=False)
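        # With where='dataset' the value lands in the committed .datalad/config,
        # roughly (illustrative layout, generated UUID elided):
        #   [datalad "dataset"]
        #       id = <uuid>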

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in iteritems(tbds.config.overrides):
            tbds.config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> full reload
        tbds.config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbds.repo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'}

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.repo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(dataset, Dataset) and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.save(
                    path=tbds.path,
            ):
                yield r

        res.update({'status': 'ok'})
        yield res

        for cfg_proc_ in cfg_proc or []:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r
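
# Minimal usage sketch for the creation routine above (assumption: it backs
# the `create` command available as datalad.api.create / Dataset.create;
# the path and procedure name below are illustrative):
#
#   from datalad.api import create
#   ds = create(path='/tmp/newds', cfg_proc=['yoda'])
#   print(ds.id)   # the datalad.dataset.id recorded above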
Example #48
def rev_resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any path is returned as an absolute path. If, and only if, a dataset
    object instance is given as `ds`, relative paths are interpreted as
    relative to the given dataset. In all other cases, relative paths are
    treated as relative to the current working directory.

    Note however, that this function is not able to resolve arbitrarily
    obfuscated path specifications. All operations are purely lexical, and no
    actual path resolution against the filesystem content is performed.
    Consequently, common relative path arguments like '../something' (relative
    to PWD) can be handled properly, but things like 'down/../under' cannot, as
    resolving this path properly depends on the actual target of any
    (potential) symlink leading up to '..'.

    Parameters
    ----------
    path : str or PathLike or list
      Platform-specific path specification. Multiple path specifications
      can be given as a list.
    ds : Dataset or None
      Dataset instance to resolve relative paths against.

    Returns
    -------
    `pathlib.Path` object or list(Path)
      When a list was given as input a list is returned, a Path instance
      otherwise.
    """
    got_ds_instance = isinstance(ds, Dataset)
    if ds is not None and not got_ds_instance:
        ds = require_dataset(
            ds, check_installed=False, purpose='path resolution')
    out = []
    for p in assure_list(path):
        if ds is None or not got_ds_instance:
            # no dataset at all or no instance provided -> CWD is always the reference
            # nothing needs to be done here. Path-conversion and absolutification
            # are done next
            pass
        # we have a given dataset instance
        elif not Path(p).is_absolute():
            # we have a dataset and a non-absolute path ->
            # resolve it against the dataset
            p = ds.pathobj / p

        p = ut.Path(p)

        # make sure we return an absolute path, but without actually
        # resolving anything
        if not p.is_absolute():
            # in general it is almost impossible to use resolve() when
            # we can have symlinks in the root path of a dataset
            # (that we don't want to resolve here), symlinks to annex'ed
            # files (that we never want to resolve), and other within-repo
            # symlinks that we (sometimes) want to resolve (i.e. symlinked
            # paths for addressing content vs adding content)
            # CONCEPT: do the minimal thing to catch most real-world inputs
            # ASSUMPTION: the only sane relative path inputs that need
            # handling and can be handled are upward references like
            # '../../some/that', whereas stuff like 'down/../someotherdown'
            # are intellectual exercises
            # ALGORITHM: match any number of leading '..' path components
            # and shorten the PWD by that number
            # NOT using ut.Path.cwd(), because it has symlinks resolved!!
            pwd_parts = ut.Path(getpwd()).parts
            path_parts = p.parts
            leading_parents = 0
            for pp in p.parts:
                if pp == op.pardir:
                    leading_parents += 1
                    path_parts = path_parts[1:]
                elif pp == op.curdir:
                    # we want to discard that, but without stripping
                    # a corresponding parent
                    path_parts = path_parts[1:]
                else:
                    break
            p = ut.Path(
                op.join(
                    *(pwd_parts[:-leading_parents if leading_parents else None]
                      + path_parts)))
        # note that we will not "normpath()" the result, check the
        # pathlib docs for why this is the only sane choice in the
        # face of the possibility of symlinks in the path
        out.append(p)
    return out[0] if isinstance(path, (string_types, PurePath)) else out
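
# Usage sketch (hypothetical paths; behavior as described in the docstring):
#   rev_resolve_path('sub/file.txt', ds=Dataset('/data/ds'))
#       -> Path('/data/ds/sub/file.txt')
#   rev_resolve_path('../elsewhere')             # with PWD == /home/me/work
#       -> Path('/home/me/elsewhere')            # leading '..' folded lexically
#   rev_resolve_path('down/../under')
#       -> absolute path that still contains 'down/..' (no normalization)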
Example #49
    def __call__(name=None, dataset=None,
                 merge=False, recursive=False, fetch_all=False,
                 reobtain_data=False):
        """
        """
        # TODO: Is there an 'update filehandle' similar to install and publish?
        # What does it mean?

        if reobtain_data:
            # TODO: properly define, what to do
            raise NotImplementedError("TODO: Option '--reobtain-data' not "
                                      "implemented yet.")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the current working directory of the process:
        if ds is None:
            # try to find a dataset at or above PWD:
            dspath = GitRepo.get_toppath(getpwd())
            if dspath is None:
                raise ValueError("No dataset found at %s." % getpwd())
            ds = Dataset(dspath)
        assert(ds is not None)

        if not ds.is_installed():
            raise ValueError("No installed dataset found at "
                             "{0}.".format(ds.path))
        assert(ds.repo is not None)

        repos_to_update = [ds.repo]
        if recursive:
            repos_to_update += [GitRepo(opj(ds.path, sub_path))
                                for sub_path in
                                ds.get_dataset_handles(recursive=True)]

        for repo in repos_to_update:
            # get all remotes:
            remotes = repo.git_get_remotes()
            if name and name not in remotes:
                lgr.warning("'%s' not known to dataset %s.\nSkipping" %
                            (name, repo.path))
                continue

            # Currently '--merge' works for single remote only:
            # TODO: - condition still incomplete
            #       - We can merge if a remote was given or there is a
            #         tracking branch
            #       - we also can fetch all remotes independently on whether or
            #         not we merge a certain remote
            if not name and len(remotes) > 1 and merge:
                lgr.debug("Found multiple remotes:\n%s" % remotes)
                raise NotImplementedError("No merge strategy for multiple "
                                          "remotes implemented yet.")
            lgr.info("Updating handle '%s' ..." % repo.path)

            # fetch remote(s):
            repo.git_fetch(name if name else '',
                           "--all" if fetch_all else '')

            # if it is an annex and there is a tracking branch, and we didn't
            # fetch the entire remote anyway, explicitly fetch git-annex
            # branch:
            # TODO: Is this logic correct? Shouldn't we fetch git-annex from
            # `name` if there is any (or if there is no tracking branch but we
            # have a `name`)?
            if knows_annex(repo.path) and not fetch_all:
                # check for tracking branch's remote:
                try:
                    std_out, std_err = \
                        repo._git_custom_command('',
                        ["git", "config", "--get",
                         "branch.{active_branch}.remote".format(
                             active_branch=repo.git_get_active_branch())])
                except CommandError as e:
                    if e.code == 1 and e.stdout == "":
                        std_out = None
                    else:
                        raise
                if std_out:  # we have a "tracking remote"
                    repo.git_fetch("%s git-annex" % std_out.strip())

            # merge:
            if merge:
                lgr.info("Applying changes from tracking branch...")
                cmd_list = ["git", "pull"]
                if name:
                    cmd_list.append(name)
                    # branch needed, if not default remote
                    # => TODO: use default remote/tracking branch to compare
                    #          (see above, where git-annex is fetched)
                    # => TODO: allow for passing a branch
                    # (or more general refspec?)
                    # For now, just use the same name
                    cmd_list.append(repo.git_get_active_branch())

                out, err = repo._git_custom_command('', cmd_list)
                lgr.info(out)
                if knows_annex(repo.path):
                    # annex-apply:
                    lgr.info("Updating annex ...")
                    out, err = repo._git_custom_command('', ["git", "annex", "merge"])
                    lgr.info(out)
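            # In effect, the merge branch above amounts to (assuming a single
            # remote `name` and a like-named active branch):
            #   git fetch <name>
            #   git pull <name> <active-branch>
            #   git annex merge            # only for annex repos
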
    def __call__(sshurl, target=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None, recursive=False,
                 existing='raise', shared=False):

        if sshurl is None:
            raise ValueError("""insufficient information for target creation
            (needs at least a dataset and a SSH URL).""")

        if target is None and (target_url is not None
                               or target_pushurl is not None):
            raise ValueError("""insufficient information for adding the target
            as a sibling (needs at least a name)""")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if ds is None:
            # try to find a dataset at or above CWD
            dspath = GitRepo.get_toppath(abspath(getpwd()))
            if dspath is None:
                raise ValueError("""No dataset found
                                 at or above {0}.""".format(getpwd()))
            ds = Dataset(dspath)
            lgr.debug("Resolved dataset for target creation: {0}".format(ds))
        assert(ds is not None and sshurl is not None)

        if not ds.is_installed():
            raise ValueError("""Dataset {0} is not installed yet.""".format(ds))
        assert(ds.repo is not None)

        # determine target parameters:
        parsed_target = urlparse(sshurl)
        host_name = parsed_target.netloc

        # TODO: Sufficient to fail on this condition?
        if not parsed_target.netloc:
            raise ValueError("Malformed URL: {0}".format(sshurl))

        if target_dir is None:
            if parsed_target.path:
                target_dir = parsed_target.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = False
        if "%NAME" not in target_dir:
            replicate_local_structure = True
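        # Illustration (hypothetical values): with target_dir='/srv/%NAME' a
        # dataset entry 'ds/sub01' is created at '/srv/ds-sub01' ('/' -> '-');
        # without '%NAME' the local layout is replicated below target_dir.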

        # collect datasets to use:
        datasets = dict()
        datasets[basename(ds.path)] = ds
        if recursive:
            for subds in ds.get_dataset_handles(recursive=True):
                sub_path = opj(ds.path, subds)
                # TODO: when enhancing Dataset/*Repo classes and therefore
                # adapt to moved code, make proper distinction between name and
                # path of a submodule, which are technically different. This
                # probably will become important on windows as well as whenever
                # we want to allow for moved worktrees.
                datasets[basename(ds.path) + '/' + subds] = \
                    Dataset(sub_path)

        # setup SSH Connection:
        # TODO: Make the entire setup a helper to use it when pushing via
        # publish?

        # - build control master:
        from datalad.utils import assure_dir
        not_supported_on_windows("TODO")
        from os import geteuid  # Linux specific import
        var_run_user_datalad = "/var/run/user/%s/datalad" % geteuid()
        assure_dir(var_run_user_datalad)
        control_path = "%s/%s" % (var_run_user_datalad, host_name)
        control_path += ":%s" % parsed_target.port if parsed_target.port else ""

        # - start control master:
        cmd = "ssh -o ControlMaster=yes -o \"ControlPath=%s\" " \
              "-o ControlPersist=yes %s exit" % (control_path, host_name)
        lgr.debug("Try starting control master by calling:\n%s" % cmd)
        import subprocess
        proc = subprocess.Popen(cmd, shell=True)
        proc.communicate(input="\n")  # why is this necessary?

        runner = Runner()
        ssh_cmd = ["ssh", "-S", control_path, host_name]
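        # This is plain OpenSSH connection multiplexing; the shell equivalent
        # (with a hypothetical $CP socket path) would be:
        #   ssh -o ControlMaster=yes -o "ControlPath=$CP" -o ControlPersist=yes <host> exit
        #   ssh -S "$CP" <host> <command>     # reuse the master connection
        #   ssh -O stop -S "$CP" <host>       # torn down further below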

        lgr.info("Creating target datasets ...")
        for current_dataset in datasets:
            if not replicate_local_structure:
                path = target_dir.replace("%NAME",
                                          current_dataset.replace("/", "-"))
            else:
                # TODO: opj depends on local platform, not the remote one.
                # check how to deal with it. Does windows ssh server accept
                # posix paths? vice versa? Should planned SSH class provide
                # tools for this issue?
                path = normpath(opj(target_dir,
                                    relpath(datasets[current_dataset].path,
                                            start=ds.path)))

            if path != '.':
                # check if target exists
                # TODO: Is this condition valid for != '.' only?
                path_exists = True
                cmd = ssh_cmd + ["ls", path]
                try:
                    out, err = runner.run(cmd, expect_fail=True,
                                          expect_stderr=True)
                except CommandError as e:
                    if "No such file or directory" in e.stderr and \
                                    path in e.stderr:
                        path_exists = False
                    else:
                        raise  # It's an unexpected failure here

                if path_exists:
                    if existing == 'raise':
                        raise RuntimeError(
                            "Target directory %s already exists." % path)
                    elif existing == 'skip':
                        continue
                    elif existing == 'replace':
                        pass
                    else:
                        raise ValueError("Do not know how to hand existing=%s" % repr(existing))

                cmd = ssh_cmd + ["mkdir", "-p", path]
                try:
                    runner.run(cmd)
                except CommandError as e:
                    lgr.error("Remotely creating target directory failed at "
                              "%s.\nError: %s" % (path, str(e)))
                    continue

            # init git repo
            cmd = ssh_cmd + ["git", "-C", path, "init"]
            if shared:
                cmd.append("--shared=%s" % shared)
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Remotely initializing git repository failed at %s."
                          "\nError: %s\nSkipping ..." % (path, str(e)))
                continue

            # check git version on remote end:
            cmd = ssh_cmd + ["git", "version"]
            try:
                out, err = runner.run(cmd)
                # output is "git version X.Y.Z ..."; take the last token
                git_version = out.strip().split()[-1] if out.strip() else ""
                lgr.debug("Detected git version on server: %s" % git_version)
                # compare numerically; a plain string comparison would
                # misorder e.g. "2.10" and "2.4"
                version_tuple = tuple(
                    int(x) for x in git_version.split(".")[:2] if x.isdigit())
                if version_tuple < (2, 4):
                    lgr.error("Git version >= 2.4 needed to configure remote."
                              " Version detected on server: %s\nSkipping ..."
                              % git_version)
                    continue

            except CommandError as e:
                lgr.warning(
                    "Failed to determine git version on remote.\n"
                    "Error: {0}\nTrying to configure anyway "
                    "...".format(e.message))

            # allow for pushing to checked out branch
            cmd = ssh_cmd + ["git", "-C", path, "config",
                             "receive.denyCurrentBranch",
                             "updateInstead"]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.warning("git config failed at remote location %s.\n"
                            "You will not be able to push to checked out "
                            "branch." % path)

            # enable post-update hook:
            cmd = ssh_cmd + ["mv", opj(path, ".git/hooks/post-update.sample"),
                             opj(path, ".git/hooks/post-update")]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Failed to enable post update hook.\n"
                          "Error: %s" % e.message)

            # initially update server info "manually":
            cmd = ssh_cmd + ["git", "-C", path, "update-server-info"]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Failed to update server info.\n"
                          "Error: %s" % e.message)

        # stop controlmaster (close ssh connection):
        cmd = ["ssh", "-O", "stop", "-S", control_path, host_name]
        out, err = runner.run(cmd, expect_stderr=True)

        if target:
            # add the sibling(s):
            if target_url is None:
                target_url = sshurl
            if target_pushurl is None:
                target_pushurl = sshurl
            result_adding = AddSibling()(dataset=ds,
                                         name=target,
                                         url=target_url,
                                         pushurl=target_pushurl,
                                         recursive=recursive,
                                         force=existing in {'replace'})
Example #51
    def __call__(dataset=None, name=None, url=None,
                 pushurl=None, recursive=False, force=False):

        # TODO: Detect malformed URL and fail?

        if name is None or (url is None and pushurl is None):
            raise ValueError("""insufficient information to add a sibling
                (needs at least a dataset, a name and a URL).""")
        if url is None:
            url = pushurl

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if ds is None:
            # try to find a dataset at or above CWD
            dspath = GitRepo.get_toppath(abspath(getpwd()))
            if dspath is None:
                raise ValueError(
                        "No dataset found at or above {0}.".format(getpwd()))
            ds = Dataset(dspath)
            lgr.debug("Resolved dataset for target creation: {0}".format(ds))

        assert(ds is not None and name is not None and url is not None)

        if not ds.is_installed():
            raise ValueError("Dataset {0} is not installed yet.".format(ds))
        assert(ds.repo is not None)

        ds_basename = basename(ds.path)
        repos = {
            ds_basename: {'repo': ds.repo}
        }
        if recursive:
            for subds in ds.get_dataset_handles(recursive=True):
                sub_path = opj(ds.path, subds)
                repos[ds_basename + '/' + subds] = {
#                repos[subds] = {
                    'repo': GitRepo(sub_path, create=False)
                }

        # Note: This is copied from create_publication_target_sshwebserver
        # as it is the same logic as for its target_dir.
        # TODO: centralize and generalize template symbol handling
        # TODO: Check pushurl for template symbols too. Probably raise if only
        #       one of them uses such symbols

        replicate_local_structure = False
        if "%NAME" not in url:
            replicate_local_structure = True

        for repo in repos:
            if not replicate_local_structure:
                repos[repo]['url'] = url.replace("%NAME",
                                                 repo.replace("/", "-"))
                if pushurl:
                    repos[repo]['pushurl'] = pushurl.replace("%NAME",
                                                             repo.replace("/",
                                                                          "-"))
            else:
                repos[repo]['url'] = url
                if pushurl:
                    repos[repo]['pushurl'] = pushurl

                if repo != ds_basename:
                    repos[repo]['url'] = _urljoin(repos[repo]['url'], repo[len(ds_basename)+1:])
                    if pushurl:
                        repos[repo]['pushurl'] = _urljoin(repos[repo]['pushurl'], repo[len(ds_basename)+1:])

        # collect existing remotes:
        already_existing = list()
        conflicting = list()
        for repo in repos:
            if name in repos[repo]['repo'].git_get_remotes():
                already_existing.append(repo)
                lgr.debug("""Remote '{0}' already exists
                          in '{1}'.""".format(name, repo))

                existing_url = repos[repo]['repo'].git_get_remote_url(name)
                existing_pushurl = \
                    repos[repo]['repo'].git_get_remote_url(name, push=True)

                if repos[repo]['url'].rstrip('/') != existing_url.rstrip('/') \
                        or (pushurl and existing_pushurl and
                            repos[repo]['pushurl'].rstrip('/') !=
                                    existing_pushurl.rstrip('/')) \
                        or (pushurl and not existing_pushurl):
                    conflicting.append(repo)

        if not force and conflicting:
            raise RuntimeError("Sibling '{0}' already exists with conflicting"
                               " URL for {1} dataset(s). {2}".format(
                                   name, len(conflicting), conflicting))

        runner = Runner()
        successfully_added = list()
        for repo in repos:
            if repo in already_existing:
                if repo not in conflicting:
                    lgr.debug("Skipping {0}. Nothing to do.".format(repo))
                    continue
                # rewrite url
                cmd = ["git", "remote", "set-url", name, repos[repo]['url']]
                runner.run(cmd, cwd=repos[repo]['repo'].path)
            else:
                # add the remote
                cmd = ["git", "remote", "add", name, repos[repo]['url']]
                runner.run(cmd, cwd=repos[repo]['repo'].path)
            if pushurl:
                cmd = ["git", "remote", "set-url", "--push", name,
                       repos[repo]['pushurl']]
                runner.run(cmd, cwd=repos[repo]['repo'].path)
            successfully_added.append(repo)

        return successfully_added
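
# Per repository, the loop above issues the equivalent of (placeholder values):
#   git remote add <name> <url>               # or `set-url` when rewriting
#   git remote set-url --push <name> <pushurl>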
Example #52
    def __call__(
            path=None,
            force=False,
            description=None,
            dataset=None,
            no_annex=False,
            save=True,
            annex_version=None,
            annex_backend='MD5E',
            native_metadata_type=None,
            shared_access=None,
            git_opts=None,
            annex_opts=None,
            annex_init_opts=None):

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex and declaring no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex init and declaring no "
                                 "annex repo.")

        if not isinstance(force, bool):
            raise ValueError("force should be bool, got %r.  Did you mean to provide a 'path'?" % force)

        # straight from input arg, no messing around before this
        if path is None:
            if dataset is None:
                # nothing given explicitly, assume create fresh right here
                path = getpwd()
            else:
                # no path, but dataset -> create that dataset
                path = dataset.path
        else:
            # resolve the path against a potential dataset
            path = resolve_path(path, ds=dataset)

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access

        # check for sane subdataset path
        real_targetpath = with_pathsep(realpath(path))  # realpath OK
        if dataset is not None:
            # make sure we get to an expected state
            if not real_targetpath.startswith(  # realpath OK
                    with_pathsep(realpath(dataset.path))):  # realpath OK
                raise ValueError("path {} outside {}".format(path, dataset))

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if dataset is not None and dataset.path == path else Dataset(path)

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            raise ValueError("Cannot create dataset in directory %s "
                             "(not empty). Use option 'force' in order to "
                             "ignore this and enforce creation." % tbds.path)

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(
                tbds.path,
                url=None,
                create=True,
                git_opts=git_opts)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                backend=annex_backend,
                version=annex_version,
                description=description,
                git_opts=git_opts,
                annex_opts=annex_opts,
                annex_init_opts=annex_init_opts)

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a re-create
            tbds.config.unset(id_var, where='dataset')
        tbds.config.add(
            id_var,
            tbds.id if tbds.id is not None else uuid.uuid1().urn.split(':')[-1],
            where='dataset')

        # make sure that v6 annex repos never commit content under .datalad
        with open(opj(tbds.path, '.datalad', '.gitattributes'), 'a') as gitattr:
            # TODO this will need adjusting, when annex'ed aggregate meta data
            # comes around
            gitattr.write('** annex.largefiles=nothing\n')

        # save everything
        tbds.add('.datalad', to_git=True, save=False)

        if save:
            save_dataset(
                tbds,
                paths=['.datalad'],
                message='[DATALAD] new dataset')

            # the next only makes sense if we saved the created dataset,
            # otherwise we have no committed state to be registered
            # in the parent
            if dataset is not None and dataset.path != tbds.path:
                # we created a dataset in another dataset
                # -> make submodule
                dataset.add(tbds.path, save=save, ds2super=True)

        return tbds
Example #53
# If there is a bundled git, make sure GitPython uses it too:
from datalad.cmd import GitRunner
GitRunner._check_git_path()
if GitRunner._GIT_PATH:
    import os
    os.environ['GIT_PYTHON_GIT_EXECUTABLE'] = \
        os.path.join(GitRunner._GIT_PATH, 'git')
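    # GitPython picks up GIT_PYTHON_GIT_EXECUTABLE when its `git` module is
    # first imported, so setting it here, before any GitPython import, is what
    # makes the bundled git take effect (assumption about import ordering).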

from .config import ConfigManager
cfg = ConfigManager()

from .log import lgr
from datalad.utils import get_encoding_info, get_envvars_info, getpwd

# To analyze/initiate our decision making on what current directory to return
getpwd()

lgr.log(5, "Instantiating ssh manager")
from .support.sshconnector import SSHManager
ssh_manager = SSHManager()
atexit.register(ssh_manager.close, allow_fail=False)
atexit.register(lgr.log, 5, "Exiting")

from .version import __version__


def test(module='datalad', verbose=False, nocapture=False, pdb=False, stop=False):
    """A helper to run datalad's tests.  Requires nose
    """
    argv = [] #module]
    # could make it 'smarter' but decided to be explicit so later we could