Exemple #1
0
def test_optimized_cloning(path):
    # make test repo with one file and one commit
    originpath = op.join(path, 'origin')
    repo = GitRepo(originpath, create=True)
    with open(op.join(originpath, 'test'), 'w') as f:
        f.write('some')
    repo.add('test')
    repo.commit('init')
    ok_clean_git(originpath, annex=False)
    from glob import glob

    def _get_inodes(repo):
        return dict([
            (os.path.join(*o.split(os.sep)[-2:]), os.stat(o).st_ino)
            for o in glob(os.path.join(repo.repo.git_dir, 'objects', '*', '*'))
        ])

    origin_inodes = _get_inodes(repo)
    # now clone it in different ways and see what happens to the object storage
    from datalad.support.network import get_local_file_url
    clonepath = op.join(path, 'clone')
    for src in (originpath, get_local_file_url(originpath)):
        # deprecated
        assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath)
        clone = GitRepo.clone(url=src, path=clonepath, create=True)
        clone_inodes = _get_inodes(clone)
        eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src))
        rmtree(clonepath)
Exemple #2
0
def test_recurse_existing(src, path):
    origin_ds = _make_dataset_hierarchy(src)

    # make sure recursion_limit works as expected across a range of depths
    for depth in range(len(origin_ds)):
        datasets = assure_list(
            install(path, source=src, recursive=True, recursion_limit=depth))
        # we expect one dataset per level
        eq_(len(datasets), depth + 1)
        rmtree(path)

    # now install all but the last two levels, no data
    root, sub1, sub2 = install(path, source=src, recursive=True, recursion_limit=2)
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is False)
    sub3 = Dataset(opj(sub2.path, 'sub3'))
    ok_(not sub3.is_installed())
    # now get all content in all existing datasets, no new datasets installed
    # in the process
    files = root.get(curdir, recursive=True, recursion_limit='existing')
    eq_(len(files), 1)
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not sub3.is_installed())
    # now pull down all remaining datasets, no data
    sub3, sub4 = root.get(curdir, recursive=True, get_data=False)
    ok_(sub4.is_installed())
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is False)
    # aaannd all data
    files = root.get(curdir, recursive=True)
    eq_(len(files), 1)
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is True)
Exemple #3
0
def test_get_missing(path):
    repo = GitRepo(path, create=True)
    os.makedirs(op.join(path, 'deep'))
    with open(op.join(path, 'test1'), 'w') as f:
        f.write('some')
    with open(op.join(path, 'deep', 'test2'), 'w') as f:
        f.write('some more')
    # no files tracked yet, so nothing changed
    eq_(repo.get_changed_files(), [])
    repo.add('.')
    # still no differences between worktree and staged
    eq_(repo.get_changed_files(), [])
    eq_(set(repo.get_changed_files(staged=True)),
        {'test1', op.join('deep', 'test2')})
    eq_(set(repo.get_changed_files(staged=True, diff_filter='AD')),
        {'test1', op.join('deep', 'test2')})
    eq_(repo.get_changed_files(staged=True, diff_filter='D'), [])
    repo.commit()
    eq_(repo.get_changed_files(), [])
    eq_(repo.get_changed_files(staged=True), [])
    ok_clean_git(path, annex=False)
    unlink(op.join(path, 'test1'))
    eq_(repo.get_missing_files(), ['test1'])
    rmtree(op.join(path, 'deep'))
    eq_(sorted(repo.get_missing_files()), [op.join('deep', 'test2'), 'test1'])
    # nothing is actually known to be deleted
    eq_(repo.get_deleted_files(), [])
    # do proper removal
    repo.remove(op.join(path, 'test1'))
    # no longer missing
    eq_(repo.get_missing_files(), [op.join('deep', 'test2')])
    # but deleted
    eq_(repo.get_deleted_files(), ['test1'])
Exemple #4
0
def test_recurse_existing(src, path):
    origin_ds = _make_dataset_hierarchy(src)

    # make sure recursion_limit works as expected across a range of depths
    for depth in range(len(origin_ds)):
        datasets = assure_list(
            install(path, source=src, recursive=True, recursion_limit=depth))
        # we expect one dataset per level
        eq_(len(datasets), depth + 1)
        rmtree(path)

    # now install all but the last two levels, no data
    root, sub1, sub2 = install(path, source=src, recursive=True, recursion_limit=2)
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is False)
    sub3 = Dataset(opj(sub2.path, 'sub3'))
    ok_(not sub3.is_installed())
    # now get all content in all existing datasets, no new datasets installed
    # in the process
    files = root.get(curdir, recursive=True, recursion_limit='existing')
    eq_(len(files), 1)
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not sub3.is_installed())
    # now pull down all remaining datasets, no data
    sub3, sub4 = root.get(curdir, recursive=True, get_data=False)
    ok_(sub4.is_installed())
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is False)
    # aaannd all data
    files = root.get(curdir, recursive=True)
    eq_(len(files), 1)
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is True)
Exemple #5
0
def test_get_missing(path):
    repo = GitRepo(path, create=True)
    os.makedirs(op.join(path, 'deep'))
    with open(op.join(path, 'test1'), 'w') as f:
        f.write('some')
    with open(op.join(path, 'deep', 'test2'), 'w') as f:
        f.write('some more')
    # no files tracked yet, so nothing changed
    eq_(repo.get_changed_files(), [])
    repo.add('.')
    # still no differences between worktree and staged
    eq_(repo.get_changed_files(), [])
    eq_(set(repo.get_changed_files(staged=True)),
        {'test1', op.join('deep', 'test2')})
    eq_(set(repo.get_changed_files(staged=True, diff_filter='AD')),
        {'test1', op.join('deep', 'test2')})
    eq_(repo.get_changed_files(staged=True, diff_filter='D'), [])
    repo.commit()
    eq_(repo.get_changed_files(), [])
    eq_(repo.get_changed_files(staged=True), [])
    ok_clean_git(path, annex=False)
    unlink(op.join(path, 'test1'))
    eq_(repo.get_missing_files(), ['test1'])
    rmtree(op.join(path, 'deep'))
    eq_(sorted(repo.get_missing_files()), [op.join('deep', 'test2'), 'test1'])
    # nothing is actually known to be deleted
    eq_(repo.get_deleted_files(), [])
    # do proper removal
    repo.remove(op.join(path, 'test1'))
    # no longer missing
    eq_(repo.get_missing_files(), [op.join('deep', 'test2')])
    # but deleted
    eq_(repo.get_deleted_files(), ['test1'])
Exemple #6
0
def test_is_installed(src, path):
    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    AnnexRepo.clone(src, path)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    # We must not be able to create a new repository under a known
    # subdataset path.
    # Note: Unfortunately we would still be able to generate it under
    # subdirectory within submodule, e.g. `subm 1/subdir` but that is
    # not checked here. `rev-create` will provide that protection
    # when create/rev-create merge.
    with assert_raises(PathKnownToRepositoryError):
        subds.create()
    # get the submodule
    # This would init so there is a .git file with symlink info, which is
    # as we agreed is more pain than gain, so let's use our install which would
    # do it right, after all we are checking 'is_installed' ;)
    # from datalad.cmd import Runner
    # Runner().run(['git', 'submodule', 'update', '--init', 'subm 1'], cwd=path)
    with chpwd(path):
        get('subm 1')
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
Exemple #7
0
 def _extract_archive(self, path):
     # we need to extract the archive
     # TODO: extract to _tmp and then move in a single command so we
     # don't end up picking up broken pieces
     lgr.debug(
         u"Extracting {self._archive} under {path}".format(**locals()))
     if exists(path):
         lgr.debug(
             "Previous extracted (but probably not fully) cached archive "
             "found. Removing %s", path)
         rmtree(path)
     os.makedirs(path)
     assert (exists(path))
     # remove old stamp
     if exists(self.stamp_path):
         rmtree(self.stamp_path)
     decompress_file(self._archive, path, leading_directories=None)
     # TODO: must optional since we might to use this content, move it
     # into the tree etc
     # lgr.debug("Adjusting permissions to R/O for the extracted content")
     # rotree(path)
     assert (exists(path))
     # create a stamp
     with open(self.stamp_path, 'wb') as f:
         f.write(ensure_bytes(self._archive))
     # assert that stamp mtime is not older than archive's directory
     assert (self.is_extracted)
Exemple #8
0
def test_install_dataset_from_just_source(src_repo=None, path=None):

    src_ds = Dataset(src_repo).create(result_renderer='disabled', force=True)
    src_ds.save(['INFO.txt', 'test.dat'], to_git=True)
    src_ds.save('test-annex.dat', to_git=False)
    # equivalent repo on github:
    src_url = "https://github.com/datalad/testrepo--basic--r1.git"
    sources = [
        src_ds.path,
        get_local_file_url(src_ds.path, compatibility='git')
    ]
    if not dl_cfg.get('datalad.tests.nonetwork'):
        sources.append(src_url)

    for url in sources:

        with chpwd(path, mkdir=True):
            ds = install(source=url)

        ok_startswith(ds.path, path)
        ok_(ds.is_installed())
        ok_(GitRepo.is_valid_repo(ds.path))
        assert_repo_status(ds.path, annex=None)
        assert_in('INFO.txt', ds.repo.get_indexed_files())

        # cleanup before next iteration
        rmtree(path)
Exemple #9
0
def decompress_file(archive, dir_, leading_directories='strip'):
    """Decompress `archive` into a directory `dir_`

    Parameters
    ----------
    archive: str
    dir_: str
    leading_directories: {'strip', None}
      If `strip`, and archive contains a single leading directory under which
      all content is stored, all the content will be moved one directory up
      and that leading directory will be removed.
    """
    if not exists(dir_):
        lgr.debug("Creating directory %s to extract archive into", dir_)
        os.makedirs(dir_)

    _decompress_file(archive, dir_)

    if leading_directories == 'strip':
        _, dirs, files = next(os.walk(dir_))
        if not len(files) and len(dirs) == 1:
            # move all the content under dirs[0] up 1 level
            widow_dir = opj(dir_, dirs[0])
            lgr.debug("Moving content within %s upstairs", widow_dir)
            subdir, subdirs_, files_ = next(os.walk(opj(dir_, dirs[0])))
            for f in subdirs_ + files_:
                os.rename(opj(subdir, f), opj(dir_, f))
            # NFS might hold it victim so use rmtree so it tries a few times
            rmtree(widow_dir)
    elif leading_directories is None:
        pass  # really do nothing
    else:
        raise NotImplementedError("Not supported %s" % leading_directories)
Exemple #10
0
def test_subsuperdataset_save(path):
    # Verify that when invoked without recursion save does not
    # cause querying of subdatasets of the subdataset
    # see https://github.com/datalad/datalad/issues/4523
    parent = Dataset(path).create()
    # Create 3 levels of subdatasets so later to check operation
    # with or without --dataset being specified
    sub1 = parent.create('sub1')
    sub2 = parent.create(sub1.pathobj / 'sub2')
    sub3 = parent.create(sub2.pathobj / 'sub3')
    assert_repo_status(path)
    # now we will lobotomize that sub2 so git would fail if any query is performed.
    rmtree(str(sub3.pathobj / '.git' / 'objects'))
    # the call should proceed fine since neither should care about sub3
    # default is no recursion
    parent.save('sub1')
    sub1.save('sub2')
    assert_raises(CommandError, parent.save, 'sub1', recursive=True)
    # and should fail if we request saving while in the parent directory
    # but while not providing a dataset, since operation would run within
    # pointed subdataset
    with chpwd(sub1.path):
        assert_raises(CommandError, save, 'sub2')
    # but should not fail in the top level superdataset
    with chpwd(parent.path):
        save('sub1')
Exemple #11
0
def test_optimized_cloning(path):
    # make test repo with one file and one commit
    originpath = op.join(path, 'origin')
    repo = GitRepo(originpath, create=True)
    with open(op.join(originpath, 'test'), 'w') as f:
        f.write('some')
    repo.add('test')
    repo.commit('init')
    ok_clean_git(originpath, annex=False)
    from glob import glob

    def _get_inodes(repo):
        return dict(
            [(os.path.join(*o.split(os.sep)[-2:]),
              os.stat(o).st_ino)
             for o in glob(os.path.join(repo.path,
                                        repo.get_git_dir(repo),
                                        'objects', '*', '*'))])

    origin_inodes = _get_inodes(repo)
    # now clone it in different ways and see what happens to the object storage
    from datalad.support.network import get_local_file_url
    clonepath = op.join(path, 'clone')
    for src in (originpath, get_local_file_url(originpath)):
        # deprecated
        assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath)
        clone = GitRepo.clone(url=src, path=clonepath, create=True)
        clone_inodes = _get_inodes(clone)
        eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src))
        rmtree(clonepath)
Exemple #12
0
def _uninstall_dataset(ds, check, has_super, **kwargs):
    if check and ds.is_installed():
        for r in _drop_files(ds,
                             curdir,
                             check=True,
                             noannex_iserror=False,
                             **kwargs):
            yield r
    # TODO: uninstall of a subdataset that has a local URL
    #       (e.g. ./anything) implies cannot be undone, decide how, and
    #       if to check for that
    # TODO check that the relevant branched are pushed to a remote
    if ds.subdatasets(fulfilled=True):
        yield get_status_dict(
            status='error',
            ds=ds,
            message=
            ('to be uninstalled dataset %s has present subdatasets, forgot --recursive?',
             ds),
            **kwargs)
        return
    # Close any possibly associated process etc with underlying repo.
    # Otherwise - rmtree could fail to remove e.g. under NFS which would
    # still have some files opened by them (thus having .nfs00000xxxx
    # files) forbidding rmdir to work in rmtree
    ds.close()
    if ds.is_installed():
        rmtree(ds.path)
    if has_super and not exists(ds.path):
        # recreate an empty mountpoint to make Git happier
        os.makedirs(ds.path)
    # invalidate loaded ConfigManager:
    ds._cfg = None
    yield get_status_dict(status='ok', ds=ds, **kwargs)
Exemple #13
0
def test_implicit_install(src, dst):

    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.add("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.add("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.add("file3.txt")
    origin_top.save(recursive=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(IncompleteResultsError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    # but by default implicit results are not reported
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(IncompleteResultsError, ds.install, source=opj('sub', 'obscure'))

    # clean up, the nasty way
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        # don't ask for the file content to make return value comparison
        # simpler
        result = get(path=opj("sub", "subsub"), get_data=False, result_xfm='datasets')
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [sub, subsub])
Exemple #14
0
def test_implicit_install(src, dst):

    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.save("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.save("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.save("file3.txt")
    origin_top.save(recursive=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(IncompleteResultsError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    # but by default implicit results are not reported
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(IncompleteResultsError, ds.install, source=opj('sub', 'obscure'))

    # clean up, the nasty way
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        # don't ask for the file content to make return value comparison
        # simpler
        result = get(path=opj("sub", "subsub"), get_data=False, result_xfm='datasets')
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [sub, subsub])
Exemple #15
0
 def teardown(self, exclude_metadata):
     # would make no sense if doesn't work correctly
     # IIRC we cannot provide custom additional depends so cannot import nose
     # assert_repo_status(self.ds.path)
     status = self.ds.status()
     assert all(r['state'] == 'clean' for r in status)
     assert len(status) >= self.nfiles
     rmtree(self.temp)
Exemple #16
0
 def _cleanup(self):
     if not self.remove_paths:
         return  # Nothing TODO
     self.log("Cleaning up %d paths", len(self.remove_paths))
     while self.remove_paths:
         path = self.remove_paths.pop()
         if op.lexists(path):
             rmtree(path)
Exemple #17
0
 def _cleanup(self):
     if not self.remove_paths:
         return  # Nothing TODO
     self.log("Cleaning up %d paths", len(self.remove_paths))
     while self.remove_paths:
         path = self.remove_paths.pop()
         if op.lexists(path):
             rmtree(path)
Exemple #18
0
def test_newthings_coming_down(originpath, destpath):
    origin = GitRepo(originpath, create=True)
    create_tree(originpath, {'load.dat': 'heavy'})
    Dataset(originpath).save('load.dat')
    ds = install(source=originpath,
                 path=destpath,
                 result_xfm='datasets',
                 return_type='item-or-list')
    assert_is_instance(ds.repo, GitRepo)
    assert_in('origin', ds.repo.get_remotes())
    # turn origin into an annex
    origin = AnnexRepo(originpath, create=True)
    # clone doesn't know yet
    assert_false(knows_annex(ds.path))
    # but after an update it should
    # no merge, only one sibling, no parameters should be specific enough
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    assert (knows_annex(ds.path))
    # no branches appeared
    eq_(ds.repo.get_branches(), [DEFAULT_BRANCH])
    # now merge, and get an annex
    assert_result_count(ds.update(merge=True),
                        1,
                        action='update',
                        status='ok',
                        type='dataset')
    assert_in('git-annex', ds.repo.get_branches())
    assert_is_instance(ds.repo, AnnexRepo)
    # should be fully functional
    testfname = opj(ds.path, 'load.dat')
    assert_false(ds.repo.file_has_content(testfname))
    ds.get('.')
    ok_file_has_content(opj(ds.path, 'load.dat'), 'heavy')
    # check that a new tag comes down
    origin.tag('first!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[0], 'first!')

    # and now we destroy the remote annex
    origin.call_git(['config', '--remove-section', 'annex'])
    rmtree(opj(origin.path, '.git', 'annex'), chmod_files=True)
    origin.call_git(['branch', '-D', 'git-annex'])
    origin = GitRepo(originpath)
    assert_false(knows_annex(originpath))

    # and update the local clone
    # for now this should simply not fail (see gh-793), later might be enhanced to a
    # graceful downgrade
    before_branches = ds.repo.get_branches()
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(before_branches, ds.repo.get_branches())
    # annex branch got pruned
    eq_(['origin/HEAD', 'origin/' + DEFAULT_BRANCH],
        ds.repo.get_remote_branches())
    # check that a new tag comes down even if repo types mismatch
    origin.tag('second!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[-1], 'second!')
Exemple #19
0
def _clone_from_any_source(sources, dest):
    # should not be the case, but we need to distinguish between failure
    # of git-clone, due to existing target and an unsuccessful clone
    # attempt. See below.
    existed = dest and exists(dest)
    for source_ in sources:
        try:
            lgr.debug("Retrieving a dataset from URL: "
                      "{0}".format(source_))
            with swallow_logs():
                GitRepo.clone(path=dest, url=source_, create=True)
            return source_  # do not bother with other sources if succeeded
        except GitCommandError as e:
            lgr.debug("Failed to retrieve from URL: "
                      "{0}".format(source_))
            if not existed and dest \
                    and exists(dest):
                lgr.debug("Wiping out unsuccessful clone attempt at "
                          "{}".format(dest))
                rmtree(dest)

            if source_ == sources[-1]:
                # Note: The following block is evaluated whenever we
                # fail even with the last try. Not nice, but currently
                # necessary until we get a more precise exception:
                ####################################
                # TODO: We may want to introduce a --force option to
                # overwrite the target.
                # TODO: Currently assuming if `existed` and there is a
                # GitCommandError means that these both things are connected.
                # Need newer GitPython to get stderr from GitCommandError
                # (already fixed within GitPython.)
                if existed:
                    # rudimentary check for an installed dataset at target:
                    # (TODO: eventually check for being the one, that this
                    # is about)
                    dest_ds = Dataset(dest)
                    if dest_ds.is_installed():
                        lgr.info("{0} appears to be installed already."
                                 "".format(dest_ds))
                        break
                    else:
                        lgr.warning("Target {0} already exists and is not "
                                    "an installed dataset. Skipped."
                                    "".format(dest))
                        # Keep original in debug output:
                        lgr.debug("Original failure:{0}"
                                  "{1}".format(linesep, exc_str(e)))
                        return None
                ##################

                # Re-raise if failed even with the last candidate
                lgr.debug("Unable to establish repository instance at "
                          "{0} from {1}"
                          "".format(dest, sources))
                raise
Exemple #20
0
def test_implicit_install(src, dst):

    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.add("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.add("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.add("file3.txt")
    origin_top.save(auto_add_changes=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(InstallFailedError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(InstallFailedError, ds.install, source=opj('sub', 'obscure'))

    # clean up:
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        result = get(path=opj("sub", "subsub"))
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [subsub])
Exemple #21
0
def _clone_from_any_source(sources, dest):
    # should not be the case, but we need to distinguish between failure
    # of git-clone, due to existing target and an unsuccessful clone
    # attempt. See below.
    existed = dest and exists(dest)
    for source_ in sources:
        try:
            lgr.debug("Retrieving a dataset from URL: " "{0}".format(source_))
            with swallow_logs():
                GitRepo.clone(path=dest, url=source_, create=True)
            return source_  # do not bother with other sources if succeeded
        except GitCommandError as e:
            lgr.debug("Failed to retrieve from URL: " "{0}".format(source_))
            if not existed and dest \
                    and exists(dest):
                lgr.debug("Wiping out unsuccessful clone attempt at "
                          "{}".format(dest))
                rmtree(dest)

            if source_ == sources[-1]:
                # Note: The following block is evaluated whenever we
                # fail even with the last try. Not nice, but currently
                # necessary until we get a more precise exception:
                ####################################
                # TODO: We may want to introduce a --force option to
                # overwrite the target.
                # TODO: Currently assuming if `existed` and there is a
                # GitCommandError means that these both things are connected.
                # Need newer GitPython to get stderr from GitCommandError
                # (already fixed within GitPython.)
                if existed:
                    # rudimentary check for an installed dataset at target:
                    # (TODO: eventually check for being the one, that this
                    # is about)
                    dest_ds = Dataset(dest)
                    if dest_ds.is_installed():
                        lgr.info("{0} appears to be installed already."
                                 "".format(dest_ds))
                        break
                    else:
                        lgr.warning("Target {0} already exists and is not "
                                    "an installed dataset. Skipped."
                                    "".format(dest))
                        # Keep original in debug output:
                        lgr.debug("Original failure:{0}"
                                  "{1}".format(linesep, exc_str(e)))
                        return None
                ##################

                # Re-raise if failed even with the last candidate
                lgr.debug("Unable to establish repository instance at "
                          "{0} from {1}"
                          "".format(dest, sources))
                raise
Exemple #22
0
def _uninstall_dataset(ds, check, has_super, **kwargs):
    cwd = Path.cwd()
    if ds.pathobj == cwd or ds.pathobj in cwd.parents:
        yield get_status_dict(
            status='error',
            ds=ds,
            message='refusing to uninstall a dataset at or above the '
            'current working directory',
            **kwargs)
        return
    if check and ds.is_installed():
        # if the checks are on we need to make sure to exit this function
        # whenever any drop failed, because we cannot rely on the error
        # to actually cause a stop in upstairs code
        bad_things_happened = False
        for r in _drop_files(ds,
                             op.curdir,
                             check=True,
                             noannex_iserror=False,
                             **kwargs):
            yield r
            if r['action'] == 'drop' and \
                    not r.get('status', None) in ('ok', 'notneeded'):
                bad_things_happened = True
        if bad_things_happened:
            # error reporting already happened, we can just stop here
            return

    # TODO: uninstall of a subdataset that has a local URL
    #       (e.g. ./anything) implies cannot be undone, decide how, and
    #       if to check for that
    # TODO check that the relevant branched are pushed to a remote
    if ds.subdatasets(fulfilled=True):
        yield get_status_dict(
            status='error',
            ds=ds,
            message=
            ('to be uninstalled dataset %s has present subdatasets, forgot --recursive?',
             ds),
            **kwargs)
        return
    # Close any possibly associated process etc with underlying repo.
    # Otherwise - rmtree could fail to remove e.g. under NFS which would
    # still have some files opened by them (thus having .nfs00000xxxx
    # files) forbidding rmdir to work in rmtree
    ds.close()
    if ds.is_installed():
        rmtree(ds.path)
    if has_super and not op.exists(ds.path):
        # recreate an empty mountpoint to make Git happier
        os.makedirs(ds.path)
    # invalidate loaded ConfigManager:
    ds._cfg = None
    yield get_status_dict(status='ok', ds=ds, **kwargs)
Exemple #23
0
def test_newthings_coming_down(originpath, destpath):
    origin = GitRepo(originpath, create=True)
    create_tree(originpath, {'load.dat': 'heavy'})
    Dataset(originpath).save('load.dat')
    ds = install(
        source=originpath, path=destpath,
        result_xfm='datasets', return_type='item-or-list')
    assert_is_instance(ds.repo, GitRepo)
    assert_in('origin', ds.repo.get_remotes())
    # turn origin into an annex
    origin = AnnexRepo(originpath, create=True)
    # clone doesn't know yet
    assert_false(knows_annex(ds.path))
    # but after an update it should
    # no merge, only one sibling, no parameters should be specific enough
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    assert(knows_annex(ds.path))
    # no branches appeared
    eq_(ds.repo.get_branches(), ['master'])
    # now merge, and get an annex
    assert_result_count(ds.update(merge=True), 1, status='ok', type='dataset')
    assert_in('git-annex', ds.repo.get_branches())
    assert_is_instance(ds.repo, AnnexRepo)
    # should be fully functional
    testfname = opj(ds.path, 'load.dat')
    assert_false(ds.repo.file_has_content(testfname))
    ds.get('.')
    ok_file_has_content(opj(ds.path, 'load.dat'), 'heavy')
    # check that a new tag comes down
    origin.tag('first!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[0], 'first!')

    # and now we destroy the remote annex
    origin._git_custom_command([], ['git', 'config', '--remove-section', 'annex'])
    rmtree(opj(origin.path, '.git', 'annex'), chmod_files=True)
    origin._git_custom_command([], ['git', 'branch', '-D', 'git-annex'])
    origin = GitRepo(originpath)
    assert_false(knows_annex(originpath))

    # and update the local clone
    # for now this should simply not fail (see gh-793), later might be enhanced to a
    # graceful downgrade
    before_branches = ds.repo.get_branches()
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(before_branches, ds.repo.get_branches())
    # annex branch got pruned
    eq_(['origin/HEAD', 'origin/master'], ds.repo.get_remote_branches())
    # check that a new tag comes down even if repo types mismatch
    origin.tag('second!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[-1], 'second!')
Exemple #24
0
 def setup_cache(self):
     ds_path = create_test_dataset(self.dsname, spec='2/-2/-2', seed=0)[0]
     self.log("Setup cache ds path %s. CWD: %s", ds_path, getpwd())
     # Will store into a tarfile since otherwise install -r is way too slow
     # to be invoked for every benchmark
     # Store full path since apparently setup is not ran in that directory
     self.tarfile = op.realpath(SampleSuperDatasetBenchmarks.tarfile)
     with tarfile.open(self.tarfile, "w") as tar:
         # F.CK -- Python tarfile can't later extract those because key dirs are
         # read-only.  For now just a workaround - make it all writeable
         from datalad.utils import rotree
         rotree(self.dsname, ro=False, chmod_files=False)
         tar.add(self.dsname, recursive=True)
     rmtree(self.dsname)
Exemple #25
0
    def setup_cache(self):
        # creating in CWD so things get removed when ASV is done
        ds_path = create_test_dataset("testds1", spec='2/-2/-2', seed=0)[0]
        # Will store into a tarfile since otherwise install -r is way too slow
        # to be invoked for every benchmark
        tarfile_path = opj(osp.dirname(ds_path), 'testds1.tar')
        with tarfile.open(tarfile_path, "w") as tar:
            # F.CK -- Python tarfile can't later extract those because key dirs are
            # read-only.  For now just a workaround - make it all writeable
            from datalad.utils import rotree
            rotree('testds1', ro=False, chmod_files=False)
            tar.add('testds1', recursive=True)
        rmtree('testds1')

        return tarfile_path
Exemple #26
0
    def setup_cache(self):
        # creating in CWD so things get removed when ASV is done
        ds_path = create_test_dataset("testds1", spec='2/-2/-2', seed=0)[0]
        # Will store into a tarfile since otherwise install -r is way too slow
        # to be invoked for every benchmark
        tarfile_path = opj(osp.dirname(ds_path), 'testds1.tar')
        with tarfile.open(tarfile_path, "w") as tar:
            # F.CK -- Python tarfile can't later extract those because key dirs are
            # read-only.  For now just a workaround - make it all writeable
            from datalad.utils import rotree
            rotree('testds1', ro=False, chmod_files=False)
            tar.add('testds1', recursive=True)
        rmtree('testds1')

        return tarfile_path
Exemple #27
0
def test_recurse_existing(src, path):
    origin_ds = _make_dataset_hierarchy(src)

    # make sure recursion_limit works as expected across a range of depths
    for depth in range(len(origin_ds)):
        res = install(path,
                      source=src,
                      recursive=True,
                      recursion_limit=depth,
                      result_xfm=None,
                      return_type='list',
                      result_filter=None)
        # we expect one dataset per level
        assert_result_count(res, depth + 1, type='dataset', status='ok')
        rmtree(path)

    # now install all but the last two levels, no data
    root, sub1, sub2 = install(path,
                               source=src,
                               recursive=True,
                               recursion_limit=2,
                               result_xfm='datasets',
                               result_filter=None)
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is False)
    sub3 = Dataset(opj(sub2.path, 'sub3'))
    ok_(not sub3.is_installed())
    # now get all content in all existing datasets, no new datasets installed
    # in the process
    files = root.get(curdir, recursive=True, recursion_limit='existing')
    assert_not_in_results(files, type='dataset', status='ok')
    assert_result_count(files, 1, type='file', status='ok')
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not sub3.is_installed())
    # now pull down all remaining datasets, no data
    sub3, sub4 = root.get(curdir,
                          recursive=True,
                          get_data=False,
                          result_xfm='datasets',
                          result_filter=lambda x: x['status'] == 'ok')
    ok_(sub4.is_installed())
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is False)
    # aaannd all data
    files = root.get(
        curdir,
        recursive=True,
        result_filter=lambda x: x['status'] == 'ok' and x['type'] == 'file')
    eq_(len(files), 1)
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is True)
Exemple #28
0
def test_remove_subds(path):
    ds = create(path)
    ds.create('sub')
    ds.create(op.join('sub', 'subsub'))
    assert_repo_status(ds.path)
    assert_result_count(ds.subdatasets(), 1, path=op.join(ds.path, 'sub'))
    # all good at this point, subdataset known, dataset clean
    # now have some external force wipe out the subdatasets
    rmtree(op.join(ds.path, 'sub'))
    assert_result_count(ds.status(),
                        1,
                        path=op.join(ds.path, 'sub'),
                        state='deleted')
    # a single call to save() must fix up the mess
    assert_status('ok', ds.save())
    assert_repo_status(ds.path)
Exemple #29
0
def _uninstall_dataset(ds, check, has_super):
    results = []
    if check and ds.is_installed():
        results.extend(_drop_files(ds, curdir, check=True))
    # TODO: uninstall of a subdataset that has a local URL
    #       (e.g. ./anything) implies cannot be undone, decide how, and
    #       if to check for that
    # TODO check that the relevant branched are pushed to a remote
    if ds.get_subdatasets(fulfilled=True):
        raise ValueError('to be uninstalled dataset has present subdatasets, forgot --recursive?')
    if ds.is_installed():
        rmtree(ds.path)
    if has_super and not exists(ds.path):
        # recreate an empty mountpoint to make Git happier
        os.makedirs(ds.path)
    results.append(ds)
    return results
Exemple #30
0
 def setup_cache(self):
     ds_path = create_test_dataset(
         self.dsname
         , spec='2/-2/-2'
         , seed=0
     )[0]
     self.log("Setup cache ds path %s. CWD: %s", ds_path, getpwd())
     # Will store into a tarfile since otherwise install -r is way too slow
     # to be invoked for every benchmark
     # Store full path since apparently setup is not ran in that directory
     self.tarfile = op.realpath(SampleSuperDatasetBenchmarks.tarfile)
     with tarfile.open(self.tarfile, "w") as tar:
         # F.CK -- Python tarfile can't later extract those because key dirs are
         # read-only.  For now just a workaround - make it all writeable
         from datalad.utils import rotree
         rotree(self.dsname, ro=False, chmod_files=False)
         tar.add(self.dsname, recursive=True)
     rmtree(self.dsname)
Exemple #31
0
def test_remove_subds(path):
    ds = create(path)
    ds.create('sub')
    ds.create(op.join('sub', 'subsub'))
    assert_repo_status(ds.path)
    assert_result_count(
        ds.subdatasets(), 1,
        path=op.join(ds.path, 'sub'))
    # all good at this point, subdataset known, dataset clean
    # now have some external force wipe out the subdatasets
    rmtree(op.join(ds.path, 'sub'))
    assert_result_count(
        ds.status(), 1,
        path=op.join(ds.path, 'sub'),
        state='deleted')
    # a single call to save() must fix up the mess
    assert_status('ok', ds.save())
    assert_repo_status(ds.path)
Exemple #32
0
def _uninstall_dataset(ds, check, has_super, **kwargs):
    if check and ds.is_installed():
        # if the checks are on we need to make sure to exit this function
        # whenever any drop failed, because we cannot rely on the error
        # to actually cause a stop in upstairs code
        bad_things_happened = False
        for r in _drop_files(
                ds, curdir, check=True, noannex_iserror=False, **kwargs):
            yield r
            if r['action'] == 'drop' and \
                    not r.get('status', None) in ('ok', 'notneeded'):
                bad_things_happened = True
        if bad_things_happened:
            # error reporting already happened, we can just stop here
            return

    # we want to use the bound dataset method
    from datalad.distribution.subdatasets import Subdatasets
    # TODO: uninstall of a subdataset that has a local URL
    #       (e.g. ./anything) implies cannot be undone, decide how, and
    #       if to check for that
    # TODO check that the relevant branched are pushed to a remote
    if ds.subdatasets(fulfilled=True):
        yield get_status_dict(
            status='error',
            ds=ds,
            message=(
                'to be uninstalled dataset %s has present subdatasets, forgot --recursive?',
                ds),
            **kwargs)
        return
    # Close any possibly associated process etc with underlying repo.
    # Otherwise - rmtree could fail to remove e.g. under NFS which would
    # still have some files opened by them (thus having .nfs00000xxxx
    # files) forbidding rmdir to work in rmtree
    ds.close()
    if ds.is_installed():
        rmtree(ds.path)
    if has_super and not exists(ds.path):
        # recreate an empty mountpoint to make Git happier
        os.makedirs(ds.path)
    # invalidate loaded ConfigManager:
    ds._cfg = None
    yield get_status_dict(status='ok', ds=ds, **kwargs)
Exemple #33
0
def test_nested_create(path):
    # to document some more organic usage pattern
    ds = Dataset(path).create()
    ok_clean_git(ds.path)
    lvl2relpath = opj('lvl1', 'lvl2')
    lvl2path = opj(ds.path, lvl2relpath)
    os.makedirs(lvl2path)
    os.makedirs(opj(ds.path, 'lvl1', 'empty'))
    with open(opj(lvl2path, 'file'), 'w') as f:
        f.write('some')
    ok_(ds.save(auto_add_changes=True))
    # later create subdataset in a fresh dir
    subds1 = ds.create(opj('lvl1', 'subds'))
    ok_clean_git(ds.path)
    eq_(ds.get_subdatasets(), [opj('lvl1', 'subds')])
    # later create subdataset in an existing empty dir
    subds2 = ds.create(opj('lvl1', 'empty'))
    ok_clean_git(ds.path)
    # later try to wrap existing content into a new subdataset
    # but that won't work
    assert_raises(ValueError, ds.create, lvl2relpath)
    # even with force, as to do this properly complicated surgery would need to
    # take place
    assert_raises(CommandError, ds.create, lvl2relpath, force=True)
    # only way to make it work is to unannex the content upfront
    ds.repo._run_annex_command('unannex',
                               annex_options=[opj(lvl2relpath, 'file')])
    # nothing to save, git-annex commits the unannex itself
    ok_(not ds.save())
    # still nothing without force
    # "err='lvl1/lvl2' already exists in the index"
    assert_raises(ValueError, ds.create, lvl2relpath)
    # XXX even force doesn't help, because (I assume) GitPython doesn't update
    # its representation of the Git index properly
    assert_raises(CommandError, ds.create, lvl2relpath, force=True)
    # it is not GitPython's state that is at fault here, test with fresh
    # dataset isnstance
    ds = Dataset(ds.path)
    assert_raises(CommandError, ds.create, lvl2relpath, force=True)
    # it seems we are at fault here
    rmtree(opj(lvl2path, '.git'))
    assert_raises(CommandError, ds.repo.add_submodule, lvl2relpath)
    # despite the failure:
    assert_in(lvl2relpath, ds.get_subdatasets())
Exemple #34
0
def test_is_installed(src, path):
    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    AnnexRepo.clone(src, path)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    # We must not be able to create a new repository under a known
    # subdataset path.
    # Note: Unfortunately we would still be able to generate it under
    # subdirectory within submodule, e.g. `subm 1/subdir` but that is
    # not checked here. `rev-create` will provide that protection
    # when create/rev-create merge.
    res = subds.rev_create(on_failure='ignore',
                           return_type='list',
                           result_filter=None,
                           result_xfm=None)
    assert_result_count(res, 1)
    assert_result_count(
        res,
        1,
        status='error',
        path=subds.path,
        message=('collision with content in parent dataset at %s: %s', ds.path,
                 [subds.path]))
    # get the submodule
    # This would init so there is a .git file with symlink info, which is
    # as we agreed is more pain than gain, so let's use our install which would
    # do it right, after all we are checking 'is_installed' ;)
    # from datalad.cmd import Runner
    # Runner().run(['git', 'submodule', 'update', '--init', 'subm 1'], cwd=path)
    with chpwd(path):
        get('subm 1')
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
Exemple #35
0
def test_is_installed(src=None, path=None):
    ca = dict(result_renderer='disabled')
    # a remote dataset with a subdataset underneath
    origds = Dataset(src).create(**ca)
    _ = origds.create('subm 1', **ca)

    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    clone(src, path, **ca)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(ds.pathobj / 'subm 1')
    assert_false(subds.is_installed())
    # We must not be able to create a new repository under a known
    # subdataset path.
    # Note: Unfortunately we would still be able to generate it under
    # subdirectory within submodule, e.g. `subm 1/subdir` but that is
    # not checked here. `create` provides that protection though.
    res = subds.create(on_failure='ignore',
                       return_type='list',
                       result_filter=None,
                       result_xfm=None,
                       **ca)
    assert_result_count(res, 1)
    assert_result_count(res,
                        1,
                        status='error',
                        path=subds.path,
                        message=('collision with %s (dataset) in dataset %s',
                                 subds.path, ds.path))
    # get the submodule
    with chpwd(ds.path):
        get('subm 1', **ca)
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
Exemple #36
0
def test_get_missing(path):
    repo = GitRepo(path, create=True)
    os.makedirs(op.join(path, 'deep'))
    with open(op.join(path, 'test1'), 'w') as f:
        f.write('some')
    with open(op.join(path, 'deep', 'test2'), 'w') as f:
        f.write('some more')
    repo.add('.')
    repo.commit()
    ok_clean_git(path, annex=False)
    os.unlink(op.join(path, 'test1'))
    eq_(repo.get_missing_files(), ['test1'])
    rmtree(op.join(path, 'deep'))
    eq_(sorted(repo.get_missing_files()), [op.join('deep', 'test2'), 'test1'])
    # nothing is actually known to be deleted
    eq_(repo.get_deleted_files(), [])
    # do proper removal
    repo.remove(op.join(path, 'test1'))
    # no longer missing
    eq_(repo.get_missing_files(), [op.join('deep', 'test2')])
    # but deleted
    eq_(repo.get_deleted_files(), ['test1'])
Exemple #37
0
def test_is_installed(src, path):
    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    AnnexRepo.clone(src, path)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    subds.create()
    # get the submodule
    # This would init so there is a .git file with symlink info, which is
    # as we agreed is more pain than gain, so let's use our install which would
    # do it right, after all we are checking 'is_installed' ;)
    # from datalad.cmd import Runner
    # Runner().run(['git', 'submodule', 'update', '--init', 'subm 1'], cwd=path)
    with chpwd(path):
        get('subm 1')
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
Exemple #38
0
def test_is_installed(src, path):
    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    AnnexRepo.clone(src, path)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    subds.create()
    # get the submodule
    # This would init so there is a .git file with symlink info, which is
    # as we agreed is more pain than gain, so let's use our install which would
    # do it right, after all we are checking 'is_installed' ;)
    # from datalad.cmd import Runner
    # Runner().run(['git', 'submodule', 'update', '--init', 'subm 1'], cwd=path)
    with chpwd(path):
        get('subm 1')
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
Exemple #39
0
def test_recurse_existing(src, path):
    origin_ds = _make_dataset_hierarchy(src)

    # make sure recursion_limit works as expected across a range of depths
    for depth in range(len(origin_ds)):
        res = install(
            path, source=src, recursive=True, recursion_limit=depth,
            result_xfm=None, return_type='list', result_filter=None)
        # we expect one dataset per level
        assert_result_count(
            res, depth + 1, type='dataset', status='ok')
        rmtree(path)

    # now install all but the last two levels, no data
    root, sub1, sub2 = install(
        path, source=src, recursive=True, recursion_limit=2,
        result_xfm='datasets', result_filter=None)
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is False)
    sub3 = Dataset(opj(sub2.path, 'sub3'))
    ok_(not sub3.is_installed())
    # now get all content in all existing datasets, no new datasets installed
    # in the process
    files = root.get(curdir, recursive=True, recursion_limit='existing')
    assert_not_in_results(files, type='dataset', status='ok')
    assert_result_count(files, 1, type='file', status='ok')
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not sub3.is_installed())
    # now pull down all remaining datasets, no data
    sub3, sub4 = root.get(
        curdir, recursive=True, get_data=False,
        result_xfm='datasets', result_filter=lambda x: x['status'] == 'ok')
    ok_(sub4.is_installed())
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is False)
    # aaannd all data
    files = root.get(curdir, recursive=True, result_filter=lambda x: x['status'] == 'ok' and x['type'] == 'file')
    eq_(len(files), 1)
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is True)
Exemple #40
0
def _kill_dataset(ds):
    """This is a harsh internal helper: it will wipe out a dataset, no checks
    """
    # figure out whether we should be nice to a superdataset later on
    has_super = ds.get_superdataset(topmost=False, registered_only=True)
    # Close any possibly associated process etc with underlying repo.
    # Otherwise - rmtree could fail to remove e.g. under NFS which would
    # still have some files opened by them (thus having .nfs00000xxxx
    # files) forbidding rmdir to work in rmtree
    ds.close()
    rmtree(ds.path)
    # invalidate loaded ConfigManager -- datasets are singletons!!
    ds._cfg = None
    if has_super:
        # recreate an empty mountpoint to make Git happier
        ds.pathobj.mkdir(exist_ok=True)
    yield dict(
        # keep uninstall to please the gods of a distant past
        #action='drop',
        action='uninstall',
        path=ds.path,
        type='dataset',
        status='ok',
    )
Exemple #41
0
def test_no_blows(cookiesdir):
    cookies = CookiesDB(op.join(cookiesdir, 'mycookies'))
    # set the cookie
    cookies['best'] = 'mine'
    assert_equal(cookies['best'], 'mine')
    """
    Somehow this manages to trigger on conda but not on debian for me
    File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/shelve.py", line 125, in __setitem__
        self.dict[key.encode(self.keyencoding)] = f.getvalue()
    File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/dbm/dumb.py", line 216, in __setitem__
        self._index[key] = self._setval(pos, val)
    File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/dbm/dumb.py", line 178, in _setval
        with _io.open(self._datfile, 'rb+') as f:
        FileNotFoundError: [Errno 2] No such file or directory: '/home/yoh/.tmp/datalad_temp_test_no_blowsalnsw_wk/mycookies.dat'

    on Debian (python 3.7.3~rc1-1) I just get a warning: BDB3028 /home/yoh/.tmp/datalad_temp_test_no_blows58tdg67s/mycookies.db: unable to flush: No such file or directory
    """
    try:
        rmtree(cookiesdir)
    except OSError:
        # on NFS directory might still be open, so .nfs* lock file would prevent
        # removal, but it shouldn't matter and .close should succeed
        pass
    cookies.close()
Exemple #42
0
def postclonecfg_annexdataset(ds, reckless, description=None):
    """If ds "knows annex" -- annex init it, set into reckless etc

    Provides additional tune up to a possibly an annex repo, e.g.
    "enables" reckless mode, sets up description
    """
    # in any case check whether we need to annex-init the installed thing:
    if not knows_annex(ds.path):
        # not for us
        return

    # init annex when traces of a remote annex can be detected
    if reckless == 'auto':
        lgr.debug(
            "Instruct annex to hardlink content in %s from local "
            "sources, if possible (reckless)", ds.path)
        ds.config.set(
            'annex.hardlink', 'true', where='local', reload=True)

    lgr.debug("Initializing annex repo at %s", ds.path)
    # Note, that we cannot enforce annex-init via AnnexRepo().
    # If such an instance already exists, its __init__ will not be executed.
    # Therefore do quick test once we have an object and decide whether to call
    # its _init().
    #
    # Additionally, call init if we need to add a description (see #1403),
    # since AnnexRepo.__init__ can only do it with create=True
    repo = AnnexRepo(ds.path, init=True)
    if not repo.is_initialized() or description:
        repo._init(description=description)
    if reckless == 'auto' or (reckless and reckless.startswith('shared-')):
        repo.call_annex(['untrust', 'here'])

    elif reckless == 'ephemeral':
        # with ephemeral we declare 'here' as 'dead' right away, whenever
        # we symlink origin's annex, since availability from 'here' should
        # not be propagated for an ephemeral clone when we publish back to
        # origin.
        # This will cause stuff like this for a locally present annexed file:
        # % git annex whereis d1
        # whereis d1 (0 copies) failed
        # BUT this works:
        # % git annex find . --not --in here
        # % git annex find . --in here
        # d1

        # we don't want annex copy-to origin
        ds.config.set(
            'remote.origin.annex-ignore', 'true',
            where='local')

        ds.repo.set_remote_dead('here')

        if check_symlink_capability(ds.repo.dot_git / 'dl_link_test',
                                    ds.repo.dot_git / 'dl_target_test'):
            # symlink the annex to avoid needless copies in an ephemeral clone
            annex_dir = ds.repo.dot_git / 'annex'
            origin_annex_url = ds.config.get("remote.origin.url", None)
            origin_git_path = None
            if origin_annex_url:
                try:
                    # Deal with file:// scheme URLs as well as plain paths.
                    # If origin isn't local, we have nothing to do.
                    origin_git_path = Path(RI(origin_annex_url).localpath)

                    # we are local; check for a bare repo first to not mess w/
                    # the path
                    if GitRepo(origin_git_path, create=False).bare:
                        # origin is a bare repo -> use path as is
                        pass
                    elif origin_git_path.name != '.git':
                        origin_git_path /= '.git'
                except ValueError:
                    # Note, that accessing localpath on a non-local RI throws
                    # ValueError rather than resulting in an AttributeError.
                    # TODO: Warning level okay or is info level sufficient?
                    # Note, that setting annex-dead is independent of
                    # symlinking .git/annex. It might still make sense to
                    # have an ephemeral clone that doesn't propagate its avail.
                    # info. Therefore don't fail altogether.
                    lgr.warning("reckless=ephemeral mode: origin doesn't seem "
                                "local: %s\nno symlinks being used",
                                origin_annex_url)
            if origin_git_path:
                # TODO make sure that we do not delete any unique data
                rmtree(str(annex_dir)) \
                    if not annex_dir.is_symlink() else annex_dir.unlink()
                annex_dir.symlink_to(origin_git_path / 'annex',
                                     target_is_directory=True)
        else:
            # TODO: What level? + note, that annex-dead is independ
            lgr.warning("reckless=ephemeral mode: Unable to create symlinks on "
                        "this file system.")

    srs = {True: [], False: []}  # special remotes by "autoenable" key
    remote_uuids = None  # might be necessary to discover known UUIDs

    repo_config = repo.config
    # Note: The purpose of this function is to inform the user. So if something
    # looks misconfigured, we'll warn and move on to the next item.
    for uuid, config in repo.get_special_remotes().items():
        sr_name = config.get('name', None)
        if sr_name is None:
            lgr.warning(
                'Ignoring special remote %s because it does not have a name. '
                'Known information: %s',
                uuid, config)
            continue
        sr_autoenable = config.get('autoenable', False)
        try:
            sr_autoenable = ensure_bool(sr_autoenable)
        except ValueError:
            lgr.warning(
                'Failed to process "autoenable" value %r for sibling %s in '
                'dataset %s as bool.'
                'You might need to enable it later manually and/or fix it up to'
                ' avoid this message in the future.',
                sr_autoenable, sr_name, ds.path)
            continue

        # If it looks like a type=git special remote, make sure we have up to
        # date information. See gh-2897.
        if sr_autoenable and repo_config.get("remote.{}.fetch".format(sr_name)):
            try:
                repo.fetch(remote=sr_name)
            except CommandError as exc:
                lgr.warning("Failed to fetch type=git special remote %s: %s",
                            sr_name, exc_str(exc))

        # determine whether there is a registered remote with matching UUID
        if uuid:
            if remote_uuids is None:
                remote_uuids = {
                    # Check annex-config-uuid first. For sameas annex remotes,
                    # this will point to the UUID for the configuration (i.e.
                    # the key returned by get_special_remotes) rather than the
                    # shared UUID.
                    (repo_config.get('remote.%s.annex-config-uuid' % r) or
                     repo_config.get('remote.%s.annex-uuid' % r))
                    for r in repo.get_remotes()
                }
            if uuid not in remote_uuids:
                srs[sr_autoenable].append(sr_name)

    if srs[True]:
        lgr.debug(
            "configuration for %s %s added because of autoenable,"
            " but no UUIDs for them yet known for dataset %s",
            # since we are only at debug level, we could call things their
            # proper names
            single_or_plural("special remote",
                             "special remotes", len(srs[True]), True),
            ", ".join(srs[True]),
            ds.path
        )

    if srs[False]:
        # if has no auto-enable special remotes
        lgr.info(
            'access to %s %s not auto-enabled, enable with:\n'
            '\t\tdatalad siblings -d "%s" enable -s %s',
            # but since humans might read it, we better confuse them with our
            # own terms!
            single_or_plural("dataset sibling",
                             "dataset siblings", len(srs[False]), True),
            ", ".join(srs[False]),
            ds.path,
            srs[False][0] if len(srs[False]) == 1 else "SIBLING",
        )

    # we have just cloned the repo, so it has 'origin', configure any
    # reachable origin of origins
    yield from configure_origins(ds, ds)
Exemple #43
0
def clone_dataset(
        srcs,
        destds,
        reckless=None,
        description=None,
        result_props=None,
        cfg=None):
    """Internal helper to perform cloning without sanity checks (assumed done)

    This helper does not handle any saving of subdataset modification or adding
    in a superdataset.

    Parameters
    ----------
    srcs : list
      Any suitable clone source specifications (paths, URLs)
    destds : Dataset
      Dataset instance for the clone destination
    reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional
      Mode switch to put cloned dataset into unsafe/throw-away configurations, i.e.
      sacrifice data safety for performance or resource footprint. When None
      and `cfg` is specified, use the value of `datalad.clone.reckless`.
    description : str, optional
      Location description for the annex of the dataset clone (if there is any).
    result_props : dict, optional
      Default properties for any yielded result, passed on to get_status_dict().
    cfg : ConfigManager, optional
      Configuration for parent dataset. This will be queried instead
      of the global DataLad configuration.

    Yields
    ------
    dict
      DataLad result records
    """
    if not result_props:
        # in case the caller had no specific idea on how results should look
        # like, provide sensible defaults
        result_props = dict(
            action='install',
            logger=lgr,
            ds=destds,
        )

    if reckless is None and cfg:
        # if reckless is not explicitly given, but we operate on a
        # superdataset, query whether it has been instructed to operate
        # in a reckless mode, and inherit it for the coming clone
        reckless = cfg.get('datalad.clone.reckless', None)

    dest_path = destds.pathobj

    # decode all source candidate specifications
    candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs]

    # now expand the candidate sources with additional variants of the decoded
    # giturl, while duplicating the other properties in the additional records
    # for simplicity. The hope is to overcome a few corner cases and be more
    # robust than git clone
    candidate_sources = [
        dict(props, giturl=s) for props in candidate_sources
        for s in _get_flexible_source_candidates(props['giturl'])
    ]

    # important test! based on this `rmtree` will happen below after failed clone
    dest_path_existed = dest_path.exists()
    if dest_path_existed and any(dest_path.iterdir()):
        if destds.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destds)
            try:
                # this will get us track_url in system native path conventions,
                # whenever it is a path (and not a URL)
                # this is needed to match it to any potentially incoming local
                # source path in the 'notneeded' test below
                track_path = str(Path(track_url))
            except Exception:
                # this should never happen, because Path() will let any non-path stringification
                # pass through unmodified, but we do not want any potential crash due to
                # pathlib behavior changes
                lgr.debug("Unexpected behavior of pathlib!")
                track_path = None
            for cand in candidate_sources:
                src = cand['giturl']
                if track_url == src \
                        or (not is_url(track_url)
                            and get_local_file_url(track_url, compatibility='git') == src) \
                        or track_path == expanduser(src):
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destds,
                                 src),
                        **result_props)
                    return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to clone into target path',
            **result_props)
        return

    log_progress(
        lgr.info,
        'cloneds',
        'Cloning dataset to %s', destds,
        total=len(candidate_sources),
        label='Clone attempt',
        unit=' Candidate locations',
    )
    error_msgs = OrderedDict()  # accumulate all error messages formatted per each url
    for cand in candidate_sources:
        log_progress(
            lgr.info,
            'cloneds',
            'Attempting to clone from %s to %s', cand['giturl'], dest_path,
            update=1,
            increment=True)

        clone_opts = {}

        if cand.get('version', None):
            clone_opts['branch'] = cand['version']
        try:
            # TODO for now GitRepo.clone() cannot handle Path instances, and PY35
            # doesn't make it happen seemlessly
            GitRepo.clone(
                path=str(dest_path),
                url=cand['giturl'],
                clone_options=clone_opts,
                create=True)

        except CommandError as e:
            e_stderr = e.stderr

            error_msgs[cand['giturl']] = e
            lgr.debug("Failed to clone from URL: %s (%s)",
                      cand['giturl'], exc_str(e))
            if dest_path.exists():
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                # TODO stringification can be removed once patlib compatible
                # or if PY35 is no longer supported
                rmtree(str(dest_path), children_only=dest_path_existed)

            if e_stderr and 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                # cancel progress bar
                log_progress(
                    lgr.info,
                    'cloneds',
                    'Completed clone attempts for %s', destds
                )
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1).strip()
                    if re_match else "stderr: " + e_stderr,
                    **result_props)
                return
            # next candidate
            continue

        result_props['source'] = cand
        # do not bother with other sources if succeeded
        break

    log_progress(
        lgr.info,
        'cloneds',
        'Completed clone attempts for %s', destds
    )

    if not destds.is_installed():
        if len(error_msgs):
            if all(not e.stdout and not e.stderr for e in error_msgs.values()):
                # there is nothing we can learn from the actual exception,
                # the exit code is uninformative, the command is predictable
                error_msg = "Failed to clone from all attempted sources: %s"
                error_args = list(error_msgs.keys())
            else:
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were:\n- %s"
                error_args = '\n- '.join(
                    '{}\n  {}'.format(url, exc_str(exc))
                    for url, exc in error_msgs.items()
                )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            #      be a case when this might happen and original error would
            #      not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'succesful' source was: %s"
            error_args = (destds.path, cand['giturl'])
        yield get_status_dict(
            status='error',
            message=(error_msg, error_args),
            **result_props)
        return

    if not cand.get("version"):
        postclone_check_head(destds)

    # act on --reckless=shared-...
    # must happen prior git-annex-init, where we can cheaply alter the repo
    # setup through safe re-init'ing
    if reckless and reckless.startswith('shared-'):
        lgr.debug('Reinit %s to enable shared access permissions', destds)
        destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])])

    yield from postclonecfg_annexdataset(
        destds,
        reckless,
        description)

    # perform any post-processing that needs to know details of the clone
    # source
    if result_props['source']['type'] == 'ria':
        yield from postclonecfg_ria(destds, result_props['source'])

    if reckless:
        # store the reckless setting in the dataset to make it
        # known to later clones of subdatasets via get()
        destds.config.set(
            'datalad.clone.reckless', reckless,
            where='local',
            reload=True)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **result_props)
Exemple #44
0
    def __call__(
            source,
            path=None,
            dataset=None,
            description=None,
            reckless=False,
            alt_sources=None):
            # TODO next ones should be there, but cannot go anywhere
            # git_opts=None,
            # git_clone_opts=None,
            # annex_opts=None,
            # annex_init_opts=None

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        dataset = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = dataset.path if dataset else None

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `add`".format(
                    path))

        if path is not None:
            path = resolve_path(path, dataset)

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source_url = source
        source_ = _get_git_url_from_source(source)
        lgr.debug("Resolved clone source from '%s' to '%s'",
                  source, source_)
        source = source_

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        destination_dataset = Dataset(path)
        dest_path = path

        status_kwargs = dict(
            action='install', ds=destination_dataset, logger=lgr,
            refds=refds_path, source_url=source_url)

        # important test! based on this `rmtree` will happen below after failed clone
        if exists(dest_path) and listdir(dest_path):
            if destination_dataset.is_installed():
                # check if dest was cloned from the given source before
                # this is where we would have installed this from
                guessed_sources = _get_flexible_source_candidates(
                    source, dest_path)
                # this is where it was actually installed from
                track_name, track_url = _get_tracking_source(destination_dataset)
                if track_url in guessed_sources or \
                        get_local_file_url(track_url) in guessed_sources:
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destination_dataset,
                                 source),
                        **status_kwargs)
                    return
            # anything else is an error
            yield get_status_dict(
                status='error',
                message='target path already exists and not empty, refuse to clone into target path',
                **status_kwargs)
            return

        if dataset is not None and relpath(path, start=dataset.path).startswith(pardir):
            yield get_status_dict(
                status='error',
                message=("clone target path '%s' not in specified target dataset '%s'",
                         path, dataset),
                **status_kwargs)
            return

        # generate candidate URLs from source argument to overcome a few corner cases
        # and hopefully be more robust than git clone
        candidate_sources = []
        # combine all given sources (incl. alternatives), maintain order
        for s in [source] + assure_list(alt_sources):
            candidate_sources.extend(_get_flexible_source_candidates(s))
        candidates_str = \
            " [%d other candidates]" % (len(candidate_sources) - 1) \
            if len(candidate_sources) > 1 \
            else ''
        lgr.info("Cloning %s%s into '%s'",
                 source, candidates_str, dest_path)
        dest_path_existed = exists(dest_path)
        error_msgs = OrderedDict()  # accumulate all error messages formatted per each url
        for isource_, source_ in enumerate(candidate_sources):
            try:
                lgr.debug("Attempting to clone %s (%d out of %d candidates) to '%s'",
                          source_, isource_ + 1, len(candidate_sources), dest_path)
                GitRepo.clone(path=dest_path, url=source_, create=True)
                break  # do not bother with other sources if succeeded
            except GitCommandError as e:
                error_msgs[source_] = exc_str_ = exc_str(e)
                lgr.debug("Failed to clone from URL: %s (%s)",
                          source_, exc_str_)
                if exists(dest_path):
                    lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                              dest_path)
                    # We must not just rmtree since it might be curdir etc
                    # we should remove all files/directories under it
                    rmtree(dest_path, children_only=dest_path_existed)
                # Whenever progress reporting is enabled, as it is now,
                # we end up without e.stderr since it is "processed" out by
                # GitPython/our progress handler.
                e_stderr = e.stderr
                from datalad.support.gitrepo import GitPythonProgressBar
                if not e_stderr and GitPythonProgressBar._last_error_lines:
                    e_stderr = os.linesep.join(GitPythonProgressBar._last_error_lines)
                if 'could not create work tree' in e_stderr.lower():
                    # this cannot be fixed by trying another URL
                    re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                        flags=re.MULTILINE | re.DOTALL)
                    yield get_status_dict(
                        status='error',
                        message=re_match.group(1) if re_match else "stderr: " + e_stderr,
                        **status_kwargs)
                    return

        if not destination_dataset.is_installed():
            if len(error_msgs):
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were: %s"
                error_args = (error_msgs, )
            else:
                # yoh: Not sure if we ever get here but I felt that there could
                #      be a case when this might happen and original error would
                #      not be sufficient to troubleshoot what is going on.
                error_msg = "Awkward error -- we failed to clone properly. " \
                            "Although no errors were encountered, target " \
                            "dataset at %s seems to be not fully installed. " \
                            "The 'succesful' source was: %s"
                error_args = (destination_dataset.path, source_)
            yield get_status_dict(
                status='error',
                message=(error_msg, error_args),
                **status_kwargs)
            return

        if dataset is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.save(
                    dest_path,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r

        _handle_possible_annex_dataset(
            destination_dataset,
            reckless,
            description=description)

        # yield successful clone of the base dataset now, as any possible
        # subdataset clone down below will not alter the Git-state of the
        # parent
        yield get_status_dict(status='ok', **status_kwargs)
Exemple #45
0
 def teardown(self, tarfile_path):
     for path in [self.ds.path + '_', self.ds.path]:
         print("Cleaning up %s" % path)
         if osp.exists(path):
             rmtree(path)
Exemple #46
0
 def teardown(self):
     if osp.exists(self.path):
         rmtree(self.path)