def test_cached_url(cache_dir):

    # patch DATALAD_TESTS_CACHE to not use the actual cache with
    # the test testing that very cache.
    cache_dir = Path(cache_dir)

    ds_url = "https://github.com/datalad/testrepo--minimalds"
    name_in_cache = url2filename(ds_url)
    annexed_file = Path("inannex") / "animated.gif"
    annexed_file_key = "MD5E-s144625--4c458c62b7ac8ec8e19c8ff14b2e34ad.gif"

    with patch(CACHE_PATCH_STR, new=cache_dir):

        @cached_url(url=ds_url)
        def decorated_test1(url):
            # we expect a file-scheme url to a cached version of `ds_url`
            expect_origin_path = cache_dir / name_in_cache
            assert_equal(expect_origin_path.as_uri(),
                         url)
            origin = Dataset(expect_origin_path)
            assert_true(origin.is_installed())
            assert_false(origin.repo.file_has_content(str(annexed_file)))

        decorated_test1()

        @cached_url(url=ds_url, keys=annexed_file_key)
        def decorated_test2(url):
            # we expect a file-scheme url to a "different" cached version of
            # `ds_url`
            expect_origin_path = cache_dir / name_in_cache
            assert_equal(expect_origin_path.as_uri(),
                         url)
            origin = Dataset(expect_origin_path)
            assert_true(origin.is_installed())
            assert_true(origin.repo.file_has_content(str(annexed_file)))

        decorated_test2()

    # disable caching. Note, that in reality DATALAD_TESTS_CACHE is determined
    # on import time of datalad.tests.fixtures based on the config
    # "datalad.tests.cache". We patch the result here, not the config itself.
    with patch(CACHE_PATCH_STR, new=None):

        @cached_url(url=ds_url)
        def decorated_test3(url):
            # we expect the original url, since caching is disabled
            assert_equal(url, ds_url)

        decorated_test3()
def test_get_cached_dataset(cache_dir):

    # patch DATALAD_TESTS_CACHE to not use the actual cache with
    # the test testing that very cache.
    cache_dir = Path(cache_dir)

    # store file-based values for testrepo-minimalds for readability:
    annexed_file = opj('inannex', 'animated.gif')
    annexed_file_key = "MD5E-s144625--4c458c62b7ac8ec8e19c8ff14b2e34ad.gif"

    with patch(CACHE_PATCH_STR, new=cache_dir):

        # tuples to test (url, version, keys, class):
        test_cases = [

            # a simple testrepo
            ("https://github.com/datalad/testrepo--minimalds",
             "541cf855d13c2a338ff2803d4488daf0035e568f", None, AnnexRepo),
            # Same repo, but request paths to be present. This should work
            # with a subsequent call, although the first one did not already
            # request any:
            ("https://github.com/datalad/testrepo--minimalds",
             "9dd8b56cc706ab56185f2ceb75fbe9de9b606724", annexed_file_key,
             AnnexRepo),
            # Same repo again, but invalid version
            (
                "https://github.com/datalad/testrepo--minimalds",
                "nonexistent",
                "irrelevantkey",  # invalid version; don't even try to get the key
                AnnexRepo),
            # same thing with different name should be treated as a new thing:
            ("https://github.com/datalad/testrepo--minimalds", "git-annex",
             None, AnnexRepo),
            # try a plain git repo to make sure we can deal with that:
            # Note, that we first need a test case w/o a `key` parameter to not
            # blow up the test when Clone is patched, resulting in a MagicMock
            # instead of a Dataset instance within get_cached_dataset. In the
            # second case it's already cached then, so the patched Clone is
            # never executed.
            ("https://github.com/datalad/datalad.org", None, None, GitRepo),
            (
                "https://github.com/datalad/datalad.org",
                "gh-pages",
                "ignored-key",  # it's a git repo; don't even try to get a key
                GitRepo),
        ]
        for url, version, keys, cls in test_cases:
            target = cache_dir / url2filename(url)

            # assuming it doesn't exist yet - patched cache dir!
            in_cache_before = target.exists()
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds = get_cached_dataset(url, version, keys)
                    invalid_version = False
                except AssertionError:
                    # should happen only if `version` wasn't found. Implies
                    # that the dataset exists in cache (although not returned
                    # due to exception)
                    assert_true(version)
                    assert_false(Dataset(target).repo.commit_exists(version))
                    # mark for later assertions (most of them should still hold
                    # true)
                    invalid_version = True

            assert_equal(exec_clone.call_count, 0 if in_cache_before else 1)

            # Patch prevents actual execution. Now do it for real. Note, that
            # this might be necessary for content retrieval even if dataset was
            # in cache before.
            try:
                ds = get_cached_dataset(url, version, keys)
            except AssertionError:
                # see previous call
                assert_true(invalid_version)

            assert_is_instance(ds, Dataset)
            assert_true(ds.is_installed())
            assert_equal(target, ds.pathobj)
            assert_is_instance(ds.repo, cls)

            if keys and not invalid_version and \
                    AnnexRepo.is_valid_repo(ds.path):
                # Note: it's not supposed to get that content if passed
                # `version` wasn't available. get_cached_dataset would then
                # raise before and not download anything only to raise
                # afterwards.
                here = ds.config.get("annex.uuid")
                where = ds.repo.whereis(ensure_list(keys), key=True)
                assert_true(all(here in remotes for remotes in where))

            # version check. Note, that all `get_cached_dataset` is supposed to
            # do, is verifying, that specified version exists - NOT check it
            # out"
            if version and not invalid_version:
                assert_true(ds.repo.commit_exists(version))

            # re-execution
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds2 = get_cached_dataset(url, version, keys)
                except AssertionError:
                    assert_true(invalid_version)
            exec_clone.assert_not_called()
            # returns the same Dataset as before:
            assert_is(ds, ds2)
def test_cached_dataset(cache_dir):

    # patch DATALAD_TESTS_CACHE to not use the actual cache with
    # the test testing that very cache.
    cache_dir = Path(cache_dir)
    ds_url = "https://github.com/datalad/testrepo--minimalds"
    name_in_cache = url2filename(ds_url)
    annexed_file = Path("inannex") / "animated.gif"

    with patch(CACHE_PATCH_STR, new=cache_dir):

        @cached_dataset(url=ds_url)
        def decorated_test1(ds):
            # we get a Dataset instance
            assert_is_instance(ds, Dataset)
            # it's a clone in a temp. location, not within the cache
            assert_not_in(cache_dir, ds.pathobj.parents)
            assert_result_count(ds.siblings(),
                                1,
                                type="sibling",
                                name="origin",
                                url=str(cache_dir / name_in_cache))
            here = ds.config.get("annex.uuid")
            origin = ds.config.get("remote.origin.annex-uuid")
            where = ds.repo.whereis(str(annexed_file))
            assert_not_in(here, where)
            assert_not_in(origin, where)

            return ds.pathobj, ds.repo.pathobj

        @cached_dataset(url=ds_url, paths=str(annexed_file))
        def decorated_test2(ds):
            # we get a Dataset instance
            assert_is_instance(ds, Dataset)
            # it's a clone in a temp. location, not within the cache
            assert_not_in(cache_dir, ds.pathobj.parents)
            assert_result_count(ds.siblings(),
                                1,
                                type="sibling",
                                name="origin",
                                url=str(cache_dir / name_in_cache))
            here = ds.config.get("annex.uuid")
            origin = ds.config.get("remote.origin.annex-uuid")
            where = ds.repo.whereis(str(annexed_file))
            assert_in(here, where)
            assert_in(origin, where)

            return ds.pathobj, ds.repo.pathobj

        @cached_dataset(url=ds_url)
        def decorated_test3(ds):
            # we get a Dataset instance
            assert_is_instance(ds, Dataset)
            # it's a clone in a temp. location, not within the cache
            assert_not_in(cache_dir, ds.pathobj.parents)
            assert_result_count(ds.siblings(),
                                1,
                                type="sibling",
                                name="origin",
                                url=str(cache_dir / name_in_cache))
            # origin is the same cached dataset, that got this content in
            # decorated_test2 before. Should still be there. But "here" we
            # didn't request it
            here = ds.config.get("annex.uuid")
            origin = ds.config.get("remote.origin.annex-uuid")
            where = ds.repo.whereis(str(annexed_file))
            assert_not_in(here, where)
            assert_in(origin, where)

            return ds.pathobj, ds.repo.pathobj

        @cached_dataset(url=ds_url,
                        version="541cf855d13c2a338ff2803d4488daf0035e568f")
        def decorated_test4(ds):
            # we get a Dataset instance
            assert_is_instance(ds, Dataset)
            # it's a clone in a temp. location, not within the cache
            assert_not_in(cache_dir, ds.pathobj.parents)
            assert_result_count(ds.siblings(),
                                1,
                                type="sibling",
                                name="origin",
                                url=str(cache_dir / name_in_cache))
            # origin is the same cached dataset, that got this content in
            # decorated_test2 before. Should still be there. But "here" we
            # didn't request it
            here = ds.config.get("annex.uuid")
            origin = ds.config.get("remote.origin.annex-uuid")
            where = ds.repo.whereis(str(annexed_file))
            assert_not_in(here, where)
            assert_in(origin, where)

            assert_equal(ds.repo.get_hexsha(),
                         "541cf855d13c2a338ff2803d4488daf0035e568f")

            return ds.pathobj, ds.repo.pathobj

        first_dspath, first_repopath = decorated_test1()
        second_dspath, second_repopath = decorated_test2()
        decorated_test3()
        decorated_test4()

        # first and second are not the same, only their origin is:
        assert_not_equal(first_dspath, second_dspath)
        assert_not_equal(first_repopath, second_repopath)