Example #1
def test_add_archive_use_archive_dir(repo_path):
    repo = AnnexRepo(repo_path, create=True)
    with chpwd(repo_path):
        # Let's add first archive to the repo with default setting
        archive_path = opj('4u', '1.tar.gz')
        # check it gives informative error if archive is not already added
        with assert_raises(RuntimeError) as cmr:
            add_archive_content(archive_path)
        assert_re_in(
            "You should run ['\"]datalad save 4u\\\\1\\.tar\\.gz['\"] first"
            if on_windows else
            "You should run ['\"]datalad save 4u/1\\.tar\\.gz['\"] first",
            str(cmr.exception),
            match=False)
        with swallow_outputs():
            repo.add(archive_path)
        repo.commit("added 1.tar.gz")

        ok_archives_caches(repo.path, 0)
        add_archive_content(archive_path,
                            strip_leading_dirs=True,
                            use_current_dir=True)
        ok_(not exists(opj('4u', '1 f.txt')))
        ok_file_under_git(repo.path, '1 f.txt', annexed=True)
        ok_archives_caches(repo.path, 0)

        # and now let's extract under archive dir
        add_archive_content(archive_path, strip_leading_dirs=True)
        ok_file_under_git(repo.path, opj('4u', '1 f.txt'), annexed=True)
        ok_archives_caches(repo.path, 0)

        add_archive_content(opj('4u', 'sub.tar.gz'))
        ok_file_under_git(repo.path, opj('4u', 'sub', '2 f.txt'), annexed=True)
        ok_archives_caches(repo.path, 0)
Example #2
def check_compress_file(ext, annex, path, name):
    # we base the archive name on the filename, in order to also
    # be able to properly test compressors where the corresponding
    # archive format has no capability of storing a filename
    # (i.e. where the archive name itself determines the filename
    # of the decompressed file, like .xz)
    archive = op.join(name, _filename + ext)
    compress_files([_filename], archive,
                   path=path)
    assert_true(op.exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    ok_file_has_content(_filepath, 'content')
Example #3
def test_proxying_open_testrepobased(repo):
    TEST_CONTENT = "content to be annex-addurl'd"
    fname = 'test-annex.dat'
    fpath = opj(repo, fname)
    assert_raises(IOError, open, fpath)

    aio = AutomagicIO(activate=True)
    try:
        with swallow_outputs():
            # now we should be able just to request to open this file
            with open(fpath) as f:
                content = f.read()
                eq_(content, TEST_CONTENT)
    finally:
        aio.deactivate()

    # and now that we have fetched it, nothing should forbid us to open it again
    with open(fpath) as f:
        eq_(f.read(), TEST_CONTENT)

    annex = AnnexRepo(repo, create=False)
    # Let's create another file deeper under the directory with the same content
    # so it would point to the same key, which we would drop and repeat the drill
    fpath2 = opj(repo, 'd1', 'd2', 'test2.dat')
    os.makedirs(dirname(fpath2))
    with open(fpath2, 'w') as f:
        f.write(content)
    annex.add(fpath2)
    annex.drop(fpath2)
    annex.commit("added and dropped")
    assert_raises(IOError, open, fpath2)

    # Let's use context manager form
    with AutomagicIO() as aio:
        ok_(isinstance(aio, AutomagicIO))
        ok_(aio.active)
        # swallowing output would cause trouble while testing with
        # DATALAD_ASSERT_NO_OPEN_FILES mode on.  It is not 100% clear why the
        # underlying git-annex process would be dumping to stdout or err
        #with swallow_outputs():

        # now we should be able just to request to open this file
        with open(fpath2) as f:
            content = f.read()
            eq_(content, TEST_CONTENT)

    annex.drop(fpath2)
    assert_raises(IOError, open, fpath2)

    # Let's use relative path
    with chpwd(opj(repo, 'd1')):
        # Let's use context manager form
        with AutomagicIO() as aio, \
                swallow_outputs(), \
                open(opj('d2', 'test2.dat')) as f:
            content = f.read()
            eq_(content, TEST_CONTENT)
Example #4
def test_direct_cfg(path1, path2):
    # and if repo already exists and we have env var - we fail too
    # Adding backend so we get some commit into the repo
    ar = AnnexRepo(path1, create=True, backend='MD5E')
    del ar
    AnnexRepo._unique_instances.clear()  # fight flyweight
    for path in (path1, path2):
        with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
            # try to create annex repo in direct mode and see how it fails
            with assert_raises(DirectModeNoLongerSupportedError) as cme:
                AnnexRepo(path, create=True)
            assert_in("no longer supported by DataLad",
                      str(cme.exception))  # we have generic part
            assert_in("datalad.repo.direct configuration",
                      str(cme.exception))  # situation specific part
    # assert not op.exists(path2)   # that we didn't create it - we do!
    #   fixing for that would be too cumbersome since we first call GitRepo.__init__
    #   with create
    ar = AnnexRepo(path1)
    # check if we somehow didn't reset the flag
    assert not ar.is_direct_mode()

    if ar.config.obtain("datalad.repo.version") >= 6:
        raise SkipTest(
            "Created repo not v5, cannot test detection of direct mode repos")
    # and if repo existed before and was in direct mode, we fail too
    # Since direct= option was deprecated entirely, we use protected method now
    ar._set_direct_mode(True)
    assert ar.is_direct_mode()
    del ar  # but we would need to somehow disable the flyweight
    with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
        with assert_raises(DirectModeNoLongerSupportedError) as cme:
            AnnexRepo(path1, create=False)

    # TODO: RM DIRECT decide what we should do here -- should we test/blow?
    #   ATM both tests below just pass
    ar2 = AnnexRepo(path2, create=True)
    # happily can do it since it doesn't need a worktree to do the clone
    ar2.add_submodule('sub1', url=path1)
    ar2sub1 = AnnexRepo(op.join(path2, 'sub1'))
    # but now let's convert that sub1 to direct mode
    assert not ar2sub1.is_direct_mode()
    ar2sub1._set_direct_mode(True)
    assert ar2sub1.is_direct_mode()
    del ar2
    del ar2sub1
    AnnexRepo._unique_instances.clear()  # fight flyweight

    ar2 = AnnexRepo(path2)
    ar2.get_submodules()

    # And what if we are trying to add pre-cloned repo in direct mode?
    ar2sub2 = AnnexRepo.clone(path1, op.join(path2, 'sub2'))
    ar2sub2._set_direct_mode(True)
    del ar2sub2
    AnnexRepo._unique_instances.clear()  # fight flyweight
    ar2.add('sub2')
Example #5
def test_add_archive_content_zip(repo_path):
    repo = AnnexRepo(repo_path, create=True)
    with chpwd(repo_path):
        with swallow_outputs():
            repo.add(["1.zip"])
        repo.commit("add 1.zip")
        add_archive_content("1.zip")
        ok_file_under_git(opj(repo.path, "1", "foo"), annexed=True)
        ok_file_under_git(opj("1", "dir", "bar"), annexed=True)
        ok_archives_caches(repo.path, 0)
Example #6
def test_direct_cfg(path1, path2):
    # and if repo already exists and we have env var - we fail too
    # Adding backend so we get some commit into the repo
    ar = AnnexRepo(path1, create=True, backend='MD5E')
    del ar;  AnnexRepo._unique_instances.clear()  # fight flyweight
    for path in (path1, path2):
        with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
            # try to create annex repo in direct mode and see how it fails
            with assert_raises(DirectModeNoLongerSupportedError) as cme:
                AnnexRepo(path, create=True)
            assert_in("no longer supported by DataLad", str(cme.exception)) # we have generic part
            assert_in("datalad.repo.direct configuration", str(cme.exception)) # situation specific part
    # assert not op.exists(path2)   # that we didn't create it - we do!
    #   fixing for that would be too cumbersome since we first call GitRepo.__init__
    #   with create
    ar = AnnexRepo(path1)
    # check if we somehow didn't reset the flag
    assert not ar.is_direct_mode()

    if ar.config.obtain("datalad.repo.version") >= 6:
        raise SkipTest("Created repo not v5, cannot test detection of direct mode repos")
    # and if repo existed before and was in direct mode, we fail too
    # Since direct= option was deprecated entirely, we use protected method now
    ar._set_direct_mode(True)
    assert ar.is_direct_mode()
    del ar  # but we would need to somehow disable the flyweight
    with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
        with assert_raises(DirectModeNoLongerSupportedError) as cme:
            AnnexRepo(path1, create=False)


    # TODO: RM DIRECT decide what we should do here -- should we test/blow?
    #   ATM both tests below just pass
    ar2 = AnnexRepo(path2, create=True)
    # happily can do it since it doesn't need a worktree to do the clone
    ar2.add_submodule('sub1', url=path1)
    ar2sub1 = AnnexRepo(op.join(path2, 'sub1'))
    # but now let's convert that sub1 to direct mode
    assert not ar2sub1.is_direct_mode()
    ar2sub1._set_direct_mode(True)
    assert ar2sub1.is_direct_mode()
    del ar2; del ar2sub1; AnnexRepo._unique_instances.clear()  # fight flyweight

    ar2 = AnnexRepo(path2)
    ar2.get_submodules()

    # And what if we are trying to add pre-cloned repo in direct mode?
    ar2sub2 = AnnexRepo.clone(path1, op.join(path2, 'sub2'))
    ar2sub2._set_direct_mode(True)
    del ar2sub2; AnnexRepo._unique_instances.clear()  # fight flyweight
    ar2.add('sub2')
Example #7
def test_add_archive_content_absolute_path(path):
    repo = AnnexRepo(opj(path, "ds"), create=True)
    repo.add(["1.tar.gz"])
    repo.commit("1.tar.gz")
    abs_tar_gz = opj(path, "ds", "1.tar.gz")
    add_archive_content(abs_tar_gz, annex=repo)
    ok_file_under_git(opj(path, "ds", "1", "foo"), annexed=True)
    commit_msg = repo.format_commit("%B")
    # The commit message uses relative paths.
    assert_not_in(abs_tar_gz, commit_msg)
    assert_in("1.tar.gz", commit_msg)

    with assert_raises(FileNotInRepositoryError):
        add_archive_content(opj(path, "notds", "2.tar.gz"), annex=repo)
Example #8
def test_get_contentlocation(tdir):
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')

    key = repo.get_file_key('file.dat')
    cr = AnnexCustomRemote(tdir)
    key_path = cr.get_contentlocation(key, absolute=False)
    assert not isabs(key_path)
    key_path_abs = cr.get_contentlocation(key, absolute=True)
    assert isabs(key_path_abs)
    assert cr._contentlocations == {key: key_path}
    repo.drop('file.dat', options=['--force'])
    assert not cr.get_contentlocation(key, absolute=True)
Example #9
def test_get_contentlocation(tdir):
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')

    key = repo.get_file_key('file.dat')
    cr = AnnexCustomRemote(tdir)
    key_path = cr.get_contentlocation(key, absolute=False)
    assert not isabs(key_path)
    key_path_abs = cr.get_contentlocation(key, absolute=True)
    assert isabs(key_path_abs)
    assert cr._contentlocations == {key: key_path}
    repo.drop('file.dat', options=['--force'])
    assert not cr.get_contentlocation(key, absolute=True)
Example #10
def test_get_contentlocation(tdir=None):
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')

    # TODO contentlocation would come with eval_availability=True
    key = repo.get_file_annexinfo('file.dat')['key']
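    # get_file_annexinfo()['key'] is the newer DataLad API replacing the older
    # get_file_key() used in the earlier examples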
    cr = ArchiveAnnexCustomRemote(None, path=tdir)
    key_path = cr.get_contentlocation(key, absolute=False)
    assert not isabs(key_path)
    key_path_abs = cr.get_contentlocation(key, absolute=True)
    assert isabs(key_path_abs)
    assert cr._contentlocations == {key: key_path}
    repo.drop('file.dat', options=['--force'])
    assert not cr.get_contentlocation(key, absolute=True)
Example #11
def put_file_under_git(path, filename=None, content=None, annexed=False):
    """Place file under git/annex and return used Repo
    """
    annex, file_repo_path, filename, path, repo = _prep_file_under_git(path, filename)
    if content is None:
        content = ""
    with open(opj(repo.path, file_repo_path), 'w') as f_:
        f_.write(content)

    if annexed:
        if not isinstance(repo, AnnexRepo):
            repo = AnnexRepo(repo.path)
        repo.add(file_repo_path, commit=True, _datalad_msg=True)
    else:
        repo.add(file_repo_path, git=True, _datalad_msg=True)
    ok_file_under_git(repo.path, file_repo_path, annexed)
    return repo
Example #12
def put_file_under_git(path, filename=None, content=None, annexed=False):
    """Place file under git/annex and return used Repo
    """
    annex, file_repo_path, filename, path, repo = _prep_file_under_git(path, filename)
    if content is None:
        content = ""
    with open(opj(repo.path, file_repo_path), 'w') as f_:
        f_.write(content)

    if annexed:
        if not isinstance(repo, AnnexRepo):
            repo = AnnexRepo(repo.path)
        repo.add(file_repo_path, commit=True, _datalad_msg=True)
    else:
        repo.add(file_repo_path, git=True, _datalad_msg=True)
    ok_file_under_git(repo.path, file_repo_path, annexed)
    return repo
Example #13
def test_check_dates(path):
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo],
                        reference_date=refdate,
                        return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")
Example #14
def test_get_dataset_root(path):
    eq_(get_dataset_root('/nonexistent'), None)
    with chpwd(path):
        repo = AnnexRepo(os.curdir, create=True)
        subdir = opj('some', 'deep')
        fname = opj(subdir, 'dummy')
        os.makedirs(subdir)
        with open(fname, 'w') as f:
            f.write('some')
        repo.add(fname)
        # we can find this repo
        eq_(get_dataset_root(os.curdir), os.curdir)
        # and we get the type of path that we fed in
        eq_(get_dataset_root(abspath(os.curdir)), abspath(os.curdir))
        # subdirs are no issue
        eq_(get_dataset_root(subdir), os.curdir)
        # non-dir paths are no issue
        eq_(get_dataset_root(fname), os.curdir)
Example #15
def test_get_dataset_root(path):
    eq_(get_dataset_root('/nonexistent'), None)
    with chpwd(path):
        repo = AnnexRepo(os.curdir, create=True)
        subdir = opj('some', 'deep')
        fname = opj(subdir, 'dummy')
        os.makedirs(subdir)
        with open(fname, 'w') as f:
            f.write('some')
        repo.add(fname)
        # we can find this repo
        eq_(get_dataset_root(os.curdir), os.curdir)
        # and we get the type of path that we fed in
        eq_(get_dataset_root(abspath(os.curdir)), abspath(os.curdir))
        # subdirs are no issue
        eq_(get_dataset_root(subdir), os.curdir)
        # non-dir paths are no issue
        eq_(get_dataset_root(fname), os.curdir)
Example #16
def test_check_dates(path=None):
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo], reference_date=refdate, return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")
Example #17
def test_update_fetch_all(src, remote_1, remote_2):
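    # the second positional argument is a source url/path in this DataLad
    # version, so these two calls clone src into remote_1 and remote_2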
    rmt1 = AnnexRepo(remote_1, src)
    rmt2 = AnnexRepo(remote_2, src)

    ds = Dataset(src)
    ds.add_sibling(name="sibling_1", url=remote_1)
    ds.add_sibling(name="sibling_2", url=remote_2)

    # modify the remotes:
    with open(opj(remote_1, "first.txt"), "w") as f:
        f.write("some file load")
    rmt1.add("first.txt", commit=True)
    # TODO: Modify an already present file!

    with open(opj(remote_2, "second.txt"), "w") as f:
        f.write("different file load")
    rmt2.add("second.txt", git=True, commit=True, msg="Add file to git.")

    # fetch all remotes
    ds.update(fetch_all=True)

    # no merge, so changes are not in active branch:
    assert_not_in("first.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    assert_not_in("second.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # but we know the changes in remote branches:
    assert_in("first.txt", ds.repo.get_files("sibling_1/master"))
    assert_in("second.txt", ds.repo.get_files("sibling_2/master"))

    # no merge strategy for multiple remotes yet:
    assert_raises(NotImplementedError, ds.update, merge=True, fetch_all=True)

    # merge a certain remote:
    ds.update(name="sibling_1", merge=True)

    # changes from sibling_2 still not present:
    assert_not_in("second.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # changes from sibling_1 merged:
    assert_in("first.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # it's known to annex, but has no content yet:
    ds.repo.get_file_key("first.txt")  # raises if unknown
    eq_([False], ds.repo.file_has_content(["first.txt"]))
Example #18
def test_interactions(tdir):
    # Just a placeholder since constructor expects a repo
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')
    for scenario in BASE_INTERACTION_SCENARIOS + [
        [
            ('GETAVAILABILITY', 'AVAILABILITY %s' % DEFAULT_AVAILABILITY),
            ('GETCOST', 'COST %d' % DEFAULT_COST),
            ('TRANSFER RETRIEVE somekey somefile',
             re.compile('TRANSFER-FAILURE RETRIEVE somekey NotImplementedError().*')),
        ],
        [
            # by default we do not require any fancy init
            # no urls supported by default
            ('CLAIMURL http://example.com', 'CLAIMURL-FAILURE'),
            # we know that just a single option, the url, is expected, so the
            # full string would be passed
            ('CLAIMURL http://example.com roguearg', 'CLAIMURL-FAILURE'),
        ]
    ]:
        check_interaction_scenario(AnnexCustomRemote, tdir, scenario)
Example #19
def check_compress_file(ext, annex, path, name):
    archive = name + ext
    compress_files([_filename], archive, path=path)
    assert_true(exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    import glob
    print(dir_extracted)
    print(glob.glob(dir_extracted + '/*'))
    ok_file_has_content(_filepath, 'content')
Example #20
def check_compress_file(ext, annex, path, name):
    archive = name + ext
    compress_files([_filename], archive,
                   path=path)
    assert_true(exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    import glob
    print(dir_extracted)
    print(glob.glob(dir_extracted + '/*'))
    ok_file_has_content(_filepath, 'content')
Example #21
def test_ls_json(topdir):
    annex = AnnexRepo(topdir, create=True)
    dsj = Dataset(topdir)
    # create some file and commit it
    with open(opj(dsj.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    dsj.add(path='subdsfile.txt')
    dsj.save("Hello!", version_tag=1)

    # add a subdataset
    dsj.install('subds', source=topdir)

    subdirds = dsj.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'),
            commit=True)  # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'),
              commit=True)  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'),
               options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(topdir,
                               json=state,
                               all_=all_,
                               recursive=recursive)

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden', ), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if it's updated in its nodes sublist too. used by web-ui json. regression test
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of subdataset
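                # note: ('subdsfile.txt' or 'subds') short-circuits to
                # 'subdsfile.txt', so only that node is selected below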
                subds = [
                    item for item in dsj['nodes']
                    if item['name'] == ('subdsfile.txt' or 'subds')
                ][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit of guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                # and not in topds
                assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset metadata already created
                # to verify sub-dataset metadata being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == ('subdsfile.txt' or 'subds')
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
Example #22
def test_ls_json(topdir):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)
    # add a subdataset
    ds.install('subds', source=topdir)

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'),
            commit=True)  # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'),
              commit=True)  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'),
               options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    for all_ in [True, False]:
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                with swallow_logs(), swallow_outputs():
                    ds = _ls_json(topdir,
                                  json=state,
                                  all_=all_,
                                  recursive=recursive)

                # subdataset should have its json created and deleted when all=True else not
                subds_metahash = get_metahash('/')
                subds_metapath = opj(topdir, 'subds', meta_dir, subds_metahash)
                assert_equal(exists(subds_metapath),
                             (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metahash = get_metahash('/')
                ds_metapath = opj(meta_path, ds_metahash)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and deleted only when recursive=True
                child_metahash = get_metahash('dir', 'subdir')
                child_metapath = opj(meta_path, child_metahash)
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:  # note: one-element tuple, not a bare string
                    child_metahash = get_metahash(*subdir)
                    assert_equal(exists(opj(meta_path, child_metahash)), False)

                # check if it's updated in its nodes sublist too. used by web-ui json. regression test
                assert_equal(ds['nodes'][0]['size']['total'],
                             ds['size']['total'])

                # check size of subdataset
                subds = [
                    item for item in ds['nodes']
                    if item['name'] == ('subdsfile.txt' or 'subds')
                ][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # run non-recursive dataset traversal after subdataset metadata already created
                # to verify sub-dataset metadata being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    ds = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in ds['nodes']
                        if item['name'] == ('subdsfile.txt' or 'subds')
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
Example #23
def test_check_dates(path):
    refdate = 1218182889

    with set_date(refdate - 1):
        ar = AnnexRepo(path, create=True)

        def tag_object(tag):
            """Return object for tag.  Do not dereference it.
            """
            # We can't use ar.get_tags because that returns the commit's hexsha,
            # not the tag's, and ar.get_hexsha is limited to commit objects.
            return ar.call_git_oneline(
                ["rev-parse", "refs/tags/{}".format(tag)], read_only=True)

        ar.add("foo")
        ar.commit("add foo")
        foo_commit = ar.get_hexsha()
        ar.commit("add foo")
        ar.tag("foo-tag", "tag before refdate")
        foo_tag = tag_object("foo-tag")
        # Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
        ar.tag("light")
    with set_date(refdate + 1):
        ar.add("bar")
        ar.commit("add bar")
        bar_commit = ar.get_hexsha()
        ar.tag("bar-tag", "tag after refdate")
        bar_tag = tag_object("bar-tag")
    with set_date(refdate + 2):
        # Drop an annexed file so that we have more blobs in the git-annex
        # branch than its current tree.
        ar.drop("bar", options=["--force"])

    results = {}
    for which in ["older", "newer"]:
        result = check_dates(ar, refdate, which=which)["objects"]
        ok_(result)
        if which == "newer":
            assert_in(bar_commit, result)
            assert_not_in(foo_commit, result)
            assert_in(bar_tag, result)
        elif which == "older":
            assert_in(foo_commit, result)
            assert_not_in(bar_commit, result)
            assert_in(foo_tag, result)
        results[which] = result

    ok_(any(
        x.get("filename") == "uuid.log" for x in results["older"].values()))

    newer_tree = check_dates(ar, refdate, annex="tree")["objects"]

    def is_annex_log_blob(entry):
        return (entry["type"] == "annex-blob"
                and entry["filename"].endswith(".log"))

    def num_logs(entries):
        return sum(map(is_annex_log_blob, entries.values()))

    # Because we dropped bar above, we should have one more blob in the
    # git-annex branch than in the current tree of the git-annex branch.
    eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)

    # Act like today is one day from the reference timestamp to check that we
    # get the same results with the one-day-back default.
    seconds_in_day = 60 * 60 * 24
    with patch('time.time', return_value=refdate + seconds_in_day):
        assert_equal(check_dates(ar, annex="tree")["objects"], newer_tree)

    # We can give a path (str) instead of a GitRepo object.
    assert_equal(
        check_dates(path, refdate, annex="tree")["objects"], newer_tree)

    with assert_raises(ValueError):
        check_dates(ar, refdate, which="unrecognized")
Example #24
def test_check_dates(path):
    refdate = 1218182889

    with set_date(refdate - 1):
        ar = AnnexRepo(path, create=True)
        ar.add("foo")
        ar.commit("add foo")
        foo_commit = ar.get_hexsha()
        ar.commit("add foo")
        ar.tag("foo-tag", "tag before refdate")
        # We can't use ar.get_tags because that returns the commit's hexsha,
        # not the tag's, and ar.get_hexsha is limited to commit objects.
        foo_tag = ar.repo.git.rev_parse("foo-tag")
        # Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
        ar.tag("light")
    with set_date(refdate + 1):
        ar.add("bar")
        ar.commit("add bar")
        bar_commit = ar.get_hexsha()
        ar.tag("bar-tag", "tag after refdate")
        bar_tag = ar.repo.git.rev_parse("bar-tag")
    with set_date(refdate + 2):
        # Drop an annexed file so that we have more blobs in the git-annex
        # branch than its current tree.
        ar.drop("bar", options=["--force"])

    results = {}
    for which in ["older", "newer"]:
        result = check_dates(ar, refdate, which=which)["objects"]
        ok_(result)
        if which == "newer":
            assert_in(bar_commit, result)
            assert_not_in(foo_commit, result)
            assert_in(bar_tag, result)
        elif which == "older":
            assert_in(foo_commit, result)
            assert_not_in(bar_commit, result)
            assert_in(foo_tag, result)
        results[which] = result

    ok_(any(x.get("filename") == "uuid.log"
            for x in results["older"].values()))

    newer_tree = check_dates(ar, refdate, annex="tree")["objects"]

    def is_annex_log_blob(entry):
        return (entry["type"] == "annex-blob"
                and entry["filename"].endswith(".log"))

    def num_logs(entries):
        return sum(map(is_annex_log_blob, entries.values()))

    # Because we dropped bar above, we should have one more blob in the
    # git-annex branch than in the current tree of the git-annex branch.
    eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)

    # Act like today is one day from the reference timestamp to check that we
    # get the same results with the one-day-back default.
    seconds_in_day = 60 * 60 * 24
    with patch('time.time', return_value=refdate + seconds_in_day):
        assert_equal(check_dates(ar, annex="tree")["objects"],
                     newer_tree)

    # We can give a path (str) instead of a GitRepo object.
    assert_equal(check_dates(path, refdate, annex="tree")["objects"],
                 newer_tree)

    with assert_raises(ValueError):
        check_dates(ar, refdate, which="unrecognized")
Example #25
def _test_proxying_open(generate_load, verify_load, repo):
    annex = AnnexRepo(repo, create=True)
    fpath1 = opj(repo, "test")
    fpath2 = opj(repo, 'd1', 'd2', 'test2')
    # generate load
    fpath1 = generate_load(fpath1) or fpath1
    os.makedirs(dirname(fpath2))
    fpath2 = generate_load(fpath2) or fpath2
    annex.add([fpath1, fpath2])
    verify_load(fpath1)
    verify_load(fpath2)
    annex.commit("Added some files")

    # clone to another repo
    repo2 = repo + "_2"
    annex2 = AnnexRepo.clone(repo, repo2)
    # verify that can't access
    fpath1_2 = fpath1.replace(repo, repo2)
    fpath2_2 = fpath2.replace(repo, repo2)

    EXPECTED_EXCEPTIONS = (IOError, OSError)
    assert_raises(EXPECTED_EXCEPTIONS, verify_load, fpath1_2)

    with AutomagicIO():
        # verify that it doesn't even try to get files which do not exist
        with patch('datalad.support.annexrepo.AnnexRepo.get') as gricm:
            # if we request absent file
            assert_raises(EXPECTED_EXCEPTIONS, open, fpath1_2 + "_", 'r')
            # no get should be called
            assert_false(gricm.called)
        verify_load(fpath1_2)
        verify_load(fpath2_2)
        # and even if we drop it -- we still can get it no problem
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        verify_load(fpath2_2)
        assert_true(annex2.file_has_content(fpath2_2))
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        assert_true(os.path.isfile(fpath2_2))

    # In check_once mode, if we drop it, it wouldn't be considered again
    annex2.drop(fpath2_2)
    assert_false(annex2.file_has_content(fpath2_2))
    with AutomagicIO(check_once=True):
        verify_load(fpath2_2)
        assert_true(annex2.file_has_content(fpath2_2))
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        assert_false(os.path.isfile(fpath2_2))

    # if we override stdout with something not supporting fileno, as tornado
    # does, which ruins using get under IPython
    # TODO: we might need to refuse any online logging in other places like that
    annex2.drop(fpath2_2)

    class StringIOfileno(StringIO):
        def fileno(self):
            raise Exception("I have no clue how to do fileno")

    with patch('sys.stdout', new_callable=StringIOfileno), \
         patch('sys.stderr', new_callable=StringIOfileno):
        with AutomagicIO():
            assert_false(annex2.file_has_content(fpath2_2))
            verify_load(fpath2_2)
            assert_true(annex2.file_has_content(fpath2_2))
Example #26
def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'), commit=True)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(new_level=logging.INFO) as log, swallow_outputs() as cmo:
            fs = fs_traverse(topdir, AnnexRepo(topdir), recursive=recursive, json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [opj(topdir, 'dir'), opj(topdir, 'dir', 'subdir')]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout contains subdirectory
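                # note: ('file2.txt' and 'dir') evaluates to 'dir', so only 'dir'
                # is asserted to appear in the captured output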
                assert_in(('file2.txt' and 'dir'), cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # in non-recursive mode a dir-type child reports the default size of
            # 0 Bytes, since there is no child metadata file yet for the traverser
            # to pick its size from; getting the real size would require a recursive
            # traversal with write-to-child-metadata-file mode
            assert_equal(child['size']['total'], {True: '6 Bytes', False: '0 Bytes'}[recursive])

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        fs = fs_traverse(topdir, AnnexRepo(topdir), recursive=recursive, json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
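        # note: ('gitdir' or 'annexdir') short-circuits to 'gitdir', so only that
        # name is checked here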
        assert_equal([item for item in fs['nodes'] if ('gitdir' or 'annexdir') == item['name']], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by non-recursive mode
        # verifies child's metadata file being used to find its size
        # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        if recursive:
            # sub-dictionary should not include git and hidden directory info
            assert_equal([item for item in child['nodes'] if ('subgit' or '.fgit') == item['name']], [])
            # extract subdirectory dictionary, else fail
            subchild = [subitem for subitem in child["nodes"] if subitem['name'] == 'subdir'][0]
            # extract info of file1.txt, else fail
            link = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file1.txt'][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file2.txt'][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
Example #27
def test_ls_json(topdir, topurl):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)                    # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'))                           # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir', 'subgit'))                                     # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))                                               # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')              # commit to git to init git repo
    git.commit()
    # annex.add doesn't add submodule, so using ds.add
    ds.add(opj('dir', 'subgit'))                        # add the non-dataset git repo to annex
    ds.add('dir')                                  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule  by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit'))
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file(
        'fromweb', topurl + '/noteventhere', options=['--relaxed'])

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(
                    topdir,
                    json=state,
                    all_=all_,
                    recursive=recursive
                )
                ok_startswith(dsj['tags'], '1-')

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath), (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if it's updated in its nodes sublist too. used by web-ui json. regression test
                assert_equal(dsj['nodes'][0]['size']['total'], dsj['size']['total'])

                # check size of subdataset
                subds = [item for item in dsj['nodes'] if item['name'] == ('subdsfile.txt' or 'subds')][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit of guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                # and not in topds
                assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset metadata already created
                # to verify sub-dataset metadata being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == ('subdsfile.txt' or 'subds')
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

                assert_equal(
                    topds_nodes['fromweb']['size']['total'], UNKNOWN_SIZE
                )
Example #28
def test_ls_json(topdir):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)
    # add a subdataset
    ds.install('subds', source=topdir)

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)                    # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'), commit=True)              # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'), commit=True)                        # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)                                  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    for all_ in [True, False]:
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                with swallow_logs(), swallow_outputs():
                    ds = _ls_json(topdir, json=state, all_=all_, recursive=recursive)

                # subdataset should have its json created and deleted when all=True else not
                subds_metahash = get_metahash('/')
                subds_metapath = opj(topdir, 'subds', meta_dir, subds_metahash)
                assert_equal(exists(subds_metapath), (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metahash = get_metahash('/')
                ds_metapath = opj(meta_path, ds_metahash)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and deleted only when recursive=True
                child_metahash = get_metahash('dir', 'subdir')
                child_metapath = opj(meta_path, child_metahash)
                assert_equal(exists(child_metapath), (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:  # note: one-element tuple, not a bare string
                    child_metahash = get_metahash(*subdir)
                    assert_equal(exists(opj(meta_path, child_metahash)), False)

                # check if it's updated in its nodes sublist too. used by web-ui json. regression test
                assert_equal(ds['nodes'][0]['size']['total'], ds['size']['total'])

                # check size of subdataset
                subds = [item for item in ds['nodes'] if item['name'] == ('subdsfile.txt' or 'subds')][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # run non-recursive dataset traversal after subdataset metadata already created
                # to verify sub-dataset metadata being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    ds = _ls_json(topdir, json='file', all_=False)
                    subds = [item for item in ds['nodes'] if item['name'] == ('subdsfile.txt' or 'subds')][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
Example #29
def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'), commit=True)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(
                new_level=logging.INFO) as log, swallow_outputs() as cmo:
            fs = fs_traverse(topdir,
                             AnnexRepo(topdir),
                             recurse_directories=recursive,
                             json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [
                        opj(topdir, 'dir'),
                        opj(topdir, 'dir', 'subdir')
                ]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout contains subdirectory
                assert_in(('file2.txt' and 'dir'), cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # in non-recursive mode a dir-type child reports the default size of
            # 0 Bytes, since there is no child metadata file yet for the traverser
            # to pick its size from; getting the real size would require a recursive
            # traversal with write-to-child-metadata-file mode
            assert_equal(child['size']['total'], {
                True: '6 Bytes',
                False: '0 Bytes'
            }[recursive])

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        fs = fs_traverse(topdir,
                         AnnexRepo(topdir),
                         recurse_directories=recursive,
                         json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        assert_equal([
            item
            for item in fs['nodes'] if ('gitdir' or 'annexdir') == item['name']
        ], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by non-recursive mode
        # verifies child's metadata file being used to find its size
        # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        # In current RF 'nodes' are stripped away during recursive traversal
        # for now... later we might reincarnate them "differently"
        # TODO!
        if False:  # recursive:
            # sub-dictionary should not include git and hidden directory info
            assert_equal([
                item for item in child['nodes']
                if ('subgit' or '.fgit') == item['name']
            ], [])
            # extract subdirectory dictionary, else fail
            subchild = [
                subitem for subitem in child["nodes"]
                if subitem['name'] == 'subdir'
            ][0]
            # extract info of file1.txt, else fail
            link = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file1.txt'
            ][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file2.txt'
            ][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')


def _test_AnnexDB(cls, path):
    filepath1 = opj(path, 'file1.txt')
    filep2 = opj('d', 'file2.txt')
    filepath2 = opj(path, filep2)

    annex = AnnexRepo(path, create=True)
    # PhysicalFileStatusesDB relies on information in the annex, so files
    # must be committed first
    annex.add('file1.txt')
    annex.commit("initial commit")
    db = cls(annex=annex)
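    # `cls` is parametrized over the status-DB implementations under test;
    # judging from the helper below, PhysicalFileStatusesDB derives a file's
    # status from the annex/filesystem on demand, whereas JsonFileStatusesDB
    # only reports what has been explicitly set() on it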

    def set_db_status_from_file(fpath):
        """To test JsonFileStatusesDB, we need to keep updating the status stored"""
        if cls is JsonFileStatusesDB:
            # we need first to set the status
            db.set(fpath, db._get_fileattributes_status(fpath))

    set_db_status_from_file('file1.txt')
    status1 = db.get('file1.txt')
    assert(status1.size)

    status1_ = db.get('file1.txt')
    assert_equal(status1, status1_)
    assert_false(db.is_different('file1.txt', status1))
    assert_false(db.is_different('file1.txt', status1_))
    # even if we add a filename specification
    status1_.filename = 'file1.txt'
    assert_false(db.is_different('file1.txt', status1_))
    status1_.filename = 'different.txt'
    assert_false(db.is_different('file1.txt', status1_))

    os.unlink(filepath1)  # under annex -- we don't have unlock yet and thus can't augment in place
    with open(filepath1, 'a') as f:
        f.write('+')
    # Note/TODO: the fixed (realpath) path should go away. The inner logic has
    # to adapt to dataset singletons, which don't resolve symlinks
    set_db_status_from_file(realpath(filepath1))
    assert(db.is_different('file1.txt', status1))

    # we should be able to get the status of files outside and inside of git
    set_db_status_from_file('2git')
    status_git1 = db.get('2git')
    annex.add('2git', git=True)
    annex.commit("added 2git")
    assert_equal(db.get('2git'), status_git1)

    # we should be able to get the status of a file both by its path relative to the top dir and by its absolute path
    set_db_status_from_file(filep2)
    status2 = db.get(filep2)
    # Note/TODO: the fixed (realpath) path should go away. The inner logic has
    # to adapt to dataset singletons, which don't resolve symlinks
    status2_full = db.get(realpath(filepath2))
    assert_equal(status2, status2_full)
    # TODO? what about relative to curdir??
    #with chpwd(opj(path, 'd')):
    #    status2_dir = db.get('./file2.txt')
    #    assert_equal(status2, status2_dir)

    # since we asked about each file we added to DB/annex -- none should be
    # known as "deleted"
    assert_equal(db.get_obsolete(), [])

    # Possibly save its state for persistent storage
    #import pdb; pdb.set_trace()
    db.save()

    # but, if we create another DB which wasn't queried yet
    db2 = cls(annex=annex)
    # all known files should be reported as obsolete, since none was queried yet
    # TODO: fixed by using realpath, but there should be a cleaner adaptation
    # to dataset singletons, which do NOT resolve symlinks, while the
    # underlying repos do!
    assert_equal(
            set(db2.get_obsolete()),
            {opj(realpath(path), p) for p in ['file1.txt', filep2, '2git']})
    # and if we query one, it shouldn't be listed as deleted any more
    status2_ = db2.get(filep2)
    assert_equal(status2, status2_)
    # TODO: fixed by using realpath, but there should be a cleaner adaptation
    # to dataset singletons, which do NOT resolve symlinks, while the
    # underlying repos do!
    assert_equal(
            set(db2.get_obsolete()),
            {opj(realpath(path), p) for p in ['file1.txt', '2git']})

    # and if we queried with ./ prefix, should still work
    db2.get(curdir + sep + 'file1.txt')
    # TODO: fixed by using realpath, but there should be a cleaner adaptation
    # to dataset singletons, which do NOT resolve symlinks, while the
    # underlying repos do!
    assert_equal(
            set(db2.get_obsolete()),
            {opj(realpath(path), p) for p in ['2git']})

    # and if we queried with a full path, should still work
    # TODO: fixed by using realpath, but there should be a cleaner adaptation
    # to dataset singletons, which do NOT resolve symlinks, while the
    # underlying repos do!
    db2.get(opj(realpath(path), '2git'))
    assert_equal(db2.get_obsolete(), [])


def test_ls_json(topdir, topurl):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.save(path='subdsfile.txt', message="Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.save('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit',
                'fgit.txt'))  # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir',
                  'subgit'))  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'),
               options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')  # commit to git to init git repo
    git.commit()
    # annex.add doesn't add a submodule, so use ds.save instead
    ds.save(opj('dir', 'subgit'))  # add the non-dataset git repo to annex
    ds.save('dir')  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule  by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    WitlessRunner(cwd=opj(topdir, 'dir', 'subgit')).run(
        ['git', 'update-server-info'])
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)
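    # a hypothetical illustration of how the helpers above resolve: metadata
    # for the 'dir' node of a dataset at <dspath> is expected at
    #   <dspath>/.git/datalad/metadata/<md5 hexdigest of 'dir'>
    # while the dataset root itself is keyed by the literal path '/'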

    # Let's see that there is no crash if one of the files is available only
    # via a relaxed URL, so that no size can be picked up
    ds.repo.add_url_to_file('fromweb',
                            topurl + '/noteventhere',
                            options=['--relaxed'])
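    # with --relaxed, git-annex records the URL without verifying it or
    # storing a size for the key, which is why 'fromweb' is expected to
    # report UNKNOWN_SIZE further below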

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # the subdataset should have its json created and deleted only
                # when recursive=True
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(topdir,
                               json=state,
                               all_=all_,
                               recursive=recursive)
                ok_startswith(dsj['tags'], '1-')

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata jsons created and deleted only when all_=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden', ), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check that it is also updated in its 'nodes' sublist (used by the web-UI json; regression test)
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check the reported size of the committed top-level file
                node = [
                    item for item in dsj['nodes']
                    if item['name'] == 'subdsfile.txt'
                ][0]
                assert_equal(node['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                # and not in topds
                assert_not_in('subds2', topds_nodes)

                # run a non-recursive traversal after the subdataset metadata
                # has already been created, to verify that the sub-dataset
                # metadata is picked up from its metadata file in that case
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    node = [
                        item for item in dsj['nodes']
                        if item['name'] == 'subdsfile.txt'
                    ][0]
                    assert_equal(node['size']['total'], '3 Bytes')

                assert_equal(topds_nodes['fromweb']['size']['total'],
                             UNKNOWN_SIZE)
Exemple #32
0
    def __call__(path=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 save=True,
                 annex_version=None,
                 annex_backend='MD5E',
                 native_metadata_type=None,
                 shared_access=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_init_opts=None,
                 text_no_annex=None):

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD
        if path and dataset:
            # Given a path and a dataset (path) not pointing to installed
            # dataset
            if not dataset.is_installed():
                msg = "No installed dataset at %s found." % dataset.path
                dsroot = get_dataset_root(dataset.path)
                if dsroot:
                    msg += " If you meant to add to the %s dataset, use that path " \
                           "instead but remember that if dataset is provided, " \
                           "relative paths are relative to the top of the " \
                           "dataset." % dsroot
                raise ValueError(msg)

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex and declaring no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex init and declaring no "
                                 "annex repo.")

        if not isinstance(force, bool):
            raise ValueError(
                "force should be bool, got %r.  Did you mean to provide a 'path'?"
                % force)
        annotated_paths = AnnotatePaths.__call__(
            # nothing given explicitly, assume create fresh right here
            path=path if path else getpwd() if dataset is None else None,
            dataset=dataset,
            recursive=False,
            action='create',
            # we need to know whether we have to check for potential
            # subdataset collision
            force_parentds_discovery=True,
            # it is absolutely OK to have something that does not exist
            unavailable_path_status='',
            unavailable_path_msg=None,
            # if we have a dataset given that actually exists, we want to
            # fail if the requested path is not in it
            nondataset_path_status='error' \
                if isinstance(dataset, Dataset) and dataset.is_installed() else '',
            on_failure='ignore')
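        # each annotated path comes back as a result record (a dict); records
        # that already carry a non-empty 'status' were fully handled by the
        # annotation step and are simply re-yielded below, while the remaining
        # one is the single creation target we are after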
        path = None
        for r in annotated_paths:
            if r['status']:
                # this is dealt with already
                yield r
                continue
            if path is not None:
                raise ValueError(
                    "`create` can only handle single target path or dataset")
            path = r

        if len(annotated_paths) and path is None:
            # we got something, we complained already, done
            return

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # prep for yield
        path.update({'logger': lgr, 'type': 'dataset'})
        # just discard, we have a new story to tell
        path.pop('message', None)
        if 'parentds' in path:
            subs = Subdatasets.__call__(
                dataset=path['parentds'],
                # any known
                fulfilled=None,
                recursive=False,
                contains=path['path'],
                result_xfm='relpaths')
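            # `contains` limits the query to known subdatasets whose tree
            # would contain the candidate path, reported (via
            # result_xfm='relpaths') relative to the parent; any hit therefore
            # means the target collides with an already registered subdataset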
            if len(subs):
                path.update({
                    'status': 'error',
                    'message': ('collision with known subdataset %s/ '
                                'in dataset %s', subs[0], path['parentds'])
                })
                yield path
                return

        # TODO here we need a further test that if force=True, we need to look if
        # there is a superdataset (regardless of whether we want to create a
        # subdataset or not), and if that superdataset tracks anything within
        # this directory -- if so, we need to stop right here and whine, because
        # the result of creating a repo here will produce an undesired mess

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and dataset.path == path['path'] \
            else Dataset(path['path'])

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            path.update({
                'status': 'error',
                'message': 'will not create a dataset in a non-empty directory, '
                           'use `force` option to ignore'
            })
            yield path
            return

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(tbds.path, url=None, create=True, git_opts=git_opts)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(tbds.path,
                               url=None,
                               create=True,
                               backend=annex_backend,
                               version=annex_version,
                               description=description,
                               git_opts=git_opts,
                               annex_opts=annex_opts,
                               annex_init_opts=annex_init_opts)

            if text_no_annex:
                git_attributes_file = opj(tbds.path, '.gitattributes')
                with open(git_attributes_file, 'a') as f:
                    f.write('* annex.largefiles=(not(mimetype=text/*))\n')
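                    # the expression above marks only non-text files (by MIME
                    # type) as "large", i.e. to be annexed, so text files end
                    # up committed directly to git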
                tbrepo.add([git_attributes_file], git=True)
                tbrepo.commit("Instructed annex to add text files to git",
                              _datalad_msg=True,
                              files=[git_attributes_file])

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a re-create
            tbds.config.unset(id_var, where='dataset')
        tbds.config.add(id_var,
                        tbds.id if tbds.id is not None else
                        uuid.uuid1().urn.split(':')[-1],
                        where='dataset')
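        # uuid1().urn has the form 'urn:uuid:<uuid>'; splitting on ':' and
        # taking the last element keeps just the bare UUID string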

        # make sure that v6 annex repos never commit content under .datalad
        with open(opj(tbds.path, '.datalad', '.gitattributes'),
                  'a') as gitattr:
            # TODO this will need adjusting, when annex'ed aggregate meta data
            # comes around
            gitattr.write(
                '# Text files (according to file --mime-type) are added directly to git.\n'
            )
            gitattr.write(
                '# See http://git-annex.branchable.com/tips/largefiles/ for more info.\n'
            )
            gitattr.write('** annex.largefiles=nothing\n')
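            # 'annex.largefiles=nothing' means that no file under .datalad
            # ever qualifies as a "large file", so everything in there is
            # committed directly to git rather than annexed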

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.add('.datalad',
                 to_git=True,
                 save=save,
                 message='[DATALAD] new dataset')

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if save and isinstance(dataset, Dataset) and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(tbds.path,
                                 save=True,
                                 return_type='generator',
                                 result_filter=None,
                                 result_xfm=None,
                                 on_failure='ignore'):
                yield r

        path.update({'status': 'ok'})
        yield path
Exemple #33
0
def test_interactions(tdir):
    # Just a placeholder since constructor expects a repo
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')

    class FIFO(object):
        def __init__(self, content=None, default=None):
            """

            Parameters
            ----------
            content
            default
              If defined, will be the one returned if empty.
              If not defined -- would raise an Exception
            """
            self.content = content or []
            self.default = default

        def _pop(self):
            # serve the next queued line, skipping DEBUG output; when empty,
            # return `default` (usually an empty line signalling the end)
            if self.content:
                v = self.content.pop(0)
                # allow for debug
                if v.startswith('DEBUG '):
                    # next one
                    return self._pop()
                return v
            else:
                if self.default is not None:
                    return self.default
                else:
                    raise IndexError("we are empty")

        def write(self, l):
            self.content.append(l)

        def read(self):
            return self._pop()

        def readline(self):
            return self._pop().rstrip('\n')

        def flush(self):
            pass  # working hard
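
    # The two FIFO instances below stand in for the special remote's stdin and
    # stdout: scripted requests are pre-loaded into `fin`, AnnexCustomRemote
    # writes its replies to `fout`, and the test then replays both sides of
    # the conversation without talking to a real git-annex process.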

    # now we should test interactions
    import re
    ERROR_ARGS = re.compile(r'^ERROR .*(missing|takes) .*\d+ .*argument')
    for scenario in [
        [],  # default of doing nothing
        [  # support of EXPORT which by default is not supported
            ('EXPORTSUPPORTED', 'EXPORTSUPPORTED-FAILURE'),
        ],
        [  # some unknown option
            ('FANCYNEWOPTION', 'UNSUPPORTED-REQUEST'),
        ],
        [
            # query the COST etc., and make sure we do not fail right away
            # on an unsupported request
            ('FANCYNEWOPTION', 'UNSUPPORTED-REQUEST'),
            ('GETCOST', 'COST %d' % DEFAULT_COST),
            ('GETCOST roguearg', ERROR_ARGS),
            ('GETAVAILABILITY', 'AVAILABILITY %s' % DEFAULT_AVAILABILITY),
            # by default we do not require any fancy init
            ('INITREMOTE', 'INITREMOTE-SUCCESS'),
            # no urls supported by default
            ('CLAIMURL http://example.com', 'CLAIMURL-FAILURE'),
            # CLAIMURL expects just a single option (the url), so the full
            # remainder of the line is passed along and the claim simply fails
            ('CLAIMURL http://example.com roguearg', 'CLAIMURL-FAILURE'),
            # but if not enough params -- ERROR_ARGS
            ('CLAIMURL', ERROR_ARGS)
        ]
    ]:
        # The first reply is always the VERSION announcement, and a final
        # empty command signals the end of the transaction
        scenario = [(None, 'VERSION 1')] + scenario + [('', None)]
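        # e.g. for the empty default scenario the exchange reduces to: the
        # remote announces 'VERSION 1', then reads the terminating empty
        # command from fin and exits, leaving nothing but that announcement
        # in fout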
        fin, fout = FIFO(), FIFO(default='')
        for in_, out_ in scenario:
            if in_ is not None:
                fin.write(in_ + '\n')
        cr = AnnexCustomRemote(tdir, fin=fin, fout=fout)
        cr.main()
        for in_, out_ in scenario:
            if out_ is not None:
                out_read = fout.readline()
                if isinstance(out_, type(ERROR_ARGS)):
                    assert out_.match(out_read), (out_, out_read)
                else:
                    eq_(out_, out_read)
        out_read = fout.readline()
        eq_(out_read, '')  # nothing left to say