Example #1
def test_proxying_open_testrepobased(repo):
    TEST_CONTENT = "content to be annex-addurl'd"
    fname = 'test-annex.dat'
    fpath = opj(repo, fname)
    assert_raises(IOError, open, fpath)

    aio = AutomagicIO(activate=True)
    try:
        with swallow_outputs():
            # now we should be able to simply open this file
            with open(fpath) as f:
                content = f.read()
                eq_(content, TEST_CONTENT)
    finally:
        aio.deactivate()

    # and now that we have fetched it, nothing should forbid us to open it again
    with open(fpath) as f:
        eq_(f.read(), TEST_CONTENT)

    annex = AnnexRepo(repo, create=False)
    # Let's create another file deeper under the directory with the same content
    # so it would point to the same key, which we would drop and repeat the drill
    fpath2 = opj(repo, 'd1', 'd2', 'test2.dat')
    os.makedirs(dirname(fpath2))
    with open(fpath2, 'w') as f:
        f.write(content)
    annex.add(fpath2)
    annex.drop(fpath2)
    annex.commit("added and dropped")
    assert_raises(IOError, open, fpath2)

    # Let's use context manager form
    with AutomagicIO() as aio:
        ok_(isinstance(aio, AutomagicIO))
        ok_(aio.active)
        # swallowing output would cause trouble while testing with
        # DATALAD_ASSERT_NO_OPEN_FILES mode on.  It is not 100% clear why the
        # underlying git-annex process would dump to stdout or stderr
        #with swallow_outputs():

        # now we should be able to simply open this file
        with open(fpath2) as f:
            content = f.read()
            eq_(content, TEST_CONTENT)

    annex.drop(fpath2)
    assert_raises(IOError, open, fpath2)

    # Let's use relative path
    with chpwd(opj(repo, 'd1')):
        # Let's use context manager form
        with AutomagicIO() as aio, \
                swallow_outputs(), \
                open(opj('d2', 'test2.dat')) as f:
            content = f.read()
            eq_(content, TEST_CONTENT)
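
A minimal sketch of the proxying-open pattern the test above exercises, assuming AutomagicIO is importable from datalad.auto as in this test suite; while active, it intercepts open() and fetches annexed content on demand:

from datalad.auto import AutomagicIO

def read_annexed(fpath):
    # while the context is active, AutomagicIO proxies open() and fetches
    # the annexed content on demand before the read happens
    with AutomagicIO():
        with open(fpath) as f:
            return f.read()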
Example #2
def test_get_contentlocation(tdir):
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')

    key = repo.get_file_key('file.dat')
    cr = AnnexCustomRemote(tdir)
    key_path = cr.get_contentlocation(key, absolute=False)
    assert not isabs(key_path)
    key_path_abs = cr.get_contentlocation(key, absolute=True)
    assert isabs(key_path_abs)
    assert cr._contentlocations == {key: key_path}
    repo.drop('file.dat', options=['--force'])
    assert not cr.get_contentlocation(key, absolute=True)
Example #3
def test_get_contentlocation(tdir=None):
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')

    # TODO contentlocation would come with eval_availability=True
    key = repo.get_file_annexinfo('file.dat')['key']
    cr = ArchiveAnnexCustomRemote(None, path=tdir)
    key_path = cr.get_contentlocation(key, absolute=False)
    assert not isabs(key_path)
    key_path_abs = cr.get_contentlocation(key, absolute=True)
    assert isabs(key_path_abs)
    assert cr._contentlocations == {key: key_path}
    repo.drop('file.dat', options=['--force'])
    assert not cr.get_contentlocation(key, absolute=True)
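
This variant targets a newer DataLad API. A hedged sketch of the key-lookup change relative to the earlier examples (which call is available depends on the installed DataLad version):

# older AnnexRepo API, as in the previous examples:
key = repo.get_file_key('file.dat')
# newer AnnexRepo API, as in this example; 'key' holds the same annex key:
key = repo.get_file_annexinfo('file.dat')['key']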
Example #4
def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'), commit=True)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(
                new_level=logging.INFO) as log, swallow_outputs() as cmo:
            fs = fs_traverse(topdir,
                             AnnexRepo(topdir),
                             recurse_directories=recursive,
                             json='display')
            if recursive:
                # fs_traverse logs should contain all non-ignored subdirectories
                for subdir in [
                        opj(topdir, 'dir'),
                        opj(topdir, 'dir', 'subdir')
                ]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout should mention both the subdirectory
                # and its file
                assert_in('file2.txt', cmo.out)
                assert_in('dir', cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # in non-recursive mode the size of a 'dir' child defaults to
            # 0 Bytes, since there is no metadata file yet from which the
            # traverser could pick up its size; that would require a recursive
            # traversal with writing of child metadata files
            assert_equal(child['size']['total'], {
                True: '6 Bytes',
                False: '0 Bytes'
            }[recursive])

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        fs = fs_traverse(topdir,
                         AnnexRepo(topdir),
                         recurse_directories=recursive,
                         json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # neither 'gitdir' nor 'annexdir' should be included among the nodes
        assert_equal([
            item
            for item in fs['nodes'] if item['name'] in ('gitdir', 'annexdir')
        ], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # the same node size when fs_traverse runs in recursive and then
        # non-recursive mode verifies that the child's metadata file is used
        # to find its size; the reverse order (non-recursive, then recursive)
        # would give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        # In current RF 'nodes' are stripped away during recursive traversal
        # for now... later we might reincarnate them "differently"
        # TODO!
        if False:  # recursive:
            # sub-dictionary should include neither git nor hidden directory info
            assert_equal([
                item for item in child['nodes']
                if item['name'] in ('subgit', '.fgit')
            ], [])
            # extract subdirectory dictionary, else fail
            subchild = [
                subitem for subitem in child["nodes"]
                if subitem['name'] == 'subdir'
            ][0]
            # extract info of file1.txt, else fail
            link = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file1.txt'
            ][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file2.txt'
            ][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
Example #5
def test_ls_json(topdir):
    annex = AnnexRepo(topdir, create=True)
    dsj = Dataset(topdir)
    # create some file and commit it
    with open(opj(dsj.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    dsj.add(path='subdsfile.txt')
    dsj.save("Hello!", version_tag=1)

    # add a subdataset
    dsj.install('subds', source=topdir)

    subdirds = dsj.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'),
            commit=True)  # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'),
              commit=True)  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'),
               options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted only
                # when recursive=True
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(topdir,
                               json=state,
                               all_=all_,
                               recursive=recursive)

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata jsons created and deleted only when all_=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden', ), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check that the size is updated in its 'nodes' sublist too
                # (used by the web-UI json; regression test)
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of the annexed file in the top dataset
                subds = [
                    item for item in dsj['nodes']
                    if item['name'] == 'subdsfile.txt'
                ][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                # and not in topds
                assert_not_in('subds2', topds_nodes)

                # run a non-recursive traversal after the subdataset metadata
                # was already created, to verify that sub-dataset metadata is
                # picked up from its metadata file in that case
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == 'subdsfile.txt'
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
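
The helpers above encode the metadata-file naming scheme this test relies on: the md5 digest of a node's relative path, stored under .git/datalad/metadata, with the dataset root addressed as '/'. A standalone hedged sketch of that scheme (the function name here is illustrative):

import hashlib
from os.path import join as opj

META_DIR = opj('.git', 'datalad', 'metadata')

def metadata_file(ds_path, *rel_path):
    # the dataset root is addressed by the literal path '/'
    rel = rel_path or ('/',)
    digest = hashlib.md5(opj(*rel).encode('utf-8')).hexdigest()
    return opj(ds_path, META_DIR, digest)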
Example #6
def test_ls_json(topdir):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)
    # add a subdataset
    ds.install('subds', source=topdir)

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'),
            commit=True)  # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'),
              commit=True)  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'),
               options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    for all_ in [True, False]:
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                with swallow_logs(), swallow_outputs():
                    ds = _ls_json(topdir,
                                  json=state,
                                  all_=all_,
                                  recursive=recursive)

                # subdataset should have its json created and deleted only when recursive=True
                subds_metahash = get_metahash('/')
                subds_metapath = opj(topdir, 'subds', meta_dir, subds_metahash)
                assert_equal(exists(subds_metapath),
                             (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metahash = get_metahash('/')
                ds_metapath = opj(meta_path, ds_metahash)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata jsons created and deleted only when all_=True
                child_metahash = get_metahash('dir', 'subdir')
                child_metapath = opj(meta_path, child_metahash)
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    child_metahash = get_metahash(*subdir)
                    assert_equal(exists(opj(meta_path, child_metahash)), False)

                # check that the size is updated in its 'nodes' sublist too
                # (used by the web-UI json; regression test)
                assert_equal(ds['nodes'][0]['size']['total'],
                             ds['size']['total'])

                # check size of the annexed file in the top dataset
                subds = [
                    item for item in ds['nodes']
                    if item['name'] == 'subdsfile.txt'
                ][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # run a non-recursive traversal after the subdataset metadata
                # was already created, to verify that sub-dataset metadata is
                # picked up from its metadata file in that case
                if state == 'file' and recursive and not all_:
                    ds = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in ds['nodes']
                        if item['name'] == 'subdsfile.txt'
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
Example #7
def test_ls_json(topdir, topurl):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.save(path='subdsfile.txt', message="Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.save('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit',
                'fgit.txt'))  # add to git; committed next to init the repo
    git.commit()
    annex.add(opj(topdir, 'dir',
                  'subgit'))  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'),
               options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')  # add again within the subgit repo
    git.commit()
    # annex.add doesn't add a submodule, so use ds.save
    ds.save(opj('dir', 'subgit'))  # add the non-dataset git repo to annex
    ds.save('dir')  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule  by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    WitlessRunner(cwd=opj(topdir, 'dir', 'subgit')).run(
        ['git', 'update-server-info'])
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file('fromweb',
                            topurl + '/noteventhere',
                            options=['--relaxed'])

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted only
                # when recursive=True
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(topdir,
                               json=state,
                               all_=all_,
                               recursive=recursive)
                ok_startswith(dsj['tags'], '1-')

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata jsons created and deleted only when all_=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden', ), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check that the size is updated in its 'nodes' sublist too
                # (used by the web-UI json; regression test)
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of the annexed file in the top dataset
                subds = [
                    item for item in dsj['nodes']
                    if item['name'] == 'subdsfile.txt'
                ][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                # and not in topds
                assert_not_in('subds2', topds_nodes)

                # run a non-recursive traversal after the subdataset metadata
                # was already created, to verify that sub-dataset metadata is
                # picked up from its metadata file in that case
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == 'subdsfile.txt'
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

                assert_equal(topds_nodes['fromweb']['size']['total'],
                             UNKNOWN_SIZE)
Example #8
def test_add_archive_content(path_orig, url, repo_path):
    with chpwd(repo_path):
        # TODO we need to be able to pass path into add_archive_content
        # We could mock but I mean for the API
        assert_raises(RuntimeError, add_archive_content,
                      "nonexisting.tar.gz")  # no repo yet

        repo = AnnexRepo(repo_path, create=True)
        assert_raises(ValueError, add_archive_content, "nonexisting.tar.gz")
        # we can't add a file from outside the repo ATM
        assert_raises(FileNotInRepositoryError, add_archive_content,
                      opj(path_orig, '1.tar.gz'))

        # Let's add the first archive to the repo so we can test with it
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')],
                          options=["--pathdepth", "-1"])
            for s in range(1, 5):
                repo.add_urls([opj(url, '%du/1.tar.gz' % s)],
                              options=["--pathdepth", "-2"])
        repo.commit("added 1.tar.gz")

        key_1tar = repo.get_file_key(
            '1.tar.gz')  # will be used in the test later

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        repo_ = add_archive_content('1.tar.gz')
        eq_(repo.path, repo_.path)
        d1_basic_checks()

        # If run again, it should proceed just fine since the content is the same, so no changes would actually be made
        add_archive_content('1.tar.gz')

        # But that other one carries an updated file, so it should fail due to the overwrite check
        with assert_raises(RuntimeError) as cme:
            add_archive_content(opj('1u', '1.tar.gz'), use_current_dir=True)

        # TODO: somewhat imprecise, since there are two possible "already exists"
        # sources -- the caching and the overwrite check
        assert_in("already exists", str(cme.exception))
        # but should do fine if overrides are allowed
        add_archive_content(opj('1u', '1.tar.gz'),
                            existing='overwrite',
                            use_current_dir=True)
        add_archive_content(opj('2u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('3u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('4u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert (all([x == whereis[0] for x in whereis[1:]]))

    # and we should be able to reference it while under a subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'), use_current_dir=True)
        d1_basic_checks()
        # or we could keep the relative path and also demand to keep the archive
        # prefix while extracting under the original (annex root) directory
        add_archive_content(opj(pardir, '1.tar.gz'),
                            add_archive_leading_dir=True)

    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        add_archive_content('1.tar.gz',
                            exclude=['d'],
                            rename=['/ /_', '/^1/2'],
                            annex_options="-c annex.largefiles=exclude=*.txt",
                            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add an archive to the repo in this subdirectory -- named the
        # same way, but with different content
        with swallow_outputs():
            repo.add_urls([opj(url, 'd1', '1.tar.gz')],
                          options=["--pathdepth", "-1"],
                          cwd=getpwd())  # invoke under current subdir
        repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz')
        d2_basic_checks()

    # in manual tests we ran into a situation where a file from an archive
    # whose key had been dropped could not be obtained in a single run.  This
    # was thought to be covered by the custom remote tests, but apparently not
    # sufficiently well
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # this is what used to fail

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=repo.path)
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if the archive key was dropped and the online archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))
    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
    # these assertions must live outside the `with` block -- drop() raises,
    # so nothing after it inside the block would ever run
    assert_equal(e.exception.kwargs['stdout_json'][0]['success'], False)
    assert_result_values_cond(
        e.exception.kwargs['stdout_json'], 'note', lambda x:
        '(Use --force to override this check, or adjust numcopies.)' in x)
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))
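
A minimal sketch of the workflow this test exercises, assuming add_archive_content is importable from datalad.api and the current directory is within an annex repo; the archive stays registered, so dropped content can be re-extracted from it later:

from datalad.api import add_archive_content

# extract the archive's files into the annex; keep the archive itself so
# its content can be re-obtained after a drop
add_archive_content('1.tar.gz', existing='archive-suffix', delete=False)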
Example #9
def test_check_dates(path):
    refdate = 1218182889

    with set_date(refdate - 1):
        ar = AnnexRepo(path, create=True)

        def tag_object(tag):
            """Return object for tag.  Do not dereference it.
            """
            # We can't use ar.get_tags because that returns the commit's hexsha,
            # not the tag's, and ar.get_hexsha is limited to commit objects.
            return ar.call_git_oneline(
                ["rev-parse", "refs/tags/{}".format(tag)], read_only=True)

        ar.add("foo")
        ar.commit("add foo")
        foo_commit = ar.get_hexsha()
        ar.commit("add foo")
        ar.tag("foo-tag", "tag before refdate")
        foo_tag = tag_object("foo-tag")
        # Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
        ar.tag("light")
    with set_date(refdate + 1):
        ar.add("bar")
        ar.commit("add bar")
        bar_commit = ar.get_hexsha()
        ar.tag("bar-tag", "tag after refdate")
        bar_tag = tag_object("bar-tag")
    with set_date(refdate + 2):
        # Drop an annexed file so that we have more blobs in the git-annex
        # branch than its current tree.
        ar.drop("bar", options=["--force"])

    results = {}
    for which in ["older", "newer"]:
        result = check_dates(ar, refdate, which=which)["objects"]
        ok_(result)
        if which == "newer":
            assert_in(bar_commit, result)
            assert_not_in(foo_commit, result)
            assert_in(bar_tag, result)
        elif which == "older":
            assert_in(foo_commit, result)
            assert_not_in(bar_commit, result)
            assert_in(foo_tag, result)
        results[which] = result

    ok_(any(
        x.get("filename") == "uuid.log" for x in results["older"].values()))

    newer_tree = check_dates(ar, refdate, annex="tree")["objects"]

    def is_annex_log_blob(entry):
        return (entry["type"] == "annex-blob"
                and entry["filename"].endswith(".log"))

    def num_logs(entries):
        return sum(map(is_annex_log_blob, entries.values()))

    # Because we dropped bar above, we should have one more blob in the
    # git-annex branch than in the current tree of the git-annex branch.
    eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)

    # Act like today is one day from the reference timestamp to check that we
    # get the same results with the one-day-back default.
    seconds_in_day = 60 * 60 * 24
    with patch('time.time', return_value=refdate + seconds_in_day):
        assert_equal(check_dates(ar, annex="tree")["objects"], newer_tree)

    # We can give a path (str) instead of a GitRepo object.
    assert_equal(
        check_dates(path, refdate, annex="tree")["objects"], newer_tree)

    with assert_raises(ValueError):
        check_dates(ar, refdate, which="unrecognized")
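
A minimal sketch of the check_dates API this test exercises, assuming it lives in datalad.support.repodates as this test suite's imports suggest; the report maps object hashes to info dicts:

from datalad.support.repodates import check_dates

refdate = 1218182889
# report objects (commits, tags, annex blobs) with timestamps after refdate
report = check_dates("/path/to/repo", refdate, which="newer", annex="tree")
for obj_hash, info in report["objects"].items():
    print(obj_hash, info["type"])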
Example #10
def test_ls_json(topdir):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)
    # add a subdataset
    ds.install('subds', source=topdir)

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)                    # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'), commit=True)              # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'), commit=True)                        # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)                                  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    for all_ in [True, False]:
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                with swallow_logs(), swallow_outputs():
                    ds = _ls_json(topdir, json=state, all_=all_, recursive=recursive)

                # subdataset should have its json created and deleted only when recursive=True
                subds_metahash = get_metahash('/')
                subds_metapath = opj(topdir, 'subds', meta_dir, subds_metahash)
                assert_equal(exists(subds_metapath), (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metahash = get_metahash('/')
                ds_metapath = opj(meta_path, ds_metahash)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata jsons created and deleted only when all_=True
                child_metahash = get_metahash('dir', 'subdir')
                child_metapath = opj(meta_path, child_metahash)
                assert_equal(exists(child_metapath), (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    child_metahash = get_metahash(*subdir)
                    assert_equal(exists(opj(meta_path, child_metahash)), False)

                # check that the size is updated in its 'nodes' sublist too
                # (used by the web-UI json; regression test)
                assert_equal(ds['nodes'][0]['size']['total'], ds['size']['total'])

                # check size of the annexed file in the top dataset
                subds = [item for item in ds['nodes'] if item['name'] == 'subdsfile.txt'][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # run a non-recursive traversal after the subdataset metadata
                # was already created, to verify that sub-dataset metadata is
                # picked up from its metadata file in that case
                if state == 'file' and recursive and not all_:
                    ds = _ls_json(topdir, json='file', all_=False)
                    subds = [item for item in ds['nodes'] if item['name'] == 'subdsfile.txt'][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
Example #11
def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'), commit=True)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(new_level=logging.INFO) as log, swallow_outputs() as cmo:
            fs = fs_traverse(topdir, AnnexRepo(topdir), recursive=recursive, json='display')
            if recursive:
                # fs_traverse logs should contain all non-ignored subdirectories
                for subdir in [opj(topdir, 'dir'), opj(topdir, 'dir', 'subdir')]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout should mention both the subdirectory
                # and its file
                assert_in('file2.txt', cmo.out)
                assert_in('dir', cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # in non-recursive mode the size of a 'dir' child defaults to
            # 0 Bytes, since there is no metadata file yet from which the
            # traverser could pick up its size; that would require a recursive
            # traversal with writing of child metadata files
            assert_equal(child['size']['total'], {True: '6 Bytes', False: '0 Bytes'}[recursive])

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        fs = fs_traverse(topdir, AnnexRepo(topdir), recursive=recursive, json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # neither 'gitdir' nor 'annexdir' should be included among the nodes
        assert_equal([item for item in fs['nodes'] if item['name'] in ('gitdir', 'annexdir')], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # the same node size when fs_traverse runs in recursive and then
        # non-recursive mode verifies that the child's metadata file is used
        # to find its size; the reverse order (non-recursive, then recursive)
        # would give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        if recursive:
            # sub-dictionary should include neither git nor hidden directory info
            assert_equal([item for item in child['nodes'] if item['name'] in ('subgit', '.fgit')], [])
            # extract subdirectory dictionary, else fail
            subchild = [subitem for subitem in child["nodes"] if subitem['name'] == 'subdir'][0]
            # extract info of file1.txt, else fail
            link = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file1.txt'][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file2.txt'][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
Example #12
def test_check_dates(path):
    refdate = 1218182889

    with set_date(refdate - 1):
        ar = AnnexRepo(path, create=True)
        ar.add("foo")
        ar.commit("add foo")
        foo_commit = ar.get_hexsha()
        ar.commit("add foo")
        ar.tag("foo-tag", "tag before refdate")
        # We can't use ar.get_tags because that returns the commit's hexsha,
        # not the tag's, and ar.get_hexsha is limited to commit objects.
        foo_tag = ar.repo.git.rev_parse("foo-tag")
        # Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
        ar.tag("light")
    with set_date(refdate + 1):
        ar.add("bar")
        ar.commit("add bar")
        bar_commit = ar.get_hexsha()
        ar.tag("bar-tag", "tag after refdate")
        bar_tag = ar.repo.git.rev_parse("bar-tag")
    with set_date(refdate + 2):
        # Drop an annexed file so that we have more blobs in the git-annex
        # branch than its current tree.
        ar.drop("bar", options=["--force"])

    results = {}
    for which in ["older", "newer"]:
        result = check_dates(ar, refdate, which=which)["objects"]
        ok_(result)
        if which == "newer":
            assert_in(bar_commit, result)
            assert_not_in(foo_commit, result)
            assert_in(bar_tag, result)
        elif which == "older":
            assert_in(foo_commit, result)
            assert_not_in(bar_commit, result)
            assert_in(foo_tag, result)
        results[which] = result

    ok_(any(x.get("filename") == "uuid.log"
            for x in results["older"].values()))

    newer_tree = check_dates(ar, refdate, annex="tree")["objects"]

    def is_annex_log_blob(entry):
        return (entry["type"] == "annex-blob"
                and entry["filename"].endswith(".log"))

    def num_logs(entries):
        return sum(map(is_annex_log_blob, entries.values()))

    # Because we dropped bar above, we should have one more blob in the
    # git-annex branch than in the current tree of the git-annex branch.
    eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)

    # Act like today is one day from the reference timestamp to check that we
    # get the same results with the one-day-back default.
    seconds_in_day = 60 * 60 * 24
    with patch('time.time', return_value=refdate + seconds_in_day):
        assert_equal(check_dates(ar, annex="tree")["objects"],
                     newer_tree)

    # We can give a path (str) instead of a GitRepo object.
    assert_equal(check_dates(path, refdate, annex="tree")["objects"],
                 newer_tree)

    with assert_raises(ValueError):
        check_dates(ar, refdate, which="unrecognized")
Example #13
def test_ls_json(topdir, topurl):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)                    # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'))                           # add to git; committed next to init the repo
    git.commit()
    annex.add(opj(topdir, 'dir', 'subgit'))                                     # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))                                               # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')              # add again within the subgit repo
    git.commit()
    # annex.add doesn't add submodule, so using ds.add
    ds.add(opj('dir', 'subgit'))                        # add the non-dataset git repo to annex
    ds.add('dir')                                  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule  by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit'))
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file(
        'fromweb', topurl + '/noteventhere', options=['--relaxed'])

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted only
                # when recursive=True
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(
                    topdir,
                    json=state,
                    all_=all_,
                    recursive=recursive
                )
                ok_startswith(dsj['tags'], '1-')

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata jsons created and deleted only when all_=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath), (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check that the size is updated in its 'nodes' sublist too
                # (used by the web-UI json; regression test)
                assert_equal(dsj['nodes'][0]['size']['total'], dsj['size']['total'])

                # check size of the annexed file in the top dataset
                subds = [item for item in dsj['nodes'] if item['name'] == 'subdsfile.txt'][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                # and not in topds
                assert_not_in('subds2', topds_nodes)

                # run a non-recursive traversal after the subdataset metadata
                # was already created, to verify that sub-dataset metadata is
                # picked up from its metadata file in that case
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == 'subdsfile.txt'
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

                assert_equal(
                    topds_nodes['fromweb']['size']['total'], UNKNOWN_SIZE
                )