Example #1
def test_uninstall_git_file(path):
    ds = Dataset(path)
    ok_(ds.is_installed())
    ok_(exists(opj(path, 'INFO.txt')))
    ok_file_under_git(ds.repo.path, 'INFO.txt')

    # drop file in Git in an annex repo
    # regardless of the type of repo this is 'notneeded'...
    # it is less about education than about "can we get
    # the content back?", and for a file in Git we can
    assert_result_count(
        ds.drop(path='INFO.txt'),
        1,
        status='notneeded',
        message="no annex'ed content")

    res = ds.uninstall(path="INFO.txt", on_failure='ignore')
    assert_result_count(
        res, 1,
        status='impossible',
        message='can only uninstall datasets (consider the `drop` command)')

    # remove the file:
    res = ds.remove(path='INFO.txt', result_xfm='paths',
                    result_filter=lambda x: x['action'] == 'remove')
    assert_raises(AssertionError, ok_file_under_git, ds.repo.path, 'INFO.txt')
    ok_(not exists(opj(path, 'INFO.txt')))
    eq_(res, ['INFO.txt'])
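The assertions above rely on two separate properties of a file: whether it is tracked by git at all, and whether its content is managed by git-annex. As a rough, hypothetical sketch (not DataLad's actual ok_file_under_git implementation), the same two checks could be expressed with plain git/git-annex command-line calls; the helper names below are made up for illustration:

import subprocess

def file_under_git(repo_path, filename):
    # Hypothetical sketch: is the file tracked by git in repo_path at all?
    res = subprocess.run(
        ["git", "-C", repo_path, "ls-files", "--error-unmatch", filename],
        capture_output=True)
    return res.returncode == 0

def file_is_annexed(repo_path, filename):
    # Hypothetical sketch: does the file carry a git-annex key, i.e. is its
    # content managed by annex rather than stored directly in git?
    res = subprocess.run(
        ["git", "-C", repo_path, "annex", "lookupkey", filename],
        capture_output=True)
    return res.returncode == 0

For a file committed directly to git, such as INFO.txt above, the first check would succeed while the second would fail, which is the distinction the test leans on when it expects drop to report "no annex'ed content".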
Example #2
def _test_target_ssh_inherit(standardgroup, src_path, target_path):
    ds = Dataset(src_path).create()
    target_url = 'localhost:%s' % target_path
    remote = "magical"
    # for the test of setting a group, we will just smoke test using the current
    # user's group
    ds.create_sibling(target_url, name=remote, shared='group', group=os.getgid())  # not doing recursively
    if standardgroup:
        ds.repo.set_preferred_content('wanted', 'standard', remote)
        ds.repo.set_preferred_content('group', standardgroup, remote)
    ds.publish(to=remote)

    # now a month later we created a new subdataset
    subds = ds.create('sub')  # so now we got a hierarchy!
    create_tree(subds.path, {'sub.dat': 'lots of data'})
    subds.add('sub.dat')
    ok_file_under_git(subds.path, 'sub.dat', annexed=True)

    target_sub = Dataset(opj(target_path, 'sub'))
    # since we do not yet have (and thus have not used) an option to record publishing
    # to that sibling by default (e.g. --set-upstream), running just ds.publish
    # should fail
    assert_result_count(
        ds.publish(on_failure='ignore'),
        1,
        status='impossible',
        message='No target sibling configured for default publication, please specific via --to')
    ds.publish(to=remote)  # should be ok, non-recursive; BUT it (git or us?) would
                           # create an empty sub/ directory
    ok_(not target_sub.is_installed())  # still not there
    res = ds.publish(to=remote, recursive=True, on_failure='ignore')
    assert_result_count(res, 2)
    assert_status(('error', 'notneeded'), res)
    assert_result_count(
        res, 1,
        status='error',
        message=("Unknown target sibling '%s' for publication", 'magical'))
    ds.publish(to=remote, recursive=True, missing='inherit')
    # we added the remote and set all the preferred content settings
    eq_(subds.repo.get_preferred_content('wanted', remote), 'standard' if standardgroup else '')
    eq_(subds.repo.get_preferred_content('group', remote), standardgroup or '')

    ok_(target_sub.is_installed())  # it is there now
    eq_(target_sub.repo.config.get('core.sharedrepository'), '1')
    # and we have transferred the content
    if standardgroup and standardgroup == 'backup':
        # only then content should be copied
        ok_file_has_content(opj(target_sub.path, 'sub.dat'), 'lots of data')
    else:
        # otherwise nothing is copied by default
        assert_false(target_sub.repo.file_has_content('sub.dat'))
Example #3
def test_gh1597_simpler(path):
    ds = Dataset(path).create()
    # same goes for .gitattributes
    with open(opj(ds.path, '.gitignore'), 'a') as f:
        f.write('*.swp\n')
    ds.add('.gitignore')
    ok_clean_git(ds.path)
    ok_file_under_git(ds.path, '.gitignore', annexed=False)
    # put .gitattributes in some subdir and add all, should also go into Git
    os.makedirs(op.join(ds.path, 'subdir'))
    attrfile = op.join(ds.path, 'subdir', '.gitattributes')
    with open(attrfile, 'a') as f:
        f.write('# just a comment\n')
    ds.add('.')
    ok_clean_git(ds.path)
    ok_file_under_git(ds.path, op.relpath(attrfile, start=ds.path), annexed=False)
Example #4
def test_gh1597(path):
    ds = Dataset(path).create()
    sub = ds.create('sub', save=False)
    # only staged at this point, but known, and not annexed
    ok_file_under_git(ds.path, '.gitmodules', annexed=False)
    res = ds.subdatasets()
    assert_result_count(res, 1, path=sub.path)
    # now modify .gitmodules with another command
    ds.subdatasets(contains=sub.path, set_property=[('this', 'that')])
    ok_clean_git(ds.path, index_modified=['sub'])
    # now modify low-level
    with open(opj(ds.path, '.gitmodules'), 'a') as f:
        f.write('\n')
    ok_clean_git(ds.path, index_modified=['.gitmodules', 'sub'])
    ds.add('.gitmodules')
    # must not come under annex management
    ok_file_under_git(ds.path, '.gitmodules', annexed=False)
Example #5
def test_create_text_no_annex(path):
    ds = create(path, text_no_annex=True)
    ok_clean_git(path)
    import re
    ok_file_has_content(
        _path_(path, '.gitattributes'),
        content=r'\* annex\.largefiles=\(not\(mimetype=text/\*\)\)',
        re_=True,
        match=False,
        flags=re.MULTILINE
    )
    # and check that it is really committing text files to git and binaries
    # to annex
    create_tree(path,
        {
            't': 'some text',
            'b': ''  # empty file is not considered to be a text file
                     # should we adjust the rule to consider only non-empty files?
        }
    )
    ds.add(['t', 'b'])
    ok_file_under_git(path, 't', annexed=False)
    ok_file_under_git(path, 'b', annexed=True)
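A cruder heuristic for the last two assertions (a sketch, assuming a plain indirect-mode annex repository on a POSIX filesystem, i.e. no adjusted/unlocked branches): annexed files show up in the working tree as symlinks pointing into .git/annex/objects, while files committed to git are regular files. The helper below is hypothetical and not how DataLad implements its check:

import os

def looks_annexed(repo_path, relpath):
    # Hypothetical heuristic: annexed content is reached via a symlink into
    # .git/annex/objects (valid only for indirect-mode, non-adjusted repos).
    full = os.path.join(repo_path, relpath)
    return os.path.islink(full) and 'annex/objects' in os.readlink(full)

Under those assumptions one would expect looks_annexed(path, 'b') to be True (the empty, non-text file went to annex) and looks_annexed(path, 't') to be False (the text file was committed directly to git).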
Example #6
def test_uninstall_annex_file(path):
    ds = Dataset(path)
    ok_(ds.is_installed())
    ok_file_under_git(ds.repo.path, 'test-annex.dat', annexed=True)
    ds.repo.get('test-annex.dat')
    ok_(ds.repo.file_has_content('test-annex.dat'))

    # remove file's content:
    res = ds.drop(path='test-annex.dat', result_xfm='paths')
    # test it happened:
    ok_(not ds.repo.file_has_content('test-annex.dat'))
    ok_file_under_git(ds.repo.path, 'test-annex.dat', annexed=True)
    # test result:
    eq_(res, [opj(ds.path, 'test-annex.dat')])

    ds.repo.get('test-annex.dat')

    # remove file:
    ds.remove(path='test-annex.dat')
    assert_raises(AssertionError, ok_file_under_git, ds.repo.path, 'test-annex.dat',
                  annexed=True)
    assert_raises(AssertionError, ok_file_under_git, ds.repo.path, 'test-annex.dat',
                  annexed=False)
    ok_(not exists(opj(path, 'test-annex.dat')))
Example #7
def _test_BasicAnnexTestRepo(repodir):
    trepo = BasicAnnexTestRepo(repodir)
    trepo.create()
    assert_repo_status(trepo.path)
    ok_file_under_git(trepo.path, 'test.dat')
    ok_file_under_git(trepo.path, 'INFO.txt')
    ok_file_under_git(trepo.path, 'test-annex.dat', annexed=True)
    ok_(trepo.repo.file_has_content('test-annex.dat') is False)
    with swallow_outputs():
        trepo.repo.get('test-annex.dat')
    ok_(trepo.repo.file_has_content('test-annex.dat'))
Example #8
def test_crawl_autoaddtext(ind, topurl, outd):
    ds = create(outd, text_no_annex=True)
    with chpwd(outd):  # TODO -- dataset argument
        crawl_init({
            'url': topurl,
            'a_href_match_': '.*'
        },
                   save=True,
                   template='simple_with_archives')
        crawl()
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)
Example #9
def test_add_dir_file(repo_path, p, topurl):
    # test when a file becomes a directory and then turns back into a file.  Should all work!
    annex = Annexificator(path=repo_path, auto_finalize=False)
    url = "%s/file" % topurl

    path1 = opj(repo_path, 'd')
    data1 = {'filename': 'd', 'url': url}
    out1 = list(annex(data1))

    # becomes a directory which carries a file
    data2 = {'filename': 'f', 'url': url, 'path': 'd'}
    # but since we didn't commit previous file yet -- should puke!
    assert_raises(RuntimeError, list, annex(data2))
    list(annex.finalize()({}))  # so it gets committed
    ok_file_under_git(path1, annexed=True)

    # and after that it should proceed normally
    #import pdb; pdb.set_trace()
    out2 = list(annex(data2))
    path2 = opj(repo_path, 'd', 'f')
    ok_(exists(path2))

    # tricky one -- becomes a file again... what if the repo was dirty and files under the dir were staged? TODO
    assert_raises(RuntimeError, list, annex(data1))
    list(annex.finalize()({}))  # so it gets committed
    ok_file_under_git(path2, annexed=True)

    list(annex(data1))
    list(annex.finalize()({}))  # so it gets committed
    ok_file_under_git(path1, annexed=True)

    # with auto_finalize (default) it should go smoother ;)
    annex = Annexificator(path=repo_path)
    list(annex(data2))
    # wouldn't happen without explicit finalize to commit whatever new is staged
    # ok_file_under_git(path2, annexed=True)
    list(annex(data1))
    list(annex.finalize()({}))  # so it gets committed
    ok_file_under_git(path1, annexed=True)
Example #10
def test_add_mimetypes(path):
    # XXX apparently there is symlink dereferencing going on while deducing repo
    #    type there!!!! so can't use the following invocation  -- TODO separately
    import os
    path = os.path.realpath(path)  # yoh gives up for now
    ds = Dataset(path).create(force=True)
    ds.repo.add('.gitattributes')
    ds.repo.commit('added attributes to git explicitly')
    # now test that those files will go into git/annex correspondingly
    __not_tested__ = ds.add(['file.txt', 'empty'])
    ok_clean_git(path, untracked=['file2.txt'])
    # Empty one is considered to be application/octet-stream, i.e. non-text
    ok_file_under_git(path, 'empty', annexed=True)
    ok_file_under_git(path, 'file.txt', annexed=False)

    # But we should be able to force adding file to annex when desired
    ds.add('file2.txt', to_git=False)
    ok_file_under_git(path, 'file2.txt', annexed=True)
Example #12
def ok_file_under_git_kludge(path, basename):
    ok_file_under_git(opj(op.realpath(path), basename), annexed=True)
Example #13
def test_BasicGitTestRepo(path):
    trepo = BasicGitTestRepo(path)
    trepo.create()
    assert_repo_status(trepo.path, annex=False)
    ok_file_under_git(trepo.path, 'test.dat')
    ok_file_under_git(trepo.path, 'INFO.txt')
Example #15
def _test_target_ssh_inherit(standardgroup, ui, src_path, target_path):
    ds = Dataset(src_path).create()
    target_url = 'localhost:%s' % target_path
    remote = "magical"
    # for the test of setting a group, we will just smoke test using the current
    # user's group
    ds.create_sibling(target_url, name=remote, shared='group', group=os.getgid(), ui=ui)  # not doing recursively
    if standardgroup:
        ds.repo.set_preferred_content('wanted', 'standard', remote)
        ds.repo.set_preferred_content('group', standardgroup, remote)
    ds.publish(to=remote)

    # now a month later we created a new subdataset... a few nested ones.
    # A known hiccup happened when there
    # is also a subsub ds added - we might incorrectly traverse and not prepare
    # sub first for subsub to inherit etc
    parent_ds = ds
    subdss = []
    nlevels = 2  # gets slow: 1 - 43 sec, 2 - 49 sec , 3 - 69 sec
    for levels in range(nlevels):
        subds = parent_ds.create('sub')
        create_tree(subds.path, {'sub.dat': 'lots of data'})
        parent_ds.save('sub', recursive=True)
        ok_file_under_git(subds.path, 'sub.dat', annexed=True)
        parent_ds = subds
        subdss.append(subds)

    target_subdss = [
        Dataset(opj(*([target_path] + ['sub'] * (i+1))))
        for i in range(nlevels)
    ]
    # since we do not yet have (and thus have not used) an option to record publishing
    # to that sibling by default (e.g. --set-upstream), running just ds.publish
    # should fail
    assert_result_count(
        ds.publish(on_failure='ignore'),
        1,
        status='impossible',
        message='No target sibling configured for default publication, please specific via --to')
    ds.publish(to=remote)  # should be ok, non-recursive; BUT it (git or us?) would
                           # create an empty sub/ directory
    assert_postupdate_hooks(target_path, installed=ui)
    for target_sub in target_subdss:
        ok_(not target_sub.is_installed())  # still not there
    res = ds.publish(to=remote, recursive=True, on_failure='ignore')
    assert_result_count(res, 1 + len(subdss))
    assert_status(('error', 'notneeded'), res)
    assert_result_count(
        res, len(subdss),
        status='error',
        message=("Unknown target sibling '%s' for publication", 'magical'))

    # Finally publishing with inheritance
    ds.publish(to=remote, recursive=True, missing='inherit')
    assert_postupdate_hooks(target_path, installed=ui)

    def check_dss():
        # we added the remote and set all the preferred content settings
        for subds in subdss:
            eq_(subds.repo.get_preferred_content('wanted', remote), 'standard' if standardgroup else '')
            eq_(subds.repo.get_preferred_content('group', remote), standardgroup or '')

        for target_sub in target_subdss:
            ok_(target_sub.is_installed())  # it is there now
            eq_(target_sub.repo.config.get('core.sharedrepository'), '1')
            # and we have transferred the content
            if standardgroup and standardgroup == 'backup':
                # only then content should be copied
                ok_file_has_content(opj(target_sub.path, 'sub.dat'), 'lots of data')
            else:
                # otherwise nothing is copied by default
                assert_false(target_sub.repo.file_has_content('sub.dat'))

    check_dss()
    # and it should be ok to reconfigure the full hierarchy of datasets
    # while "inheriting". No URL must be specified, and we must not blow
    # but just issue a warning for the top level dataset which has no super,
    # so cannot inherit anything - use case is to fixup/establish the full
    # hierarchy on the remote site
    with swallow_logs(logging.WARNING) as cml:
        out = ds.create_sibling(
            None, name=remote, existing="reconfigure", inherit=True,
            ui=ui, recursive=True)
        eq_(len(out), 1 + len(subdss))
        assert_in("Cannot determine super dataset", cml.out)

    check_dss()
Example #16
def test_demo_repro_analysis(bids_path, ana_path, toolbox_url):

    import glob

    localizer_ds = Dataset(bids_path).create()
    localizer_ds.run_procedure('cfg_bids')

    # TODO: decorator
    # TODO: with config patch for toolbox ? -> overwrite?
    # localizer_ds.install(source="https://github.com/psychoinformatics-de/hirni-demo",
    #                      path="sourcedata",
    #                      recursive=True)
    with patch.dict('os.environ', {'DATALAD_HIRNI_TOOLBOX_URL': toolbox_url}):
        install_demo_dataset(localizer_ds, "sourcedata", recursive=True)

    assert_repo_status(localizer_ds.repo)
    subs = localizer_ds.subdatasets(recursive=True)
    assert_result_count(subs, 4)
    assert_result_count(subs, 1, path=op.join(localizer_ds.path, 'sourcedata'))
    assert_result_count(subs,
                        1,
                        path=op.join(localizer_ds.path, 'sourcedata', 'code',
                                     'hirni-toolbox'))
    assert_result_count(subs,
                        1,
                        path=op.join(localizer_ds.path, 'sourcedata', 'acq1',
                                     'dicoms'))
    assert_result_count(subs,
                        1,
                        path=op.join(localizer_ds.path, 'sourcedata', 'acq2',
                                     'dicoms'))

    localizer_ds.hirni_spec2bids(
        [op.join(localizer_ds.path, 'sourcedata', 'studyspec.json')] +
        glob.glob(
            op.join(localizer_ds.path, 'sourcedata', '*', 'studyspec.json')),
        anonymize=True)

    for f in [
            'sub-001',
            'task-oneback_bold.json',
            'participants.tsv',
            op.join('sub-001', 'sub-001_scans.tsv'),
            op.join('sub-001', 'anat'),
            op.join('sub-001', 'anat', 'sub-001_run-1_T1w.json'),
            op.join('sub-001', 'anat', 'sub-001_run-1_T1w.nii.gz'),
            op.join('sub-001', 'func'),
            op.join('sub-001', 'func',
                    'sub-001_task-oneback_run-01_bold.json'),
            op.join('sub-001', 'func',
                    'sub-001_task-oneback_run-01_bold.nii.gz'),
            op.join('sub-001', 'func',
                    'sub-001_task-oneback_run-01_events.tsv'),
    ]:
        assert_true(op.lexists(op.join(localizer_ds.path, f)))

    analysis_ds = Dataset(ana_path).create()
    analysis_ds.install(source=localizer_ds.path,
                        path=op.join('inputs', 'rawdata'))

    analysis_ds.run_procedure('cfg_yoda')
    # download-url expects the target dir to exist
    (analysis_ds.pathobj / 'code').mkdir(exist_ok=True)
    analysis_ds.download_url(
        path=op.join(analysis_ds.path, 'code') + op.sep,
        # TODO: File issue. relative path via python API bound method doesn't work
        urls=[
            'https://raw.githubusercontent.com/myyoda/ohbm2018-training/master/section23/scripts/events2ev3.sh',
            'https://raw.githubusercontent.com/myyoda/ohbm2018-training/master/section23/scripts/ffa_design.fsf'
        ])

    assert_repo_status(analysis_ds.repo)
    ok_file_under_git(op.join(analysis_ds.path, 'code'),
                      'events2ev3.sh',
                      annexed=False)
    ok_file_under_git(op.join(analysis_ds.path, 'code'),
                      'ffa_design.fsf',
                      annexed=False)

    analysis_ds.run(inputs=[
        op.join('inputs', 'rawdata', 'sub-001', 'func',
                'sub-001_task-oneback_run-01_events.tsv')
    ],
                    outputs=[op.join('sub-001', 'onsets')],
                    cmd='bash code/events2ev3.sh sub-001 {inputs}',
                    message="Build FSL EV3 design files")

    raise SkipTest("Solve datalad-containers #115")

    analysis_ds.containers_add('fsl',
                               url="shub://ReproNim/ohbm2018-training:fsln")
    #   % datalad containers-list

    analysis_ds.save(version_tag="ready4analysis")

    assert_repo_status(analysis_ds.repo)

    #

    analysis_ds.run(
        outputs=[op.join('sub-001', '1stlvl_design.fsf')],
        cmd=
        "bash -c 'sed -e \"s,##BASEPATH##,{pwd},g\" -e \"s,##SUB##,sub-001,g\" code/ffa_design.fsf > {outputs}'",
        message="FSL FEAT analysis config script")

    assert_repo_status(analysis_ds.repo)
Example #17
def ok_file_under_git_kludge(path, basename):
    ok_file_under_git(op.join(str(Path(path).resolve()), basename),
                      annexed=True)
Example #18
def _test_target_ssh_inherit(standardgroup, ui, use_ssh, src_path,
                             target_path):
    ds = Dataset(src_path).create()
    if use_ssh:
        target_url = 'datalad-test:%s' % target_path
    else:
        target_url = target_path
    remote = "magical"
    # for the test of setting a group, we will just smoke test using the current
    # user's group
    ds.create_sibling(target_url,
                      name=remote,
                      shared='group',
                      group=os.getgid(),
                      ui=ui)  # not doing recursively
    if standardgroup:
        ds.repo.set_preferred_content('wanted', 'standard', remote)
        ds.repo.set_preferred_content('group', standardgroup, remote)
    ds.publish(to=remote)

    # now a month later we created a new subdataset... a few nested ones.
    # A known hiccup happened when there
    # is also a subsub ds added - we might incorrectly traverse and not prepare
    # sub first for subsub to inherit etc
    parent_ds = ds
    subdss = []
    nlevels = 2  # gets slow: 1 - 43 sec, 2 - 49 sec , 3 - 69 sec
    for levels in range(nlevels):
        subds = parent_ds.create('sub')
        create_tree(subds.path, {'sub.dat': 'lots of data'})
        parent_ds.save('sub', recursive=True)
        ok_file_under_git(subds.path, 'sub.dat', annexed=True)
        parent_ds = subds
        subdss.append(subds)

    target_subdss = [
        Dataset(opj(*([target_path] + ['sub'] * (i + 1))))
        for i in range(nlevels)
    ]
    # since we do not yet have (and thus have not used) an option to record publishing
    # to that sibling by default (e.g. --set-upstream), running just ds.publish
    # should fail
    assert_result_count(
        ds.publish(on_failure='ignore'),
        1,
        status='impossible',
        message=
        'No target sibling configured for default publication, please specify via --to'
    )
    ds.publish(
        to=remote)  # should be ok, non-recursive; BUT it (git or us?) would
    # create an empty sub/ directory
    assert_postupdate_hooks(target_path, installed=ui)
    for target_sub in target_subdss:
        ok_(not target_sub.is_installed())  # still not there
    res = ds.publish(to=remote, recursive=True, on_failure='ignore')
    assert_result_count(res, 1 + len(subdss))
    assert_status(('error', 'notneeded'), res)
    assert_result_count(res,
                        len(subdss),
                        status='error',
                        message=("Unknown target sibling '%s' for publication",
                                 'magical'))

    # Finally publishing with inheritance
    ds.publish(to=remote, recursive=True, missing='inherit')
    assert_postupdate_hooks(target_path, installed=ui)

    def check_dss():
        # we added the remote and set all the preferred content settings
        for subds in subdss:
            eq_(subds.repo.get_preferred_content('wanted', remote),
                'standard' if standardgroup else '')
            eq_(subds.repo.get_preferred_content('group', remote),
                standardgroup or '')

        for target_sub in target_subdss:
            ok_(target_sub.is_installed())  # it is there now
            eq_(target_sub.repo.config.get('core.sharedrepository'), '1')
            # and we have transferred the content
            if standardgroup and standardgroup == 'backup':
                # only then content should be copied
                ok_file_has_content(opj(target_sub.path, 'sub.dat'),
                                    'lots of data')
            else:
                # otherwise nothing is copied by default
                assert_false(target_sub.repo.file_has_content('sub.dat'))

    check_dss()
    # and it should be ok to reconfigure the full hierarchy of datasets
    # while "inheriting". No URL must be specified, and we must not blow
    # but just issue a warning for the top level dataset which has no super,
    # so cannot inherit anything - use case is to fixup/establish the full
    # hierarchy on the remote site
    with swallow_logs(logging.WARNING) as cml:
        out = ds.create_sibling(None,
                                name=remote,
                                existing="reconfigure",
                                inherit=True,
                                ui=ui,
                                recursive=True)
        eq_(len(out), 1 + len(subdss))
        assert_in("Cannot determine super dataset", cml.out)

    check_dss()
Example #19
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.save()
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it is not under annex, below addition of metadata silently does
    # not do anything
    ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'})
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
        ('egrep',
         ':mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # same as above, leading ':' is stripped; it indicates "ALL FIELDS"
        ('egrep',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # same as above, but with AND condition
        # get both matches
        ('egrep',
         ['mp3', 'type:file'],
         opj('stim', 'stim1.mp3'),
         {'type': 'file', 'audio.format': 'mp3'}),
        # case insensitive search
        ('egrep',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # field selection by expression
        ('egrep',
         r'audio\.+:mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # random keyword query
        ('textblob',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'meta': 'mp3'}),
        # report which field matched with auto-field
        ('autofield',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # XXX next one is not supported by current text field analyser
        # decomposes the mime type in [mime, audio, mp3]
        # ('autofield',
        # "'mime:audio/mp3'",
        # opj('stim', 'stim1.mp3'),
        # 'audio.format', 'mime:audio/mp3'),
        # but this one works
        ('autofield',
         "'mime audio mp3'",
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # TODO extend with more complex queries to test whoosh
        # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from, critical for
            # discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test a suggestion msg being logged if no hits and key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
Example #20
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(
        initiate_dataset(template="openfmri",
                         dataset_name='dataladtest-ds666',
                         path=outd,
                         data_fields=['dataset'])({
                             'dataset': 'ds666'
                         }))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content,
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes([('metadata/objects/**', {
        'annex.largefiles': '(nothing)'
    })], dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes",
                files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(
        not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents

    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    # all commits out there:
    # dataset init, crawler init
    #   (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    #   - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    #   2*remove of obsolete metadata object files,
    #     see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])

    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents), (commits_l['incoming-processed'][1],
    #                                                         commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents), (commits_l['incoming-processed'][3],  # also in master
    #                                                         commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents), (commits_l['master'][2],
    #                                             commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents), (commits_l['master'][4],
    #                                             commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the number of created object files in addition to that comparison
    eq_(
        set([
            f for f in all_files
            if not f.startswith('./.datalad/metadata/objects/')
        ]), target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content=
        '<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since there is no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')

    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the number of created object files in addition to that comparison
    eq_(
        set([
            f for f in all_files_updated
            if not f.startswith('./.datalad/metadata/objects/')
        ]), target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'],
        ActivityStats())  # commit happened so stats were consumed
    # numbers seems to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(
        total_stats,
        ActivityStats(
            files=8,
            skipped=5,
            downloaded=1,
            renamed=1,
            urls=6,
            add_annex=2,  # add_git=1, # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely into
            # incoming-processed and merged into master -- new commits will come
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository",
                      cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)
    commits_hexsha_removed = {
        b: list(_get_branch_commits(repo, b))
        for b in branches
    }
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming),
        2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(
        set(dincoming.keys()), {
            repo.pathobj / '.datalad/crawl/statuses/incoming.json',
            repo.pathobj / 'ds666_R1.0.0.tar.gz'
        })

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as good
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        # we should be able to recrawl without doing anything
        output, stats = crawl()
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
Example #21
def test_1(text_dandiset: Dict[str, Any], tmp_path: Path) -> None:
    # TODO: move pre-setup into a fixture, e.g. local_setup1 or make code work without?
    di = DandiDatasetter(
        dandi_client=text_dandiset["client"],
        target_path=tmp_path,
        config=Config(
            # gh_org=None,
            # re_filter=None,
            # backup_remote=None,
            # jobs=jobs,
            # force=force,
            content_url_regex=r".*/blobs/",
            s3bucket="dandi-api-staging-dandisets",
        ),
    )

    with pytest.raises(Exception):
        log.info("test_1: Testing sync of nonexistent Dandiset")
        di.update_from_backup(["999999"])
    assert not (tmp_path / "999999").exists()

    # Since we are using text_dandiset, that immediately creates us a dandiset
    # TODO: may be separate it out, so we could start "clean" and still work ok
    # clean run without dandisets is ok
    # ret = di.update_from_backup()
    # assert ret is None, "nothing is returned ATM, if added -- test should be extended"

    dandiset_id = text_dandiset["dandiset_id"]
    log.info("test_1: Syncing test dandiset")
    di.update_from_backup([dandiset_id])

    # but we should get the super-dataset?
    ds = Dataset(tmp_path / text_dandiset["dandiset_id"])
    assert_repo_status(ds.path)  # that all is clean etc
    ok_file_under_git(ds.path, "file.txt")

    (text_dandiset["dspath"] / "new.txt").write_text("This is a new file.\n")
    log.info("test_1: Updating test dandiset on server")
    text_dandiset["reupload"]()
    assert_repo_status(ds.path)  # no side-effects somehow
    log.info("test_1: Syncing test dandiset")
    di.update_from_backup([dandiset_id])
    assert_repo_status(ds.path)  # that all is clean etc
    assert (ds.pathobj / "new.txt").read_text() == "This is a new file.\n"

    repo = GitRepo(ds.path)

    def check_version_tag(v: Version) -> None:
        vid = v.identifier

        # Assert tag has correct timestamp
        assert repo.get_tag_date(vid) == v.created.isoformat(
            timespec="seconds")

        # Assert tag has correct committer
        assert repo.get_tag_creator(
            vid) == "DANDI User <*****@*****.**>"

        # Assert tagged commit has correct timestamp
        assert repo.get_commit_date(vid) == v.created.isoformat(
            timespec="seconds")

        # Assert that tag was merged into default branch
        assert repo.is_ancestor(vid, DEFAULT_BRANCH)

        # Assert tag branches from default branch
        assert repo.parent_is_ancestor(DEFAULT_BRANCH, vid)

        # Assert dandiset.yaml in tagged commit has doi
        metadata = yaml_load(repo.get_blob(vid, dandiset_metadata_file))
        assert metadata.get("doi")

    log.info("test_1: Waiting for Dandiset to become valid")
    text_dandiset["dandiset"].wait_until_valid(65)
    log.info("test_1: Publishing Dandiset")
    v1 = text_dandiset["dandiset"].publish().version
    version1 = v1.identifier
    log.info("test_1: Syncing test dandiset")
    di.update_from_backup([dandiset_id])
    assert_repo_status(ds.path)  # that all is clean etc
    tags = {t["name"]: t["hexsha"] for t in ds.repo.get_tags()}
    assert version1 in tags
    v1_hash = tags[version1]
    check_version_tag(v1)

    (text_dandiset["dspath"] /
     "new.txt").write_text("This file's contents were changed.\n")
    log.info("test_1: Updating test dandiset on server")
    text_dandiset["reupload"]()
    log.info("test_1: Syncing test dandiset")
    di.update_from_backup([dandiset_id])
    assert_repo_status(ds.path)  # that all is clean etc
    assert (ds.pathobj /
            "new.txt").read_text() == "This file's contents were changed.\n"

    log.info("test_1: Waiting for Dandiset to become valid")
    text_dandiset["dandiset"].wait_until_valid(65)
    log.info("test_1: Publishing Dandiset")
    v2 = text_dandiset["dandiset"].publish().version
    version2 = v2.identifier
    log.info("test_1: Syncing test dandiset")
    di.update_from_backup([dandiset_id])
    assert_repo_status(ds.path)  # that all is clean etc
    tags = {t["name"]: t["hexsha"] for t in ds.repo.get_tags()}
    assert version1 in tags
    assert tags[version1] == v1_hash
    assert version2 in tags
    check_version_tag(v2)

    commit_authors = repo.readcmd("log", "--no-merges",
                                  "--format=%an <%ae>").splitlines()
    assert commit_authors == ["DANDI User <*****@*****.**>"
                              ] * len(commit_authors)

    for c in repo.get_backup_commits():
        assert repo.get_asset_files(c) == {
            asset["path"]
            for asset in repo.get_assets_json(c)
        }
Example #22
def d1_basic_checks():
    ok_(exists('1'))
    ok_file_under_git('1', '1 f.txt', annexed=True)
    ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
    ok_archives_caches(repo_path, 0)
Example #23
def test_add_archive_content(path_orig, url, repo_path):
    with chpwd(repo_path):
        # TODO we need to be able to pass path into add_archive_content
        # We could mock but I mean for the API
        assert_raises(RuntimeError, add_archive_content,
                      "nonexisting.tar.gz")  # no repo yet

        repo = AnnexRepo(repo_path, create=True)
        assert_raises(ValueError, add_archive_content, "nonexisting.tar.gz")
        # we can't add a file from outside the repo ATM
        assert_raises(FileNotInRepositoryError, add_archive_content,
                      opj(path_orig, '1.tar.gz'))

        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')],
                          options=["--pathdepth", "-1"])
            for s in range(1, 5):
                repo.add_urls([opj(url, '%du/1.tar.gz' % s)],
                              options=["--pathdepth", "-2"])
        repo.commit("added 1.tar.gz")

        key_1tar = repo.get_file_key(
            '1.tar.gz')  # will be used in the test later

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        repo_ = add_archive_content('1.tar.gz')
        eq_(repo.path, repo_.path)
        d1_basic_checks()

        # If run again, it should proceed just fine since the content is the same, so no changes would really be made
        add_archive_content('1.tar.gz')

        # But that other one carries updated file, so should fail due to overwrite
        with assert_raises(RuntimeError) as cme:
            add_archive_content(opj('1u', '1.tar.gz'), use_current_dir=True)

        # TODO: somewhat not precise since we have two possible "already exists"
        # -- in caching and overwrite check
        assert_in("already exists", str(cme.exception))
        # but should do fine if overrides are allowed
        add_archive_content(opj('1u', '1.tar.gz'),
                            existing='overwrite',
                            use_current_dir=True)
        add_archive_content(opj('2u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('3u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('4u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert (all([x == whereis[0] for x in whereis[1:]]))

    # and we should be able to reference it while under subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'), use_current_dir=True)
        d1_basic_checks()
        # or we could keep relative path and also demand to keep the archive prefix
        # while extracting under original (annex root) dir
        add_archive_content(opj(pardir, '1.tar.gz'),
                            add_archive_leading_dir=True)

    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        add_archive_content('1.tar.gz',
                            exclude=['d'],
                            rename=['/ /_', '/^1/2'],
                            annex_options="-c annex.largefiles=exclude=*.txt",
                            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add first archive to the repo so we could test
        # named the same way but different content
        with swallow_outputs():
            repo.add_urls([opj(url, 'd1', '1.tar.gz')],
                          options=["--pathdepth", "-1"],
                          cwd=getpwd())  # invoke under current subdir
        repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz')
        d2_basic_checks()

    # in manual tests we ran into the situation of being unable to obtain, in a single
    # run, a file from an archive which was coming from a dropped key.  I thought it was
    # tested in custom remote tests, but I guess not sufficiently well
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # that what managed to not work

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=repo.path)
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if archive key was dropped and online archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))
    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
        assert_equal(e.kwargs['stdout_json'][0]['success'], False)
        assert_result_values_cond(
            e.kwargs['stdout_json'], 'note', lambda x:
            '(Use --force to override this check, or adjust numcopies.)' in x)
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))
Example #24
def _test_annex_file(mode, topdir, topurl, outdir):
    annex = Annexificator(path=outdir,
                          mode=mode,
                          statusdb='fileattr',
                          largefiles="exclude=*.txt")

    input = {'url': "%sd1/1.dat" % topurl, 'filename': '1-copy.dat'}
    tfile = opj(outdir, '1-copy.dat')
    # we add full filepath now
    expected_output = [dict(filepath=opj(outdir, input['filename']), **input)]
    output = list(annex(input))
    assert_equal(expected_output, output)

    # addurl is batched, and we haven't forced annex flushing so there should
    # be a batched process
    if not annex.repo.fake_dates_enabled:
        assert_equal(len(annex.repo._batched), 1)
    # if we finalize, it should flush batched annexes and commit
    list(annex.finalize()({}))
    assert (lexists(tfile))

    ok_file_under_git(tfile, annexed=True)
    if mode == 'full':
        ok_file_has_content(tfile, '1.dat load')
    else:
        # in fast or relaxed mode there must not be any content
        assert_raises(AssertionError, ok_file_has_content, tfile, '1.dat load')

    whereis = annex.repo.whereis(tfile)
    assert_in(annex.repo.WEB_UUID, whereis)  # url must have been added
    assert_equal(len(whereis), 1 + int(mode == 'full'))
    # TODO: check the url
    # The file should not be attempted to be downloaded again, since nothing changed
    # and by default we do use the files db
    output = list(annex(input))
    assert_equal(output, [])  # nothing was done, so annex didn't yield data
    annex.yield_non_updated = True

    input_with_stats = input.copy()
    input_with_stats['datalad_stats'] = ActivityStats()
    output = list(annex(input_with_stats))
    assert_equal(output[0]['datalad_stats'],
                 ActivityStats(files=1, urls=1, skipped=1))

    # but if we change that file, it should re-download it now
    with open(opj(topdir, 'd1', '1.dat'), 'a') as f:
        f.write("+")
    output = list(annex(input_with_stats))
    stats = output[0]['datalad_stats']
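    # reset the varying metric before comparing against the expected stats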
    stats.downloaded_time = 0
    # 2 since we are reusing the same stats
    download_stats = dict(downloaded=1,
                          downloaded_size=11) if mode == 'full' else {}
    addskip_stats = dict(add_annex=0, skipped=2,
                         overwritten=0) if mode == 'relaxed' else dict(
                             add_annex=1, skipped=1, overwritten=1)
    kwargs = download_stats.copy()
    kwargs.update(addskip_stats)
    assert_equal(stats, ActivityStats(files=2, urls=2, **kwargs))

    # Download into a file which will be added to git
    # TODO: for now the file is added to git only in full mode; in --fast or --relaxed it still goes to annex
    # http://git-annex.branchable.com/bugs/treatment_of_largefiles_is_not_working_for_addurl_--fast___40__or_--relaxed__41__/
    input = {
        'url': "%sd1/1.dat" % topurl,
        'filename': '1.txt',
        'datalad_stats': ActivityStats()
    }
    tfile = opj(outdir, '1.txt')
    output = list(annex(input))
    annexed = mode not in {'full'}
    list(annex.finalize()({}))
    if not annexed:
        ok_file_has_content(tfile, '1.dat load+')
    else:
        assert_raises(AssertionError, ok_file_has_content, tfile,
                      '1.dat load+')
    ok_file_under_git(tfile, annexed=annexed)
    assert_equal(len(output), 1)
    stats = output[0]['datalad_stats']
    # reset varying metric
    stats.downloaded_time = 0
    assert_equal(
        stats,
        ActivityStats(files=1,
                      urls=1,
                      add_git=1 - int(annexed),
                      add_annex=int(annexed),
                      **download_stats))

    # Let's add a file without specifying a URL
    sfilepath = opj(outdir, 'sample.txt')
    with open(sfilepath, 'w') as f:
        f.write("sample")
    ok_file_has_content(sfilepath, "sample")
    output = list(
        annex({
            'filename': 'sample.txt',
            'datalad_stats': ActivityStats()
        }))
    ok_file_under_git(sfilepath, annexed=False)
    assert (output)
    assert_equal(output[0]['datalad_stats'], ActivityStats(files=1, add_git=1))
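For context, a minimal sketch of how an Annexificator node like the one exercised above is typically driven on its own; the output path, URL and filename are placeholders, and the constructor arguments are assumed to behave as in the test.

from datalad_crawler.nodes.annex import Annexificator

# hypothetical standalone use of the node outside the test harness
annex = Annexificator(path='/tmp/demo-annex', mode='fast',
                      largefiles="exclude=*.txt")
# a node is a callable taking a dict "data" record and yielding updated records
records = list(annex({'url': 'http://example.com/d1/1.dat',
                      'filename': '1-copy.dat'}))
# flush batched git-annex processes and commit whatever was staged
list(annex.finalize()({}))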
Ejemplo n.º 25
0
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest("mutagen is not available")
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add('datalad.search.index-{}-documenttype'.format(m),
                      'all',
                      where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')), ):
        copy(opj(dirname(dirname(__file__)), 'tests', 'data', src),
             opj(path, dst))
    ds.add('.')
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If the file is not under annex, the metadata addition below silently
    # does nothing
    list(
        ds.repo.set_metadata(opj('stim', 'stim1.mp3'),
                             init={'importance': 'very'}))
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio', ):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # the dump may contain additional keys, so only check that the expected ones are present
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
        ('egrep', ':mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # same as above, leading ':' is stripped; it indicates "ALL FIELDS"
        ('egrep', 'mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # same as above, but with AND condition
            # get both matches
        ('egrep', ['mp3', 'type:file'], opj('stim', 'stim1.mp3'), {
            'type': 'file',
            'audio.format': 'mp3'
        }),
            # case insensitive search
        ('egrep', 'mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # field selection by expression
        ('egrep', r'audio\.+:mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # random keyword query
        ('textblob', 'mp3', opj('stim', 'stim1.mp3'), {
            'meta': 'mp3'
        }),
            # report which field matched with auto-field
        ('autofield', 'mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            # "'mime:audio/mp3'",
            # opj('stim', 'stim1.mp3'),
            # 'audio.format', 'mime:audio/mp3'),
            # but this one works
        ('autofield', "'mime audio mp3'", opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res,
            1,
            type='file',
            path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from, critical for
            # discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(res,
                                1,
                                type='dataset',
                                path=ds.path,
                                dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test a suggestion msg being logged if no hits and key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
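As a usage note, the same search interface can be driven directly from user code. A minimal sketch, assuming an already aggregated dataset at a placeholder path and the result fields asserted above:

from datalad.api import Dataset

ds = Dataset('/path/to/aggregated/dataset')  # hypothetical path
# field-anchored egrep query, as exercised in the test above
for hit in ds.search('audio.format:mp3', mode='egrep', full_record=True):
    # every file hit reports the id of the dataset it belongs to
    print(hit['path'], hit['dsid'], hit.get('query_matched'))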
Ejemplo n.º 26
0
def d2_basic_checks():
    ok_(exists('1'))
    ok_file_under_git('1', '2 f.txt', annexed=True)
    ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
    ok_archives_caches(repo.path, 0)
Ejemplo n.º 27
0
def test_demo_raw_ds(path, toolbox_url):

    ds = Dataset(path)

    with patch.dict('os.environ', {'DATALAD_HIRNI_TOOLBOX_URL': toolbox_url}):
        ds.create()  # TODO: Maybe move to ds.create(cfg_proc='hirni') in the demo
        ds.run_procedure('cfg_hirni')

    # clean repo with an annex:
    assert_repo_status(ds.repo, annex=True)

    # README, dataset_description.json and studyspec.json at toplevel and in git
    for f in ['README', 'studyspec.json', 'dataset_description.json']:
        ok_file_under_git(ds.path, f, annexed=False)

    # toolbox installed under code/hirni-toolbox
    subs = ds.subdatasets()
    assert_result_count(subs, 1)
    assert_result_count(subs,
                        1,
                        path=op.join(ds.path, 'code', 'hirni-toolbox'))

    ds.hirni_import_dcm(
        'https://github.com/datalad/example-dicom-structural/archive/master.tar.gz',
        'acq1',
        anon_subject='001')

    # acquisition directory and studyspec created + subdataset 'dicoms' within the acquisition dir
    for f in [
            op.join(ds.path, 'acq1'),
            op.join(ds.path, 'acq1', 'studyspec.json'),
            op.join(ds.path, 'acq1', 'dicoms')
    ]:
        assert_true(op.exists(f))
    subs = ds.subdatasets()
    assert_result_count(subs, 2)
    assert_result_count(subs,
                        1,
                        path=op.join(ds.path, 'code', 'hirni-toolbox'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq1', 'dicoms'))

    # TODO: check actual spec? (Probably sufficient to test for that in dedicated import-dcm/dcm2spec tests)
    # TODO: check dicom metadata

    ds.hirni_import_dcm(
        'https://github.com/datalad/example-dicom-functional/archive/master.tar.gz',
        'acq2',
        anon_subject='001')

    # acquisition directory and studyspec created + subdataset 'dicoms' within the acquisition dir
    for f in [
            op.join(ds.path, 'acq2'),
            op.join(ds.path, 'acq2', 'studyspec.json'),
            op.join(ds.path, 'acq2', 'dicoms')
    ]:
        assert_true(op.exists(f))
    subs = ds.subdatasets()
    assert_result_count(subs, 3)
    assert_result_count(subs,
                        1,
                        path=op.join(ds.path, 'code', 'hirni-toolbox'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq1', 'dicoms'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq2', 'dicoms'))

    # Note from demo: The calls to `git annex addurl` and `datalad save` currently replace a single call to
    # `datalad download-url` due to a bug in that command.
    events_file = op.join('acq2', 'events.tsv')
    ds.repo.add_url_to_file(
        file_=events_file,
        url='https://github.com/datalad/example-dicom-functional/raw/master/events.tsv')
    ds.save(message="Added stimulation protocol for acquisition 2")
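    # (Hypothetical) once the download-url bug mentioned above is fixed, the two
    # steps above could presumably be collapsed into a single call along the
    # lines of:
    #   ds.download_url(
    #       'https://github.com/datalad/example-dicom-functional/raw/master/events.tsv',
    #       path=events_file,
    #       message="Added stimulation protocol for acquisition 2")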

    ok_file_under_git(ds.path, events_file, annexed=True)

    ds.hirni_spec4anything(
        events_file,
        properties='{"procedures": {"procedure-name": "copy-converter", "procedure-call": "bash {script} {{location}} '
                   '{ds}/sub-{{bids-subject}}/func/sub-{{bids-subject}}_task-{{bids-task}}_run-{{bids-run}}_events.tsv'
                   '"}, "type": "events_file"}')

    ok_file_under_git(ds.path,
                      op.join('acq2', 'studyspec.json'),
                      annexed=False)
    assert_repo_status(ds.repo, annex=True)