def test_add_archive_content_zip(repo_path=None):
    ds = Dataset(repo_path).create(force=True)
    with chpwd(repo_path):
        with swallow_outputs():
            ds.save("1.zip", message="add 1.zip")
        add_archive_content("1.zip")
        ok_file_under_git(ds.pathobj / "1" / "foo", annexed=True)
        ok_file_under_git(ds.pathobj / "1" / "dir" / "bar", annexed=True)
        ok_archives_caches(ds.path, 0)
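
# Note: the following snippet is a method from a class-based test; it relies
# on fixtures created elsewhere in that class (self.ds, a Dataset that already
# contains a saved '1.tar', and self.annex, presumably the dataset's annex repo).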
def test_add_archive_leading_dir(self):
    import os
    os.mkdir(self.ds.pathobj / 'sub')
    f123 = Path('sub') / '123.tar'
    (self.ds.pathobj / '1.tar').rename(self.ds.pathobj / f123)
    self.annex.remove('1.tar', force=True)
    self.ds.save(message="renamed")

    self.ds.add_archive_content(f123,
                                add_archive_leading_dir=True,
                                strip_leading_dirs=True)

    ok_file_under_git(self.ds.path,
                      str(Path('sub') / '123' / 'file.txt'),
                      annexed=True)
Example 3
def test_ok_file_under_git_symlinks(path=None):
    # Test that works correctly under symlinked path
    orepo = GitRepo(path)
    orepo.add('ingit')
    orepo.commit('msg')
    orepo.add('staged')
    lpath = path + "-symlink"  # will also be removed AFAIK by our tempfile handling
    Path(lpath).symlink_to(Path(path))
    ok_symlink(lpath)
    ok_file_under_git(op.join(path, 'ingit'))
    ok_file_under_git(op.join(lpath, 'ingit'))
    ok_file_under_git(op.join(lpath, 'staged'))
    with assert_raises(AssertionError):
        ok_file_under_git(op.join(lpath, 'notingit'))
    with assert_raises(AssertionError):
        ok_file_under_git(op.join(lpath, 'nonexisting'))
def test_add_archive_content_strip_leading(path_orig=None,
                                           url=None,
                                           repo_path=None):
    with chpwd(repo_path):
        ds = Dataset(repo_path).create(force=True)
        repo = ds.repo
        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
        repo.commit("added 1.tar.gz")

        add_archive_content('1.tar.gz', strip_leading_dirs=True)
        ok_(not exists('1'))
        ok_file_under_git(ds.path, '1 f.txt', annexed=True)
        ok_file_under_git('d', '1d', annexed=True)
        ok_archives_caches(ds.path, 0)
def test_override_existing_under_git(self):
    create_tree(self.ds.path, {'1.dat': 'load2'})
    self.ds.save('1.dat', to_git=True, message='added to git')
    self.ds.add_archive_content(
        '1.tar',
        strip_leading_dirs=True,
    )
    # and we did not bother adding it to annex (for now) -- just skipped
    # since we have it and it is the same
    ok_file_under_git(self.ds.path, '1.dat', annexed=False)

    # but if we say 'overwrite' -- we would remove and replace
    self.ds.add_archive_content('1.tar',
                                strip_leading_dirs=True,
                                delete=True,
                                existing='overwrite')
    ok_file_under_git(self.ds.path, '1.dat', annexed=True)
def test_add_archive_content_absolute_path(path=None):
    ds = Dataset(opj(path, "ds")).create(force=True)
    repo = ds.repo
    ds.save("1.tar.gz", message="1.tar.gz")
    abs_tar_gz = opj(path, "ds", "1.tar.gz")
    add_archive_content(abs_tar_gz, dataset=ds)
    ok_file_under_git(opj(path, "ds", "1", "foo"), annexed=True)
    commit_msg = repo.format_commit("%B")
    # The commit message uses relative paths.
    assert_not_in(abs_tar_gz, commit_msg)
    assert_in("1.tar.gz", commit_msg)
    res = add_archive_content(opj(path, "notds", "2.tar.gz"),
                              dataset=ds,
                              on_failure='ignore')

    assert_in_results(
        res,
        action='add-archive-content',
        status='impossible',
        message='Can not add archive outside of the dataset',
    )
def test_add_archive_use_archive_dir(repo_path=None):
    ds = Dataset(repo_path).create(force=True)
    with chpwd(repo_path):
        # Let's add first archive to the repo with default setting
        archive_path = opj('4u', '1.tar.gz')
        # check that it gives an informative error if the archive has not
        # already been added/saved
        res = add_archive_content(archive_path, on_failure='ignore')
        message = (
            "Can not add an untracked archive. Run 'datalad save 4u\\1.tar.gz'"
            if on_windows else
            "Can not add an untracked archive. Run 'datalad save 4u/1.tar.gz'"
        )
        assert_in_results(res,
                          action='add-archive-content',
                          message=message,
                          status='impossible')

        with swallow_outputs():
            ds.save(archive_path)

        ok_archives_caches(ds.path, 0)
        add_archive_content(archive_path,
                            strip_leading_dirs=True,
                            use_current_dir=True)
        ok_(not exists(opj('4u', '1 f.txt')))
        ok_file_under_git(ds.path, '1 f.txt', annexed=True)
        ok_archives_caches(ds.path, 0)

        # and now let's extract under archive dir
        add_archive_content(archive_path, strip_leading_dirs=True)
        ok_file_under_git(ds.path, opj('4u', '1 f.txt'), annexed=True)
        ok_archives_caches(ds.path, 0)

        add_archive_content(opj('4u', 'sub.tar.gz'))
        ok_file_under_git(ds.path, opj('4u', 'sub', '2 f.txt'), annexed=True)
        ok_archives_caches(ds.path, 0)
Example 8
def _test_BasicAnnexTestRepo(repodir):
    trepo = BasicAnnexTestRepo(repodir)
    trepo.create()
    assert_repo_status(trepo.path)
    ok_file_under_git(trepo.path, 'test.dat')
    ok_file_under_git(trepo.path, 'INFO.txt')
    ok_file_under_git(trepo.path, 'test-annex.dat', annexed=True)
    ok_(trepo.repo.file_has_content('test-annex.dat') is False)
    with swallow_outputs():
        trepo.repo.get('test-annex.dat')
    ok_(trepo.repo.file_has_content('test-annex.dat'))
Example 9
def test_BasicGitTestRepo(path=None):
    trepo = BasicGitTestRepo(path)
    trepo.create()
    assert_repo_status(trepo.path, annex=False)
    ok_file_under_git(trepo.path, 'test.dat')
    ok_file_under_git(trepo.path, 'INFO.txt')
Example 10
def test_within_ds_file_search(path=None):
    try:
        import mutagen
    except ImportError:
        raise SkipTest("mutagen is not available")
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add('datalad.search.index-{}-documenttype'.format(m),
                      'all',
                      scope='branch')
    ds.config.add('datalad.metadata.nativetype', 'audio', scope='branch')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')), ):
        copy(opj(dirname(dirname(__file__)), 'tests', 'data', src),
             opj(path, dst))
    ds.save()
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it were not under annex, the addition of metadata below would
    # silently do nothing
    ds.repo.set_metadata(opj('stim', 'stim1.mp3'), init={'importance': 'very'})
    ds.aggregate_metadata()
    assert_repo_status(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio', ):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # test default behavior while limiting set of keys reported
    with swallow_outputs() as cmo:
        ds.search([r'\.id', 'artist$'], show_keys='short')
        out_lines = [l for l in cmo.out.split(os.linesep) if l]
        # test that only the ones matching were returned
        assert_equal([l for l in out_lines if not l.startswith(' ')],
                     ['audio.music-artist', 'datalad_core.id'])
        # more specific test which would also test formatting
        assert_equal(
            out_lines,
            [
                'audio.music-artist',
                ' in  1 datasets',
                " has 1 unique values: 'dlartist'",
                'datalad_core.id',
                ' in  1 datasets',
                # we have them sorted
                " has 1 unique values: '%s'" % ds.id
            ])

    with assert_raises(ValueError) as cme:
        ds.search('*wrong')
    assert_re_in(
        r"regular expression '\(\?i\)\*wrong' \(original: '\*wrong'\) is incorrect: ",
        str(cme.value))

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
        ('egrep', ':mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # same as above, but with the leading ':' stripped; it indicates "ALL FIELDS"
        ('egrep', 'mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # same as above, but with AND condition
            # get both matches
        ('egrep', ['mp3', 'type:file'], opj('stim', 'stim1.mp3'), {
            'type': 'file',
            'audio.format': 'mp3'
        }),
            # case insensitive search
        ('egrep', 'mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # field selection by expression
        ('egrep', r'audio\.+:mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # random keyword query
        ('textblob', 'mp3', opj('stim', 'stim1.mp3'), {
            'meta': 'mp3'
        }),
            # report which field matched with auto-field
        ('autofield', 'mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            # "'mime:audio/mp3'",
            # opj('stim', 'stim1.mp3'),
            # 'audio.format', 'mime:audio/mp3'),
            # but this one works
        ('autofield', "'mime audio mp3'", opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res,
            1,
            type='file',
            path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from, critical for
            # discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(res,
                                1,
                                type='dataset',
                                path=ds.path,
                                dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test a suggestion msg being logged if no hits and key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
Example 11
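# Parametrized helper: exercises create_sibling + publish with
# missing='inherit' and checks that sibling configuration (preferred content,
# shared group setting) propagates down a hierarchy of nested subdatasets.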
def _test_target_ssh_inherit(standardgroup, ui, use_ssh, src_path,
                             target_path):
    ds = Dataset(src_path).create()
    if use_ssh:
        target_url = 'datalad-test:%s' % target_path
    else:
        target_url = target_path
    remote = "magical"
    # for the test of setting a group, will just smoke test while using current
    # user's group
    ds.create_sibling(target_url,
                      name=remote,
                      shared='group',
                      group=os.getgid(),
                      ui=ui)  # not doing recursively
    if standardgroup:
        ds.repo.set_preferred_content('wanted', 'standard', remote)
        ds.repo.set_preferred_content('group', standardgroup, remote)
    ds.publish(to=remote)

    # now, a month later, we create a few new nested subdatasets.
    # A known hiccup happened when a sub-subdataset was also added - we might
    # traverse incorrectly and not prepare sub first for subsub to inherit
    # from, etc.
    parent_ds = ds
    subdss = []
    nlevels = 2  # gets slow: 1 - 43 sec, 2 - 49 sec , 3 - 69 sec
    for levels in range(nlevels):
        subds = parent_ds.create('sub')
        create_tree(subds.path, {'sub.dat': 'lots of data'})
        parent_ds.save('sub', recursive=True)
        ok_file_under_git(subds.path, 'sub.dat', annexed=True)
        parent_ds = subds
        subdss.append(subds)

    target_subdss = [
        Dataset(opj(*([target_path] + ['sub'] * (i + 1))))
        for i in range(nlevels)
    ]
    # since we do not yet have (and thus have not used) an option to record
    # publishing to that sibling by default (e.g. --set-upstream), running
    # just ds.publish should fail
    assert_result_count(
        ds.publish(on_failure='ignore'),
        1,
        status='impossible',
        message=
        'No target sibling configured for default publication, please specify via --to'
    )
    # should be ok, non-recursive; BUT it (git or us?) would create an empty
    # sub/ directory
    ds.publish(to=remote)
    assert_postupdate_hooks(target_path, installed=ui)
    for target_sub in target_subdss:
        ok_(not target_sub.is_installed())  # still not there
    res = ds.publish(to=remote, recursive=True, on_failure='ignore')
    assert_result_count(res, 1 + len(subdss))
    assert_status(('error', 'notneeded'), res)
    assert_result_count(res,
                        len(subdss),
                        status='error',
                        message=("Unknown target sibling '%s' for publication",
                                 'magical'))

    # Finally publishing with inheritance
    ds.publish(to=remote, recursive=True, missing='inherit')
    assert_postupdate_hooks(target_path, installed=ui)

    def check_dss():
        # we added the remote and set all the preferred content settings
        for subds in subdss:
            eq_(subds.repo.get_preferred_content('wanted', remote),
                'standard' if standardgroup else '')
            eq_(subds.repo.get_preferred_content('group', remote),
                standardgroup or '')

        for target_sub in target_subdss:
            ok_(target_sub.is_installed())  # it is there now
            eq_(target_sub.repo.config.get('core.sharedrepository'), '1')
            # and we have transferred the content
            if standardgroup and standardgroup == 'backup':
                # only then content should be copied
                ok_file_has_content(opj(target_sub.path, 'sub.dat'),
                                    'lots of data')
            else:
                # otherwise nothing is copied by default
                assert_false(target_sub.repo.file_has_content('sub.dat'))

    check_dss()
    # and it should be ok to reconfigure the full hierarchy of datasets
    # while "inheriting". No URL must be specified, and we must not blow up
    # but just issue a warning for the top-level dataset, which has no super
    # and so cannot inherit anything - the use case is to fix up/establish
    # the full hierarchy on the remote site
    ds.save(recursive=True)  # so we have a committed hierarchy for create_sibling
    with swallow_logs(logging.WARNING) as cml:
        out = ds.create_sibling(None,
                                name=remote,
                                existing="reconfigure",
                                inherit=True,
                                ui=ui,
                                recursive=True)
        eq_(len(out), 1 + len(subdss))
        assert_in("Cannot determine super dataset", cml.out)

    check_dss()
Example 12
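# Helper excerpted from test_add_archive_content (further below); it relies on
# the enclosing test's `repo` and on the working directory set by that test.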
def d2_basic_checks():
    ok_(exists('1'))
    ok_file_under_git('1', '2 f.txt', annexed=True)
    ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
    ok_archives_caches(repo.path, 0)
Example 13
def d1_basic_checks():
    ok_(exists('1'))
    ok_file_under_git('1', '1 f.txt', annexed=True)
    ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
    ok_archives_caches(repo_path, 0)
Example 14
def test_add_archive_content(path_orig=None, url=None, repo_path=None):
    with chpwd(repo_path):
        # TODO we need to be able to pass path into add_archive_content
        # We could mock but I mean for the API

        # no repo yet
        assert_raises(NoDatasetFound, add_archive_content,
                      "nonexisting.tar.gz")
        ds = Dataset(repo_path).create()
        res = ds.add_archive_content("nonexisting.tar.gz", on_failure='ignore')
        assert_in_results(res,
                          action='add-archive-content',
                          status='impossible')
        repo = ds.repo

        # we can't add a file from outside the repo ATM
        res = ds.add_archive_content(Path(path_orig) / '1.tar.gz',
                                     on_failure='ignore')
        assert_in_results(res,
                          action='add-archive-content',
                          status='impossible',
                          type="dataset",
                          message="Can not add archive outside of the dataset")

        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
            for s in range(1, 5):
                repo.add_url_to_file('%du/1.tar.gz' % s,
                                     opj(url, '%du/1.tar.gz' % s))
            repo.commit("added 1.tar.gz")

        # will be used in the test later
        key_1tar = repo.get_file_annexinfo('1.tar.gz')['key']

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        res = add_archive_content('1.tar.gz')
        assert_in_results(res, action='add-archive-content', status='ok')
        d1_basic_checks()

        # If run again, it should proceed just fine since the content is the
        # same, so no changes would really be made
        res = add_archive_content('1.tar.gz')
        assert_in_results(res, action='add-archive-content', status='ok')

        # But that other one carries an updated file, so it should fail due
        # to the would-be overwrite
        res = add_archive_content(Path('1u') / '1.tar.gz',
                                  use_current_dir=True,
                                  on_failure='ignore')
        assert_in_results(
            res,
            action='add-archive-content',
            status='error',
        )
        assert_in('exists, but would be overwritten by new file',
                  res[0]['message'])
        # but it should do fine if overwrites are allowed
        add_archive_content(Path('1u') / '1.tar.gz',
                            existing='overwrite',
                            use_current_dir=True)
        add_archive_content(Path('2u') / '1.tar.gz',
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(Path('3u') / '1.tar.gz',
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(Path('4u') / '1.tar.gz',
                            existing='archive-suffix',
                            use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert (all([x == whereis[0] for x in whereis[1:]]))

    # and we should be able to reference it while under subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'),
                            dataset=ds.path,
                            use_current_dir=True)
        d1_basic_checks()
        # or we could keep relative path and also demand to keep the archive prefix
        # while extracting under original (annex root) dir
        add_archive_content(opj(pardir, '1.tar.gz'),
                            dataset=ds.path,
                            add_archive_leading_dir=True)

    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        ds.add_archive_content(
            '1.tar.gz',
            exclude=['d'],
            rename=['/ /_', '/^1/2'],
            annex_options="-c annex.largefiles=exclude=*.txt",
            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add another archive to the repo, named the same way but with
        # different content, so we can test extraction within a subdirectory
        with swallow_outputs():
            repo.add_url_to_file('d1/1.tar.gz', opj(url, 'd1', '1.tar.gz'))
        repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz', dataset=ds.path)
        d2_basic_checks()

    # in manual tests we ran into a situation where it was impossible to
    # obtain, in a single run, a file from an archive whose key had been
    # dropped.  I thought this was covered by the custom remote tests, but
    # apparently not sufficiently well
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # that what managed to not work

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=ds)
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher

    ds.drop(opj('1', '1 f.txt'))  # should be all kosher
    ds.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if the archive key was dropped and the
    # online archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))
    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
    # the failed drop should report that not enough copies remain
    assert_equal(e.value.kwargs['stdout_json'][0]['success'], False)
    assert_result_values_cond(
        e.value.kwargs['stdout_json'], 'note', lambda x:
        '(Use --force to override this check, or adjust numcopies.)' in x)
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))
Example 15
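# Helper that checks the file via a fully resolved path; presumably a
# workaround so the annexed-file check also passes when the test operates
# under a symlinked temporary directory.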
def ok_file_under_git_kludge(path, basename):
    ok_file_under_git(op.join(str(Path(path).resolve()), basename), annexed=True)