Example no. 1
def test_get_values():
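    # pass a single record through the Sink node; get_values() should return one
    # row with the values of the requested keys, in the order they were given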
    data = {'x': 'y', 'g': 'h', 'a': 'b'}
    keys = ['x', 'a']

    gen = Sink()
    list(gen(data))
    eq_(gen.get_values(keys), [['y', 'b']])
Example no. 2
def test_listdict2dictlist():
    f = _listdict2dictlist
    l1 = [1, 3, [1, 'a']]
    assert f(l1) is l1, "we return it as is if there is no embedded dict"
    eq_(f([{1: 2}]), {1: 2})  # turned inside out, no need for a list
    # inside out: values for repeated keys are joined into a list; entries with a list value or an empty value are skipped
    eq_(f([{1: [2, 3], 'a': 1}, {'a': 2, 'c': ''}]), {'a': [1, 2]})
Example no. 3
def test_add_readme(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    assert_status('ok', ds.add_readme())
    # should use default name
    eq_(
        open(opj(path, 'README.md')).read(),
        """\
# Dataset "demo_ds"

this is for play

### Authors

- Betty
- Tom

### License

PDDL

## General information

This is a DataLad dataset (id: {id}).

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(
    id=ds.id))

    # should skip on re-run
    assert_status('notneeded', ds.add_readme())
Example no. 4
def test_get_recurse_dirs(o_path, c_path):

    # prepare source:
    origin = Dataset(o_path).create(force=True)
    origin.add('.')

    ds = install(
        c_path, source=o_path,
        result_xfm='datasets', return_type='item-or-list')

    file_list = ['file1.txt',
                 opj('subdir', 'file2.txt'),
                 opj('subdir', 'subsubdir', 'file3.txt'),
                 opj('subdir', 'subsubdir', 'file4.txt')]
    files_in_sub = [f for f in file_list if f.startswith(with_pathsep('subdir'))]

    # no content present:
    ok_(not any(ds.repo.file_has_content(file_list)))

    result = ds.get('subdir')

    # check result:
    assert_status('ok', result)
    eq_(set([item.get('path')[len(ds.path) + 1:] for item in result
             if item['type'] == 'file']),
        set(files_in_sub))
    # we also get one report on the subdir
    eq_(len(result) - 1, len(files_in_sub))

    # got all files beneath subdir:
    ok_(all(ds.repo.file_has_content(files_in_sub)))

    # additionally got file1.txt silently, since it has the same content as
    # subdir/subsubdir/file4.txt:
    ok_(ds.repo.file_has_content('file1.txt') is True)
Example no. 5
def test_runnin_on_empty(path):
    # empty repo
    repo = AnnexRepo(path, create=True)
    # just wrap with a dataset
    ds = Dataset(path)
    # and run status ... should be good and do nothing
    eq_([], ds.status())
Example no. 6
def test_install_skip_list_arguments(src, path, path_outside):
    ds = install(path, source=src)
    ok_(ds.is_installed())

    # install a list with valid and invalid items:
    result = ds.install(
        path=['subm 1', 'not_existing', path_outside, '2'],
        get_data=False,
        on_failure='ignore', result_xfm=None, return_type='list')
    # good and bad results together
    ok_(isinstance(result, list))
    eq_(len(result), 4)
    # check that we have an 'impossible' status for both invalid args
    # but all the other tasks have been accomplished
    for skipped, msg in [(opj(ds.path, 'not_existing'), "path does not exist"),
                         (path_outside, "path not associated with any dataset")]:
        assert_result_count(
            result, 1, status='impossible', message=msg, path=skipped)
    for sub in [Dataset(opj(path, 'subm 1')), Dataset(opj(path, '2'))]:
        assert_result_count(
            result, 1, status='ok',
            message=('Installed subdataset in order to get %s', sub.path))
        ok_(sub.is_installed())

    # the return of get is always a list by default, even if just one thing was gotten
    # in this case 'subm 1' was already obtained above, so this will get the
    # content of the subdataset
    with assert_raises(IncompleteResultsError) as cme:
        ds.install(path=['subm 1', 'not_existing'])
    with assert_raises(IncompleteResultsError) as cme:
        ds.get(path=['subm 1', 'not_existing'])
Example no. 7
    def test_addurls_subdataset(self, path):
        ds = Dataset(path).create(force=True)

        with chpwd(path):
            for save in True, False:
                label = "save" if save else "nosave"
                hexsha_before = ds.repo.get_hexsha()
                ds.addurls(self.json_file, "{url}",
                           "{subdir}-" + label + "//{name}",
                           save=save)
                hexsha_after = ds.repo.get_hexsha()

                for fname in ["foo-{}/a", "bar-{}/b", "foo-{}/c"]:
                    ok_exists(fname.format(label))

                assert_true(save ^ (hexsha_before == hexsha_after))
                assert_true(save ^ ds.repo.dirty)

            # Now save the "--nosave" changes and check that we have
            # all the subdatasets.
            ds.add(".")
            eq_(set(subdatasets(ds, recursive=True,
                                result_xfm="relpaths")),
                {"foo-save", "bar-save", "foo-nosave", "bar-nosave"})

            # We don't try to recreate existing subdatasets.
            with swallow_logs(new_level=logging.DEBUG) as cml:
                ds.addurls(self.json_file, "{url}", "{subdir}-nosave//{name}")
                assert_in("Not creating subdataset at existing path", cml.out)
Example no. 8
def test_install_known_subdataset(src, path):

    # get the superdataset:
    ds = install(path, source=src)
    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_not_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
    # install it:
    ds.install('subm 1')
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that it is the correct submodule that got installed and not
    # a new repository that was initialized
    eq_(set(subds.repo.get_indexed_files()),
        {'test.dat', 'INFO.txt', 'test-annex.dat'})
    assert_not_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))

    # now, get the data by reinstalling with -g:
    ok_(subds.repo.file_has_content('test-annex.dat') is False)
    with chpwd(ds.path):
        result = get(path='subm 1', dataset=os.curdir)
        assert_in_results(result, path=opj(subds.path, 'test-annex.dat'))
        ok_(subds.repo.file_has_content('test-annex.dat') is True)
        ok_(subds.is_installed())
Example no. 9
def test_install_list(path, top_path):

    # we want to be able to install several things, if these are known
    # (no 'source' allowed). Therefore first toplevel:
    ds = install(top_path, source=path, recursive=False)
    assert_not_in('annex.hardlink', ds.config)
    ok_(ds.is_installed())
    sub1 = Dataset(opj(top_path, 'subm 1'))
    sub2 = Dataset(opj(top_path, '2'))
    ok_(not sub1.is_installed())
    ok_(not sub2.is_installed())

    # fails, when `source` is passed:
    assert_raises(ValueError, ds.install,
                  path=['subm 1', '2'],
                  source='something')

    # now should work:
    result = ds.install(path=['subm 1', '2'], result_xfm='paths')
    ok_(sub1.is_installed())
    ok_(sub2.is_installed())
    eq_(set(result), {sub1.path, sub2.path})
    # and if we request it again via get, result should be empty
    get_result = ds.get(path=['subm 1', '2'], get_data=False)
    assert_status('notneeded', get_result)
Example no. 10
def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src, description='mydummy')
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid
        eq_(ds.repo.get_description(), 'mydummy')

    # installing it again, shouldn't matter:
    res = install(path, source=src, result_xfm=None, return_type='list')
    assert_status('notneeded', res)
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)
Example no. 11
def test_install_into_dataset(source, top_path):

    ds = create(top_path)
    ok_clean_git(ds.path)

    subds = ds.install("sub", source=source, save=False)
    if isinstance(subds.repo, AnnexRepo) and subds.repo.is_direct_mode():
        ok_(exists(opj(subds.path, '.git')))
    else:
        ok_(isdir(opj(subds.path, '.git')))
    ok_(subds.is_installed())
    assert_in('sub', ds.subdatasets(result_xfm='relpaths'))
    # sub is clean:
    ok_clean_git(subds.path, annex=None)
    # top is too:
    ok_clean_git(ds.path, annex=None)
    ds.save('addsub')
    # now it is:
    ok_clean_git(ds.path, annex=None)

    # but we could also save while installing, and there should be no side effect
    # of saving any other changes if we state not to auto-save changes
    # Create a dummy change
    create_tree(ds.path, {'dummy.txt': 'buga'})
    ok_clean_git(ds.path, untracked=['dummy.txt'])
    subds_ = ds.install("sub2", source=source)
    eq_(subds_.path, opj(ds.path, "sub2"))  # for paranoid yoh ;)
    ok_clean_git(ds.path, untracked=['dummy.txt'])

    # and we should achieve the same behavior if we create a dataset
    # and then decide to add it
    create(_path_(top_path, 'sub3'))
    ok_clean_git(ds.path, untracked=['dummy.txt', 'sub3/'])
    ds.add('sub3')
    ok_clean_git(ds.path, untracked=['dummy.txt'])
Example no. 12
def test_failed_install_multiple(top_path):
    ds = create(top_path)

    create(_path_(top_path, 'ds1'))
    create(_path_(top_path, 'ds3'))
    ok_clean_git(ds.path, annex=False, untracked=['ds1/', 'ds3/'])

    # specify install with multiple paths and one non-existing
    with assert_raises(IncompleteResultsError) as cme:
        ds.install(['ds1', 'ds2', '///crcns', '///nonexisting', 'ds3'])

    # install doesn't add existing submodules -- add does that
    ok_clean_git(ds.path, annex=False, untracked=['ds1/', 'ds3/'])
    ds.add(['ds1', 'ds3'])
    ok_clean_git(ds.path, annex=False)
    # those which succeeded should be saved now
    eq_(ds.get_subdatasets(), ['crcns', 'ds1', 'ds3'])
    # and those which didn't -- listed
    eq_(set(cme.exception.failed), {'///nonexisting', _path_(top_path, 'ds2')})

    # but if there was only a single installation requested -- it will be
    # InstallFailedError to stay consistent with single install behavior
    # TODO: unify at some point
    with assert_raises(InstallFailedError) as cme:
        ds.install('ds2')
    with assert_raises(InstallFailedError) as cme:
        ds.install('///nonexisting')
Example no. 13
def test_ssh_open_close(tfile1):

    manager = SSHManager()

    path = opj(manager.socket_dir, get_connection_hash('localhost'))
    # TODO: facilitate the test when it didn't exist
    existed_before = exists(path)
    print("%s existed: %s" % (path, existed_before))

    c1 = manager.get_connection('ssh://localhost')
    c1.open()
    # control master exists for sure now
    ok_(exists(path))

    # use connection to execute remote command:
    local_home = os.path.expanduser('~')
    # we list explicitly local HOME since we override it in module_setup
    out, err = c1('ls -a %r' % local_home)
    remote_ls = [entry for entry in out.splitlines()
                 if entry != '.' and entry != '..']
    local_ls = os.listdir(local_home)
    eq_(set(remote_ls), set(local_ls))

    # now test for arguments containing spaces and other pleasant symbols
    out, err = c1('ls -l {}'.format(sh_quote(tfile1)))
    assert_in(tfile1, out)
    eq_(err, '')

    c1.close()
    # control master doesn't exist anymore:
    ok_(exists(path) == existed_before)
Example no. 14
def test_ssh_manager_close():

    manager = SSHManager()

    # check for previously existing sockets:
    existed_before_1 = exists(opj(manager.socket_dir, 'localhost'))
    existed_before_2 = exists(opj(manager.socket_dir, 'datalad-test'))

    manager.get_connection('ssh://localhost').open()
    manager.get_connection('ssh://datalad-test').open()

    if existed_before_1 and existed_before_2:
        # we need at least one connection to have been opened by `manager`
        # itself, so close it and open it again
        manager.get_connection('ssh://localhost').close()
        manager.get_connection('ssh://localhost').open()

    ok_(exists(opj(manager.socket_dir, get_connection_hash('localhost'))))
    ok_(exists(opj(manager.socket_dir, get_connection_hash('datalad-test'))))

    manager.close()

    still_exists_1 = exists(opj(manager.socket_dir, 'localhost'))
    still_exists_2 = exists(opj(manager.socket_dir, 'datalad-test'))

    eq_(existed_before_1, still_exists_1)
    eq_(existed_before_2, still_exists_2)
Example no. 15
def test_get_disposition_filename():
    input = {'url': 'http://human.brain-map.org/api/v2/well_known_file_download/157722290'}
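    # the disposition-filename lookup is mocked out, so no real request is made;
    # the node should yield a single output carrying the mocked filename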
    with patch('datalad.crawler.nodes.misc.get_url_disposition_filename',
               return_value="T1.nii.gz"):
        output = list(get_disposition_filename(input))
    eq_(len(output), 1)
    eq_(output[0]['filename'], 'T1.nii.gz')
Example no. 16
def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)

    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets', return_type='item-or-list')['metadata']['datalad_unique_content_properties']
    # test file has it, but uniques have it blanked out, because the extractor considers it worthless
    # for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # 'date' field carries no value, hence gets excluded from the unique report
    assert_in('date', meta)
    assert(not meta['date'])
    assert_not_in('date', uniques['audio'])
Example no. 17
def test_get_mixed_hierarchy(src, path):

    origin = Dataset(src).create(no_annex=True)
    origin_sub = origin.create('subds')
    with open(opj(origin.path, 'file_in_git.txt'), "w") as f:
        f.write('no idea')
    with open(opj(origin_sub.path, 'file_in_annex.txt'), "w") as f:
        f.write('content')
    origin.add('file_in_git.txt', to_git=True)
    origin_sub.add('file_in_annex.txt')
    origin.save(all_changes=True)

    # now, install that thing:
    ds, subds = install(path, source=src, recursive=True)
    ok_(subds.repo.file_has_content("file_in_annex.txt") is False)

    # and get:
    with swallow_logs(new_level=logging.DEBUG) as cml:
        result = ds.get(curdir, recursive=True)
        assert_re_in('.*Found no annex at {0}. Skipped.'.format(ds),
                     cml.out, flags=re.DOTALL)
        eq_(len(result), 1)
        eq_(result[0]['file'], opj("subds", "file_in_annex.txt"))
        ok_(result[0]['success'] is True)
        ok_(subds.repo.file_has_content("file_in_annex.txt") is True)
Example no. 18
def test_submodule_deinit(path):
    from datalad.support.annexrepo import AnnexRepo

    top_repo = AnnexRepo(path, create=False)
    eq_({'subm 1', '2'}, {s.name for s in top_repo.get_submodules()})
    # note: here init=True is ok, since we are using it just for testing
    with swallow_logs(new_level=logging.WARN) as cml:
        top_repo.update_submodule('subm 1', init=True)
        assert_in('Do not use update_submodule with init=True', cml.out)
    top_repo.update_submodule('2', init=True)

    # ok_(all([s.module_exists() for s in top_repo.get_submodules()]))
    # TODO: old assertion above if non-bare? (can't use "direct mode" in test_gitrepo)
    # Alternatively: New testrepo (plain git submodules) and have a dedicated
    # test for annexes in addition
    ok_(all([GitRepo.is_valid_repo(op.join(top_repo.path, s.path))
             for s in top_repo.get_submodules()]))

    # modify submodule:
    with open(op.join(top_repo.path, 'subm 1', 'file_ut.dat'), "w") as f:
        f.write("some content")

    assert_raises(CommandError, top_repo.deinit_submodule, 'sub1')

    # using force should work:
    top_repo.deinit_submodule('subm 1', force=True)

    ok_(not top_repo.repo.submodule('subm 1').module_exists())
Example no. 19
def test_uninstall_git_file(path):
    ds = Dataset(path)
    ok_(ds.is_installed())
    ok_(exists(opj(path, 'INFO.txt')))
    ok_file_under_git(ds.repo.path, 'INFO.txt')

    # drop file in Git in an annex repo
    # regardless of the type of repo this is 'notneeded'...
    # it is less about education than about "can we
    # get the content back?", and for a file in Git we can
    assert_result_count(
        ds.drop(path='INFO.txt'),
        1,
        status='notneeded',
        message="no annex'ed content")

    res = ds.uninstall(path="INFO.txt", on_failure='ignore')
    assert_result_count(
        res, 1,
        status='impossible',
        message='can only uninstall datasets (consider the `drop` command)')

    # remove the file:
    res = ds.remove(path='INFO.txt', result_xfm='paths',
                    result_filter=lambda x: x['action'] == 'remove')
    assert_raises(AssertionError, ok_file_under_git, ds.repo.path, 'INFO.txt')
    ok_(not exists(opj(path, 'INFO.txt')))
    eq_(res, ['INFO.txt'])
Example no. 20
def test_get_url_parts():
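    # get_url_parts() decomposes a URL into hostname, positional path components
    # (_url0, _url1, ...) and basename/extension fields usable as format keys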
    eq_(au.get_url_parts(""), {})
    assert_dict_equal(au.get_url_parts("http://datalad.org"),
                      {"_url_hostname": "datalad.org"})

    assert_dict_equal(au.get_url_parts("http://datalad.org/about.html"),
                      {"_url_hostname": "datalad.org",
                       "_url0": "about.html",
                       "_url_basename": "about.html",
                       "_url_basename_root_py": "about",
                       "_url_basename_ext_py": ".html",
                       "_url_basename_root": "about",
                       "_url_basename_ext": ".html"})
    assert_dict_equal(au.get_url_parts("http://datalad.org/about.html"),
                      au.get_url_parts("http://datalad.org//about.html"))

    assert_dict_equal(
        au.get_url_parts("http://datalad.org/for/git-users"),
        {"_url_hostname": "datalad.org",
         "_url0": "for",
         "_url1": "git-users",
         "_url_basename": "git-users",
         "_url_basename_root_py": "git-users",
         "_url_basename_ext_py": "",
         "_url_basename_root": "git-users",
         "_url_basename_ext": ""})
Example no. 21
def test_GitRepo_pull(test_path, orig_path, clone_path):
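    # a commit made in the origin clone must become visible in the downstream
    # clone after pull()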

    origin = GitRepo.clone(test_path, orig_path)
    clone = GitRepo.clone(orig_path, clone_path)
    filename = get_most_obscure_supported_name()

    with open(op.join(orig_path, filename), 'w') as f:
        f.write("New file.")
    origin.add(filename)
    origin.commit("new file added.")
    clone.pull()
    ok_(op.exists(op.join(clone_path, filename)))

    # While at it, let's test _get_remotes_having_commit a bit
    clone.add_remote("very_origin", test_path)
    clone.fetch("very_origin")
    eq_(
        clone._get_remotes_having_commit(clone.get_hexsha()),
        ['origin']
    )
    prev_commit = clone.get_hexsha('HEAD^')
    eq_(
        set(clone._get_remotes_having_commit(prev_commit)),
        {'origin', 'very_origin'}
    )
Example no. 22
def _test_match_basic(matcher, query):
    extracts = dict(
        xpaths={'text': 'text()'},
        csss={'favorite': '.class1::text'}
    )
    m = matcher(query, **extracts)

    mg = m(dict(response="<div></div>"))
    ok_(inspect.isgenerator(mg))
    eq_(list(mg), [])  # there should be no hits

    mg = m(dict(response=sample1.response))
    ok_(inspect.isgenerator(mg))
    hits = list(mg)
    eq_(len(hits), 3)
    for hit, a_html, a_text, class1_text in zip(
            hits, sample1.a_htmls, sample1.a_texts, sample1.class1_texts):
        ok_(hit['response'])
        eq_(hit['match'], a_html)
        eq_(hit['text'], a_text)
        eq_(hit.get('favorite', None), class1_text)

    m = matcher(query, min_count=4, **extracts)
    mg = m(dict(response=sample1.response))
    ok_(inspect.isgenerator(mg))
    assert_raises(ValueError, list, mg)

    m = matcher(query, max_count=2, **extracts)
    mg = m(dict(response=sample1.response))
    ok_(inspect.isgenerator(mg))
    assert_raises(ValueError, list, mg)
Example no. 23
def test_kill(path):
    # nested datasets with load
    ds = Dataset(path).create()
    testfile = opj(ds.path, "file.dat")
    with open(testfile, 'w') as f:
        f.write("load")
    ds.save("file.dat")
    subds = ds.create('deep1')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['deep1'])
    ok_clean_git(ds.path)

    # and we fail to remove since content can't be dropped
    res = ds.remove(on_failure='ignore')
    assert_result_count(
        res, 1,
        status='error', path=testfile)
    # Following two assertions on message are relying on the actual error.
    # We have a second result with status 'impossible' for the ds, that we need
    # to filter out for those assertions:
    err_result = [r for r in res if r['status'] == 'error'][0]
    assert_result_values_cond(
        [err_result], 'message',
        lambda x: "configured minimum number of copies not found" in x or
        "Could only verify the existence of 0 out of 1 necessary copies" in x
    )
    eq_(ds.remove(recursive=True, check=False, result_xfm='datasets'),
        [subds, ds])
    ok_(not exists(path))
Example no. 24
def test_GitRepo_fetch(test_path, orig_path, clone_path):

    origin = GitRepo.clone(test_path, orig_path)
    clone = GitRepo.clone(orig_path, clone_path)
    filename = get_most_obscure_supported_name()

    origin.checkout("new_branch", ['-b'])
    with open(op.join(orig_path, filename), 'w') as f:
        f.write("New file.")
    origin.add(filename)
    origin.commit("new file added.")

    fetched = clone.fetch(remote='origin')
    # test FetchInfo list returned by fetch
    eq_([u'origin/' + clone.get_active_branch(), u'origin/new_branch'],
        [commit.name for commit in fetched])

    ok_clean_git(clone.path, annex=False)
    assert_in("origin/new_branch", clone.get_remote_branches())
    assert_in(filename, clone.get_files("origin/new_branch"))
    assert_false(op.exists(op.join(clone_path, filename)))  # not checked out

    # create a remote without a URL:
    origin.add_remote('not-available', 'git://example.com/not/existing')
    origin.config.unset('remote.not-available.url', where='local')

    # fetch without provided URL
    fetched = origin.fetch('not-available')
    # nothing was done, nothing returned:
    eq_([], fetched)
Example no. 25
def test_aggregate_removal(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.add('.', recursive=True)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=subsub.path)
    # check that we only have object files that are listed in agginfo
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # now delete the deepest subdataset to test cleanup of aggregated objects
    # in the top-level ds
    base.remove(opj('sub', 'subsub'), check=False)
    # now aggregation has to detect that subsub is not simply missing, but gone
    # for good
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    # internally consistent state
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # info on subsub was removed at all levels
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 2)
    res = sub.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 1)
Example no. 26
def test_GitRepo_get_remote_url(orig_path, path):
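    # after cloning, 'origin' must point at the clone source; a manually added
    # remote reports the URL it was registered with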

    gr = GitRepo.clone(orig_path, path)
    gr.add_remote('github', 'git://github.com/datalad/testrepo--basic--r1')
    eq_(gr.get_remote_url('origin'), orig_path)
    eq_(gr.get_remote_url('github'),
                 'git://github.com/datalad/testrepo--basic--r1')
Example no. 27
def test_GitRepo_add(src, path):

    gr = GitRepo.clone(src, path)
    filename = get_most_obscure_supported_name()
    with open(op.join(path, filename), 'w') as f:
        f.write("File to add to git")
    added = gr.add(filename)

    eq_(added, {'success': True, 'file': filename})
    assert_in(filename, gr.get_indexed_files(),
              "%s not successfully added to %s" % (filename, path))
    # uncommitted:
    ok_(gr.dirty)

    filename = "another.txt"
    with open(op.join(path, filename), 'w') as f:
        f.write("Another file to add to git")

    # include committing:
    added2 = gr.add(filename)
    gr.commit(msg="Add two files.")
    eq_(added2, {'success': True, 'file': filename})

    assert_in(filename, gr.get_indexed_files(),
              "%s not successfully added to %s" % (filename, path))
    ok_clean_git(path)
Example no. 28
def test_notclone_known_subdataset(src, path):
    # get the superdataset:
    ds = clone(src, path,
               result_xfm='datasets', return_type='item-or-list')

    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_not_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
    # clone is not meaningful
    res = ds.clone('subm 1', on_failure='ignore')
    assert_status('error', res)
    assert_message('Failed to clone from any candidate source URL. '
                   'Encountered errors per each url were: %s',
                   res)
    # get does the job
    res = ds.get(path='subm 1', get_data=False)
    assert_status('ok', res)
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that it is the correct submodule that got installed and not
    # a new repository that was initialized
    eq_(set(subds.repo.get_indexed_files()),
        {'test.dat', 'INFO.txt', 'test-annex.dat'})
    assert_not_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
Example no. 29
def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    ok_clean_git(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True, update_mode='all', force_extraction=True)
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(
        objs,
        list(sorted(base.repo.find(objpath)))
    )
Example no. 30
def test_optimized_cloning(path):
    # make test repo with one file and one commit
    originpath = op.join(path, 'origin')
    repo = GitRepo(originpath, create=True)
    with open(op.join(originpath, 'test'), 'w') as f:
        f.write('some')
    repo.add('test')
    repo.commit('init')
    ok_clean_git(originpath, annex=False)
    from glob import glob

    def _get_inodes(repo):
        return dict(
            [(os.path.join(*o.split(os.sep)[-2:]),
              os.stat(o).st_ino)
             for o in glob(os.path.join(repo.path,
                                        repo.get_git_dir(repo),
                                        'objects', '*', '*'))])

    origin_inodes = _get_inodes(repo)
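    # the origin's inodes are compared against each clone's below; identical
    # inode numbers mean the object files were hard-linked rather than copied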
    # now clone it in different ways and see what happens to the object storage
    from datalad.support.network import get_local_file_url
    clonepath = op.join(path, 'clone')
    for src in (originpath, get_local_file_url(originpath)):
        # deprecated
        assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath)
        clone = GitRepo.clone(url=src, path=clonepath, create=True)
        clone_inodes = _get_inodes(clone)
        eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src))
        rmtree(clonepath)
Example no. 31
def test_procedure_discovery(path, super_path):
    with chpwd(path):
        # ^ Change directory so that we don't fail with an
        # InvalidGitRepositoryError if the test is executed from a git
        # worktree.
        ps = run_procedure(discover=True)
        # there are a few procedures coming with datalad, needs to find them
        assert_true(len(ps) > 2)
        # we get three essential properties
        eq_(
            sum([
                'procedure_type' in p and 'procedure_callfmt' in p
                and 'path' in p for p in ps
            ]), len(ps))

    # set up dataset with registered procedure (c&p from test_basics):
    ds = Dataset(path).create(force=True)
    ds.run_procedure('cfg_yoda')
    ok_clean_git(ds.path)
    # configure dataset to look for procedures in its code folder
    ds.config.add('datalad.locations.dataset-procedures',
                  'code',
                  where='dataset')
    # configure dataset to run the demo procedure prior to the clean command
    ds.config.add('datalad.clean.proc-pre',
                  'datalad_test_proc',
                  where='dataset')
    ds.save(op.join('.datalad', 'config'))

    # run discovery on the dataset:
    ps = ds.run_procedure(discover=True)

    # still needs to find procedures coming with datalad
    assert_true(len(ps) > 2)
    # we get three essential properties
    eq_(
        sum([
            'procedure_type' in p and 'procedure_callfmt' in p and 'path' in p
            for p in ps
        ]), len(ps))
    # dataset's procedure needs to be in the results
    assert_in_results(ps,
                      path=op.join(ds.path, 'code', 'datalad_test_proc.py'))

    # make it a subdataset and try again:
    super = Dataset(super_path).create()
    super.install('sub', source=ds.path)

    ps = super.run_procedure(discover=True)
    # still needs to find procedures coming with datalad
    assert_true(len(ps) > 2)
    # we get three essential properties
    eq_(
        sum([
            'procedure_type' in p and 'procedure_callfmt' in p and 'path' in p
            for p in ps
        ]), len(ps))
    # dataset's procedure needs to be in the results
    assert_in_results(ps,
                      path=op.join(super.path, 'sub', 'code',
                                   'datalad_test_proc.py'))

    if not on_windows:  # no symlinks
        import os
        # create a procedure which is a broken symlink, but recognizable as a
        # python script:
        os.symlink(op.join(super.path, 'sub', 'not_existent'),
                   op.join(super.path, 'sub', 'code', 'broken_link_proc.py'))
        # broken symlink at the procedure location, but we can't tell whether it is
        # an actual procedure without any guess on how to execute it:
        os.symlink(op.join(super.path, 'sub', 'not_existent'),
                   op.join(super.path, 'sub', 'code', 'unknwon_broken_link'))

        ps = super.run_procedure(discover=True)
        # still needs to find procedures coming with datalad and the dataset
        # procedure registered before
        assert_true(len(ps) > 3)
        assert_in_results(ps,
                          path=op.join(super.path, 'sub', 'code',
                                       'broken_link_proc.py'),
                          state='absent')
        assert_not_in_results(ps,
                              path=op.join(super.path, 'sub', 'code',
                                           'unknwon_broken_link'))
Example no. 32
def test_nested_pushclone_cycle_allplatforms(origpath, storepath, clonepath):
    if 'DATALAD_SEED' in os.environ:
        # we are using create-sibling-ria via the cmdline in here
        # this will create random UUIDs for datasets
        # however, given a fixed seed each call to this command will start
        # with the same RNG seed, hence yield the same UUID on the same
        # machine -- leading to a collision
        raise SkipTest(
            'Test incompatible with fixed random number generator seed')
    # the aim here is to high-level test a standard create-push-clone cycle for a
    # dataset with a subdataset, with the goal to ensure that correct branches
    # and commits are tracked, regardless of platform behavior and condition
    # of individual clones. Nothing fancy, just that the defaults behave in
    # sensible ways
    from datalad.cmd import WitlessRunner as Runner
    run = Runner().run

    # create original nested dataset
    with chpwd(origpath):
        run(['datalad', 'create', 'super'])
        run(['datalad', 'create', '-d', 'super', str(Path('super', 'sub'))])

    # verify essential linkage properties
    orig_super = Dataset(Path(origpath, 'super'))
    orig_sub = Dataset(orig_super.pathobj / 'sub')

    (orig_super.pathobj / 'file1.txt').write_text('some1')
    (orig_sub.pathobj / 'file2.txt').write_text('some1')
    with chpwd(orig_super.path):
        run(['datalad', 'save', '--recursive'])

    # TODO not yet reported clean with adjusted branches
    #assert_repo_status(orig_super.path)

    # the "true" branch that sub is on, and the gitsha of the HEAD commit of it
    orig_sub_corr_branch = \
        orig_sub.repo.get_corresponding_branch() or orig_sub.repo.get_active_branch()
    orig_sub_corr_commit = orig_sub.repo.get_hexsha(orig_sub_corr_branch)

    # make sure the super tracks this commit
    assert_in_results(
        orig_super.subdatasets(),
        path=orig_sub.path,
        gitshasum=orig_sub_corr_commit,
        # TODO it should also track the branch name
        # Attempted: https://github.com/datalad/datalad/pull/3817
        # But reverted: https://github.com/datalad/datalad/pull/4375
    )

    # publish to a store, to get into a platform-agnostic state
    # (i.e. no impact of an annex-init of any kind)
    store_url = 'ria+' + get_local_file_url(storepath)
    with chpwd(orig_super.path):
        run([
            'datalad', 'create-sibling-ria', '--recursive', '-s', 'store',
            store_url
        ])
        run(['datalad', 'push', '--recursive', '--to', 'store'])

    # we are using the 'store' sibling's URL, which should be a plain path
    store_super = AnnexRepo(orig_super.siblings(name='store')[0]['url'],
                            init=False)
    store_sub = AnnexRepo(orig_sub.siblings(name='store')[0]['url'],
                          init=False)

    # both datasets in the store only carry the real branches, and nothing
    # adjusted
    for r in (store_super, store_sub):
        eq_(set(r.get_branches()), set([orig_sub_corr_branch, 'git-annex']))

    # and reobtain from a store
    cloneurl = 'ria+' + get_local_file_url(str(storepath), compatibility='git')
    with chpwd(clonepath):
        run(['datalad', 'clone', cloneurl + '#' + orig_super.id, 'super'])
        run(['datalad', '-C', 'super', 'get', '--recursive', '.'])

    # verify that nothing has changed as a result of a push/clone cycle
    clone_super = Dataset(Path(clonepath, 'super'))
    clone_sub = Dataset(clone_super.pathobj / 'sub')
    assert_in_results(
        clone_super.subdatasets(),
        path=clone_sub.path,
        gitshasum=orig_sub_corr_commit,
    )

    for ds1, ds2, f in ((orig_super, clone_super, 'file1.txt'),
                        (orig_sub, clone_sub, 'file2.txt')):
        eq_((ds1.pathobj / f).read_text(), (ds2.pathobj / f).read_text())

    # get status info that does not recurse into subdatasets, i.e. not
    # looking for uncommitted changes
    # we should see no modification reported
    assert_not_in_results(clone_super.status(eval_subdataset_state='commit'),
                          state='modified')
    # and now the same for a more expensive full status
    assert_not_in_results(clone_super.status(recursive=True), state='modified')
Example no. 33
def test_force_checkdatapresent(srcpath, dstpath):
    src = Dataset(srcpath).create()
    target = mk_push_target(src, 'target', dstpath, annex=True, bare=True)
    (src.pathobj / 'test_mod_annex_file').write_text("Heavy stuff.")
    src.save(to_git=False, message="New annex file")
    assert_repo_status(src.path, annex=True)
    whereis_prior = src.repo.whereis(files=['test_mod_annex_file'])[0]

    res = src.push(to='target', data='nothing')
    # nothing reported to be copied
    assert_not_in_results(res, action='copy')
    # we got the git-push nevertheless
    eq_(src.repo.get_hexsha(DEFAULT_BRANCH), target.get_hexsha(DEFAULT_BRANCH))
    # nothing moved
    eq_(whereis_prior, src.repo.whereis(files=['test_mod_annex_file'])[0])

    # now a push without forced no-transfer
    # we do not give 'since', so the non-transferred file is picked up
    # and transferred
    res = src.push(to='target', force=None)
    # no branch change, done before
    assert_in_results(res,
                      action='publish',
                      status='notneeded',
                      refspec=DEFAULT_REFSPEC)
    # but availability update
    assert_in_results(res,
                      action='publish',
                      status='ok',
                      refspec='refs/heads/git-annex:refs/heads/git-annex')
    assert_in_results(res,
                      status='ok',
                      path=str(src.pathobj / 'test_mod_annex_file'),
                      action='copy')
    # whereis info reflects the change
    ok_(
        len(whereis_prior) < len(
            src.repo.whereis(files=['test_mod_annex_file'])[0]))

    # doing it yet again will do nothing, because all is up to date
    assert_status('notneeded', src.push(to='target', force=None))
    # an explicit reference point doesn't change that
    assert_status('notneeded', src.push(to='target',
                                        force=None,
                                        since='HEAD~1'))

    # now force data transfer
    res = src.push(to='target', force='checkdatapresent')
    # no branch change, done before
    assert_in_results(res,
                      action='publish',
                      status='notneeded',
                      refspec=DEFAULT_REFSPEC)
    # no availability update
    assert_in_results(res,
                      action='publish',
                      status='notneeded',
                      refspec='refs/heads/git-annex:refs/heads/git-annex')
    # but data transfer
    assert_in_results(res,
                      status='ok',
                      path=str(src.pathobj / 'test_mod_annex_file'),
                      action='copy')

    # force data transfer, but data isn't available
    src.repo.drop('test_mod_annex_file')
    res = src.push(to='target',
                   path='.',
                   force='checkdatapresent',
                   on_failure='ignore')
    assert_in_results(res,
                      status='impossible',
                      path=str(src.pathobj / 'test_mod_annex_file'),
                      action='copy',
                      message='Slated for transport, but no content present')
Example no. 34
def test_filter_legal_metafield():
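    # field names starting with an underscore get filtered out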
    eq_(au.filter_legal_metafield(["legal", "_not", "legal_still"]),
        ["legal", "legal_still"])
Example no. 35
def test_rerun_branch(path):
    ds = Dataset(path).create()

    ds.repo.tag("prerun")
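    # the "prerun" tag marks the starting state; the reruns below refer to it
    # via since=/onto=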

    outfile = op.join(path, "run-file")

    with swallow_outputs():
        ds.run('echo x$(cat run-file) > run-file')
    ds.rerun()
    eq_('xx\n', open(outfile).read())

    with open(op.join(path, "nonrun-file"), "w") as f:
        f.write("foo")
    ds.save("nonrun-file")

    # Rerun the commands on a new branch that starts at the parent
    # commit of the first run.
    with swallow_outputs():
        ds.rerun(since="prerun", onto="prerun", branch="rerun")

    eq_(ds.repo.get_active_branch(), "rerun")
    eq_('xx\n', open(outfile).read())

    # NOTE: This test depends on the non-run commit above following a run
    # commit.  Otherwise, all the metadata (e.g., author date) aside from the
    # parent commit that is used to generate the commit ID may be set when
    # running the tests, which would result in two commits rather than three.
    for revrange in ["rerun..master", "master..rerun"]:
        eq_(len(ds.repo.get_revisions(revrange)), 3)
    eq_(ds.repo.get_merge_base(["master", "rerun"]),
        ds.repo.get_hexsha("prerun"))

    # Start rerun branch at tip of current branch.
    ds.repo.checkout("master")
    ds.rerun(since="prerun", branch="rerun2")
    eq_(ds.repo.get_active_branch(), "rerun2")
    eq_('xxxx\n', open(outfile).read())

    eq_(len(ds.repo.get_revisions("master..rerun2")), 2)
    eq_(len(ds.repo.get_revisions("rerun2..master")), 0)

    # Using an existing branch name fails.
    ds.repo.checkout("master")
    assert_raises(IncompleteResultsError,
                  ds.rerun,
                  since="prerun",
                  branch="rerun2")
Example no. 36
def test_get_flexible_source_candidates_for_submodule(t, t2):
    f = _get_flexible_source_candidates_for_submodule
    # for now without mocking -- let's just really build a dataset
    ds = create(t)
    clone = install(
        t2, source=t,
        result_xfm='datasets', return_type='item-or-list')

    # first one could just know about itself or explicit url provided
    sshurl = 'ssh://e.c'
    httpurl = 'http://e.c'
    # Expansion with '/.git' no longer done in this helper
    #sm_httpurls = [httpurl, httpurl + '/.git']
    sm_httpurls = [httpurl]
    eq_(f(ds, 'sub'), [])
    eq_(f(ds, 'sub', sshurl), [sshurl])
    eq_(f(ds, 'sub', httpurl), sm_httpurls)
    eq_(f(ds, 'sub', None), [])  # otherwise really we have no clue where to get it from

    # but if we work on dsclone then it should also add urls deduced from its
    # own location default remote for current branch
    eq_(f(clone, 'sub'), [t + '/sub'])
    eq_(f(clone, 'sub', sshurl), [t + '/sub', sshurl])
    eq_(f(clone, 'sub', httpurl), [t + '/sub'] + sm_httpurls)
    eq_(f(clone, 'sub'), [t + '/sub'])  # otherwise really we have no clue where to get it from
Example no. 37
def test_loads():
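    # valid JSON parses normally; trailing garbage raises JSONDecodeError and
    # a warning about the failed load is emitted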
    eq_(loads('{"a": 2}'), {'a': 2})
    with assert_raises(JSONDecodeError),\
            swallow_logs(new_level=logging.WARNING) as cml:
        loads('{"a": 2}x')
    assert_in('Failed to load content from', cml.out)
Example no. 38
def test_load_screwy_unicode(fname):
    # test that we can tolerate some screwy unicode embeddings within json
    assert_raises(JSONDecodeError, load, fname, fixup=False)
    with swallow_logs(new_level=logging.WARNING) as cml:
        eq_(load(fname), {'Authors': ['A1', 'A2']})
        assert_in('Failed to decode content', cml.out)
Example no. 39
def test_formatter_missing_arg():
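    # "NA" is the fallback value; the empty 'nothere' value is replaced with it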
    fmt = au.Formatter({}, "NA")
    eq_(fmt.format("{here},{nothere}", {"here": "ok", "nothere": ""}), "ok,NA")
Example no. 40
def test_formatter_placeholder_with_spaces():
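    # placeholder names may contain spaces and are looked up in the provided
    # mapping as-is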
    fmt = au.Formatter({})
    eq_(fmt.format("{with spaces}", {"with spaces": "value0"}), "value0")
Example no. 41
def test_split_ext():
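    # a chain of short suffixes (e.g. ".tar.gz") is treated as one extension,
    # while overly long components stay in the stem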
    eq_(au.split_ext("file"), ("file", ""))

    eq_(au.split_ext("file.py"), ("file", ".py"))
    eq_(au.split_ext("file.tar.gz"), ("file", ".tar.gz"))
    eq_(au.split_ext("file.toolong.gz"), ("file.toolong", ".gz"))

    eq_(au.split_ext("file.a.b.c.d"), ("file", ".a.b.c.d"))
    eq_(au.split_ext("file.a.b.cccc.d"), ("file", ".a.b.cccc.d"))
    eq_(au.split_ext("file.a.b.ccccc.d"), ("file.a.b.ccccc", ".d"))

    eq_(au.split_ext("file.a.b..c"), ("file", ".a.b..c"))
Example no. 42
def test_save(path):

    ds = Dataset(path)

    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("something")

    ds.repo.add("new_file.tst", git=True)
    ok_(ds.repo.dirty)

    ds.save(message="add a new file")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("modify")

    ok_(ds.repo.dirty)
    ds.save(message="modified new_file.tst")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # save works without ds and files given in the PWD
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("rapunzel")
    with chpwd(path):
        save(message="love rapunzel")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # and also without `-a` when things are staged
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("exotic")
    ds.repo.add("new_file.tst", git=True)
    with chpwd(path):
        save(message="love marsians")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    files = ['one.txt', 'two.txt']
    for fn in files:
        with open(op.join(path, fn), "w") as f:
            f.write(fn)

    ds.save([op.join(path, f) for f in files])
    # superfluous call to save (all was saved already), should not fail
    # but report that nothing was saved
    assert_status('notneeded', ds.save(message="set of new files"))
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # create subdataset
    subds = ds.create('subds')
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    # modify subds
    with open(op.join(subds.path, "some_file.tst"), "w") as f:
        f.write("something")
    subds.save()
    assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    # ensure modified subds is committed
    ds.save()
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # now introduce a change downstairs
    subds.create('someotherds')
    assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    ok_(ds.repo.dirty)
    # and save via subdataset path
    ds.save('subds', version_tag='new_sub')
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    tags = ds.repo.get_tags()
    ok_(len(tags) == 1)
    eq_(tags[0], dict(hexsha=ds.repo.get_hexsha(), name='new_sub'))
    # fails when retagged, like git does
    res = ds.save(version_tag='new_sub', on_failure='ignore')
    assert_status('error', res)
    assert_result_count(res,
                        1,
                        action='save',
                        type='dataset',
                        path=ds.path,
                        message=('cannot tag this version: %s',
                                 "fatal: tag 'new_sub' already exists"))
Example no. 43
def test_get_recurse_subdatasets(src, path):

    ds = install(
        path, source=src,
        result_xfm='datasets', return_type='item-or-list')

    # ask for the two subdatasets specifically. This will obtain them,
    # but not any content of any files in them
    subds1, subds2 = ds.get(['subm 1', 'subm 2'], get_data=False,
                            description="youcouldnotmakethisup",
                            result_xfm='datasets')
    for d in (subds1, subds2):
        eq_(d.repo.get_description(), 'youcouldnotmakethisup')

    # there are 3 files to get: test-annex.dat within each dataset:
    rel_path_sub1 = opj(basename(subds1.path), 'test-annex.dat')
    rel_path_sub2 = opj(basename(subds2.path), 'test-annex.dat')
    annexed_files = {'test-annex.dat',
                     rel_path_sub1,
                     rel_path_sub2}

    # None of them is currently present:
    ok_(ds.repo.file_has_content('test-annex.dat') is False)
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)
    ok_(subds2.repo.file_has_content('test-annex.dat') is False)

    ok_clean_git(subds1.path)
    # explicitly given path in subdataset => implicit recursion:
    # MIH: Nope, we fulfill the dataset handle, but that doesn't
    #      imply fulfilling all file handles
    result = ds.get(rel_path_sub1, recursive=True)
    # all good actions
    assert_status('ok', result)

    assert_in_results(result, path=opj(ds.path, rel_path_sub1), status='ok')
    ok_(subds1.repo.file_has_content('test-annex.dat') is True)

    # drop it:
    subds1.repo.drop('test-annex.dat')
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)

    # now, with a path not explicitly pointing within a
    # subdataset, but recursive option:
    # get everything:
    result = ds.get(recursive=True, result_filter=lambda x: x.get('type') != 'dataset')
    assert_status('ok', result)

    eq_(set([item.get('path')[len(ds.path) + 1:] for item in result
             if item['type'] == 'file']),
        annexed_files)
    ok_(ds.repo.file_has_content('test-annex.dat') is True)
    ok_(subds1.repo.file_has_content('test-annex.dat') is True)
    ok_(subds2.repo.file_has_content('test-annex.dat') is True)

    # drop them:
    ds.repo.drop('test-annex.dat')
    subds1.repo.drop('test-annex.dat')
    subds2.repo.drop('test-annex.dat')
    ok_(ds.repo.file_has_content('test-annex.dat') is False)
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)
    ok_(subds2.repo.file_has_content('test-annex.dat') is False)

    # now, the very same call, but without recursive:
    result = ds.get('.', recursive=False)
    assert_status('ok', result)
    # one report is on the requested dir
    eq_(len(result) - 1, 1)
    assert_result_count(
        result, 1, path=opj(ds.path, 'test-annex.dat'), status='ok')
    ok_(ds.repo.file_has_content('test-annex.dat') is True)
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)
    ok_(subds2.repo.file_has_content('test-annex.dat') is False)
Example no. 44
def test_aggregation(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(op.join(path, 'origin')).create(force=True)
    ds.config.add('datalad.metadata.nativetype',
                  'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype',
                     'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage',
                        where='dataset')
    assert_status('ok', ds.save(recursive=True))
    # while we are at it: do it again, nothing should happen
    assert_status('notneeded', ds.save(recursive=True))

    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.meta_aggregate(recursive=True, into='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='meta_aggregate')
    # the respective super datasets see two saves, one to record the change
    # in the subdataset after its own aggregation, and one after the super
    # was updated with aggregated metadata
    assert_result_count(res, 5, status='ok', action='save', type='dataset')
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.meta_dump(reporton='aggregates', recursive=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.meta_dump(recursive=True)
    # basic sanity check
    assert_result_count(origres, 3, type='dataset')
    assert_result_count([r for r in origres if r['path'].endswith('.json')],
                        3,
                        type='file')  # Now that we have annex.key
    # three different IDs
    eq_(
        3,
        len(
            set([
                _get_dsid_from_core_metadata(s['metadata']['metalad_core'])
                for s in origres if s['type'] == 'dataset'
            ])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                    == assure_unicode(name) for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(op.join(path, 'clone'),
                    source=ds.path,
                    result_xfm='datasets',
                    return_type='item-or-list')
    # ID mechanism works
    eq_(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.meta_dump()
    # basic sanity check
    assert_result_count(cloneres, 1, type='dataset')
    # payload file
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(r['query_matched']['frictionless_datapackage.name'],
                      r['metadata']['frictionless_datapackage']['name'])
Example no. 45
def test_rerun_onto(path):
    ds = Dataset(path).create()
    # Make sure we have more than one commit. The one commit case is checked
    # elsewhere.
    ds.repo.commit(msg="noop commit", options=["--allow-empty"])

    grow_file = op.join(path, "grows")

    # Make sure we can handle range-specifications that yield no results.
    for since in ["", "HEAD"]:
        assert_result_count(ds.rerun("HEAD",
                                     onto="",
                                     since=since,
                                     on_failure="ignore"),
                            1,
                            status="impossible",
                            action="run")

    ds.run('echo static-content > static')
    ds.repo.tag("static")
    with swallow_outputs():
        ds.run('echo x$(cat grows) > grows')
    ds.rerun()
    eq_('xx\n', open(grow_file).read())

    # If we run the "static" change on top of itself, we end up in the
    # same (but detached) place.
    ds.rerun(revision="static", onto="static")
    ok_(ds.repo.get_active_branch() is None)
    eq_(ds.repo.get_hexsha(), ds.repo.get_hexsha("static"))

    # If we run the "static" change from the same "base", we end up
    # with a new commit.
    ds.repo.checkout("master")
    with swallow_outputs():
        ds.rerun(revision="static", onto="static^")
    ok_(ds.repo.get_active_branch() is None)
    neq_(ds.repo.get_hexsha(), ds.repo.get_hexsha("static"))
    ok_(all(r["state"] == "clean" for r in ds.diff(fr="HEAD", to="static")))
    for revrange in ["..static", "static.."]:
        eq_(len(ds.repo.get_revisions(revrange)), 1)

    # Unlike the static change, if we run the ever-growing change on
    # top of itself, we end up with a new commit.
    ds.repo.checkout("master")
    ds.rerun(onto="HEAD")
    ok_(ds.repo.get_active_branch() is None)
    neq_(ds.repo.get_hexsha(), ds.repo.get_hexsha("master"))

    # An empty `onto` means use the parent of the first revision.
    ds.repo.checkout("master")
    with swallow_outputs():
        ds.rerun(since="static^", onto="")
    ok_(ds.repo.get_active_branch() is None)
    for revrange in ["..master", "master.."]:
        eq_(len(ds.repo.get_revisions(revrange)), 3)

    # An empty `onto` means use the parent of the first revision that
    # has a run command.
    ds.repo.checkout("master")
    with swallow_outputs():
        ds.rerun(since="", onto="", branch="from-base")
    eq_(ds.repo.get_active_branch(), "from-base")
    ok_(
        all(r["state"] == "clean"
            for r in ds.diff(fr="master", to="from-base")))
    eq_(ds.repo.get_merge_base(["static", "from-base"]),
        ds.repo.get_hexsha("static^"))

    # We abort when an explicitly specified `onto` doesn't exist.
    ds.repo.checkout("master")
    assert_result_count(ds.rerun(since="",
                                 onto="doesnotexist",
                                 branch="from-base",
                                 on_failure="ignore"),
                        1,
                        status="error",
                        action="run")
Example n. 46
0
def test_openfmri_pipeline2(ind, topurl, outd):
    # no versioned files -- should still work! ;)

    list(
        initiate_dataset(template="openfmri",
                         dataset_name='dataladtest-ds666',
                         path=outd,
                         data_fields=['dataset'])({
                             'dataset': 'ds666'
                         }))

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents

    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    # all commits out there:
    # backend set, dataset init, crawler init, incoming (shares with master -1),
    #   (2 or 3 commits, depending on create variant)
    # incoming-processed, merge, aggregate metadata:
    ncommits_master = len(commits_hexsha['master'])
    assert_in(ncommits_master, [5, 6])
    assert_in(len(commits_l['master']), [4, 5])

    eq_(len(commits_hexsha['incoming']), ncommits_master - 2)
    eq_(len(commits_l['incoming']), ncommits_master - 2)
    eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 1)
    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 2)

    # rerun pipeline -- make sure we are on the same commit in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    eq_(out[0]['datalad_stats'], ActivityStats(files=2, skipped=2, urls=2))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    os.rename(opj(ind, 'ds666', 'ds666_R2.0.0.tar.gz'),
              opj(ind, 'ds666', 'ds666.tar.gz'))

    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)
    eq_(out[0]['datalad_stats'], ActivityStats())  # was committed
    stats_total = out[0]['datalad_stats'].get_total()
    stats_total.downloaded_size = 0
    eq_(
        stats_total,
        ActivityStats(files=4,
                      overwritten=1,
                      skipped=1,
                      downloaded=1,
                      merges=[['incoming', 'incoming-processed']],
                      versions=['1.0.0'],
                      renamed=1,
                      urls=2,
                      add_annex=2))
    # in reality there is also 1.0.0+1 tag since file changed but no version suffix
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1'])

    check_dropall_get(repo)
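For orientation, the crawler invocation used in this test reduces to the sketch below; it assumes the same imports as this test module (initiate_dataset, ofpipeline, run_pipeline, chpwd from datalad/datalad-crawler), and the dataset name and URL are hypothetical.

# Sketch of initiating and running an openfmri crawl pipeline
# (names as imported by this test module; hypothetical dataset/URL).
outd = '/tmp/crawl_demo'
list(initiate_dataset(template="openfmri",
                      dataset_name='dataladtest-ds999',
                      path=outd,
                      data_fields=['dataset'])({'dataset': 'ds999'}))
with chpwd(outd):
    pipeline = ofpipeline('ds999', versioned_urls=False,
                          topurl='http://example.com/openfmri/')
    out = run_pipeline(pipeline)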
Example n. 47
0
def test_run_inputs_outputs(src, path):
    for subds in [("s0", "s1_0", "s2"), ("s0", "s1_1", "s2"), ("s0", "s1_0"),
                  ("s0", "s1_1"), ("s0", "ss"), ("s0", )]:
        Dataset(op.join(*((src, ) + subds))).create(force=True).save()
    src_ds = Dataset(src).create(force=True)
    src_ds.save()

    ds = install(path,
                 source=src,
                 result_xfm='datasets',
                 return_type='item-or-list')
    assert_false(ds.repo.file_has_content("input.dat"))
    assert_false(ds.repo.file_has_content("extra-input.dat"))

    # The specified inputs and extra inputs will be retrieved before the run.
    # (Use run_command() to access the extra_inputs argument.)
    list(
        run_command("{} {{inputs}} {{inputs}} >doubled.dat".format(
            'type' if on_windows else 'cat'),
                    dataset=ds,
                    inputs=["input.dat"],
                    extra_inputs=["extra-input.dat"]))

    assert_repo_status(ds.path)
    ok_(ds.repo.file_has_content("input.dat"))
    ok_(ds.repo.file_has_content("extra-input.dat"))
    ok_(ds.repo.file_has_content("doubled.dat"))
    with open(op.join(path, "doubled.dat")) as fh:
        content = fh.read()
        assert_in("input", content)
        assert_not_in("extra-input", content)

    # Rerunning the commit will also get the input file.
    ds.repo.drop(["input.dat", "extra-input.dat"], options=["--force"])
    assert_false(ds.repo.file_has_content("input.dat"))
    assert_false(ds.repo.file_has_content("extra-input.dat"))
    ds.rerun()
    ok_(ds.repo.file_has_content("input.dat"))
    ok_(ds.repo.file_has_content("extra-input.dat"))

    with swallow_logs(new_level=logging.WARN) as cml:
        ds.run("cd .> dummy", inputs=["not-there"])
        assert_in("Input does not exist: ", cml.out)

    # Test different combinations of globs and explicit files.
    inputs = ["a.dat", "b.dat", "c.txt", "d.txt"]
    create_tree(ds.path, {i: i for i in inputs})

    ds.save()
    ds.repo.copy_to(inputs, remote="origin")
    ds.repo.drop(inputs, options=["--force"])

    test_cases = [(["*.dat"], ["a.dat", "b.dat"]),
                  (["*.dat", "c.txt"], ["a.dat", "b.dat", "c.txt"]),
                  (["*"], inputs)]

    for idx, (inputs_arg, expected_present) in enumerate(test_cases):
        assert_false(any(ds.repo.file_has_content(i) for i in inputs))

        ds.run("cd .> dummy{}".format(idx), inputs=inputs_arg)
        ok_(all(ds.repo.file_has_content(f) for f in expected_present))
        # Globs are stored unexpanded by default.
        assert_in(inputs_arg[0], ds.repo.format_commit("%B"))
        ds.repo.drop(inputs, options=["--force"])

    # --input can be passed a subdirectory.
    create_tree(ds.path, {"subdir": {"a": "subdir a", "b": "subdir b"}})
    ds.save("subdir")
    ds.repo.copy_to(["subdir/a", "subdir/b"], remote="origin")
    ds.repo.drop("subdir", options=["--force"])
    ds.run("cd .> subdir-dummy", inputs=[op.join(ds.path, "subdir")])
    ok_(all(
        ds.repo.file_has_content(op.join("subdir", f)) for f in ["a", "b"]))

    # Inputs are specified relative to a dataset's subdirectory.
    ds.repo.drop(op.join("subdir", "a"), options=["--force"])
    with chpwd(op.join(path, "subdir")):
        run("cd .> subdir-dummy1", inputs=["a"])
    ok_(ds.repo.file_has_content(op.join("subdir", "a")))

    # --input=. runs "datalad get ."
    ds.run("cd .> dot-dummy", inputs=["."])
    eq_(ds.repo.get_annexed_files(),
        ds.repo.get_annexed_files(with_content_only=True))
    # On rerun, we get all files, even those that weren't in the tree at the
    # time of the run.
    create_tree(ds.path, {"after-dot-run": "after-dot-run content"})
    ds.save()
    ds.repo.copy_to(["after-dot-run"], remote="origin")
    ds.repo.drop(["after-dot-run"], options=["--force"])
    ds.rerun("HEAD^")
    ds.repo.file_has_content("after-dot-run")

    # --output will unlock files that are present.
    ds.repo.get("a.dat")
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    with open(op.join(path, "a.dat")) as fh:
        eq_(fh.read(),
            "a.dat' appended' \n" if on_windows else "a.dat appended\n")

    # --output will remove files that are not present.
    ds.repo.drop(["a.dat", "d.txt"], options=["--force"])
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    if not on_windows:
        # MIH doesn't yet understand how to port this
        with open(op.join(path, "a.dat")) as fh:
            eq_(fh.read(), " appended\n")

    # --input can be combined with --output.
    ds.repo.call_git(["reset", "--hard", "HEAD~2"])
    ds.run("echo ' appended' >>a.dat", inputs=["a.dat"], outputs=["a.dat"])
    if not on_windows:
        # MIH doesn't yet understand how to port this
        with open(op.join(path, "a.dat")) as fh:
            eq_(fh.read(), "a.dat appended\n")

    if not on_windows:
        # see datalad#2606
        with swallow_logs(new_level=logging.DEBUG) as cml:
            with swallow_outputs():
                ds.run("echo blah", outputs=["not-there"])
                assert_in("Filtered out non-existing path: ", cml.out)

    ds.create('sub')
    ds.run("echo sub_orig >sub/subfile")
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])
    ds.drop("sub/subfile", check=False)
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])

    # --input/--output globs can be stored in expanded form.
    ds.run("cd .> expand-dummy",
           inputs=["a.*"],
           outputs=["b.*"],
           expand="both")
    assert_in("a.dat", ds.repo.format_commit("%B"))
    assert_in("b.dat", ds.repo.format_commit("%B"))

    res = ds.rerun(report=True, return_type='item-or-list')
    eq_(res["run_info"]['inputs'], ["a.dat"])
    eq_(res["run_info"]['outputs'], ["b.dat"])

    # We install subdatasets to fully resolve globs.
    ds.uninstall("s0")
    assert_false(Dataset(op.join(path, "s0")).is_installed())
    ds.run("echo {inputs} >globbed-subds", inputs=["s0/s1_*/s2/*.dat"])
    ok_file_has_content(op.join(ds.path, "globbed-subds"),
                        "'s0\\s1_0\\s2\\a.dat' 's0\\s1_1\\s2\\c.dat'"
                        if on_windows else "s0/s1_0/s2/a.dat s0/s1_1/s2/c.dat",
                        strip=True)

    ds_ss = Dataset(op.join(path, "s0", "ss"))
    assert_false(ds_ss.is_installed())
    ds.run("echo blah >{outputs}", outputs=["s0/ss/out"])
    ok_(ds_ss.is_installed())
    ok_file_has_content(op.join(ds.path, "s0", "ss", "out"),
                        "blah",
                        strip=True)
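The input/output declaration machinery exercised above boils down to the following; a minimal sketch with hypothetical paths and a POSIX command.

# Minimal sketch of declaring inputs/outputs for `datalad run`
# (hypothetical paths; expand='both' records the expanded globs).
from datalad.api import Dataset

ds = Dataset('/tmp/run_io_demo').create()
(ds.pathobj / 'in.dat').write_text('payload')
ds.save()
# inputs are retrieved before, outputs unlocked (or removed) before the run
ds.run('cp in.dat out.dat',
       inputs=['*.dat'], outputs=['out.dat'], expand='both')
# the expanded lists are available from a rerun report
report = ds.rerun(report=True, return_type='item-or-list')
print(report['run_info']['inputs'], report['run_info']['outputs'])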
Example n. 48
0
def test_clean_subds_removal(path):
    ds = Dataset(path).create()
    subds1 = ds.create('one')
    subds2 = ds.create('two')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['one', 'two'])
    assert_repo_status(ds.path)
    # now kill one
    res = ds.remove('one', result_xfm=None)
    # subds1 got uninstalled, and ds got the removal of subds1 saved
    assert_result_count(res,
                        1,
                        path=subds1.path,
                        action='uninstall',
                        status='ok')
    assert_result_count(res, 1, path=subds1.path, action='remove', status='ok')
    assert_result_count(res, 1, path=ds.path, action='save', status='ok')
    ok_(not subds1.is_installed())
    assert_repo_status(ds.path)
    # two must remain
    eq_(ds.subdatasets(result_xfm='relpaths'), ['two'])
    # one is gone
    assert (not exists(subds1.path))
    # and now again, but this time remove something that is not installed
    ds.create('three')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['three', 'two'])
    ds.uninstall('two')
    assert_repo_status(ds.path)
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['three', 'two'])
    ok_(not subds2.is_installed())
    assert (exists(subds2.path))
    res = ds.remove('two', result_xfm='datasets')
    assert_repo_status(ds.path)
    # subds2 was already uninstalled, now ds got the removal of subds2 saved
    assert (not exists(subds2.path))
    eq_(ds.subdatasets(result_xfm='relpaths'), ['three'])
    eq_(res, [subds2, ds])
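A stripped-down version of the subdataset lifecycle checked here; a minimal sketch with hypothetical paths.

# Minimal sketch of registering, uninstalling, and removing a subdataset.
from datalad.api import Dataset

ds = Dataset('/tmp/subds_demo').create()
ds.create('one')                                  # register a subdataset
print(ds.subdatasets(result_xfm='relpaths'))      # ['one']
ds.uninstall('one')       # drop the working tree, keep the registration
ds.remove('one')          # drop the registration (and save that change)
print(ds.subdatasets(result_xfm='relpaths'))      # []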
Example n. 49
0
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(
        initiate_dataset(template="openfmri",
                         dataset_name='dataladtest-ds666',
                         path=outd,
                         data_fields=['dataset'])({
                             'dataset': 'ds666'
                         }))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content,
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes([('metadata/objects/**', {
        'annex.largefiles': '(nothing)'
    })], dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes",
                files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(
        not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents

    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    # all commits out there:
    # dataset init, crawler init
    #   (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    #   - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect: 2 removals of
    #   now-obsolete metadata object files,
    #     see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])

    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents), (commits_l['incoming-processed'][1],
    #                                                         commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents), (commits_l['incoming-processed'][3],  # also in master
    #                                                         commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents), (commits_l['master'][2],
    #                                             commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents), (commits_l['master'][4],
    #                                             commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the number of created object files in addition to that comparison
    eq_(
        set([
            f for f in all_files
            if not f.startswith('./.datalad/metadata/objects/')
        ]), target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same commit in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content=
        '<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since there is no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')

    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the number of created object files in addition to that comparison
    eq_(
        set([
            f for f in all_files_updated
            if not f.startswith('./.datalad/metadata/objects/')
        ]), target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'],
        ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(
        total_stats,
        ActivityStats(
            files=8,
            skipped=5,
            downloaded=1,
            renamed=1,
            urls=6,
            add_annex=2,  # add_git=1, # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely
            # in incoming-processed and merged into master -- new commits will come
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository",
                      cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)
    commits_hexsha_removed = {
        b: list(_get_branch_commits(repo, b))
        for b in branches
    }
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming),
        2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(
        set(dincoming.keys()), {
            repo.pathobj / '.datalad/crawl/statuses/incoming.json',
            repo.pathobj / 'ds666_R1.0.0.tar.gz'
        })

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as well
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl(
        )  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
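The .gitattributes trick applied at the start of this test (keeping metadata objects in git so that a later drop of all annexed content cannot remove them) can be reproduced in isolation; a sketch with a hypothetical dataset path.

# Sketch of forcing .datalad metadata objects into git rather than annex.
from os.path import join as opj
from datalad.api import Dataset

ds = Dataset('/tmp/attrs_demo').create()
attrfile = opj('.datalad', '.gitattributes')   # .datalad/ exists after create()
ds.repo.set_gitattributes(
    [('metadata/objects/**', {'annex.largefiles': '(nothing)'})],
    attrfile)
ds.repo.commit("gitattributes", files=attrfile)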
Example n. 50
0
def test_get_subpaths():
    for fname, expect in [("no/dbl/slash", ("no/dbl/slash", [])),
                          ("p1//n", ("p1/n", ["p1"])),
                          ("p1//p2/p3//n", ("p1/p2/p3/n", ["p1", "p1/p2/p3"])),
                          ("//n", ("/n", [""])), ("n//", ("n/", ["n"]))]:
        eq_(au.get_subpaths(fname), expect)
Example n. 51
0
def test_path_diff(_path, linkpath):
    # do the setup on the real path, not the symlink, to have its
    # bugs not affect this test of status()
    ds = get_deeply_nested_structure(str(_path))
    if has_symlink_capability():
        # make it more complicated by default
        ut.Path(linkpath).symlink_to(_path, target_is_directory=True)
        path = linkpath
    else:
        path = _path

    ds = Dataset(path)
    if not on_windows:
        # TODO test should also be has_symlink_capability(), but
        # something in the repo base class is not behaving yet
        # check the premise of this test
        assert ds.pathobj != ds.repo.pathobj

    plain_recursive = ds.diff(recursive=True, annex='all')
    # check integrity of individual reports with a focus on how symlinks
    # are reported
    for res in plain_recursive:
        # anything that is an "intended" symlink should be reported
        # as such. In contrast, anything that is a symlink for mere
        # technical reasons (annex using it for something in some mode)
        # should be reported as the thing it is representing (i.e.
        # a file)
        if 'link2' in text_type(res['path']):
            assert res['type'] == 'symlink', res
        else:
            assert res['type'] != 'symlink', res
        # every item must report its parent dataset
        assert_in('parentds', res)

    # bunch of smoke tests
    # query of '.' is same as no path
    eq_(plain_recursive, ds.diff(path='.', recursive=True, annex='all'))
    # duplicate paths do not change things
    eq_(plain_recursive, ds.diff(path=['.', '.'], recursive=True, annex='all'))
    # neither do nested paths
    eq_(plain_recursive,
        ds.diff(path=['.', 'subds_modified'], recursive=True, annex='all'))
    # when invoked in a subdir of a dataset it still reports on the full thing
    # just like `git status`, as long as there are no paths specified
    with chpwd(op.join(path, 'directory_untracked')):
        plain_recursive = diff(recursive=True, annex='all')
    # should be able to take absolute paths and yield the same
    # output
    eq_(plain_recursive, ds.diff(path=ds.path, recursive=True, annex='all'))

    # query for a deeply nested path from the top, should just work with a
    # variety of approaches
    rpath = op.join('subds_modified', 'subds_lvl1_modified',
                    u'{}_directory_untracked'.format(OBSCURE_FILENAME))
    apathobj = ds.pathobj / rpath
    apath = text_type(apathobj)
    for p in (rpath, apath, None):
        if p is None:
            # change into the realpath of the dataset and
            # query with an explicit path
            with chpwd(ds.path):
                res = ds.diff(path=op.join('.', rpath),
                              recursive=True,
                              annex='all')
        else:
            res = ds.diff(path=p, recursive=True, annex='all')
        assert_result_count(
            res,
            1,
            state='untracked',
            type='directory',
            refds=ds.path,
            # path always comes out a full path inside the queried dataset
            path=apath,
        )

    assert_result_count(ds.diff(recursive=True), 1, path=apath)
    # limiting recursion will exclude this particular path
    assert_result_count(ds.diff(recursive=True, recursion_limit=1),
                        0,
                        path=apath)
    # a negative limit means no limit
    eq_(ds.diff(recursive=True, recursion_limit=-1), ds.diff(recursive=True))
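The command-level diff queries above reduce to a few calls; a minimal sketch with a hypothetical dataset path.

# Minimal sketch of recursive, annex-aware diff reporting.
from datalad.api import Dataset

ds = Dataset('/tmp/diff_demo').create()
(ds.pathobj / 'new_file.txt').write_text('hello')
# '.' is equivalent to no path constraint; results are per-item records
for res in ds.diff(path='.', recursive=True, annex='all'):
    print(res['path'], res['state'], res['type'])
# recursion into subdatasets can be capped (negative means unlimited)
shallow = ds.diff(recursive=True, recursion_limit=1)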
Example n. 52
0
def func2(x):
    assert x == 1
    eq_(ui.yesno("title"), True)
    eq_(ui.question("title2"), "maybe so")
    assert_raises(AssertionError, ui.question, "asking more than we know")
    return x * 2
Example n. 53
0
def test_a_href_match_basic():
    m = a_href_match('.*')

    mg = m(dict(response=sample1.response))
    ok_(inspect.isgenerator(mg))
    hits = list(mg)
    eq_(len(hits), 3)
    eq_([u['url_text'] for u in hits], sample1.a_texts)
    eq_([u['url_href'] for u in hits], sample1.a_url_hrefs)
    # nothing done to url
    eq_([u['url'] for u in hits], sample1.a_url_hrefs)

    # if we do provide original url where it comes from -- result urls should be full
    mg = m(dict(response=sample1.response, url="http://w.example.com:888/d/"))
    ok_(inspect.isgenerator(mg))
    hits = list(mg)
    eq_(len(hits), 3)
    eq_([u['url_text'] for u in hits], sample1.a_texts)
    eq_([u['url_href'] for u in hits], sample1.a_url_hrefs)
    eq_([u['url'] for u in hits],
        ['http://w.example.com:888/', 'http://w.example.com:888/d/buga/duga/du', 'http://example.com'])
Example n. 54
0
def test_repo_diff(path, norepo):
    ds = Dataset(path).create()
    assert_repo_status(ds.path)
    assert_raises(ValueError, ds.repo.diff, fr='WTF', to='MIKE')
    # no diff
    eq_(ds.repo.diff('HEAD', None), {})
    # bogus path makes no difference
    eq_(ds.repo.diff('HEAD', None, paths=['THIS']), {})
    # let's introduce a known change
    create_tree(ds.path, {'new': 'empty'})
    ds.save(to_git=True)
    assert_repo_status(ds.path)
    eq_(
        ds.repo.diff(fr='HEAD~1', to='HEAD'), {
            ut.Path(ds.repo.pathobj / 'new'): {
                'state': 'added',
                'type': 'file',
                'bytesize': 5,
                'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6'
            }
        })
    # modify known file
    create_tree(ds.path, {'new': 'notempty'})
    eq_(
        ds.repo.diff(fr='HEAD', to=None),
        {
            ut.Path(ds.repo.pathobj / 'new'): {
                'state': 'modified',
                'type': 'file',
                # the beast is modified, but no change in shasum -> not staged
                'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6',
                'prev_gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6'
            }
        })
    # per path query gives the same result
    eq_(ds.repo.diff(fr='HEAD', to=None),
        ds.repo.diff(fr='HEAD', to=None, paths=['new']))
    # also, giving a directory as a constraint does the same
    eq_(ds.repo.diff(fr='HEAD', to=None),
        ds.repo.diff(fr='HEAD', to=None, paths=['.']))
    # but if we give another path, it doesn't show up
    eq_(ds.repo.diff(fr='HEAD', to=None, paths=['other']), {})

    # make clean
    ds.save()
    assert_repo_status(ds.path)

    # untracked stuff
    create_tree(ds.path, {'deep': {'down': 'untracked', 'down2': 'tobeadded'}})
    # default is to report all files
    eq_(
        ds.repo.diff(fr='HEAD', to=None), {
            ut.Path(ds.repo.pathobj / 'deep' / 'down'): {
                'state': 'untracked',
                'type': 'file'
            },
            ut.Path(ds.repo.pathobj / 'deep' / 'down2'): {
                'state': 'untracked',
                'type': 'file'
            }
        })
    # but can be made more compact
    eq_(
        ds.repo.diff(fr='HEAD', to=None, untracked='normal'), {
            ut.Path(ds.repo.pathobj / 'deep'): {
                'state': 'untracked',
                'type': 'directory'
            }
        })

    # again an unmatched path constraint will give an empty report
    eq_(ds.repo.diff(fr='HEAD', to=None, paths=['other']), {})
    # perfect match and anything underneath will do
    eq_(
        ds.repo.diff(fr='HEAD', to=None, paths=['deep']), {
            ut.Path(ds.repo.pathobj / 'deep' / 'down'): {
                'state': 'untracked',
                'type': 'file'
            },
            ut.Path(ds.repo.pathobj / 'deep' / 'down2'): {
                'state': 'untracked',
                'type': 'file'
            }
        })
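In contrast to the command-level ds.diff(), the repository-level diff used here returns a plain dict keyed by paths; a minimal sketch with a hypothetical path.

# Minimal sketch of the low-level repo diff (dict of path -> properties).
from datalad.api import Dataset

ds = Dataset('/tmp/repodiff_demo').create()
(ds.pathobj / 'new').write_text('empty')
ds.save(to_git=True)
for path, props in ds.repo.diff(fr='HEAD~1', to='HEAD').items():
    print(path, props['state'], props['type'])
# the reporting detail for untracked content is tunable
(ds.pathobj / 'scratch').write_text('untracked')
compact = ds.repo.diff(fr='HEAD', to=None, untracked='normal')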
Example n. 55
0
def test_update_strategy(path):
    base = Dataset(op.join(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(op.join(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = sub.create(op.join('subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # we start clean
    for ds in base, sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the base dataset only, nothing below changes
    base.meta_aggregate()
    eq_(len(_get_contained_objs(base)), 2)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the entire tree, but by default only updates
    # the top-level dataset with all objects, none of the leaf
    # or intermediate datasets gets touched
    base.meta_aggregate(recursive=True)
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_referenced_objs(base)), 6)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    res = base.meta_dump(reporton='aggregates', recursive=True)
    assert_result_count(res, 3)
    # it is impossible to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status('impossible',
                      ds.meta_dump(reporton='aggregates', on_failure='ignore'))
    # get the full metadata report
    target_meta = _kill_time(base.meta_dump())

    # now redo full aggregation, this time updating all
    # (intermediate) datasets
    base.meta_aggregate(recursive=True, into='all')
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_contained_objs(sub)), 4)
    eq_(len(_get_contained_objs(subsub)), 2)
    # it is now OK to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status('ok',
                      ds.meta_dump(reporton='aggregates', on_failure='ignore'))

    # all of that has no impact on the reported metadata
    # minus the change in the refcommits
    for i in zip(target_meta, _kill_time(base.meta_dump())):
        assert_dict_equal(i[0], i[1])
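The difference between the two aggregation targets tested above can be summarised in a short sketch (hypothetical paths).

# Sketch of aggregation strategies: top-level only vs into='all'.
from datalad.api import Dataset

base = Dataset('/tmp/agg_demo').create()
sub = base.create('sub')
base.save(recursive=True)
# default: only the top-level dataset receives the aggregated objects
base.meta_aggregate(recursive=True)
# into='all' also updates intermediate and leaf datasets, so they can
# be queried for aggregates on their own afterwards
base.meta_aggregate(recursive=True, into='all')
print(len(sub.meta_dump(reporton='aggregates')))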
Example n. 56
0
def test_setup():
    # just verify that we monkey patched consts correctly
    from datalad.consts import DATASETS_TOPURL
    eq_(DATASETS_TOPURL, 'http://datasets-tests.datalad.org/')
    from datalad.tests.utils import get_datasets_topdir
    eq_(get_datasets_topdir(), 'datasets-tests.datalad.org')
Example n. 57
0
def check_push(annex, src_path, dst_path):
    # prepare src
    src = Dataset(src_path).create(annex=annex)
    src_repo = src.repo
    # push should not add branches to the local dataset
    orig_branches = src_repo.get_branches()
    assert_not_in('synced/' + DEFAULT_BRANCH, orig_branches)

    res = src.push(on_failure='ignore')
    assert_result_count(res, 1)
    assert_in_results(
        res,
        status='impossible',
        message='No push target given, and none could be auto-detected, '
        'please specify via --to')
    eq_(orig_branches, src_repo.get_branches())
    # target sibling
    target = mk_push_target(src, 'target', dst_path, annex=annex)
    eq_(orig_branches, src_repo.get_branches())

    res = src.push(to="target")
    eq_(orig_branches, src_repo.get_branches())
    assert_result_count(res, 2 if annex else 1)
    assert_in_results(res,
                      action='publish',
                      status='ok',
                      target='target',
                      refspec=DEFAULT_REFSPEC,
                      operations=['new-branch'])

    assert_repo_status(src_repo, annex=annex)
    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))

    # configure a default merge/upstream target
    src.config.set('branch.{}.remote'.format(DEFAULT_BRANCH),
                   'target',
                   where='local')
    src.config.set('branch.{}.merge'.format(DEFAULT_BRANCH),
                   DEFAULT_BRANCH,
                   where='local')

    # don't fail when doing it again, no explicit target specification
    # needed anymore
    res = src.push()
    eq_(orig_branches, src_repo.get_branches())
    # and nothing is pushed
    assert_status('notneeded', res)

    assert_repo_status(src_repo, annex=annex)
    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))

    # some modifications:
    (src.pathobj / 'test_mod_file').write_text("Some additional stuff.")
    src.save(to_git=True, message="Modified.")
    (src.pathobj / 'test_mod_annex_file').write_text("Heavy stuff.")
    src.save(to_git=not annex, message="Modified again.")
    assert_repo_status(src_repo, annex=annex)

    # we could say since='HEAD~2' to make things fast, or we are lazy
    # and say since='^' to indicate the state of the tracking remote
    # which is the same, because we made two commits since the last push.
    res = src.push(to='target', since="^", jobs=2)
    assert_in_results(
        res,
        action='publish',
        status='ok',
        target='target',
        refspec=DEFAULT_REFSPEC,
        # we get to see what happened
        operations=['fast-forward'])
    if annex:
        # we got to see the copy result for the annexed files
        assert_in_results(res,
                          action='copy',
                          status='ok',
                          path=str(src.pathobj / 'test_mod_annex_file'))
        # we published, so we can drop and reobtain
        ok_(src_repo.file_has_content('test_mod_annex_file'))
        src_repo.drop('test_mod_annex_file')
        ok_(not src_repo.file_has_content('test_mod_annex_file'))
        src_repo.get('test_mod_annex_file')
        ok_(src_repo.file_has_content('test_mod_annex_file'))
        ok_file_has_content(src_repo.pathobj / 'test_mod_annex_file',
                            'Heavy stuff.')

    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))
    if not (annex and src_repo.is_managed_branch()):
        # the following doesn't make sense in managed branches, because
        # a commit that could be amended is no longer the last commit
        # of a branch after a sync has happened (which did happen
        # during the last push above)

        # amend and change commit msg in order to test for force push:
        src_repo.commit("amended", options=['--amend'])
        # push should be rejected (non-fast-forward):
        res = src.push(to='target', since='HEAD~2', on_failure='ignore')
        # fails before even touching the annex branch
        assert_in_results(res,
                          action='publish',
                          status='error',
                          target='target',
                          refspec=DEFAULT_REFSPEC,
                          operations=['rejected', 'error'])
        # push with force=True works:
        res = src.push(to='target', since='HEAD~2', force='gitpush')
        assert_in_results(res,
                          action='publish',
                          status='ok',
                          target='target',
                          refspec=DEFAULT_REFSPEC,
                          operations=['forced-update'])
        eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
            list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))

    # we do not have more branches than we had in the beginning
    # in particular no 'synced/<default branch>'
    eq_(orig_branches, src_repo.get_branches())
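Condensed, the push sequence exercised by check_push() looks like this; the sketch reuses this test module's mk_push_target() helper for the sibling setup, and all paths are hypothetical.

# Condensed sketch of publishing with push (hypothetical paths).
from datalad.api import Dataset

src = Dataset('/tmp/push_src').create(annex=True)
target = mk_push_target(src, 'target', '/tmp/push_dst', annex=True)
(src.pathobj / 'payload.txt').write_text('heavy stuff')
src.save()
src.push(to='target')                    # branch, git-annex branch, content
src.push(to='target', since='^')         # only what changed since last push
src.push(to='target', force='gitpush')   # force-push after a history rewrite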
Example n. 58
0
def test_push_recursive(origin_path, src_path, dst_top, dst_sub,
                        dst_subnoannex, dst_subsub):
    # dataset with two submodules and one subsubmodule
    origin = Dataset(origin_path).create()
    origin_subm1 = origin.create('sub m')
    origin_subm1.create('subsub m')
    origin.create('subm noannex', annex=False)
    origin.save()
    assert_repo_status(origin.path)
    # prepare src as a fresh clone with all subdatasets checked out recursively
    # running on a clone should make the test scenario more different than
    # test_push(), even for the pieces that should be identical
    top = Clone.__call__(source=origin.path, path=src_path)
    subs = top.get('.', recursive=True, get_data=False, result_xfm='datasets')
    # order for '.' should not be relied upon, so sort by path
    sub, subsub, subnoannex = sorted(subs, key=lambda ds: ds.path)

    target_top = mk_push_target(top, 'target', dst_top, annex=True)
    # subdatasets have no remote yet, so recursive publishing should fail:
    res = top.push(to="target", recursive=True, on_failure='ignore')
    assert_in_results(res,
                      path=top.path,
                      type='dataset',
                      refspec=DEFAULT_REFSPEC,
                      operations=['new-branch'],
                      action='publish',
                      status='ok',
                      target='target')
    for d in (sub, subsub, subnoannex):
        assert_in_results(res,
                          status='error',
                          type='dataset',
                          path=d.path,
                          message=("Unknown target sibling '%s'.", 'target'))
    # now fix that and set up targets for the submodules
    target_sub = mk_push_target(sub, 'target', dst_sub, annex=True)
    target_subnoannex = mk_push_target(subnoannex,
                                       'target',
                                       dst_subnoannex,
                                       annex=False)
    target_subsub = mk_push_target(subsub, 'target', dst_subsub, annex=True)

    # and same push call as above
    res = top.push(to="target", recursive=True)
    # topds skipped
    assert_in_results(res,
                      path=top.path,
                      type='dataset',
                      action='publish',
                      status='notneeded',
                      target='target')
    # the rest pushed
    for d in (sub, subsub, subnoannex):
        assert_in_results(res,
                          status='ok',
                          type='dataset',
                          path=d.path,
                          refspec=DEFAULT_REFSPEC)
    # all corresponding branches match across all datasets
    for s, d in zip(
        (top, sub, subnoannex, subsub),
        (target_top, target_sub, target_subnoannex, target_subsub)):
        eq_(list(s.repo.get_branch_commits_(DEFAULT_BRANCH)),
            list(d.get_branch_commits_(DEFAULT_BRANCH)))
        if s != subnoannex:
            eq_(list(s.repo.get_branch_commits_("git-annex")),
                list(d.get_branch_commits_("git-annex")))

    # rerun should not result in further pushes of the default branch
    res = top.push(to="target", recursive=True)
    assert_not_in_results(res, status='ok', refspec=DEFAULT_REFSPEC)
    assert_in_results(res, status='notneeded', refspec=DEFAULT_REFSPEC)

    # now annex a file in subsub
    test_copy_file = subsub.pathobj / 'test_mod_annex_file'
    test_copy_file.write_text("Heavy stuff.")
    # save all the way up
    assert_status(('ok', 'notneeded'),
                  top.save(message='subsub got something', recursive=True))
    assert_repo_status(top.path)
    # publish straight up, should be smart by default
    res = top.push(to="target", recursive=True)
    # we see 3 out of 4 datasets pushed (sub noannex was left unchanged)
    for d in (top, sub, subsub):
        assert_in_results(res,
                          status='ok',
                          type='dataset',
                          path=d.path,
                          refspec=DEFAULT_REFSPEC)
    # file content copied too
    assert_in_results(res,
                      action='copy',
                      status='ok',
                      path=str(test_copy_file))
    # verify it is accessible, drop and bring back
    assert_status('ok', top.drop(str(test_copy_file)))
    ok_(not subsub.repo.file_has_content('test_mod_annex_file'))
    top.get(test_copy_file)
    ok_file_has_content(test_copy_file, 'Heavy stuff.')

    # make two modifications
    (sub.pathobj / 'test_mod_annex_file').write_text('annex')
    (subnoannex.pathobj / 'test_mod_file').write_text('git')
    # save separately
    top.save(sub.pathobj, message='annexadd', recursive=True)
    top.save(subnoannex.pathobj, message='gitadd', recursive=True)
    # now only publish the latter one
    res = top.push(to="target", since=DEFAULT_BRANCH + '~1', recursive=True)
    # nothing copied, no reports on the other modification
    assert_not_in_results(res, action='copy')
    assert_not_in_results(res, path=sub.path)
    for d in (top, subnoannex):
        assert_in_results(res,
                          status='ok',
                          type='dataset',
                          path=d.path,
                          refspec=DEFAULT_REFSPEC)
    # an unconditional push should now pick up the remaining changes
    res = top.push(to="target", recursive=True)
    assert_in_results(res,
                      action='copy',
                      status='ok',
                      path=str(sub.pathobj / 'test_mod_annex_file'))
    assert_in_results(res,
                      status='ok',
                      type='dataset',
                      path=sub.path,
                      refspec=DEFAULT_REFSPEC)
    for d in (top, subnoannex, subsub):
        assert_in_results(res,
                          status='notneeded',
                          type='dataset',
                          path=d.path,
                          refspec=DEFAULT_REFSPEC)

    # if noannex target gets some annex, we still should not fail to push
    target_subnoannex.call_git(['annex', 'init'])
    # just to ensure that we do need something to push
    (subnoannex.pathobj / "newfile").write_text("content")
    subnoannex.save()
    res = subnoannex.push(to="target")
    assert_in_results(res, status='ok', type='dataset')
Example n. 59
0
def test_formatter_lower_case():
    fmt = au.Formatter({0: "key"})
    eq_(fmt.format("{key!l}", {"key": "UP"}), "up")
    eq_(fmt.format("{0!l}", {"key": "UP"}), "up")
    eq_(fmt.format("{other!s}", {}, other=[1, 2]), "[1, 2]")
Example n. 60
0
def f(arg, kwarg=None):
    eq_(arg, 1)
    eq_(kwarg, 2)
    eq_(getpwd(), d)