Example #1
def test_dont_trip_over_missing_subds(path):
    ds1 = Dataset(opj(path, 'ds1')).create()
    ds2 = Dataset(opj(path, 'ds2')).create()
    subds2 = ds1.install(
        source=ds2.path, path='subds2',
        result_xfm='datasets', return_type='item-or-list')
    assert_true(subds2.is_installed())
    assert_in('subds2', ds1.subdatasets(result_xfm='relpaths'))
    subds2.uninstall()
    assert_in('subds2', ds1.subdatasets(result_xfm='relpaths'))
    assert_false(subds2.is_installed())
    # see if it wants to talk to github (and fail), or if it trips over something
    # before
    assert_raises(gh.BadCredentialsException,
        ds1.create_sibling_github, 'bogus', recursive=True,
        github_login='******')
    # inject remote config prior to the run
    assert_not_in('github', ds1.repo.get_remotes())
    # fail on existing
    ds1.repo.add_remote('github', 'http://nothere')
    assert_raises(ValueError,
        ds1.create_sibling_github, 'bogus', recursive=True,
        github_login='******')
    # talk to github when existing is OK
    assert_raises(gh.BadCredentialsException,
        ds1.create_sibling_github, 'bogus', recursive=True,
        github_login='******', existing='reconfigure')
    # return happy emptiness when all is skipped
    assert_equal(
        ds1.create_sibling_github(
            'bogus', recursive=True,
            github_login='******', existing='skip'),
        [])
Example #2
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subds.remove(opj('.datalad', 'config'), if_dirty='ignore')
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 metadata sets: one implicit and one native per dataset,
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    # and we still know about subsub
    for name in ('grandchild_äöü東',):
        assert_true(sum([s.get('name', '') == assure_unicode(name) for s in meta]))

    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and let's now see that we do not fail when a dataset ID is duplicated
    # because we install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
Example #3
def test_ignore_nondatasets(path):
    # we want to ignore the version/commits for this test
    def _kill_time(meta):
        for m in meta:
            for k in ('version', 'dcterms:modified'):
                if k in m:
                    del m[k]
        return meta

    ds = Dataset(path).create()
    meta = _kill_time(get_metadata(ds))
    n_subm = 0
    # placing another repo in the dataset has no effect on metadata
    for cls, subpath in ((GitRepo, 'subm'), (AnnexRepo, 'annex_subm')):
        subm_path = opj(ds.path, subpath)
        r = cls(subm_path, create=True)
        with open(opj(subm_path, 'test'), 'w') as f:
            f.write('test')
        r.add('test')
        r.commit('some')
        assert_true(Dataset(subm_path).is_installed())
        assert_equal(meta, _kill_time(get_metadata(ds)))
        # making it a submodule has no effect either
        ds.add(subpath)
        assert_equal(len(ds.get_subdatasets()), n_subm + 1)
        assert_equal(meta, _kill_time(get_metadata(ds)))
        n_subm += 1
Example #4
def test_addurls_dry_run(path):
    ds = Dataset(path).create(force=True)

    with chpwd(path):
        json_file = "links.json"
        with open(json_file, "w") as jfh:
            json.dump([{"url": "URL/a.dat", "name": "a", "subdir": "foo"},
                       {"url": "URL/b.dat", "name": "b", "subdir": "bar"},
                       {"url": "URL/c.dat", "name": "c", "subdir": "foo"}],
                      jfh)

        ds.save(message="setup")

        with swallow_logs(new_level=logging.INFO) as cml:
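            # "//" in the filename format marks a subdataset boundary, hence
            # the "Would create a subdataset" messages checked below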
            ds.addurls(json_file,
                       "{url}",
                       "{subdir}//{_url_filename_root}",
                       dry_run=True)

            for dir_ in ["foo", "bar"]:
                assert_in("Would create a subdataset at {}".format(dir_),
                          cml.out)
            assert_in(
                "Would download URL/a.dat to {}".format(
                    os.path.join(path, "foo", "BASE")),
                cml.out)

            assert_in("Metadata: {}".format([u"name=a", u"subdir=foo"]),
                      cml.out)
Example #5
    def test_addurls_url_parts(self, path):
        ds = Dataset(path).create(force=True)
        with chpwd(path):
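            # "{_url0}" and "{_url_basename}" are filled in from the components
            # of each row's URL, so the files end up under "udir/"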
            ds.addurls(self.json_file, "{url}", "{_url0}/{_url_basename}")

            for fname in ["a.dat", "b.dat", "c.dat"]:
                ok_exists(op.join("udir", fname))
Example #6
def test_bf2458(src, dst):
    ds = Dataset(src).create(force=True)
    ds.save(to_git=False)

    # no clone (empty) into new dst
    clone = install(source=ds.path, path=dst)
    # content is not here
    eq_(clone.repo.whereis('dummy'), [ds.config.get('annex.uuid')])
    # check that plain metadata access does not `get` stuff
    clone.metadata('.', on_failure='ignore')
    eq_(clone.repo.whereis('dummy'), [ds.config.get('annex.uuid')])
Example #7
    def test_addurls_repindex(self, path):
        ds = Dataset(path).create(force=True)

        with chpwd(path):
            with assert_raises(IncompleteResultsError) as raised:
                ds.addurls(self.json_file, "{url}", "{subdir}")
            assert_in("There are file name collisions", str(raised.exception))

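            # "{_repindex}" adds a per-collision counter, so repeated "{subdir}"
            # values no longer produce file name collisions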
            ds.addurls(self.json_file, "{url}", "{subdir}-{_repindex}")

            for fname in ["foo-0", "bar-0", "foo-1"]:
                ok_exists(fname)
Example #8
    def test_addurls_metafail(self, path):
        ds = Dataset(path).create(force=True)

        # Force failure by passing a non-existent file name to annex.
        fn = ds.repo.set_metadata_

        def set_meta(_, **kwargs):
            for i in fn("wreaking-havoc-and-such", **kwargs):
                yield i

        with chpwd(path), patch.object(ds.repo, 'set_metadata_', set_meta):
            with assert_raises(IncompleteResultsError):
                ds.addurls(self.json_file, "{url}", "{name}")
Example #9
def test_add_file(client, annex_path):
    ds_id = 'ds000001'
    file_data = 'Test dataset README'
    response = client.simulate_post('/datasets/{}/files/README'.format(ds_id),
                                    body=file_data)
    assert response.status == falcon.HTTP_OK
    # Load the dataset to check for this file
    ds_obj = Dataset(os.path.join(annex_path, ds_id))
    test_files = ds_obj.get('README')
    assert test_files
    assert len(test_files) == 1
    with open(test_files.pop()['path']) as f:
        assert f.read() == file_data
Example #10
def test_specialremote(dspath=None, remotepath=None):
    ds = Dataset(dspath).create()
    ds.repo.call_annex([
        'initremote', 'myremote', 'type=directory', f'directory={remotepath}',
        'encryption=none'
    ])
    res = ds.siblings('query', result_renderer='disabled')
    assert_in_results(
        res, **{
            'name': 'myremote',
            'annex-type': 'directory',
            'annex-directory': remotepath
        })
Example #11
    def test_addurls_metafail(self, path):
        ds = Dataset(path).create(force=True)

        # Force failure by passing a non-existent file name to annex.
        fn = ds.repo.set_metadata_

        def set_meta(_, **kwargs):
            for i in fn("wreaking-havoc-and-such", **kwargs):
                yield i

        with patch.object(ds.repo, 'set_metadata_', set_meta):
            with assert_raises(IncompleteResultsError):
                ds.addurls(self.json_file, "{url}", "{name}")
Example #12
    def setup(self, tarfile_path):
        import tarfile
        tempdir = osp.dirname(tarfile_path)
        with tarfile.open(tarfile_path) as tar:
            tar.extractall(tempdir)

        # TODO -- remove this abomination after https://github.com/datalad/datalad/issues/1512 is fixed
        epath = opj(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        print("Finished setup for %s" % tempdir)
Example #13
def test_read_access(store_path, store_url, ds_path):

    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()

    if ds.repo.is_managed_branch():
        # TODO: on crippled FS copytree to populate store doesn't seem to work.
        #       Or maybe it's just the serving via HTTP that doesn't work.
        #       Either way, after copytree and fsck, whereis doesn't report
        #       the store as an available source.
        raise SkipTest("Skip on crippled FS")

    files = [Path('one.txt'), Path('subdir') / 'two']
    store_path = Path(store_path)
    url = "ria+" + store_url
    init_opts = common_init_opts + ['url={}'.format(url)]

    io = LocalIO()
    create_store(io, store_path, '1')
    create_ds_in_store(io, store_path, ds.id, '2', '1')
    ds.repo.init_remote('ora-remote', options=init_opts)
    ds.repo.fsck(remote='ora-remote', fast=True)
    store_uuid = ds.siblings(name='ora-remote',
                             return_type='item-or-list')['annex-uuid']
    here_uuid = ds.siblings(name='here',
                            return_type='item-or-list')['annex-uuid']

    # nothing in store yet:
    for f in files:
        known_sources = ds.repo.whereis(str(f))
        assert_in(here_uuid, known_sources)
        assert_not_in(store_uuid, known_sources)

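    # pretend the store already holds the content by copying the local annex
    # object tree into the store's location for this dataset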
    annex_obj_target = str(store_path / ds.id[:3] / ds.id[3:] / 'annex' /
                           'objects')
    shutil.rmtree(annex_obj_target)
    shutil.copytree(src=str(ds.repo.dot_git / 'annex' / 'objects'),
                    dst=annex_obj_target)

    ds.repo.fsck(remote='ora-remote', fast=True)
    # all in store now:
    for f in files:
        known_sources = ds.repo.whereis(str(f))
        assert_in(here_uuid, known_sources)
        assert_in(store_uuid, known_sources)

    ds.drop('.')
    res = ds.get('.')
    assert_equal(len(res), 2)
    assert_result_count(res,
                        2,
                        status='ok',
                        type='file',
                        action='get',
                        message="from ora-remote...")
Example #14
def test_create_osf_simple(path):

    ds = Dataset(path).create(force=True)
    ds.save()

    file1 = Path('ds') / "file1.txt"

    create_results = ds.create_sibling_osf(name="osf")

    assert_result_count(create_results, 2, status='ok')
    assert_result_count(create_results,
                        1,
                        status='ok',
                        type='dataset',
                        name="osf-storage",
                        path=ds.path)
    assert_result_count(create_results,
                        1,
                        status='ok',
                        type='sibling',
                        name="osf",
                        path=ds.path)

    # if we got here, we created something at OSF;
    # make sure, we clean up afterwards
    try:
        # special remote is configured:
        remote_log = ds.repo.call_git(
            ['cat-file', 'blob', 'git-annex:remote.log'])
        assert_in("node={}".format(create_results[0]['id']), remote_log)

        # copy files over
        ds.repo.copy_to('.', "osf-storage")
        whereis = ds.repo.whereis(str(file1))
        here = ds.config.get("annex.uuid")
        # files should be 'here' and on remote end:
        assert_equal(len(whereis), 2)
        assert_in(here, whereis)

        # drop content here
        ds.drop('.')
        whereis = ds.repo.whereis(str(file1))
        # now on remote end only
        assert_equal(len(whereis), 1)
        assert_not_in(here, whereis)

        # and get content again from remote:
        ds.get('.')
        whereis = ds.repo.whereis(str(file1))
        assert_equal(len(whereis), 2)
        assert_in(here, whereis)
    finally:
        # clean remote end:
        cred = get_credentials(allow_interactive=False)
        osf = OSF(**cred)
        delete_node(osf.session, create_results[0]['id'])
Example #15
def test_create_push_url(detection_path, ds_path, store_path):

    store_path = Path(store_path)
    ds_path = Path(ds_path)
    detection_path = Path(detection_path)

    ds = Dataset(ds_path).create(force=True)
    ds.save()

    # patch SSHConnection to signal it was used:
    from datalad.support.sshconnector import SSHManager

    def detector(f, d):
        @wraps(f)
        def _wrapper(*args, **kwargs):
            d.touch()
            return f(*args, **kwargs)

        return _wrapper

    url = "ria+{}".format(store_path.as_uri())
    push_url = "ria+ssh://datalad-test{}".format(store_path.as_posix())
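    # the ria+ssh push-url (unlike the plain file URL) should route pushes
    # through SSHManager, which the detector patched in below will record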
    assert not detection_path.exists()

    with patch('datalad.support.sshconnector.SSHManager.get_connection',
               new=detector(SSHManager.get_connection, detection_path)):

        ds.create_sibling_ria(url, "datastore", push_url=push_url)
        # used ssh_manager despite file-url hence used push-url (ria+ssh):
        assert detection_path.exists()

        # correct config in special remote:
        sr_cfg = ds.repo.get_special_remotes()[ds.siblings(
            name='datastore-storage')[0]['annex-uuid']]
        eq_(sr_cfg['url'], url)
        eq_(sr_cfg['push-url'], push_url)

        # git remote based on url (local path):
        eq_(ds.config.get("remote.datastore.url"),
            (store_path / ds.id[:3] / ds.id[3:]).as_posix())
        eq_(
            ds.config.get("remote.datastore.pushurl"),
            "ssh://datalad-test{}".format(
                (store_path / ds.id[:3] / ds.id[3:]).as_posix()))

        # git-push uses SSH:
        detection_path.unlink()
        ds.push('.', to="datastore", data='nothing')
        assert detection_path.exists()

        # data push
        # Note that the patching has no effect here, since the special remote
        # runs in a subprocess of git-annex, so we can't really detect SSH
        # usage. However, the ORA remote is tested elsewhere - if it succeeds,
        # all should be good wrt `create-sibling-ria`.
        ds.repo.call_annex(['copy', '.', '--to', 'datastore-storage'])
Example #16
def test_sibling_enable_sameas(repo, clone_path):
    ds = Dataset(repo.path)
    create_tree(ds.path, {"f0": "0"})
    ds.save(path="f0")
    ds.repo.copy_to(["f0"], remote="r_dir")
    ds.repo.drop(["f0"])

    ds_cloned = clone(ds.path, clone_path)

    assert_false(ds_cloned.repo.file_has_content("f0"))
    res = ds_cloned.siblings(action="enable", name="r_rsync")
    assert_status("ok", res)
    ds_cloned.get(path=["f0"])
    ok_(ds_cloned.repo.file_has_content("f0"))
Example #17
def test_sibling_path_is_posix(basedir=None, otherpath=None):
    ds_source = Dataset(opj(basedir, "source")).create()
    # add remote with system native path
    ds_source.siblings(action="add",
                       name="donotexist",
                       url=otherpath,
                       result_renderer='disabled')
    res = ds_source.siblings(action="query",
                             name="donotexist",
                             result_renderer='disabled',
                             return_type='item-or-list')
    # path URL should come out POSIX as if `git clone` had configured it for origin
    # https://github.com/datalad/datalad/issues/3972
    eq_(res['url'], Path(otherpath).as_posix())
Example #18
 def tag_releases(self, dandiset: RemoteDandiset, ds: Dataset,
                  push: bool) -> None:
     if not self.config.enable_tags:
         return
     log.info("Tagging releases for Dandiset %s", dandiset.identifier)
     versions = [
         v for v in dandiset.get_versions() if v.identifier != "draft"
     ]
     for v in versions:
         if readcmd("git", "tag", "-l", v.identifier, cwd=ds.path):
             log.debug("Version %s already tagged", v.identifier)
         else:
             log.info("Tagging version %s", v.identifier)
             self.mkrelease(dandiset.for_version(v), ds, push=push)
     if versions:
         latest = max(map(attrgetter("identifier"), versions), key=Version)
         description = readcmd("git",
                               "describe",
                               "--tags",
                               "--long",
                               "--always",
                               cwd=ds.path)
         if "-" not in description:
             # No tags on default branch
             merge = True
         else:
             m = re.fullmatch(
                 r"(?P<tag>.+)-(?P<distance>[0-9]+)-g(?P<rev>[0-9a-f]+)?",
                 description,
             )
             assert m, f"Could not parse `git describe` output: {description!r}"
             merge = Version(latest) > Version(m["tag"])
         if merge:
             log.debug("Running: git merge -s ours %s", shlex.quote(latest))
             subprocess.run(
                 [
                     "git",
                     "merge",
                     "-s",
                     "ours",
                     "-m",
                     f"Merge '{latest}' into drafts branch (no differences"
                     " in content merged)",
                     latest,
                 ],
                 cwd=ds.path,
                 check=True,
             )
         if push:
             ds.push(to="github", jobs=self.config.jobs)
Example #19
def check_api(no_annex, path):
    ds = Dataset(path).create(force=True, no_annex=no_annex)
    ds.add('.')
    ok_clean_git(ds.path)

    processed_extractors, skipped_extractors = [], []
    for extractor_ep in iter_entry_points('datalad.metadata.extractors'):
        # we need to be able to query for metadata, even if there is none
        # from any extractor
        try:
            extractor_cls = extractor_ep.load()
        except Exception as exc:
            exc_ = str(exc)
            skipped_extractors += [exc_]
            continue
        extractor = extractor_cls(
            ds, paths=['file.dat'])
        meta = extractor.get_metadata(
            dataset=True,
            content=True)
        # we also get something for the dataset and something for the content
        # even if either of the two is empty
        assert_equal(len(meta), 2)
        dsmeta, contentmeta = meta
        assert (isinstance(dsmeta, dict))
        assert hasattr(contentmeta, '__len__') or isgenerator(contentmeta)
        # verify that generator does not blow and has an entry for our
        # precious file
        cm = dict(contentmeta)
        # datalad_core does provide some (not really) information about our
        # precious file
        if extractor_ep.name == 'datalad_core':
            assert 'file.dat' in cm
        elif extractor_ep.name == 'annex':
            if not no_annex:
                # verify correct key, which is the same for all files of 0 size
                assert_equal(
                    cm['file.dat']['key'],
                    'MD5E-s0--d41d8cd98f00b204e9800998ecf8427e.dat'
                )
            else:
                # no metadata on that file
                assert not cm
        processed_extractors.append(extractor_ep.name)
    assert "datalad_core" in processed_extractors, \
        "Should have managed to find at least the core extractor extractor"
    if skipped_extractors:
        raise SkipTest(
            "Not fully tested/succeded since some extractors failed"
            " to load:\n%s" % ("\n".join(skipped_extractors)))
Example #20
    def test_drop_after(self=None, path=None):
        ds = Dataset(path).create(force=True)
        ds.repo.set_gitattributes([('a*', {'annex.largefiles': 'nothing'})])
        # make some files go to git, so we could test that we do not blow
        # while trying to drop what is in git not annex
        res = ds.addurls(self.json_file,
                         '{url}',
                         '{name}',
                         drop_after=True,
                         result_renderer='disabled')

        assert_result_count(res, 3, action='addurl',
                            status='ok')  # a, b, c  even if a goes to git
        assert_result_count(res, 2, action='drop', status='ok')  # b, c
Example #21
def test_add_readme(path):
    ds = Dataset(path).create(force=True)
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    assert_status('ok', ds.plugin('add_readme'))
    # should use default name
    eq_(
        open(opj(path, 'README.md')).read(),
        """\
# Dataset "demo_ds"

this is for play

### Authors

- Betty
- Tom

### License

PDDL

## General information

This is a DataLad dataset (id: {id}).

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(
    id=ds.id))

    # should skip on re-run
    assert_status('notneeded', ds.plugin('add_readme'))
Example #22
    def test_addurls_url_on_collision_choose(self=None, path=None):
        ds = Dataset(path).create(force=True)
        data = deepcopy(self.data)
        for row in data:
            row["name"] = "a"

        with patch("sys.stdin", new=StringIO(json.dumps(data))):
            assert_in_results(ds.addurls("-",
                                         "{url}",
                                         "{name}",
                                         on_failure="ignore"),
                              action="addurls",
                              status="error")
        with patch("sys.stdin", new=StringIO(json.dumps(data))):
            assert_in_results(ds.addurls("-",
                                         "{url}",
                                         "{name}",
                                         on_collision="error-if-different",
                                         on_failure="ignore"),
                              action="addurls",
                              status="error")

        with patch("sys.stdin", new=StringIO(json.dumps(data))):
            ds.addurls("-", "{url}", "{name}-first", on_collision="take-first")
        ok_file_has_content(op.join(ds.path, "a-first"),
                            "a content",
                            strip=True)

        with patch("sys.stdin", new=StringIO(json.dumps(data))):
            ds.addurls("-", "{url}", "{name}-last", on_collision="take-last")
        ok_file_has_content(op.join(ds.path, "a-last"),
                            "c content",
                            strip=True)
Example #23
    def test_addurls_subdataset(self, path):
        ds = Dataset(path).create(force=True)

        with chpwd(path):
            for save in True, False:
                label = "save" if save else "nosave"
                hexsha_before = ds.repo.get_hexsha()
                ds.addurls(self.json_file,
                           "{url}",
                           "{subdir}-" + label + "//{name}",
                           save=save)
                hexsha_after = ds.repo.get_hexsha()

                for fname in ["foo-{}/a", "bar-{}/b", "foo-{}/c"]:
                    ok_exists(fname.format(label))

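                # with save=True a commit was made (hexsha changed, repo clean);
                # with save=False nothing was committed and the repo stays dirty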
                assert_true(save ^ (hexsha_before == hexsha_after))
                assert_true(save ^ ds.repo.dirty)

            # Now save the "--nosave" changes and check that we have
            # all the subdatasets.
            ds.add(".")
            eq_(set(subdatasets(ds, recursive=True, result_xfm="relpaths")),
                {"foo-save", "bar-save", "foo-nosave", "bar-nosave"})

            # We don't try to recreate existing subdatasets.
            with swallow_logs(new_level=logging.DEBUG) as cml:
                ds.addurls(self.json_file, "{url}", "{subdir}-nosave//{name}")
                assert_in("Not creating subdataset at existing path", cml.out)
Example #24
def check_api(no_annex, path):
    ds = Dataset(path).create(force=True, no_annex=no_annex)
    ds.save()
    assert_repo_status(ds.path)

    processed_extractors, skipped_extractors = [], []
    for extractor_ep in iter_entry_points('datalad.metadata.extractors'):
        # we need to be able to query for metadata, even if there is none
        # from any extractor
        try:
            extractor_cls = extractor_ep.load()
        except Exception as exc:
            exc_ = str(exc)
            skipped_extractors += [exc_]
            continue
        extractor = extractor_cls(
            ds, paths=['file.dat'])
        meta = extractor.get_metadata(
            dataset=True,
            content=True)
        # we also get something for the dataset and something for the content
        # even if either of the two is empty
        assert_equal(len(meta), 2)
        dsmeta, contentmeta = meta
        assert (isinstance(dsmeta, dict))
        assert hasattr(contentmeta, '__len__') or isgenerator(contentmeta)
        # verify that generator does not blow and has an entry for our
        # precious file
        cm = dict(contentmeta)
        # datalad_core does provide some (not really) information about our
        # precious file
        if extractor_ep.name == 'datalad_core':
            assert 'file.dat' in cm
        elif extractor_ep.name == 'annex':
            if not no_annex:
                # verify correct key, which is the same for all files of 0 size
                assert_equal(
                    cm['file.dat']['key'],
                    'MD5E-s0--d41d8cd98f00b204e9800998ecf8427e.dat'
                )
            else:
                # no metadata on that file
                assert not cm
        processed_extractors.append(extractor_ep.name)
    assert "datalad_core" in processed_extractors, \
        "Should have managed to find at least the core extractor extractor"
    if skipped_extractors:
        raise SkipTest(
            "Not fully tested/succeded since some extractors failed"
            " to load:\n%s" % ("\n".join(skipped_extractors)))
Example #25
def test_reproin_largely_smoke(tmpdir, heuristic, invocation):
    is_bids = heuristic == 'reproin'
    arg = "--random-seed 1 -f %s -c dcm2niix -o %s" \
          % (heuristic, tmpdir)
    if is_bids:
        arg += " -b"
    arg += " --datalad "
    args = (
        arg + invocation
    ).split(' ')

    # Test some safeguards
    if invocation == "--files %s" % TESTS_DATA_PATH:
        # Multiple subjects must not be specified -- only a single one could
        # be overridden from the command line
        with pytest.raises(ValueError):
            runner(args + ['--subjects', 'sub1', 'sub2'])

        if heuristic != 'reproin':
            # if subject is not overridden, raise error
            with pytest.raises(NotImplementedError):
                runner(args)
            return

    runner(args)
    ds = Dataset(str(tmpdir))
    assert ds.is_installed()
    assert not ds.repo.dirty
    head = ds.repo.get_hexsha()

    # and if we rerun -- should fail
    lgr.info(
        "RERUNNING, expecting to FAIL since the same everything "
        "and -c specified so we did conversion already"
    )
    with pytest.raises(RuntimeError):
        runner(args)

    # but there should be nothing new
    assert not ds.repo.dirty
    assert head == ds.repo.get_hexsha()

    # unless we pass 'overwrite' flag
    runner(args + ['--overwrite'])
    # but result should be exactly the same, so it still should be clean
    # and at the same commit
    assert ds.is_installed()
    assert not ds.repo.dirty
    assert head == ds.repo.get_hexsha()
Example #26
def test_zip_archive(path):
    ds = Dataset(opj(path, 'ds')).create(force=True, no_annex=True)
    ds.add('.')
    with chpwd(path):
        ds.export_archive(filename='my', archivetype='zip')
        assert_true(os.path.exists('my.zip'))
        custom1_md5 = md5sum('my.zip')
        time.sleep(1.1)
        ds.export_archive(filename='my', archivetype='zip')
        assert_equal(md5sum('my.zip'), custom1_md5)

    # should be able to export without us cd'ing to that ds directory
    ds.export_archive(filename=ds.path, archivetype='zip')
    default_name = 'datalad_{}.zip'.format(ds.id)
    assert_true(os.path.exists(os.path.join(ds.path, default_name)))
Example #27
def test_docker(path):  # Singularity's "docker://" scheme.
    ds = Dataset(path).create()
    ds.containers_add(
        "bb",
        url=("docker://busybox@sha256:"
             "7964ad52e396a6e045c39b5a44438424ac52e12e4d5a25d94895f2058cb863a0"
             ))

    img = op.join(ds.path, ".datalad", "environments", "bb", "image")
    assert_result_count(ds.containers_list(), 1, path=img, name="bb")
    ok_clean_git(path)

    WitlessRunner(cwd=ds.path).run(
        ["datalad", "containers-run", "ls", "/singularity"],
        protocol=StdOutCapture)
Example #28
 def test_addurls_deeper(self, path):
     ds = Dataset(path).create(force=True)
     ds.addurls(self.json_file, "{url}",
                "{subdir}//adir/{subdir}-again//other-ds//bdir/{name}")
     eq_(
         set(ds.subdatasets(recursive=True, result_xfm="relpaths")), {
             "foo", "bar",
             op.join("foo", "adir", "foo-again"),
             op.join("bar", "adir", "bar-again"),
             op.join("foo", "adir", "foo-again", "other-ds"),
             op.join("bar", "adir", "bar-again", "other-ds")
         })
     ok_exists(
         os.path.join(ds.path, "foo", "adir", "foo-again", "other-ds",
                      "bdir", "a"))
Example #29
 def test_addurls_from_key_invalid_format(self, path):
     ds = Dataset(path).create(force=True)
     for fmt in [
             "{name}-which-has-no-double-dash",
             # Invalid hash length.
             "MD5-s{size}--{md5sum}a",
             # Invalid hash content.
             "MD5-s{size}--" + 32 * "q"
     ]:
         with assert_raises(IncompleteResultsError):
             ds.addurls(self.json_file,
                        "{url}",
                        "{name}",
                        key=fmt,
                        exclude_autometa="*")
Example #30
def annex_path(tmpdir_factory):
    path = tmpdir_factory.mktemp('annexes')
    ds_path = str(path.join(DATASET_ID))
    # Create an empty dataset for testing
    ds = Dataset(ds_path)
    ds.create()
    ds.no_annex(BIDS_NO_ANNEX)
    json_path = os.path.join(ds_path, 'dataset_description.json')
    with open(json_path, 'w') as f:
        json.dump(DATASET_DESCRIPTION, f, ensure_ascii=False)
    ds.add(json_path)
    ds.save(version_tag=SNAPSHOT_ID)
    # Setup a seed for any new_dataset uses
    random.seed(42)
    return path
Example #31
 def setUp(self):
     pt = test_directory.parent / "BEP032-examples"
     if pt.exists():
         self.dataset = Dataset(str(pt))
         self.dataset.clean()
         self.dataset.update(merge=True)
         self.dataset.get()
     else:
         self.dataset = install(
             path=str(pt),
             source="https://gin.g-node.org/NeuralEnsemble/BEP032-examples",
             get_data=True,
         )
     self.datadir = str(pt)
     self.savedir = tempfile.mkdtemp()
Example #32
def test_noop(path, outdir):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.save()
    assert_raises(
        TypeError,
        ds.bids2scidata,
    )
    with chpwd(outdir):  # to not pollute cwd
        assert_raises(
            IncompleteResultsError,
            ds.bids2scidata,
            repo_name="dummy",
            repo_accession='ds1',
            repo_url='http://example.com',
        )
Example #33
def test_reproin_largely_smoke(tmpdir, heuristic, invocation):
    is_bids = heuristic == 'reproin'
    arg = "--random-seed 1 -f %s -c dcm2niix -o %s" \
          % (heuristic, tmpdir)
    if is_bids:
        arg += " -b"
    arg += " --datalad "
    args = (
        arg + invocation
    ).split(' ')

    # Test some safeguards
    if invocation == "--files %s" % TESTS_DATA_PATH:
        # Multiple subjects must not be specified -- only a single one could
        # be overridden from the command line
        with pytest.raises(ValueError):
            runner(args + ['--subjects', 'sub1', 'sub2'])

        if heuristic != 'reproin':
            # no other heuristic has mighty infotoids atm
            with pytest.raises(NotImplementedError):
                runner(args)
            return
    runner(args)
    ds = Dataset(str(tmpdir))
    assert ds.is_installed()
    assert not ds.repo.dirty
    head = ds.repo.get_hexsha()

    # and if we rerun -- should fail
    lgr.info(
        "RERUNNING, expecting to FAIL since the same everything "
        "and -c specified so we did conversion already"
    )
    with pytest.raises(RuntimeError):
        runner(args)

    # but there should be nothing new
    assert not ds.repo.dirty
    assert head == ds.repo.get_hexsha()

    # unless we pass 'overwrite' flag
    runner(args + ['--overwrite'])
    # but result should be exactly the same, so it still should be clean
    # and at the same commit
    assert ds.is_installed()
    assert not ds.repo.dirty
    assert head == ds.repo.get_hexsha()
Example #34
def test_ensure_datalad_remote_init_and_enable_needed(path=None):
    from datalad.consts import DATALAD_SPECIAL_REMOTE
    ds = Dataset(path).create(force=True)
    repo = ds.repo
    assert_false(repo.get_remotes())
    ensure_datalad_remote(repo)
    assert_in(DATALAD_SPECIAL_REMOTE, repo.get_remotes())
Example #35
def test_get_metadata(path):

    ds = Dataset(path).create(force=True)
    p = MetadataExtractor(ds, [])
    meta = p._get_dataset_metadata()
    assert_equal(
        dumps(meta, sort_keys=True, indent=2), """\
{
  "author": "Jane Doe <*****@*****.**>",
  "conformsto": "http://specs.frictionlessdata.io/data-packages",
  "contributors": [
    "Joe Bloggs <*****@*****.**> (http://www.example.com)"
  ],
  "description": "Annual Consumer Price Index (CPI) for most countries in the world. Reference year is 2005.",
  "license": "http://opendatacommons.org/licenses/pddl/",
  "name": "cpi",
  "shortdescription": "Annual Consumer Price Index (CPI)",
  "tag": [
    "CPI",
    "World",
    "Consumer Price Index",
    "Annual Data",
    "The World Bank"
  ],
  "version": "2.0.0"
}""")
Example #36
def test_profile_get_repo_files(annex_path, new_dataset):
    ds_id = os.path.basename(new_dataset.path)
    ds = Dataset(str(annex_path.join(ds_id)))
    for each in range(5000):
        filename = 'file-{}'.format(each)
        path = os.path.join(new_dataset.path, filename)
        with open(path, 'a'):
            os.utime(path)
    # Add all generated files
    ds.add('.')
    # Profile get_repo_files by itself
    with open('{}.prof'.format(__name__), 'w+b') as fd:
        vmprof.enable(fd.fileno())
        for n in range(1):
            get_repo_files(ds)
        vmprof.disable()
Example #37
def test_dicom2spec(path):

    # ###  SETUP ###
    dicoms = get_dicom_dataset('structural')

    ds = Dataset.create(path, cfg_proc=['hirni'])
    ds.install(source=dicoms, path='acq100')
    # Note: Recursive, since aggregation wasn't performed in the installed datasets
    # TODO: Use get_raw_sd from above instead of this setup
    ds.meta_aggregate('acq100', into='top', recursive=True)
    # ### END SETUP ###

    # TODO: should it be specfile or acq/specfile? => At least doc needed,
    # if not change
    res = ds.hirni_dicom2spec(path='acq100', spec='spec_structural.json')

    # check for actual location of spec_structural!
    # => studyds root!

    assert_result_count(res, 2)
    assert_result_count(res, 1, path=op.join(ds.path, 'spec_structural.json'))
    assert_result_count(res, 1, path=op.join(ds.path, '.gitattributes'))
    ok_clean_git(ds.path)

    # multiple executions shouldn't change .gitattributes again:
    from os import stat
    mtime = stat(op.join(ds.path, '.gitattributes')).st_mtime
    res = ds.hirni_dicom2spec(path='acq100', spec='spec_structural.json')
    assert_equal(stat(op.join(ds.path, '.gitattributes')).st_mtime, mtime)
Example #38
def test_get_metadata(path):

    ds = Dataset(path).create(force=True)
    meta = MetadataParser(ds, []).get_metadata(True, False)[0]
    del meta['@context']
    dump = dumps(meta, sort_keys=True, indent=2, ensure_ascii=False)
    assert_equal(
        dump, """\
{
  "author": [
    "Mike One",
    "Anna Two"
  ],
  "citation": [
    "http://studyforrest.org"
  ],
  "comment<BIDSVersion>": "1.0.0-rc3",
  "conformsto": "http://bids.neuroimaging.io/bids_spec1.0.0-rc3.pdf",
  "description": "Some description",
  "fundedby": "We got money from collecting plastic bottles",
  "license": "PDDL",
  "name": "studyforrest_phase2"
}""")

    test_fname = opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')
    cmeta = list(
        MetadataParser(ds,
                       [opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')
                        ]).get_metadata(False, True)[1])
    assert_equal(len(cmeta), 1)
    assert_equal(cmeta[0][0], test_fname)
    assert_in('comment<participant#handedness>', cmeta[0][1])
Example #39
    def get_raw_dataset(self):
        # Note: This is lazy to avoid building at import time, since import is part of nose's discovery and executed
        # before the dependencies. This leads to datalad's ui backend not yet being correctly set, which in turn
        # lets the cloning hang within progressbar generation.
        if not self._dspath:
            import tempfile
            kwargs = get_tempfile_kwargs()
            path = tempfile.mkdtemp(**kwargs)
            f_dicoms = get_dicom_dataset('functional')
            s_dicoms = get_dicom_dataset('structural')
            ds = Dataset.create(path, cfg_proc=['hirni'])
            ds.install(source=f_dicoms, path=op.join('func_acq', 'dicoms'))
            ds.install(source=s_dicoms, path=op.join('struct_acq', 'dicoms'))

            # Note: Recursive, since aggregation wasn't performed in the installed datasets
            ds.meta_aggregate([
                op.join('func_acq', 'dicoms'),
                op.join('struct_acq', 'dicoms')
            ],
                              into='top',
                              recursive=True)

            # TODO: Figure how to add it to things to be removed after tests ran
            self._dspath = ds.path
        return self._dspath
Example #40
def test_initremote(store_path, store_url, ds_path):
    ds = Dataset(ds_path).create()
    store_path = Path(store_path)
    url = "ria+" + store_url
    init_opts = common_init_opts + ['url={}'.format(url)]

    # fails on non-RIA URL
    assert_raises(CommandError,
                  ds.repo.init_remote,
                  'ora-remote',
                  options=common_init_opts +
                  ['url={}'
                   ''.format(store_path.as_uri())])
    # Doesn't actually create a remote if it fails
    assert_not_in(
        'ora-remote',
        [cfg['name'] for uuid, cfg in ds.repo.get_special_remotes().items()])

    ds.repo.init_remote('ora-remote', options=init_opts)
    assert_in(
        'ora-remote',
        [cfg['name'] for uuid, cfg in ds.repo.get_special_remotes().items()])
    assert_repo_status(ds.path)
    # git-annex:remote.log should have:
    #   - url
    #   - common_init_opts
    #   - archive_id (which equals ds id)
    remote_log = ds.repo.call_git(['cat-file', 'blob', 'git-annex:remote.log'])
    assert_in("url={}".format(url), remote_log)
    for c in common_init_opts:
        assert_in(c, remote_log)
    assert_in("archive-id={}".format(ds.id), remote_log)
Example #41
    def test_addurls_subdataset(self, path):
        ds = Dataset(path).create(force=True)

        with chpwd(path):
            for save in True, False:
                label = "save" if save else "nosave"
                hexsha_before = ds.repo.get_hexsha()
                ds.addurls(self.json_file, "{url}",
                           "{subdir}-" + label + "//{name}",
                           save=save)
                hexsha_after = ds.repo.get_hexsha()

                for fname in ["foo-{}/a", "bar-{}/b", "foo-{}/c"]:
                    ok_exists(fname.format(label))

                assert_true(save ^ (hexsha_before == hexsha_after))
                assert_true(save ^ ds.repo.dirty)

            # Now save the "--nosave" changes and check that we have
            # all the subdatasets.
            ds.add(".")
            eq_(set(subdatasets(ds, recursive=True,
                                result_xfm="relpaths")),
                {"foo-save", "bar-save", "foo-nosave", "bar-nosave"})

            # We don't try to recreate existing subdatasets.
            with swallow_logs(new_level=logging.DEBUG) as cml:
                ds.addurls(self.json_file, "{url}", "{subdir}-nosave//{name}")
                assert_in("Not creating subdataset at existing path", cml.out)
Example #42
def test_exif(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['exif']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)
Example #43
def test_zip_archive(path):
    ds = Dataset(opj(path, 'ds')).create(force=True, no_annex=True)
    ds.save()
    with chpwd(path):
        ds.export_archive(filename='my', archivetype='zip')
        assert_true(os.path.exists('my.zip'))
        custom1_md5 = md5sum('my.zip')
        time.sleep(1.1)
        ds.export_archive(filename='my', archivetype='zip')
        assert_equal(md5sum('my.zip'), custom1_md5)

    # should be able to export without us cd'ing to that ds directory
    ds.export_archive(filename=ds.path, archivetype='zip')
    default_name = 'datalad_{}.zip'.format(ds.id)
    assert_true(os.path.exists(os.path.join(ds.path, default_name)))
Example #44
def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)

    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets', return_type='item-or-list')['metadata']['datalad_unique_content_properties']
    # test file has it, but uniques have it blanked out, because the extractor considers it worthless
    # for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # 'date' field carries no value, hence gets excluded from the unique report
    assert_in('date', meta)
    assert(not meta['date'])
    assert_not_in('date', uniques['audio'])
Example #45
def test_basic_metadata(path):
    ds = Dataset(opj(path, 'origin'))
    meta = get_metadata(ds)
    assert_equal(sorted(meta[0].keys()),
                 ['@context', 'dcterms:conformsTo'])
    ds.create(force=True, save=False)
    # with subdataset
    sub = ds.create('sub', force=True)
    ds.save()
    meta = get_metadata(ds)
    assert_equal(
        sorted(meta[0].keys()),
        ['@context', '@id', 'availableFrom', 'dcterms:conformsTo',
         'dcterms:modified', 'type', 'version'])
    assert_equal(meta[0]['type'], 'Dataset')
    # clone and get relationship info in metadata
    sibling = install(opj(path, 'sibling'), source=opj(path, 'origin'))
    sibling_meta = get_metadata(sibling)
    assert_equal(sibling_meta[0]['@id'], ds.id)
    # origin should learn about the clone
    sibling.repo.push(remote='origin', refspec='git-annex')
    meta = get_metadata(ds)
    assert_equal([m['@id'] for m in meta[0]['availableFrom']],
                 [m['@id'] for m in sibling_meta[0]['availableFrom']])
    meta = get_metadata(ds, guess_type=True)
    # without aggregation there is no trace of subdatasets in the metadata
    assert_not_in('dcterms:hasPart', meta[0])
Example #46
    def setup(self, tarfile_path):
        import tarfile
        tempdir = osp.dirname(tarfile_path)
        with tarfile.open(tarfile_path) as tar:
            tar.extractall(tempdir)

        # TODO -- remove this abomination after https://github.com/datalad/datalad/issues/1512 is fixed
        epath = opj(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        print("Finished setup for %s" % tempdir)
Example #47
    def test_addurls_version(self, path):
        ds = Dataset(path).create(force=True)

        def version_fn(url):
            if url.endswith("b.dat"):
                raise ValueError("Scheme error")
            return url + ".v1"

        with patch("datalad.plugin.addurls.get_versioned_url", version_fn):
            with swallow_logs(new_level=logging.WARNING) as cml:
                ds.addurls(self.json_file, "{url}", "{name}",
                           version_urls=True)
                assert_in("b.dat", str(cml.out))

        names = ["a", "c"]
        for fname in names:
            ok_exists(os.path.join(path, fname))

        whereis = ds.repo.whereis(names, output="full")
        for fname, info in whereis.items():
            eq_(info[ds.repo.WEB_UUID]['urls'],
                ["{}udir/{}.dat.v1".format(self.url, fname)])
Example #48
def check_integration1(login, keyring,
                       path,
                       organization=None,
                       kwargs={},
                       oauthtokens=None):
    kwargs = kwargs.copy()
    if organization:
        kwargs['github_organization'] = organization

    ds = Dataset(path).create()
    if oauthtokens:
        for oauthtoken in assure_list(oauthtokens):
            ds.config.add('hub.oauthtoken', oauthtoken, where='local')

    # so we do not pick up local repo configuration/token
    repo_name = 'test_integration1'
    with chpwd(path):
        # ATM all the github goodness does not care about "this dataset"
        # so force "process wide" cfg to pick up the oauthtoken defined above
        cfg.reload(force=True)
        # everything works just nice, no conflicts etc
        res = ds.create_sibling_github(repo_name, **kwargs)

        if organization:
            url_fmt = 'https://{login}@github.com/{organization}/{repo_name}.git'
        else:
            url_fmt = 'https://github.com/{login}/{repo_name}.git'
        eq_(res, [(ds, url_fmt.format(**locals()), False)])

        # but if we rerun - it should kaboom since it already has this sibling:
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, **kwargs)
        assert_in("already has a configured sibling", str(cme.exception))

        # we can give it a new name, but it should still kaboom since the remote
        # one exists already
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, name="github2", **kwargs)
        assert_in("already exists on", str(cme.exception))
        # we should not leave the broken sibling behind
        assert_not_in('github2', ds.repo.get_remotes())

        # If we ask to reconfigure - should proceed normally
        ds.create_sibling_github(repo_name, existing='reconfigure', **kwargs)
    cfg.reload(force=True)
Example #49
def test_tarball(path):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.save(all_changes=True)
    committed_date = ds.repo.get_committed_date()
    with chpwd(path):
        _mod, tarball1 = ds.export('tarball')
        assert(not isabs(tarball1))
        tarball1 = opj(path, tarball1)
    default_outname = opj(path, 'datalad_{}.tar.gz'.format(ds.id))
    assert_equal(tarball1, default_outname)
    assert_true(os.path.exists(default_outname))
    custom_outname = opj(path, 'myexport.tar.gz')
    # feed in without extension
    ds.export('tarball', output=custom_outname[:-7])
    assert_true(os.path.exists(custom_outname))
    custom1_md5 = md5sum(custom_outname)
    # encodes the original tarball filename -> different checksum, despite
    # same content
    assert_not_equal(md5sum(default_outname), custom1_md5)
    # should really sleep, so that if they stop using time.time we would know
    time.sleep(1.1)
    ds.export('tarball', output=custom_outname)
    # should not encode mtime, so should be identical
    assert_equal(md5sum(custom_outname), custom1_md5)

    def check_contents(outname, prefix):
        with tarfile.open(outname) as tf:
            nfiles = 0
            for ti in tf:
                # any annex links resolved
                assert_false(ti.issym())
                ok_startswith(ti.name, prefix + '/')
                assert_equal(ti.mtime, committed_date)
                if '.datalad' not in ti.name:
                    # ignore any files in .datalad for this test to not be
                    # susceptible to changes in how much meta info we generate
                    nfiles += 1
            # we have exactly three files, and expect no content for any directory
            assert_equal(nfiles, 3)
    check_contents(default_outname, 'datalad_%s' % ds.id)
    check_contents(custom_outname, 'myexport')
Example #50
    def test_addurls_subdataset(self, path):
        ds = Dataset(path).create(force=True)

        with chpwd(path):
            for save in True, False:
                label = "save" if save else "nosave"
                ds.addurls(self.json_file, "{url}",
                           "{subdir}-" + label + "//{name}",
                           save=save)

                subdirs = ["{}-{}".format(d, label) for d in ["foo", "bar"]]
                subdir_files = dict(zip(subdirs, [["a", "c"], ["b"]]))

                for subds, fnames in subdir_files.items():
                    for fname in fnames:
                        ok_exists(op.join(subds, fname))

                if save:
                    assert_repo_status(path)
                else:
                    # The datasets are created and saved ...
                    assert_repo_status(path, modified=subdirs)
                    # but the downloaded files aren't.
                    for subds, fnames in subdir_files.items():
                        assert_repo_status(subds, added=fnames)

            # Now save the "--nosave" changes and check that we have
            # all the subdatasets.
            ds.save()
            eq_(set(subdatasets(dataset=ds, recursive=True,
                                result_xfm="relpaths")),
                {"foo-save", "bar-save", "foo-nosave", "bar-nosave"})

            # We don't try to recreate existing subdatasets.
            with swallow_logs(new_level=logging.DEBUG) as cml:
                ds.addurls(self.json_file, "{url}", "{subdir}-nosave//{name}")
                assert_in("Not creating subdataset at existing path", cml.out)
Example #51
 def test_addurls_url_filename(self, path):
     ds = Dataset(path).create(force=True)
     with chpwd(path):
         ds.addurls(self.json_file, "{url}", "{_url0}/{_url_filename}")
         for fname in ["udir/a.dat", "udir/b.dat", "udir/c.dat"]:
             ok_exists(fname)
Example #52
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
    assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                    == assure_unicode(name) for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
Example #53
def test_addurls_nonannex_repo(path):
    ds = Dataset(path).create(force=True, no_annex=True)
    with assert_raises(IncompleteResultsError) as raised:
        ds.addurls("dummy_arg0", "dummy_arg1", "dummy_arg2")
    assert_in("not an annex repo", str(raised.exception))
Example #54
def test_get_aggregates_fails(path):
    with chpwd(path), assert_raises(NoDatasetArgumentFound):
        metadata(get_aggregates=True)
    ds = Dataset(path).create()
    res = ds.metadata(get_aggregates=True, on_failure='ignore')
    assert_result_count(res, 1, path=ds.path, status='impossible')
Beispiel #55
0
class supers(SuprocBenchmarks):
    """
    Benchmarks on common operations on collections of datasets using datalad API
    """

    timeout = 3600
    # need to ensure that we are working in a different repository now,
    # see https://github.com/datalad/datalad/issues/1512
    # this might not be sufficient due to side effects between tests that
    # could get us into the same situation
    ds_count = 0
    def setup_cache(self):
        # creating in CWD so things get removed when ASV is done
        ds_path = create_test_dataset("testds1", spec='2/-2/-2', seed=0)[0]
        # Store the hierarchy in a tarfile since otherwise install -r is way
        # too slow to be invoked for every benchmark
        tarfile_path = opj(osp.dirname(ds_path), 'testds1.tar')
        with tarfile.open(tarfile_path, "w") as tar:
            # Python's tarfile cannot later extract these because the key dirs
            # are read-only; as a workaround, make everything writable first
            from datalad.utils import rotree
            rotree('testds1', ro=False, chmod_files=False)
            tar.add('testds1', recursive=True)
        rmtree('testds1')

        return tarfile_path

    def setup(self, tarfile_path):
        import tarfile
        tempdir = osp.dirname(tarfile_path)
        with tarfile.open(tarfile_path) as tar:
            tar.extractall(tempdir)

        # TODO -- remove this abomination after https://github.com/datalad/datalad/issues/1512 is fixed
        epath = opj(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        print("Finished setup for %s" % tempdir)

    def teardown(self, tarfile_path):
        for path in [self.ds.path + '_', self.ds.path]:
            print("Cleaning up %s" % path)
            if osp.exists(path):
                rmtree(path)

    def time_installr(self, tarfile_path):
        # somewhat duplicates setup, but a dedicated variant is not worth it for now
        assert install(self.ds.path + '_', source=self.ds.path, recursive=True)

    def time_createadd(self, tarfile_path):
        assert self.ds.create('newsubds')

    def time_createadd_to_dataset(self, tarfile_path):
        subds = create(opj(self.ds.path, 'newsubds'))
        self.ds.add(subds.path)

    def time_ls(self, tarfile_path):
        ls(self.ds.path)

    def time_ls_recursive(self, tarfile_path):
        ls(self.ds.path, recursive=True)

    def time_ls_recursive_long_all(self, tarfile_path):
        ls(self.ds.path, recursive=True, long_=True, all_=True)

    # TODO: uninstall does not really allow removing the top-level dataset, so disabled for now
    #def time_uninstall(self, tarfile_path):
    #    uninstall(self.ds.path, recursive=True)

    def time_remove(self, tarfile_path):
        remove(self.ds.path, recursive=True)
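
Further benchmarks could follow the same pattern. As a sketch only, a method that times metadata aggregation on the prepared dataset hierarchy might be added to the class above (assuming aggregate_metadata() is available on the dataset, as used elsewhere in this document):

    def time_aggregate_metadata(self, tarfile_path):
        # sketch of an additional benchmark in the style of the methods above;
        # aggregates metadata across the prepared dataset hierarchy
        self.ds.aggregate_metadata(recursive=True)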
Beispiel #56
0
    def test_addurls(self, path):
        ds = Dataset(path).create(force=True)

        def get_annex_commit_counts():
            return int(
                ds.repo.repo.git.rev_list("--count", "git-annex").strip())

        n_annex_commits = get_annex_commit_counts()

        with chpwd(path):
            ds.addurls(self.json_file, "{url}", "{name}")

            filenames = ["a", "b", "c"]
            for fname in filenames:
                ok_exists(fname)

            for (fname, meta), subdir in zip(ds.repo.get_metadata(filenames),
                                             ["foo", "bar", "foo"]):
                assert_dict_equal(meta,
                                  {"subdir": [subdir], "name": [fname]})

            # Ignore this check if we're faking dates because that disables
            # batch mode.
            if not os.environ.get('DATALAD_FAKE__DATES'):
                # We should have two new commits on the git-annex branch: one
                # for the added urls and one for the added metadata.
                eq_(n_annex_commits + 2, get_annex_commit_counts())

            # Add to already existing links, overwriting.
            with swallow_logs(new_level=logging.DEBUG) as cml:
                ds.addurls(self.json_file, "{url}", "{name}",
                           ifexists="overwrite")
                for fname in filenames:
                    assert_in("Removing {}".format(os.path.join(path, fname)),
                              cml.out)

            # Add to already existing links, skipping.
            assert_in_results(
                ds.addurls(self.json_file, "{url}", "{name}", ifexists="skip"),
                action="addurls",
                status="notneeded")

            # Adding to already existing links works, as long as the content is the same.
            ds.addurls(self.json_file, "{url}", "{name}")

            # But it fails if something has changed.
            ds.unlock("a")
            with open("a", "w") as ofh:
                ofh.write("changed")
            ds.save("a")

            assert_raises(IncompleteResultsError,
                          ds.addurls,
                          self.json_file, "{url}", "{name}")
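
The content of self.json_file is not shown in this excerpt. A hypothetical fixture matching the assertions above (filenames a/b/c with subdirs foo/bar/foo) could be generated like this; the URLs are placeholders and not part of the original test.

# hypothetical construction of a urls file matching the assertions above;
# the URLs are invented placeholders
import json

records = [
    {"url": "http://example.com/a.dat", "name": "a", "subdir": "foo"},
    {"url": "http://example.com/b.dat", "name": "b", "subdir": "bar"},
    {"url": "http://example.com/c.dat", "name": "c", "subdir": "foo"},
]
with open("urls.json", "w") as f:
    json.dump(records, f)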
Beispiel #57
0
    def test_addurls_dropped_urls(self, path):
        ds = Dataset(path).create(force=True)
        with chpwd(path), swallow_logs(new_level=logging.WARNING) as cml:
            ds.addurls(self.json_file, "", "{subdir}//{name}")
            assert_re_in(r".*Dropped [0-9]+ row\(s\) that had an empty URL",
                         str(cml.out))
Beispiel #58
0
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.save()
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If the file is not under annex, the metadata addition below silently
    # does nothing
    ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'})
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # the full dump is hard to diff, so just check that the expected keys are present
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
        ('egrep',
         ':mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # same as above; a leading ':' is stripped, it indicates "ALL FIELDS"
        ('egrep',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # same as above, but with AND condition
        # get both matches
        ('egrep',
         ['mp3', 'type:file'],
         opj('stim', 'stim1.mp3'),
         {'type': 'file', 'audio.format': 'mp3'}),
        # case insensitive search
        ('egrep',
         '(?i)mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # field selection by expression
        ('egrep',
         r'audio\.+:mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # random keyword query
        ('textblob',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'meta': 'mp3'}),
        # report which field matched with auto-field
        ('autofield',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # XXX the next one is not supported by the current text field analyzer,
        # which decomposes the mime type into [mime, audio, mp3]
        # ('autofield',
        # "'mime:audio/mp3'",
        # opj('stim', 'stim1.mp3'),
        # 'audio.format', 'mime:audio/mp3'),
        # but this one works
        ('autofield',
         "'mime audio mp3'",
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # TODO extend with more complex queries to test whoosh
        # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from, critical for
            # discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test that a suggestion message is logged if there are no hits and the key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
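
Outside the test harness, the aggregated audio metadata can be queried with the same call pattern. A minimal sketch, assuming 'ds' has been prepared and aggregated as above:

# sketch: query aggregated audio metadata and report what matched;
# assumes 'ds' was prepared and aggregated as in the test above
for hit in ds.search('audio.format:mp3', mode='egrep', full_record=True):
    print(hit['path'], hit['query_matched'])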
Beispiel #59
0
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 meta data sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    assert_equal(len(meta), 10)
    # same schema
    assert_equal(
        10,
        sum([s.get('@context', {'@vocab': None})['@vocab'] == 'http://schema.org/'
             for s in meta]))
    # three different IDs
    assert_equal(3, len(set([s.get('@id') for s in meta])))
    # and we know about all three datasets
    for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(sum([s.get('name', None) == assure_unicode(name) for s in meta]))
    #print(meta)
    assert_equal(
        # first implicit, then two natives, then aggregate
        meta[3]['dcterms:hasPart']['@id'],
        subds.id)
    success = False
    for m in meta:
        p = m.get('dcterms:hasPart', {})
        if p.get('@id', None) == subsubds.id:
            assert_equal(opj('sub', 'subsub'), p.get('location', None))
            success = True
    assert_true(success)

    # save the toplevel dataset only (see below)
    ds.save('with aggregated meta data', all_changes=True)

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'), source=ds.path)
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata; the implicit record for the top-most dataset should
    # differ, but the rest should not
    clonemeta = get_metadata(
        clone, guess_type=False, ignore_subdatasets=False, ignore_cache=False)

    # make sure the implicit md for the topmost dataset comes first
    assert_equal(clonemeta[0]['@id'], clone.id)
    assert_equal(clonemeta[0]['@id'], ds.id)
    assert_equal(clone.repo.get_hexsha(), ds.repo.get_hexsha())
    assert_equal(clonemeta[0]['version'], ds.repo.get_hexsha())
    # all but the implicit is identical
    assert_equal(clonemeta[1:], meta[1:])
    # the implicit md of the clone should list a dataset ID for its subds,
    # although it has not been obtained!
    assert_equal(
        clonemeta[3]['dcterms:hasPart']['@id'],
        subds.id)

    # now obtain a subdataset in the clone and the IDs should be updated
    clone.install('sub')
    partial = get_metadata(clone, guess_type=False, ignore_cache=True)
    # ids don't change
    assert_equal(partial[0]['@id'], clonemeta[0]['@id'])
    # datasets are properly connected
    assert_equal(partial[1]['dcterms:hasPart']['@id'],
                 partial[2]['@id'])

    # query smoke test
    if os.environ.get('DATALAD_TESTS_NONETWORK'):
        raise SkipTest

    assert_equal(len(list(clone.search('mother'))), 1)
    assert_equal(len(list(clone.search('MoTHER'))), 1)  # case insensitive

    child_res = list(clone.search('child'))
    assert_equal(len(child_res), 2)

    # little helper to match names
    def assert_names(res, names, path=clone.path):
        assert_equal(list(map(itemgetter(0), res)),
                     [opj(path, n) for n in names])
    # should yield (location, report) tuples
    assert_names(child_res, ['sub', 'sub/subsub'])

    # result should be identical to invoking search from api
    # and search_ should print out the locations
    with swallow_outputs() as cmo:
        res = list(search_('child', dataset=clone))
        assert_equal(res, child_res)
        assert_in(res[0][0], cmo.out)
    # and an overarching search_ call just to smoke-test output processing
    # without crashing (e.g. under PY3)
    with swallow_outputs() as cmo:
        assert list(search_('.', regex=True, dataset=clone))
        assert cmo.out

    # test searching among specified properties only
    assert_names(clone.search('i', search='name'), ['sub', 'sub/subsub'])
    assert_names(clone.search('i', search='keywords'), ['.'])
    # case shouldn't matter
    assert_names(clone.search('i', search='Keywords'), ['.'])
    assert_names(clone.search('i', search=['name', 'keywords']),
                 ['.', 'sub', 'sub/subsub'])

    # without report_matched, we get none of the fields
    assert(all([not x for x in map(itemgetter(1), child_res)]))
    # but we would get all if asking for '*'
    assert(all([len(x) >= 9
                for x in map(itemgetter(1),
                             list(clone.search('child', report='*')))]))
    # but we would get only the matching name if we ask for report_matched
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()),
                clone.search('child', report_matched=True))),
        set([('name',)])
    )
    # plus any additional field we may have asked for via report
    assert_equal(
        set(map(lambda x: tuple(sorted(x[1].keys())),
                clone.search('child', report_matched=True,
                             report=['schema:type']))),
        set([('name', 'schema:type')])
    )
    # and if we ask report to be 'empty', we should get no fields
    child_res_empty = list(clone.search('child', report=''))
    assert_equal(len(child_res_empty), 2)
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()), child_res_empty)),
        set([tuple()])
    )

    # more tests on returned paths:
    assert_names(clone.search('datalad'), ['.', 'sub', 'sub/subsub'])
    # if we install the subdataset and query for a value present in it and its child
    clone_sub = clone.install('sub')
    assert_names(clone_sub.search('datalad'), ['.', 'subsub'], clone_sub.path)

    # Test 'and' for multiple search entries
    assert_equal(len(list(clone.search(['child', 'bids']))), 2)
    assert_equal(len(list(clone.search(['child', 'subsub']))), 1)
    assert_equal(len(list(clone.search(['bids', 'sub']))), 2)

    res = list(clone.search('.*', regex=True))  # with regex
    assert_equal(len(res), 3)  # one per dataset

    # regexes are applied with search semantics, not anchored matching
    assert_equal(len(list(clone.search('randchild', regex=True))), 1)
    assert_equal(len(list(clone.search(['gr.nd', 'ch.ld'], regex=True))), 1)
    assert_equal(len(list(clone.search('randchil.', regex=True))), 1)
    assert_equal(len(list(clone.search('^randchild.*', regex=True))), 0)
    assert_equal(len(list(clone.search('^grandchild.*', regex=True))), 1)
    assert_equal(len(list(clone.search('grandchild'))), 1)
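
As the assertions above show, this (older) search API yields (location, report) tuples, with report_matched and report controlling which metadata fields are returned. A minimal usage sketch, assuming 'clone' is a dataset with aggregated metadata:

# sketch of the tuple-yielding search API exercised above
for location, report in clone.search('child', report_matched=True,
                                     report=['schema:type']):
    print(location, sorted(report))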
Beispiel #60
0
def test_archive(path):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.save()
    committed_date = ds.repo.get_commit_date()
    default_outname = opj(path, 'datalad_{}.tar.gz'.format(ds.id))
    with chpwd(path):
        res = list(ds.export_archive())
        assert_status('ok', res)
        assert_result_count(res, 1)
        assert(isabs(res[0]['path']))
    assert_true(os.path.exists(default_outname))
    custom_outname = opj(path, 'myexport.tar.gz')
    # feed in without extension
    ds.export_archive(filename=custom_outname[:-7])
    assert_true(os.path.exists(custom_outname))
    custom1_md5 = md5sum(custom_outname)
    # encodes the original archive filename -> different checksum, despite
    # identical content
    assert_not_equal(md5sum(default_outname), custom1_md5)
    # sleep so that, should the implementation start using the current time,
    # the checksum comparison below would catch it
    time.sleep(1.1)
    ds.export_archive(filename=custom_outname)
    # should not encode mtime, so should be identical
    assert_equal(md5sum(custom_outname), custom1_md5)

    def check_contents(outname, prefix):
        with tarfile.open(outname) as tf:
            nfiles = 0
            for ti in tf:
                # any annex links resolved
                assert_false(ti.issym())
                ok_startswith(ti.name, prefix + '/')
                assert_equal(ti.mtime, committed_date)
                if '.datalad' not in ti.name:
                    # ignore files under .datalad so this test is not
                    # affected by how much meta info we generate
                    nfiles += 1
            # we have exactly four files (includes .gitattributes for default
            # MD5E backend), and expect no content for any directory
            assert_equal(nfiles, 4)
    check_contents(default_outname, 'datalad_%s' % ds.id)
    check_contents(custom_outname, 'myexport')

    # now drop some content
    ds.drop('file_up', check=False)
    assert_raises(IOError, ds.export_archive, filename=opj(path, 'my'))
    ds.export_archive(filename=opj(path, 'partial'), missing_content='ignore')
    assert_true(os.path.exists(opj(path, 'partial.tar.gz')))
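
A compact usage sketch of the export_archive call patterns exercised above; the output paths are placeholders, and as the test shows, an archive extension is appended when omitted while missing_content='ignore' skips files whose content is not present locally.

# sketch of export_archive usage as exercised above; paths are placeholders
ds.export_archive(filename='/tmp/myexport')                            # -> /tmp/myexport.tar.gz
ds.export_archive(filename='/tmp/partial', missing_content='ignore')   # tolerate dropped content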