Example #1
def test_basic_metadata(path):
    ds = Dataset(opj(path, 'origin'))
    meta = get_metadata(ds)
    assert_equal(sorted(meta[0].keys()),
                 ['@context', 'dcterms:conformsTo'])
    ds.create(force=True, save=False)
    # with subdataset
    sub = ds.create('sub', force=True)
    ds.save()
    meta = get_metadata(ds)
    assert_equal(
        sorted(meta[0].keys()),
        ['@context', '@id', 'availableFrom', 'dcterms:conformsTo',
         'dcterms:modified', 'type', 'version'])
    assert_equal(meta[0]['type'], 'Dataset')
    # clone and get relationship info in metadata
    sibling = install(opj(path, 'sibling'), source=opj(path, 'origin'))
    sibling_meta = get_metadata(sibling)
    assert_equal(sibling_meta[0]['@id'], ds.id)
    # origin should learn about the clone
    sibling.repo.push(remote='origin', refspec='git-annex')
    meta = get_metadata(ds)
    assert_equal([m['@id'] for m in meta[0]['availableFrom']],
                 [m['@id'] for m in sibling_meta[0]['availableFrom']])
    meta = get_metadata(ds, guess_type=True)
    # without aggregation there is no trace of subdatasets in the metadata
    assert_not_in('dcterms:hasPart', meta[0])
Example #2
def init_dataset(self, dsdir: Path, create_time: datetime) -> Dataset:
    ds = Dataset(str(dsdir))
    if not ds.is_installed():
        log.info("Creating Datalad dataset")
        with custom_commit_date(create_time):
            with envset("GIT_CONFIG_PARAMETERS",
                        f"'init.defaultBranch={DEFAULT_BRANCH}'"):
                ds.create(cfg_proc="text2git")
        if self.config.backup_remote is not None:
            ds.repo.init_remote(
                self.config.backup_remote,
                [
                    "type=external",
                    "externaltype=rclone",
                    "chunk=1GB",
                    f"target={self.config.backup_remote}",  # I made them matching
                    "prefix=dandi-dandisets/annexstore",
                    "embedcreds=no",
                    "uuid=727f466f-60c3-4778-90b2-b2332856c2f8",
                    "encryption=none",
                    # shared, initialized in 000003
                ],
            )
            ds.repo.call_annex(["untrust", self.config.backup_remote])
            ds.repo.set_preferred_content(
                "wanted",
                "(not metadata=distribution-restrictions=*)",
                remote=self.config.backup_remote,
            )
    return ds
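
Examples #2 and #4 lean on an `envset` helper that is not shown in either snippet. A minimal sketch of such a context manager, assuming it simply sets one environment variable for the duration of the block (my assumption, not necessarily the project's actual implementation):

import os
from contextlib import contextmanager

@contextmanager
def envset(name, value):
    # Temporarily set an environment variable; restore the previous state on exit.
    old = os.environ.get(name)
    os.environ[name] = value
    try:
        yield
    finally:
        if old is None:
            del os.environ[name]
        else:
            os.environ[name] = old

Used as in the examples, e.g. `with envset("GIT_CONFIG_PARAMETERS", "'init.defaultBranch=main'"): ...`, so that the subsequent `ds.create()` picks up the desired default branch.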
Example #3
def test_basic_metadata(path):
    ds = Dataset(opj(path, 'origin'))
    meta = get_metadata(ds)
    assert_equal(sorted(meta[0].keys()), ['@context', 'dcterms:conformsTo'])
    ds.create(force=True, save=False)
    # with subdataset
    sub = ds.create('sub', force=True, if_dirty='ignore')
    ds.save()
    meta = get_metadata(ds)
    assert_equal(sorted(meta[0].keys()), [
        '@context', '@id', 'availableFrom', 'dcterms:conformsTo',
        'dcterms:modified', 'type', 'version'
    ])
    assert_equal(meta[0]['type'], 'Dataset')
    # clone and get relationship info in metadata
    sibling = install(opj(path, 'sibling'), source=opj(path, 'origin'))
    sibling_meta = get_metadata(sibling)
    assert_equal(sibling_meta[0]['@id'], ds.id)
    # origin should learn about the clone
    sibling.repo.push(remote='origin', refspec='git-annex')
    meta = get_metadata(ds)
    assert_equal([m['@id'] for m in meta[0]['availableFrom']],
                 [m['@id'] for m in sibling_meta[0]['availableFrom']])
    meta = get_metadata(ds, guess_type=True)
    # without aggregation there is no trace of subdatasets in the metadata
    assert_not_in('dcterms:hasPart', meta[0])
Example #4
def ensure_superdataset(self) -> Dataset:
    superds = Dataset(self.target_path)
    if not superds.is_installed():
        log.info("Creating Datalad superdataset")
        with envset("GIT_CONFIG_PARAMETERS",
                    f"'init.defaultBranch={DEFAULT_BRANCH}'"):
            superds.create(cfg_proc="text2git")
    return superds
Example #5
def test_custom_commit_date(tmp_path: Path) -> None:
    ds = Dataset(tmp_path)
    ds.create(cfg_proc="text2git")
    (tmp_path / "file.txt").write_text("This is test text.\n")
    with custom_commit_date(
            datetime(2021, 6, 1, 12, 34, 56, tzinfo=timezone.utc)):
        ds.save(message="Add a file")
    repo = GitRepo(tmp_path)
    assert repo.get_commit_date("HEAD") == "2021-06-01T12:34:56+00:00"
    assert repo.get_commit_author(
        "HEAD") == "DANDI User <*****@*****.**>"
Example #6
def test_ephemeral(ds_path=None, store_path=None, clone_path=None):

    dspath = Path(ds_path)
    store = Path(store_path)
    file_test = Path('file1.txt')
    file_testsub = Path('sub') / 'other.txt'

    # create the original dataset
    ds = Dataset(dspath)
    ds.create(force=True)
    ds.save()

    # put into store:
    ds.create_sibling_ria("ria+{}".format(store.as_uri()),
                          "riastore",
                          new_store_ok=True)
    ds.push(to="riastore", data="anything")

    # now, get an ephemeral clone from the RIA store:
    eph_clone = clone('ria+{}#{}'.format(store.as_uri(), ds.id),
                      clone_path,
                      reckless="ephemeral")

    # ephemeral clone was properly linked (store has bare repos!):
    clone_annex = (eph_clone.repo.dot_git / 'annex')
    assert_true(clone_annex.is_symlink())
    assert_true(clone_annex.resolve().samefile(store / ds.id[:3] / ds.id[3:] /
                                               'annex'))
    if not eph_clone.repo.is_managed_branch():
        # TODO: We can't properly handle adjusted branch yet
        # we don't need to get files in order to access them:
        assert_equal((eph_clone.pathobj / file_test).read_text(), "some")
        assert_equal((eph_clone.pathobj / file_testsub).read_text(), "other")

        # can we unlock those files?
        eph_clone.unlock(file_test)
        # change content
        (eph_clone.pathobj / file_test).write_text("new content")
        eph_clone.save()

        # new content should already be in store
        # (except the store doesn't know yet)
        res = eph_clone.repo.fsck(remote="riastore-storage", fast=True)
        assert_equal(len(res), 2)
        assert_result_count(res, 1, success=True, file=file_test.as_posix())
        assert_result_count(res, 1, success=True, file=file_testsub.as_posix())

        # push back git history
        eph_clone.push(to=DEFAULT_REMOTE, data="nothing")

        # get an update in origin
        ds.update(merge=True, reobtain_data=True)
        assert_equal((ds.pathobj / file_test).read_text(), "new content")
Example #7
def annex_path(tmpdir_factory):
    path = tmpdir_factory.mktemp('annexes')
    ds_path = str(path.join(DATASET_ID))
    # Create an empty dataset for testing
    ds = Dataset(ds_path)
    ds.create()
    ds.no_annex(BIDS_NO_ANNEX)
    json_path = os.path.join(ds_path, 'dataset_description.json')
    with open(json_path, 'w') as f:
        json.dump(DATASET_DESCRIPTION, f, ensure_ascii=False)
    ds.add(json_path)
    ds.save(version_tag=SNAPSHOT_ID)
    # Setup a seed for any new_dataset uses
    random.seed(42)
    return path
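
`annex_path` reads like the body of a pytest fixture whose `@pytest.fixture` decorator was lost in extraction; if so (an assumption), it would be registered and consumed roughly like this:

import pytest

@pytest.fixture(scope='session')  # scope is a guess; the snippet does not say
def annex_path(tmpdir_factory):
    ...  # body as in Example #7, returning the py.path.local created above

def test_uses_fixture(annex_path):
    # pytest injects the prepared directory; DATASET_ID lives inside it
    assert annex_path.join(DATASET_ID).check(dir=True)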
Example #8
def test_get_default_title(path):
    repo = GitRepo(path)
    ds = Dataset(path)
    # There is no dataset initialized yet, so only the path will be the title
    dirname = op.basename(path)
    eq_(_get_default_title(ds), dirname)

    # Initialize and get UUID
    ds.create(force=True)
    eq_(_get_default_title(ds), '{dirname}#{ds.id}'.format(**locals()))

    # Tag and get @version
    # cannot use ds.save since our tags are not annotated,
    # see https://github.com/datalad/datalad/issues/4139
    ds.repo.tag("0.1", message="important version")
    eq_(_get_default_title(ds), '{dirname}#{ds.id}@0.1'.format(**locals()))
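
Example #8 only exercises `_get_default_title`; the assertions imply titles of the form `<dirname>`, `<dirname>#<id>`, and `<dirname>#<id>@<tag>`. A rough reconstruction consistent with those assertions (purely an assumption, not the actual implementation):

import os.path as op

def _get_default_title(ds):
    # Hypothetical sketch: path basename, plus the dataset id once created,
    # plus the most recent tag if one exists.
    title = op.basename(ds.path)
    if ds.id:
        title += '#{}'.format(ds.id)
        tags = ds.repo.get_tags()  # assumed to return a list of tag records
        if tags:
            title += '@{}'.format(tags[-1]['name'])
    return title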
Example #9
def test_custom_call_fmt(path, local_file):
    ds = Dataset(path).create()
    subds = ds.create('sub')

    # plug in a proper singularity image
    subds.containers_add(
        'mycontainer',
        url=get_local_file_url(op.join(local_file, 'some_container.img')),
        image='righthere',
        call_fmt='echo image={img} cmd={cmd} img_dspath={img_dspath} '
        # and environment variable being set/propagated by default
        'name=$DATALAD_CONTAINER_NAME')
    ds.save()  # record the effect in super-dataset

    # Running should work fine either within sub or within super
    out = WitlessRunner(cwd=subds.path).run(
        ['datalad', 'containers-run', '-n', 'mycontainer', 'XXX'],
        protocol=StdOutCapture)
    assert_in('image=righthere cmd=XXX img_dspath=. name=mycontainer',
              out['stdout'])

    out = WitlessRunner(cwd=ds.path).run(
        ['datalad', 'containers-run', '-n', 'sub/mycontainer', 'XXX'],
        protocol=StdOutCapture)
    assert_in('image=sub/righthere cmd=XXX img_dspath=sub', out['stdout'])

    # Test within subdirectory of the super-dataset
    subdir = op.join(ds.path, 'subdir')
    os.mkdir(subdir)
    out = WitlessRunner(cwd=subdir).run(
        ['datalad', 'containers-run', '-n', 'sub/mycontainer', 'XXX'],
        protocol=StdOutCapture)
    assert_in('image=../sub/righthere cmd=XXX img_dspath=../sub',
              out['stdout'])
Example #10
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True, if_dirty='ignore')
    subds.repo.remove(opj('.datalad', 'config'))
    subds.save()
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True, if_dirty='ignore')
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 meta data sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(ds,
                        guess_type=False,
                        ignore_subdatasets=False,
                        ignore_cache=False)
    # and we still know about the grandchild (subsub)
    for name in ('grandchild_äöü東', ):
        assert_true(
            sum([s.get('name', '') == assure_unicode(name) for s in meta]))

    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and let's now see that we don't fail if a dataset is duplicated, i.e. if
    # we install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
Example #11
def test_custom_call_fmt(path, local_file):
    ds = Dataset(path).create()
    subds = ds.create('sub')

    # plug in a proper singularity image
    subds.containers_add(
        'mycontainer',
        url=get_local_file_url(op.join(local_file, 'some_container.img')),
        image='righthere',
        call_fmt='echo image={img} cmd={cmd} img_dspath={img_dspath} '
                 # and environment variable being set/propagated by default
                 'name=$DATALAD_CONTAINER_NAME'
    )
    ds.save()  # record the effect in super-dataset

    # Running should work fine either within sub or within super
    with swallow_outputs() as cmo:
        subds.containers_run('XXX', container_name='mycontainer')
        assert_in('image=righthere cmd=XXX img_dspath=. name=mycontainer', cmo.out)

    with swallow_outputs() as cmo:
        ds.containers_run('XXX', container_name='sub/mycontainer')
        assert_in('image=sub/righthere cmd=XXX img_dspath=sub', cmo.out)

    # Test within subdirectory of the super-dataset
    subdir = op.join(ds.path, 'subdir')
    os.mkdir(subdir)
    with chpwd(subdir):
        with swallow_outputs() as cmo:
            containers_run('XXX', container_name='sub/mycontainer')
            assert_in('image=../sub/righthere cmd=XXX img_dspath=../sub', cmo.out)
Example #12
def new_dataset(annex_path):
    """Create a new dataset with a unique name for one test."""
    ds_path = str(annex_path.join(id_generator()))
    ds = Dataset(ds_path)
    ds.create()
    ds.no_annex(BIDS_NO_ANNEX)
    json_path = os.path.join(ds_path, 'dataset_description.json')
    dsdesc = {
        'BIDSVersion': '1.0.2',
        'License': 'This is not a real dataset',
        'Name': 'Test fixture new dataset',
    }
    with open(json_path, 'w') as f:
        json.dump(dsdesc, f, ensure_ascii=False)
    ds.add(json_path)
    return ds
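
Both this fixture and `annex_path` in Example #7 call an `id_generator` helper that is not shown. A plausible stand-in that returns a short random name (hypothetical; the real helper may differ):

import random
import string

def id_generator(size=8, chars=string.ascii_lowercase + string.digits):
    # Return a random identifier such as 'a3f9k2x1' for unique dataset directories.
    return ''.join(random.choice(chars) for _ in range(size))

Note that Example #7 seeds `random` with a fixed value, so names produced this way stay reproducible across a test session.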
Example #13
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subds.remove(opj('.datalad', 'config'), if_dirty='ignore')
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 meta data sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    # and we still know about the grandchild (subsub)
    for name in ('grandchild_äöü東',):
        assert_true(sum([s.get('name', '') == assure_unicode(name) for s in meta]))

    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and let's now see that we don't fail if a dataset is duplicated, i.e. if
    # we install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
Example #14
def test_dicom2spec(path):

    # ###  SETUP ###
    dicoms = get_dicom_dataset('structural')

    ds = Dataset.create(path, cfg_proc=['hirni'])
    ds.install(source=dicoms, path='acq100')
    # Note: Recursive, since aggregation wasn't performed in the installed datasets
    # TODO: Use get_raw_sd from above instead of this setup
    ds.meta_aggregate('acq100', into='top', recursive=True)
    # ### END SETUP ###

    # TODO: should it be specfile or acq/specfile? => At least doc needed,
    # if not change
    res = ds.hirni_dicom2spec(path='acq100', spec='spec_structural.json')

    # check for actual location of spec_structural!
    # => studyds root!

    assert_result_count(res, 2)
    assert_result_count(res, 1, path=op.join(ds.path, 'spec_structural.json'))
    assert_result_count(res, 1, path=op.join(ds.path, '.gitattributes'))
    ok_clean_git(ds.path)

    # multiple execution shouldn't change .gitattributes again:
    from os import stat
    mtime = stat(op.join(ds.path, '.gitattributes')).st_mtime
    res = ds.hirni_dicom2spec(path='acq100', spec='spec_structural.json')
    assert_equal(stat(op.join(ds.path, '.gitattributes')).st_mtime, mtime)
Example #15
    def get_raw_dataset(self):
        # Note: This is lazy to avoid building on import time, since import is part of nose's discovery and executed
        # before the dependencies. This leads to datalad's ui backend not yet being correctly set, which in turn
        # lets the cloning hang within progressbar generation.
        if not self._dspath:
            import tempfile
            kwargs = get_tempfile_kwargs()
            path = tempfile.mkdtemp(**kwargs)
            f_dicoms = get_dicom_dataset('functional')
            s_dicoms = get_dicom_dataset('structural')
            ds = Dataset.create(path, cfg_proc=['hirni'])
            ds.install(source=f_dicoms, path=op.join('func_acq', 'dicoms'))
            ds.install(source=s_dicoms, path=op.join('struct_acq', 'dicoms'))

            # Note: Recursive, since aggregation wasn't performed in the installed datasets
            ds.meta_aggregate([
                op.join('func_acq', 'dicoms'),
                op.join('struct_acq', 'dicoms')
            ],
                              into='top',
                              recursive=True)

            # TODO: Figure how to add it to things to be removed after tests ran
            self._dspath = ds.path
        return self._dspath
Example #16
def test_annex_get_from_subdir(topdir=None):
    ds = Dataset(topdir)
    ds.create(force=True)
    ds.save('a.tar.gz')
    ds.add_archive_content('a.tar.gz', delete=True)
    fpath = op.join(topdir, 'a', 'd', fn_in_archive_obscure)

    with chpwd(op.join(topdir, 'a', 'd')):
        runner = WitlessRunner()
        runner.run(['git', 'annex', 'drop', '--', fn_in_archive_obscure],
                   protocol=KillOutput)  # run git annex drop
        assert_false(ds.repo.file_has_content(
            fpath))  # and verify if file deleted from directory
        runner.run(['git', 'annex', 'get', '--', fn_in_archive_obscure],
                   protocol=KillOutput)  # run git annex get
        assert_true(ds.repo.file_has_content(
            fpath))  # and verify if file got into directory
Example #17
def test_dicom_metadata_aggregation(path):
    dicoms = get_dicom_dataset('structural')

    ds = Dataset.create(path)
    ds.install(source=dicoms, path='acq100')
    ds.aggregate_metadata(recursive=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 2)
    assert_result_count(res, 1, path=opj(ds.path, 'acq100'))
Example #18
def _get_nested_collections(path):
    ds = Dataset(path).create()
    c1 = ds.create(ds.pathobj / 'subdir' / 'collection1')
    c1s1 = c1.create('sub1')
    c1s2 = c1.create('sub2')
    c2 = ds.create('collection2')
    c2s1 = c2.create('sub1')
    c2s11 = c2s1.create('deepsub1')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # return a catalog
    return dict(
        root=ds,
        c1=c1,
        c1s1=c1s1,
        c1s2=c1s2,
        c2=c2,
        c2s1=c2s1,
        c2s11=c2s11,
    )
Example #19
def test_dicom_metadata_aggregation(path):
    dicoms = get_dicom_dataset('structural')

    ds = Dataset.create(path)
    ds.install(source=dicoms, path='acq100')

    # Note: Recursive, since aggregation wasn't performed in the installed datasets
    ds.meta_aggregate('acq100', into='top', recursive=True)
    res = ds.meta_dump(reporton='aggregates', recursive=True)
    assert_result_count(res, 2)
    assert_result_count(res, 1, path=op.join(ds.path, 'acq100'))
Example #20
def new_dataset(datalad_store):
    """Create a new dataset with a unique name for one test."""
    ds_path = str(os.path.join(datalad_store.annex_path, id_generator()))
    ds = Dataset(ds_path)
    ds.create()
    ds.no_annex(BIDS_NO_ANNEX)

    json_path = os.path.join(ds_path, 'dataset_description.json')
    dsdesc = {
        'BIDSVersion': '1.0.2',
        'License': 'This is not a real dataset',
        'Name': 'Test fixture new dataset',
    }
    with open(json_path, 'w') as f:
        json.dump(dsdesc, f, ensure_ascii=False)
    ds.save(json_path)

    changes_path = os.path.join(ds_path, 'CHANGES')
    with open(changes_path, 'w') as f:
        f.write(CHANGES)
    ds.save(changes_path)
    ds.close()
    return ds
Example #21
def make_ds_hierarchy_with_metadata(path):
    """Test helper that returns the two datasets in the hierarchy

    The top-level dataset contains an annex'ed file with annex
    metadata.
    """
    ds = Dataset(path).create(force=True)
    create_tree(ds.path, {'file.dat': 'content'})
    ds.save()
    ds.repo.set_metadata('file.dat', reset={'tag': ['one', 'two']})
    subds = ds.create('sub')
    # we need one real piece of content for metadata extraction
    (subds.pathobj / 'real').write_text(text_type('real'))
    ds.save(recursive=True)
    return ds, subds
Example #22
def test_list_contains(path):
    ds = Dataset(path).create()
    subds_a = ds.create("a")
    subds_b = ds.create("b")
    subds_a_c = subds_a.create("c")

    add_pyscript_image(subds_a_c, "in-c", "img")
    add_pyscript_image(subds_a, "in-a", "img")
    add_pyscript_image(subds_b, "in-b", "img")
    add_pyscript_image(ds, "in-top", "img")

    ds.save(recursive=True)

    assert_result_count(ds.containers_list(recursive=True, **RAW_KWDS), 4)

    assert_result_count(ds.containers_list(contains=["nowhere"],
                                           recursive=True,
                                           **RAW_KWDS),
                        1,
                        name="in-top",
                        action='containers')

    res = ds.containers_list(contains=[subds_a.path],
                             recursive=True,
                             **RAW_KWDS)
    assert_result_count(res, 3)
    assert_in_results(res, name="in-top")
    assert_in_results(res, name="a/in-a")
    assert_in_results(res, name="a/c/in-c")

    res = ds.containers_list(contains=[subds_a_c.path],
                             recursive=True,
                             **RAW_KWDS)
    assert_result_count(res, 3)
    assert_in_results(res, name="in-top")
    assert_in_results(res, name="a/in-a")
    assert_in_results(res, name="a/c/in-c")

    res = ds.containers_list(contains=[subds_b.path],
                             recursive=True,
                             **RAW_KWDS)
    assert_result_count(res, 2)
    assert_in_results(res, name="in-top")
    assert_in_results(res, name="b/in-b")
Example #23
def _single_session_dicom2bids(label, path, toolbox_url):

    with patch.dict('os.environ',
                    {'DATALAD_HIRNI_TOOLBOX_URL': toolbox_url}):
        ds = Dataset.create(path, cfg_proc=['hirni'])

    subject = "02"
    acquisition = "{sub}_{label}".format(sub=subject, label=label)

    dicoms = get_dicom_dataset(label)
    ds.install(source=dicoms, path=op.join(acquisition, 'dicoms'))
    # Note: Recursive, since aggregation wasn't performed in the installed datasets
    ds.meta_aggregate(op.join(acquisition, 'dicoms'), into='top', recursive=True)

    spec_file = 'spec_{label}.json'.format(label=label)
    ds.hirni_dicom2spec(path=op.join(acquisition, 'dicoms'),
                        spec=op.join(acquisition, spec_file))

    ds.hirni_spec2bids(op.join(acquisition, spec_file))
Example #24
def test_dryrun(path=None):
    ds = Dataset(path).create()
    # see that the correct request would be made
    res = ds.create_sibling_gin('bogus', credential='some', dry_run=True)
    assert_result_count(res, 1)
    res = res[0]
    eq_(res['request_url'], 'https://gin.g-node.org/api/v1/user/repos')
    # we don't care much which user-agent, but there should be one
    assert_in('user-agent', res['request_headers'])
    # only a placeholder no-token makes it into the request
    assert_in('NO-TOKEN-AVAILABLE', res['request_headers']['authorization'])
    # correct name
    eq_(res['request_data']['name'], 'bogus')
    # public by default
    eq_(res['request_data']['private'], False)
    # it is important that we do not tell the portal to generate some
    # repo content
    eq_(res['request_data']['auto_init'], False)

    # org repo
    res = ds.create_sibling_gin('strangeorg/bogus', credential='some',
                                dry_run=True)
    assert_result_count(res, 1)
    res = res[0]
    eq_(res['request_data']['name'], 'bogus')
    eq_(res['request_url'],
        'https://gin.g-node.org/api/v1/org/strangeorg/repos')

    # recursive name, building
    subds = ds.create('subds')
    res = ds.create_sibling_gin(
        'bogus', recursive=True, credential='some', dry_run=True)
    eq_(res[-1]['request_data']['name'], 'bogus-subds')

    # ignore unavailable datasets
    ds.drop('subds', what='all', reckless='kill', recursive=True)
    res = ds.create_sibling_gin(
        'bogus', recursive=True, credential='some', dry_run=True)
    eq_(len(res), 1)
Example #25
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 meta data sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    assert_equal(len(meta), 10)
    # same schema
    assert_equal(
        10,
        sum([s.get('@context', {'@vocab': None})['@vocab'] == 'http://schema.org/'
             for s in meta]))
    # three different IDs
    assert_equal(3, len(set([s.get('@id') for s in meta])))
    # and we know about all three datasets
    for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(sum([s.get('name', None) == assure_unicode(name) for s in meta]))
    #print(meta)
    assert_equal(
        # first implicit, then two natives, then aggregate
        meta[3]['dcterms:hasPart']['@id'],
        subds.id)
    success = False
    for m in meta:
        p = m.get('dcterms:hasPart', {})
        if p.get('@id', None) == subsubds.id:
            assert_equal(opj('sub', 'subsub'), p.get('location', None))
            success = True
    assert_true(success)

    # save the toplevel dataset only (see below)
    ds.save('with aggregated meta data', all_changes=True)

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'), source=ds.path)
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh meta data, the implicit one for the top-most datasets should
    # differ, but the rest not
    clonemeta = get_metadata(
        clone, guess_type=False, ignore_subdatasets=False, ignore_cache=False)

    # make sure the implicit md for the topmost come first
    assert_equal(clonemeta[0]['@id'], clone.id)
    assert_equal(clonemeta[0]['@id'], ds.id)
    assert_equal(clone.repo.get_hexsha(), ds.repo.get_hexsha())
    assert_equal(clonemeta[0]['version'], ds.repo.get_hexsha())
    # all but the implicit is identical
    assert_equal(clonemeta[1:], meta[1:])
    # the implicit md of the clone should list a dataset ID for its subds,
    # although it has not been obtained!
    assert_equal(
        clonemeta[3]['dcterms:hasPart']['@id'],
        subds.id)

    # now obtain a subdataset in the clone and the IDs should be updated
    clone.install('sub')
    partial = get_metadata(clone, guess_type=False, ignore_cache=True)
    # ids don't change
    assert_equal(partial[0]['@id'], clonemeta[0]['@id'])
    # datasets are properly connected
    assert_equal(partial[1]['dcterms:hasPart']['@id'],
                 partial[2]['@id'])

    # query smoke test
    if os.environ.get('DATALAD_TESTS_NONETWORK'):
        raise SkipTest

    assert_equal(len(list(clone.search('mother'))), 1)
    assert_equal(len(list(clone.search('MoTHER'))), 1)  # case insensitive

    child_res = list(clone.search('child'))
    assert_equal(len(child_res), 2)

    # little helper to match names
    def assert_names(res, names, path=clone.path):
        assert_equal(list(map(itemgetter(0), res)),
                     [opj(path, n) for n in names])
    # should yield (location, report) tuples
    assert_names(child_res, ['sub', 'sub/subsub'])

    # result should be identical to invoking search from api
    # and search_ should spit out locations out
    with swallow_outputs() as cmo:
        res = list(search_('child', dataset=clone))
        assert_equal(res, child_res)
        assert_in(res[0][0], cmo.out)
    # and overarching search_ just for smoke testing of processing outputs
    # and not puking (e.g. under PY3)
    with swallow_outputs() as cmo:
        assert list(search_('.', regex=True, dataset=clone))
        assert cmo.out

    # test searching among specified properties only
    assert_names(clone.search('i', search='name'), ['sub', 'sub/subsub'])
    assert_names(clone.search('i', search='keywords'), ['.'])
    # case shouldn't matter
    assert_names(clone.search('i', search='Keywords'), ['.'])
    assert_names(clone.search('i', search=['name', 'keywords']),
                 ['.', 'sub', 'sub/subsub'])

    # without report_matched, we are getting none of the fields
    assert(all([not x for x in map(itemgetter(1), child_res)]))
    # but we would get all if asking for '*'
    assert(all([len(x) >= 9
                for x in map(itemgetter(1),
                             list(clone.search('child', report='*')))]))
    # but we would get only the matching name if we ask for report_matched
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()),
                clone.search('child', report_matched=True))),
        set([('name',)])
    )
    # and the additional field we might have asked with report
    assert_equal(
        set(map(lambda x: tuple(sorted(x[1].keys())),
                clone.search('child', report_matched=True,
                             report=['schema:type']))),
        set([('name', 'schema:type')])
    )
    # and if we ask report to be 'empty', we should get no fields
    child_res_empty = list(clone.search('child', report=''))
    assert_equal(len(child_res_empty), 2)
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()), child_res_empty)),
        set([tuple()])
    )

    # more tests on returned paths:
    assert_names(clone.search('datalad'), ['.', 'sub', 'sub/subsub'])
    # if we clone subdataset and query for value present in it and its kid
    clone_sub = clone.install('sub')
    assert_names(clone_sub.search('datalad'), ['.', 'subsub'], clone_sub.path)

    # Test 'and' for multiple search entries
    assert_equal(len(list(clone.search(['child', 'bids']))), 2)
    assert_equal(len(list(clone.search(['child', 'subsub']))), 1)
    assert_equal(len(list(clone.search(['bids', 'sub']))), 2)

    res = list(clone.search('.*', regex=True))  # with regex
    assert_equal(len(res), 3)  # one per dataset

    # we do search, not match
    assert_equal(len(list(clone.search('randchild', regex=True))), 1)
    assert_equal(len(list(clone.search(['gr.nd', 'ch.ld'], regex=True))), 1)
    assert_equal(len(list(clone.search('randchil.', regex=True))), 1)
    assert_equal(len(list(clone.search('^randchild.*', regex=True))), 0)
    assert_equal(len(list(clone.search('^grandchild.*', regex=True))), 1)
    assert_equal(len(list(clone.search('grandchild'))), 1)
Example #26
def _test_create_store(host, base_path, ds_path, clone_path):

    ds = Dataset(ds_path).create(force=True)

    subds = ds.create('sub', force=True)
    subds2 = ds.create('sub2', force=True, annex=False)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    # don't specify special remote. By default should be git-remote + "-storage"
    res = ds.create_sibling_ria("ria+ssh://test-store:", "datastore")
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_(len(res), 1)

    # remotes exist, but only in super
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-storage', 'here'},
        {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'here'}, {s['name'] for s in sub_siblings})
    sub2_siblings = subds2.siblings(result_renderer=None)
    eq_({'here'}, {s['name'] for s in sub2_siblings})

    # TODO: post-update hook was enabled

    # check bare repo:
    git_config = Path(base_path) / ds.id[:3] / ds.id[3:] / 'config'
    assert git_config.exists()
    content = git_config.read_text()
    assert_in("[datalad \"ora-remote\"]", content)
    super_uuid = ds.config.get("remote.{}.annex-uuid".format('datastore-storage'))
    assert_in("uuid = {}".format(super_uuid), content)

    # implicit test of success by ria-installing from store:
    ds.publish(to="datastore", transfer_data='all')
    with chpwd(clone_path):
        if host:
            # note, we are not using the "test-store"-label here
            clone('ria+ssh://{}{}#{}'.format(host, base_path, ds.id),
                  path='test_install')
        else:
            # TODO: Whenever ria+file supports special remote config (label),
            # change here:
            clone('ria+file://{}#{}'.format(base_path, ds.id),
                  path='test_install')
        installed_ds = Dataset(op.join(clone_path, 'test_install'))
        assert installed_ds.is_installed()
        assert_repo_status(installed_ds.repo)
        eq_(installed_ds.id, ds.id)
        assert_in(op.join('ds', 'file1.txt'),
                  installed_ds.repo.get_annexed_files())
        assert_result_count(installed_ds.get(op.join('ds', 'file1.txt')),
                            1,
                            status='ok',
                            action='get',
                            path=op.join(installed_ds.path, 'ds', 'file1.txt'))

    # now, again but recursive.
    res = ds.create_sibling_ria("ria+ssh://test-store:", "datastore",
                                recursive=True, existing='reconfigure')
    eq_(len(res), 3)
    assert_result_count(res, 1, path=str(ds.pathobj), status='ok', action="create-sibling-ria")
    assert_result_count(res, 1, path=str(subds.pathobj), status='ok', action="create-sibling-ria")
    assert_result_count(res, 1, path=str(subds2.pathobj), status='ok', action="create-sibling-ria")

    # remotes now exist in super and sub
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-storage', 'here'},
        {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-storage', 'here'},
        {s['name'] for s in sub_siblings})
    # but no special remote in plain git subdataset:
    sub2_siblings = subds2.siblings(result_renderer=None)
    eq_({'datastore', 'here'},
        {s['name'] for s in sub2_siblings})

    # for testing trust_level parameter, redo for each label:
    for trust in ['trust', 'semitrust', 'untrust']:
        ds.create_sibling_ria("ria+ssh://test-store:",
                              "datastore",
                              existing='reconfigure',
                              trust_level=trust)
        res = ds.repo.repo_info()
        assert_in('[datastore-storage]',
                  [r['description']
                   for r in res['{}ed repositories'.format(trust)]])
Example #27
def test_container_from_subdataset(ds_path, src_subds_path, local_file):

    # prepare a to-be subdataset with a registered container
    src_subds = Dataset(src_subds_path).create()
    src_subds.containers_add(name="first",
                             url=get_local_file_url(
                                 op.join(local_file, 'some_container.img')))
    # add it as subdataset to a super ds:
    ds = Dataset(ds_path).create()
    subds = ds.install("sub", source=src_subds_path)
    # add it again one level down to see actual recursion:
    subds.install("subsub", source=src_subds_path)

    # We come up empty without recursive:
    res = ds.containers_list(recursive=False, **RAW_KWDS)
    assert_result_count(res, 0)

    # query available containers from within super:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_in_results(res, action="containers", refds=ds.path)

    # default location within the subdataset:
    target_path = op.join(subds.path, '.datalad', 'environments', 'first',
                          'image')
    assert_result_count(res,
                        1,
                        name='sub/first',
                        type='file',
                        action='containers',
                        status='ok',
                        path=target_path,
                        parentds=subds.path)

    # a not-installed subdataset doesn't pose an issue:
    sub2 = ds.create("sub2")
    assert_result_count(ds.subdatasets(), 2, type="dataset")
    ds.uninstall("sub2")
    from datalad.tests.utils import assert_false
    assert_false(sub2.is_installed())

    # same results as before, not crashing or somehow confused by a not present
    # subds:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_result_count(res,
                        1,
                        name='sub/first',
                        type='file',
                        action='containers',
                        status='ok',
                        path=target_path,
                        parentds=subds.path)

    # The default renderer includes the image names.
    with swallow_outputs() as out:
        ds.containers_list(recursive=True)
        lines = out.out.splitlines()
    assert_re_in("sub/first", lines)
    assert_re_in("sub/subsub/first", lines)
    # But we are careful not to render partial names from subdataset traversals
    # (i.e. we recurse with containers_list(..., result_renderer=None)).
    with assert_raises(AssertionError):
        assert_re_in("subsub/first", lines)
Example #28
def _test_create_store(host, base_path, ds_path, clone_path):
    skip_if_no_module("ria_remote")  # special remote needs to be installed

    ds = Dataset(ds_path).create(force=True)

    subds = ds.create('sub', force=True)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    # don't specify special remote. By default should be git-remote + "-ria"
    res = ds.create_sibling_ria("ria+ssh://test-store:", "datastore")
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_(len(res), 1)

    # remotes exist, but only in super
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'}, {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'here'}, {s['name'] for s in sub_siblings})

    # TODO: post-update hook was enabled

    # implicit test of success by ria-installing from store:
    ds.publish(to="datastore", transfer_data='all')
    with chpwd(clone_path):
        if host:
            # note, we are not using the "test-store"-label here
            clone('ria+ssh://{}{}#{}'.format(host, base_path, ds.id),
                  path='test_install')
        else:
            # TODO: Whenever ria+file supports special remote config (label),
            # change here:
            clone('ria+file://{}#{}'.format(base_path, ds.id),
                  path='test_install')
        installed_ds = Dataset(op.join(clone_path, 'test_install'))
        assert installed_ds.is_installed()
        assert_repo_status(installed_ds.repo)
        eq_(installed_ds.id, ds.id)
        assert_in(op.join('ds', 'file1.txt'),
                  installed_ds.repo.get_annexed_files())
        assert_result_count(installed_ds.get(op.join('ds', 'file1.txt')),
                            1,
                            status='ok',
                            action='get',
                            path=op.join(installed_ds.path, 'ds', 'file1.txt'))

    # now, again but recursive.
    res = ds.create_sibling_ria("ria+ssh://test-store:",
                                "datastore",
                                recursive=True,
                                existing='reconfigure')
    eq_(len(res), 2)
    assert_result_count(res, 2, status='ok', action="create-sibling-ria")

    # remotes now exist in super and sub
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'}, {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'},
        {s['name']
         for s in sub_siblings})

    # for testing trust_level parameter, redo for each label:
    for trust in ['trust', 'semitrust', 'untrust']:
        ds.create_sibling_ria("ria+ssh://test-store:",
                              "datastore",
                              existing='reconfigure',
                              trust_level=trust)
        res = ds.repo.repo_info()
        assert_in(
            '[datastore-ria]',
            [r['description'] for r in res['{}ed repositories'.format(trust)]])
Example #29
def add_to_datalad(topdir, studydir, msg, bids):
    """Do all necessary preparations (if were not done before) and save
    """
    import datalad.api as dl
    from datalad.api import Dataset
    from datalad.support.annexrepo import AnnexRepo
    from datalad.support.external_versions import external_versions
    assert external_versions['datalad'] >= MIN_VERSION, (
        "Need datalad >= {}".format(MIN_VERSION))  # add to reqs

    studyrelpath = op.relpath(studydir, topdir)
    assert not studyrelpath.startswith(op.pardir)  # so we are under
    # now we need to test and initiate a DataLad dataset all along the path
    curdir_ = topdir
    superds = None
    subdirs = [''] + [d for d in studyrelpath.split(op.sep) if d != os.curdir]
    for isubdir, subdir in enumerate(subdirs):
        curdir_ = op.join(curdir_, subdir)
        ds = Dataset(curdir_)
        if not ds.is_installed():
            lgr.info("Initiating %s", ds)
            # would require annex > 20161018 for correct operation on annex v6
            # need to add .gitattributes first anyways
            ds_ = dl.create(
                curdir_,
                dataset=superds,
                force=True,
                # initiate annex only at the bottom repository
                annex=isubdir == (len(subdirs) - 1),
                fake_dates=True,
                # shared_access='all',
            )
            assert ds == ds_
        assert ds.is_installed()
        superds = ds

    # TODO: we need a helper (in DataLad ideally) to ease adding such
    # specifications
    gitattributes_path = op.join(studydir, '.gitattributes')
    # We will just make sure that all our desired rules are present in it
    desired_attrs = """\
* annex.largefiles=(largerthan=100kb)
*.json annex.largefiles=nothing
*.txt annex.largefiles=nothing
*.tsv annex.largefiles=nothing
*.nii.gz annex.largefiles=anything
*.tgz annex.largefiles=anything
*_scans.tsv annex.largefiles=anything
"""
    if op.exists(gitattributes_path):
        with open(gitattributes_path, 'rb') as f:
            known_attrs = [
                line.decode('utf-8').rstrip() for line in f.readlines()
            ]
    else:
        known_attrs = []
    for attr in desired_attrs.split('\n'):
        if attr not in known_attrs:
            known_attrs.append(attr)
    with open(gitattributes_path, 'wb') as f:
        f.write('\n'.join(known_attrs).encode('utf-8'))

    # ds might have memories of having ds.repo GitRepo
    superds = Dataset(topdir)
    assert op.realpath(ds.path) == op.realpath(studydir)
    assert isinstance(ds.repo, AnnexRepo)
    # Add doesn't have all the options of save such as msg and supers
    ds.save(path=['.gitattributes'],
            message="Custom .gitattributes",
            to_git=True)
    dsh = dsh_path = None
    if op.lexists(op.join(ds.path, '.heudiconv')):
        dsh_path = op.join(ds.path, '.heudiconv')
        dsh = Dataset(dsh_path)
        if not dsh.is_installed():
            # Previously we did not have it as a submodule, and since no
            # automagic migration is implemented, we just need to check first
            # if any path under .heudiconv is already under git control
            if any(x.startswith('.heudiconv/') for x in ds.repo.get_files()):
                lgr.warning(
                    "%s has .heudiconv not as a submodule from previous"
                    " versions of heudiconv. No automagic migration is "
                    "yet provided", ds)
            else:
                dsh = ds.create(
                    path='.heudiconv',
                    force=True,
                    # shared_access='all'
                )
        # Since .heudiconv could contain sensitive information
        # we place all files under annex and then add
        if create_file_if_missing(op.join(dsh_path, '.gitattributes'),
                                  """* annex.largefiles=anything"""):
            ds.save(
                '.heudiconv/.gitattributes',
                to_git=True,
                message="Added gitattributes to place all .heudiconv content"
                " under annex")
    ds.save('.',
            recursive=True
            # not in effect! ?
            #annex_add_opts=['--include-dotfiles']
            )

    # TODO: filter for only changed files?
    # Provide metadata for sensitive information
    mark_sensitive(ds, 'sourcedata')
    mark_sensitive(ds, '*_scans.tsv')  # top level
    mark_sensitive(ds, '*/*_scans.tsv')  # within subj
    mark_sensitive(ds, '*/*/*_scans.tsv')  # within sess/subj
    mark_sensitive(ds, '*/anat')  # within subj
    mark_sensitive(ds, '*/*/anat')  # within ses/subj
    if dsh_path:
        mark_sensitive(ds, '.heudiconv')  # entire .heudiconv!
    superds.save(path=ds.path, message=msg, recursive=True)

    assert not ds.repo.dirty
    # TODO:  they are still appearing as native annex symlinked beasts
    """
Example #30
def get_bids_dataset():
    srcrepo = get_sourcerepo()
    bids_ds = Dataset(path=opj(srcrepo.path, 'datalad_neuroimaging', 'tests',
                               'data', 'bids'))
    if bids_ds.is_installed():
        return bids_ds
    try:
        import heudiconv
    except ImportError:
        raise SkipTest
    # make one
    bids_ds.create()
    # place dicoms in the mandated shadow tree
    structdicom_ds = bids_ds.install(source=get_dicom_dataset('structural'),
                                     path=opj('sourcedata', 'sub-02',
                                              'ses-structural'),
                                     reckless=True)
    funcdicom_ds = bids_ds.install(source=get_dicom_dataset('functional'),
                                   path=opj('sourcedata', 'sub-02',
                                            'ses-functional'),
                                   reckless=True)
    # dicom dataset is preconfigured for metadata extraction
    # XXX this is the slowest step of the entire procedure
    # reading 5k dicoms of the functional data
    bids_ds.aggregate_metadata(recursive=True)
    # pull subject ID from metadata
    res = bids_ds.metadata(funcdicom_ds.path,
                           reporton='datasets',
                           return_type='item-or-list',
                           result_renderer='disabled')
    subj_id = res['metadata']['dicom']['Series'][0]['PatientID']
    # prepare for incoming BIDS metadata that we will want to keep in
    # Git -- templates would be awesome!
    with open(opj(bids_ds.path, '.gitattributes'), 'a') as ga:
        # except for hand-picked global metadata, we want anything
        # to go into the annex to be able to retract files after
        # publication
        ga.write('** annex.largefiles=anything\n')
        for fn in ('CHANGES', 'README', 'dataset_description.json'):
            # but not these
            ga.write('{} annex.largefiles=nothing\n'.format(fn))
    bids_ds.add('.gitattributes',
                to_git=True,
                message='Initial annex entry configuration')
    ok_clean_git(bids_ds.path)
    # conversion of two DICOM datasets to one BIDS dataset
    for label, ds, scanlabel in (
            # structural needs to come first or else heudiconv
            # will try to rewrite the events.tsv for the functional
            # run, for some strange reason
        ('structural', structdicom_ds, 'anat'),
        ('functional', funcdicom_ds, 'func')):
        bids_ds.run(
            [
                'heudiconv',
                '-f',
                'reproin',
                # TODO fix DICOMs to not have a 'sub' prefix
                '-s',
                subj_id,
                '-c',
                'dcm2niix',
                # TODO decide on the fate of .heudiconv/
                # but ATM we need to (re)move it:
                # https://github.com/nipy/heudiconv/issues/196
                '-o',
                opj(bids_ds.path, '.git', 'stupid', label),
                '-b',
                '-a',
                bids_ds.path,
                '-l',
                '',
                # avoid gory details provided by dcmstack, we have them in
                # the aggregated DICOM metadata already
                '--minmeta',
                '--files',
                opj(ds.path, 'dicoms')
            ],
            message="DICOM conversion of {} scans".format(label))
        # remove unwanted stuff that cannot be disabled ATM
        # https://github.com/nipy/heudiconv/issues/195
        # TODO should be removed eventually
        bids_ds.remove([
            p for p in (opj('sourcedata', 'sub-02', scanlabel),
                        opj('sourcedata', 'README'))
            if op.lexists(opj(bids_ds.path, p))
        ],
                       check=False)

    bids_ds.config.add('datalad.metadata.nativetype',
                       'bids',
                       where='dataset',
                       reload=False)
    bids_ds.config.add('datalad.metadata.nativetype',
                       'nifti1',
                       where='dataset',
                       reload=True)
    # XXX need to `add` specifically to make it work in direct mode
    #bids_ds.save(message='Metadata type config')
    bids_ds.add('.', message='Metadata type config')
    # let go of the dicom datasets
    bids_ds.uninstall([structdicom_ds.path, funcdicom_ds.path], check=False)
    # no need for recursion, we already have the dicom dataset's
    # stuff on record
    bids_ds.aggregate_metadata(recursive=False, incremental=True)
    ok_clean_git(bids_ds.path)
    return bids_ds
Example #31
def test_demo_raw_ds(path, toolbox_url):

    ds = Dataset(path)

    with patch.dict('os.environ', {'DATALAD_HIRNI_TOOLBOX_URL': toolbox_url}):
        ds.create()  # TODO: Maybe move to ds.create(cfg_proc='hirni') in demo
        ds.run_procedure('cfg_hirni')

    # clean repo with an annex:
    assert_repo_status(ds.repo, annex=True)

    # README, dataset_description.json and studyspec.json at toplevel and in git
    for f in ['README', 'studyspec.json', 'dataset_description.json']:
        ok_file_under_git(ds.path, f, annexed=False)

    # toolbox installed under code/hirni-toolbox
    subs = ds.subdatasets()
    assert_result_count(subs, 1)
    assert_result_count(subs,
                        1,
                        path=op.join(ds.path, 'code', 'hirni-toolbox'))

    ds.hirni_import_dcm(
        'https://github.com/datalad/example-dicom-structural/archive/master.tar.gz',
        'acq1',
        anon_subject='001')

    # acquisition directory and studyspec created + subdataset 'dicoms' within the acquisition dir
    for f in [
            op.join(ds.path, 'acq1'),
            op.join(ds.path, 'acq1', 'studyspec.json'),
            op.join(ds.path, 'acq1', 'dicoms')
    ]:
        assert_true(op.exists(f))
    subs = ds.subdatasets()
    assert_result_count(subs, 2)
    assert_result_count(subs,
                        1,
                        path=op.join(ds.path, 'code', 'hirni-toolbox'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq1', 'dicoms'))

    # TODO: check actual spec? (Prob. sufficient to test for that in dedicated import-dcm/dcm2spec tests
    # TODO: check dicom metadata

    ds.hirni_import_dcm(
        'https://github.com/datalad/example-dicom-functional/archive/master.tar.gz',
        'acq2',
        anon_subject='001')

    # acquisition directory and studyspec created + subdataset 'dicoms' within the acquisition dir
    for f in [
            op.join(ds.path, 'acq2'),
            op.join(ds.path, 'acq2', 'studyspec.json'),
            op.join(ds.path, 'acq2', 'dicoms')
    ]:
        assert_true(op.exists(f))
    subs = ds.subdatasets()
    assert_result_count(subs, 3)
    assert_result_count(subs,
                        1,
                        path=op.join(ds.path, 'code', 'hirni-toolbox'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq1', 'dicoms'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq2', 'dicoms'))

    # Note from demo: The calls to `git annex addurl` and `datalad save` currently replace a single call to
    # `datalad download-url` due to a bug in that command.
    events_file = op.join('acq2', 'events.tsv')
    ds.repo.add_url_to_file(
        file_=events_file,
        url=
        'https://github.com/datalad/example-dicom-functional/raw/master/events.tsv'
    )
    ds.save(message="Added stimulation protocol for acquisition 2")

    ok_file_under_git(ds.path, events_file, annexed=True)

    ds.hirni_spec4anything(
        events_file,
        properties=
        '{"procedures": {"procedure-name": "copy-converter", "procedure-call": "bash {script} {{location}} '
        '{ds}/sub-{{bids-subject}}/func/sub-{{bids-subject}}_task-{{bids-task}}_run-{{bids-run}}_events.tsv'
        '"}, "type": "events_file"}')

    ok_file_under_git(ds.path,
                      op.join('acq2', 'studyspec.json'),
                      annexed=False)
    assert_repo_status(ds.repo, annex=True)
Example #32
File: api.py Project: hanke/datalad
class supers(SuprocBenchmarks):
    """
    Benchmarks on common operations on collections of datasets using datalad API
    """

    timeout = 3600
    # need to assure that we are working in a different repository now
    # see https://github.com/datalad/datalad/issues/1512
    # might not be sufficient due to side effects between tests and
    # thus getting into the same situation
    ds_count = 0
    def setup_cache(self):
        # creating in CWD so things get removed when ASV is done
        ds_path = create_test_dataset("testds1", spec='2/-2/-2', seed=0)[0]
        # Will store into a tarfile since otherwise install -r is way too slow
        # to be invoked for every benchmark
        tarfile_path = opj(osp.dirname(ds_path), 'testds1.tar')
        with tarfile.open(tarfile_path, "w") as tar:
            # F.CK -- Python tarfile can't later extract those because key dirs are
            # read-only.  For now just a workaround - make it all writeable
            from datalad.utils import rotree
            rotree('testds1', ro=False, chmod_files=False)
            tar.add('testds1', recursive=True)
        rmtree('testds1')

        return tarfile_path

    def setup(self, tarfile_path):
        import tarfile
        tempdir = osp.dirname(tarfile_path)
        with tarfile.open(tarfile_path) as tar:
            tar.extractall(tempdir)

        # TODO -- remove this abomination after https://github.com/datalad/datalad/issues/1512 is fixed
        epath = opj(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        print("Finished setup for %s" % tempdir)

    def teardown(self, tarfile_path):
        for path in [self.ds.path + '_', self.ds.path]:
            print("Cleaning up %s" % path)
            if osp.exists(path):
                rmtree(path)

    def time_installr(self, tarfile_path):
        # somewhat duplicating setup but lazy to do different one for now
        assert install(self.ds.path + '_', source=self.ds.path, recursive=True)

    def time_createadd(self, tarfile_path):
        assert self.ds.create('newsubds')

    def time_createadd_to_dataset(self, tarfile_path):
        subds = create(opj(self.ds.path, 'newsubds'))
        self.ds.add(subds.path)

    def time_ls(self, tarfile_path):
        ls(self.ds.path)

    def time_ls_recursive(self, tarfile_path):
        ls(self.ds.path, recursive=True)

    def time_ls_recursive_long_all(self, tarfile_path):
        ls(self.ds.path, recursive=True, long_=True, all_=True)

    # TODO: since doesn't really allow to uninstall top level ds... bleh ;)
    #def time_uninstall(self, tarfile_path):
    #    uninstall(self.ds.path, recursive=True)

    def time_remove(self, tarfile_path):
        remove(self.ds.path, recursive=True)
Example #33
def test_basics(src, dst):
    # dataset with subdataset, no specific configuration
    ds = Dataset(src).create()
    (ds.pathobj / 'file1').write_text('some')
    ds.save()
    sub = ds.create('subds')
    # second one for a result_xfm test below
    ds.create('subds2')
    eq_(sub.config.get('datalad.metadata.nativetype'), None)

    # now clone the super
    clone = install(source=src, path=dst)
    # and configure it such that each subdataset obtained on install
    # gets 'bids' listed as a metadata type
    clone.config.set(
        'datalad.result-hook.alwaysbids.call-json',
        # string substitutions based on the result record are supported
        'run_procedure {{"dataset":"{path}","spec":"cfg_metadatatypes bids"}}',
        where='local',
    )
    # configure which kind of results this hook should operate on
    clone.config.set(
        'datalad.result-hook.alwaysbids.match-json',
        # any successfully installed dataset
        '{"type":"dataset","action":"install","status":["eq", "ok"]}',
        where='local',
    )
    # a smoke test to see if a hook definition without any call args works too
    clone.config.set('datalad.result-hook.wtf.call-json', 'wtf', where='local')
    clone.config.set(
        'datalad.result-hook.wtf.match-json',
        '{"type":"dataset","action":"install","status":["eq", "ok"]}',
        where='local',
    )
    # configure another one that will unlock any obtained file
    # {dsarg} is substituted with the dataset argument of the command that
    # the eval_func() decorator belongs to.
    # That command may not have one, because this is the verbatim input,
    # not the outcome of a require_dataset() call;
    # {refds} could therefore be the more useful substitution here.
    clone.config.set(
        'datalad.result-hook.unlockfiles.call-json',
        'unlock {{"dataset":"{dsarg}","path":"{path}"}}',
        where='local',
    )
    clone.config.set(
        'datalad.result-hook.unlockfiles.match-json',
        '{"type":"file","action":"get","status":"ok"}',
        where='local',
    )
    if not on_windows:
        # and one that runs a shell command on any notneeded file-get
        clone.config.set(
            'datalad.result-hook.annoy.call-json',
            'run {{"cmd":"touch {path}_annoyed",'
            '"dataset":"{dsarg}","explicit":true}}',
            where='local',
        )
        clone.config.set(
            'datalad.result-hook.annoy.match-json',
            '{"type":["in", ["file"]],"action":"get","status":"notneeded"}',
            where='local',
        )
    # setup done, now see if it works
    clone.get('subds')
    clone_sub = Dataset(clone.pathobj / 'subds')
    eq_(clone_sub.config.get('datalad.metadata.nativetype'), 'bids')
    # now the same thing with a result_xfm, should make no difference
    clone.get('subds2')
    clone_sub2 = Dataset(clone.pathobj / 'subds2')
    eq_(clone_sub2.config.get('datalad.metadata.nativetype'), 'bids')

    # hook auto-unlocks the file
    if not on_windows:
        ok_((clone.pathobj / 'file1').is_symlink())
    res = clone.get('file1')
    if not on_windows:
        # we get to see the results from the hook too!
        assert_result_count(
            res, 1, action='unlock', path=str(clone.pathobj / 'file1'))
    ok_(not (clone.pathobj / 'file1').is_symlink())

    if not on_windows:
        # a different hook places an 'annoyed' marker file next to a file
        # that was already present
        annoyed_file = clone.pathobj / 'file1_annoyed'
        ok_(not annoyed_file.exists())
        clone.get('file1')
        ok_(annoyed_file.exists())
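
The match-json values above combine plain equality (e.g. "action": "get") with operator pairs such as ["eq", "ok"] and ["in", ["file"]]. A simplified matcher that mimics just those two forms, to make the semantics concrete (this is a sketch, not DataLad's actual hook machinery):

import json


def result_matches(match_json, result):
    """Illustrative only: mimic the match forms used in the test above."""
    spec = json.loads(match_json)
    for key, expected in spec.items():
        value = result.get(key)
        if isinstance(expected, list) and len(expected) == 2 \
                and expected[0] in ("eq", "in"):
            op, arg = expected
            if op == "eq" and value != arg:
                return False
            if op == "in" and value not in arg:
                return False
        elif value != expected:
            return False
    return True


assert result_matches(
    '{"type":"dataset","action":"install","status":["eq", "ok"]}',
    {"type": "dataset", "action": "install", "status": "ok"})
assert result_matches(
    '{"type":["in", ["file"]],"action":"get","status":"notneeded"}',
    {"type": "file", "action": "get", "status": "notneeded"})
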
Example #34
0
def test_aggregation(path=None):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing, only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
    assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  scope='branch')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     scope='branch')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        scope='branch')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get a success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_in_results(res, action='save', status="ok")
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # the mother dataset also reports the layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                    == ensure_unicode(name) for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # search functionality proper is covered in the search tests, not here
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
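
_compare_metadata_helper is defined elsewhere in the test module and not shown here. A hypothetical sketch of the kind of comparison such a helper might perform, using only the result fields visible above (an assumption, not the suite's actual helper):

import os.path as op


def compare_metadata_sketch(origin_results, origin_ds, clone):
    # Hypothetical helper (assumption): for every dataset-level result from
    # the origin, query the clone at the same relative path and expect an
    # identical metadata payload.
    for r in origin_results:
        if r.get('type') != 'dataset':
            continue
        relpath = op.relpath(r['path'], origin_ds.path)
        query_path = None if relpath == '.' else relpath
        clone_res = clone.metadata(query_path, return_type='list')
        assert any(
            c.get('metadata') == r.get('metadata') for c in clone_res), relpath
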
Example #35
0
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing, only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
    assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get a success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # the mother dataset also reports the layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                    == assure_unicode(name) for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # search functionality proper is covered in the search tests, not here
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])