def test_add_readme(path):
    ds = Dataset(path).create(force=True)
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    assert_status('ok', ds.plugin('add_readme'))
    # should use default name
    eq_(
        open(opj(path, 'README.md')).read(),
        """\
# Dataset "demo_ds"

this is for play

### Authors

- Betty
- Tom

### License

PDDL

## General information

This is a DataLad dataset (id: {id}).

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(id=ds.id))
    # should skip on re-run
    assert_status('notneeded', ds.plugin('add_readme'))


def test_minimal(path):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.add('.')
    ok_clean_git(ds.path)
    # make sure essential metadata files are annexed for this test:
    # we want to drop them later and still do the conversion
    assert_true(
        ds.repo.is_under_annex(
            ['participants.tsv', 'dataset_description.json']))
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # do conversion
    # where output should appear by default
    target_path = opj(path, 'scidata_isatab_{}'.format(ds.repo.get_hexsha()))
    with chpwd(path):
        assert_status(
            'ok',
            ds.plugin(
                'bids2scidata',
                repo_name="dummy",
                repo_accession='ds1',
                repo_url='http://example.com',
            ))
    # just a few basic sanity tests that info ends up in the right places
    # a proper test should be a full regression test on a real dataset
    # with hand-validated exported metadata

    # investigator info
    invest = open(opj(target_path, 'i_Investigation.txt')).read()
    assert_in('Betty\tTom', invest)
    assert_in('Study Assay File Name\ta_mri_t1w.txt\ta_mri_bold.txt', invest)
    assert_in(
        'Comment[Data Repository]\tdummy\n'
        'Comment[Data Record Accession]\tds1\n'
        'Comment[Data Record URI]\thttp://example.com',
        invest)
    # study table
    assert_equal(
        """\
Source Name\tCharacteristics[organism]\tCharacteristics[organism part]\tProtocol REF\tSample Name\tCharacteristics[sex]\tCharacteristics[age at scan]\tCharacteristics[handedness]
01\thomo sapiens\tbrain\tParticipant recruitment\t01\tm\t30\tr
15\thomo sapiens\tbrain\tParticipant recruitment\t15\tf\t35\tl
""",
        open(opj(target_path, 's_study.txt')).read())
    # assay tables
    assert_equal(
        """\
Sample Name\tProtocol REF\tParameter Value[modality]\tAssay Name\tRaw Data File\tComment[Data Repository]\tComment[Data Record Accession]\tComment[Data Record URI]\tFactor Value[task]
sub-15\tMagnetic Resonance Imaging\tbold\tsub-15_task-nix_run-1\tsub-15/func/sub-15_task-nix_run-1_bold.nii.gz\tdummy\tds1\thttp://example.com\tnix
""",
        open(opj(target_path, 'a_mri_bold.txt')).read())
    assert_equal(
        """\
Sample Name\tProtocol REF\tParameter Value[modality]\tAssay Name\tRaw Data File\tComment[Data Repository]\tComment[Data Record Accession]\tComment[Data Record URI]
sub-01\tMagnetic Resonance Imaging\tT1w\tsub-01\tsub-01/anat/sub-01_T1w.nii.gz\tdummy\tds1\thttp://example.com
""",
        open(opj(target_path, 'a_mri_t1w.txt')).read())


def test_dicom(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'dicom', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'dicom.dcm'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    # query for the file metadata
    res = ds.metadata('dicom.dcm')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['dicom']
    assert_in('@context', meta)
    # no point in testing ALL keys, but we got plenty
    assert len(meta.keys()) > 70
    eq_(meta['SeriesDate'], '20070205')
    # now ask for the dataset metadata, which should have both the unique props
    # and a list of image series (one in this case, but still a list)
    res = ds.metadata(reporton='datasets')
    assert_result_count(res, 1)
    dsmeta = res[0]['metadata']['dicom']
    # same context
    assert_dict_equal(meta['@context'], dsmeta['@context'])
    meta.pop('@context')
    eq_(dsmeta['Series'], [meta])
    # for this artificial case pretty much the same info also comes out as
    # unique props, but wrapped in lists
    ucp = res[0]['metadata']["datalad_unique_content_properties"]['dicom']
    assert_dict_equal(
        {k: [v]
         for k, v in dsmeta['Series'][0].items()
         if k not in DicomExtractor._unique_exclude and k in ucp},
        {k: v
         for k, v in ucp.items()
         if k not in DicomExtractor._unique_exclude})
    # buuuut, if we switch off file-based metadata storage
    ds.config.add('datalad.metadata.aggregate-content-dicom', 'false',
                  where='dataset')
    ds.aggregate_metadata()
    res = ds.metadata(reporton='datasets')
    # the auto-uniquified bits are gone but the Series description stays
    assert_not_in("datalad_unique_content_properties", res[0]['metadata'])
    eq_(dsmeta['Series'], [meta])


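# Note for readers of this excerpt: these tests rely on module-level imports
# that are not shown here. A rough sketch of what the import block presumably
# looks like -- the exact module paths are assumptions based on the usual
# datalad/datalad-neuroimaging layout, not verified against the source:
#
#   import logging
#   import os
#   import os.path as op
#   from difflib import unified_diff
#   from os import makedirs
#   from os.path import dirname, join as opj
#   from shutil import copy
#
#   from datalad.api import Dataset, aggregate_metadata, install
#   from datalad.utils import chpwd, swallow_logs, swallow_outputs
#   from datalad.tests.utils import (
#       assert_dict_equal, assert_in, assert_not_in, assert_result_count,
#       assert_status, assert_true, eq_, ok_clean_git,
#   )
#   from datalad_neuroimaging.extractors.dicom import (
#       MetadataExtractor as DicomExtractor,
#   )

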
def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)
    # compare the full expected metadata set to catch any change of mind on
    # the side of the mutagen package -- but not the bitrate, which is too
    # variable an estimate across decoders
    res[0]['metadata'].pop("comment<bitrate>", None)
    assert_result_count(
        res, 1,
        metadata={
            "format": "mime:audio/mp3",
            "duration(s)": 1.0,
            "name": "dltracktitle",
            "music:album": "dlalbumtitle",
            "music:artist": "dlartist",
            "music:channels": 1,
            "music:sample_rate": 44100,
            "music:Genre": "dlgenre",
            "comment<date>": "",
            "comment<tracknumber>": "dltracknumber",
        })


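# The test_audio variants below iterate over a module-level `target` dict that
# is not part of this excerpt. Judging from the inline expectations above, it
# plausibly looks like this sketch (key names and values are an assumption,
# not the verified fixture):
#
#   target = {
#       'format': 'mime:audio/mp3',
#       'duration(s)': 1.0,
#       'name': 'dltracktitle',
#       'music:album': 'dlalbumtitle',
#       'music:artist': 'dlartist',
#       'music:channels': 1,
#       'music:sample_rate': 44100,
#       'music:Genre': 'dlgenre',
#   }

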
def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets',
        return_type='item-or-list')['metadata']['datalad_unique_content_properties']
    # the test file has it, but uniques have it blanked out, because the
    # extractor considers it worthless for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # the 'date' field carries no value, hence gets excluded from the
    # unique report
    assert_in('date', meta)
    assert not meta['date']
    assert_not_in('date', uniques['audio'])


def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets',
        return_type='item-or-list')['metadata']['datalad_unique_content_properties']
    # the test file has it, but uniques have it blanked out, because the
    # extractor considers it worthless for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # the 'date' field carries no value, hence gets excluded from the
    # unique report
    assert_in('date', meta)
    assert not meta['date']
    assert_not_in('date', uniques['audio'])


def test_xmp(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'xmp', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'xmp.pdf'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('xmp.pdf')
    assert_result_count(res, 1)
    # compare the full expected metadata set to catch any change of mind on
    # the side of the XMP library
    assert_result_count(
        res, 1,
        metadata={
            'dc:creator': 'Michael Hanke',
            'dc:description': 'dlsubject',
            'dc:description<?xml:lang>': 'x-default',
            'dc:title': 'dltitle',
            'dc:title<?xml:lang>': 'x-default',
            'pdfaid:part': '1',
            'pdfaid:conformance': 'A',
            'pdf:Keywords': 'dlkeyword1 dlkeyword2',
            'pdf:Producer': 'LibreOffice 5.2',
            'xmp:CreateDate': '2017-10-08T10:27:06+02:00',
            'xmp:CreatorTool': 'Writer',
        })


def test_nidm(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'nidm', where='dataset')
    # imagine filling the dataset up with something that NIDM info could be
    # extracted from
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'files',
            'nifti1.nii.gz'),
        path)
    ds.save()
    # all nice and tidy, nothing untracked
    ok_clean_git(ds.path)
    # engage the extractor(s)
    res = ds.aggregate_metadata()
    # aggregation done without whining
    assert_status('ok', res)
    res = ds.metadata(reporton='datasets')
    # ATM we do not foresee file-based metadata to come back from NIDM
    assert_result_count(res, 1)
    # kill version info
    core = res[0]['metadata']['datalad_core']
    core.pop('version', None)
    core.pop('refcommit')
    # show the full structure of the assembled metadata from demo content
    target_metadata = {
        "@context": {
            "@vocab": "http://docs.datalad.org/schema_v2.0.json"
        },
        "datalad_core": {
            "@id": ds.id
        },
        "nidm": {
            "@context": {
                "mydurationkey": {
                    "@id": "time:Duration"
                },
                "myvocabprefix": {
                    "@id": "http://purl.org/ontology/mydefinition",
                    "description": "I am a vocabulary",
                    "type": "http://purl.org/dc/dcam/VocabularyEncodingScheme"
                }
            },
            "mydurationkey": 0.6
        }
    }
    if datalad_extracts_annex_key:
        target_metadata['datalad_unique_content_properties'] = {
            "annex": {
                "key": [
                    "MD5E-s15920--acfb708aa74951cfff1a9d466f6d77be.nii.gz"
                ]
            }
        }
    assert_result_count(res, 1, metadata=target_metadata)


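# `datalad_extracts_annex_key` above is not defined in this excerpt. It is
# presumably a module-level guard for datalad versions that also aggregate the
# annex key per file; a minimal sketch under that assumption (the exact
# version threshold here is made up for illustration):
#
#   from datalad.support.external_versions import external_versions
#   datalad_extracts_annex_key = external_versions['datalad'] >= '0.12'

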
def test_exif(path):
    ds = Dataset(path).rev_create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.rev_save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['exif']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)


def test_exif(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['exif']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)


def test_xmp(path=None):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'xmp', scope='branch')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'xmp.pdf'),
        path)
    ds.save()
    assert_repo_status(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('xmp.pdf')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['xmp']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)


def test_nidm(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'nidm', where='dataset')
    # imagine filling the dataset up with something that NIDM info could be
    # extracted from
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data',
            'nifti1.nii.gz'),
        path)
    ds.add('.')
    # all nice and tidy, nothing untracked
    ok_clean_git(ds.path)
    # engage the extractor(s)
    res = ds.aggregate_metadata()
    # aggregation done without whining
    assert_status('ok', res)
    res = ds.metadata(reporton='datasets')
    # ATM we do not foresee file-based metadata to come back from NIDM
    assert_result_count(res, 1)
    # show the full structure of the assembled metadata from demo content
    assert_result_count(
        res, 1,
        metadata={
            "@context": {
                "@vocab": "http://docs.datalad.org/schema_v2.0.json"
            },
            "datalad_core": {
                "@id": ds.id
            },
            "nidm": {
                "@context": {
                    "mydurationkey": {
                        "@id": "time:Duration"
                    },
                    "myvocabprefix": {
                        "@id": "http://purl.org/ontology/mydefinition",
                        "description": "I am a vocabulary",
                        "type": "http://purl.org/dc/dcam/VocabularyEncodingScheme"
                    }
                },
                "mydurationkey": 0.6
            }
        })


def test_image(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'image', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    assert_result_count(
        res, 1,
        metadata={
            "dcterms:SizeOrDuration": [4, 3],
            "color_mode": "3x8-bit pixels, true color",
            "type": "dctype:Image",
            "spatial_resolution(dpi)": [72, 72],
            "format": "JPEG (ISO 10918)"
        })


def test_nifti(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data',
            'nifti1.nii.gz'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('nifti1.nii.gz')
    assert_result_count(res, 1)
    assert_result_count(
        res, 1,
        metadata={
            "description": "FSL5.0",
            "spatial_resolution(mm)": [2.0, 2.0, 2.0],
            "temporal_spacing(s)": 6.0,
            "nifti1:datatype": "int16",
            "nifti1:dim": [4, 91, 109, 91, 2, 1, 1, 1],
            "nifti1:pixdim": [-1.0, 2.0, 2.0, 2.0, 6.0, 1.0, 1.0, 1.0],
            "nifti1:xyz_unit": "millimiter (uo:0000016)",
            "nifti1:t_unit": "second (uo:0000010)",
            "nifti1:cal_min": 3000.0,
            "nifti1:cal_max": 8000.0,
            "nifti1:toffset": 0.0,
            "nifti1:vox_offset": 0.0,
            "nifti1:intent": "none",
            "nifti1:sizeof_hdr": 348,
            "nifti1:magic": "n+1",
            "nifti1:sform_code": "mni",
            "nifti1:qform_code": "mni",
            "nifti1:freq_axis": None,
            "nifti1:phase_axis": None,
            "nifti1:slice_axis": None,
            "nifti1:slice_start": 0,
            "nifti1:slice_duration": 0.0,
            "nifti1:slice_order": "unknown",
            "nifti1:slice_end": 0,
        })


def test_within_ds_file_search(path):
    try:
        import nibabel
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('nifti1.nii.gz',
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')),
            ('nifti1.nii.gz',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', 'files', src),
            opj(path, dst))
    ds.save()
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('bids', 'nifti1'):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.key
bids.BIDSVersion
bids.author
bids.citation
bids.conformsto
bids.datatype
bids.description
"""
    if external_versions['bids'] >= '0.9':
        target_out += "bids.extension\n"
    target_out += """\
bids.fundedby
bids.license
bids.name
bids.subject.age(years)
bids.subject.gender
bids.subject.handedness
bids.subject.hearing_problems_current
bids.subject.id
bids.subject.language
bids.suffix
bids.task
datalad_core.id
datalad_core.refcommit
id
nifti1.cal_max
nifti1.cal_min
nifti1.datatype
nifti1.description
nifti1.dim
nifti1.freq_axis
nifti1.intent
nifti1.magic
nifti1.phase_axis
nifti1.pixdim
nifti1.qform_code
nifti1.sform_code
nifti1.sizeof_hdr
nifti1.slice_axis
nifti1.slice_duration
nifti1.slice_end
nifti1.slice_order
nifti1.slice_start
nifti1.spatial_resolution(mm)
nifti1.t_unit
nifti1.temporal_spacing(s)
nifti1.toffset
nifti1.vox_offset
nifti1.xyz_unit
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump,
        # so we will use diff
        diff = list(unified_diff(target_out.splitlines(), cmo.out.splitlines()))
        assert_in(target_out, cmo.out, msg="Diff: %s" % os.linesep.join(diff))

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            # multi-word query implies AND
            ('textblob',
             ['bold', 'female'],
             opj('sub-03', 'func', 'sub-03_task-some_bold.nii.gz'),
             'meta', 'female'),
            # report which field matched with auto-field
            ('autofield',
             'female',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'),
             'bids.subject.gender', 'female'),
            # autofield multi-word query is also AND
            ('autofield',
             ['bids.suffix:bold', 'bids.subject.id:01'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'bids.suffix', 'bold'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (but could be
            # configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always has a file and the dataset, because they carry
            # metadata in the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from,
                # critical for discovering related content
                dsid=ds.id)
        assert_result_count(res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)


def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             'meta', 'mp3'),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            # XXX the next one is not supported by the current text field
            # analyzer, which decomposes the mime type into [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (but could be
            # configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always has a file and the dataset, because they carry
            # metadata in the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from,
                # critical for discovering related content
                dsid=ds.id)
        assert_result_count(res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)


def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override the default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # if it is not under annex, the addition of metadata below silently
    # does nothing
    list(ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'}))
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep',
             ':mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above; the leading : is stripped, it indicates
            # "ALL FIELDS"
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with an AND condition
            # get both matches
            ('egrep',
             ['mp3', 'type:file'],
             opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case-insensitive search
            ('egrep',
             'MP3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep',
             r'audio\.+:mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX the next one is not supported by the current text field
            # analyzer, which decomposes the mime type into [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from,
            # critical for discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values,
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test that a suggestion msg is logged if there are no hits and the key
    # is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)


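# For orientation: the egrep-mode queries exercised above map directly onto
# the command line, since datalad generates CLI options from the Python
# parameters. An illustrative (untested) shell equivalent of the field query:
#
#   datalad search --mode egrep 'audio.format:mp3'

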
def test_exif(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    # compare the full expected metadata set to catch any change of mind on
    # the side of the EXIF library
    assert_result_count(
        res, 1,
        metadata={
            "exif:InteroperabilityVersion": "[48, 49, 48, 48]",
            "exif:ExifVersion": 221.0,
            "exif:FocalLengthIn35mmFilm": 38.0,
            "exif:CompressedBitsPerPixel": 5.0,
            "exif:GainControl": "None",
            "exif:Compression": "JPEG (old-style)",
            "exif:PrintIM": "[80, 114, 105, 110, 116, 73, 77, 0, 48, 51, 48, 48, 0, 0, 0, 5, 0, 1, 0, 22, 0, 22, 0, 2, 1, 0, 0, 0, 1, 0, 5, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 16, 131, 0, 0, 0]",
            "exif:Make": "CASIO COMPUTER CO.,LTD.",
            "exif:Sharpness": "Normal",
            "exif:Contrast": "Normal",
            "exif:ColorSpace": "sRGB",
            "exif:ExposureMode": "Auto Exposure",
            "exif:ExposureBiasValue": 0.0,
            "exif:ExifImageWidth": 4.0,
            "exif:ComponentsConfiguration": "YCbCr",
            "exif:DateTimeOriginal": "2011:03:13 16:36:02",
            "exif:MaxApertureValue": "14/5",
            "exif:DateTime": "2017:10:08 10:21:03",
            "exif:InteroperabilityOffset": 30412.0,
            "exif:InteroperabilityIndex": "R98",
            "exif:FileSource": "Digital Camera",
            "exif:ResolutionUnit": "Pixels/Inch",
            "exif:FNumber": "27/10",
            "exif:ExposureProgram": "Program Normal",
            "exif:DigitalZoomRatio": "0/0",
            "exif:LightSource": "Unknown",
            "exif:ExifImageLength": 3.0,
            "exif:FlashPixVersion": 100.0,
            "exif:CustomRendered": "Normal",
            "exif:Flash": "Flash fired, auto mode",
            "exif:WhiteBalance": "Auto",
            "exif:Orientation": "Horizontal (normal)",
            "exif:ExposureTime": "1/60",
            "exif:Software": "GIMP 2.8.20",
            "exif:Model": "EX-S600",
            "exif:FocalLength": "31/5",
            "exif:SceneCaptureType": "Standard",
            "exif:ExifOffset": 272.0,
            "exif:Saturation": "Normal",
            "exif:YCbCrPositioning": "Centered",
            "exif:DateTimeDigitized": "2011:03:13 16:36:02",
            "exif:XResolution": 72.0,
            "exif:YResolution": 72.0,
            "exif:MeteringMode": "Pattern",
        })


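# The earlier test_exif variants iterate over a module-level `target` dict
# that this excerpt does not include. It presumably carries the same
# expectations as the inline dict above, e.g. (abbreviated sketch, assumed):
#
#   target = {
#       "exif:Make": "CASIO COMPUTER CO.,LTD.",
#       "exif:Model": "EX-S600",
#       "exif:XResolution": 72.0,
#       "exif:YResolution": 72.0,
#       ...
#   }

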
def test_within_ds_file_search(path):
    try:
        import nibabel
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('audio.mp3', opj('stim', 'stim1.mp3')),
            ('nifti1.nii.gz',
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')),
            ('nifti1.nii.gz',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio', 'bids', 'nifti1'):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys=True, mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
audio.bitrate
audio.date
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
bids.BIDSVersion
bids.author
bids.citation
bids.conformsto
bids.description
bids.fundedby
bids.license
bids.modality
bids.name
bids.participant.age(years)
bids.participant.gender
bids.participant.handedness
bids.participant.hearing_problems_current
bids.participant.id
bids.participant.language
bids.subject
bids.task
bids.type
id
nifti1.cal_max
nifti1.cal_min
nifti1.datatype
nifti1.description
nifti1.dim
nifti1.freq_axis
nifti1.intent
nifti1.magic
nifti1.phase_axis
nifti1.pixdim
nifti1.qform_code
nifti1.sform_code
nifti1.sizeof_hdr
nifti1.slice_axis
nifti1.slice_duration
nifti1.slice_end
nifti1.slice_order
nifti1.slice_start
nifti1.spatial_resolution(mm)
nifti1.t_unit
nifti1.temporal_spacing(s)
nifti1.toffset
nifti1.vox_offset
nifti1.xyz_unit
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys=True)
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             'meta', 'mp3'),
            # multi-word query implies AND
            ('textblob',
             ['bold', 'male'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'meta', 'male'),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            ('autofield',
             'female',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'),
             'bids.participant.gender', 'female'),
            # autofield multi-word query is also AND
            ('autofield',
             ['bids.type:bold', 'bids.participant.id:01'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'bids.type', 'bold'),
            # XXX the next one is not supported by the current text field
            # analyzer, which decomposes the mime type into [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (but could be
            # configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always has a file and the dataset, because they carry
            # metadata in the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from,
                # critical for discovering related content
                dsid=ds.id)
        assert_result_count(res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)


def test_dicom(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'dicom', where='dataset')
    copy(
        op.join(op.dirname(op.dirname(op.dirname(__file__))),
                'tests', 'data', 'files', 'dicom.dcm'),
        path)
    ds.save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    # query for the file metadata
    res = ds.metadata('dicom.dcm')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['dicom']
    assert_in('@context', meta)
    # no point in testing ALL keys, but we got plenty
    assert len(meta.keys()) > 70
    eq_(meta['SeriesDate'], '20070205')
    # actually a tricky one of the dcm.multival.MultiValue type,
    # which we should extract as a list
    # https://github.com/datalad/datalad-neuroimaging/issues/49
    eq_(meta['ImageType'], ['ORIGINAL', 'PRIMARY', 'EPI', 'NONE'])
    # make sure we have PatientName -- this is not using a basic data type,
    # but dicom.valuerep.PersonName3 -- conversion should have handled that;
    # we can only test if the key is there, the source dicom has an empty
    # string as value
    eq_(meta['PatientName'], '')
    # now ask for the dataset metadata, which should have both the unique props
    # and a list of image series (one in this case, but still a list)
    res = ds.metadata(reporton='datasets')
    assert_result_count(res, 1)
    dsmeta = res[0]['metadata']['dicom']
    # same context
    assert_dict_equal(meta['@context'], dsmeta['@context'])
    meta.pop('@context')
    seriesmeta = dsmeta['Series']
    eq_(seriesmeta[0].pop('SeriesDirectory'), op.curdir)
    eq_(dsmeta['Series'], [meta])
    # for this artificial case pretty much the same info also comes out as
    # unique props, but wrapped in lists
    ucp = res[0]['metadata']["datalad_unique_content_properties"]['dicom']
    assert_dict_equal(
        {k: [v]
         for k, v in dsmeta['Series'][0].items()
         if k not in DicomExtractor._unique_exclude and k in ucp},
        {k: v
         for k, v in ucp.items()
         if k not in DicomExtractor._unique_exclude})
    # buuuut, if we switch off file-based metadata storage
    ds.config.add('datalad.metadata.aggregate-content-dicom', 'false',
                  where='dataset')
    ds.aggregate_metadata()
    res = ds.metadata(reporton='datasets')
    if not datalad_extracts_annex_key:
        # the auto-uniquified bits are gone but the Series description stays
        assert_not_in("datalad_unique_content_properties", res[0]['metadata'])
    eq_(dsmeta['Series'], [meta])


def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing, and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
        assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage',
                        where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success reports for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports the layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # now that we have annex.key
    # three different IDs
    assert_equal(
        3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] ==
                 assure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone; should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])


def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True)
    # we get success reports for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 7)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 4, type='file')
    # three different IDs
    assert_equal(
        3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['name'] == assure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 3)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 2, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone; should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # query smoke test
    assert_result_count(clone.search('mother*'), 1)
    assert_result_count(clone.search('MoTHER*'), 1)

    child_res = clone.search('*child*')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['metadata']['type'] == 'dataset':
            eq_(r['query_matched']['name'], r['metadata']['name'])

    # test 'and' for multiple search entries
    assert_result_count(clone.search(['*child*', '*bids*']), 2)
    assert_result_count(clone.search(['*child*', '*subsub*']), 1)
    assert_result_count(clone.search(['*bids*', '*sub*']), 2)
    assert_result_count(clone.search(['*', 'type:dataset']), 3)


def get_bids_dataset():
    srcrepo = get_sourcerepo()
    bids_ds = Dataset(path=opj(
        srcrepo.path, 'datalad_neuroimaging', 'tests', 'data', 'bids'))
    if bids_ds.is_installed():
        return bids_ds
    try:
        import heudiconv
    except ImportError:
        raise SkipTest
    # make one
    bids_ds.create()
    # place dicoms in the mandated shadow tree
    structdicom_ds = bids_ds.install(
        source=get_dicom_dataset('structural'),
        path=opj('sourcedata', 'sub-02', 'ses-structural'),
        reckless=True)
    funcdicom_ds = bids_ds.install(
        source=get_dicom_dataset('functional'),
        path=opj('sourcedata', 'sub-02', 'ses-functional'),
        reckless=True)
    # the dicom dataset is preconfigured for metadata extraction
    # XXX this is the slowest step of the entire procedure:
    # reading 5k dicoms of the functional data
    bids_ds.aggregate_metadata(recursive=True)
    # pull the subject ID from the metadata
    res = bids_ds.metadata(
        funcdicom_ds.path, reporton='datasets', return_type='item-or-list',
        result_renderer='disabled')
    subj_id = res['metadata']['dicom']['Series'][0]['PatientID']
    # prepare for incoming BIDS metadata that we will want to keep in
    # Git -- templates would be awesome!
    with open(opj(bids_ds.path, '.gitattributes'), 'a') as ga:
        # except for hand-picked global metadata, we want anything
        # to go into the annex to be able to retract files after
        # publication
        ga.write('** annex.largefiles=anything\n')
        for fn in ('CHANGES', 'README', 'dataset_description.json'):
            # but not these
            ga.write('{} annex.largefiles=nothing\n'.format(fn))
    bids_ds.add('.gitattributes', to_git=True,
                message='Initial annex entry configuration')
    ok_clean_git(bids_ds.path)
    # conversion of two DICOM datasets to one BIDS dataset
    for label, ds, scanlabel in (
            # structural needs to come first, or else heudiconv
            # will try to rewrite the events.tsv for the functional
            # run, for some strange reason
            ('structural', structdicom_ds, 'anat'),
            ('functional', funcdicom_ds, 'func')):
        bids_ds.run(
            [
                'heudiconv',
                '-f', 'reproin',
                # TODO fix DICOMs to not have a 'sub' prefix
                '-s', subj_id,
                '-c', 'dcm2niix',
                # TODO decide on the fate of .heudiconv/
                # but ATM we need to (re)move it:
                # https://github.com/nipy/heudiconv/issues/196
                '-o', opj(bids_ds.path, '.git', 'stupid', label),
                '-b',
                '-a', bids_ds.path,
                '-l', '',
                # avoid gory details provided by dcmstack, we have them in
                # the aggregated DICOM metadata already
                '--minmeta',
                '--files', opj(ds.path, 'dicoms')
            ],
            message="DICOM conversion of {} scans".format(label))
        # remove unwanted stuff that cannot be disabled ATM
        # https://github.com/nipy/heudiconv/issues/195
        # TODO should be removed eventually
        bids_ds.remove(
            [p for p in (opj('sourcedata', 'sub-02', scanlabel),
                         opj('sourcedata', 'README'))
             if op.lexists(opj(bids_ds.path, p))],
            check=False)

    bids_ds.config.add('datalad.metadata.nativetype', 'bids',
                       where='dataset', reload=False)
    bids_ds.config.add('datalad.metadata.nativetype', 'nifti1',
                       where='dataset', reload=True)
    # XXX need to `add` specifically to make it work in direct mode
    #bids_ds.save(message='Metadata type config')
    bids_ds.add('.', message='Metadata type config')

    # let go of the dicom datasets again
    bids_ds.uninstall([structdicom_ds.path, funcdicom_ds.path], check=False)

    # no need for recursion, we already have the dicom dataset's
    # stuff on record
    bids_ds.aggregate_metadata(recursive=False, incremental=True)
    ok_clean_git(bids_ds.path)
    return bids_ds


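# A sketch of how a test would typically consume this fixture (hypothetical
# test, not part of the suite; note that get_bids_dataset() returns early with
# the already-built dataset on repeated calls, so it is cheap to reuse):
#
#   def test_bids_aggregation():
#       ds = get_bids_dataset()
#       res = ds.metadata(reporton='datasets', return_type='item-or-list')
#       assert_in('bids', res['metadata'])

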
def test_aggregation(path=None):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing, and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
        assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  scope='branch')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     scope='branch')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage',
                        scope='branch')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success reports for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_in_results(res, action='save', status="ok")
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports the layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # now that we have annex.key
    # three different IDs
    assert_equal(
        3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] ==
                 ensure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone; should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])


def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override the default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.save()
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # if it is not under annex, the addition of metadata below silently
    # does nothing
    ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'})
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep',
             ':mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above; the leading : is stripped, it indicates
            # "ALL FIELDS"
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with an AND condition
            # get both matches
            ('egrep',
             ['mp3', 'type:file'],
             opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case-insensitive search
            ('egrep',
             'MP3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep',
             r'audio\.+:mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX the next one is not supported by the current text field
            # analyzer, which decomposes the mime type into [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from,
            # critical for discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values,
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test that a suggestion msg is logged if there are no hits and the key
    # is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)