def test_add_readme(path):
    ds = Dataset(path).create(force=True)
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    assert_status('ok', ds.plugin('add_readme'))
    # should use default name
    eq_(
        open(opj(path, 'README.md')).read(),
        """\
# Dataset "demo_ds"

this is for play

### Authors

- Betty
- Tom

### License

PDDL

## General information

This is a DataLad dataset (id: {id}).

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(id=ds.id))
    # should skip on re-run
    assert_status('notneeded', ds.plugin('add_readme'))


def test_minimal(path):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.add('.')
    ok_clean_git(ds.path)
    # make sure essential metadata files are annexed for this test:
    # we want to drop them later and still do the conversion
    assert_true(
        ds.repo.is_under_annex(
            ['participants.tsv', 'dataset_description.json']))
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # do conversion
    # where output should appear by default
    target_path = opj(path, 'scidata_isatab_{}'.format(ds.repo.get_hexsha()))
    with chpwd(path):
        assert_status(
            'ok',
            ds.plugin(
                'bids2scidata',
                repo_name="dummy",
                repo_accession='ds1',
                repo_url='http://example.com',
            ))
    # just a few basic sanity tests that info ends up in the right places
    # a proper test should be a full regression test on a real dataset
    # with hand-validated exported metadata

    # investigator info
    invest = open(opj(target_path, 'i_Investigation.txt')).read()
    assert_in('Betty\tTom', invest)
    assert_in('Study Assay File Name\ta_mri_t1w.txt\ta_mri_bold.txt', invest)
    assert_in(
        'Comment[Data Repository]\tdummy\n'
        'Comment[Data Record Accession]\tds1\n'
        'Comment[Data Record URI]\thttp://example.com',
        invest)
    # study table
    assert_equal(
        """\
Source Name\tCharacteristics[organism]\tCharacteristics[organism part]\tProtocol REF\tSample Name\tCharacteristics[sex]\tCharacteristics[age at scan]\tCharacteristics[handedness]
01\thomo sapiens\tbrain\tParticipant recruitment\t01\tm\t30\tr
15\thomo sapiens\tbrain\tParticipant recruitment\t15\tf\t35\tl
""",
        open(opj(target_path, 's_study.txt')).read())
    # assay tables
    assert_equal(
        """\
Sample Name\tProtocol REF\tParameter Value[modality]\tAssay Name\tRaw Data File\tComment[Data Repository]\tComment[Data Record Accession]\tComment[Data Record URI]\tFactor Value[task]
sub-15\tMagnetic Resonance Imaging\tbold\tsub-15_task-nix_run-1\tsub-15/func/sub-15_task-nix_run-1_bold.nii.gz\tdummy\tds1\thttp://example.com\tnix
""",
        open(opj(target_path, 'a_mri_bold.txt')).read())
    assert_equal(
        """\
Sample Name\tProtocol REF\tParameter Value[modality]\tAssay Name\tRaw Data File\tComment[Data Repository]\tComment[Data Record Accession]\tComment[Data Record URI]
sub-01\tMagnetic Resonance Imaging\tT1w\tsub-01\tsub-01/anat/sub-01_T1w.nii.gz\tdummy\tds1\thttp://example.com
""",
        open(opj(target_path, 'a_mri_t1w.txt')).read())


def test_dicom(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'dicom', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'dicom.dcm'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    # query for the file metadata
    res = ds.metadata('dicom.dcm')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['dicom']
    assert_in('@context', meta)
    # no point in testing ALL keys, but we got plenty
    assert len(meta.keys()) > 70
    eq_(meta['SeriesDate'], '20070205')
    # now ask for the dataset metadata, which should have both the unique props
    # and a list of image series (one in this case, but still a list)
    res = ds.metadata(reporton='datasets')
    assert_result_count(res, 1)
    dsmeta = res[0]['metadata']['dicom']
    # same context
    assert_dict_equal(meta['@context'], dsmeta['@context'])
    meta.pop('@context')
    eq_(dsmeta['Series'], [meta])
    # for this artificial case pretty much the same info also comes out as
    # unique props, but wrapped in lists
    ucp = res[0]['metadata']["datalad_unique_content_properties"]['dicom']
    assert_dict_equal(
        {k: [v]
         for k, v in dsmeta['Series'][0].items()
         if k not in DicomExtractor._unique_exclude and k in ucp},
        {k: v
         for k, v in ucp.items()
         if k not in DicomExtractor._unique_exclude})
    # buuuut, if we switch off file-based metadata storage
    ds.config.add('datalad.metadata.aggregate-content-dicom', 'false',
                  where='dataset')
    ds.aggregate_metadata()
    res = ds.metadata(reporton='datasets')
    # the auto-uniquified bits are gone but the Series description stays
    assert_not_in("datalad_unique_content_properties", res[0]['metadata'])
    eq_(dsmeta['Series'], [meta])


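# Note for readers of this excerpt: these tests rely on module-level imports
# that are not shown here. A rough sketch of what the import block presumably
# looks like -- the exact module paths are assumptions based on the usual
# datalad/datalad-neuroimaging layout, not verified against the source:
#
#   import logging
#   import os
#   import os.path as op
#   from difflib import unified_diff
#   from os import makedirs
#   from os.path import dirname, join as opj
#   from shutil import copy
#
#   from datalad.api import Dataset, aggregate_metadata, install
#   from datalad.utils import chpwd, swallow_logs, swallow_outputs
#   from datalad.tests.utils import (
#       assert_dict_equal, assert_in, assert_not_in, assert_result_count,
#       assert_status, assert_true, eq_, ok_clean_git,
#   )
#   from datalad_neuroimaging.extractors.dicom import (
#       MetadataExtractor as DicomExtractor,
#   )

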
def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)
    # compare the full expected metadata set to catch any change of mind on
    # the side of the mutagen package -- but not the bitrate, which is too
    # variable an estimate across decoders
    res[0]['metadata'].pop("comment<bitrate>", None)
    assert_result_count(
        res, 1,
        metadata={
            "format": "mime:audio/mp3",
            "duration(s)": 1.0,
            "name": "dltracktitle",
            "music:album": "dlalbumtitle",
            "music:artist": "dlartist",
            "music:channels": 1,
            "music:sample_rate": 44100,
            "music:Genre": "dlgenre",
            "comment<date>": "",
            "comment<tracknumber>": "dltracknumber",
        })


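# The test_audio variants below iterate over a module-level `target` dict that
# is not part of this excerpt. Judging from the inline expectations above, it
# plausibly looks like this sketch (key names and values are an assumption,
# not the verified fixture):
#
#   target = {
#       'format': 'mime:audio/mp3',
#       'duration(s)': 1.0,
#       'name': 'dltracktitle',
#       'music:album': 'dlalbumtitle',
#       'music:artist': 'dlartist',
#       'music:channels': 1,
#       'music:sample_rate': 44100,
#       'music:Genre': 'dlgenre',
#   }

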
def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets',
        return_type='item-or-list')['metadata']['datalad_unique_content_properties']
    # the test file has it, but uniques have it blanked out, because the
    # extractor considers it worthless for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # the 'date' field carries no value, hence gets excluded from the
    # unique report
    assert_in('date', meta)
    assert not meta['date']
    assert_not_in('date', uniques['audio'])


def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets',
        return_type='item-or-list')['metadata']['datalad_unique_content_properties']
    # the test file has it, but uniques have it blanked out, because the
    # extractor considers it worthless for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # the 'date' field carries no value, hence gets excluded from the
    # unique report
    assert_in('date', meta)
    assert not meta['date']
    assert_not_in('date', uniques['audio'])


def test_xmp(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'xmp', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'xmp.pdf'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('xmp.pdf')
    assert_result_count(res, 1)
    # compare the full expected metadata set to catch any change of mind on
    # the side of the XMP library
    assert_result_count(
        res, 1,
        metadata={
            'dc:creator': 'Michael Hanke',
            'dc:description': 'dlsubject',
            'dc:description<?xml:lang>': 'x-default',
            'dc:title': 'dltitle',
            'dc:title<?xml:lang>': 'x-default',
            'pdfaid:part': '1',
            'pdfaid:conformance': 'A',
            'pdf:Keywords': 'dlkeyword1 dlkeyword2',
            'pdf:Producer': 'LibreOffice 5.2',
            'xmp:CreateDate': '2017-10-08T10:27:06+02:00',
            'xmp:CreatorTool': 'Writer',
        })


def test_nidm(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'nidm', where='dataset')
    # imagine filling the dataset up with something that NIDM info could be
    # extracted from
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'files',
            'nifti1.nii.gz'),
        path)
    ds.save()
    # all nice and tidy, nothing untracked
    ok_clean_git(ds.path)
    # engage the extractor(s)
    res = ds.aggregate_metadata()
    # aggregation done without whining
    assert_status('ok', res)
    res = ds.metadata(reporton='datasets')
    # ATM we do not foresee file-based metadata to come back from NIDM
    assert_result_count(res, 1)
    # kill version info
    core = res[0]['metadata']['datalad_core']
    core.pop('version', None)
    core.pop('refcommit')
    # show the full structure of the assembled metadata from demo content
    target_metadata = {
        "@context": {
            "@vocab": "http://docs.datalad.org/schema_v2.0.json"
        },
        "datalad_core": {
            "@id": ds.id
        },
        "nidm": {
            "@context": {
                "mydurationkey": {
                    "@id": "time:Duration"
                },
                "myvocabprefix": {
                    "@id": "http://purl.org/ontology/mydefinition",
                    "description": "I am a vocabulary",
                    "type": "http://purl.org/dc/dcam/VocabularyEncodingScheme"
                }
            },
            "mydurationkey": 0.6
        }
    }
    if datalad_extracts_annex_key:
        target_metadata['datalad_unique_content_properties'] = {
            "annex": {
                "key": [
                    "MD5E-s15920--acfb708aa74951cfff1a9d466f6d77be.nii.gz"
                ]
            }
        }
    assert_result_count(res, 1, metadata=target_metadata)


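# `datalad_extracts_annex_key` above is not defined in this excerpt. It is
# presumably a module-level guard for datalad versions that also aggregate the
# annex key per file; a minimal sketch under that assumption (the exact
# version threshold here is made up for illustration):
#
#   from datalad.support.external_versions import external_versions
#   datalad_extracts_annex_key = external_versions['datalad'] >= '0.12'

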
def test_exif(path):
    ds = Dataset(path).rev_create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.rev_save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['exif']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)


def test_exif(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['exif']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)


def test_xmp(path=None):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'xmp', scope='branch')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'xmp.pdf'),
        path)
    ds.save()
    assert_repo_status(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('xmp.pdf')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['xmp']
    for k, v in target.items():
        eq_(meta[k], v)
    assert_in('@context', meta)


def test_nidm(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'nidm', where='dataset')
    # imagine filling the dataset up with something that NIDM info could be
    # extracted from
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data',
            'nifti1.nii.gz'),
        path)
    ds.add('.')
    # all nice and tidy, nothing untracked
    ok_clean_git(ds.path)
    # engage the extractor(s)
    res = ds.aggregate_metadata()
    # aggregation done without whining
    assert_status('ok', res)
    res = ds.metadata(reporton='datasets')
    # ATM we do not foresee file-based metadata to come back from NIDM
    assert_result_count(res, 1)
    # show the full structure of the assembled metadata from demo content
    assert_result_count(
        res, 1,
        metadata={
            "@context": {
                "@vocab": "http://docs.datalad.org/schema_v2.0.json"
            },
            "datalad_core": {
                "@id": ds.id
            },
            "nidm": {
                "@context": {
                    "mydurationkey": {
                        "@id": "time:Duration"
                    },
                    "myvocabprefix": {
                        "@id": "http://purl.org/ontology/mydefinition",
                        "description": "I am a vocabulary",
                        "type": "http://purl.org/dc/dcam/VocabularyEncodingScheme"
                    }
                },
                "mydurationkey": 0.6
            }
        })


def test_image(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'image', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    assert_result_count(
        res, 1,
        metadata={
            "dcterms:SizeOrDuration": [4, 3],
            "color_mode": "3x8-bit pixels, true color",
            "type": "dctype:Image",
            "spatial_resolution(dpi)": [72, 72],
            "format": "JPEG (ISO 10918)"
        })


def test_nifti(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data',
            'nifti1.nii.gz'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('nifti1.nii.gz')
    assert_result_count(res, 1)
    assert_result_count(
        res, 1,
        metadata={
            "description": "FSL5.0",
            "spatial_resolution(mm)": [2.0, 2.0, 2.0],
            "temporal_spacing(s)": 6.0,
            "nifti1:datatype": "int16",
            "nifti1:dim": [4, 91, 109, 91, 2, 1, 1, 1],
            "nifti1:pixdim": [-1.0, 2.0, 2.0, 2.0, 6.0, 1.0, 1.0, 1.0],
            "nifti1:xyz_unit": "millimiter (uo:0000016)",
            "nifti1:t_unit": "second (uo:0000010)",
            "nifti1:cal_min": 3000.0,
            "nifti1:cal_max": 8000.0,
            "nifti1:toffset": 0.0,
            "nifti1:vox_offset": 0.0,
            "nifti1:intent": "none",
            "nifti1:sizeof_hdr": 348,
            "nifti1:magic": "n+1",
            "nifti1:sform_code": "mni",
            "nifti1:qform_code": "mni",
            "nifti1:freq_axis": None,
            "nifti1:phase_axis": None,
            "nifti1:slice_axis": None,
            "nifti1:slice_start": 0,
            "nifti1:slice_duration": 0.0,
            "nifti1:slice_order": "unknown",
            "nifti1:slice_end": 0,
        })


def test_within_ds_file_search(path):
    try:
        import nibabel
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('nifti1.nii.gz',
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')),
            ('nifti1.nii.gz',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', 'files', src),
            opj(path, dst))
    ds.save()
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('bids', 'nifti1'):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.key
bids.BIDSVersion
bids.author
bids.citation
bids.conformsto
bids.datatype
bids.description
"""
    if external_versions['bids'] >= '0.9':
        target_out += "bids.extension\n"
    target_out += """\
bids.fundedby
bids.license
bids.name
bids.subject.age(years)
bids.subject.gender
bids.subject.handedness
bids.subject.hearing_problems_current
bids.subject.id
bids.subject.language
bids.suffix
bids.task
datalad_core.id
datalad_core.refcommit
id
nifti1.cal_max
nifti1.cal_min
nifti1.datatype
nifti1.description
nifti1.dim
nifti1.freq_axis
nifti1.intent
nifti1.magic
nifti1.phase_axis
nifti1.pixdim
nifti1.qform_code
nifti1.sform_code
nifti1.sizeof_hdr
nifti1.slice_axis
nifti1.slice_duration
nifti1.slice_end
nifti1.slice_order
nifti1.slice_start
nifti1.spatial_resolution(mm)
nifti1.t_unit
nifti1.temporal_spacing(s)
nifti1.toffset
nifti1.vox_offset
nifti1.xyz_unit
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump,
        # so we will use diff
        diff = list(unified_diff(target_out.splitlines(), cmo.out.splitlines()))
        assert_in(target_out, cmo.out, msg="Diff: %s" % os.linesep.join(diff))

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            # multi-word query implies AND
            ('textblob',
             ['bold', 'female'],
             opj('sub-03', 'func', 'sub-03_task-some_bold.nii.gz'),
             'meta', 'female'),
            # report which field matched with auto-field
            ('autofield',
             'female',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'),
             'bids.subject.gender', 'female'),
            # autofield multi-word query is also AND
            ('autofield',
             ['bids.suffix:bold', 'bids.subject.id:01'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'bids.suffix', 'bold'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (but could be
            # configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always has a file and the dataset, because they carry
            # metadata in the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from,
                # critical for discovering related content
                dsid=ds.id)
        assert_result_count(res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)


def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             'meta', 'mp3'),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            # XXX the next one is not supported by the current text field
            # analyzer, which decomposes the mime type into [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (but could be
            # configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always has a file and the dataset, because they carry
            # metadata in the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from,
                # critical for discovering related content
                dsid=ds.id)
        assert_result_count(res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)


def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override the default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # if it is not under annex, the addition of metadata below silently
    # does nothing
    list(ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'}))
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep',
             ':mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above; the leading : is stripped, it indicates
            # "ALL FIELDS"
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with an AND condition
            # get both matches
            ('egrep',
             ['mp3', 'type:file'],
             opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case-insensitive search
            ('egrep',
             'MP3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep',
             r'audio\.+:mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX the next one is not supported by the current text field
            # analyzer, which decomposes the mime type into [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from,
            # critical for discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values,
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test that a suggestion msg is logged if there are no hits and the key
    # is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)


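# For orientation: the egrep-mode queries exercised above map directly onto
# the command line, since datalad generates CLI options from the Python
# parameters. An illustrative (untested) shell equivalent of the field query:
#
#   datalad search --mode egrep 'audio.format:mp3'

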
def test_exif(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    # compare the full expected metadata set to catch any change of mind on
    # the side of the EXIF library
    assert_result_count(
        res, 1,
        metadata={
            "exif:InteroperabilityVersion": "[48, 49, 48, 48]",
            "exif:ExifVersion": 221.0,
            "exif:FocalLengthIn35mmFilm": 38.0,
            "exif:CompressedBitsPerPixel": 5.0,
            "exif:GainControl": "None",
            "exif:Compression": "JPEG (old-style)",
            "exif:PrintIM": "[80, 114, 105, 110, 116, 73, 77, 0, 48, 51, 48, 48, 0, 0, 0, 5, 0, 1, 0, 22, 0, 22, 0, 2, 1, 0, 0, 0, 1, 0, 5, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 16, 131, 0, 0, 0]",
            "exif:Make": "CASIO COMPUTER CO.,LTD.",
            "exif:Sharpness": "Normal",
            "exif:Contrast": "Normal",
            "exif:ColorSpace": "sRGB",
            "exif:ExposureMode": "Auto Exposure",
            "exif:ExposureBiasValue": 0.0,
            "exif:ExifImageWidth": 4.0,
            "exif:ComponentsConfiguration": "YCbCr",
            "exif:DateTimeOriginal": "2011:03:13 16:36:02",
            "exif:MaxApertureValue": "14/5",
            "exif:DateTime": "2017:10:08 10:21:03",
            "exif:InteroperabilityOffset": 30412.0,
            "exif:InteroperabilityIndex": "R98",
            "exif:FileSource": "Digital Camera",
            "exif:ResolutionUnit": "Pixels/Inch",
            "exif:FNumber": "27/10",
            "exif:ExposureProgram": "Program Normal",
            "exif:DigitalZoomRatio": "0/0",
            "exif:LightSource": "Unknown",
            "exif:ExifImageLength": 3.0,
            "exif:FlashPixVersion": 100.0,
            "exif:CustomRendered": "Normal",
            "exif:Flash": "Flash fired, auto mode",
            "exif:WhiteBalance": "Auto",
            "exif:Orientation": "Horizontal (normal)",
            "exif:ExposureTime": "1/60",
            "exif:Software": "GIMP 2.8.20",
            "exif:Model": "EX-S600",
            "exif:FocalLength": "31/5",
            "exif:SceneCaptureType": "Standard",
            "exif:ExifOffset": 272.0,
            "exif:Saturation": "Normal",
            "exif:YCbCrPositioning": "Centered",
            "exif:DateTimeDigitized": "2011:03:13 16:36:02",
            "exif:XResolution": 72.0,
            "exif:YResolution": 72.0,
            "exif:MeteringMode": "Pattern",
        })


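# The earlier test_exif variants iterate over a module-level `target` dict
# that this excerpt does not include. It presumably carries the same
# expectations as the inline dict above, e.g. (abbreviated sketch, assumed):
#
#   target = {
#       "exif:Make": "CASIO COMPUTER CO.,LTD.",
#       "exif:Model": "EX-S600",
#       "exif:XResolution": 72.0,
#       "exif:YResolution": 72.0,
#       ...
#   }

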
def test_within_ds_file_search(path):
    try:
        import nibabel
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('audio.mp3', opj('stim', 'stim1.mp3')),
            ('nifti1.nii.gz',
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')),
            ('nifti1.nii.gz',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio', 'bids', 'nifti1'):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys=True, mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
audio.bitrate
audio.date
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
bids.BIDSVersion
bids.author
bids.citation
bids.conformsto
bids.description
bids.fundedby
bids.license
bids.modality
bids.name
bids.participant.age(years)
bids.participant.gender
bids.participant.handedness
bids.participant.hearing_problems_current
bids.participant.id
bids.participant.language
bids.subject
bids.task
bids.type
id
nifti1.cal_max
nifti1.cal_min
nifti1.datatype
nifti1.description
nifti1.dim
nifti1.freq_axis
nifti1.intent
nifti1.magic
nifti1.phase_axis
nifti1.pixdim
nifti1.qform_code
nifti1.sform_code
nifti1.sizeof_hdr
nifti1.slice_axis
nifti1.slice_duration
nifti1.slice_end
nifti1.slice_order
nifti1.slice_start
nifti1.spatial_resolution(mm)
nifti1.t_unit
nifti1.temporal_spacing(s)
nifti1.toffset
nifti1.vox_offset
nifti1.xyz_unit
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys=True)
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             'meta', 'mp3'),
            # multi-word query implies AND
            ('textblob',
             ['bold', 'male'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'meta', 'male'),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            ('autofield',
             'female',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'),
             'bids.participant.gender', 'female'),
            # autofield multi-word query is also AND
            ('autofield',
             ['bids.type:bold', 'bids.participant.id:01'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'bids.type', 'bold'),
            # XXX the next one is not supported by the current text field
            # analyzer, which decomposes the mime type into [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (but could be
            # configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always has a file and the dataset, because they carry
            # metadata in the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from,
                # critical for discovering related content
                dsid=ds.id)
        assert_result_count(res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)


def test_dicom(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'dicom', where='dataset')
    copy(
        op.join(op.dirname(op.dirname(op.dirname(__file__))),
                'tests', 'data', 'files', 'dicom.dcm'),
        path)
    ds.save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    # query for the file metadata
    res = ds.metadata('dicom.dcm')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['dicom']
    assert_in('@context', meta)
    # no point in testing ALL keys, but we got plenty
    assert len(meta.keys()) > 70
    eq_(meta['SeriesDate'], '20070205')
    # actually a tricky one of the dcm.multival.MultiValue type,
    # which we should extract as a list
    # https://github.com/datalad/datalad-neuroimaging/issues/49
    eq_(meta['ImageType'], ['ORIGINAL', 'PRIMARY', 'EPI', 'NONE'])
    # make sure we have PatientName -- this is not using a basic data type,
    # but dicom.valuerep.PersonName3 -- conversion should have handled that;
    # we can only test if the key is there, the source dicom has an empty
    # string as value
    eq_(meta['PatientName'], '')
    # now ask for the dataset metadata, which should have both the unique props
    # and a list of image series (one in this case, but still a list)
    res = ds.metadata(reporton='datasets')
    assert_result_count(res, 1)
    dsmeta = res[0]['metadata']['dicom']
    # same context
    assert_dict_equal(meta['@context'], dsmeta['@context'])
    meta.pop('@context')
    seriesmeta = dsmeta['Series']
    eq_(seriesmeta[0].pop('SeriesDirectory'), op.curdir)
    eq_(dsmeta['Series'], [meta])
    # for this artificial case pretty much the same info also comes out as
    # unique props, but wrapped in lists
    ucp = res[0]['metadata']["datalad_unique_content_properties"]['dicom']
    assert_dict_equal(
        {k: [v]
         for k, v in dsmeta['Series'][0].items()
         if k not in DicomExtractor._unique_exclude and k in ucp},
        {k: v
         for k, v in ucp.items()
         if k not in DicomExtractor._unique_exclude})
    # buuuut, if we switch off file-based metadata storage
    ds.config.add('datalad.metadata.aggregate-content-dicom', 'false',
                  where='dataset')
    ds.aggregate_metadata()
    res = ds.metadata(reporton='datasets')
    if not datalad_extracts_annex_key:
        # the auto-uniquified bits are gone but the Series description stays
        assert_not_in("datalad_unique_content_properties", res[0]['metadata'])
    eq_(dsmeta['Series'], [meta])


def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing, and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
        assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage',
                        where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success reports for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports the layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # now that we have annex.key
    # three different IDs
    assert_equal(
        3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] ==
                 assure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone; should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])


def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True)
    # we get success reports for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 7)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 4, type='file')
    # three different IDs
    assert_equal(
        3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['name'] == assure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 3)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 2, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone; should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # query smoke test
    assert_result_count(clone.search('mother*'), 1)
    assert_result_count(clone.search('MoTHER*'), 1)

    child_res = clone.search('*child*')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['metadata']['type'] == 'dataset':
            eq_(r['query_matched']['name'], r['metadata']['name'])

    # test 'and' for multiple search entries
    assert_result_count(clone.search(['*child*', '*bids*']), 2)
    assert_result_count(clone.search(['*child*', '*subsub*']), 1)
    assert_result_count(clone.search(['*bids*', '*sub*']), 2)
    assert_result_count(clone.search(['*', 'type:dataset']), 3)


def get_bids_dataset():
    srcrepo = get_sourcerepo()
    bids_ds = Dataset(path=opj(
        srcrepo.path, 'datalad_neuroimaging', 'tests', 'data', 'bids'))
    if bids_ds.is_installed():
        return bids_ds
    try:
        import heudiconv
    except ImportError:
        raise SkipTest
    # make one
    bids_ds.create()
    # place dicoms in the mandated shadow tree
    structdicom_ds = bids_ds.install(
        source=get_dicom_dataset('structural'),
        path=opj('sourcedata', 'sub-02', 'ses-structural'),
        reckless=True)
    funcdicom_ds = bids_ds.install(
        source=get_dicom_dataset('functional'),
        path=opj('sourcedata', 'sub-02', 'ses-functional'),
        reckless=True)
    # the dicom dataset is preconfigured for metadata extraction
    # XXX this is the slowest step of the entire procedure:
    # reading 5k dicoms of the functional data
    bids_ds.aggregate_metadata(recursive=True)
    # pull the subject ID from the metadata
    res = bids_ds.metadata(
        funcdicom_ds.path, reporton='datasets', return_type='item-or-list',
        result_renderer='disabled')
    subj_id = res['metadata']['dicom']['Series'][0]['PatientID']
    # prepare for incoming BIDS metadata that we will want to keep in
    # Git -- templates would be awesome!
    with open(opj(bids_ds.path, '.gitattributes'), 'a') as ga:
        # except for hand-picked global metadata, we want anything
        # to go into the annex to be able to retract files after
        # publication
        ga.write('** annex.largefiles=anything\n')
        for fn in ('CHANGES', 'README', 'dataset_description.json'):
            # but not these
            ga.write('{} annex.largefiles=nothing\n'.format(fn))
    bids_ds.add('.gitattributes', to_git=True,
                message='Initial annex entry configuration')
    ok_clean_git(bids_ds.path)
    # conversion of two DICOM datasets to one BIDS dataset
    for label, ds, scanlabel in (
            # structural needs to come first, or else heudiconv
            # will try to rewrite the events.tsv for the functional
            # run, for some strange reason
            ('structural', structdicom_ds, 'anat'),
            ('functional', funcdicom_ds, 'func')):
        bids_ds.run(
            [
                'heudiconv',
                '-f', 'reproin',
                # TODO fix DICOMs to not have a 'sub' prefix
                '-s', subj_id,
                '-c', 'dcm2niix',
                # TODO decide on the fate of .heudiconv/
                # but ATM we need to (re)move it:
                # https://github.com/nipy/heudiconv/issues/196
                '-o', opj(bids_ds.path, '.git', 'stupid', label),
                '-b',
                '-a', bids_ds.path,
                '-l', '',
                # avoid gory details provided by dcmstack, we have them in
                # the aggregated DICOM metadata already
                '--minmeta',
                '--files', opj(ds.path, 'dicoms')
            ],
            message="DICOM conversion of {} scans".format(label))
        # remove unwanted stuff that cannot be disabled ATM
        # https://github.com/nipy/heudiconv/issues/195
        # TODO should be removed eventually
        bids_ds.remove(
            [p for p in (opj('sourcedata', 'sub-02', scanlabel),
                         opj('sourcedata', 'README'))
             if op.lexists(opj(bids_ds.path, p))],
            check=False)

    bids_ds.config.add('datalad.metadata.nativetype', 'bids',
                       where='dataset', reload=False)
    bids_ds.config.add('datalad.metadata.nativetype', 'nifti1',
                       where='dataset', reload=True)
    # XXX need to `add` specifically to make it work in direct mode
    #bids_ds.save(message='Metadata type config')
    bids_ds.add('.', message='Metadata type config')

    # let go of the dicom datasets again
    bids_ds.uninstall([structdicom_ds.path, funcdicom_ds.path], check=False)

    # no need for recursion, we already have the dicom dataset's
    # stuff on record
    bids_ds.aggregate_metadata(recursive=False, incremental=True)
    ok_clean_git(bids_ds.path)
    return bids_ds


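# A sketch of how a test would typically consume this fixture (hypothetical
# test, not part of the suite; note that get_bids_dataset() returns early with
# the already-built dataset on repeated calls, so it is cheap to reuse):
#
#   def test_bids_aggregation():
#       ds = get_bids_dataset()
#       res = ds.metadata(reporton='datasets', return_type='item-or-list')
#       assert_in('bids', res['metadata'])

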
def test_aggregation(path=None):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing, and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
        assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  scope='branch')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     scope='branch')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage',
                        scope='branch')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success reports for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_in_results(res, action='save', status="ok")
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports the layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # now that we have annex.key
    # three different IDs
    assert_equal(
        3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] ==
                 ensure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone; should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])


def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override the default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.save()
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # if it is not under annex, the addition of metadata below silently
    # does nothing
    ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'})
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep',
             ':mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above; the leading : is stripped, it indicates
            # "ALL FIELDS"
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with an AND condition
            # get both matches
            ('egrep',
             ['mp3', 'type:file'],
             opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case-insensitive search
            ('egrep',
             'MP3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep',
             r'audio\.+:mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX the next one is not supported by the current text field
            # analyzer, which decomposes the mime type into [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from,
            # critical for discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values,
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test that a suggestion msg is logged if there are no hits and the key
    # is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)