def test_our_metadataset_search(tdir):
    # smoke test for basic search operations on our super-megadataset
    # expensive operation but ok
    ds = install(path=tdir, source="///")
    assert list(ds.search('.', report='*', regex=True))
    assert list(ds.search('.', report='*'))
    assert list(ds.search('.', report_matched=True))
    # and then we could provide output in different formats
    import simplejson
    from datalad.utils import swallow_outputs
    from datalad.api import search_

    with swallow_outputs() as cmo:
        assert list(search_('.', report='*', regex=True, format='json', dataset=ds))
        out = cmo.out
    # since this one absorbs all output first, we can't go one by one
    assert simplejson.loads(out)

    try:
        import yaml
    except ImportError:
        raise SkipTest("no yaml module")
    with swallow_outputs() as cmo:
        assert list(search_('.', report='*', regex=True, format='yaml', dataset=ds))
        out = cmo.out
    assert yaml.load(out)
def test_ls_s3():
    url = 's3://datalad-test0-versioned/'
    with swallow_outputs():
        # just to skip if no credentials
        get_test_providers(url)

    with swallow_outputs() as cmo:
        res = ls(url)
        assert_equal(len(res), 17)  # all the entries
        counts = Counter(map(lambda x: x.__class__.__name__, res))
        assert_equal(counts, {'Key': 14, 'DeleteMarker': 3})
        assert_in('Bucket info:', cmo.out)
def test_create_1test_dataset():
    # and just a single dataset
    from datalad.api import create_test_dataset
    with swallow_outputs():
        dss = create_test_dataset()
    eq_(len(dss), 1)
    ok_clean_git(dss[0], annex=False)
def test_create_1test_dataset():
    # and just a single dataset
    from datalad.api import create_test_dataset
    with swallow_outputs():
        dss = create_test_dataset()
    eq_(len(dss), 1)
    assert_repo_status(dss[0], annex=False)
def test_new_relpath(topdir):
    from datalad.api import create_test_dataset
    with swallow_logs(), chpwd(topdir), swallow_outputs():
        dss = create_test_dataset('testds', spec='1')
    eq_(dss[0], opj(topdir, 'testds'))
    eq_(len(dss), 2)  # 1 top + 1 sub-dataset as demanded
    for ds in dss:
        ok_clean_git(ds, annex=False)
def test_ls_uninstalled(path):
    ds = Dataset(path)
    ds.create()
    ds.create('sub')
    ds.uninstall('sub', check=False)
    with swallow_outputs() as cmo:
        ls([path], recursive=True)
        assert_in('not installed', cmo.out)
def test_create_test_dataset():
    # rudimentary smoke test
    from datalad.api import create_test_dataset
    with swallow_logs(), swallow_outputs():
        dss = create_test_dataset(spec='2/1-2')
    ok_(5 <= len(dss) <= 7)  # at least five - 1 top, two on top level, 1 in each
    for ds in dss:
        ok_clean_git(ds, annex=None)  # some of them are annex but we just don't check
        ok_(len(glob(opj(ds, 'file*'))))
def test_create_test_dataset():
    # rudimentary smoke test
    from datalad.api import create_test_dataset
    with swallow_logs(), swallow_outputs():
        dss = create_test_dataset(spec='2/1-2')
    ok_(5 <= len(dss) <= 7)  # at least five - 1 top, two on top level, 1 in each
    for ds in dss:
        assert_repo_status(ds, annex=None)  # some of them are annex but we just don't check
        ok_(len(glob(opj(ds, 'file*'))))
def test_swallow_outputs():
    with swallow_outputs() as cm:
        eq_(cm.out, '')
        sys.stdout.write("out normal")
        sys.stderr.write("out error")
        eq_(cm.out, 'out normal')
        sys.stdout.write(" and more")
        eq_(cm.out, 'out normal and more')  # incremental
        eq_(cm.err, 'out error')
        eq_(cm.err, 'out error')  # the same value if multiple times
def test_check_dates_invalid_date():
    skip_if_no_module("dateutil")
    with swallow_outputs() as cmo:
        assert_raises(IncompleteResultsError,
                      check_dates, [],
                      reference_date="not a valid date",
                      return_type="list")
        out = cmo.out
    # The error makes it through the standard renderer.
    assert_in('"status": "error"', out)
def test_ls_noarg(toppath):
    # smoke test pretty much
    AnnexRepo(toppath, create=True)

    # this test is pointless for now and until ls() actually returns
    # something
    with swallow_outputs():
        ls_out = ls(toppath)
        with chpwd(toppath):
            assert_equal(ls_out, ls([]))
            assert_equal(ls_out, ls('.'))
def test_line_profile():
    skip_if_no_module('line_profiler')

    @line_profile
    def f(j):
        i = j + 1 # xyz
        return i

    with swallow_outputs() as cmo:
        assert_equal(f(3), 4)
        assert_equal(cmo.err, '')
        assert_in('i = j + 1 # xyz', cmo.out)
def _test(*args_):
    # print args_
    for args in args_:
        for recursive in [False, True]:
            # in both cases shouldn't fail
            with swallow_outputs() as cmo:
                ls(args, recursive=recursive)
                assert_equal(len(cmo.out.rstrip().split('\n')), len(args))
                assert_in('[annex]', cmo.out)
                assert_in('[git]', cmo.out)
                assert_in(DEFAULT_BRANCH, cmo.out)
                if "bogus" in args:
                    assert_in('unknown', cmo.out)
def test_containers_run(self, path):
    if self.image_existed:
        raise SkipTest(
            "Not pulling with containers-run due to existing image: {}"
            .format(self.image_name))

    from datalad.api import Dataset
    ds = Dataset(path).create(force=True)
    ds.save(path="foo")
    ds.containers_add("bb", url="dhub://" + self.image_name)
    with swallow_outputs() as out:
        ds.containers_run(["cat", "foo"], container_name="bb")
        assert_in("content", out.out)
def test_hierarchy(topdir):
    # GH 1178
    from datalad.api import create_test_dataset
    with swallow_logs(), swallow_outputs():
        dss = create_test_dataset(topdir, spec='1/1')
    eq_(len(dss), 3)
    eq_(dss[0], topdir)
    for ids, ds in enumerate(dss):
        ok_clean_git(ds, annex=False)
        # each one should have 2 commits (but the last one) -- one for file and
        # another one for sub-dataset
        repo = GitRepo(ds)
        eq_(len(list(repo.get_branch_commits())), 1 + int(ids < 2))
def test_hierarchy(topdir):
    # GH 1178
    from datalad.api import create_test_dataset
    with swallow_logs(), swallow_outputs():
        dss = create_test_dataset(topdir, spec='1/1')
    eq_(len(dss), 3)
    eq_(dss[0], topdir)
    for ids, ds in enumerate(dss):
        ok_clean_git(ds, annex=False)
        # each one should have 2 commits (but the last one) -- one for file and
        # another one for sub-dataset
        repo = GitRepo(ds)
        eq_(len(list(repo.get_branch_commits())), 1 + int(ids < 2))
def test_docker(path):
    # Singularity's "docker://" scheme.
    ds = Dataset(path).create()
    ds.containers_add(
        "bb",
        url=("docker://busybox@sha256:"
             "7964ad52e396a6e045c39b5a44438424ac52e12e4d5a25d94895f2058cb863a0"))
    img = op.join(ds.path, ".datalad", "environments", "bb", "image")
    assert_result_count(ds.containers_list(), 1, path=img, name="bb")
    ok_clean_git(path)
    with swallow_outputs():
        ds.containers_run(["ls", "/singularity"])
def test_hierarchy(topdir):
    # GH 1178
    from datalad.api import create_test_dataset
    with swallow_logs(), swallow_outputs():
        dss = create_test_dataset(topdir, spec='1/1')
    eq_(len(dss), 3)
    eq_(dss[0], topdir)
    for ids, ds in enumerate(dss):
        assert_repo_status(ds, annex=False)
        # each one should have 2 commits (but the last one) -- one for file and
        # another one for sub-dataset
        repo = repo_from_path(ds)
        if not hasattr(repo, 'is_managed_branch') or not repo.is_managed_branch():
            eq_(len(list(repo.get_branch_commits_())), 1 + int(ids < 2))
def test_check_dates(path):
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo], reference_date=refdate, return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")
def test_quoting(path):
    ds = Dataset(op.join(path, OBSCURE_FILENAME)).create(force=True)
    # Our custom procedure fails if it receives anything other than two
    # procedure arguments (so the script itself receives 3). Check a few cases
    # from the Python API and CLI.
    ds.config.add("datalad.locations.dataset-procedures", "code",
                  where="dataset")
    with swallow_outputs():
        ds.run_procedure(spec=["just2args", "with ' sing", 'with " doub'])
        with assert_raises(CommandError):
            ds.run_procedure(spec=["just2args", "still-one arg"])

        runner = Runner(cwd=ds.path)
        runner.run(
            "datalad run-procedure just2args \"with ' sing\" 'with \" doub'")
        with assert_raises(CommandError):
            runner.run("datalad run-procedure just2args 'still-one arg'")
def test_check_dates(path=None):
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo], reference_date=refdate, return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")
def test_push_custom_summary(path=None):
    path = Path(path)
    ds = Dataset(path / "ds").create()

    sib = mk_push_target(ds, "sib", str(path / "sib"), bare=False, annex=False)
    (sib.pathobj / "f1").write_text("f1")
    sib.save()

    (ds.pathobj / "f2").write_text("f2")
    ds.save()

    # These options are true by default and our tests usually run with a
    # temporary home, but set them to be sure.
    ds.config.set("advice.pushUpdateRejected", "true", scope="local")
    ds.config.set("advice.pushFetchFirst", "true", scope="local")
    with swallow_outputs() as cmo:
        ds.push(to="sib", result_renderer="default", on_failure="ignore")
        assert_in("Hints:", cmo.out)
        assert_in("action summary:", cmo.out)
def _check_setup_exceptionhook(interactive):
    old_exceptionhook = sys.excepthook

    post_mortem_tb = []

    def our_post_mortem(tb):
        post_mortem_tb.append(tb)

    with patch('sys.excepthook'), \
            patch('datalad.utils.is_interactive', lambda: interactive), \
            patch('pdb.post_mortem', our_post_mortem):
        setup_exceptionhook()
        our_exceptionhook = sys.excepthook
        ok_(old_exceptionhook != our_exceptionhook)
        with swallow_logs() as cml, swallow_outputs() as cmo:
            # we need to call our_exceptionhook explicitly b/c nose
            # swallows all Exceptions and the hook never gets executed
            try:
                raise RuntimeError
            except Exception as e:  # RuntimeError:
                type_, value_, tb_ = sys.exc_info()
            our_exceptionhook(type_, value_, tb_)
            # Happens under tox environment but not in manually crafted
            # ones -- not yet sure what it is about but --dbg does work
            # with python3 so letting it skip for now
            raise SkipTest(
                "TODO: Not clear why in PY3 calls cleanup if we try to "
                "access the beast"
            )
            #assert_in('Traceback (most recent call last)', cmo.err)
            #assert_in('in _check_setup_exceptionhook', cmo.err)
            #if interactive:
            #    assert_equal(post_mortem_tb[0], tb_)
            #else:
            #    assert_equal(post_mortem_tb, [])
            #
            #assert_in('We cannot setup exception hook', cml.out)

    eq_(old_exceptionhook, sys.excepthook)
def test_quoting(path=None):
    ds = Dataset(op.join(path, OBSCURE_FILENAME)).create(force=True)
    # Our custom procedure fails if it receives anything other than two
    # procedure arguments (so the script itself receives 3). Check a few cases
    # from the Python API and CLI.
    ds.config.add("datalad.locations.dataset-procedures", "code",
                  scope='branch')
    with swallow_outputs():
        ds.run_procedure(spec=["just2args", "with ' sing", 'with " doub'])
        assert_in_results(
            ds.run_procedure(spec=["just2args", "still-one arg"],
                             on_failure="ignore", result_renderer=None),
            action="run", status="error")

        runner = WitlessRunner(cwd=ds.path)
        runner.run(
            "datalad run-procedure just2args \"with ' sing\" 'with \" doub'",
            protocol=KillOutput)
        with assert_raises(CommandError):
            runner.run("datalad run-procedure just2args 'still-one arg'",
                       protocol=KillOutput)
def test_setup_exceptionhook(interactive):
    old_exceptionhook = sys.excepthook

    post_mortem_tb = []

    def our_post_mortem(tb):
        post_mortem_tb.append(tb)

    with patch('sys.excepthook'), \
            patch('datalad.utils.is_interactive', lambda: interactive), \
            patch('pdb.post_mortem', our_post_mortem):
        setup_exceptionhook()
        our_exceptionhook = sys.excepthook
        ok_(old_exceptionhook != our_exceptionhook)
        with swallow_logs() as cml, swallow_outputs() as cmo:
            # we need to call our_exceptionhook explicitly b/c nose
            # swallows all Exceptions and hook never gets executed
            try:
                raise RuntimeError
            except Exception as e:  # RuntimeError:
                type_, value_, tb_ = sys.exc_info()
            our_exceptionhook(type_, value_, tb_)

    eq_(old_exceptionhook, sys.excepthook)
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it is not under annex, below addition of metadata silently does
    # not do anything
    list(ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'}))
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep', ':mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, leading : is stripped, in indicates "ALL FIELDS"
            ('egrep', 'mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with AND condition
            # get both matches
            ('egrep', ['mp3', 'type:file'], opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case insensitive search
            ('egrep', 'mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep', 'audio\.+:mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob', 'mp3', opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield', 'mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield', "'mime audio mp3'", opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from, critical for
            # discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test a suggestion msg being logged if no hits and key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
def test_within_ds_file_search(path):
    try:
        import nibabel
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('audio.mp3', opj('stim', 'stim1.mp3')),
            ('nifti1.nii.gz', opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')),
            ('nifti1.nii.gz', opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio', 'bids', 'nifti1'):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys=True, mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
audio.bitrate
audio.date
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
bids.BIDSVersion
bids.author
bids.citation
bids.conformsto
bids.description
bids.fundedby
bids.license
bids.modality
bids.name
bids.participant.age(years)
bids.participant.gender
bids.participant.handedness
bids.participant.hearing_problems_current
bids.participant.id
bids.participant.language
bids.subject
bids.task
bids.type
id
nifti1.cal_max
nifti1.cal_min
nifti1.datatype
nifti1.description
nifti1.dim
nifti1.freq_axis
nifti1.intent
nifti1.magic
nifti1.phase_axis
nifti1.pixdim
nifti1.qform_code
nifti1.sform_code
nifti1.sizeof_hdr
nifti1.slice_axis
nifti1.slice_duration
nifti1.slice_end
nifti1.slice_order
nifti1.slice_start
nifti1.spatial_resolution(mm)
nifti1.t_unit
nifti1.temporal_spacing(s)
nifti1.toffset
nifti1.vox_offset
nifti1.xyz_unit
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys=True)
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            ('textblob', 'mp3', opj('stim', 'stim1.mp3'),
             'meta', 'mp3'),
            # multi word query implies AND
            ('textblob', ['bold', 'male'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'meta', 'male'),
            # report which field matched with auto-field
            ('autofield', 'mp3', opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            ('autofield', 'female',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'),
             'bids.participant.gender', 'female'),
            # autofield multi-word query is also AND
            ('autofield', ['bids.type:bold', 'bids.participant.id:01'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'bids.type', 'bold'),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield', "'mime audio mp3'", opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (but could be configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always have a file and the dataset, because they carry
            # metadata in the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from,
                # critical for discovering related content
                dsid=ds.id)
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)
def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'))
    annex.commit()
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(new_level=logging.INFO) as log, \
                swallow_outputs() as cmo:
            repo = AnnexRepo(topdir)
            fs = fs_traverse(topdir, repo,
                             recurse_directories=recursive, json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [opj(topdir, 'dir'), opj(topdir, 'dir', 'subdir')]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout contains subdirectory
                assert_in(('file2.txt' and 'dir'), cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # size of dir type child in non-recursive modes should be 0 Bytes(default) as
            # dir type child's size currently has no metadata file for traverser to pick its size from
            # and would require a recursive traversal w/ write to child metadata file mode
            assert_equal(child['size']['total'],
                         {True: '6 Bytes', False: '0 Bytes'}[recursive])
            repo.precommit()  # to possibly stop batch process occupying the stdout

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        repo = AnnexRepo(topdir)
        fs = fs_traverse(topdir, repo,
                         recurse_directories=recursive, json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        assert_equal(
            [item for item in fs['nodes']
             if ('gitdir' or 'annexdir') == item['name']], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by non-recursive mode
        # verifies child's metadata file being used to find its size
        # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        # In current RF 'nodes' are stripped away during recursive traversal
        # for now... later we might reincarnate them "differently"
        # TODO!
        if False:  # recursive:
            # sub-dictionary should not include git and hidden directory info
            assert_equal(
                [item for item in child['nodes']
                 if ('subgit' or '.fgit') == item['name']], [])
            # extract subdirectory dictionary, else fail
            subchild = [subitem for subitem in child["nodes"]
                        if subitem['name'] == 'subdir'][0]
            # extract info of file1.txt, else fail
            link = [subnode for subnode in subchild["nodes"]
                    if subnode['name'] == 'file1.txt'][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [subnode for subnode in subchild["nodes"]
                          if subnode['name'] == 'file2.txt'][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
def _swallow_outputs(isatty=True):
    with swallow_outputs() as cmo:
        stdout = cmo.handles[0]
        stdout.isatty = lambda: isatty
        yield cmo
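# A minimal usage sketch (an assumption, not from the source): the generator helper
# above is presumably wrapped with contextlib.contextmanager at its use site, so a
# test can pretend stdout is (or is not) a TTY while still capturing what is printed.
from contextlib import contextmanager


def _example_isatty_capture():  # hypothetical function, for illustration only
    with contextmanager(_swallow_outputs)(isatty=False) as cmo:
        print("rendering without a terminal")
        assert "rendering" in cmo.out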
def run_cmd(dspath):
    ds = Dataset(dspath)
    status_rec = get_status_dict('foreach-dataset', ds=ds, path=ds.path, command=cmd)
    if not ds.is_installed():
        yield dict(status_rec, status="impossible", message="not installed")
        return
    # For consistent environment (Python) and formatting (command) similar to `run` one
    # But for Python command we provide actual ds and refds not paths placeholders
    placeholders = dict(
        pwd=pwd,
        # pass actual instances so .format could access attributes even for external commands
        ds=ds,  # if python else ds.path,
        dspath=ds.path,  # just for consistency with `run`
        refds=refds,  # if python else refds.path,
        # Check if the command contains "tmpdir" to avoid creating an
        # unnecessary temporary directory in most but not all cases.
        # Note: different from 'run' - not wrapping match within {} and doing str
        tmpdir=mkdtemp(prefix="datalad-run-") if "tmpdir" in str(cmd) else "")
    try:
        if python:
            if isinstance(cmd, str):
                cmd_f, cmd_a, cmd_kw = _PYTHON_CMDS[cmd_type], (cmd, placeholders), {}
            else:
                assert _is_callable(cmd)
                # all placeholders are passed as kwargs to the function
                cmd_f, cmd_a, cmd_kw = cmd, [], placeholders
            cm = chpwd_cm(ds.path) if chpwd == 'ds' else nothing_cm()
            with cm:
                if output_streams == 'pass-through':
                    res = cmd_f(*cmd_a, **cmd_kw)
                    out = {}
                elif output_streams == 'capture':
                    with swallow_outputs() as cmo:
                        res = cmd_f(*cmd_a, **cmd_kw)
                        out = {
                            'stdout': cmo.out,
                            'stderr': cmo.err,
                        }
                else:
                    raise RuntimeError(output_streams)
            if cmd_type == 'eval':
                status_rec['result'] = res
            else:
                assert res is None
        else:
            try:
                cmd_expanded = format_command(cmd, **placeholders)
            except KeyError as exc:
                yield dict(
                    status_rec,
                    status='impossible',
                    message=('command has an unrecognized placeholder: %s', exc))
                return
            out = ds.repo._git_runner.run(
                cmd_expanded,
                cwd=ds.path if chpwd == 'ds' else pwd,
                protocol=protocol)
        if output_streams == 'capture':
            status_rec.update(out)
            # provide some feedback to user in default rendering
            if any(out.values()):
                status_rec['message'] = shortened_repr(out, 100)
        status_rec['status'] = 'ok'
        yield status_rec
    except Exception as exc:
        # get a better version with exception handling redoing the whole
        # status dict from scratch
        yield get_status_dict('foreach-dataset', ds=ds, path=ds.path, command=cmd,
                              exception=exc, status='error', message=str(exc))
def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'), commit=True)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(new_level=logging.INFO) as log, \
                swallow_outputs() as cmo:
            fs = fs_traverse(topdir, AnnexRepo(topdir),
                             recurse_directories=recursive, json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [
                        opj(topdir, 'dir'),
                        opj(topdir, 'dir', 'subdir')
                ]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout contains subdirectory
                assert_in(('file2.txt' and 'dir'), cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # size of dir type child in non-recursive modes should be 0 Bytes(default) as
            # dir type child's size currently has no metadata file for traverser to pick its size from
            # and would require a recursive traversal w/ write to child metadata file mode
            assert_equal(child['size']['total'], {
                True: '6 Bytes',
                False: '0 Bytes'
            }[recursive])

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        fs = fs_traverse(topdir, AnnexRepo(topdir),
                         recurse_directories=recursive, json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        assert_equal([
            item for item in fs['nodes']
            if ('gitdir' or 'annexdir') == item['name']
        ], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by non-recursive mode
        # verifies child's metadata file being used to find its size
        # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        # In current RF 'nodes' are stripped away during recursive traversal
        # for now... later we might reincarnate them "differently"
        # TODO!
        if False:  # recursive:
            # sub-dictionary should not include git and hidden directory info
            assert_equal([
                item for item in child['nodes']
                if ('subgit' or '.fgit') == item['name']
            ], [])
            # extract subdirectory dictionary, else fail
            subchild = [
                subitem for subitem in child["nodes"]
                if subitem['name'] == 'subdir'
            ][0]
            # extract info of file1.txt, else fail
            link = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file1.txt'
            ][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file2.txt'
            ][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
def test_clean(d=None):
    AnnexRepo(d, create=True)
    ds = Dataset(d)
    assert_status('notneeded', clean(dataset=ds))

    archives_path = ds.pathobj / Path(ARCHIVES_TEMP_DIR)
    annex_tmp_path = ds.pathobj / Path(ANNEX_TEMP_DIR)
    annex_trans_path = ds.pathobj / Path(ANNEX_TRANSFER_DIR)
    index_path = ds.repo.dot_git / Path(SEARCH_INDEX_DOTGITDIR)

    # if we create some temp archives directory
    (archives_path / 'somebogus').mkdir(parents=True)
    res = clean(dataset=ds, return_type='item-or-list',
                result_filter=lambda x: x['status'] == 'ok')
    assert_equal(res['path'], str(archives_path))
    assert_equal(res['message'][0] % tuple(res['message'][1:]),
                 "Removed 1 temporary archive directory: somebogus")
    assert_false(archives_path.exists())

    # relative path
    (archives_path / 'somebogus').mkdir(parents=True)
    (archives_path / 'somebogus2').mkdir(parents=True)
    with chpwd(d), swallow_outputs() as cmo:
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed 2 temporary archive directories: somebogus, "
                     "somebogus2")
        assert_false(archives_path.exists())

    # and what about git annex temporary files?
    annex_tmp_path.mkdir(parents=True)
    (annex_tmp_path / "somebogus").write_text("load")
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(annex_tmp_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed 1 temporary annex file: somebogus")
        assert_false(annex_tmp_path.exists())

    (annex_trans_path / 'somebogus').mkdir(parents=True, exist_ok=True)
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(annex_trans_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed 1 annex temporary transfer directory: somebogus")
        assert_false(annex_trans_path.exists())

    # search index
    index_path.mkdir(parents=True)
    (index_path / "MAIN_r55n3hiyvxkdf1fi.seg, _MAIN_1.toc").write_text("noop")
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(index_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed 1 metadata search index file: "
                     "MAIN_r55n3hiyvxkdf1fi.seg, _MAIN_1.toc")
        assert_false(index_path.exists())

    # remove empty directories, too
    archives_path.mkdir(parents=True)
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(archives_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed empty temporary archive directory")
        assert_false(archives_path.exists())

    annex_tmp_path.mkdir(parents=True)
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(annex_tmp_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed empty temporary annex directory")
        assert_false(annex_tmp_path.exists())

    annex_trans_path.mkdir(parents=True)
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(annex_trans_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed empty annex temporary transfer directory")
        assert_false(annex_trans_path.exists())

    index_path.mkdir(parents=True)
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(index_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed empty metadata search index directory")
        assert_false(index_path.exists())
def test_within_ds_file_search(path):
    try:
        import nibabel
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('nifti1.nii.gz', opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')),
            ('nifti1.nii.gz', opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', 'files', src),
            opj(path, dst))
    ds.save()
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('bids', 'nifti1'):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.key
bids.BIDSVersion
bids.author
bids.citation
bids.conformsto
bids.datatype
bids.description
"""
    if external_versions['bids'] >= '0.9':
        target_out += "bids.extension\n"
    target_out += """\
bids.fundedby
bids.license
bids.name
bids.subject.age(years)
bids.subject.gender
bids.subject.handedness
bids.subject.hearing_problems_current
bids.subject.id
bids.subject.language
bids.suffix
bids.task
datalad_core.id
datalad_core.refcommit
id
nifti1.cal_max
nifti1.cal_min
nifti1.datatype
nifti1.description
nifti1.dim
nifti1.freq_axis
nifti1.intent
nifti1.magic
nifti1.phase_axis
nifti1.pixdim
nifti1.qform_code
nifti1.sform_code
nifti1.sizeof_hdr
nifti1.slice_axis
nifti1.slice_duration
nifti1.slice_end
nifti1.slice_order
nifti1.slice_start
nifti1.spatial_resolution(mm)
nifti1.t_unit
nifti1.temporal_spacing(s)
nifti1.toffset
nifti1.vox_offset
nifti1.xyz_unit
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        # so we will use diff
        diff = list(unified_diff(target_out.splitlines(), cmo.out.splitlines()))
        assert_in(target_out, cmo.out, msg="Diff: %s" % os.linesep.join(diff))

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            # multi word query implies AND
            ('textblob', ['bold', 'female'],
             opj('sub-03', 'func', 'sub-03_task-some_bold.nii.gz'),
             'meta', 'female'),
            # report which field matched with auto-field
            ('autofield', 'female',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'),
             'bids.subject.gender', 'female'),
            # autofield multi-word query is also AND
            ('autofield', ['bids.suffix:bold', 'bids.subject.id:01'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'bids.suffix', 'bold'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (but could be configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always have a file and the dataset, because they carry
            # metadata in the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from,
                # critical for discovering related content
                dsid=ds.id)
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)
def test_container_from_subdataset(ds_path, src_subds_path, local_file):

    # prepare a to-be subdataset with a registered container
    src_subds = Dataset(src_subds_path).create()
    src_subds.containers_add(
        name="first",
        url=get_local_file_url(op.join(local_file, 'some_container.img')))
    # add it as subdataset to a super ds:
    ds = Dataset(ds_path).create()
    subds = ds.install("sub", source=src_subds_path)
    # add it again one level down to see actual recursion:
    subds.install("subsub", source=src_subds_path)

    # We come up empty without recursive:
    res = ds.containers_list(recursive=False, **RAW_KWDS)
    assert_result_count(res, 0)

    # query available containers from within super:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_in_results(res, action="containers", refds=ds.path)

    # default location within the subdataset:
    target_path = op.join(subds.path, '.datalad', 'environments', 'first', 'image')
    assert_result_count(
        res, 1, name='sub/first', type='file', action='containers',
        status='ok', path=target_path, parentds=subds.path)

    # not installed subdataset doesn't pose an issue:
    sub2 = ds.create("sub2")
    assert_result_count(ds.subdatasets(), 2, type="dataset")
    ds.uninstall("sub2")
    from datalad.tests.utils import assert_false
    assert_false(sub2.is_installed())

    # same results as before, not crashing or somehow confused by a not present
    # subds:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_result_count(
        res, 1, name='sub/first', type='file', action='containers',
        status='ok', path=target_path, parentds=subds.path)

    # The default renderer includes the image names.
    with swallow_outputs() as out:
        ds.containers_list(recursive=True)
        lines = out.out.splitlines()
    assert_re_in("sub/first", lines)
    assert_re_in("sub/subsub/first", lines)
    # But we are careful not to render partial names from subdataset traversals
    # (i.e. we recurse with containers_list(..., result_renderer=None)).
    with assert_raises(AssertionError):
        assert_re_in("subsub/first", lines)
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.save()
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it is not under annex, below addition of metadata silently does
    # not do anything
    ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'})
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep', ':mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, leading : is stripped, in indicates "ALL FIELDS"
            ('egrep', 'mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with AND condition
            # get both matches
            ('egrep', ['mp3', 'type:file'], opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case insensitive search
            ('egrep', 'mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep', 'audio\.+:mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob', 'mp3', opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield', 'mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield', "'mime audio mp3'", opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from, critical for
            # discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test a suggestion msg being logged if no hits and key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)