def test_basics(path):
    ds = Dataset(path).create(force=True)
    # TODO: this procedure would leave a clean dataset, but `run` cannot
    # handle dirty input yet, so manual for now
    # V6FACT: this leaves the file staged, but not committed
    ds.add('code', to_git=True)
    # V6FACT: even this leaves it staged
    ds.add('.')
    # V6FACT: but this finally commits it
    ds.save()
    # TODO remove above two lines
    ds.run_procedure('setup_yoda_dataset')
    ok_clean_git(ds.path)
    # configure dataset to look for procedures in its code folder
    ds.config.add(
        'datalad.locations.dataset-procedures',
        'code',
        where='dataset')
    # configure dataset to run the demo procedure prior to the clean command
    ds.config.add(
        'datalad.clean.proc-pre',
        'datalad_test_proc',
        where='dataset')
    # run command that should trigger the demo procedure
    ds.clean()
    # look for traces
    ok_file_has_content(op.join(ds.path, 'fromproc.txt'), 'hello\n')
    ok_clean_git(ds.path, index_modified=[op.join('.datalad', 'config')])
def test_basics(path, super_path):
    ds = Dataset(path).create(force=True)
    ds.run_procedure('setup_yoda_dataset')
    ok_clean_git(ds.path)
    assert_false(ds.repo.is_under_annex("README.md"))
    # configure dataset to look for procedures in its code folder
    ds.config.add(
        'datalad.locations.dataset-procedures',
        'code',
        where='dataset')
    # commit this procedure config for later use in a clone:
    ds.add(op.join('.datalad', 'config'))
    # configure dataset to run the demo procedure prior to the clean command
    ds.config.add(
        'datalad.clean.proc-pre',
        'datalad_test_proc',
        where='dataset')
    # run command that should trigger the demo procedure
    ds.clean()
    # look for traces
    ok_file_has_content(op.join(ds.path, 'fromproc.txt'), 'hello\n')
    ok_clean_git(ds.path, index_modified=[op.join('.datalad', 'config')])

    # make a fresh dataset:
    super = Dataset(super_path).create()
    # configure dataset to run the demo procedure prior to the clean command
    super.config.add(
        'datalad.clean.proc-pre',
        'datalad_test_proc',
        where='dataset')
    # 'super' doesn't know any procedures but should get to know one by
    # installing the above as a subdataset
    super.install('sub', source=ds.path)
    # run command that should trigger the demo procedure
    super.clean()
    # look for traces
    ok_file_has_content(op.join(super.path, 'fromproc.txt'), 'hello\n')
    ok_clean_git(super.path, index_modified=[op.join('.datalad', 'config')])
def test_basic_aggregate(path):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    #base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    # we will first aggregate the middle dataset on its own, this will
    # serve as a smoke test for the reuse of metadata objects later on
    sub.aggregate_metadata()
    base.save()
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.uninstall('subsub', check=False)
    # now we should be able to reaggregate metadata, and lose nothing,
    # because we can aggregate the aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True, update_mode='all')
    # same result for aggregate query as for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree, and lose no metadata
    base.uninstall('sub', recursive=True, check=False)
    assert not sub.is_installed()
    ok_clean_git(base.path)
    # same result for aggregate query as for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)
def check_renamed_file(recursive, no_annex, path):
    ds = Dataset(path).create(no_annex=no_annex)
    create_tree(path, {'old': ''})
    ds.add('old')
    ds.repo._git_custom_command(['old', 'new'], ['git', 'mv'])
    ds.save(recursive=recursive)
    ok_clean_git(path)
def check_renamed_file(recursive, no_annex, path):
    ds = Dataset(path).create(no_annex=no_annex)
    create_tree(path, {'old': ''})
    ds.add('old')
    ds.repo._git_custom_command(['old', 'new'], ['git', 'mv'])
    ds._save(recursive=recursive)
    ok_clean_git(path)
def test_aggregate_with_unavailable_objects_from_subds(path, target):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)

    # now make that a subdataset of a new one, so aggregation needs to get the
    # metadata objects first:
    super = Dataset(target).create()
    super.install("base", source=base.path)
    ok_clean_git(super.path)
    clone = Dataset(opj(super.path, "base"))
    ok_clean_git(clone.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = [o for o in sorted(clone.repo.get_annexed_files(with_content_only=False))
            if o.startswith(objpath)]
    eq_(len(objs), 6)
    eq_(all(clone.repo.file_has_content(objs)), False)

    # now aggregate should get those metadata objects
    super.aggregate_metadata(recursive=True, update_mode='all',
                             force_extraction=False)
    eq_(all(clone.repo.file_has_content(objs)), True)
def test_aggregate_removal(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.add('.', recursive=True)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=subsub.path)
    # check that we only have object files that are listed in agginfo
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # now delete the deepest subdataset to test cleanup of aggregated objects
    # in the top-level ds
    base.remove(opj('sub', 'subsub'), check=False)
    # now aggregation has to detect that subsub is not simply missing, but gone
    # for good
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    # internally consistent state
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # info on subsub was removed at all levels
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 2)
    res = sub.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 1)
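# Note: several aggregation tests in this collection rely on two module-level
# helpers, _get_contained_objs() and _get_referenced_objs(), that are not part
# of this excerpt. A minimal sketch of what they could look like follows; the
# exact bodies (and the 'content_info'/'dataset_info' record keys used below)
# are assumptions of this sketch, not the verbatim helpers from the test module.
def _get_contained_objs(ds):
    # every file in the index that lives under .datalad/metadata/objects/
    return set(
        f for f in ds.repo.get_indexed_files()
        if f.startswith(opj('.datalad', 'metadata', 'objects', '')))


def _get_referenced_objs(ds):
    # object files referenced by the aggregated-metadata records,
    # expressed relative to the dataset root
    return set(
        relpath(r[k], start=ds.path)
        for r in ds.metadata(get_aggregates=True)
        for k in ('content_info', 'dataset_info')
        if k in r)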
def test_placeholders(path): ds = Dataset(path).create(force=True) ds.add(".") ds.run("echo {inputs} >{outputs}", inputs=[".", "*.in"], outputs=["c.out"]) ok_file_has_content(opj(path, "c.out"), "a.in b.in\n") hexsha_before = ds.repo.get_hexsha() ds.rerun() eq_(hexsha_before, ds.repo.get_hexsha()) ds.run("echo {inputs[0]} >getitem", inputs=["*.in"]) ok_file_has_content(opj(path, "getitem"), "a.in\n") ds.run("echo {pwd} >expanded-pwd") ok_file_has_content(opj(path, "expanded-pwd"), path, strip=True) subdir_path = opj(path, "subdir") with chpwd(subdir_path): run("echo {pwd} >expanded-pwd") ok_file_has_content(opj(path, "subdir", "expanded-pwd"), subdir_path, strip=True) # Double brackets can be used to escape placeholders. ds.run("touch {{inputs}}", inputs=["*.in"]) ok_exists(opj(path, "{inputs}"))
def test_get_metadata(path):
    ds = Dataset(path).create(force=True)
    ds.add('.')
    meta = MetadataExtractor(ds, [])._get_dataset_metadata()
    assert_equal(
        dumps(meta, sort_keys=True, indent=2),
        """\
{
  "citation": "Cool (2016)",
  "conformsto": "http://docs.datalad.org/metadata.html#v0-1",
  "description": "A text with arbitrary length and content that can span multiple\\nparagraphs (this is a new one)",
  "fundedby": "BMBFGQ1411, NSF 1429999",
  "homepage": "http://studyforrest.org",
  "issuetracker": "https://github.com/psychoinformatics-de/studyforrest-data-phase2/issues",
  "license": [
    "CC0",
    "The person who associated a work with this deed has dedicated the work to the public domain by waiving all of his or her rights to the work worldwide under copyright law, including all related and neighboring rights, to the extent allowed by law.\\nYou can copy, modify, distribute and perform the work, even for commercial purposes, all without asking permission."
  ],
  "maintainer": [
    "Mike One <*****@*****.**>",
    "Anna Two <*****@*****.**>"
  ],
  "name": "studyforrest_phase2",
  "sameas": "http://dx.doi.org/10.5281/zenodo.48421",
  "shortdescription": "Basic summary",
  "version": "1.0.0-rc3"
}""")
def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    # weird that it comes out as a string...
    objs = [o for o in sorted(base.repo.find(objpath).split('\n')) if o]
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)

    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    ok_clean_git(base.path)

    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True, update_mode='all')
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(objs, [o for o in sorted(base.repo.find(objpath).split('\n')) if o])
def test_run_failure(path):
    ds = Dataset(path).create()

    hexsha_initial = ds.repo.get_hexsha()

    with assert_raises(CommandError):
        ds.run("echo x$(cat grows) > grows && false")
    eq_(hexsha_initial, ds.repo.get_hexsha())
    ok_(ds.repo.dirty)

    msgfile = opj(ds.repo.repo.git_dir, "COMMIT_EDITMSG")
    ok_exists(msgfile)

    ds.add(".", save=False)
    ds.save(message_file=msgfile)
    ok_clean_git(ds.path)
    neq_(hexsha_initial, ds.repo.get_hexsha())

    outfile = opj(ds.path, "grows")
    eq_('x\n', open(outfile).read())

    # There is no CommandError on rerun if the non-zero error matches the
    # original code.
    ds.rerun()
    eq_('xx\n', open(outfile).read())

    # On the other hand, we fail if we rerun a command and there is a non-zero
    # error that doesn't match.
    ds.run("[ ! -e bar ] && echo c >bar")
    ok_clean_git(ds.path)
    with assert_raises(CommandError):
        ds.rerun()
def test_partial_aggregation(path):
    ds = Dataset(path).create(force=True)
    sub1 = ds.create('sub1', force=True)
    sub2 = ds.create('sub2', force=True)
    ds.add('.', recursive=True)
    ds.aggregate_metadata(recursive=True)
    # baseline, recursive aggregation gets us something for all three datasets
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # now let's do partial aggregation from just one subdataset
    # we should not lose information on the other datasets,
    # as this would be a problem any time anything in a dataset
    # subtree is missing: not installed, too expensive to reaggregate, ...
    ds.aggregate_metadata(path='sub1', incremental=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # from-scratch aggregation kills datasets that were not listed
    ds.aggregate_metadata(path='sub1', incremental=False)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 2)
    assert_result_count(res, 0, path=sub2.path)
    # now reaggregate in full
    ds.aggregate_metadata(recursive=True)
    # make change in sub1
    sub1.unlock('here')
    with open(opj(sub1.path, 'here'), 'w') as f:
        f.write('fresh')
    ds.save(recursive=True)
    ok_clean_git(path)
def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)

    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    ok_clean_git(base.path)

    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True,
                            update_mode='all',
                            force_extraction=True)
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(objs, list(sorted(base.repo.find(objpath))))
def test_publish_aggregated(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    base.create('sub', force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)

    # create sibling and publish to it
    spath = opj(path, 'remote')
    base.create_sibling(
        name="local_target", sshurl="ssh://localhost", target_dir=spath)
    base.publish('.', to='local_target', transfer_data='all')
    remote = Dataset(spath)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # all object files are present in both datasets
    eq_(all(base.repo.file_has_content(objs)), True)
    eq_(all(remote.repo.file_has_content(objs)), True)
    # and we can squeeze the same metadata out
    eq_(
        [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')}
         for i in base.metadata('sub')],
        [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')}
         for i in remote.metadata('sub')],
    )
def test_save_message_file(path):
    ds = Dataset(path).create()
    with assert_raises(IncompleteResultsError):
        ds.save("blah", message="me", message_file="and me")

    create_tree(path, {"foo": "x", "msg": "add foo"})
    ds.add("foo", save=False)
    ds.save(message_file=opj(ds.path, "msg"))
    assert_equal(ds.repo.repo.git.show("--format=%s", "--no-patch"),
                 "add foo")
def test_bf2043p2(path):
    ds = Dataset(path).create(force=True)
    ds.add('staged', save=False)
    ok_clean_git(ds.path, head_modified=['staged'], untracked=['untracked'])
    # plain save does not commit untracked content
    # this tests the second issue in #2043
    with chpwd(path):
        save()
    ok_clean_git(ds.path, untracked=['untracked'])
def test_save_message_file(path):
    ds = Dataset(path).rev_create()
    with assert_raises(ValueError):
        ds.save("blah", message="me", message_file="and me")

    create_tree(path, {"foo": "x", "msg": u"add β"})
    ds.add("foo", save=False)
    ds.save(message_file=opj(ds.path, "msg"))
    assert_equal(ds.repo.format_commit("%s"), u"add β")
def test_save_partial_index(path): ds = Dataset(path).create(force=True) ds.add("foo") ok_clean_git(ds.path) ds.unlock(path="foo") create_tree(ds.path, tree={"foo": "a", "staged": ""}, remove_existing=True) ds.repo.add("staged", git=True) ds._save(path="foo") ok_clean_git(ds.path, head_modified=["staged"])
def test_save_partial_index(path): ds = Dataset(path).create(force=True) ds.add("foo") ok_clean_git(ds.path) ds.unlock(path="foo") create_tree(ds.path, tree={"foo": "a", "staged": ""}, remove_existing=True) ds.repo.add("staged", git=True) ds.save(path="foo") ok_clean_git(ds.path, head_modified=["staged"])
def test_placeholders(path): ds = Dataset(path).create(force=True) ds.add(".") ds.run("echo {inputs} >{outputs}", inputs=[".", "*.in"], outputs=["c.out"]) ok_file_has_content(opj(path, "c.out"), "a.in b.in\n") hexsha_before = ds.repo.get_hexsha() ds.rerun() eq_(hexsha_before, ds.repo.get_hexsha()) ds.run("echo {inputs[0]} >getitem", inputs=["*.in"]) ok_file_has_content(opj(path, "getitem"), "a.in\n") ds.run("echo {pwd} >expanded-pwd") ok_file_has_content(opj(path, "expanded-pwd"), path, strip=True) ds.run("echo {dspath} >expanded-dspath") ok_file_has_content(opj(path, "expanded-dspath"), ds.path, strip=True) subdir_path = opj(path, "subdir") with chpwd(subdir_path): run("echo {pwd} >expanded-pwd") ok_file_has_content(opj(path, "subdir", "expanded-pwd"), subdir_path, strip=True) eq_(get_run_info(ds, ds.repo.format_commit("%B"))[1]["pwd"], "subdir") # Double brackets can be used to escape placeholders. ds.run("touch {{inputs}}", inputs=["*.in"]) ok_exists(opj(path, "{inputs}")) # rerun --script expands the placeholders. with patch("sys.stdout", new_callable=StringIO) as cmout: ds.rerun(script="-", since="") script_out = cmout.getvalue() assert_in("echo a.in b.in >c.out", script_out) assert_in("echo {} >expanded-pwd".format(subdir_path), script_out) assert_in("echo {} >expanded-dspath".format(ds.path), script_out) assert_result_count( ds.run("{unknown_placeholder}", on_failure="ignore"), 1, status="impossible", action="run") # Configured placeholders. ds.config.add("datalad.run.substitutions.license", "gpl3", where="local") ds.run("echo {license} >configured-license") ok_file_has_content(opj(path, "configured-license"), "gpl3", strip=True) # --script handles configured placeholders. with patch("sys.stdout", new_callable=StringIO) as cmout: ds.rerun(script="-") assert_in("gpl3", cmout.getvalue())
def test_placeholders(path): ds = Dataset(path).create(force=True) ds.add(".") ds.run("echo {inputs} >{outputs}", inputs=[".", "*.in"], outputs=["c.out"]) ok_file_has_content(opj(path, "c.out"), "a.in b.in\n") hexsha_before = ds.repo.get_hexsha() ds.rerun() eq_(hexsha_before, ds.repo.get_hexsha()) ds.run("echo {inputs[0]} >getitem", inputs=["*.in"]) ok_file_has_content(opj(path, "getitem"), "a.in\n") ds.run("echo {pwd} >expanded-pwd") ok_file_has_content(opj(path, "expanded-pwd"), path, strip=True) ds.run("echo {dspath} >expanded-dspath") ok_file_has_content(opj(path, "expanded-dspath"), ds.path, strip=True) subdir_path = opj(path, "subdir") with chpwd(subdir_path): run("echo {pwd} >expanded-pwd") ok_file_has_content(opj(path, "subdir", "expanded-pwd"), subdir_path, strip=True) eq_(get_run_info(ds, ds.repo.format_commit("%B"))[1]["pwd"], "subdir") # Double brackets can be used to escape placeholders. ds.run("touch {{inputs}}", inputs=["*.in"]) ok_exists(opj(path, "{inputs}")) # rerun --script expands the placeholders. with patch("sys.stdout", new_callable=StringIO) as cmout: ds.rerun(script="-", since="") script_out = cmout.getvalue() assert_in("echo a.in b.in >c.out", script_out) assert_in("echo {} >expanded-pwd".format(subdir_path), script_out) assert_in("echo {} >expanded-dspath".format(ds.path), script_out) assert_result_count(ds.run("{unknown_placeholder}", on_failure="ignore"), 1, status="impossible", action="run") # Configured placeholders. ds.config.add("datalad.run.substitutions.license", "gpl3", where="local") ds.run("echo {license} >configured-license") ok_file_has_content(opj(path, "configured-license"), "gpl3", strip=True) # --script handles configured placeholders. with patch("sys.stdout", new_callable=StringIO) as cmout: ds.rerun(script="-") assert_in("gpl3", cmout.getvalue()) ds.run("echo {tmpdir} >tout") ok_file_has_content(op.join(path, "tout"), ".*datalad-run.*", re_=True)
def test_no_filemeta_with_plaingit(path):
    ds = Dataset(path).create(force=True, no_annex=True)
    ds.add('.')
    ok_clean_git(path)
    res = ds.metadata('probe', add=['test'], on_failure='ignore')
    assert_status('impossible', res)
    assert_result_count(
        res, 1,
        status='impossible',
        message=('non-annex dataset %s has no file metadata support', ds))
def test_save_message_file(path):
    ds = Dataset(path).create()
    with assert_raises(ValueError):
        ds._save("blah", message="me", message_file="and me")

    create_tree(path, {"foo": "x", "msg": u"add β"})
    ds.add("foo", save=False)
    ds._save(message_file=opj(ds.path, "msg"))
    assert_equal(ds.repo.format_commit("%s"), u"add β")
def test_run_explicit(path): ds = Dataset(path) assert_false(ds.repo.file_has_content("test-annex.dat")) create_tree(ds.path, { "dirt_untracked": "untracked", "dirt_modified": "modified" }) ds.add("dirt_modified", to_git=True) with open(opj(path, "dirt_modified"), "a") as ofh: ofh.write(", more") # We need explicit=True to run with dirty repo. assert_status( "impossible", ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], on_failure="ignore")) hexsha_initial = ds.repo.get_hexsha() # If we specify test-annex.dat as an input, it will be retrieved before the # run. ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], explicit=True) ok_(ds.repo.file_has_content("test-annex.dat")) # We didn't commit anything because outputs weren't specified. assert_false(ds.repo.file_has_content("doubled.dat")) eq_(hexsha_initial, ds.repo.get_hexsha()) # If an input doesn't exist, we just show the standard warning. with swallow_logs(new_level=logging.WARN) as cml: with swallow_outputs(): ds.run("ls", inputs=["not-there"], explicit=True) assert_in("Input does not exist: ", cml.out) remove(opj(path, "doubled.dat")) hexsha_initial = ds.repo.get_hexsha() ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], outputs=["doubled.dat"], explicit=True) ok_(ds.repo.file_has_content("doubled.dat")) ok_(ds.repo.is_dirty(path="dirt_modified")) neq_(hexsha_initial, ds.repo.get_hexsha()) # Saving explicit outputs works from subdirectories. subdir = opj(path, "subdir") mkdir(subdir) with chpwd(subdir): run("echo insubdir >foo", explicit=True, outputs=["foo"]) ok_(ds.repo.file_has_content(opj("subdir", "foo")))
def test_rerun_cherry_pick(path): ds = Dataset(path).create() ds.repo.tag("prerun") ds.run('echo abc > runfile') with open(opj(path, "nonrun-file"), "w") as f: f.write("foo") ds.add("nonrun-file") for onto, action in [("HEAD", "skip"), ("prerun", "pick")]: results = ds.rerun(since="prerun", onto=onto) assert_in_results(results, status='ok', rerun_action=action)
def test_update_strategy(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    # we start clean
    for ds in base, sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the base dataset only, nothing below changes
    base.aggregate_metadata()
    eq_(len(_get_contained_objs(base)), 2)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the entire tree, but by default only updates
    # the top-level dataset with all objects, none of the leaf
    # or intermediate datasets gets touched
    base.aggregate_metadata(recursive=True)
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_referenced_objs(base)), 6)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # it is impossible to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'impossible',
            ds.metadata(get_aggregates=True, on_failure='ignore'))
    # get the full metadata report
    target_meta = base.metadata(return_type='list')

    # now redo full aggregation, this time updating all
    # (intermediate) datasets
    base.aggregate_metadata(recursive=True, update_mode='all')
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_contained_objs(sub)), 4)
    eq_(len(_get_contained_objs(subsub)), 2)
    # it is now OK to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'ok',
            ds.metadata(get_aggregates=True, on_failure='ignore'))
    # all of that has no impact on the reported metadata
    eq_(target_meta, base.metadata(return_type='list'))
def test_rerun_cherry_pick(path): ds = Dataset(path).create() ds.repo.tag("prerun") ds.run('echo abc > runfile') with open(opj(path, "nonrun-file"), "w") as f: f.write("foo") ds.add("nonrun-file") for onto, text in [("HEAD", "skipping"), ("prerun", "cherry picking")]: results = ds.rerun(since="prerun", onto=onto) assert_in_results(results, status='ok', path=ds.path) assert any(r.get("message", "").endswith(text) for r in results)
def test_rerun_outofdate_tree(path):
    ds = Dataset(path).create()
    input_file = opj(path, "foo")
    output_file = opj(path, "out")
    with open(input_file, "w") as f:
        f.write("abc\ndef")
    ds.add("foo", to_git=True)
    # Create initial run.
    ds.run('grep def foo > out')
    eq_('def\n', open(output_file).read())
    # Change tree so that it is no longer compatible.
    ds.remove("foo")
    # Now rerunning should fail because foo no longer exists.
    assert_raises(CommandError, ds.rerun, revision="HEAD~")
def test_run_explicit(path): ds = Dataset(path) assert_false(ds.repo.file_has_content("test-annex.dat")) create_tree(ds.path, {"dirt_untracked": "untracked", "dirt_modified": "modified"}) ds.add("dirt_modified", to_git=True) with open(opj(path, "dirt_modified"), "a") as ofh: ofh.write(", more") # We need explicit=True to run with dirty repo. assert_status("impossible", ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], on_failure="ignore")) hexsha_initial = ds.repo.get_hexsha() # If we specify test-annex.dat as an input, it will be retrieved before the # run. ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], explicit=True) ok_(ds.repo.file_has_content("test-annex.dat")) # We didn't commit anything because outputs weren't specified. assert_false(ds.repo.file_has_content("doubled.dat")) eq_(hexsha_initial, ds.repo.get_hexsha()) # If an input doesn't exist, we just show the standard warning. with swallow_logs(new_level=logging.WARN) as cml: ds.run("ls", inputs=["not-there"], explicit=True) assert_in("Input does not exist: ", cml.out) remove(opj(path, "doubled.dat")) hexsha_initial = ds.repo.get_hexsha() ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], outputs=["doubled.dat"], explicit=True) ok_(ds.repo.file_has_content("doubled.dat")) ok_(ds.repo.is_dirty(path="dirt_modified")) neq_(hexsha_initial, ds.repo.get_hexsha()) # Saving explicit outputs works from subdirectories. subdir = opj(path, "subdir") mkdir(subdir) with chpwd(subdir): run("echo insubdir >foo", explicit=True, outputs=["foo"]) ok_(ds.repo.file_has_content(opj("subdir", "foo")))
def test_rerun_branch(path): ds = Dataset(path).create() ds.repo.tag("prerun") outfile = opj(path, "run-file") with swallow_outputs(): ds.run('echo x$(cat run-file) > run-file') ds.rerun() eq_('xx\n', open(outfile).read()) with open(opj(path, "nonrun-file"), "w") as f: f.write("foo") ds.add("nonrun-file") # Rerun the commands on a new branch that starts at the parent # commit of the first run. with swallow_outputs(): ds.rerun(since="prerun", onto="prerun", branch="rerun") eq_(ds.repo.get_active_branch(), "rerun") eq_('xx\n', open(outfile).read()) # NOTE: This test depends on the non-run commit above following a run # commit. Otherwise, all the metadata (e.g., author date) aside from the # parent commit that is used to generate the commit ID may be set when # running the tests, which would result in two commits rather than three. for revrange in ["rerun..master", "master..rerun"]: assert_result_count(ds.repo.repo.git.rev_list(revrange).split(), 3) eq_(ds.repo.get_merge_base(["master", "rerun"]), ds.repo.get_hexsha("prerun")) # Start rerun branch at tip of current branch. ds.repo.checkout("master") ds.rerun(since="prerun", branch="rerun2") eq_(ds.repo.get_active_branch(), "rerun2") eq_('xxxx\n', open(outfile).read()) assert_result_count(ds.repo.repo.git.rev_list("master..rerun2").split(), 2) assert_result_count(ds.repo.repo.git.rev_list("rerun2..master").split(), 0) # Using an existing branch name fails. ds.repo.checkout("master") assert_raises(IncompleteResultsError, ds.rerun, since="prerun", branch="rerun2")
def test_rerun_branch(path): ds = Dataset(path).create() ds.repo.tag("prerun") outfile = opj(path, "run-file") ds.run('echo x$(cat run-file) > run-file') ds.rerun() eq_('xx\n', open(outfile).read()) with open(opj(path, "nonrun-file"), "w") as f: f.write("foo") ds.add("nonrun-file") # Rerun the commands on a new branch that starts at the parent # commit of the first run. ds.rerun(since="prerun", onto="prerun", branch="rerun") eq_(ds.repo.get_active_branch(), "rerun") eq_('xx\n', open(outfile).read()) # NOTE: This test depends on the non-run commit above following a run # commit. Otherwise, all the metadata (e.g., author date) aside from the # parent commit that is used to generate the commit ID may be set when # running the tests, which would result in two commits rather than three. for revrange in ["rerun..master", "master..rerun"]: assert_result_count( ds.repo.repo.git.rev_list(revrange).split(), 3) eq_(ds.repo.get_merge_base(["master", "rerun"]), ds.repo.get_hexsha("prerun")) # Start rerun branch at tip of current branch. ds.repo.checkout("master") ds.rerun(since="prerun", branch="rerun2") eq_(ds.repo.get_active_branch(), "rerun2") eq_('xxxx\n', open(outfile).read()) assert_result_count( ds.repo.repo.git.rev_list("master..rerun2").split(), 2) assert_result_count( ds.repo.repo.git.rev_list("rerun2..master").split(), 0) # Using an existing branch name fails. ds.repo.checkout("master") assert_raises(IncompleteResultsError, ds.rerun, since="prerun", branch="rerun2")
def test_diff_recursive(path):
    ds = Dataset(path).create()
    sub = ds.create('sub')
    # look at the last change, and confirm a dataset was added
    res = ds.diff(revision='HEAD~1..HEAD')
    assert_result_count(
        res, 1, action='diff', state='added', path=sub.path, type='dataset')
    # now recursive
    res = ds.diff(recursive=True, revision='HEAD~1..HEAD')
    # we also get the entire diff of the subdataset from scratch
    assert_status('ok', res)
    ok_(len(res) > 3)
    # one specific test
    assert_result_count(
        res, 1, action='diff', state='added',
        path=opj(sub.path, '.datalad', 'config'))

    # now we add a file to just the parent
    create_tree(
        ds.path,
        {'onefile': 'tobeadded', 'sub': {'twofile': 'tobeadded'}})
    res = ds.diff(recursive=True, report_untracked='all')
    assert_result_count(res, 3)
    assert_result_count(
        res, 1, action='diff', state='untracked',
        path=opj(ds.path, 'onefile'), type='file')
    assert_result_count(
        res, 1, action='diff', state='modified', path=sub.path,
        type='dataset')
    assert_result_count(
        res, 1, action='diff', state='untracked',
        path=opj(sub.path, 'twofile'), type='file')
    # save sub
    sub.add('.')
    # save sub in parent
    ds.save()
    # save addition in parent
    ds.add('.')
    ok_clean_git(ds.path)
    # look at the last change, only one file was added
    res = ds.diff(revision='HEAD~1..HEAD')
    assert_result_count(res, 1)
    assert_result_count(
        res, 1, action='diff', state='added', path=opj(ds.path, 'onefile'),
        type='file')

    # now the exact same thing with recursion, must not be different from the
    # call above
    res = ds.diff(recursive=True, revision='HEAD~1..HEAD')
    assert_result_count(res, 1)
    # last change in parent
    assert_result_count(
        res, 1, action='diff', state='added', path=opj(ds.path, 'onefile'),
        type='file')

    # one further back brings in the modified subdataset, and the added file
    # within it
    res = ds.diff(recursive=True, revision='HEAD~2..HEAD')
    assert_result_count(res, 3)
    assert_result_count(
        res, 1, action='diff', state='added', path=opj(ds.path, 'onefile'),
        type='file')
    assert_result_count(
        res, 1, action='diff', state='added', path=opj(sub.path, 'twofile'),
        type='file')
    assert_result_count(
        res, 1, action='diff', state='modified', path=sub.path,
        type='dataset')
def test_rerun(path, nodspath):
    ds = Dataset(path).create()
    sub = ds.create('sub')
    probe_path = opj(sub.path, 'sequence')
    # run inside the dataset
    with chpwd(path), \
            swallow_outputs():
        ds.run('echo x$(cat sub/sequence) > sub/sequence')
    # command ran once, all clean
    ok_clean_git(ds.path)
    eq_('x\n', open(probe_path).read())
    # now, for a rerun we can be anywhere, PWD and all are recorded
    # moreover, rerun must figure out which bits to unlock, even in
    # subdatasets
    with chpwd(nodspath), \
            swallow_outputs():
        ds.rerun()
    ok_clean_git(ds.path)
    # ran twice now
    eq_('xx\n', open(probe_path).read())

    # Make a non-run commit.
    with open(opj(path, "nonrun-file"), "w") as f:
        f.write("foo")
    ds.add("nonrun-file")
    # Now rerun the buried command.
    ds.rerun(revision="HEAD~", message="rerun buried")
    eq_('xxx\n', open(probe_path).read())
    # Also check that the message override worked.
    eq_(ds.repo.repo.head.commit.message.splitlines()[0],
        "[DATALAD RUNCMD] rerun buried")
    # Or a range of commits, skipping non-run commits.
    ds.rerun(since="HEAD~3")
    eq_('xxxxx\n', open(probe_path).read())
    # Or --since= to run all reachable commits.
    ds.rerun(since="")
    eq_('xxxxxxxxxx\n', open(probe_path).read())

    # If the history to rerun has a merge commit, we abort.
    ds.repo.checkout("HEAD~3", options=["-b", "topic"])
    with open(opj(path, "topic-file"), "w") as f:
        f.write("topic")
    ds.add("topic-file")
    ds.repo.checkout("master")
    ds.repo.merge("topic")
    ok_clean_git(ds.path)
    assert_raises(IncompleteResultsError, ds.rerun)
def test_save(path): ds = Dataset(path) with open(opj(path, "new_file.tst"), "w") as f: f.write("something") ds.repo.add("new_file.tst", git=True) ok_(ds.repo.dirty) ds.save("add a new file", all_changes=False) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) with open(opj(path, "new_file.tst"), "w") as f: f.write("modify") ok_(ds.repo.dirty) ds.save("modified new_file.tst", all_changes=True) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) files = ['one.txt', 'two.txt'] for fn in files: with open(opj(path, fn), "w") as f: f.write(fn) ds.add([opj(path, f) for f in files]) # superfluous call to save (add saved it already), should not fail # but report that nothing was saved assert_false(ds.save("set of new files")) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) # create subdataset subds = ds.create('subds') ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) # modify subds with open(opj(subds.path, "some_file.tst"), "w") as f: f.write("something") subds.add('.') ok_clean_git(subds.path, annex=isinstance(ds.repo, AnnexRepo)) ok_(ds.repo.dirty) # ensure modified subds is committed ds.save(all_changes=True) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
def test_new_or_modified(path):
    def get_new_or_modified(*args, **kwargs):
        return [relpath(ap["path"], path)
                for ap in new_or_modified(diff_revision(*args, **kwargs))]

    ds = Dataset(path).create(force=True, no_annex=True)

    # Check out an orphan branch so that we can test the "one commit
    # in a repo" case.
    ds.repo.checkout("orph", options=["--orphan"])
    ds.add(".")
    assert_false(ds.repo.dirty)
    assert_result_count(ds.repo.repo.git.rev_list("HEAD").split(), 1)
    # Diffing doesn't fail when the branch contains a single commit.
    assert_in("to_modify", get_new_or_modified(ds, "HEAD"))

    # New files are detected, deletions are not.
    ds.repo.remove(["to_remove"])
    ok_(ds.repo.dirty)

    with open(opj(path, "to_add"), "w") as f:
        f.write("content5")
    ds.repo.add(["to_add"])
    ds.repo.commit("add one, remove another")

    eq_(get_new_or_modified(ds, "HEAD"), ["to_add"])

    # Modifications are detected.
    with open(opj(path, "to_modify"), "w") as f:
        f.write("updated 1")
    with open(opj(path, "d/to_modify"), "w") as f:
        f.write("updated 2")
    ds.add(["to_modify", "d/to_modify"])

    eq_(set(get_new_or_modified(ds, "HEAD")),
        {"to_modify", "d/to_modify"})

    # Non-HEAD revisions work.
    ds.repo.commit("empty", options=["--allow-empty"])
    assert_false(get_new_or_modified(ds, "HEAD"))
    eq_(set(get_new_or_modified(ds, "HEAD~")),
        {"to_modify", "d/to_modify"})
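# For reference: new_or_modified(), consumed by the helper above, is a small
# filter over diff results. This is a plausible sketch that matches how the
# test uses it, not necessarily the library's exact implementation:
def new_or_modified(diff_results):
    # keep only file records that were added or modified; deletions and
    # dataset records are intentionally dropped
    for res in diff_results:
        if res.get('type') == 'file' \
                and res.get('state') in ('added', 'modified'):
            yield res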
def test_save(path): ds = Dataset(path) with open(opj(path, "new_file.tst"), "w") as f: f.write("something") ds.repo.add("new_file.tst", git=True) ok_(ds.repo.dirty) ds.save("add a new file", all_changes=False) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) with open(opj(path, "new_file.tst"), "w") as f: f.write("modify") ok_(ds.repo.dirty) ds.save("modified new_file.tst", all_changes=True) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) files = ['one.txt', 'two.txt'] for fn in files: with open(opj(path, fn), "w") as f: f.write(fn) ds.add([opj(path, f) for f in files]) # superfluous call to save (add saved it already), should not fail # but report that nothing was saved assert_false(ds.save("set of new files")) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) # create subdataset subds = ds.create('subds') ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) # modify subds with open(opj(subds.path, "some_file.tst"), "w") as f: f.write("something") subds.add('.') ok_clean_git(subds.path, annex=isinstance(ds.repo, AnnexRepo)) ok_(ds.repo.dirty) # ensure modified subds is committed ds.save(all_changes=True) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
def test_nested_metadata(path):
    ds = Dataset(path).create(force=True)
    ds.add('.')
    ds.aggregate_metadata()
    # BIDS returns participant info as a nested dict for each file in the
    # content metadata. On the dataset-level this should automatically
    # yield a sequence of participant info dicts, without any further action
    # or BIDS-specific configuration
    meta = ds.metadata(
        '.', reporton='datasets', return_type='item-or-list')['metadata']
    for i in zip(
            sorted(
                meta['datalad_unique_content_properties']['bids']['subject'],
                key=lambda x: x['id']),
            sorted([
                {
                    "age(years)": "20-25",
                    "id": "03",
                    "gender": "female",
                    "handedness": "r",
                    "hearing_problems_current": "n",
                    "language": "english"
                },
                {
                    "age(years)": "30-35",
                    "id": "01",
                    "gender": 'n/a',
                    "handedness": "r",
                    "hearing_problems_current": "n",
                    "language": u"русский"
                }],
                key=lambda x: x['id'])):
        assert_dict_equal(i[0], i[1])

    # we can turn off this kind of auto-summary
    ds.config.add(
        'datalad.metadata.generate-unique-bids', 'false', where='dataset')
    ds.aggregate_metadata()
    meta = ds.metadata(
        '.', reporton='datasets', return_type='item-or-list')['metadata']
    # protect next test a little, in case we enhance our core extractor in
    # the future to provide more info
    if 'datalad_unique_content_properties' in meta:
        assert_not_in('bids', meta['datalad_unique_content_properties'])
def test_placeholders(path): ds = Dataset(path).create(force=True) ds.add(".") ds.run("echo {inputs} >{outputs}", inputs=[".", "*.in"], outputs=["c.out"]) ok_file_has_content(opj(path, "c.out"), "a.in b.in\n") hexsha_before = ds.repo.get_hexsha() ds.rerun() eq_(hexsha_before, ds.repo.get_hexsha()) ds.run("echo {inputs[0]} >getitem", inputs=["*.in"]) ok_file_has_content(opj(path, "getitem"), "a.in\n") ds.run("echo {pwd} >expanded-pwd") ok_file_has_content(opj(path, "expanded-pwd"), path, strip=True) ds.run("echo {dspath} >expanded-dspath") ok_file_has_content(opj(path, "expanded-dspath"), ds.path, strip=True) subdir_path = opj(path, "subdir") with chpwd(subdir_path): run("echo {pwd} >expanded-pwd") ok_file_has_content(opj(path, "subdir", "expanded-pwd"), subdir_path, strip=True) eq_(get_run_info(ds, ds.repo.repo.head.commit.message)[1]["pwd"], "subdir") # Double brackets can be used to escape placeholders. ds.run("touch {{inputs}}", inputs=["*.in"]) ok_exists(opj(path, "{inputs}")) # rerun --script expands the placeholders. with patch("sys.stdout", new_callable=StringIO) as cmout: ds.rerun(script="-", since="") script_out = cmout.getvalue() assert_in("echo a.in b.in >c.out", script_out) assert_in("echo {} >expanded-pwd".format(subdir_path), script_out) assert_in("echo {} >expanded-dspath".format(ds.path), script_out)
def test_inputs_quotes_needed(path):
    ds = Dataset(path).create(force=True)
    ds.add(".")
    cmd = "import sys; open(sys.argv[-1], 'w').write('!'.join(sys.argv[1:]))"
    # The string form of a command works fine when the inputs/outputs have
    # spaces ...
    cmd_str = "{} -c \"{}\" {{inputs}} {{outputs[0]}}".format(
        sys.executable, cmd)
    ds.run(cmd_str, inputs=["*.t*"], outputs=["out0"], expand="inputs")
    expected = u"!".join(
        list(sorted([OBSCURE_FILENAME + u".t", "bar.txt", "foo blah.txt"])) +
        ["out0"])
    with open(op.join(path, "out0")) as ifh:
        eq_(assure_unicode(ifh.read()), expected)
    # ... but the list form of a command does not. (Don't test this failure
    # with the obscure file name because we'd need to know its composition to
    # predict the failure.)
    cmd_list = [sys.executable, "-c", cmd, "{inputs}", "{outputs[0]}"]
    ds.run(cmd_list, inputs=["*.txt"], outputs=["out0"])
    ok_file_has_content(opj(path, "out0"), "bar.txt foo!blah.txt!out0")
def test_partial_aggregation(path):
    ds = Dataset(path).create(force=True)
    sub1 = ds.create('sub1', force=True)
    sub2 = ds.create('sub2', force=True)
    ds.add('.', recursive=True)

    # if we aggregate a path(s) and say to recurse, we must not recurse into
    # the dataset itself and aggregate others
    ds.aggregate_metadata(path='sub1', recursive=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 1, path=ds.path)
    assert_result_count(res, 1, path=sub1.path)
    # so no metadata aggregates for sub2 yet
    assert_result_count(res, 0, path=sub2.path)

    ds.aggregate_metadata(recursive=True)
    # baseline, recursive aggregation gets us something for all three datasets
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # now let's do partial aggregation from just one subdataset
    # we should not lose information on the other datasets,
    # as this would be a problem any time anything in a dataset
    # subtree is missing: not installed, too expensive to reaggregate, ...
    ds.aggregate_metadata(path='sub1', incremental=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # from-scratch aggregation kills datasets that were not listed
    ds.aggregate_metadata(path='sub1', incremental=False)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=sub2.path)
    # now reaggregate in full
    ds.aggregate_metadata(recursive=True)
    # make change in sub1
    sub1.unlock('here')
    with open(opj(sub1.path, 'here'), 'w') as f:
        f.write('fresh')
    ds.save(recursive=True)
    ok_clean_git(path)
def test_run_failure(path): ds = Dataset(path).create() subds = ds.create("sub") hexsha_initial = ds.repo.get_hexsha() with swallow_outputs(): with assert_raises(CommandError): ds.run("echo x$(cat sub/grows) > sub/grows && false") eq_(hexsha_initial, ds.repo.get_hexsha()) ok_(ds.repo.dirty) msgfile = opj(path, ds.repo.get_git_dir(ds.repo), "COMMIT_EDITMSG") ok_exists(msgfile) ds.add(".", recursive=True, message_file=msgfile) ok_clean_git(ds.path) neq_(hexsha_initial, ds.repo.get_hexsha()) outfile = opj(subds.path, "grows") eq_('x\n', open(outfile).read()) # There is no CommandError on rerun if the non-zero error matches the # original code. ds.rerun() eq_('xx\n', open(outfile).read()) # On the other hand, we fail if we rerun a command and there is a non-zero # error that doesn't match. ds.run("[ ! -e bar ] && echo c >bar") ok_clean_git(ds.path) with assert_raises(CommandError): ds.rerun() # We don't show instructions if the caller specified us not to save. remove(msgfile) with assert_raises(CommandError): ds.run("false", explicit=True, outputs=None) assert_false(op.exists(msgfile))
def test_run_failure(path): ds = Dataset(path).create() subds = ds.create("sub") hexsha_initial = ds.repo.get_hexsha() with assert_raises(CommandError): ds.run("echo x$(cat sub/grows) > sub/grows && false") eq_(hexsha_initial, ds.repo.get_hexsha()) ok_(ds.repo.dirty) msgfile = opj(path, ds.repo.get_git_dir(ds.repo), "COMMIT_EDITMSG") ok_exists(msgfile) ds.add(".", recursive=True, message_file=msgfile) ok_clean_git(ds.path) neq_(hexsha_initial, ds.repo.get_hexsha()) outfile = opj(subds.path, "grows") eq_('x\n', open(outfile).read()) # There is no CommandError on rerun if the non-zero error matches the # original code. ds.rerun() eq_('xx\n', open(outfile).read()) # On the other hand, we fail if we rerun a command and there is a non-zero # error that doesn't match. ds.run("[ ! -e bar ] && echo c >bar") ok_clean_git(ds.path) with assert_raises(CommandError): ds.rerun() # We don't show instructions if the caller specified us not to save. remove(msgfile) with assert_raises(CommandError): ds.run("false", explicit=True, outputs=None) assert_false(op.exists(msgfile))
def test_gh2043p1(path):
    # this test documents the interim agreement on what should happen
    # in the case documented in gh-2043
    ds = Dataset(path).create(force=True)
    ds.add('1')
    ok_clean_git(ds.path, untracked=['2', '3'])
    ds.unlock('1')
    ok_clean_git(ds.path, index_modified=['1'], untracked=['2', '3'])
    # save(.) should recommit unlocked file, and not touch anything else
    # this tests the second issue in #2043
    with chpwd(path):
        # only save modified bits by default
        save('.')  # because the first arg is the dataset
    # state of the file (unlocked/locked) is committed as well, and the
    # test doesn't lock the file again
    skip_v6_or_later(method='pass')(ok_clean_git)(ds.path,
                                                  untracked=['2', '3'])
    with chpwd(path):
        # but when a path is given, anything that matches this path,
        # untracked or not, is added/saved
        save(path='.')
    # state of the file (unlocked/locked) is committed as well, and the
    # test doesn't lock the file again
    skip_v6_or_later(method='pass')(ok_clean_git)(ds.path)
def test_bf1886(path):
    parent = Dataset(path).create()
    sub = parent.create('sub')
    ok_clean_git(parent.path)
    # create a symlink pointing down to the subdataset, and add it
    os.symlink('sub', opj(parent.path, 'down'))
    parent.add('down')
    ok_clean_git(parent.path)
    # now symlink pointing up
    os.makedirs(opj(parent.path, 'subdir', 'subsubdir'))
    os.symlink(opj(pardir, 'sub'), opj(parent.path, 'subdir', 'up'))
    parent.add(opj('subdir', 'up'))
    ok_clean_git(parent.path)
    # now symlink pointing 2xup, as in #1886
    os.symlink(
        opj(pardir, pardir, 'sub'),
        opj(parent.path, 'subdir', 'subsubdir', 'upup'))
    parent.add(opj('subdir', 'subsubdir', 'upup'))
    ok_clean_git(parent.path)
    # simultaneously add a subds and a symlink pointing to it
    # create subds, but don't register it
    sub2 = create(opj(parent.path, 'sub2'))
    os.symlink(
        opj(pardir, pardir, 'sub2'),
        opj(parent.path, 'subdir', 'subsubdir', 'upup2'))
    parent.add(['sub2', opj('subdir', 'subsubdir', 'upup2')])
    ok_clean_git(parent.path)
    # full replication of #1886: the above, but from within a subdir of the
    # symlink, with no reference dataset
    sub3 = create(opj(parent.path, 'sub3'))
    os.symlink(
        opj(pardir, pardir, 'sub3'),
        opj(parent.path, 'subdir', 'subsubdir', 'upup3'))
    # need to use absolute paths
    with chpwd(opj(parent.path, 'subdir', 'subsubdir')):
        add([opj(parent.path, 'sub3'),
             opj(parent.path, 'subdir', 'subsubdir', 'upup3')])
    # here is where we need to disagree with the repo in #1886
    # we would not expect that `add` registers sub3 as a subdataset
    # of parent, because no reference dataset was given and the
    # command cannot decide (with the current semantics) whether
    # it should "add anything in sub3 to sub3" or "add sub3 to whatever
    # sub3 is in"
    ok_clean_git(parent.path, untracked=['sub3/'])
def test_bf1886(path):
    parent = Dataset(path).create()
    sub = parent.create('sub')
    ok_clean_git(parent.path)
    # create a symlink pointing down to the subdataset, and add it
    os.symlink('sub', opj(parent.path, 'down'))
    parent.add('down')
    ok_clean_git(parent.path)
    # now a symlink pointing up
    os.makedirs(opj(parent.path, 'subdir', 'subsubdir'))
    os.symlink(opj(pardir, 'sub'), opj(parent.path, 'subdir', 'up'))
    parent.add(opj('subdir', 'up'))
    ok_clean_git(parent.path)
    # now a symlink pointing 2x up, as in #1886
    os.symlink(opj(pardir, pardir, 'sub'),
               opj(parent.path, 'subdir', 'subsubdir', 'upup'))
    parent.add(opj('subdir', 'subsubdir', 'upup'))
    ok_clean_git(parent.path)
    # simultaneously add a subds and a symlink pointing to it
    # create subds, but don't register it
    sub2 = create(opj(parent.path, 'sub2'))
    os.symlink(
        opj(pardir, pardir, 'sub2'),
        opj(parent.path, 'subdir', 'subsubdir', 'upup2'))
    parent.add(['sub2', opj('subdir', 'subsubdir', 'upup2')])
    ok_clean_git(parent.path)
    # full replication of #1886: the above, but from within a subdir of the
    # symlink, with no reference dataset
    sub3 = create(opj(parent.path, 'sub3'))
    os.symlink(
        opj(pardir, pardir, 'sub3'),
        opj(parent.path, 'subdir', 'subsubdir', 'upup3'))
    # need to use absolute paths
    with chpwd(opj(parent.path, 'subdir', 'subsubdir')):
        rev_save([opj(parent.path, 'sub3'),
                  opj(parent.path, 'subdir', 'subsubdir', 'upup3')])
    # in contrast to `add`, this only operates on a single top-level
    # dataset; although it is not specified, it gets discovered based on
    # the PWD -- the logic behind that feels a bit shaky;
    # consult the discussion in https://github.com/datalad/datalad/issues/3230
    # if this comes up as an issue at some point
    ok_clean_git(parent.path)
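# A guess at the mechanics behind that PWD-based discovery (not datalad's
# actual code): walk upwards from the working directory until a dataset
# marker is found, and treat that as the single top-level dataset to
# operate on.
def discover_dataset_root(pwd):
    path = op.abspath(pwd)
    while True:
        if op.exists(op.join(path, '.datalad')) or \
                op.exists(op.join(path, '.git')):
            return path
        parent = op.dirname(path)
        if parent == path:  # hit the filesystem root without finding one
            return None
        path = parent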
def test_save(path):
    ds = Dataset(path)

    with open(opj(path, "new_file.tst"), "w") as f:
        f.write("something")

    ds.repo.add("new_file.tst", git=True)
    ok_(ds.repo.dirty)

    ds._save("add a new file")
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))

    with open(opj(path, "new_file.tst"), "w") as f:
        f.write("modify")

    ok_(ds.repo.dirty)
    ds._save("modified new_file.tst")
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))

    # save works without an explicit dataset or files, when run from the
    # dataset's PWD
    with open(opj(path, "new_file.tst"), "w") as f:
        f.write("rapunzel")
    with chpwd(path):
        save("love rapunzel")
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))

    # and also without `-a` when things are staged
    with open(opj(path, "new_file.tst"), "w") as f:
        f.write("exotic")
    ds.repo.add("new_file.tst", git=True)
    with chpwd(path):
        save("love marsians")
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))

    files = ['one.txt', 'two.txt']
    for fn in files:
        with open(opj(path, fn), "w") as f:
            f.write(fn)

    ds.add([opj(path, f) for f in files])
    # superfluous call to save (add saved it already), should not fail
    # but report that nothing was saved
    assert_status('notneeded', ds._save("set of new files"))
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))

    # create subdataset
    subds = ds.create('subds')
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
    # modify subds
    with open(opj(subds.path, "some_file.tst"), "w") as f:
        f.write("something")
    subds.add('.')
    ok_clean_git(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    # Note/TODO: ok_clean_git is failing in direct mode, due to staged but
    # uncommitted .datalad (probably caused within create)
    ok_(ds.repo.dirty)
    # ensure modified subds is committed
    ds._save()
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))

    # now introduce a change downstairs
    subds.create('someotherds')
    ok_clean_git(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    ok_(ds.repo.dirty)
    # and save via subdataset path
    ds._save('subds')
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
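# Distilled usage sketch of the behavior exercised above; the helper name
# and `workdir` are hypothetical.
def demo_save_workflow(workdir):
    ds = Dataset(workdir).create()
    with open(opj(ds.path, 'note.txt'), 'w') as f:
        f.write('draft')
    ds.add('note.txt')                      # add() saves by default
    assert_status('notneeded', ds._save())  # so a follow-up save is a no-op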
def test_gh3421(path):
    # failed to add d/sub:file
    ds = Dataset(path).create(force=True)
    ds.add('top:file')
    ds.add(opj('d', 'sub:file'))
    ok_clean_git(ds.path)
def test_recursive_save(path):
    ds = Dataset(path).create()
    # nothing to save
    assert_status('notneeded', ds._save())
    subds = ds.create('sub')
    # subdataset presence already saved
    ok_clean_git(ds.path)
    subsubds = subds.create('subsub')
    assert_equal(
        ds.subdatasets(recursive=True, fulfilled=True, result_xfm='paths'),
        [subds.path, subsubds.path])
    newfile_name = opj(subsubds.path, 'test')
    with open(newfile_name, 'w') as f:
        f.write('some')
    # saves the status change of the subdataset due to the subsubdataset
    # addition
    assert_result_values_equal(
        ds._save(result_filter=is_ok_dataset),
        'path',
        [ds.path])

    # make the new file known to its dataset
    ds.add(newfile_name, save=False)

    # but remains dirty because of the uncommitted file down below
    assert ds.repo.dirty
    # auto-add will save nothing deep down without recursive
    assert_status('notneeded', ds._save())
    assert ds.repo.dirty
    # with recursive pick up the change in subsubds
    assert_result_values_equal(
        ds._save(recursive=True, result_filter=is_ok_dataset),
        'path',
        [subsubds.path, subds.path, ds.path])

    # at this point the entire tree is clean
    ok_clean_git(ds.path)
    states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    # now we save recursively, nothing should happen
    res = ds._save(recursive=True)
    # we do not get any report from a subdataset, because we detect at the
    # very top that the entire tree is clean
    assert_result_count(res, 1)
    assert_result_count(res, 1, status='notneeded', action='save',
                        path=ds.path)
    # now we introduce new files all the way down
    create_tree(subsubds.path, {"mike1": 'mike1'})
    # because we cannot say from the top if there is anything to do down
    # below, we have to traverse and we will get reports for all datasets,
    # but there is nothing actually saved
    res = ds._save(recursive=True)
    assert_result_count(res, 3)
    assert_status('notneeded', res)
    subsubds_indexed = subsubds.repo.get_indexed_files()
    assert_not_in('mike1', subsubds_indexed)
    assert_equal(states,
                 [d.repo.get_hexsha() for d in (ds, subds, subsubds)])
    unlink(opj(subsubds.path, 'mike1'))
    ok_clean_git(ds.path)

    # modify content in subsub and try saving
    testfname = newfile_name
    subsubds.unlock(testfname)
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')

    # the following should all do nothing
    # no auto_add
    assert_status('notneeded', ds._save())
    # no recursive
    assert_status('notneeded', ds._save())
    # an explicit target saves only the corresponding dataset
    assert_result_values_equal(
        save(path=[testfname]),
        'path',
        [subsubds.path])
    # plain recursive without any files given will save the beast
    assert_result_values_equal(
        ds._save(recursive=True, result_filter=is_ok_dataset),
        'path',
        [subds.path, ds.path])
    # there is nothing else to save
    assert_status('notneeded', ds._save(recursive=True))
    ok_clean_git(ds.path)

    # one more time, and check that all datasets in the hierarchy are not
    # contaminated with untracked files
    states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    testfname = opj('sub', 'subsub', 'saveme2')
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')
    assert_status('notneeded', ds._save(recursive=True))
    newstates = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    for old, new in zip(states, newstates):
        assert_equal(old, new)
    assert ds.repo.dirty
    unlink(opj(ds.path, testfname))
    ok_clean_git(ds.path)

    # now let's check saving "upwards"
    create_tree(subds.path, {"testnew": 'smth', "testadded": "added"})
    subds.repo.add("testadded")
    indexed_files = subds.repo.get_indexed_files()
    assert subds.repo.dirty
    assert ds.repo.dirty
    assert not subsubds.repo.dirty

    create_tree(subsubds.path, {"testnew2": 'smth'})
    assert subsubds.repo.dirty
    # and indexed files didn't change
    assert_equal(indexed_files, subds.repo.get_indexed_files())
    ok_clean_git(subds.repo, untracked=['testnew'],
                 index_modified=['subsub'],
                 head_modified=['testadded'])

    old_states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    subsubds._save(message="savingtestmessage", super_datasets=True)
    # this save actually didn't save anything in subsub (or anywhere),
    # because there were only untracked bits pending
    for old, new in zip(old_states,
                        [d.repo.get_hexsha()
                         for d in (ds, subds, subsubds)]):
        assert_equal(old, new)

    # but now we are saving this untracked bit specifically
    subsubds._save(message="savingtestmessage",
                   path=['testnew2'],
                   super_datasets=True)
    ok_clean_git(subsubds.repo)
    # but its super should have got only the subsub saved,
    # not the file we created
    ok_clean_git(subds.repo, untracked=['testnew'],
                 head_modified=['testadded'])

    # check that commits have the correct messages;
    # there are no dedicated superdataset-save commits anymore, because
    # superdatasets get saved as part of the processed hierarchy and the
    # commit can contain other parts (if so instructed)
    assert_equal(
        next(subsubds.repo.get_branch_commits('master')).message.rstrip(),
        'savingtestmessage')
    assert_equal(
        next(subds.repo.get_branch_commits('master')).message.rstrip(),
        'savingtestmessage')
    assert_equal(
        next(ds.repo.get_branch_commits('master')).message.rstrip(),
        'savingtestmessage')

    # and if we try to save while being within that subsubds path
    subsubds.unlock('testnew2')
    create_tree(subsubds.path, {"testnew2": 'smth2'})

    # trying to replicate https://github.com/datalad/datalad/issues/1540
    subsubds._save(message="saving new changes", all_updated=True)  # no super

    with chpwd(subds.path):
        # no explicit dataset is provided, but a path is
        save(path=['subsub'], message='saving sub', super_datasets=True)
    # super should get it saved too
    assert_equal(
        next(ds.repo.get_branch_commits('master')).message.rstrip(),
        'saving sub')
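# Conceptual sketch (an assumption about the mechanics, not datalad's actual
# implementation) of the "upwards" saving exercised above with
# super_datasets=True: after committing in a subdataset, every superdataset
# needs a commit of its own to record the moved submodule pointer, which is
# why all three datasets end up with the same commit message.
def save_upwards(ds, message):
    ds._save(message=message)            # commit in the leaf dataset first
    superds = ds.get_superdataset()
    while superds is not None:
        superds._save(message=message)   # record the updated subdataset state
        superds = superds.get_superdataset()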
def test_ls_json(topdir):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    open(opj(ds.path, 'subdsfile.txt'), 'w').write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    # commit to git to init git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'), commit=True)
    # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir', 'subgit'), commit=True)
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    # broken-link
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    for all_ in [True, False]:
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                with swallow_logs(), swallow_outputs():
                    ds = _ls_json(topdir, json=state, all_=all_,
                                  recursive=recursive)

                # the subdataset should have its json created and deleted
                # only when recursive=True, else not
                subds_metahash = get_metahash('/')
                subds_metapath = opj(topdir, 'subds', meta_dir,
                                     subds_metahash)
                assert_equal(exists(subds_metapath),
                             (state == 'file' and recursive))

                # root should have its json file created and deleted in all
                # cases
                ds_metahash = get_metahash('/')
                ds_metapath = opj(meta_path, ds_metahash)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata jsons created and
                # deleted only when all_=True
                child_metahash = get_metahash('dir', 'subdir')
                child_metapath = opj(meta_path, child_metahash)
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in
                # any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    child_metahash = get_metahash(*subdir)
                    assert_equal(exists(opj(meta_path, child_metahash)),
                                 False)

                # check if it's updated in its nodes sublist too; used by
                # web-ui json (regression test)
                assert_equal(ds['nodes'][0]['size']['total'],
                             ds['size']['total'])

                # check size of subdataset
                # (note: the `or` short-circuits to 'subdsfile.txt')
                subds = [item for item in ds['nodes']
                         if item['name'] == ('subdsfile.txt' or 'subds')][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # run non-recursive dataset traversal after subdataset
                # metadata has already been created, to verify sub-dataset
                # metadata being picked up from its metadata file in such
                # cases
                if state == 'file' and recursive and not all_:
                    ds = _ls_json(topdir, json='file', all_=False)
                    subds = [item for item in ds['nodes']
                             if item['name'] == ('subdsfile.txt' or 'subds')][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
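# Worked example of the metadata-file naming scheme used above: a node's
# (relative) path is md5-hashed to form its json filename under
# .git/datalad/metadata -- the root node hashes '/', children hash their
# subpath.
import hashlib

root_json_name = hashlib.md5('/'.encode('utf-8')).hexdigest()
child_json_name = hashlib.md5(opj('dir', 'subdir').encode('utf-8')).hexdigest()
# each _ls_json(..., json='file') run creates <meta_path>/<name>, and
# json='delete' removes it again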
def test_procedure_discovery(path, super_path):
    ps = run_procedure(discover=True)
    # there are a few procedures coming with datalad; discovery needs to
    # find them
    assert_true(len(ps) > 2)
    # we get three essential properties
    eq_(
        sum(['procedure_type' in p and
             'procedure_callfmt' in p and
             'path' in p
             for p in ps]),
        len(ps))

    # set up dataset with registered procedure (c&p from test_basics):
    ds = Dataset(path).create(force=True)
    ds.run_procedure('setup_yoda_dataset')
    ok_clean_git(ds.path)
    # configure dataset to look for procedures in its code folder
    ds.config.add(
        'datalad.locations.dataset-procedures',
        'code',
        where='dataset')
    # configure dataset to run the demo procedure prior to the clean command
    ds.config.add(
        'datalad.clean.proc-pre',
        'datalad_test_proc',
        where='dataset')
    ds.add(op.join('.datalad', 'config'))

    # run discovery on the dataset:
    ps = ds.run_procedure(discover=True)

    # still needs to find procedures coming with datalad
    assert_true(len(ps) > 2)
    # we get three essential properties
    eq_(
        sum(['procedure_type' in p and
             'procedure_callfmt' in p and
             'path' in p
             for p in ps]),
        len(ps))
    # dataset's procedure needs to be in the results
    assert_in_results(ps,
                      path=op.join(ds.path, 'code', 'datalad_test_proc.py'))

    # make it a subdataset and try again:
    super = Dataset(super_path).create()
    super.install('sub', source=ds.path)

    ps = super.run_procedure(discover=True)
    # still needs to find procedures coming with datalad
    assert_true(len(ps) > 2)
    # we get three essential properties
    eq_(
        sum(['procedure_type' in p and
             'procedure_callfmt' in p and
             'path' in p
             for p in ps]),
        len(ps))
    # dataset's procedure needs to be in the results
    assert_in_results(ps,
                      path=op.join(super.path, 'sub', 'code',
                                   'datalad_test_proc.py'))

    if not on_windows:  # no symlinks
        import os
        # create a procedure which is a broken symlink, but recognizable as
        # a python script:
        os.symlink(op.join(super.path, 'sub', 'not_existent'),
                   op.join(super.path, 'sub', 'code',
                           'broken_link_proc.py'))
        # broken symlink at a procedure location, but we can't tell whether
        # it is an actual procedure without any guess on how to execute it:
        os.symlink(op.join(super.path, 'sub', 'not_existent'),
                   op.join(super.path, 'sub', 'code',
                           'unknwon_broken_link'))
        ps = super.run_procedure(discover=True)
        # still needs to find procedures coming with datalad and the dataset
        # procedure registered before
        assert_true(len(ps) > 3)
        assert_in_results(ps,
                          path=op.join(super.path, 'sub', 'code',
                                       'broken_link_proc.py'),
                          state='absent')
        assert_not_in_results(ps,
                              path=op.join(super.path, 'sub', 'code',
                                           'unknwon_broken_link'))
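# Usage sketch with a hypothetical helper: act on discovery results instead
# of invoking a procedure blindly; it relies only on the 'path' property
# that the test above asserts for every discovery result.
def run_if_discovered(ds, name):
    ps = ds.run_procedure(discover=True)
    known = {op.splitext(op.basename(p['path']))[0] for p in ps}
    if name in known:
        ds.run_procedure(name)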