def test_get_content_info_dotgit(path=None):
    ds = Dataset(path).create()
    # Files in .git/ won't be reported, though this takes a kludge on our side
    # before Git 2.25.
    assert_false(ds.repo.get_content_info(paths=[op.join(".git", "config")]))
def test_save_obscure_name(path):
    ds = Dataset(path).create(force=True)
    fname = OBSCURE_FILENAME
    # Just check that we don't fail with a unicode error.
    with swallow_outputs():
        ds.save(path=fname, result_renderer="default")
def eval_func(wrapped, instance, args, kwargs): lgr.log(2, "Entered eval_func for %s", func) # for result filters # we need to produce a dict with argname/argvalue pairs for all args # incl. defaults and args given as positionals allkwargs = get_allargs_as_kwargs(wrapped, args, kwargs) # determine the command class associated with `wrapped` wrapped_class = get_wrapped_class(wrapped) # retrieve common options from kwargs, and fall back on the command # class attributes, or general defaults if needed kwargs = kwargs.copy() # we will pop, which might cause side-effect common_params = { p_name: kwargs.pop( # go with any explicitly given default p_name, # otherwise determine the command class and pull any # default set in that class getattr( wrapped_class, p_name, # or the common default eval_defaults[p_name])) for p_name in eval_params } # short cuts and configured setup for common options return_type = common_params['return_type'] result_filter = get_result_filter(common_params['result_filter']) # resolve string labels for transformers too result_xfm = known_result_xfms.get( common_params['result_xfm'], # use verbatim, if not a known label common_params['result_xfm']) result_renderer = common_params['result_renderer'] # TODO remove this conditional branch entirely, done outside if not result_renderer: result_renderer = dlcfg.get('datalad.api.result-renderer', None) # look for potential override of logging behavior result_log_level = dlcfg.get('datalad.log.result-level', None) # query cfg for defaults # .is_installed and .config can be costly, so ensure we do # it only once. See https://github.com/datalad/datalad/issues/3575 dataset_arg = allkwargs.get('dataset', None) from datalad.distribution.dataset import Dataset ds = dataset_arg if isinstance(dataset_arg, Dataset) \ else Dataset(dataset_arg) if dataset_arg else None # do not reuse a dataset's existing config manager here # they are configured to read the committed dataset configuration # too. 
That means a datalad update can silently bring in new # procedure definitions from the outside, and in some sense enable # remote code execution by a 3rd-party # To avoid that, create a new config manager that only reads local # config (system and .git/config), plus any overrides given to this # datalad session proc_cfg = ConfigManager( ds, source='local', overrides=dlcfg.overrides) if ds and ds.is_installed() else dlcfg # look for hooks hooks = get_jsonhooks_from_config(proc_cfg) # this internal helper function actually drives the command # generator-style, it may generate an exception if desired, # on incomplete results def generator_func(*_args, **_kwargs): # flag whether to raise an exception incomplete_results = [] # track what actions were performed how many times action_summary = {} # if a custom summary is to be provided, collect the results # of the command execution results = [] do_custom_result_summary = result_renderer in ('tailored', 'default') \ and hasattr(wrapped_class, 'custom_result_summary_renderer') # process main results for r in _process_results( # execution wrapped(*_args, **_kwargs), wrapped_class, common_params['on_failure'], # bookkeeping action_summary, incomplete_results, # communication result_renderer, result_log_level, # let renderers get to see how a command was called allkwargs): for hook, spec in hooks.items(): # run the hooks before we yield the result # this ensures that they are executed before # a potentially wrapper command gets to act # on them if match_jsonhook2result(hook, r, spec['match']): lgr.debug('Result %s matches hook %s', r, hook) # a hook is also a command that yields results # so yield them outside too # users need to pay attention to void infinite # loops, i.e. when a hook yields a result that # triggers that same hook again for hr in run_jsonhook(hook, spec, r, dataset_arg): # apply same logic as for main results, otherwise # any filters would only tackle the primary results # and a mixture of return values could happen if not keep_result(hr, result_filter, **allkwargs): continue hr = xfm_result(hr, result_xfm) # rationale for conditional is a few lines down if hr: yield hr if not keep_result(r, result_filter, **allkwargs): continue r = xfm_result(r, result_xfm) # in case the result_xfm decided to not give us anything # exclude it from the results. There is no particular reason # to do so other than that it was established behavior when # this comment was written. 
This will not affect any real # result record if r: yield r # collect if summary is desired if do_custom_result_summary: results.append(r) # result summary before a potential exception # custom first if do_custom_result_summary: wrapped_class.custom_result_summary_renderer(results) elif result_renderer == 'default' and action_summary and \ sum(sum(s.values()) for s in action_summary.values()) > 1: # give a summary in default mode, when there was more than one # action performed ui.message("action summary:\n {}".format('\n '.join( '{} ({})'.format( act, ', '.join( '{}: {}'.format(status, action_summary[act] [status]) for status in sorted(action_summary[act]))) for act in sorted(action_summary)))) if incomplete_results: raise IncompleteResultsError( failed=incomplete_results, msg="Command did not complete successfully") if return_type == 'generator': # hand over the generator lgr.log(2, "Returning generator_func from eval_func for %s", wrapped_class) return generator_func(*args, **kwargs) else: @wrapt.decorator def return_func(wrapped_, instance_, args_, kwargs_): results = wrapped_(*args_, **kwargs_) if inspect.isgenerator(results): # unwind generator if there is one, this actually runs # any processing results = list(results) # render summaries if not result_xfm and result_renderer in ('tailored', 'default'): # cannot render transformed results if hasattr(wrapped_class, 'custom_result_summary_renderer'): wrapped_class.custom_result_summary_renderer(results) if return_type == 'item-or-list' and \ len(results) < 2: return results[0] if results else None else: return results lgr.log(2, "Returning return_func from eval_func for %s", wrapped_class) return return_func(generator_func)(*args, **kwargs)
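# The eval_func wrapper above filters, transforms, and yields result records
# produced by a wrapped command. Below is a minimal standalone sketch of that
# filter/transform pipeline; `keep`, `xfm`, and the sample records are
# hypothetical stand-ins for illustration, not the actual datalad helpers.
def _pipe_results(results, keep=None, xfm=None):
    """Yield results that pass `keep`, transformed by `xfm` (falsy ones dropped)."""
    for r in results:
        if keep is not None and not keep(r):
            continue
        if xfm is not None:
            r = xfm(r)
        if r:
            yield r

# usage sketch: keep only 'ok' results and reduce them to their paths
_sample = [
    {'action': 'get', 'status': 'ok', 'path': '/ds/file1'},
    {'action': 'get', 'status': 'error', 'path': '/ds/file2'},
]
assert list(_pipe_results(_sample,
                          keep=lambda r: r['status'] == 'ok',
                          xfm=lambda r: r['path'])) == ['/ds/file1']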
def test_get_cached_dataset(cache_dir): # patch DATALAD_TESTS_CACHE to not use the actual cache with # the test testing that very cache. cache_dir = Path(cache_dir) # store file-based values for testrepo-minimalds for readability: annexed_file = opj('inannex', 'animated.gif') annexed_file_key = "MD5E-s144625--4c458c62b7ac8ec8e19c8ff14b2e34ad.gif" with patch(CACHE_PATCH_STR, new=cache_dir): # tuples to test (url, version, keys, class): test_cases = [ # a simple testrepo ("https://github.com/datalad/testrepo--minimalds", "541cf855d13c2a338ff2803d4488daf0035e568f", None, AnnexRepo), # Same repo, but request paths to be present. This should work # with a subsequent call, although the first one did not already # request any: ("https://github.com/datalad/testrepo--minimalds", "9dd8b56cc706ab56185f2ceb75fbe9de9b606724", annexed_file_key, AnnexRepo), # Same repo again, but invalid version ("https://github.com/datalad/testrepo--minimalds", "nonexistent", "irrelevantkey", # invalid version; don't even try to get the key AnnexRepo), # same thing with different name should be treated as a new thing: ("https://github.com/datalad/testrepo--minimalds", "git-annex", None, AnnexRepo), # try a plain git repo to make sure we can deal with that: # Note, that we first need a test case w/o a `key` parameter to not # blow up the test when Clone is patched, resulting in a MagicMock # instead of a Dataset instance within get_cached_dataset. In the # second case it's already cached then, so the patched Clone is # never executed. ("https://github.com/datalad/datalad.org", None, None, GitRepo), ("https://github.com/datalad/datalad.org", "gh-pages", "ignored-key", # it's a git repo; don't even try to get a key GitRepo), ] for url, version, keys, cls in test_cases: target = cache_dir / url2filename(url) # assuming it doesn't exist yet - patched cache dir! in_cache_before = target.exists() with patch(CLONE_PATCH_STR) as exec_clone: try: ds = get_cached_dataset(url, version, keys) invalid_version = False except AssertionError: # should happen only if `version` wasn't found. Implies # that the dataset exists in cache (although not returned # due to exception) assert_true(version) assert_false(Dataset(target).repo.commit_exists(version)) # mark for later assertions (most of them should still hold # true) invalid_version = True assert_equal(exec_clone.call_count, 0 if in_cache_before else 1) # Patch prevents actual execution. Now do it for real. Note, that # this might be necessary for content retrieval even if dataset was # in cache before. try: ds = get_cached_dataset(url, version, keys) except AssertionError: # see previous call assert_true(invalid_version) assert_is_instance(ds, Dataset) assert_true(ds.is_installed()) assert_equal(target, ds.pathobj) assert_is_instance(ds.repo, cls) if keys and not invalid_version and \ AnnexRepo.is_valid_repo(ds.path): # Note: it's not supposed to get that content if passed # `version` wasn't available. get_cached_dataset would then # raise before and not download anything only to raise # afterwards. here = ds.config.get("annex.uuid") where = ds.repo.whereis(ensure_list(keys), key=True) assert_true(all(here in remotes for remotes in where)) # version check. 
Note, that all `get_cached_dataset` is supposed to # do, is verifying, that specified version exists - NOT check it # out" if version and not invalid_version: assert_true(ds.repo.commit_exists(version)) # re-execution with patch(CLONE_PATCH_STR) as exec_clone: try: ds2 = get_cached_dataset(url, version, keys) except AssertionError: assert_true(invalid_version) exec_clone.assert_not_called() # returns the same Dataset as before: assert_is(ds, ds2)
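# get_cached_dataset() above keys its cache on url2filename(url). Below is a
# minimal sketch of such a URL-to-directory-name mapping; the real
# url2filename may use a different scheme, this is only an illustration.
import hashlib

def url_to_cache_name(url):
    """Derive a filesystem-safe, collision-resistant name for a clone URL."""
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    # keep a readable prefix next to the hash for easier debugging
    tail = url.rstrip('/').rsplit('/', 1)[-1][:40]
    return '{}--{}'.format(tail, digest)

# usage sketch
print(url_to_cache_name("https://github.com/datalad/testrepo--minimalds"))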
def get_baseline(p):
    ds = Dataset(p).create()
    sub = create(str(ds.pathobj / 'sub'))
    assert_repo_status(ds.path, untracked=['sub'])
    return ds
def __call__( path=None, dataset=None, recursive=False, recursion_limit=None, update_mode='target', incremental=False, force_extraction=False, save=True): refds_path = Interface.get_refds_path(dataset) # it really doesn't work without a dataset ds = require_dataset( dataset, check_installed=True, purpose='metadata aggregation') path = ensure_list(path) if not path: # then current/reference dataset is "aggregated" # We should not add ds.path always since then --recursive would # also recurse current even if paths are given path.append(ds.path) agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations( ds, # do not warn here, next call triggers the same warning warn_absent=False) agginfo_db = load_ds_aggregate_db(ds, abspath=True) to_save = [] to_aggregate = set() for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='aggregate_metadata', # uninstalled subdatasets could be queried via aggregated metadata # -> no 'error' unavailable_path_status='', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue ap_type = ap.get('type', None) ap_state = ap.get('state', None) assert('parentds' in ap or ap_type == 'dataset') if ap_type == 'dataset' and ap_state != 'absent': # a present dataset, we can take directly from it aggsrc = ap['path'] lgr.info('Aggregate metadata for dataset %s', aggsrc) else: # everything else needs to come from the parent aggsrc = ap['parentds'] if ap_state == 'absent': lgr.info( 'Attempt to use pre-aggregate metadata for absent %s from dataset at %s', ap['path'], aggsrc) else: lgr.info( 'Aggregate metadata for %s from dataset at %s', ap['path'], aggsrc) to_aggregate.add(aggsrc) if ap_state == 'absent': # key thought: recursive is done by path annotation, hence # once we hit an absent dataset, we are 100% certain that # there is nothing to recurse into on the file system # hence we only have to look into the aggregated metadata # of the last available dataset in the dataset tree edge # # if there is nothing at this path, we need to look into the # parentds and check if we know anything about this path # if we do, we need to grab all the info and objects # if not, we need to error res = _get_dsinfo_from_aggmetadata( aggsrc, ap['path'], recursive, agginfo_db) if not isinstance(res, list): yield get_status_dict( status='impossible', message=res, action='aggregate_metadata', path=ap['path'], logger=lgr) continue # cue for aggregation to_aggregate.update(res) else: # actually aggregate metadata for this dataset, immediately place # generated objects into the aggregated or reference dataset, # and put info into DB to get the distributed to all datasets # that need to be updated errored = _dump_extracted_metadata( ds, Dataset(aggsrc), agginfo_db, to_save, force_extraction, agg_base_path) if errored: yield get_status_dict( status='error', message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)', action='aggregate_metadata', path=aggsrc, logger=lgr) # at this point we have dumped all aggregated metadata into object files # somewhere, we know what needs saving, but having saved anything, and # we know about the states of all aggregated dataset in the DB # what remains to do is to update all dataset, so they have there own copy # of aggregated metadata and update their respective aggregate.json with # info on what states we just aggregated from # first, let's 
figure out what dataset need updating at all # get adjencency info of the dataset tree spanning the base to all leaf dataset # associated with the path arguments if update_mode == 'all': ds_adj = {} discover_dataset_trace_to_targets( ds.path, to_aggregate, [], ds_adj, # we know that to_aggregate only lists datasets, existing and # absent ones -- we want to aggregate all of them, either from # just extracted metadata, or from previously aggregated metadata # of the closest superdataset includeds=to_aggregate) # TODO we need to work in the info about dataset that we only got from # aggregated metadata, that had no trace on the file system in here!! subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate) elif update_mode == 'target': subtrees = {ds.path: list(agginfo_db.keys())} else: raise ValueError( "unknown `update_mode` '%s' for metadata aggregation", update_mode) # go over datasets in bottom-up fashion for parentds_path in sorted(subtrees, reverse=True): lgr.info('Update aggregate metadata in dataset at: %s', parentds_path) _update_ds_agginfo( ds.path, parentds_path, subtrees[parentds_path], incremental, agginfo_db, to_save) # update complete res = get_status_dict( status='ok', action='aggregate_metadata', path=parentds_path, type='dataset', logger=lgr) res.update(agginfo_db.get(parentds_path, {})) yield res # # save potential modifications to dataset global metadata # if not to_save: return lgr.info('Attempting to save %i files/datasets', len(to_save)) for res in Save.__call__( # save does not need any pre-annotated path hints path=[r['path'] for r in to_save], dataset=refds_path, message='[DATALAD] Dataset aggregate metadata update', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
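# The update loop above walks datasets "bottom-up": sorting absolute paths in
# reverse order guarantees that a subdataset path (which carries its
# superdataset path as a prefix) is processed before its superdataset. A
# standalone sketch of that ordering, with made-up paths:
def bottom_up(paths):
    """Order dataset paths so that subdatasets come before their parents."""
    return sorted(paths, reverse=True)

_tree = ['/ds', '/ds/sub1', '/ds/sub1/subsub', '/ds/sub2']
assert bottom_up(_tree) == ['/ds/sub2', '/ds/sub1/subsub', '/ds/sub1', '/ds']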
def __init__(self, repo):
    self._child_dataset = Dataset(repo.path)
    self._super = None
    self._super_tried = False
def test_basics(path, nodspath): ds = Dataset(path).create() last_state = ds.repo.get_hexsha() # run inside the dataset with chpwd(path), \ swallow_outputs(): # provoke command failure with assert_raises(CommandError) as cme: ds.run('7i3amhmuch9invalid') # let's not speculate that the exit code is always 127 ok_(cme.code > 0) eq_(last_state, ds.repo.get_hexsha()) # now one that must work res = ds.run('touch empty', message='TEST') ok_clean_git(ds.path) assert_result_count(res, 2) # TODO 'state' is still untracked!!! assert_result_count(res, 1, action='add', path=opj(ds.path, 'empty'), type='file') assert_result_count(res, 1, action='save', path=ds.path) commit_msg = ds.repo.repo.head.commit.message ok_(commit_msg.startswith('[DATALAD RUNCMD] TEST')) # crude test that we have a record for the PWD assert_in('"pwd": "."', commit_msg) last_state = ds.repo.get_hexsha() # now run a command that will not alter the dataset res = ds.run('touch empty', message='NOOP_TEST') assert_status('notneeded', res) eq_(last_state, ds.repo.get_hexsha()) # We can also run the command via a single-item list because this is # what the CLI interface passes in for quoted commands. res = ds.run(['touch empty'], message='NOOP_TEST') assert_status('notneeded', res) # run outside the dataset, should still work but with limitations with chpwd(nodspath), \ swallow_outputs(): res = ds.run(['touch', 'empty2'], message='TEST') assert_status('ok', res) assert_result_count(res, 1, action='add', path=opj(ds.path, 'empty2'), type='file') # running without a command is a noop with chpwd(path): with swallow_logs(new_level=logging.WARN) as cml: ds.run() assert_in("No command given", cml.out) # Simple sidecar message checks. ds.run(["touch", "dummy0"], message="sidecar arg", sidecar=True) assert_not_in('"cmd":', ds.repo.repo.head.commit.message) real_get = ds.config.get def mocked_get(key, default=None): if key == "datalad.run.record-sidecar": return True return real_get(key, default) with patch.object(ds.config, "get", mocked_get): ds.run(["touch", "dummy1"], message="sidecar config") assert_not_in('"cmd":', ds.repo.repo.head.commit.message)
def test_run_inputs_no_annex_repo(path):
    ds = Dataset(path).create(no_annex=True)
    # Running --input in a plain Git repo doesn't fail.
    ds.run("touch dummy", inputs=["*"])
    ok_exists(opj(ds.path, "dummy"))
    ds.rerun()
def test_rerun_empty_branch(path):
    GitRepo(path, create=True)
    ds = Dataset(path)
    assert_status("impossible", ds.rerun(on_failure="ignore"))
def test_run_inputs_outputs(path): ds = Dataset(path) assert_false(ds.repo.file_has_content("test-annex.dat")) # If we specify test-annex.dat as an input, it will be retrieved before the # run. ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"]) ok_clean_git(ds.path) ok_(ds.repo.file_has_content("test-annex.dat")) ok_(ds.repo.file_has_content("doubled.dat")) # Rerunning the commit will also get the input file. ds.repo.drop("test-annex.dat", options=["--force"]) assert_false(ds.repo.file_has_content("test-annex.dat")) ds.rerun() ok_(ds.repo.file_has_content("test-annex.dat")) with swallow_logs(new_level=logging.WARN) as cml: ds.run("touch dummy", inputs=["not-there"]) assert_in("Input does not exist: ", cml.out) # Test different combinations of globs and explicit files. inputs = ["a.dat", "b.dat", "c.txt", "d.txt"] create_tree(ds.path, {i: i for i in inputs}) ds.add(".") ds.repo.copy_to(inputs, remote="origin") ds.repo.drop(inputs, options=["--force"]) test_cases = [(["*.dat"], ["a.dat", "b.dat"]), (["*.dat", "c.txt"], ["a.dat", "b.dat", "c.txt"]), (["*"], inputs)] for idx, (inputs_arg, expected_present) in enumerate(test_cases): assert_false(any(ds.repo.file_has_content(i) for i in inputs)) ds.run("touch dummy{}".format(idx), inputs=inputs_arg) ok_(all(ds.repo.file_has_content(f) for f in expected_present)) # Globs are stored unexpanded by default. assert_in(inputs_arg[0], ds.repo.repo.head.commit.message) ds.repo.drop(inputs, options=["--force"]) # --input can be passed a subdirectory. create_tree(ds.path, {"subdir": {"a": "subdir a", "b": "subdir b"}}) ds.add("subdir") ds.repo.copy_to(["subdir/a", "subdir/b"], remote="origin") ds.repo.drop("subdir", options=["--force"]) ds.run("touch subdir-dummy", inputs=[opj(ds.path, "subdir")]) ok_(all(ds.repo.file_has_content(opj("subdir", f)) for f in ["a", "b"])) # Inputs are specified relative to a dataset's subdirectory. ds.repo.drop(opj("subdir", "a"), options=["--force"]) with chpwd(opj(path, "subdir")): run("touch subdir-dummy1", inputs=["a"]) ok_(ds.repo.file_has_content(opj("subdir", "a"))) # --input=. runs "datalad get ." ds.run("touch dot-dummy", inputs=["."]) eq_(ds.repo.get_annexed_files(), ds.repo.get_annexed_files(with_content_only=True)) # On rerun, we get all files, even those that weren't in the tree at the # time of the run. create_tree(ds.path, {"after-dot-run": "after-dot-run content"}) ds.add(".") ds.repo.copy_to(["after-dot-run"], remote="origin") ds.repo.drop(["after-dot-run"], options=["--force"]) ds.rerun("HEAD^") ds.repo.file_has_content("after-dot-run") # --output will unlock files that are present. ds.repo.get("a.dat") ds.run("echo ' appended' >>a.dat", outputs=["a.dat"]) with open(opj(path, "a.dat")) as fh: eq_(fh.read(), "a.dat appended\n") # --output will remove files that are not present. ds.repo.drop(["a.dat", "d.txt"], options=["--force"]) ds.run("echo ' appended' >>a.dat", outputs=["a.dat"]) with open(opj(path, "a.dat")) as fh: eq_(fh.read(), " appended\n") # --input can be combined with --output. 
ds.repo.repo.git.reset("--hard", "HEAD~2") ds.run("echo ' appended' >>a.dat", inputs=["a.dat"], outputs=["a.dat"]) with open(opj(path, "a.dat")) as fh: eq_(fh.read(), "a.dat appended\n") with swallow_logs(new_level=logging.DEBUG) as cml: ds.run("echo blah", outputs=["not-there"]) assert_in("Filtered out non-existing path: ", cml.out) ds.create('sub') ds.run("echo sub_orig >sub/subfile") ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"]) ds.drop("sub/subfile", check=False) ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"]) # --input/--output globs can be stored in expanded form. ds.run("touch expand-dummy", inputs=["a.*"], outputs=["b.*"], expand="both") assert_in("a.dat", ds.repo.repo.head.commit.message) assert_in("b.dat", ds.repo.repo.head.commit.message) res = ds.rerun(report=True, return_type='item-or-list') eq_(res["run_info"]['inputs'], ["a.dat"]) eq_(res["run_info"]['outputs'], ["b.dat"])
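# The --input/--output handling exercised above can store globs either
# verbatim or expanded (expand="both"). Below is a standalone sketch of
# expanding glob patterns relative to a dataset root; paths and patterns are
# illustrative only and the real expansion logic may differ.
import os.path as op
from glob import glob

def expand_patterns(root, patterns):
    """Expand glob patterns relative to `root`, returning relative paths."""
    matched = []
    for pat in patterns:
        hits = glob(op.join(root, pat))
        # keep the pattern itself if nothing matched, mirroring "unexpanded"
        matched.extend(sorted(op.relpath(h, root) for h in hits) or [pat])
    return matched

# usage sketch (on a directory containing a.dat and b.dat):
# expand_patterns("/tmp/ds", ["*.dat"]) -> ["a.dat", "b.dat"]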
def test_rerun(path, nodspath):
    ds = Dataset(path).create()
    sub = ds.create('sub')
    probe_path = opj(sub.path, 'sequence')
    # run inside the dataset
    with chpwd(path), \
            swallow_outputs():
        ds.run('echo x$(cat sub/sequence) > sub/sequence')
    # command ran once, all clean
    ok_clean_git(ds.path)
    eq_('x\n', open(probe_path).read())
    # now, for a rerun we can be anywhere, PWD and all are recorded
    # moreover, rerun must figure out which bits to unlock, even in
    # subdatasets
    with chpwd(nodspath), \
            swallow_outputs():
        ds.rerun()
    ok_clean_git(ds.path)
    # ran twice now
    eq_('xx\n', open(probe_path).read())

    # Rerunning from a subdataset skips the command.
    _, sub_info = get_run_info(ds, sub.repo.repo.head.commit.message)
    eq_(ds.id, sub_info["dsid"])
    assert_result_count(
        sub.rerun(return_type="list", on_failure="ignore"),
        1, status="impossible", action="run", rerun_action="skip")
    eq_('xx\n', open(probe_path).read())

    # Rerun fails with a dirty repo.
    dirt = opj(path, "dirt")
    with open(dirt, "w") as fh:
        fh.write("")
    assert_status('impossible', ds.rerun(on_failure="ignore"))
    remove(dirt)
    ok_clean_git(ds.path)

    # Make a non-run commit.
    with open(opj(path, "nonrun-file"), "w") as f:
        f.write("foo")
    ds.add("nonrun-file")
    # Now rerun the buried command.
    ds.rerun(revision="HEAD~", message="rerun buried")
    eq_('xxx\n', open(probe_path).read())
    # Also check that the message override worked.
    eq_(ds.repo.repo.head.commit.message.splitlines()[0],
        "[DATALAD RUNCMD] rerun buried")
    # Or a range of commits, skipping non-run commits.
    ds.rerun(since="HEAD~3")
    eq_('xxxxx\n', open(probe_path).read())
    # Or --since= to run all reachable commits.
    ds.rerun(since="")
    eq_('xxxxxxxxxx\n', open(probe_path).read())

    # We can get back a report of what would happen rather than actually
    # rerunning anything.
    report = ds.rerun(since="", report=True, return_type="list")
    # Nothing changed.
    eq_('xxxxxxxxxx\n', open(probe_path).read())
    assert_result_count(report, 1, rerun_action="skip")
    eq_(report[-1]["commit"], ds.repo.get_hexsha())

    # If a file is dropped, we remove it instead of unlocking it.
    ds.drop(probe_path, check=False)
    ds.rerun()
    eq_('x\n', open(probe_path).read())

    # If the history to rerun has a merge commit, we abort.
    ds.repo.checkout("HEAD~3", options=["-b", "topic"])
    with open(opj(path, "topic-file"), "w") as f:
        f.write("topic")
    ds.add("topic-file")
    ds.repo.checkout("master")
    ds.repo.merge("topic")
    ok_clean_git(ds.path)
    assert_raises(IncompleteResultsError, ds.rerun)
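# get_run_info() used above pulls the machine-readable run record back out of
# a commit message. Below is a minimal sketch of that idea, assuming the
# record is a JSON block framed by marker lines; the exact markers datalad
# writes may differ, these are placeholders for illustration only.
import json

def parse_run_record(message,
                     start='=== Do not change lines below ===',
                     end='^^^ Do not change lines above ^^^'):
    """Return the embedded run record as a dict, or None if absent."""
    lines = message.splitlines()
    try:
        lo, hi = lines.index(start) + 1, lines.index(end)
    except ValueError:
        return None
    return json.loads('\n'.join(lines[lo:hi]))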
def test_get_content_info(path=None): repo = GitRepo(path) assert_equal(repo.get_content_info(), {}) # an invalid reference causes an exception assert_raises(ValueError, repo.get_content_info, ref='HEAD') ds = get_convoluted_situation(path) repopath = ds.repo.pathobj assert_equal(ds.repo.pathobj, repopath) assert_equal(ds.pathobj, ut.Path(path)) # verify general rules on fused info records that are incrementally # assembled: for git content info, amended with annex info on 'HEAD' # (to get the last committed stage and with it possibly vanished # content), and lastly annex info wrt to the present worktree, to # also get info on added/staged content # this fuses the info reported from # - git ls-files # - git annex findref HEAD # - git annex find --include '*' for f, r in get_annexstatus(ds.repo).items(): if f.match('*_untracked'): assert (r.get('gitshasum', None) is None) if f.match('*_deleted'): assert (not f.exists() and not f.is_symlink() is None) if f.match('subds_*'): assert (r['type'] == 'dataset' if r.get('gitshasum', None) else 'directory') if f.match('file_*'): # which one exactly depends on many things assert_in(r['type'], ('file', 'symlink')) if f.match('file_ingit*'): assert (r['type'] == 'file') elif '.datalad' not in f.parts and not f.match('.git*') and \ r.get('gitshasum', None) and not f.match('subds*'): # this should be known to annex, one way or another # regardless of whether things add deleted or staged # or anything in between assert_in('key', r, f) assert_in('keyname', r, f) assert_in('backend', r, f) assert_in('bytesize', r, f) # no duplication with path assert_not_in('file', r, f) # query full untracked report res = ds.repo.get_content_info() assert_in(repopath.joinpath('dir_untracked', 'file_untracked'), res) assert_not_in(repopath.joinpath('dir_untracked'), res) # query for compact untracked report res = ds.repo.get_content_info(untracked='normal') assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res) assert_in(repopath.joinpath('dir_untracked'), res) # query no untracked report res = ds.repo.get_content_info(untracked='no') assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res) assert_not_in(repopath.joinpath('dir_untracked'), res) # git status integrity status = ds.repo.status() for t in ('subds', 'file'): for s in ('untracked', 'added', 'deleted', 'clean', 'ingit_clean', 'dropped_clean', 'modified', 'ingit_modified'): for l in ('', ut.PurePosixPath('subdir', '')): if t == 'subds' and 'ingit' in s or 'dropped' in s: # invalid combination continue if t == 'subds' and s == 'deleted': # same as subds_unavailable -> clean continue p = repopath.joinpath(l, '{}_{}'.format(t, s)) assert p.match('*_{}'.format(status[p]['state'])), p if t == 'subds': assert_in(status[p]['type'], ('dataset', 'directory'), p) else: assert_in(status[p]['type'], ('file', 'symlink'), p) # git annex status integrity annexstatus = get_annexstatus(ds.repo) for t in ('file', ): for s in ('untracked', 'added', 'deleted', 'clean', 'ingit_clean', 'dropped_clean', 'modified', 'ingit_modified'): for l in ('', ut.PurePosixPath('subdir', '')): p = repopath.joinpath(l, '{}_{}'.format(t, s)) if s in ('untracked', 'ingit_clean', 'ingit_modified'): # annex knows nothing about these things assert_not_in('key', annexstatus[p]) continue assert_in('key', annexstatus[p]) # dear future, # if the next one fails, git-annex might have changed the # nature of the path that are being reported by # `annex find --json` # when this was written `hashir*` was a native path, but # `file` 
was a POSIX path assert_equal(annexstatus[p]['has_content'], 'dropped' not in s) # check the different subds evaluation modes someds = Dataset(ds.pathobj / 'subds_modified' / 'someds') dirtyds_path = someds.pathobj / 'dirtyds' assert_not_in('state', someds.repo.status(eval_submodule_state='no')[dirtyds_path]) assert_equal( 'clean', someds.repo.status( eval_submodule_state='commit')[dirtyds_path]['state']) assert_equal( 'modified', someds.repo.status(eval_submodule_state='full')[dirtyds_path]['state'])
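# get_content_info()/get_annexstatus() above fuse per-path records from
# several sources (git ls-files, annex info for HEAD, annex info for the
# worktree). Below is a standalone sketch of that dict-merging idea with toy
# data; the field names mirror the ones asserted on above, but the merge
# logic here is only an illustration, not the actual implementation.
def fuse_records(git_info, annex_info):
    """Overlay annex fields onto git content records, keyed by path."""
    fused = {p: dict(r) for p, r in git_info.items()}
    for p, extra in annex_info.items():
        fused.setdefault(p, {}).update(extra)
    return fused

_git = {'file_clean': {'type': 'file', 'gitshasum': 'abc123'}}
_annex = {'file_clean': {'key': 'MD5E-s5--d41d8...', 'bytesize': 5}}
assert fuse_records(_git, _annex)['file_clean']['key'].startswith('MD5E')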
def test_status_paths_empty_list(path=None):
    ds = Dataset(path).create()
    assert_equal(ds.repo.status(paths=[]), {})
def __call__(sshurl, name=None, target_dir=None, target_url=None, target_pushurl=None, dataset=None, recursive=False, recursion_limit=None, existing='error', shared=None, group=None, ui=False, as_common_datasrc=None, publish_by_default=None, publish_depends=None, annex_wanted=None, annex_group=None, annex_groupwanted=None, inherit=False, since=None): # # nothing without a base dataset # ds = require_dataset(dataset, check_installed=True, purpose='creating a sibling') refds_path = ds.path # # all checks that are possible before we start parsing the dataset # # possibly use sshurl to get the name in case if not specified if not sshurl: if not inherit: raise InsufficientArgumentsError( "needs at least an SSH URL, if no inherit option" ) if name is None: raise ValueError( "Neither SSH URL, nor the name of sibling to inherit from " "was specified" ) # It might well be that we already have this remote setup try: sshurl = CreateSibling._get_remote_url(ds, name) except Exception as exc: lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc)) elif inherit: raise ValueError( "For now, for clarity not allowing specifying a custom sshurl " "while inheriting settings" ) # may be could be safely dropped -- still WiP if not sshurl: # TODO: may be more back up before _prep? super_ds = ds.get_superdataset() if not super_ds: raise ValueError( "Could not determine super dataset for %s to inherit URL" % ds ) super_url = CreateSibling._get_remote_url(super_ds, name) # for now assuming hierarchical setup # (TODO: to be able to destinguish between the two, probably # needs storing datalad.*.target_dir to have %RELNAME in there) sshurl = slash_join(super_url, relpath(refds_path, super_ds.path)) # check the login URL sibling_ri = RI(sshurl) ssh_sibling = is_ssh(sibling_ri) if not (ssh_sibling or isinstance(sibling_ri, PathRI)): raise ValueError( "Unsupported SSH URL or path: '{0}', " "use ssh://host/path, host:path or path syntax".format(sshurl)) if not name: name = sibling_ri.hostname if ssh_sibling else "local" lgr.debug( "No sibling name given. Using %s'%s' as sibling name", "URL hostname " if ssh_sibling else "", name) if since == '': # consider creating siblings only since the point of # the last update # XXX here we assume one to one mapping of names from local branches # to the remote active_branch = ds.repo.get_active_branch() since = '%s/%s' % (name, active_branch) # # parse the base dataset to find all subdatasets that need processing # to_process = [] cand_ds = [ Dataset(r['path']) for r in diff_dataset( ds, fr=since, to=None, # make explicit, but doesn't matter, no recursion in diff() constant_refs=True, # contrain to the paths of all locally existing subdatasets path=[ sds['path'] for sds in ds.subdatasets( recursive=recursive, recursion_limit=recursion_limit, fulfilled=True, result_renderer=None) ], # save cycles, we are only looking for datasets annex=None, untracked='no', # recursion was done faster by subdatasets() recursive=False, # save cycles, we are only looking for datasets eval_file_type=False, ) if r.get('type') == 'dataset' and r.get('state', None) != 'clean' ] # check remotes setup for d in cand_ds if since else ([ds] + cand_ds): d_repo = d.repo if d_repo is None: continue checkds_remotes = d.repo.get_remotes() res = dict( action='create_sibling', path=d.path, type='dataset', ) if publish_depends: # make sure dependencies are valid # TODO: inherit -- we might want to automagically create # those dependents as well??? 
unknown_deps = set(ensure_list(publish_depends)).difference( checkds_remotes) if unknown_deps: yield dict( res, status='error', message=('unknown sibling(s) specified as publication ' 'dependency: %s', unknown_deps), ) continue if name in checkds_remotes and existing in ('error', 'skip'): yield dict( res, status='error' if existing == 'error' else 'notneeded', message=( "sibling '%s' already configured (specify alternative " "name, or force reconfiguration via --existing", name), ) continue to_process.append(res) if not to_process: # we ruled out all possibilities # TODO wait for gh-1218 and make better return values lgr.info("No datasets qualify for sibling creation. " "Consider different settings for --existing " "or --since if this is unexpected") return if ssh_sibling: # request ssh connection: lgr.info("Connecting ...") shell = ssh_manager.get_connection(sshurl) else: shell = _RunnerAdapter() sibling_ri.path = str(resolve_path(sibling_ri.path, dataset)) if target_dir: target_dir = opj(sibling_ri.path, target_dir) if target_dir is None: if sibling_ri.path: target_dir = sibling_ri.path else: target_dir = '.' # TODO: centralize and generalize template symbol handling replicate_local_structure = "%RELNAME" not in target_dir if not shell.get_annex_version(): raise MissingExternalDependency( 'git-annex', msg="It's required on the {} machine to create a sibling" .format('remote' if ssh_sibling else 'local')) # # all checks done and we have a connection, now do something # # loop over all datasets, ordered from top to bottom to make test # below valid (existing directories would cause the machinery to halt) # But we need to run post-update hook in depth-first fashion, so # would only collect first and then run (see gh #790) yielded = set() remote_repos_to_run_hook_for = [] for currentds_ap in \ sorted(to_process, key=lambda x: x['path'].count('/')): current_ds = Dataset(currentds_ap['path']) path = _create_dataset_sibling( name, current_ds, refds_path, shell, replicate_local_structure, sibling_ri, target_dir, target_url, target_pushurl, existing, shared, group, publish_depends, publish_by_default, ui, as_common_datasrc, annex_wanted, annex_group, annex_groupwanted, inherit ) if not path: # nothing new was created # TODO is 'notneeded' appropriate in this case? 
currentds_ap['status'] = 'notneeded' # TODO explain status in 'message' yield currentds_ap yielded.add(currentds_ap['path']) continue remote_repos_to_run_hook_for.append((path, currentds_ap)) # publish web-interface to root dataset on publication server if current_ds.path == refds_path and ui: lgr.info("Uploading web interface to %s" % path) try: CreateSibling.upload_web_interface(path, shell, shared, ui) except CommandError as e: currentds_ap['status'] = 'error' currentds_ap['message'] = ( "failed to push web interface to the remote datalad repository (%s)", exc_str(e)) yield currentds_ap yielded.add(currentds_ap['path']) continue # in reverse order would be depth first lgr.info("Running post-update hooks in all created siblings") # TODO: add progressbar for path, currentds_ap in remote_repos_to_run_hook_for[::-1]: # Trigger the hook lgr.debug("Running hook for %s (if exists and executable)", path) try: shell("cd {} " "&& ( [ -x hooks/post-update ] && hooks/post-update || true )" "".format(sh_quote(_path_(path, ".git")))) except CommandError as e: currentds_ap['status'] = 'error' currentds_ap['message'] = ( "failed to run post-update hook under remote path %s (%s)", path, exc_str(e)) yield currentds_ap yielded.add(currentds_ap['path']) continue if not currentds_ap['path'] in yielded: # if we were silent until now everything is just splendid currentds_ap['status'] = 'ok' yield currentds_ap
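# The sibling-creation loop above processes datasets top-down (shallowest
# path first, via path.count('/')) and later runs the post-update hooks over
# the reversed, depth-first order. A standalone sketch of that ordering trick
# with made-up annotated paths:
def top_down(aps):
    """Sort annotated-path dicts so superdatasets precede their subdatasets."""
    return sorted(aps, key=lambda ap: ap['path'].count('/'))

_aps = [{'path': '/base/sub/deep'}, {'path': '/base'}, {'path': '/base/sub'}]
ordered = top_down(_aps)
assert [a['path'] for a in ordered] == ['/base', '/base/sub', '/base/sub/deep']
# hooks would then run over ordered[::-1], i.e. deepest sibling first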
def __call__(path=None, is_pipeline=False, is_template=False, recursive=False, chdir=None): # dry_run=False, dry_run = False from datalad_crawler.pipeline import ( load_pipeline_from_config, load_pipeline_from_module, get_repo_pipeline_config_path, get_repo_pipeline_script_path ) from datalad_crawler.pipeline import run_pipeline from datalad.utils import chpwd # import late so we could mock during tests with chpwd(chdir): assert not (is_pipeline and is_template), "it is either a pipeline or a template name, can't be both" if is_template: # generate a config and overload path with its filename path = initiate_pipeline_config(template=path, # kwargs=TODO, commit=True) # TODO: centralize via _params_ handling if dry_run: dryrun_optlabel = 'datalad.crawl.dryrun' if dryrun_optlabel in cfg: cfg.unset(dryrun_optlabel, where='local', reload=False) cfg.add(dryrun_optlabel, "True", where='local') if path is None: # get config from the current repository/dataset if is_pipeline: raise ValueError("You must specify the file if --pipeline") # Let's see if there is a config or pipeline in this repo path = get_repo_pipeline_config_path() if not path or not exists(path): # Check if there may be the pipeline provided path = get_repo_pipeline_script_path() if path and exists(path): is_pipeline = True stats = ActivityStats() if not path: raise RuntimeError("Cannot locate crawler config or pipeline file") if is_pipeline: lgr.info("Loading pipeline definition from %s" % path) pipeline = load_pipeline_from_module(path) else: lgr.info("Loading pipeline specification from %s" % path) pipeline = load_pipeline_from_config(path) lgr.info("Running pipeline %s" % str(pipeline)) # TODO: capture the state of all branches so in case of crash # we could gracefully reset back try: output = run_pipeline(pipeline, stats=stats) except Exception as exc: # TODO: config.crawl.failure = full-reset | last-good-master # probably ask via ui which action should be performed unless # explicitly specified raise stats.datasets_crawled += 1 # TODO: Move gc/clean over here! stats_total = stats.get_total() if recursive: # get all subdatasets, and crawl them too! ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path import os from datalad.distribution.dataset import Dataset from datalad.api import crawl from datalad.utils import swallow_logs from datalad.dochelpers import exc_str # Note: we could collect all datasets to be crawled here or pass recursive=True # into the subdatasets' crawl. We will collect all of them here so we might later # also introduce automatic commits when super-dataset got successfully updated subdatasets = Dataset(os.curdir).subdatasets(recursive=recursive, result_xfm='relpaths') lgr.info("Crawling %d subdatasets", len(subdatasets)) output = [output] # TODO: parallelize # TODO: assumes that all sub-datasets are 'crawllable', and if not # just adds them to crawl_failed count. But may be we should make it more # explicit, that some sub-datasets might not need to be crawled, so they get # skipped explicitly? 
for ds_ in subdatasets: ds_logfile = utils.get_logfilename(ds_, 'crawl') try: # TODO: might be cool to be able to report a 'heart beat' from the swallow into pbar or smth with swallow_logs(file_=ds_logfile) as cml: output_, stats_ = crawl(chdir=ds_) stats_total += stats_ output.append(output_) lgr.info("Crawled %s: %s (log: %s)", ds_, stats_.as_str(mode='line'), ds_logfile) except Exception as exc: stats_total.datasets_crawl_failed += 1 stats_total.datasets_crawled += 1 output += [None] lgr.warning("Crawling of %s has failed (more in %s): %s.", # Log output: %s", ds_, ds_logfile, exc_str(exc)) # , cml.out) lgr.info("Total stats: %s", stats_total.as_str(mode='line')) return output, stats_total
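# The recursive crawl above accumulates per-subdataset statistics and keeps
# going when one subdataset fails. Below is a standalone sketch of that
# accumulate-and-continue pattern, with a plain dict standing in for the
# ActivityStats object used by the crawler:
def crawl_all(subdatasets, crawl_one):
    """Run `crawl_one` over each subdataset, tallying successes and failures."""
    totals = {'crawled': 0, 'failed': 0}
    outputs = []
    for sub in subdatasets:
        totals['crawled'] += 1
        try:
            outputs.append(crawl_one(sub))
        except Exception:
            totals['failed'] += 1
            outputs.append(None)
    return outputs, totals

# usage sketch
def _demo(sub):
    if sub == 'sub2':
        raise RuntimeError('simulated crawl failure')
    return 'ok'

outs, totals = crawl_all(['sub1', 'sub2'], _demo)
assert totals == {'crawled': 2, 'failed': 1} and outs == ['ok', None]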
def _update_ds_agginfo(refds_path, ds_path, subds_paths, incremental, agginfo_db, to_save): """Perform metadata aggregation for ds and a given list of subdataset paths Parameters ---------- refds_path : str Absolute path to the reference dataset that aggregate_metadata() was called on. ds_path : str Absolute path to the dataset to have its aggregate info updates subds_paths : list(str) Sequence of absolute paths of subdatasets of the to-be-updated dataset, whose agginfo shall be updated within the to-be-updated dataset. Any subdataset that is not listed here is assumed to be gone (i.e. no longer a subdataset at all, not just not locally installed) incremental : bool If set, the update will not remove any information on datasets not listed in subds_paths agginfo_db : dict Dictionary with all information on aggregate metadata on all datasets. Keys are absolute paths of datasets. to_save : list List of paths to save eventually. This function will add new paths as necessary. """ ds = Dataset(ds_path) # load existing aggregate info dict # makes sure all file/dataset paths become absolute # TODO take from cache, once used in _get_dsinfo_from_aggmetadata() agginfo_fpath, agg_base_path = get_ds_aggregate_db_locations(ds) ds_agginfos = load_ds_aggregate_db(ds, abspath=True) # object locations referenced initially objlocs_was = set(ai[k] for ai in ds_agginfos.values() for k in location_keys if k in ai) # track which objects need to be copied (each item is a from/to tuple objs2copy = [] # for each subdataset (any depth level) procds_paths = [ds.path] + subds_paths for dpath in procds_paths: ds_dbinfo = agginfo_db.get(dpath, {}).copy() # relative path of the currect dataset within the dataset we are updating drelpath = op.relpath(dpath, start=ds.path) for loclabel in location_keys: # TODO filepath_info is obsolete if loclabel == 'filepath_info' and drelpath == op.curdir: # do not write a file list into the dataset it is from if 'filepath_info' in ds_dbinfo: del ds_dbinfo['filepath_info'] continue # abspath to object objloc = ds_dbinfo.get(loclabel, None) if objloc is None: continue # XXX needs to change when layout of object store is changed # current is ./datalad/metadata/objects/{hash}/{hash} target_objpath = op.join(agg_base_path, *objloc.split(os.sep)[-3:]) # make sure we copy the file from its current location to where it is # needed in this dataset objs2copy.append(( # this needs to turn into an absolute path # `dpath` will be relative to the reference dataset #op.normpath(op.join(ds.path, dpath, op.dirname(agginfo_relpath), objloc)), objloc, target_objpath)) # now build needed local relpath ds_dbinfo[loclabel] = target_objpath # (re)assign in case record is new ds_agginfos[dpath] = ds_dbinfo # remove all entries for which we did not (no longer) have a corresponding # subdataset to take care of if not incremental: ds_agginfos = {k: v for k, v in ds_agginfos.items() if k in procds_paths} # set of metadata objects now referenced objlocs_is = set( ai[k] for sdsrpath, ai in ds_agginfos.items() for k in location_keys if k in ai) objs2add = objlocs_is # yoh: we appanretly do need to filter the ones to remove - I did # "git reset --hard HEAD^" and # aggregate-metadata failed upon next run trying to remove # an unknown to git file. 
I am yet to figure out why that # mattered (hopefully not that reflog is used somehow) objs2remove = [] for obj in objlocs_was.difference(objlocs_is): if op.lexists(obj): objs2remove.append(obj) else: # not really a warning, we don't need it anymore, it is already gone lgr.debug( "To-be-deleted metadata object not found, skip deletion (%s)", obj ) # secretly remove obsolete object files, not really a result from a # user's perspective if not incremental and objs2remove: ds.remove( objs2remove, # Don't use the misleading default commit message of `remove`: message='[DATALAD] Remove obsolete metadata object files', # we do not want to drop these files by default, because we would # loose them for other branches, and earlier tags # TODO evaluate whether this should be exposed as a switch # to run an explicit force-drop prior to calling remove() check=False, result_renderer=None, return_type=list) if not objs2add and not refds_path == ds_path: # this is not the base dataset, make sure to save removal in the # parentds -- not needed when objects get added, as removal itself # is already committed to_save.append(dict(path=ds_path, type='dataset', staged=True)) objs2copy = [(f, t) for f, t in objs2copy if f != t] # must copy object files to local target destination # make sure those objects are present # use the reference dataset to resolve paths, as they might point to # any location in the dataset tree Dataset(refds_path).get( [f for f, t in objs2copy], result_renderer='disabled') for copy_from, copy_to in objs2copy: copy_from = op.join(agg_base_path, copy_from) copy_to = op.join(agg_base_path, copy_to) target_dir = op.dirname(copy_to) if not op.exists(target_dir): makedirs(target_dir) # TODO we could be more clever (later) and maybe `addurl` (or similar) # the file from another dataset if op.lexists(copy_to): # no need to unlock, just wipe out and replace os.remove(copy_to) shutil.copy(copy_from, copy_to) to_save.append( dict(path=agginfo_fpath, type='file', staged=True)) if objs2add: # they are added standard way, depending on the repo type ds.repo.add([op.join(agg_base_path, p) for p in objs2add]) # queue for save, and mark as staged to_save.extend( [dict(path=op.join(agg_base_path, p), type='file', staged=True) for p in objs2add]) # write aggregate info file if not ds_agginfos: return _store_agginfo_db(ds, ds_agginfos) ds.repo.add(agginfo_fpath, git=True) # queue for save, and mark as staged to_save.append( dict(path=agginfo_fpath, type='file', staged=True))
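# _update_ds_agginfo() above decides which metadata object files to copy in
# and which stale ones to drop by comparing the object locations referenced
# before and after the update. A standalone sketch of that set bookkeeping
# (paths are illustrative):
def plan_obj_update(referenced_before, referenced_after):
    """Return (to_add, to_remove) object locations as sorted lists."""
    before, after = set(referenced_before), set(referenced_after)
    return sorted(after), sorted(before - after)

to_add, to_remove = plan_obj_update(
    ['objects/aa/1', 'objects/bb/2'],
    ['objects/bb/2', 'objects/cc/3'])
assert to_add == ['objects/bb/2', 'objects/cc/3']
assert to_remove == ['objects/aa/1']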
def test_unlock(path): ds = Dataset(path) # file is currently locked: # TODO: use get_annexed_files instead of hardcoded filename assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w") # Note: In V6+ we can unlock even if the file's content isn't present, but # doing so when unlock() is called with no paths isn't consistent with the # current behavior when an explicit path is given (it doesn't unlock) or # with the behavior in V5, so we don't do it. # Unlocking the dataset without an explicit path does not fail if there # are files without content. eq_(ds.unlock(path=None, on_failure="ignore"), []) eq_(ds.unlock(path=[], on_failure="ignore"), []) # cannot unlock without content (annex get wasn't called) assert_in_results(ds.unlock(path="test-annex.dat", on_failure="ignore"), path=opj(path, "test-annex.dat"), status="impossible") ds.repo.get('test-annex.dat') result = ds.unlock() assert_result_count(result, 1) assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='ok') with open(opj(path, 'test-annex.dat'), "w") as f: f.write("change content") ds.repo.add('test-annex.dat') # in V6+ we need to explicitly re-lock it: if ds.repo.supports_unlocked_pointers: # TODO: RF: make 'lock' a command as well # re-lock to further on have a consistent situation with V5: ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock']) ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again") # after commit, file is locked again: assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w") # content was changed: with open(opj(path, 'test-annex.dat'), "r") as f: eq_("change content", f.read()) # unlock again, this time more specific: result = ds.unlock(path='test-annex.dat') assert_result_count(result, 1) assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='ok') with open(opj(path, 'test-annex.dat'), "w") as f: f.write("change content again") ds.repo.add('test-annex.dat') # in V6+ we need to explicitly re-lock it: if ds.repo.supports_unlocked_pointers: # TODO: RF: make 'lock' a command as well # re-lock to further on have a consistent situation with V5: ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock']) ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again") # TODO: # BOOOM: test-annex.dat writeable in V6! # Why the hell is this different than the first time we wrote to the file # and locked it again? # Also: After opening the file is empty. # after commit, file is locked again: assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w") # content was changed: with open(opj(path, 'test-annex.dat'), "r") as f: eq_("change content again", f.read())
def test_save_amend(dspath): dspath = Path(dspath) file_in_super = dspath / 'somefile' file_in_sub = dspath / 'subds' / 'file_in_sub' # test on a hierarchy including a plain git repo: ds = Dataset(dspath).create(force=True, no_annex=True) subds = ds.create('subds', force=True) ds.save(recursive=True) assert_repo_status(ds.repo) # recursive and amend are mutually exclusive: for d in (ds, subds): assert_raises(ValueError, d.save, recursive=True, amend=True) # in an annex repo the branch we are interested in might not be the active # branch (adjusted): sub_branch = subds.repo.get_corresponding_branch() # amend in subdataset w/ new message; otherwise empty amendment: last_sha = subds.repo.get_hexsha(sub_branch) subds.save(message="new message in sub", amend=True) # we did in fact commit something: neq_(last_sha, subds.repo.get_hexsha(sub_branch)) # repo is clean: assert_repo_status(subds.repo) # message is correct: eq_(subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # actually replaced the previous commit: assert_not_in(last_sha, subds.repo.get_branch_commits_(sub_branch)) # amend modifications in subdataset w/o new message if not subds.repo.is_managed_branch(): subds.unlock('file_in_sub') file_in_sub.write_text("modified again") last_sha = subds.repo.get_hexsha(sub_branch) subds.save(amend=True) neq_(last_sha, subds.repo.get_hexsha(sub_branch)) assert_repo_status(subds.repo) # message unchanged: eq_(subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # actually replaced the previous commit: assert_not_in(last_sha, subds.repo.get_branch_commits_(sub_branch)) # save --amend with nothing to amend with: res = subds.save(amend=True) assert_result_count(res, 1) assert_result_count(res, 1, status='notneeded', action='save') # amend in superdataset w/ new message; otherwise empty amendment: last_sha = ds.repo.get_hexsha() ds.save(message="new message in super", amend=True) neq_(last_sha, ds.repo.get_hexsha()) assert_repo_status(subds.repo) eq_(ds.repo.format_commit("%B").strip(), "new message in super") assert_not_in(last_sha, ds.repo.get_branch_commits_()) # amend modifications in superdataset w/o new message file_in_super.write_text("changed content") if not subds.repo.is_managed_branch(): subds.unlock('file_in_sub') file_in_sub.write_text("modified once again") last_sha = ds.repo.get_hexsha() last_sha_sub = subds.repo.get_hexsha(sub_branch) ds.save(amend=True) neq_(last_sha, ds.repo.get_hexsha()) eq_(ds.repo.format_commit("%B").strip(), "new message in super") assert_not_in(last_sha, ds.repo.get_branch_commits_()) # we didn't mess with the subds: assert_repo_status(ds.repo, modified=["subds"]) eq_(last_sha_sub, subds.repo.get_hexsha(sub_branch)) eq_(subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # save --amend with nothing to amend with: last_sha = ds.repo.get_hexsha() res = ds.save(amend=True) assert_result_count(res, 1) assert_result_count(res, 1, status='notneeded', action='save') eq_(last_sha, ds.repo.get_hexsha()) # we didn't mess with the subds: assert_repo_status(ds.repo, modified=["subds"]) eq_(last_sha_sub, subds.repo.get_hexsha(sub_branch)) eq_(subds.repo.format_commit("%B", sub_branch).strip(), "new message in sub") # amend with different identity: orig_author = ds.repo.format_commit("%an") orig_email = ds.repo.format_commit("%ae") orig_date = ds.repo.format_commit("%ad") orig_committer = ds.repo.format_commit("%cn") orig_committer_mail = ds.repo.format_commit("%ce") eq_(orig_author, orig_committer) 
eq_(orig_email, orig_committer_mail) with patch.dict('os.environ', {'GIT_COMMITTER_NAME': 'Hopefully Different', 'GIT_COMMITTER_EMAIL': '*****@*****.**'}): ds.config.reload(force=True) ds.save(amend=True, message="amend with hope") # author was kept: eq_(orig_author, ds.repo.format_commit("%an")) eq_(orig_email, ds.repo.format_commit("%ae")) eq_(orig_date, ds.repo.format_commit("%ad")) # committer changed: eq_(ds.repo.format_commit("%cn"), "Hopefully Different") eq_(ds.repo.format_commit("%ce"), "*****@*****.**") # corner case: amend empty commit with no parent: rmtree(str(dspath)) # When adjusted branch is enforced by git-annex detecting a crippled FS, # git-annex produces an empty commit before switching to adjusted branch: # "commit before entering adjusted branch" # The commit by `create` would be the second one already. # Therefore go with plain annex repo and create an (empty) commit only when # not on adjusted branch: repo = AnnexRepo(dspath, create=True) if not repo.is_managed_branch(): repo.commit(msg="initial", options=['--allow-empty']) ds = Dataset(dspath) branch = ds.repo.get_corresponding_branch() or ds.repo.get_active_branch() # test pointless if we start with more than one commit eq_(len(list(ds.repo.get_branch_commits_(branch))), 1, msg="More than on commit '{}': {}".format( branch, ds.repo.call_git(['log', branch])) ) last_sha = ds.repo.get_hexsha(branch) ds.save(message="new initial commit", amend=True) assert_repo_status(ds.repo) eq_(len(list(ds.repo.get_branch_commits_(branch))), 1, msg="More than on commit '{}': {}".format( branch, ds.repo.call_git(['log', branch])) ) assert_not_in(last_sha, ds.repo.get_branch_commits_(branch)) eq_(ds.repo.format_commit("%B", branch).strip(), "new initial commit")
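# The amend checks above read author vs. committer identity via git's
# pretty-format placeholders (%an/%ae for author, %cn/%ce for committer,
# %ad for date). Below is a standalone sketch of querying those fields for
# HEAD with plain git; it bypasses datalad's format_commit wrapper and is
# for illustration only.
import subprocess

def commit_identity(repo_path, rev='HEAD'):
    """Return author/committer name and email for `rev` in `repo_path`."""
    fmt = '%an%x00%ae%x00%cn%x00%ce'
    out = subprocess.check_output(
        ['git', '-C', repo_path, 'log', '-1', '--format=' + fmt, rev],
        text=True)
    an, ae, cn, ce = out.strip('\n').split('\x00')
    return {'author': (an, ae), 'committer': (cn, ce)}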
def test_status(_path, linkpath): # do the setup on the real path, not the symlink, to have its # bugs not affect this test of status() ds = get_deeply_nested_structure(str(_path)) if has_symlink_capability(): # make it more complicated by default ut.Path(linkpath).symlink_to(_path, target_is_directory=True) path = linkpath else: path = _path ds = Dataset(path) if has_symlink_capability(): assert ds.pathobj != ds.repo.pathobj # spotcheck that annex status reporting and availability evaluation # works assert_result_count( ds.status(annex='all', result_renderer=None), 1, path=str(ds.pathobj / 'subdir' / 'annexed_file.txt'), key='MD5E-s5--275876e34cf609db118f3d84b799a790.txt', has_content=True, objloc=str(ds.repo.pathobj / '.git' / 'annex' / 'objects' / # hashdir is different on windows ('f33' if ds.repo.is_managed_branch() else '7p') / ('94b' if ds.repo.is_managed_branch() else 'gp') / 'MD5E-s5--275876e34cf609db118f3d84b799a790.txt' / 'MD5E-s5--275876e34cf609db118f3d84b799a790.txt')) plain_recursive = ds.status(recursive=True, result_renderer=None) # check integrity of individual reports with a focus on how symlinks # are reported for res in plain_recursive: # anything that is an "intended" symlink should be reported # as such. In contrast, anything that is a symlink for mere # technical reasons (annex using it for something in some mode) # should be reported as the thing it is representing (i.e. # a file) if 'link2' in str(res['path']): assert res['type'] == 'symlink', res else: assert res['type'] != 'symlink', res # every item must report its parent dataset assert_in('parentds', res) # bunch of smoke tests # query of '.' is same as no path eq_(plain_recursive, ds.status(path='.', recursive=True, result_renderer=None)) # duplicate paths do not change things eq_(plain_recursive, ds.status(path=['.', '.'], recursive=True, result_renderer=None)) # neither do nested paths eq_( plain_recursive, ds.status(path=['.', 'subds_modified'], recursive=True, result_renderer=None)) # when invoked in a subdir of a dataset it still reports on the full thing # just like `git status`, as long as there are no paths specified with chpwd(op.join(path, 'directory_untracked')): plain_recursive = status(recursive=True, result_renderer=None) # should be able to take absolute paths and yield the same # output eq_(plain_recursive, ds.status(path=ds.path, recursive=True, result_renderer=None)) # query for a deeply nested path from the top, should just work with a # variety of approaches rpath = op.join('subds_modified', 'subds_lvl1_modified', OBSCURE_FILENAME + u'_directory_untracked') apathobj = ds.pathobj / rpath apath = str(apathobj) # ds.repo.pathobj will have the symlink resolved arealpath = ds.repo.pathobj / rpath # TODO include explicit relative path in test for p in (rpath, apath, arealpath, None): if p is None: # change into the realpath of the dataset and # query with an explicit path with chpwd(ds.repo.path): res = ds.status(path=op.join('.', rpath), result_renderer=None) else: res = ds.status(path=p, result_renderer=None) assert_result_count( res, 1, state='untracked', type='directory', refds=ds.path, # path always comes out a full path inside the queried dataset path=apath, ) assert_result_count(ds.status(recursive=True, result_renderer=None), 1, path=apath) # limiting recursion will exclude this particular path assert_result_count(ds.status(recursive=True, recursion_limit=1, result_renderer=None), 0, path=apath) # negative limit is unlimited limit eq_(ds.status(recursive=True, recursion_limit=-1, 
result_renderer=None), ds.status(recursive=True, result_renderer=None))
def __call__(path=None, dataset=None, recursive=False, recursion_limit=None): refds = require_dataset(dataset, check_installed=True, purpose="unlocking") # Before passing the results to status() # * record explicitly specified non-directory paths so that we can # decide whether to yield a result for reported paths # * filter out and yield results for paths that don't exist res_paths_nondir = set() paths_lexist = None res_paths = list() if path: # Note that we need unresolved versions of the path input to be # passed on to status. See gh-5456 for example. path = ensure_list(path) res_paths = resolve_path(path, ds=dataset) paths_lexist = [] res_paths_lexist = [] for p, p_r in zip(path, res_paths): if p_r.exists() or p_r.is_symlink(): paths_lexist.append(p) res_paths_lexist.append(p_r) if not p_r.is_dir(): res_paths_nondir.add(p_r) res_kwargs = dict(action='unlock', logger=lgr, refds=refds.path) if res_paths: for p in set(res_paths).difference(set(res_paths_lexist)): yield get_status_dict(status="impossible", path=str(p), type="file", message="path does not exist", **res_kwargs) if not (paths_lexist or paths_lexist is None): return # Collect information on the paths to unlock. to_unlock = defaultdict(list) # ds => paths (relative to ds) for res in Status()( # ATTN: it is vital to pass the `dataset` argument as is, # and not a dataset instance, in order to maintain the path # semantics between here and the status() call dataset=dataset, path=paths_lexist, untracked="normal" if res_paths_nondir else "no", report_filetype=False, annex="availability", recursive=recursive, recursion_limit=recursion_limit, result_renderer='disabled', on_failure="ignore"): if res["action"] != "status" or res["status"] != "ok": yield res continue has_content = res.get("has_content") if has_content: parentds = res["parentds"] to_unlock[parentds].append(op.relpath(res["path"], parentds)) elif res_paths_nondir and Path(res["path"]) in res_paths_nondir: if has_content is False: msg = "no content present" status = "impossible" elif res["state"] == "untracked": msg = "untracked" status = "impossible" else: # This is either a regular git file or an unlocked annex # file. msg = "non-annex file" status = "notneeded" yield get_status_dict(status=status, path=res["path"], type="file", message="{}; cannot unlock".format(msg), **res_kwargs) # Do the actual unlocking. for ds_path, files in to_unlock.items(): ds = Dataset(ds_path) for r in ds.repo._call_annex_records(["unlock"], files=files): yield get_status_dict(path=op.join(ds.path, r['file']), status='ok' if r['success'] else 'error', type='file', **res_kwargs)
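# A minimal sketch (assumed helper, not DataLad API) of the bookkeeping used
# in the unlock implementation above: status results that have content
# present are grouped by their parent dataset, with paths expressed relative
# to that dataset, so that a single `git annex unlock` call can be issued per
# repository.
import os.path as op
from collections import defaultdict


def group_unlockable(status_results):
    """Map parent-dataset path -> list of dataset-relative file paths."""
    to_unlock = defaultdict(list)
    for res in status_results:
        if res.get('action') == 'status' and res.get('status') == 'ok' \
                and res.get('has_content'):
            parentds = res['parentds']
            to_unlock[parentds].append(op.relpath(res['path'], parentds))
    return to_unlock

# e.g. group_unlockable([{'action': 'status', 'status': 'ok',
#                         'has_content': True,
#                         'parentds': '/ds', 'path': '/ds/sub/f.dat'}])
# -> {'/ds': ['sub/f.dat']}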
def fs_traverse(path, repo, parent=None, subdatasets=None, render=True, recurse_datasets=False, recurse_directories=False, json=None, basepath=None): """Traverse path through its nodes and returns a dictionary of relevant attributes attached to each node Parameters ---------- path: str Path to the directory to be traversed repo: AnnexRepo or GitRepo Repo object the directory belongs too parent: dict Extracted info about parent directory recurse_directories: bool Recurse into subdirectories (note that subdatasets are not traversed) render: bool To render from within function or not. Set to false if results to be manipulated before final render Returns ------- list of dict extracts and returns a (recursive) list of directory info at path does not traverse into annex, git or hidden directories """ subdatasets = subdatasets or [] fs = fs_extract(path, repo, basepath=basepath or path) if isdir(path): # if node is a directory children = [ fs.copy() ] # store its info in its children dict too (Yarik is not sure why, but I guess for .?) # ATM seems some pieces still rely on having this duplication, so left as is # TODO: strip away for node in listdir(path): nodepath = opj(path, node) # Might contain subdatasets, so we should analyze and prepare entries # to pass down... in theory we could just pass full paths may be? strip node_subdatasets = [] is_subdataset = False if isdir(nodepath): node_sep = with_pathsep(node) for subds in subdatasets: if subds == node: # it is the subdataset is_subdataset = True else: # use path_is_subdir if subds.startswith(node_sep): node_subdatasets += [subds[len(node_sep):]] # TODO: it might be a subdir which is non-initialized submodule! # if not ignored, append child node info to current nodes dictionary if is_subdataset: subds = _traverse_handle_subds( relpath(nodepath, repo.path), Dataset(repo.path), recurse_datasets=recurse_datasets, recurse_directories=recurse_directories, json=json) children.append(subds) elif not ignored(nodepath): # if recursive, create info dictionary (within) each child node too if recurse_directories: subdir = fs_traverse( nodepath, repo, subdatasets=node_subdatasets, parent=None, # children[0], recurse_datasets=recurse_datasets, recurse_directories=recurse_directories, json=json, basepath=basepath or path) subdir.pop('nodes', None) else: # read child metadata from its metadata file if it exists subdir_json = metadata_locator(path=node, ds_path=basepath or path) if exists(subdir_json): with open(subdir_json) as data_file: subdir = js.load(data_file) subdir.pop('nodes', None) # else extract whatever information you can about the child else: # Yarik: this one is way too lean... subdir = fs_extract(nodepath, repo, basepath=basepath or path) # append child metadata to list children.extend([subdir]) # sum sizes of all 1st level children children_size = {} for node in children[1:]: for size_type, child_size in node['size'].items(): children_size[size_type] = children_size.get( size_type, 0) + machinesize(child_size) # update current node sizes to the humanized aggregate children size fs['size'] = children[0]['size'] = \ {size_type: humanize.naturalsize(child_size) for size_type, child_size in children_size.items()} children[0][ 'name'] = '.' # replace current node name with '.' to emulate unix syntax if parent: parent[ 'name'] = '..' # replace parent node name with '..' 
to emulate unix syntax children.insert( 1, parent ) # insert parent info after current node info in children dict fs['nodes'] = children # add children info to main fs dictionary if render: # render directory node at location(path) fs_render(fs, json=json, ds_path=basepath or path) lgr.info('Directory: %s' % path) return fs
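# A minimal sketch of the size aggregation done in fs_traverse() above, under
# the simplifying assumption that child sizes are plain byte counts (the real
# code first converts humanized strings back to bytes via `machinesize`).
# `humanize.naturalsize` is the same third-party call used above.
import humanize


def aggregate_sizes(children):
    """Sum per-type sizes over child nodes and return humanized totals."""
    totals = {}
    for node in children:
        for size_type, nbytes in node['size'].items():
            totals[size_type] = totals.get(size_type, 0) + nbytes
    return {size_type: humanize.naturalsize(total)
            for size_type, total in totals.items()}

# e.g. aggregate_sizes([{'size': {'total': 1024}}, {'size': {'total': 2048}}])
# -> {'total': '3.1 kB'}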
def test_aggregation(path): # a hierarchy of three (super/sub)datasets, each with some native metadata ds = Dataset(op.join(path, 'origin')).create(force=True) ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') subds = ds.create('sub', force=True) subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') subsubds = subds.create('subsub', force=True) subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') assert_status('ok', ds.save(recursive=True)) # while we are at it: do it again, nothing should happen assert_status('notneeded', ds.save(recursive=True)) assert_repo_status(ds.path) # aggregate metadata from all subdatasets into any superdataset, including # intermediate ones res = ds.meta_aggregate(recursive=True, into='all') # we get a success report for both subdatasets and the superdataset, # and they get saved assert_result_count(res, 3, status='ok', action='meta_aggregate') # the respective super datasets see two saves, one to record the change # in the subdataset after its own aggregation, and one after the super # was updated with aggregated metadata assert_result_count(res, 5, status='ok', action='save', type='dataset') # nice and tidy assert_repo_status(ds.path) # quick test of aggregate report aggs = ds.meta_dump(reporton='aggregates', recursive=True) # one for each dataset assert_result_count(aggs, 3) # mother also reports layout version assert_result_count(aggs, 1, path=ds.path, layout_version=1) # store clean direct result origres = ds.meta_dump(recursive=True) # basic sanity check assert_result_count(origres, 3, type='dataset') assert_result_count([r for r in origres if r['path'].endswith('.json')], 3, type='file') # Now that we have annex.key # three different IDs eq_( 3, len( set([ _get_dsid_from_core_metadata(s['metadata']['metalad_core']) for s in origres if s['type'] == 'dataset' ]))) # and we know about all three datasets for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'): assert_true( sum([s['metadata']['frictionless_datapackage']['name'] \ == assure_unicode(name) for s in origres if s['type'] == 'dataset'])) # now clone the beast to simulate a new user installing an empty dataset clone = install(op.join(path, 'clone'), source=ds.path, result_xfm='datasets', return_type='item-or-list') # ID mechanism works eq_(ds.id, clone.id) # get fresh metadata cloneres = clone.meta_dump() # basic sanity check assert_result_count(cloneres, 1, type='dataset') # payload file assert_result_count(cloneres, 1, type='file') # now loop over the previous results from the direct metadata query of # origin and make sure we get the exact same stuff from the clone _compare_metadata_helper(origres, clone) # now obtain a subdataset in the clone, should make no difference assert_status('ok', clone.install('sub', result_xfm=None, return_type='list')) _compare_metadata_helper(origres, clone) # test search in search tests, not all over the place ## query smoke test assert_result_count(clone.search('mother', mode='egrep'), 1) assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1) child_res = clone.search('child', mode='egrep') assert_result_count(child_res, 2) for r in child_res: if r['type'] == 'dataset': assert_in(r['query_matched']['frictionless_datapackage.name'], r['metadata']['frictionless_datapackage']['name'])
def _push(dspath, content, target, force, jobs, res_kwargs, pbars, done_fetch=None, got_path_arg=False): if not done_fetch: done_fetch = set() # nothing recursive in here, we only need a repo to work with ds = Dataset(dspath) repo = ds.repo res_kwargs.update(type='dataset', path=dspath) # content will be unique for every push (even on the same dataset) pbar_id = 'push-{}-{}'.format(target, id(content)) # register for final orderly take down pbars[pbar_id] = ds log_progress( lgr.info, pbar_id, 'Determine push target', unit=' Steps', label='Push', total=4, ) if not target: try: # let Git figure out what needs doing wannabe_gitpush = repo.push(remote=None, git_options=['--dry-run']) # we did not get an explicit push target, get it from Git target = set(p.get('remote', None) for p in wannabe_gitpush) # handle case where a pushinfo record did not have a 'remote' # property -- should not happen, but be robust target.discard(None) except Exception as e: lgr.debug( 'Dry-run push to determine default push target failed, ' 'assume no configuration: %s', e) target = set() if not len(target): yield dict( res_kwargs, status='impossible', message='No push target given, and none could be ' 'auto-detected, please specify via --to', ) return elif len(target) > 1: # dunno if this can ever happen, but if it does, report # nicely yield dict(res_kwargs, status='error', message=( 'No push target given, ' 'multiple candidates auto-detected: %s', list(target), )) return else: # can only be a single one at this point target = target.pop() if target not in repo.get_remotes(): yield dict(res_kwargs, status='error', message=("Unknown target sibling '%s'.", target)) return log_progress(lgr.info, pbar_id, "Push refspecs", label="Push to '{}'".format(target), update=1, total=4) # define config var name for potential publication dependencies depvar = 'remote.{}.datalad-publish-depends'.format(target) # list of remotes that are publication dependencies for the # target remote publish_depends = assure_list(ds.config.get(depvar, [])) if publish_depends: lgr.debug("Discovered publication dependencies for '%s': %s", target, publish_depends) # cache repo type is_annex_repo = isinstance(ds.repo, AnnexRepo) # TODO prevent this when `target` is a special remote # (possibly redo) a push attempt to figure out what needs pushing # do this on the main target only, and apply the result to all # dependencies try: wannabe_gitpush = repo.push(remote=target, git_options=['--dry-run']) except Exception as e: lgr.debug( 'Dry-run push to check push configuration failed, ' 'assume no configuration: %s', e) wannabe_gitpush = [] refspecs2push = [ # if an upstream branch is set, go with it p['from_ref'] if ds.config.get( # refs come in as refs/heads/<branchname> # need to cut the prefix 'branch.{}.remote'.format(p['from_ref'][11:]), None) == target and ds.config.get('branch.{}.merge'.format(p['from_ref'][11:]), None) # if not, define target refspec explicitly to avoid having to # set an upstream branch, which would happen implicitly from # a user's POV, and may also be hard to decide when publication # dependencies are present else '{}:{}'.format(p['from_ref'], p['to_ref']) for p in wannabe_gitpush # TODO: what if a publication dependency doesn't have it yet # should we not attempt to push, because the main target has it?
if 'uptodate' not in p['operations'] and ( # cannot think of a scenario where we would want to push a # managed branch directly, instead of the corresponding branch 'refs/heads/adjusted' not in p['from_ref']) ] if not refspecs2push: lgr.debug( 'No refspecs configured for push, attempting to use active branch') # nothing was set up for push, push the current branch at minimum # TODO this is not right with managed branches active_branch = repo.get_active_branch() if not active_branch: yield dict( res_kwargs, status='impossible', message='There is no active branch, cannot determine remote ' 'branch') return if is_annex_repo: # we could face a managed branch, in which case we need to # determine the actual one and make sure it is sync'ed with the # managed one, and push that one instead. following methods can # be called unconditionally repo.localsync(managed_only=True) active_branch = repo.get_corresponding_branch( active_branch) or active_branch refspecs2push.append( # same dance as above active_branch if ds.config. get('branch.{}.merge'.format(active_branch), None ) else '{ab}:{ab}'.format(ab=active_branch)) # we know what to push and where, now dependency processing first for r in publish_depends: # simply make a call to this function again, all the same, but # target is different, pass done_fetch to avoid duplicate # and expensive calls to git-fetch yield from _push( dspath, content, # to this particular dependency r, force, jobs, res_kwargs.copy(), pbars, done_fetch=None, got_path_arg=got_path_arg, ) # and lastly the primary push target target_is_git_remote = repo.config.get('remote.{}.url'.format(target), None) is not None # only attempt if Git knows about a URL, otherwise this is # a pure special remote that doesn't deal with the git repo if target_is_git_remote: # push the main branches of interest first, but not yet (necessarily) # the git-annex branch. We want to push first in order to hit any # conflicts or unknown history before we move data. Otherwise our # decision making done above (--since ...) might have been # inappropriate. push_ok = True for p in _push_refspecs(repo, target, refspecs2push, force, res_kwargs.copy()): if p['status'] not in ('ok', 'notneeded'): push_ok = False yield p if not push_ok: # error-type results have been yielded, the local status quo is # outdated/invalid, stop to let user decide how to proceed. # TODO final global error result for the dataset?! return # git-annex data move # if not is_annex_repo: return if force == 'no-datatransfer': lgr.debug("Data transfer to '%s' disabled by argument", target) return log_progress(lgr.info, pbar_id, "Transfer data", label="Transfer data to '{}'".format(target), update=2, total=4) yield from _push_data( ds, target, content, force, jobs, res_kwargs.copy(), got_path_arg=got_path_arg, ) if not target_is_git_remote: # there is nothing that we need to push or sync with on the git-side # of things with this remote return log_progress(lgr.info, pbar_id, "Update availability information", label="Update availability for '{}'".format(target), update=3, total=4) # after file transfer the remote might have different commits to # the annex branch. They have to be merged locally, otherwise a # push of it further down will fail try: # fetch remote, let annex sync them locally, so that the push # later on works. # We have to fetch via the push url (if there is any), # not a pull url.
# The latter might be dumb and without the execution of a # post-update hook we might not be able to retrieve the # server-side git-annex branch updates (and git-annex does # not trigger the hook on copy), but we know we have # full access via the push url -- we have just used it to copy. lgr.debug("Fetch 'git-annex' branch updates from '%s'", target) fetch_cmd = ['fetch', target, 'git-annex'] pushurl = repo.config.get('remote.{}.pushurl'.format(target), None) if pushurl: # for some reason overwriting remote.{target}.url # does not have any effect... fetch_cmd = [ '-c', 'url.{}.insteadof={}'.format( pushurl, repo.config.get('remote.{}.url'.format(target), None)) ] + fetch_cmd lgr.debug("Sync local annex branch from pushurl after remote " 'availability update.') repo.call_git(fetch_cmd) repo.localsync(target) except CommandError as e: # it is OK if the remote doesn't have a git-annex branch yet # (e.g. fresh repo) # TODO is this possible? we just copied? Maybe check if anything # was actually copied? if "fatal: couldn't find remote ref git-annex" not in e.stderr.lower(): raise lgr.debug('Remote does not have a git-annex branch: %s', e) # and push the annex branch to announce local availability info # too yield from _push_refspecs( repo, target, [ 'git-annex' if ds.config.get('branch.git-annex.merge', None) else 'git-annex:git-annex' ], force, res_kwargs.copy(), )
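# A minimal sketch (hypothetical helper, not the DataLad API) of the refspec
# selection rule applied in _push() above: if a branch already has an
# upstream configured on the push target, push it by bare ref so Git updates
# the tracking ref; otherwise spell out an explicit `src:dst` refspec to
# avoid implicitly setting an upstream branch.
def select_refspec(branch, target, cfg):
    """`cfg` is a plain dict standing in for `ds.config` in this sketch."""
    ref = 'refs/heads/{}'.format(branch)
    has_upstream = (
        cfg.get('branch.{}.remote'.format(branch)) == target
        and cfg.get('branch.{}.merge'.format(branch)) is not None)
    return ref if has_upstream else '{r}:{r}'.format(r=ref)

# e.g. select_refspec('main', 'origin',
#                     {'branch.main.remote': 'origin',
#                      'branch.main.merge': 'refs/heads/main'})
# -> 'refs/heads/main'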
def test_save(path): ds = Dataset(path) with open(op.join(path, "new_file.tst"), "w") as f: f.write("something") ds.repo.add("new_file.tst", git=True) ok_(ds.repo.dirty) ds.save(message="add a new file") assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) with open(op.join(path, "new_file.tst"), "w") as f: f.write("modify") ok_(ds.repo.dirty) ds.save(message="modified new_file.tst") assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) # save works without ds and files given in the PWD with open(op.join(path, "new_file.tst"), "w") as f: f.write("rapunzel") with chpwd(path): save(message="love rapunzel") assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) # and also without `-a` when things are staged with open(op.join(path, "new_file.tst"), "w") as f: f.write("exotic") ds.repo.add("new_file.tst", git=True) with chpwd(path): save(message="love marsians") assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) files = ['one.txt', 'two.txt'] for fn in files: with open(op.join(path, fn), "w") as f: f.write(fn) ds.save([op.join(path, f) for f in files]) # superfluous call to save (all saved already), should not fail # but report that nothing was saved assert_status('notneeded', ds.save(message="set of new files")) assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) # create subdataset subds = ds.create('subds') assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) # modify subds with open(op.join(subds.path, "some_file.tst"), "w") as f: f.write("something") subds.save() assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo)) # ensure modified subds is committed ds.save() assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) # now introduce a change downstairs subds.create('someotherds') assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo)) ok_(ds.repo.dirty) # and save via subdataset path ds.save('subds', version_tag='new_sub') assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) tags = ds.repo.get_tags() ok_(len(tags) == 1) eq_(tags[0], dict(hexsha=ds.repo.get_hexsha(), name='new_sub')) # fails when retagged, like git does res = ds.save(version_tag='new_sub', on_failure='ignore') assert_status('error', res) assert_result_count(res, 1, action='save', type='dataset', path=ds.path, message=('cannot tag this version: %s', "fatal: tag 'new_sub' already exists"))
def test_ls_json(topdir, topurl): annex = AnnexRepo(topdir, create=True) ds = Dataset(topdir) # create some file and commit it with open(opj(ds.path, 'subdsfile.txt'), 'w') as f: f.write('123') ds.rev_save(path='subdsfile.txt', message="Hello!", version_tag=1) # add a subdataset ds.install('subds', source=topdir) subdirds = ds.rev_create(_path_('dir/subds2'), force=True) subdirds.rev_save('file') git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True) # create git repo git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt')) # commit to git to init git repo git.commit() annex.add(opj(topdir, 'dir', 'subgit')) # add the non-dataset git repo to annex annex.add(opj(topdir, 'dir')) # add to annex (links) annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force']) # broken-link annex.commit() git.add('fgit.txt') # commit to git to init git repo git.commit() # annex.add doesn't add submodule, so using ds.add ds.rev_save(opj('dir', 'subgit')) # add the non-dataset git repo to annex ds.rev_save('dir') # add to annex (links) ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False) # broken-link # register "external" submodule by installing and uninstalling it ext_url = topurl + '/dir/subgit/.git' # need to make it installable via http Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit')) ds.install(opj('dir', 'subgit_ext'), source=ext_url) ds.uninstall(opj('dir', 'subgit_ext')) meta_dir = opj('.git', 'datalad', 'metadata') def get_metahash(*path): if not path: path = ['/'] return hashlib.md5(opj(*path).encode('utf-8')).hexdigest() def get_metapath(dspath, *path): return _path_(dspath, meta_dir, get_metahash(*path)) def get_meta(dspath, *path): with open(get_metapath(dspath, *path)) as f: return js.load(f) # Let's see that there is no crash if one of the files is available only # in relaxed URL mode, so no size could be picked up ds.repo.add_url_to_file('fromweb', topurl + '/noteventhere', options=['--relaxed']) for all_ in [True, False]: # recurse directories for recursive in [True, False]: for state in ['file', 'delete']: # subdataset should have its json created and deleted when # all=True else not subds_metapath = get_metapath(opj(topdir, 'subds')) exists_prior = exists(subds_metapath) #with swallow_logs(), swallow_outputs(): dsj = _ls_json(topdir, json=state, all_=all_, recursive=recursive) ok_startswith(dsj['tags'], '1-') exists_post = exists(subds_metapath) # print("%s %s -> %s" % (state, exists_prior, exists_post)) assert_equal(exists_post, (state == 'file' and recursive)) # root should have its json file created and deleted in all cases ds_metapath = get_metapath(topdir) assert_equal(exists(ds_metapath), state == 'file') # children should have their metadata json's created and deleted only when recursive=True child_metapath = get_metapath(topdir, 'dir', 'subdir') assert_equal(exists(child_metapath), (state == 'file' and all_)) # ignored directories should not have json files created in any case for subdir in [('.hidden', ), ('dir', 'subgit')]: assert_false(exists(get_metapath(topdir, *subdir))) # check if its updated in its nodes sublist too. used by web-ui json. 
regression test assert_equal(dsj['nodes'][0]['size']['total'], dsj['size']['total']) # check size of subdataset subds = [ item for item in dsj['nodes'] if item['name'] == ('subdsfile.txt' or 'subds') ][0] assert_equal(subds['size']['total'], '3 Bytes') # dir/subds2 must not be listed among nodes of the top dataset: topds_nodes = {x['name']: x for x in dsj['nodes']} assert_in('subds', topds_nodes) # XXX # # condition here is a bit a guesswork by yoh later on # # TODO: here and below clear destiny/interaction of all_ and recursive # assert_equal(dsj['size']['total'], # '15 Bytes' if (recursive and all_) else # ('9 Bytes' if (recursive or all_) else '3 Bytes') # ) # https://github.com/datalad/datalad/issues/1674 if state == 'file' and all_: dirj = get_meta(topdir, 'dir') dir_nodes = {x['name']: x for x in dirj['nodes']} # it should be present in the subdir meta assert_in('subds2', dir_nodes) assert_not_in('url_external', dir_nodes['subds2']) assert_in('subgit_ext', dir_nodes) assert_equal(dir_nodes['subgit_ext']['url'], ext_url) # and not in topds assert_not_in('subds2', topds_nodes) # run non-recursive dataset traversal after subdataset metadata already created # to verify sub-dataset metadata being picked up from its metadata file in such cases if state == 'file' and recursive and not all_: dsj = _ls_json(topdir, json='file', all_=False) subds = [ item for item in dsj['nodes'] if item['name'] == ('subdsfile.txt' or 'subds') ][0] assert_equal(subds['size']['total'], '3 Bytes') assert_equal(topds_nodes['fromweb']['size']['total'], UNKNOWN_SIZE)
def test_save_hierarchy(path): # this test doesn't use API`remove` to avoid circularities ds = make_demo_hierarchy_datasets(path, demo_hierarchy) ds.add('.', recursive=True) ok_clean_git(ds.path) ds_bb = Dataset(opj(ds.path, 'b', 'bb')) ds_bba = Dataset(opj(ds_bb.path, 'bba')) ds_bbaa = Dataset(opj(ds_bba.path, 'bbaa')) # introduce a change at the lowest level ds_bbaa.repo.remove('file_bbaa') for d in (ds, ds_bb, ds_bba, ds_bbaa): ok_(d.repo.dirty) # need to give file specifically, otherwise it will simply just preserve # staged changes ds_bb.save(path=opj(ds_bbaa.path, 'file_bbaa')) # it has saved all changes in the subtrees spanned # by the given datasets, but nothing else for d in (ds_bb, ds_bba, ds_bbaa): ok_clean_git(d.path) ok_(ds.repo.dirty) # now with two modified repos d = Dataset(opj(ds.path, 'd')) da = Dataset(opj(d.path, 'da')) da.repo.remove('file_da') db = Dataset(opj(d.path, 'db')) db.repo.remove('file_db') # generator d.save(recursive=True) for d in (d, da, db): ok_clean_git(d.path) ok_(ds.repo.dirty) # and now with files all over the place and saving # all the way to the root aa = Dataset(opj(ds.path, 'a', 'aa')) aa.repo.remove('file_aa') ba = Dataset(opj(ds.path, 'b', 'ba')) ba.repo.remove('file_ba') bb = Dataset(opj(ds.path, 'b', 'bb')) bb.repo.remove('file_bb') c = Dataset(opj(ds.path, 'c')) c.repo.remove('file_c') ca = Dataset(opj(ds.path, 'c', 'ca')) ca.repo.remove('file_ca') d = Dataset(opj(ds.path, 'd')) d.repo.remove('file_d') ds.save( # append trailing slashes to the path to indicate that we want to # have the staged content in the dataset saved, rather than only the # subdataset state in the respective superds. # an alternative would have been to pass `save` annotated paths of # type {'path': dspath, 'process_content': True} for each dataset # in question, but here we want to test how this would most likely # by used from cmdline path=[opj(p, '') for p in (aa.path, ba.path, bb.path, c.path, ca.path, d.path)], super_datasets=True)
def test_unlock(path): ds = Dataset(path) # file is currently locked: # TODO: use get_annexed_files instead of hardcoded filename assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w") # in direct mode there is no unlock: if ds.repo.is_direct_mode(): res = ds.unlock() assert_result_count(res, 1) assert_status('notneeded', res) # in V6 we can unlock even if the file's content isn't present: elif ds.repo.config.getint("annex", "version") == 6: res = ds.unlock() assert_result_count(res, 1) assert_status('ok', res) # TODO: RF: make 'lock' a command as well # re-lock to further on have a consistent situation with V5: ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock']) else: # cannot unlock without content (annex get wasn't called) assert_raises(CommandError, ds.unlock) # FIXME ds.repo.get('test-annex.dat') result = ds.unlock() assert_result_count(result, 1) if ds.repo.is_direct_mode(): assert_status('notneeded', result) else: assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='ok') with open(opj(path, 'test-annex.dat'), "w") as f: f.write("change content") ds.repo.add('test-annex.dat') # in V6 we need to explicitly re-lock it: if ds.repo.config.getint("annex", "version") == 6: # TODO: RF: make 'lock' a command as well # re-lock to further on have a consistent situation with V5: ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock']) ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again") if not ds.repo.is_direct_mode(): # after commit, file is locked again: assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w") # content was changed: with open(opj(path, 'test-annex.dat'), "r") as f: eq_("change content", f.read()) # unlock again, this time more specific: result = ds.unlock(path='test-annex.dat') assert_result_count(result, 1) if ds.repo.is_direct_mode(): assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='notneeded') else: assert_in_results(result, path=opj(ds.path, 'test-annex.dat'), status='ok') with open(opj(path, 'test-annex.dat'), "w") as f: f.write("change content again") ds.repo.add('test-annex.dat') # in V6 we need to explicitly re-lock it: if ds.repo.config.getint("annex", "version") == 6: # TODO: RF: make 'lock' a command as well # re-lock to further on have a consistent situation with V5: ds.repo._git_custom_command('test-annex.dat', ['git', 'annex', 'lock']) ds.repo.commit("edit 'test-annex.dat' via unlock and lock it again") # TODO: # BOOOM: test-annex.dat writeable in V6! # Why the hell is this different than the first time we wrote to the file # and locked it again? # Also: After opening the file is empty. if not ds.repo.is_direct_mode(): # after commit, file is locked again: assert_raises(IOError, open, opj(path, 'test-annex.dat'), "w") # content was changed: with open(opj(path, 'test-annex.dat'), "r") as f: eq_("change content again", f.read())
def test_get_subdatasets(path): ds = Dataset(path) # one more subdataset with a name that could ruin config option parsing dots = str(Path('subdir') / '.lots.of.dots.') ds.create(dots) eq_(ds.subdatasets(recursive=True, fulfilled=False, result_xfm='relpaths'), [ 'sub dataset1' ]) ds.get('sub dataset1') eq_(ds.subdatasets(recursive=True, fulfilled=False, result_xfm='relpaths'), [ 'sub dataset1/2', 'sub dataset1/sub sub dataset1', 'sub dataset1/subm 1', ]) # obtain key subdataset, so all leaf subdatasets are discoverable ds.get(opj('sub dataset1', 'sub sub dataset1')) eq_(ds.subdatasets(result_xfm='relpaths'), ['sub dataset1', dots]) eq_([(r['parentds'], r['path']) for r in ds.subdatasets()], [(path, opj(path, 'sub dataset1')), (path, opj(path, dots))]) all_subs = [ 'sub dataset1', 'sub dataset1/2', 'sub dataset1/sub sub dataset1', 'sub dataset1/sub sub dataset1/2', 'sub dataset1/sub sub dataset1/subm 1', 'sub dataset1/subm 1', dots, ] eq_(ds.subdatasets(recursive=True, result_xfm='relpaths'), all_subs) with chpwd(str(ds.pathobj)): # imitate cmdline invocation w/ no dataset argument eq_(subdatasets(dataset=None, path=[], recursive=True, result_xfm='relpaths'), all_subs) # redo, but limit to specific paths eq_( ds.subdatasets( path=['sub dataset1/2', 'sub dataset1/sub sub dataset1'], recursive=True, result_xfm='relpaths'), [ 'sub dataset1/2', 'sub dataset1/sub sub dataset1', 'sub dataset1/sub sub dataset1/2', 'sub dataset1/sub sub dataset1/subm 1', ] ) eq_( ds.subdatasets( path=['sub dataset1'], recursive=True, result_xfm='relpaths'), [ 'sub dataset1', 'sub dataset1/2', 'sub dataset1/sub sub dataset1', 'sub dataset1/sub sub dataset1/2', 'sub dataset1/sub sub dataset1/subm 1', 'sub dataset1/subm 1', ] ) with chpwd(str(ds.pathobj / 'subdir')): # imitate cmdline invocation w/ no dataset argument # -> curdir limits the query, when no info is given eq_(subdatasets(dataset=None, path=[], recursive=True, result_xfm='paths'), [str(ds.pathobj / dots)] ) # but with a dataset explicitly given, even if just as a path, # curdir does no limit the query eq_(subdatasets(dataset=os.pardir, path=None, recursive=True, result_xfm='relpaths'), ['sub dataset1', 'sub dataset1/2', 'sub dataset1/sub sub dataset1', 'sub dataset1/sub sub dataset1/2', 'sub dataset1/sub sub dataset1/subm 1', 'sub dataset1/subm 1', dots] ) # uses slow, flexible query eq_(ds.subdatasets(recursive=True, bottomup=True, result_xfm='relpaths'), [ 'sub dataset1/2', 'sub dataset1/sub sub dataset1/2', 'sub dataset1/sub sub dataset1/subm 1', 'sub dataset1/sub sub dataset1', 'sub dataset1/subm 1', 'sub dataset1', dots, ]) eq_(ds.subdatasets(recursive=True, fulfilled=True, result_xfm='relpaths'), [ 'sub dataset1', 'sub dataset1/sub sub dataset1', dots, ]) eq_([(relpath(r['parentds'], start=ds.path), relpath(r['path'], start=ds.path)) for r in ds.subdatasets(recursive=True)], [ (os.curdir, 'sub dataset1'), ('sub dataset1', 'sub dataset1/2'), ('sub dataset1', 'sub dataset1/sub sub dataset1'), ('sub dataset1/sub sub dataset1', 'sub dataset1/sub sub dataset1/2'), ('sub dataset1/sub sub dataset1', 'sub dataset1/sub sub dataset1/subm 1'), ('sub dataset1', 'sub dataset1/subm 1'), (os.curdir, dots), ]) # uses slow, flexible query eq_(ds.subdatasets(recursive=True, recursion_limit=0), []) # uses slow, flexible query eq_(ds.subdatasets(recursive=True, recursion_limit=1, result_xfm='relpaths'), ['sub dataset1', dots]) # uses slow, flexible query eq_(ds.subdatasets(recursive=True, recursion_limit=2, result_xfm='relpaths'), [ 'sub dataset1', 'sub dataset1/2', 
'sub dataset1/sub sub dataset1', 'sub dataset1/subm 1', dots, ]) res = ds.subdatasets(recursive=True) assert_status('ok', res) for r in res: #for prop in ('gitmodule_url', 'state', 'revision', 'gitmodule_name'): for prop in ('gitmodule_url', 'revision', 'gitmodule_name'): assert_in(prop, r) # random property is unknown assert_not_in('mike', r) # now add info to all datasets res = ds.subdatasets( recursive=True, set_property=[('mike', 'slow'), ('expansion', '<{refds_relname}>')]) assert_status('ok', res) for r in res: eq_(r['gitmodule_mike'], 'slow') eq_(r['gitmodule_expansion'], relpath(r['path'], r['refds']).replace(os.sep, '-')) # plain query again to see if it got into the files res = ds.subdatasets(recursive=True) assert_status('ok', res) for r in res: eq_(r['gitmodule_mike'], 'slow') eq_(r['gitmodule_expansion'], relpath(r['path'], r['refds']).replace(os.sep, '-')) # and remove again res = ds.subdatasets(recursive=True, delete_property='mike') assert_status('ok', res) for r in res: for prop in ('gitmodule_mike',): assert_not_in(prop, r) # and again, because the above yields on-the-fly edits res = ds.subdatasets(recursive=True) assert_status('ok', res) for r in res: for prop in ('gitmodule_mike',): assert_not_in(prop, r) # # test --contains # target_sub = 'sub dataset1/sub sub dataset1/subm 1' # give the closest direct subdataset eq_(ds.subdatasets(contains=opj(target_sub, 'something_inside'), result_xfm='relpaths'), ['sub dataset1']) # should find the actual subdataset trail eq_(ds.subdatasets(recursive=True, contains=opj(target_sub, 'something_inside'), result_xfm='relpaths'), ['sub dataset1', 'sub dataset1/sub sub dataset1', 'sub dataset1/sub sub dataset1/subm 1']) # doesn't affect recursion limit eq_(ds.subdatasets(recursive=True, recursion_limit=2, contains=opj(target_sub, 'something_inside'), result_xfm='relpaths'), ['sub dataset1', 'sub dataset1/sub sub dataset1']) # for a direct dataset path match, return the matching dataset eq_(ds.subdatasets(recursive=True, contains=target_sub, result_xfm='relpaths'), ['sub dataset1', 'sub dataset1/sub sub dataset1', 'sub dataset1/sub sub dataset1/subm 1']) # but it has to be a subdataset, otherwise no match # which is what get_containing_subdataset() used to do assert_status('impossible', ds.subdatasets(contains=ds.path, on_failure='ignore')) # 'impossible' if contains is bullshit assert_status('impossible', ds.subdatasets(recursive=True, contains='impossible_yes', on_failure='ignore')) assert_status('impossible', ds.subdatasets(recursive=True, contains=opj(pardir, 'impossible_yes'), on_failure='ignore')) eq_(ds.subdatasets( recursive=True, contains=[target_sub, 'sub dataset1/2'], result_xfm='relpaths'), [ 'sub dataset1', 'sub dataset1/2', 'sub dataset1/sub sub dataset1', 'sub dataset1/sub sub dataset1/subm 1', ])
def __call__(path=None, dataset=None, recursive=False, recursion_limit=None, action=None, unavailable_path_status='', unavailable_path_msg=None, nondataset_path_status='error', force_parentds_discovery=True, force_subds_discovery=True, force_no_revision_change_discovery=True, force_untracked_discovery=True, modified=None): # upfront check for the fastest possible response if not path and dataset is None: # nothing given, try "here", but do not use `require_dataset`, as # it will determine the root dataset of `curdir` and further down # lead to path annotation of upstairs directories dataset = curdir if force_subds_discovery and not force_parentds_discovery: raise ValueError( 'subdataset discovery requires parent dataset discovery') # CONCEPT: yield with no status to indicate further processing # everything in one big loop to be able to yield as fast as possible # without any precomputing for all paths refds_path = Interface.get_refds_path(dataset) if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)): raise ValueError( "modification detection only works with a base dataset (non-given or found)" ) # prep common result props res_kwargs = dict(action=action if action else 'annotate_path', refds=refds_path, logger=lgr) # handle the case of recursion into a single dataset without any # extra fancy processing first -- full recursion can be done # faster than manual recursion, hence we gain quite some speed # from these few lines of extra code if not modified and not path and refds_path: if not GitRepo.is_valid_repo(refds_path): yield get_status_dict( # doesn't matter if the path is in another dataset # it was given as reference dataset status=nondataset_path_status, message='given reference dataset is not a dataset', path=refds_path, **res_kwargs) return refds = Dataset(refds_path) path = [] # yield the dataset itself r = get_status_dict(ds=refds, status='', **res_kwargs) yield r if recursive: # if we have nothing given, but need recursion, we need to feed # the dataset path itself for r in yield_recursive(refds, refds_path, action, recursion_limit): r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] yield r return # goal: structure in a way that makes most information on any path # available in a single pass, at the cheapest possible cost reported_paths = {} requested_paths = assure_list(path) if modified is not None: # modification detection would silently kill all nondataset paths # but we have to complain about them, hence doing it here if requested_paths and refds_path: for r in requested_paths: p = r['path'] if isinstance(r, dict) else r p = resolve_path(p, ds=refds_path) if _with_sep(p).startswith(_with_sep(refds_path)): # all good continue # not the refds path_props = r if isinstance(r, dict) else {} res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with reference dataset' reported_paths[p] = res yield res # replace the requested paths by those paths that were actually # modified underneath or at a requested location requested_paths = get_modified_subpaths( # either the request, or the base dataset, if there was no request requested_paths if requested_paths else [refds_path], refds=Dataset(refds_path), revision=modified, report_no_revision_change=force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit) # do not loop over unique(), this could be a list of
dicts # we avoid duplicates manually below via `reported_paths` for path in requested_paths: if not isinstance(path, dict): path = rawpath2ap(path, refds_path) # this is now an annotated path! path_props = path path = path['path'] # we need to mark our territory, who knows where this has been path_props.update(res_kwargs) if path in reported_paths: # we already recorded this path in the output # this can happen, whenever `path` is a subdataset, that was # discovered via recursive processing of another path before continue # the path exists in some shape or form # TODO if we have path_props already we could skip this test if isdir(path): # keep any existing type info, previously a more expensive run # could have discovered an uninstalled 'dataset', and we don't # want it to be relabeled to a directory path_props['type'] = \ path_props.get( 'type', 'dataset' if GitRepo.is_valid_repo(path) else 'directory') # this could contain all types of additional content containing_dir = path else: if lexists(path): path_props['type'] = 'file' else: path_props['state'] = 'absent' # for everything else we are interested in the container containing_dir = dirname(path) if not containing_dir: containing_dir = curdir dspath = parent = get_dataset_root(containing_dir) if dspath: if path_props.get('type', None) == 'dataset': # for a dataset the root is not the parent, for anything else # it is parent = path_props.get('parentds', None) oneupdir = normpath(opj(containing_dir, pardir)) if parent is None and (force_parentds_discovery or (refds_path and _with_sep(oneupdir).startswith( _with_sep(refds_path)))): # either forced, or only if we have a reference dataset, and # only if we stay within this refds when searching for the # parent parent = get_dataset_root( normpath(opj(containing_dir, pardir))) # NOTE the `and refds_path` is critical, as it will determine # whether a top-level dataset that was discovered gets the # parent property or not, it won't get it without a common # base dataset, and that is how we always rolled if parent and refds_path: path_props['parentds'] = parent # don't check whether this is actually a true subdataset of the # parent, done further down else: # set parent, but prefer existing property path_props['parentds'] = path_props.get('parentds', dspath) # test for `dspath` not `parent`, we only need to know whether there is # ANY dataset, not which one is the true parent, logic below relies on # the fact that we end here, if there is no dataset at all if not dspath: # not in any dataset res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with any dataset' reported_paths[path] = res yield res continue # check that we only got SUBdatasets if refds_path and not _with_sep(dspath).startswith( _with_sep(refds_path)): res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = \ ('path not part of the reference dataset at %s', refds_path) reported_paths[path] = res yield res continue if path_props.get('type', None) == 'file': # nothing else we can learn about this res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res continue containing_ds = None path_type = path_props.get('type', None) if parent and force_subds_discovery and ( (path_type == 'dataset' and 'registered_subds' not in path_props) or path_type == 'directory' or not lexists(path)): # if the path doesn't exist, or is labeled 
a directory, or a dataset even # a dataset (without this info) -> record whether this is a known subdataset # to its parent containing_ds = Dataset(parent) subdss = containing_ds.subdatasets(fulfilled=None, recursive=False, result_xfm=None, result_filter=None, return_type='list') if path in [s['path'] for s in subdss]: if path_type == 'directory' or not lexists(path): # first record that it isn't here, if just a dir or not here at all path_props['state'] = 'absent' # this must be a directory, and it is not installed path_props['type'] = 'dataset' path_props['registered_subds'] = True if not lexists(path) or \ (path_props.get('type', None) == 'dataset' and path_props.get('state', None) == 'absent'): # not there (yet) message = unavailable_path_msg if unavailable_path_msg else None if message and '%s' in message: message = (message, path) path_props['message'] = message res = get_status_dict(**dict(res_kwargs, **path_props)) # assign given status, but only if the props don't indicate a status # already res['status'] = path_props.get('status', unavailable_path_status) reported_paths[path] = res yield res continue # we know everything we can, report res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res rec_paths = [] if recursive: # here we need to consider the special case that `path` is # a dataset itself, if a recursion_limit is given (e.g. # `remove` will do that by default), we need to recurse # from the dataset itself, and not its parent to get things # right -- this will also avoid needless discovery of # unrelated subdatasets if path_props.get('type', None) == 'dataset': containing_ds = Dataset(path) else: # regular parent, we might have a dataset already containing_ds = Dataset( parent) if containing_ds is None else containing_ds for r in yield_recursive(containing_ds, path, action, recursion_limit): # capture reported paths r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] reported_paths[r['path']] = r if modified is not None: # we cannot yield right away, maybe it wasn't modified rec_paths.append(r) else: yield r if modified is not None and rec_paths: # replace the recursively discovered paths by those paths that # were actually modified underneath or at a requested location for r in get_modified_subpaths( rec_paths, refds=Dataset(refds_path), revision=modified, report_no_revision_change= force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit): res = get_status_dict(**dict(r, **res_kwargs)) reported_paths[res['path']] = res yield res return
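# A minimal sketch of the subpath containment check that the annotation logic
# above relies on (via _with_sep()): appending the path separator before the
# prefix comparison ensures that '/data/dsets-old' is not mistaken for a path
# inside '/data/dsets'. Paths in the example are hypothetical.
import os.path as op


def is_within(path, root):
    """True if `path` equals `root` or lies underneath it."""
    path = op.normpath(path)
    root = op.normpath(root)
    return path == root or path.startswith(root + op.sep)

# e.g. is_within('/data/dsets/sub', '/data/dsets')  -> True
#      is_within('/data/dsets-old', '/data/dsets')  -> False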