def test_dirty(path):
    for mode in _dirty_modes:
        # does nothing without a dataset
        handle_dirty_dataset(None, mode)
    # placeholder, but not yet created
    ds = Dataset(path)
    # unknown mode
    assert_raises(ValueError, handle_dirty_dataset, ds, 'MADEUP')
    # not yet created is very dirty
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'fail')
    handle_dirty_dataset(ds, 'ignore')
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'save-before')
    # should yield a clean repo
    ds.create()
    orig_state = ds.repo.get_hexsha()
    _check_all_clean(ds, orig_state)
    # tainted: untracked
    with open(opj(ds.path, 'something'), 'w') as f:
        f.write('some')
    # we don't want to auto-add untracked files by saving (anymore)
    assert_raises(AssertionError, _check_auto_save, ds, orig_state)
    # tainted: staged
    ds.repo.add('something', git=True)
    orig_state = _check_auto_save(ds, orig_state)
    # tainted: submodule
    # not added to super on purpose!
    subds = ds.create('subds')
    _check_all_clean(subds, subds.repo.get_hexsha())
    ok_clean_git(ds.path)
    # subdataset must be added as a submodule!
    assert_equal(ds.subdatasets(result_xfm='relpaths'), ['subds'])
def test_diff_nonexistent_ref_unicode(path):
    ds = Dataset(path).create()
    assert_result_count(
        ds.diff(fr="HEAD", to=u"β", on_failure="ignore"),
        1,
        path=ds.path,
        status="impossible")
def test_unlock_raises(path, path2, path3):
    # make sure, we are not within a dataset:
    _cwd = getpwd()
    chpwd(path)

    # no dataset and no path:
    assert_raises(InsufficientArgumentsError,
                  unlock, dataset=None, path=None)
    # no dataset and path not within a dataset:
    with swallow_logs(new_level=logging.WARNING) as cml:
        unlock(dataset=None, path=path2)
        assert_in("ignored paths that do not belong to any dataset: ['{0}'".format(path2),
                  cml.out)

    create(path=path, no_annex=True)
    ds = Dataset(path)
    # no complaints
    ds.unlock()

    # make it annex, but call unlock with invalid path:
    AnnexRepo(path, create=True)
    with swallow_logs(new_level=logging.WARNING) as cml:
        ds.unlock(path="notexistent.txt")
        assert_in("ignored non-existing paths", cml.out)

    chpwd(_cwd)
def test_get_metadata(path): ds = Dataset(path).create(force=True) ds.save() meta = MetadataExtractor(ds, [])._get_dataset_metadata() assert_equal( dumps(meta, sort_keys=True, indent=2), """\ { "citation": "Cool (2016)", "conformsto": "http://docs.datalad.org/metadata.html#v0-1", "description": "A text with arbitrary length and content that can span multiple\\nparagraphs (this is a new one)", "fundedby": "BMBFGQ1411, NSF 1429999", "homepage": "http://studyforrest.org", "issuetracker": "https://github.com/psychoinformatics-de/studyforrest-data-phase2/issues", "license": [ "CC0", "The person who associated a work with this deed has dedicated the work to the public domain by waiving all of his or her rights to the work worldwide under copyright law, including all related and neighboring rights, to the extent allowed by law.\\nYou can copy, modify, distribute and perform the work, even for commercial purposes, all without asking permission." ], "maintainer": [ "Mike One <*****@*****.**>", "Anna Two <*****@*****.**>" ], "name": "studyforrest_phase2", "sameas": "http://dx.doi.org/10.5281/zenodo.48421", "shortdescription": "Basic summary", "version": "1.0.0-rc3" }""")
def test_running_on_empty(path):
    # empty repo
    repo = AnnexRepo(path, create=True)
    # just wrap with a dataset
    ds = Dataset(path)
    # and run status ... should be good and do nothing
    eq_([], ds.status())
def test_gh2043p1(path):
    # this test documents the interim agreement on what should happen
    # in the case documented in gh-2043
    ds = Dataset(path).create(force=True)
    ds.save('1')
    assert_repo_status(ds.path, untracked=['2', '3'])
    ds.unlock('1')
    assert_repo_status(
        ds.path,
        # on windows we are in an unlocked branch by default, hence
        # we would see no change
        modified=[] if on_windows else ['1'],
        untracked=['2', '3'])
    # save(.) should recommit unlocked file, and not touch anything else
    # this tests the second issue in #2043
    with chpwd(path):
        # only save modified bits
        save(path='.', updated=True)
    # state of the file (unlocked/locked) is committed as well, and the
    # test doesn't lock the file again
    assert_repo_status(ds.path, untracked=['2', '3'])
    with chpwd(path):
        # but when a path is given, anything that matches this path
        # untracked or not is added/saved
        save(path='.')
    # state of the file (unlocked/locked) is committed as well, and the
    # test doesn't lock the file again
    assert_repo_status(ds.path)
def test_symlinked_relpath(path):
    # initially ran into on OSX https://github.com/datalad/datalad/issues/2406
    os.makedirs(op.join(path, "origin"))
    dspath = op.join(path, "linked")
    os.symlink('origin', dspath)
    ds = Dataset(dspath).create()
    create_tree(dspath, {
        "mike1": 'mike1',  # will be added from topdir
        "later": "later",  # later from within subdir
        "d": {
            "mike2": 'mike2',  # to be added within subdir
        }
    })

    # in the root of ds
    with chpwd(dspath):
        ds.repo.add("mike1", git=True)
        ds.save(message="committing", path="./mike1")

    # Let's also do in subdirectory as CWD, and check that relative paths
    # given to a plain command (not a dataset method) are treated as
    # relative to CWD
    with chpwd(op.join(dspath, 'd')):
        save(dataset=ds.path,
             message="committing",
             path="mike2")

        later = op.join(op.pardir, "later")
        ds.repo.add(later, git=True)
        save(dataset=ds.path, message="committing", path=later)

    assert_repo_status(dspath)
def test_unlock_raises(path, path2, path3):
    # make sure, we are not within a dataset:
    _cwd = getpwd()
    chpwd(path)

    # no dataset and no path:
    assert_raises(InsufficientArgumentsError,
                  unlock, dataset=None, path=None)
    # no dataset and path not within a dataset:
    res = unlock(dataset=None, path=path2, result_xfm=None,
                 on_failure='ignore', return_type='item-or-list')
    eq_(res['message'], "path not associated with any dataset")
    eq_(res['path'], path2)

    create(path=path, no_annex=True)
    ds = Dataset(path)
    # no complaints
    ds.unlock()

    # make it annex, but call unlock with invalid path:
    AnnexRepo(path, create=True)
    res = ds.unlock(path="notexistent.txt", result_xfm=None,
                    on_failure='ignore', return_type='item-or-list')
    eq_(res['message'], "path does not exist")

    chpwd(_cwd)
def test_subdataset_save(path):
    parent = Dataset(path).create()
    sub = parent.create('sub')
    assert_repo_status(parent.path)
    create_tree(parent.path, {
        "untracked": 'ignore',
        'sub': {
            "new": "wanted"}})
    sub.save('new')
    # defined state: one untracked, modified (but clean in itself) subdataset
    assert_repo_status(sub.path)
    assert_repo_status(parent.path, untracked=['untracked'], modified=['sub'])

    # `save sub` does not save the parent!!
    with chpwd(parent.path):
        assert_status('notneeded', save(dataset=sub.path))
    assert_repo_status(parent.path, untracked=['untracked'], modified=['sub'])
    # `save -u .` saves the state change in the subdataset,
    # but leaves any untracked content alone
    with chpwd(parent.path):
        assert_status('ok', parent.save(updated=True))
    assert_repo_status(parent.path, untracked=['untracked'])

    # get back to the original modified state and check that -S behaves in
    # exactly the same way
    create_tree(parent.path, {
        'sub': {
            "new2": "wanted2"}})
    sub.save('new2')
    assert_repo_status(parent.path, untracked=['untracked'], modified=['sub'])
def test_run_from_subds(path):
    if 'APPVEYOR' in os.environ:
        raise SkipTest('test causes appveyor (only) to crash, reason unknown')

    subds = Dataset(path).create().create("sub")
    subds.run("cd .> foo")
    assert_repo_status(subds.path)
def test_rerun_just_one_commit(path):
    ds = Dataset(path).create()
    # Check out an orphan branch so that we can test the "one commit
    # in a repo" case.
    ds.repo.checkout("orph", options=["--orphan"])
    ds.repo.repo.git.reset("--hard")
    ds.repo.config.reload()

    ds.run('echo static-content > static')
    assert_result_count(ds.repo.repo.git.rev_list("HEAD").split(), 1)

    # Rerunning with just one commit doesn't raise an error ...
    ds.rerun()
    # ... but we're still at one commit because the content didn't
    # change.
    assert_result_count(ds.repo.repo.git.rev_list("HEAD").split(), 1)

    # We abort rather than trying to do anything when --onto='' and
    # --since='' are given together and the first commit contains a
    # run command.
    ds.repo.commit(msg="empty", options=["--allow-empty"])
    assert_raises(IncompleteResultsError, ds.rerun, since="", onto="")

    # --script propagates the error.
    with swallow_outputs():
        assert_raises(IncompleteResultsError,
                      ds.rerun, since="", onto="", script="-")
    # --dry-run propagates the error.
    assert_raises(IncompleteResultsError,
                  ds.rerun, since="", onto="",
                  report=True, return_type="list")
def test_add_recursive(path):
    # make simple hierarchy
    parent = Dataset(path).create()
    assert_repo_status(parent.path)
    sub1 = parent.create(op.join('down', 'sub1'))
    assert_repo_status(parent.path)
    sub2 = parent.create('sub2')
    # the next one makes the parent dirty
    subsub = sub2.create('subsub')
    assert_repo_status(parent.path, modified=['sub2'])
    res = parent.save()
    assert_repo_status(parent.path)
    # now add content deep in the hierarchy
    create_tree(subsub.path, {'new': 'empty'})
    assert_repo_status(parent.path, modified=['sub2'])

    # recursive add should not even touch sub1, because
    # it knows that it is clean
    res = parent.save(recursive=True)
    # the key action is done
    assert_result_count(
        res, 1, path=op.join(subsub.path, 'new'), action='add', status='ok')
    # saved all the way up
    assert_result_count(res, 3, action='save', status='ok')
    assert_repo_status(parent.path)
def check_renamed_file(recursive, no_annex, path):
    ds = Dataset(path).create(no_annex=no_annex)
    create_tree(path, {'old': ''})
    ds.add('old')
    ds.repo._git_custom_command(['old', 'new'], ['git', 'mv'])
    ds._save(recursive=recursive)
    ok_clean_git(path)
def test_status_basics(path, linkpath, otherdir): if not on_windows: # make it more complicated by default ut.Path(linkpath).symlink_to(path, target_is_directory=True) path = linkpath with chpwd(path): assert_raises(NoDatasetArgumentFound, status) ds = Dataset(path).create() # outcome identical between ds= and auto-discovery with chpwd(path): assert_raises(IncompleteResultsError, status, path=otherdir) stat = status() eq_(stat, ds.status()) assert_status('ok', stat) # we have a bunch of reports (be vague to be robust to future changes assert len(stat) > 2 # check the composition for s in stat: eq_(s['status'], 'ok') eq_(s['action'], 'status') eq_(s['state'], 'clean') eq_(s['type'], 'file') assert_in('gitshasum', s) assert_in('bytesize', s) eq_(s['refds'], ds.path)
def test_add_mimetypes(path):
    ds = Dataset(path).create(force=True)
    ds.repo.add('.gitattributes')
    ds.repo.commit('added attributes to git explicitly')
    # now test that those files will go into git/annex correspondingly
    # WINDOWS FAILURE NEXT
    __not_tested__ = ds.save(['file.txt', 'empty'])
    assert_repo_status(path, untracked=['file2.txt'])
    # But we should be able to force adding file to annex when desired
    ds.save('file2.txt', to_git=False)
    # check annex file status
    annexinfo = ds.repo.get_content_annexinfo()
    for path, in_annex in (
            # Empty one considered to be application/octet-stream,
            # i.e. non-text
            ('empty', True),
            ('file.txt', False),
            ('file2.txt', True)):
        # low-level API report -> repo path reference, no ds path
        p = ds.repo.pathobj / path
        assert_in(p, annexinfo)
        if in_annex:
            assert_in('key', annexinfo[p], p)
        else:
            assert_not_in('key', annexinfo[p], p)
def test_add_subdataset(path, other):
    subds = create(op.join(path, 'dir'), force=True)
    ds = create(path, force=True)
    ok_(subds.repo.dirty)
    ok_(ds.repo.dirty)
    assert_not_in('dir', ds.subdatasets(result_xfm='relpaths'))
    # "add everything in subds to subds"
    save(dataset=subds.path)
    assert_repo_status(subds.path)
    assert_not_in('dir', ds.subdatasets(result_xfm='relpaths'))
    # but with a base directory we add the dataset subds as a subdataset
    # to ds
    res = ds.save(subds.path)
    assert_in_results(res, action="add", path=subds.path, refds=ds.path)
    assert_in('dir', ds.subdatasets(result_xfm='relpaths'))
    # create another one
    other = create(other)
    # install into superdataset, but don't add
    other_clone = install(source=other.path, path=op.join(ds.path, 'other'))
    # little dance to get the revolution-type dataset
    other_clone = Dataset(other_clone.path)
    ok_(other_clone.is_installed())
    assert_not_in('other', ds.subdatasets(result_xfm='relpaths'))
    # now add, it should pick up the source URL
    ds.save('other')
    # and that is why, we can reobtain it from origin
    ds.uninstall('other')
    ok_(not other_clone.is_installed())
    ds.get('other')
    ok_(other_clone.is_installed())
def test_add_files(path):
    ds = Dataset(path).create(force=True)

    test_list_1 = ['test_annex.txt']
    test_list_2 = ['test.txt']
    test_list_3 = ['test1.dat', 'test2.dat']
    test_list_4 = [op.join('dir', 'testindir'),
                   op.join('dir', OBSCURE_FILENAME)]

    for arg in [(test_list_1[0], False),
                (test_list_2[0], True),
                (test_list_3, False),
                (test_list_4, False)]:
        # special case 4: give the dir:
        if arg[0] == test_list_4:
            result = ds.save('dir', to_git=arg[1])
            status = ds.repo.annexstatus(['dir'])
        else:
            result = ds.save(arg[0], to_git=arg[1])
            for a in assure_list(arg[0]):
                assert_result_count(result, 1, path=text_type(ds.pathobj / a))
            status = ds.repo.get_content_annexinfo(
                ut.Path(p) for p in assure_list(arg[0]))
        for f, p in iteritems(status):
            if arg[1]:
                assert p.get('key', None) is None, f
            else:
                assert p.get('key', None) is not None, f
def test_rerun_ambiguous_revision_file(path):
    ds = Dataset(path).create()
    ds.run('echo ambig > ambig')
    ds.repo.tag("ambig")
    # Don't fail when "ambig" refers to both a file and revision.
    ds.rerun(since="", revision="ambig", branch="rerun")
    eq_(len(ds.repo.repo.git.rev_list("rerun").split()),
        len(ds.repo.repo.git.rev_list("ambig", "--").split()))
def test_encoding(path):
    staged = OBSCURE_FILENAME + u'_staged'
    untracked = OBSCURE_FILENAME + u'_untracked'
    ds = Dataset(path).create(force=True)
    ds.repo.add(staged)
    assert_repo_status(ds.path, added=[staged], untracked=[untracked])
    ds.save(updated=True)
    assert_repo_status(ds.path, untracked=[untracked])
def make_demo_hierarchy_datasets(path, tree, parent=None):
    if parent is None:
        parent = Dataset(path).create(force=True)
    for node, items in tree.items():
        if isinstance(items, dict):
            node_path = opj(path, node)
            nodeds = parent.create(node_path, force=True)
            make_demo_hierarchy_datasets(node_path, items, parent=nodeds)
    return parent
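# A minimal usage sketch for make_demo_hierarchy_datasets, not taken from the
# test suite. It assumes the nested dict has already been materialized on disk
# (here via create_tree), because the helper uses force=True and only wraps
# existing content in (sub)datasets; the tree layout below is made up.
def _demo_hierarchy_example(tmpdir):
    demo_tree = {
        'file_in_super.txt': 'top-level content',
        'subA': {
            'file_in_subA.txt': 'sub content',
            'subAA': {'deep.txt': 'deep content'},
        },
    }
    create_tree(tmpdir, demo_tree)
    superds = make_demo_hierarchy_datasets(tmpdir, demo_tree)
    # everything below the superdataset is still untracked at this point
    superds.save(recursive=True)
    assert_repo_status(superds.path)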
def __call__( path=None, dataset=None, recursive=False, check=True, if_dirty='save-before'): if dataset and not path: # act on the whole dataset if nothing else was specified path = dataset.path if isinstance(dataset, Dataset) else dataset content_by_ds, unavailable_paths = Interface._prep( path=path, dataset=dataset, recursive=recursive) if unavailable_paths: lgr.warning('ignored non-installed paths: %s', unavailable_paths) # upfront sanity and compliance checks if path_is_under(content_by_ds.keys()): # behave like `rm` and refuse to remove where we are raise ValueError( "refusing to uninstall current or parent directory") # check that we have no top-level datasets and not files to process args_ok = True for ds_path in content_by_ds: ds = Dataset(ds_path) paths = content_by_ds[ds_path] if ds_path not in paths: lgr.error( "will not act on files at %s (consider the `drop` command)", paths) args_ok = False if not ds.get_superdataset( datalad_only=False, topmost=False): lgr.error( "will not uninstall top-level dataset at %s (consider the `remove` command)", ds.path) args_ok = False if not args_ok: raise ValueError( 'inappropriate arguments, see previous error message(s)') handle_dirty_datasets( content_by_ds, mode=if_dirty, base=dataset) results = [] # iterate over all datasets, starting at the bottom # to deinit contained submodules first for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) paths = content_by_ds[ds_path] results.extend( # we confirmed the super dataset presence above _uninstall_dataset(ds, check=check, has_super=True)) # there is nothing to save at the end return results
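# Hedged usage sketch (not part of the implementation above): how this
# uninstall logic is normally reached through the Dataset API. The 'sub'
# subdataset name is an assumption made purely for illustration.
def _uninstall_example(superpath):
    super_ds = Dataset(superpath).create()
    super_ds.create('sub')
    # uninstalling a registered subdataset is acceptable; asking to uninstall
    # the top-level dataset itself would be refused with a pointer to `remove`
    super_ds.uninstall(path='sub', recursive=True)
    # the subdataset stays registered; only its working tree is deinitialized
    assert_repo_status(super_ds.path)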
def test_bf2043p2(path):
    ds = Dataset(path).create(force=True)
    ds.add('staged', save=False)
    ok_clean_git(ds.path, head_modified=['staged'], untracked=['untracked'])
    # plain save does not commit untracked content
    # this tests the second issue in #2043
    with chpwd(path):
        save()
    ok_clean_git(ds.path, untracked=['untracked'])
def test_bf3285(path):
    ds = Dataset(path).create(force=True)
    # Note: Using repo.pathobj matters in the "TMPDIR=/var/tmp/sym\ link" case
    # because assert_repo_status is based off of {Annex,Git}Repo.path, which is
    # the realpath'd path (from the processing in _flyweight_id_from_args).
    subds = create(ds.repo.pathobj.joinpath("subds"))
    # Explicitly saving a path does not save an untracked, unspecified
    # subdataset.
    ds.save("foo")
    assert_repo_status(ds.path, untracked=[subds.path])
def test_py2_unicode_command(path):
    # Avoid OBSCURE_FILENAME to avoid windows-breakage (gh-2929).
    ds = Dataset(path).create()
    touch_cmd = "import sys; open(sys.argv[1], 'w').write('')"
    cmd_str = u"{} -c \"{}\" {}".format(sys.executable, touch_cmd, u"bβ0.dat")
    ds.run(cmd_str)
    assert_repo_status(ds.path)
    ok_exists(op.join(path, u"bβ0.dat"))

    if not on_windows:  # FIXME
        ds.run([sys.executable, "-c", touch_cmd, u"bβ1.dat"])
        assert_repo_status(ds.path)
        ok_exists(op.join(path, u"bβ1.dat"))

        # Send in a list of byte-strings to mimic a py2 command-line
        # invocation.
        ds.run([s.encode("utf-8")
                for s in [sys.executable, "-c", touch_cmd, u" β1 "]])
        assert_repo_status(ds.path)
        ok_exists(op.join(path, u" β1 "))

    with assert_raises(CommandError), swallow_outputs():
        ds.run(u"bβ2.dat")
def test_save_message_file(path):
    ds = Dataset(path).create()
    with assert_raises(ValueError):
        ds.save("blah", message="me", message_file="and me")

    create_tree(path, {"foo": "x",
                       "msg": "add foo"})
    ds.repo.add("foo")
    ds.save(message_file=op.join(ds.path, "msg"))
    eq_(ds.repo.repo.git.show("--format=%s", "--no-patch"),
        "add foo")
def test_create_curdir(path, path2):
    with chpwd(path, mkdir=True):
        create()
    ds = Dataset(path)
    ok_(ds.is_installed())
    assert_repo_status(ds.path, annex=True)

    with chpwd(path2, mkdir=True):
        create(no_annex=True)
    ds = Dataset(path2)
    ok_(ds.is_installed())
    assert_repo_status(ds.path, annex=False)
    ok_(op.exists(op.join(ds.path, '.noannex')))
def test_nested_create(path): # to document some more organic usage pattern ds = Dataset(path).create() assert_repo_status(ds.path) lvl2relpath = op.join('lvl1', 'lvl2') lvl2path = op.join(ds.path, lvl2relpath) os.makedirs(lvl2path) os.makedirs(op.join(ds.path, 'lvl1', 'empty')) with open(op.join(lvl2path, 'file'), 'w') as f: f.write('some') ok_(ds.save()) # Empty directories are filtered out. assert_repo_status(ds.path, untracked=[]) # later create subdataset in a fresh dir # WINDOWS FAILURE IS NEXT LINE subds1 = ds.create(op.join('lvl1', 'subds')) assert_repo_status(ds.path, untracked=[]) eq_(ds.subdatasets(result_xfm='relpaths'), [op.join('lvl1', 'subds')]) # later create subdataset in an existing empty dir subds2 = ds.create(op.join('lvl1', 'empty')) assert_repo_status(ds.path) # later try to wrap existing content into a new subdataset # but that won't work assert_in_results( ds.create(lvl2relpath, **raw), status='error', message=( 'collision with content in parent dataset at %s: %s', ds.path, [op.join(lvl2path, 'file')])) # even with force, as to do this properly complicated surgery would need to # take place # MIH disable shaky test till proper dedicated upfront check is in-place in `create` # gh-1725 #assert_in_results( # ds.create(lvl2relpath, force=True, # on_failure='ignore', result_xfm=None, result_filter=None), # status='error', action='add') # only way to make it work is to unannex the content upfront ds.repo._run_annex_command('unannex', annex_options=[op.join(lvl2relpath, 'file')]) # nothing to save, git-annex commits the unannex itself, but only on v5 ds.repo.commit() # still nothing without force # "err='lvl1/lvl2' already exists in the index" assert_in_results( ds.create(lvl2relpath, **raw), status='error', message='will not create a dataset in a non-empty directory, use `force` option to ignore') # XXX even force doesn't help, because (I assume) GitPython doesn't update # its representation of the Git index properly ds.create(lvl2relpath, force=True) assert_in(lvl2relpath, ds.subdatasets(result_xfm='relpaths'))
def _traverse_handle_subds(
        subds_rpath, rootds,
        recurse_datasets, recurse_directories, json):
    """A helper to deal with a subdataset node: either recurse into it, or
    just pick up whatever web metadata may already have been collected for it
    """
    subds_path = opj(rootds.path, subds_rpath)
    subds = Dataset(subds_path)
    subds_json = metadata_locator(path='.', ds_path=subds_path)

    def handle_not_installed():
        # for now just traverse as fs
        lgr.warning("%s is either not installed or lacks meta-data", subds)
        subfs = fs_extract(subds_path, rootds, basepath=rootds.path)
        # but add a custom type marking it as a not installed subds
        subfs['type'] = 'uninitialized'
        # we need to kick it out from 'children'
        # TODO: this is inefficient and cruel -- "ignored" should be made
        # smarter to ignore submodules for the repo
        #if fs['nodes']:
        #    fs['nodes'] = [c for c in fs['nodes'] if c['path'] != subds_rpath]
        return subfs

    if not subds.is_installed():
        subfs = handle_not_installed()
    elif recurse_datasets:
        subfs = ds_traverse(subds,
                            json=json,
                            recurse_datasets=recurse_datasets,
                            recurse_directories=recurse_directories,
                            parent=rootds)
        subfs.pop('nodes', None)
        #size_list.append(subfs['size'])
    # else just pick the data from metadata_file of each subdataset
    else:
        subfs = None
        lgr.info(subds.path)
        if exists(subds_json):
            with open(subds_json) as data_file:
                subfs = js.load(data_file)
                subfs.pop('nodes', None)  # remove children
                subfs['path'] = subds_rpath  # reassign the path
                #size_list.append(subfs['size'])
        else:
            # the same drill as if not installed
            lgr.warning("%s is installed but no meta-data yet", subds)
            subfs = handle_not_installed()
    # add URL field
    return subfs
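# Hedged illustration (not from the module): driving _traverse_handle_subds
# for a single known subdataset of a root dataset. The 'subds' relative path
# and the json='file' flavor are assumptions made for this sketch only.
def _subds_node_example(rootpath):
    rootds = Dataset(rootpath)
    node = _traverse_handle_subds(
        'subds',                    # relative path of the subdataset
        rootds,
        recurse_datasets=False,     # only reuse previously generated metadata
        recurse_directories=False,
        json='file')
    # an installed subdataset without metadata falls back to the
    # 'uninitialized' file-system view produced by handle_not_installed()
    return node.get('type'), node.get('path')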
def test_save_directory(path):
    # Sequence of save invocations on subdirectories.
    ds = Dataset(path).create(force=True)
    ds._save(path='sdir1')
    ok_clean_git(ds.path, untracked=['sdir2/foo', 'sdir3/sdir/subsub/foo'])

    # There is also difference from
    with chpwd(path):
        save(path='sdir2')
    ok_clean_git(ds.path, untracked=['sdir3/sdir/subsub/foo'])

    with chpwd(opj(path, 'sdir3')):
        save(path='sdir')
    ok_clean_git(ds.path)
def test_run_explicit(path): ds = Dataset(path) assert_false(ds.repo.file_has_content("test-annex.dat")) create_tree(ds.path, {"dirt_untracked": "untracked", "dirt_modified": "modified"}) ds.save("dirt_modified", to_git=True) with open(op.join(path, "dirt_modified"), "a") as ofh: ofh.write(", more") # We need explicit=True to run with dirty repo. assert_status("impossible", ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], on_failure="ignore")) hexsha_initial = ds.repo.get_hexsha() # If we specify test-annex.dat as an input, it will be retrieved before the # run. ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], explicit=True) ok_(ds.repo.file_has_content("test-annex.dat")) # We didn't commit anything because outputs weren't specified. assert_false(ds.repo.file_has_content("doubled.dat")) eq_(hexsha_initial, ds.repo.get_hexsha()) # If an input doesn't exist, we just show the standard warning. with swallow_logs(new_level=logging.WARN) as cml: with swallow_outputs(): ds.run("ls", inputs=["not-there"], explicit=True) assert_in("Input does not exist: ", cml.out) remove(op.join(path, "doubled.dat")) hexsha_initial = ds.repo.get_hexsha() ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], outputs=["doubled.dat"], explicit=True) ok_(ds.repo.file_has_content("doubled.dat")) assert_repo_status(ds.path, modified=["dirt_modified"], untracked=['dirt_untracked']) neq_(hexsha_initial, ds.repo.get_hexsha()) # Saving explicit outputs works from subdirectories. subdir = op.join(path, "subdir") mkdir(subdir) with chpwd(subdir): run("echo insubdir >foo", explicit=True, outputs=["foo"]) ok_(ds.repo.file_has_content(op.join("subdir", "foo")))
def test_force_checkdatapresent(srcpath, dstpath): src = Dataset(srcpath).create() target = mk_push_target(src, 'target', dstpath, annex=True, bare=True) (src.pathobj / 'test_mod_annex_file').write_text("Heavy stuff.") src.save(to_git=False, message="New annex file") assert_repo_status(src.path, annex=True) whereis_prior = src.repo.whereis(files=['test_mod_annex_file'])[0] res = src.push(to='target', data='nothing') # nothing reported to be copied assert_not_in_results(res, action='copy') # we got the git-push nevertheless eq_(src.repo.get_hexsha(DEFAULT_BRANCH), target.get_hexsha(DEFAULT_BRANCH)) # nothing moved eq_(whereis_prior, src.repo.whereis(files=['test_mod_annex_file'])[0]) # now a push without forced no-transfer # we do not give since, so the non-transfered file is picked up # and transferred res = src.push(to='target', force=None) # no branch change, done before assert_in_results(res, action='publish', status='notneeded', refspec=DEFAULT_REFSPEC) # but availability update assert_in_results(res, action='publish', status='ok', refspec='refs/heads/git-annex:refs/heads/git-annex') assert_in_results(res, status='ok', path=str(src.pathobj / 'test_mod_annex_file'), action='copy') # whereis info reflects the change ok_( len(whereis_prior) < len( src.repo.whereis(files=['test_mod_annex_file'])[0])) # do it yet again will do nothing, because all is uptodate assert_status('notneeded', src.push(to='target', force=None)) # an explicit reference point doesn't change that assert_status('notneeded', src.push(to='target', force=None, since='HEAD~1')) # now force data transfer res = src.push(to='target', force='checkdatapresent') # no branch change, done before assert_in_results(res, action='publish', status='notneeded', refspec=DEFAULT_REFSPEC) # no availability update assert_in_results(res, action='publish', status='notneeded', refspec='refs/heads/git-annex:refs/heads/git-annex') # but data transfer assert_in_results(res, status='ok', path=str(src.pathobj / 'test_mod_annex_file'), action='copy') # force data transfer, but data isn't available src.repo.drop('test_mod_annex_file') res = src.push(to='target', path='.', force='checkdatapresent', on_failure='ignore') assert_in_results(res, status='impossible', path=str(src.pathobj / 'test_mod_annex_file'), action='copy', message='Slated for transport, but no content present')
def test_save_obscure_name(path):
    ds = Dataset(path).create(force=True)
    fname = OBSCURE_FILENAME
    # Just check that we don't fail with a unicode error.
    with swallow_outputs():
        ds.save(path=fname, result_renderer="default")
def test_path_diff(_path, linkpath): # do the setup on the real path, not the symlink, to have its # bugs not affect this test of status() ds = get_deeply_nested_structure(str(_path)) if has_symlink_capability(): # make it more complicated by default ut.Path(linkpath).symlink_to(_path, target_is_directory=True) path = linkpath else: path = _path ds = Dataset(path) if has_symlink_capability(): assert ds.pathobj != ds.repo.pathobj plain_recursive = ds.diff(recursive=True, annex='all', result_renderer=None) # check integrity of individual reports with a focus on how symlinks # are reported for res in plain_recursive: # anything that is an "intended" symlink should be reported # as such. In contrast, anything that is a symlink for mere # technical reasons (annex using it for something in some mode) # should be reported as the thing it is representing (i.e. # a file) if 'link2' in str(res['path']): assert res['type'] == 'symlink', res else: assert res['type'] != 'symlink', res # every item must report its parent dataset assert_in('parentds', res) # bunch of smoke tests # query of '.' is same as no path eq_(plain_recursive, ds.diff(path='.', recursive=True, annex='all', result_renderer=None)) # duplicate paths do not change things eq_( plain_recursive, ds.diff(path=['.', '.'], recursive=True, annex='all', result_renderer=None)) # neither do nested paths if not "2.24.0" <= ds.repo.git_version < "2.25.0": # Release 2.24.0 contained a regression that was fixed with 072a231016 # (2019-12-10). eq_( plain_recursive, ds.diff(path=['.', 'subds_modified'], recursive=True, annex='all', result_renderer=None)) # when invoked in a subdir of a dataset it still reports on the full thing # just like `git status`, as long as there are no paths specified with chpwd(op.join(path, 'directory_untracked')): plain_recursive = diff(recursive=True, annex='all', result_renderer=None) # should be able to take absolute paths and yield the same # output eq_( plain_recursive, ds.diff(path=ds.path, recursive=True, annex='all', result_renderer=None)) # query for a deeply nested path from the top, should just work with a # variety of approaches rpath = op.join('subds_modified', 'subds_lvl1_modified', u'{}_directory_untracked'.format(OBSCURE_FILENAME)) apathobj = ds.pathobj / rpath apath = str(apathobj) for p in (rpath, apath, None): if p is None: # change into the realpath of the dataset and # query with an explicit path with chpwd(ds.path): res = ds.diff(path=op.join('.', rpath), recursive=True, annex='all', result_renderer=None) else: res = ds.diff(path=p, recursive=True, annex='all', result_renderer=None) assert_result_count( res, 1, state='untracked', type='directory', refds=ds.path, # path always comes out a full path inside the queried dataset path=apath, ) assert_result_count(ds.diff(recursive=True, result_renderer=None), 1, path=apath) # limiting recursion will exclude this particular path assert_result_count(ds.diff(recursive=True, recursion_limit=1, result_renderer=None), 0, path=apath) # negative limit is unlimited limit eq_(ds.diff(recursive=True, recursion_limit=-1, result_renderer=None), ds.diff(recursive=True, result_renderer=None))
def test_sidecar(path):
    ds = Dataset(path).create()
    # Simple sidecar message checks.
    ds.run("cd .> dummy0", message="sidecar arg", sidecar=True)
    assert_not_in('"cmd":', ds.repo.format_commit("%B"))

    ds.config.set("datalad.run.record-sidecar", "false", where="local")
    ds.run("cd .> dummy1", message="sidecar config")
    assert_in('"cmd":', last_commit_msg(ds.repo))

    ds.config.set("datalad.run.record-sidecar", "true", where="local")
    ds.run("cd .> dummy2", message="sidecar config")
    assert_not_in('"cmd":', last_commit_msg(ds.repo))

    # Don't break when config.get() returns multiple values. Here it's two
    # values in .gitconfig, but a more realistic scenario is a value in
    # $repo/.git/config that overrides a setting in ~/.config/git/config.
    ds.config.add("datalad.run.record-sidecar", "false", where="local")
    ds.run("cd .> dummy3", message="sidecar config")
    assert_in('"cmd":', last_commit_msg(ds.repo))

    # make sure sidecar file is committed when explicitly specifying outputs
    ds.run("cd .> dummy4", outputs=["dummy4"], sidecar=True, explicit=True,
           message="sidecar + specified outputs")
    assert_not_in('"cmd":', last_commit_msg(ds.repo))
    assert_repo_status(ds.path)
def test_run_save_deletion(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    ds.run("{} to_remove".format("del" if on_windows else "rm"))
    assert_repo_status(ds.path)
def test_run_assume_ready(path): ds = Dataset(path).create() repo = ds.repo adjusted = repo.is_managed_branch() # --assume-ready=inputs (repo.pathobj / "f1").write_text("f1") ds.save() def cat_cmd(fname): return [ sys.executable, "-c", "import sys; print(open(sys.argv[-1]).read())", fname ] assert_in_results(ds.run(cat_cmd("f1"), inputs=["f1"]), action="get", type="file") # Same thing, but without the get() call. assert_not_in_results(ds.run(cat_cmd("f1"), inputs=["f1"], assume_ready="inputs"), action="get", type="file") ds.drop("f1", check=False) if not adjusted: # If the input is not actually ready, the command will fail. with assert_raises(CommandError): ds.run(cat_cmd("f1"), inputs=["f1"], assume_ready="inputs") # --assume-ready=outputs def unlink_and_write_cmd(fname): # This command doesn't care whether the output file is unlocked because # it removes it ahead of time anyway. return [ sys.executable, "-c", "import sys; import os; import os.path as op; " "f = sys.argv[-1]; op.lexists(f) and os.unlink(f); " "open(f, mode='w').write(str(sys.argv))", fname ] (repo.pathobj / "f2").write_text("f2") ds.save() res = ds.run(unlink_and_write_cmd("f2"), outputs=["f2"]) if not adjusted: assert_in_results(res, action="unlock", type="file") # Same thing, but without the unlock() call. res = ds.run(unlink_and_write_cmd("f2"), outputs=["f2"], assume_ready="outputs") assert_not_in_results(res, action="unlock", type="file") # --assume-ready=both res = ds.run(unlink_and_write_cmd("f2"), outputs=["f2"], inputs=["f2"]) assert_in_results(res, action="get", type="file") if not adjusted: assert_in_results(res, action="unlock", type="file") res = ds.run(unlink_and_write_cmd("f2"), outputs=["f2"], inputs=["f2"], assume_ready="both") assert_not_in_results(res, action="get", type="file") assert_not_in_results(res, action="unlock", type="file")
def test_ls_json(topdir): annex = AnnexRepo(topdir, create=True) ds = Dataset(topdir) # create some file and commit it open(opj(ds.path, 'subdsfile.txt'), 'w').write('123') ds.add(path='subdsfile.txt') ds.save("Hello!", version_tag=1) # add a subdataset ds.install('subds', source=topdir) git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True) # create git repo git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'), commit=True) # commit to git to init git repo annex.add(opj(topdir, 'dir', 'subgit'), commit=True) # add the non-dataset git repo to annex annex.add(opj(topdir, 'dir'), commit=True) # add to annex (links) annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force']) # broken-link meta_dir = opj('.git', 'datalad', 'metadata') meta_path = opj(topdir, meta_dir) def get_metahash(*path): return hashlib.md5(opj(*path).encode('utf-8')).hexdigest() for all_ in [True, False]: for recursive in [True, False]: for state in ['file', 'delete']: with swallow_logs(), swallow_outputs(): ds = _ls_json(topdir, json=state, all_=all_, recursive=recursive) # subdataset should have its json created and deleted when all=True else not subds_metahash = get_metahash('/') subds_metapath = opj(topdir, 'subds', meta_dir, subds_metahash) assert_equal(exists(subds_metapath), (state == 'file' and recursive)) # root should have its json file created and deleted in all cases ds_metahash = get_metahash('/') ds_metapath = opj(meta_path, ds_metahash) assert_equal(exists(ds_metapath), state == 'file') # children should have their metadata json's created and deleted only when recursive=True child_metahash = get_metahash('dir', 'subdir') child_metapath = opj(meta_path, child_metahash) assert_equal(exists(child_metapath), (state == 'file' and all_)) # ignored directories should not have json files created in any case for subdir in [('.hidden'), ('dir', 'subgit')]: child_metahash = get_metahash(*subdir) assert_equal(exists(opj(meta_path, child_metahash)), False) # check if its updated in its nodes sublist too. used by web-ui json. regression test assert_equal(ds['nodes'][0]['size']['total'], ds['size']['total']) # check size of subdataset subds = [ item for item in ds['nodes'] if item['name'] == ('subdsfile.txt' or 'subds') ][0] assert_equal(subds['size']['total'], '3 Bytes') # run non-recursive dataset traversal after subdataset metadata already created # to verify sub-dataset metadata being picked up from its metadata file in such cases if state == 'file' and recursive and not all_: ds = _ls_json(topdir, json='file', all_=False) subds = [ item for item in ds['nodes'] if item['name'] == ('subdsfile.txt' or 'subds') ][0] assert_equal(subds['size']['total'], '3 Bytes')
def test_basic_filemeta(path): with chpwd(path): # no repo -> error assert_status('error', metadata(on_failure='ignore')) # fine with annex AnnexRepo('.', create=True) _assert_metadata_empty(metadata()[0]['metadata']) _assert_metadata_empty(metadata('.')[0]['metadata']) # create playing field create_tree(path, { 'somefile': 'content', 'dir': { 'deepfile': 'othercontent' } }) ds = Dataset(path) res = ds.metadata('somefile', add=['plaintag'], on_failure='ignore') assert_result_count(res, 1, status='impossible', message="metadata not supported (only annex'ed files)", path=opj(ds.path, 'somefile')) ds.add('.') ok_clean_git(path) # full query -> 2 files res = ds.metadata(reporton='files') assert_result_count(res, 2) assert_result_count(res, 2, type='file', metadata={}) # # tags: just a special case of a metadata key without a value # # tag one file target_file = opj('dir', 'deepfile') # needs a sequence or dict assert_raises(ValueError, ds.metadata, target_file, add='mytag') # like this res = ds.metadata(target_file, add=['mytag'], reporton='files') assert_result_count(res, 1) assert_result_count(res, 1, type='file', path=opj(ds.path, target_file), metadata={'tag': 'mytag'}) # now init tag for all files that don't have one yet res = ds.metadata(init=['rest'], reporton='files') assert_result_count(res, 2) # from before assert_result_count(res, 1, type='file', path=opj(ds.path, target_file), metadata={'tag': 'mytag'}) # and the other one assert_result_count(res, 1, type='file', path=opj(ds.path, 'somefile'), metadata={'tag': 'rest'}) # add two more different tags res = ds.metadata(add=['other1', 'other2', 'other3'], reporton='files') assert_result_count(res, 2) for r in res: assert_in('other1', r['metadata']['tag']) assert_in('other2', r['metadata']['tag']) assert_in('other3', r['metadata']['tag']) # now remove two specifics tag from all files that exists in all files res = ds.metadata(remove=['other1', 'other3'], reporton='files') assert_result_count(res, 2) for r in res: assert_not_in('other1', r['metadata']['tag']) assert_in('other2', r['metadata']['tag']) # and now one that only exists in one file res = ds.metadata(remove=['rest'], reporton='files') # we still get 2 results, because we still touch all files assert_result_count(res, 2) # however there is no modification to files that don't have the tag assert_result_count(res, 1, type='file', path=opj(ds.path, 'somefile'), metadata={'tag': 'other2'}) assert_result_count(res, 1, type='file', path=opj(ds.path, target_file), metadata={'tag': ['mytag', 'other2']}) # and finally kill the tags res = ds.metadata(target_file, reset=['tag'], reporton='files') assert_result_count(res, 1) assert_result_count(res, 1, type='file', metadata={}, path=opj(ds.path, target_file)) # no change to the other one assert_result_count(ds.metadata('somefile'), 1, type='file', path=opj(ds.path, 'somefile'), metadata={'tag': 'other2'}) # kill all tags everywhere res = ds.metadata(reset=['tag'], reporton='files') assert_result_count(res, 2) assert_result_count(res, 2, type='file', metadata={}) # # key: value mapping # # invalid key -> exception assert_raises(ValueError, ds.metadata, 'somefile', add={'hd%aa': ('v1', 'v2')}) # on_failure='ignore') # unknown key, rejected by default res = ds.metadata('somefile', add=dict(new=('v1', 'v2')), on_failure='ignore') assert_status('error', res) res = ds.metadata('somefile', add=dict(new=('v1', 'v2')), define_key=dict(new="something fresh")) assert_result_count(res, 1, metadata={'new': ['v1', 'v2']}) # same as this, which exits to 
support the way things come # in from the cmdline res = ds.metadata(target_file, add=[['new', 'v1', 'v2']]) assert_result_count(res, 1, metadata={'new': ['v1', 'v2']}) # other file got the exact same metadata now assert_result_count(ds.metadata(), 2, metadata={'new': ['v1', 'v2']}) # reset with just a key removes the entire mapping res = ds.metadata(target_file, reset=['new']) assert_result_count(res, 1, metadata={}) # reset with a mapping, overrides the old one res = ds.metadata('somefile', reset=dict(new='george', more='yeah'), permit_undefined_keys=True) assert_result_count(res, 1, metadata=dict(new='george', more='yeah')) # remove single value from mapping, last value to go removes the key res = ds.metadata('somefile', remove=dict(more='yeah')) assert_result_count(res, 1, metadata=dict(new='george')) # and finally init keys res = ds.metadata(init=dict(new=['two', 'three'], super='fresh'), permit_undefined_keys=True, reporton='files') assert_result_count(res, 2) assert_result_count( res, 1, path=opj(ds.path, target_file), # order of values is not maintained metadata=dict(new=['three', 'two'], super='fresh')) assert_result_count( res, 1, path=opj(ds.path, 'somefile'), # order of values is not maintained metadata=dict(new='george', super='fresh'))
def test_custom_native_merge(path): ds = Dataset(path).create(force=True) # no metadata, because nothing is commited _assert_metadata_empty( ds.metadata(reporton='datasets', result_xfm='metadata', return_type='item-or-list')) # enable BIDS metadata, BIDS metadata should become THE metadata ds.config.add('datalad.metadata.nativetype', 'bids', where='dataset') ds.aggregate_metadata() # no metadata, because still nothing is commited _assert_metadata_empty( ds.metadata(reporton='datasets', result_xfm='metadata', return_type='item-or-list')) ds.add('.') ds.aggregate_metadata() meta = ds.metadata(reporton='datasets', result_xfm='metadata', return_type='item-or-list') _clean_meta(meta) assert_dict_equal({'name': u'myds', 'author': ['one', 'two']}, meta) # now give the ds a custom name, must override the native one # but authors still come from BIDS ds.metadata(apply2global=True, add=dict(name='mycustom')) meta = ds.metadata(reporton='datasets', result_xfm='metadata', return_type='item-or-list') _clean_meta(meta) assert_dict_equal({'name': u'mycustom', 'author': ['one', 'two']}, meta) # we can disable the merge meta = ds.metadata(reporton='datasets', merge_native='none', result_xfm='metadata', return_type='item-or-list') _clean_meta(meta) assert_dict_equal({'name': u'mycustom'}, meta) # we can accumulate values meta = ds.metadata(reporton='datasets', merge_native='add', result_xfm='metadata', return_type='item-or-list') _clean_meta(meta) assert_dict_equal({ 'name': ['mycustom', 'myds'], 'author': ['one', 'two'] }, meta) # we can have native override custom (not sure when needed, though) # add one more custom to make visible ds.metadata(apply2global=True, init=dict(homepage='fresh')) meta = ds.metadata(reporton='datasets', merge_native='reset', result_xfm='metadata', return_type='item-or-list') _clean_meta(meta) assert_dict_equal( { 'name': u'myds', 'author': ['one', 'two'], 'homepage': u'fresh' }, meta) # enable an additional metadata source ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') # we need to reaggregate after the config change ds.aggregate_metadata(merge_native='add') meta = ds.metadata(reporton='datasets', merge_native='add', result_xfm='metadata', return_type='item-or-list') _clean_meta(meta) assert_dict_equal( { 'name': ['mycustom', 'myds', 'someother'], 'author': ['one', 'two'], 'homepage': u'fresh' }, meta)
def test_mod_hierarchy(path): base = Dataset(path).create() sub = base.create('sub') basedb_path = opj(base.path, '.datalad', 'metadata', 'dataset.json') subdb_path = opj(sub.path, '.datalad', 'metadata', 'dataset.json') assert (not exists(basedb_path)) assert (not exists(subdb_path)) # modify sub through base res = base.metadata('sub', init=['tag1'], apply2global=True) # only sub modified assert_result_count(res, 3) assert_result_count(res, 1, status='ok', action='metadata', metadata={'tag': 'tag1'}) assert_result_count(res, 2, status='ok', action='save') assert (not exists(basedb_path)) assert (exists(subdb_path)) # saved all the way up ok_clean_git(base.path) # now again, different init, sub has tag already, should be spared res = base.metadata(init=['tag2'], apply2global=True) assert_result_count(res, 2) assert_result_count(res, 1, status='ok', action='metadata', metadata={'tag': 'tag2'}, path=base.path) assert_result_count(res, 1, status='ok', action='save', path=base.path) # and again with removal of all metadata in sub ok_clean_git(base.path) # put to probe files so we see that nothing unrelated gets saved create_tree(base.path, { 'probe': 'content', 'sub': { 'probe': 'othercontent' } }) res = base.metadata('sub', reset=['tag'], apply2global=True) assert_result_count(res, 3) assert_result_count(res, 1, status='ok', action='metadata', metadata={}) assert_result_count(res, 1, status='ok', action='save', path=base.path) assert (exists(basedb_path)) assert (not exists(subdb_path)) # when we remove the probe files things should be clean os.remove(opj(base.path, 'probe')) os.remove(opj(sub.path, 'probe')) ok_clean_git(base.path) # no uninstall the subdataset and check of errors are caught properly base.uninstall(sub.path) ok_clean_git(base.path) res = base.metadata('sub', add=['mike1'], apply2global=True, on_failure='ignore') assert_result_count(res, 1, status='error', path=sub.path, message='cannot edit metadata of unavailable dataset')
def test_basic_dsmeta(path): ds = Dataset(path).create() ok_clean_git(path) # ensure clean slate res = ds.metadata(reporton='datasets') assert_result_count(res, 1) _assert_metadata_empty(res[0]['metadata']) # init res = ds.metadata(init=['tag1', 'tag2'], apply2global=True) eq_(res[0]['metadata']['tag'], ['tag1', 'tag2']) # init again does nothing res = ds.metadata(init=['tag3'], apply2global=True) eq_(res[0]['metadata']['tag'], ['tag1', 'tag2']) # reset whole key ds.metadata(reset=['tag'], apply2global=True) res = ds.metadata(reporton='datasets') assert_result_count(res, 1) _assert_metadata_empty(res[0]['metadata']) # add something arbitrary res = ds.metadata(add=dict(dtype=['heavy'], readme=['short', 'long']), apply2global=True, on_failure='ignore') # fails due to unknown keys assert_status('error', res) res = ds.metadata(add=dict(dtype=['heavy'], readme=['short', 'long']), define_key=dict(dtype='is_a_datatype', readme='is_readme_content'), apply2global=True) eq_(res[0]['metadata']['dtype'], 'heavy') # sorted! eq_(res[0]['metadata']['readme'], ['long', 'short']) # check it reports common keys with swallow_outputs() as cmo: ds.metadata(show_keys=True) assert_in('license', cmo.out) # supply key definitions, no need for apply2global res = ds.metadata(define_key=dict(mykey='truth')) eq_(res[0]['metadata']['definition']['mykey'], u'truth') with swallow_outputs() as cmo: ds.metadata(show_keys=True) assert_in('mykey: truth (dataset: {})'.format(ds.path), cmo.out) # re-supply different key definitions -> error res = ds.metadata(define_key=dict(mykey='lie'), on_failure='ignore') assert_result_count( res, 1, status='error', message=("conflicting definition for key '%s': '%s' != '%s'", "mykey", "lie", "truth")) res = ds.metadata(define_key=dict(otherkey='altfact'), ) eq_(res[0]['metadata']['definition']['otherkey'], 'altfact') # 'definition' is a regular key, we can remove items res = ds.metadata(remove=dict(definition=['mykey']), apply2global=True) assert_dict_equal( res[0]['metadata']['definition'], { 'otherkey': u'altfact', 'readme': u'is_readme_content', 'dtype': u'is_a_datatype' }) res = ds.metadata(remove=dict(definition=['otherkey', 'readme', 'dtype']), apply2global=True) # when there are no items left, the key vanishes too assert ('definition' not in res[0]['metadata']) # we still have metadata, so there is a DB file assert (res[0]['metadata']) db_path = opj(ds.path, '.datalad', 'metadata', 'dataset.json') assert (exists(db_path)) ok_clean_git(ds.path) # but if we remove it, the file is gone res = ds.metadata(reset=['readme', 'dtype'], apply2global=True) eq_(res[0]['metadata'], {}) assert (not exists(db_path)) ok_clean_git(ds.path)
def test_dry_run(path): ds = Dataset(path).create(force=True) # The dataset is reported as dirty, and the custom result render relays # that to the default renderer. with swallow_outputs() as cmo: with assert_raises(IncompleteResultsError): ds.run("blah ", dry_run="basic") assert_in("run(impossible)", cmo.out) assert_not_in("blah", cmo.out) ds.save() with swallow_outputs() as cmo: ds.run("blah ", dry_run="basic") assert_in("Dry run", cmo.out) assert_in("location", cmo.out) assert_in("blah", cmo.out) assert_not_in("expanded inputs", cmo.out) assert_not_in("expanded outputs", cmo.out) with swallow_outputs() as cmo: ds.run("blah {inputs} {outputs}", dry_run="basic", inputs=["fo*"], outputs=["b*r"]) assert_in('blah "foo" "bar"' if on_windows else "blah foo bar", cmo.out) assert_in("expanded inputs", cmo.out) assert_in("['foo']", cmo.out) assert_in("expanded outputs", cmo.out) assert_in("['bar']", cmo.out) # Just the command. with swallow_outputs() as cmo: ds.run("blah ", dry_run="command") assert_not_in("Dry run", cmo.out) assert_in("blah", cmo.out) assert_not_in("inputs", cmo.out) # The output file wasn't unlocked. assert_repo_status(ds.path) # Subdaset handling subds = ds.create("sub") (subds.pathobj / "baz").write_text("z") ds.save(recursive=True) # If a subdataset is installed, it works as usual. with swallow_outputs() as cmo: ds.run("blah {inputs}", dry_run="basic", inputs=["sub/b*"]) assert_in('blah "sub\\baz"' if on_windows else 'blah sub/baz', cmo.out) # However, a dry run will not do the install/reglob procedure. ds.uninstall("sub", check=False) with swallow_outputs() as cmo: ds.run("blah {inputs}", dry_run="basic", inputs=["sub/b*"]) assert_in("sub/b*", cmo.out) assert_not_in("baz", cmo.out)
def __call__(dataset, urlfile, urlformat, filenameformat, input_type="ext", exclude_autometa=None, meta=None, message=None, dry_run=False, fast=False, ifexists=None, missing_value=None, save=True): # Temporarily work around gh-2269. url_file, url_format, filename_format = urlfile, urlformat, filenameformat import logging import os from requests.exceptions import RequestException from datalad.distribution.add import Add from datalad.distribution.create import Create from datalad.distribution.dataset import Dataset, require_dataset from datalad.dochelpers import exc_str from datalad.interface.results import get_status_dict from datalad.support.annexrepo import AnnexRepo lgr = logging.getLogger("datalad.plugin.addurls") dataset = require_dataset(dataset, check_installed=False) if dataset.repo and not isinstance(dataset.repo, AnnexRepo): yield get_status_dict(action="addurls", ds=dataset, status="error", message="not an annex repo") return if input_type == "ext": extension = os.path.splitext(url_file)[1] input_type = "json" if extension == ".json" else "csv" with open(url_file) as fd: try: rows, subpaths = extract(fd, input_type, url_format, filename_format, exclude_autometa, meta, dry_run, missing_value) except (ValueError, RequestException) as exc: yield get_status_dict(action="addurls", ds=dataset, status="error", message=exc_str(exc)) return if len(rows) != len(set(row["filename"] for row in rows)): yield get_status_dict(action="addurls", ds=dataset, status="error", message=("There are file name collisions; " "consider using {_repindex}")) return if dry_run: for subpath in subpaths: lgr.info("Would create a subdataset at %s", subpath) for row in rows: lgr.info("Would download %s to %s", row["url"], os.path.join(dataset.path, row["filename"])) lgr.info("Metadata: %s", sorted(u"{}={}".format(k, v) for k, v in row["meta_args"].items())) yield get_status_dict(action="addurls", ds=dataset, status="ok", message="dry-run finished") return if not dataset.repo: # Populate a new dataset with the URLs. for r in dataset.create(result_xfm=None, return_type='generator', save=save): yield r annex_options = ["--fast"] if fast else [] for spath in subpaths: if os.path.exists(os.path.join(dataset.path, spath)): lgr.warning( "Not creating subdataset at existing path: %s", spath) else: for r in dataset.create(spath, result_xfm=None, return_type='generator', save=save): yield r for row in rows: # Add additional information that we'll need for various # operations. filename_abs = os.path.join(dataset.path, row["filename"]) if row["subpath"]: ds_current = Dataset(os.path.join(dataset.path, row["subpath"])) ds_filename = os.path.relpath(filename_abs, ds_current.path) else: ds_current = dataset ds_filename = row["filename"] row.update({"filename_abs": filename_abs, "ds": ds_current, "ds_filename": ds_filename}) files_to_add = set() for r in add_urls(rows, ifexists=ifexists, options=annex_options): if r["status"] == "ok": files_to_add.add(r["path"]) yield r msg = message or """\ [DATALAD] add files from URLs url_file='{}' url_format='{}' filename_format='{}'""".format(url_file, url_format, filename_format) if files_to_add: for r in dataset.add(files_to_add, message=msg, save=save): yield r meta_rows = [r for r in rows if r["filename_abs"] in files_to_add] for r in add_meta(meta_rows): yield r
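# Hedged sketch (not part of the plugin): a minimal CSV-driven invocation,
# assuming the __call__ above is exposed as the `addurls` dataset method.
# The column names ('name', 'url') and the example URLs are made up.
def _addurls_example(dspath):
    import csv
    ds = Dataset(dspath).create()
    url_file = os.path.join(ds.path, 'files.csv')
    with open(url_file, 'w') as fd:
        writer = csv.writer(fd)
        writer.writerow(['name', 'url'])
        writer.writerow(['img1.png', 'http://example.com/img1.png'])
        writer.writerow(['img2.png', 'http://example.com/img2.png'])
    # '{url}' and '{name}' refer to CSV columns; dry_run only reports what
    # would be downloaded and created, so no network access happens here
    ds.addurls(url_file, '{url}', '{name}', dry_run=True)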
def test_push_wanted(srcpath, dstpath): src = Dataset(srcpath).create() if src.repo.is_managed_branch(): # on crippled FS post-update hook enabling via create-sibling doesn't # work ATM raise SkipTest("no create-sibling on crippled FS") (src.pathobj / 'data.0').write_text('0') (src.pathobj / 'secure.1').write_text('1') (src.pathobj / 'secure.2').write_text('2') src.save() # Dropping a file to mimic a case of simply not having it locally (thus not # to be "pushed") src.drop('secure.2', check=False) # Annotate sensitive content, actual value "verysecure" does not matter in # this example src.repo.set_metadata(add={'distribution-restrictions': 'verysecure'}, files=['secure.1', 'secure.2']) src.create_sibling( dstpath, annex_wanted="not metadata=distribution-restrictions=*", name='target', ) # check that wanted is obeyed, since set in sibling configuration res = src.push(to='target') assert_in_results(res, action='copy', path=str(src.pathobj / 'data.0'), status='ok') for p in ('secure.1', 'secure.2'): assert_not_in_results(res, path=str(src.pathobj / p)) assert_status('notneeded', src.push(to='target')) # check the target to really make sure dst = Dataset(dstpath) # normal file, yes eq_((dst.pathobj / 'data.0').read_text(), '0') # secure file, no if dst.repo.is_managed_branch(): neq_((dst.pathobj / 'secure.1').read_text(), '1') else: assert_raises(FileNotFoundError, (dst.pathobj / 'secure.1').read_text) # reset wanted config, which must enable push of secure file src.repo.set_preferred_content('wanted', '', remote='target') res = src.push(to='target') assert_in_results(res, path=str(src.pathobj / 'secure.1')) eq_((dst.pathobj / 'secure.1').read_text(), '1')
def test_run_explicit(path): ds = Dataset(path) assert_false(ds.repo.file_has_content("test-annex.dat")) create_tree(ds.path, { "dirt_untracked": "untracked", "dirt_modified": "modified" }) ds.save("dirt_modified", to_git=True) with open(op.join(path, "dirt_modified"), "a") as ofh: ofh.write(", more") # We need explicit=True to run with dirty repo. assert_status( "impossible", ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], on_failure="ignore")) hexsha_initial = ds.repo.get_hexsha() # If we specify test-annex.dat as an input, it will be retrieved before the # run. ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], explicit=True) ok_(ds.repo.file_has_content("test-annex.dat")) # We didn't commit anything because outputs weren't specified. assert_false(ds.repo.file_has_content("doubled.dat")) eq_(hexsha_initial, ds.repo.get_hexsha()) # If an input doesn't exist, we just show the standard warning. with assert_raises(IncompleteResultsError): ds.run("ls", inputs=["not-there"], explicit=True, on_failure="stop") remove(op.join(path, "doubled.dat")) hexsha_initial = ds.repo.get_hexsha() ds.run("cat test-annex.dat test-annex.dat >doubled.dat", inputs=["test-annex.dat"], outputs=["doubled.dat"], explicit=True) ok_(ds.repo.file_has_content("doubled.dat")) assert_repo_status(ds.path, modified=["dirt_modified"], untracked=['dirt_untracked']) neq_(hexsha_initial, ds.repo.get_hexsha()) # Saving explicit outputs works from subdirectories. subdir = op.join(path, "subdir") mkdir(subdir) with chpwd(subdir): run("echo insubdir >foo", explicit=True, outputs=["foo"]) ok_(ds.repo.file_has_content(op.join("subdir", "foo")))
def test_remove_uninstalled(path=None):
    ds = Dataset(path)
    assert_raises(ValueError, ds.remove)
def test_run_from_subds(path):
    subds = Dataset(path).create().create("sub")
    subds.run("cd .> foo")
    assert_repo_status(subds.path)
def test_clean_subds_removal(path=None):
    ds = Dataset(path).create()
    subds1 = ds.create('one')
    subds2 = ds.create('two')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['one', 'two'])
    assert_repo_status(ds.path)
    # now kill one
    res = ds.remove('one', reckless='availability', result_xfm=None)
    # subds1 got uninstalled, and ds got the removal of subds1 saved
    assert_result_count(res, 1, path=subds1.path, action='uninstall', status='ok')
    assert_result_count(res, 1, path=subds1.path, action='remove', status='ok')
    assert_result_count(res, 1, path=ds.path, action='save', status='ok')
    ok_(not subds1.is_installed())
    assert_repo_status(ds.path)
    # two must remain
    eq_(ds.subdatasets(result_xfm='relpaths'), ['two'])
    # one is gone
    nok_(subds1.pathobj.exists())

    # and now again, but this time remove something that is not installed
    ds.create('three')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['three', 'two'])
    ds.drop('two', what='all', reckless='availability')
    assert_repo_status(ds.path)
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['three', 'two'])
    nok_(subds2.is_installed())
    # orderly empty mountpoint is maintained
    ok_(subds2.pathobj.exists())
    res = ds.remove('two', reckless='availability')
    assert_in_results(res, path=str(ds.pathobj / 'two'), action='remove')
    assert_repo_status(ds.path)
    # subds2 was already uninstalled, now ds got the removal of subds2 saved
    nok_(subds2.pathobj.exists())
    eq_(ds.subdatasets(result_xfm='relpaths'), ['three'])
def get_modified_subpaths(aps, refds, revision, recursion_limit=None, report_no_revision_change=True, report_untracked='all'): """ Parameters ---------- aps : list refds : Dataset revision : str Commit-ish """ from datalad.interface.diff import Diff # TODO needs recursion limit # NOTE this is implemented as a generator despite that fact that we need # to sort through _all_ the inputs initially, diff'ing each involved # dataset takes time that we can use to already act on intermediate # result paths, without having to wait for 100% completion if revision is None: # we want all, subds not matching the ref are assumed to have been # sorted out before (e.g. one level up) for r in aps: yield r # life is simple: we diff the base dataset modified = [] for r in refds.diff( # we cannot really limit the diff paths easily because we might get # or miss content (e.g. subdatasets) if we don't figure out which # ones are known -- and we don't want that path=None, # `revision` can be anything that Git support for `diff` # `True` is code for diff without revision revision=revision if revision is not True else None, # it is important that staged is False, otherwise we would miss unstaged # changes when e.g. diffing against HEAD (save does that) staged=False, # we might want to consider putting 'untracked' here # maybe that is a little faster, not tested yet ignore_subdatasets='none', # by default, we want to see any individual untracked file, this simplifies further # processing dramatically, but may require subsequent filtering # in order to avoid flooding user output with useless info report_untracked=report_untracked, # no recursion, we needs to update `revision` for every subdataset # before we can `diff` recursive=False, return_type='generator', result_renderer=None, # need to be able to yield the errors on_failure='ignore'): if r['status'] in ('impossible', 'error'): # something unexpected, tell daddy yield r continue # if asked, and no change in revision -- skip if not report_no_revision_change \ and (r.get('revision_src') or r.get('revision')) \ and (r.get('revision_src') == r.get('revision')): continue r['status'] = '' modified.append(r) if not len(modified): # nothing modified nothing to report return # now we can grab the APs that are in this dataset and yield them for ap in aps: # need to preserve pristine info first ap = ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path) for m in modified: if ap['path'] == m['path']: # is directly modified, yield input AP # but update with what we learned about the modification ap.update(m) yield ap break if path_is_subpath(m['path'], ap['path']): # a modified path is underneath this AP # yield the modified one instead yield m continue mod_subs = [m for m in modified if m.get('type', None) == 'dataset'] if not mod_subs or (recursion_limit is not None and recursion_limit < 1): return aps = [ ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path) for ap in aps ] # now for all submodules that were found modified for sub in [m for m in modified if m.get('type', None) == 'dataset']: sub_path_ = _with_sep(sub['path']) # these AP match something inside this submodule, or the whole submodule sub_aps = [ ap for ap in aps if _with_sep(ap['path']).startswith(sub_path_) ] if not sub_aps: continue # we are interested in the modifications within this subdataset # from the state we previously had on record, till the state # we have in record now diff_range = '{}..{}'.format( sub['revision_src'] if sub['revision_src'] else PRE_INIT_COMMIT_SHA, sub['revision'] if 
sub['revision'] else '') if sub['revision_src'] and sub['revision_src'] == sub['revision']: # this is a special case, where subdataset reported changes without # a change in state/commit -- this is code for uncommitted changes # in the subdataset (including staged ones). In such a case, we # must not provide a diff range, but only the source commit we want # to diff against # XXX if this is changed, likely the same logic in diff needs # changing too! diff_range = sub['revision_src'] for r in get_modified_subpaths( sub_aps, Dataset(sub['path']), diff_range, recursion_limit=(recursion_limit - 1) if recursion_limit is not None else None): yield r
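# A hedged usage sketch (not from the original module) for get_modified_subpaths():
# raw paths go in, and anything modified relative to `revision` comes back as an
# annotated-path dict; the dataset path below is a placeholder.
refds = Dataset('/tmp/some-dataset')
for ap in get_modified_subpaths(
        aps=[refds.path],              # start from the dataset root
        refds=refds,
        revision='HEAD~1',             # any commit-ish Git accepts for diff
        recursion_limit=1,
        report_untracked='all'):
    # each result carries at least 'path', and typically 'state'/'type' from the diff
    print(ap['path'], ap.get('state'))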
def __call__(path=None, dataset=None, recursive=False, recursion_limit=None, action=None, unavailable_path_status='', unavailable_path_msg=None, nondataset_path_status='error', force_parentds_discovery=True, force_subds_discovery=True, force_no_revision_change_discovery=True, force_untracked_discovery=True, modified=None): # upfront check for the fastest possible response if not path and dataset is None: # nothing given, try "here", but do not use `require_dataset`, as # it will determine the root dataset of `curdir` and further down # lead to path annotation of upstairs directories dataset = curdir if force_subds_discovery and not force_parentds_discovery: raise ValueError( 'subdataset discovery requires parent dataset discovery') # CONCEPT: yield with no status to indicate further processing # everything in one big loop to be able too yield as fast a possible # without any precomputing for all paths refds_path = Interface.get_refds_path(dataset) if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)): raise ValueError( "modification detection only works with a base dataset (non-given or found)" ) # prep common result props res_kwargs = dict(action=action if action else 'annotate_path', refds=refds_path, logger=lgr) # handle the case of recursion into a single dataset without any # extra fancy processing first -- full recursion can be done # faster than manual recursion, hence we gain quite some speed # from these few lines of extra code if not modified and not path and refds_path: if not GitRepo.is_valid_repo(refds_path): yield get_status_dict( # doesn't matter if the path is in another dataset # it was given as reference dataset status=nondataset_path_status, message='given reference dataset is not a dataset', path=refds_path, **res_kwargs) return refds = Dataset(refds_path) path = [] # yield the dataset itself r = get_status_dict(ds=refds, status='', **res_kwargs) yield r if recursive: # if we have nothing given, but need recursion, we need to feed # the dataset path itself for r in yield_recursive(refds, refds_path, action, recursion_limit): r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] yield r return # goal: structure in a way that makes most information on any path # available in a single pass, at the cheapest possible cost reported_paths = {} requested_paths = assure_list(path) if modified is not None: # modification detection would silently kill all nondataset paths # but we have to complain about them, hence doing it here if requested_paths and refds_path: for r in requested_paths: p = r['path'] if isinstance(r, dict) else r p = resolve_path(p, ds=refds_path) if path_startswith(p, refds_path): # all good continue # not the refds path_props = r if isinstance(r, dict) else {} res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with reference dataset' reported_paths[r] = res yield res # preserve non-existing paths to be silently killed by modification # detection and append them to requested_paths again after detection. # TODO: This might be melted in with treatment of non dataset paths # above. Re-appending those paths seems to be better than yielding # directly to avoid code duplication, since both cases later on are # dealt with again. 
preserved_paths = [] if requested_paths: [ preserved_paths.append(r) for r in requested_paths if not lexists(r['path'] if isinstance(r, dict) else r) ] # replace the requested paths by those paths that were actually # modified underneath or at a requested location requested_paths = get_modified_subpaths( # either the request, or the base dataset, if there was no request requested_paths if requested_paths else [refds_path], refds=Dataset(refds_path), revision=modified, report_no_revision_change=force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit) from itertools import chain # re-append the preserved paths: requested_paths = chain(requested_paths, iter(preserved_paths)) # do not loop over unique(), this could be a list of dicts # we avoid duplicates manually below via `reported_paths` for path in requested_paths: if not isinstance(path, dict): path = rawpath2ap(path, refds_path) # this is now an annotated path! path_props = path path = path['path'] # we need to mark our territory, who knows where this has been path_props.update(res_kwargs) if path in reported_paths: # we already recorded this path in the output # this can happen, whenever `path` is a subdataset, that was # discovered via recursive processing of another path before continue # the path exists in some shape or form # TODO if we have path_props already we could skip this test if isdir(path): # keep any existing type info, previously a more expensive run # could have discovered an uninstalled 'dataset', and we don't # want it to be relabeled to a directory path_props['type'] = \ path_props.get( 'type', 'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory') # this could contain all types of additional content containing_dir = path if not islink(path) else normpath( opj(path, pardir)) else: if lexists(path): path_props['type'] = 'file' else: path_props['state'] = 'absent' # for everything else we are interested in the container containing_dir = dirname(path) if not containing_dir: containing_dir = curdir dspath = parent = get_dataset_root(containing_dir) if dspath: if path_props.get('type', None) == 'dataset': # for a dataset the root is not the parent, for anything else # it is parent = path_props.get('parentds', None) oneupdir = normpath(opj(containing_dir, pardir)) if parent is None and (force_parentds_discovery or (refds_path and _with_sep(oneupdir).startswith( _with_sep(refds_path)))): # either forced, or only if we have a reference dataset, and # only if we stay within this refds when searching for the # parent parent = get_dataset_root( normpath(opj(containing_dir, pardir))) # NOTE the `and refds_path` is critical, as it will determine # whether a top-level dataset that was discovered gets the # parent property or not, it won't get it without a common # base dataset, and that is how we always rolled if parent and refds_path: path_props['parentds'] = parent # don't check whether this is actually a true subdataset of the # parent, done further down else: # set parent, but prefer existing property path_props['parentds'] = path_props.get('parentds', dspath) # test for `dspath` not `parent`, we only need to know whether there is # ANY dataset, not which one is the true parent, logic below relies on # the fact that we end here, if there is no dataset at all if not dspath: # not in any dataset res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with 
any dataset' reported_paths[path] = res yield res continue # check that we only got SUBdatasets if refds_path and not path_startswith(dspath, refds_path): res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = \ ('path not part of the reference dataset at %s', refds_path) reported_paths[path] = res yield res continue if path_props.get('type', None) == 'file': # nothing else we can learn about this res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res continue containing_ds = None path_type = path_props.get('type', None) if parent and force_subds_discovery and ( (path_type == 'dataset' and 'registered_subds' not in path_props) or path_type == 'directory' or not lexists(path)): from datalad.distribution.subdatasets import Subdatasets # if the path doesn't exist, or is labeled a directory, or a dataset even # a dataset (without this info) -> record whether this is a known subdataset # to its parent containing_ds = Dataset(parent) subdss = containing_ds.subdatasets(fulfilled=None, recursive=False, result_xfm=None, result_filter=None, return_type='list') if path in [s['path'] for s in subdss]: if path_type == 'directory' or not lexists(path): # first record that it isn't here, if just a dir or not here at all path_props['state'] = 'absent' # this must be a directory, and it is not installed path_props['type'] = 'dataset' path_props['registered_subds'] = True if not lexists(path) or \ (path_props.get('type', None) == 'dataset' and path_props.get('state', None) == 'absent'): # not there (yet) message = unavailable_path_msg if unavailable_path_msg else None if message and '%s' in message: message = (message, path) path_props['message'] = message res = get_status_dict(**dict(res_kwargs, **path_props)) # assign given status, but only if the props don't indicate a status # already res['status'] = path_props.get('status', unavailable_path_status) reported_paths[path] = res yield res continue # we know everything we can, report res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res rec_paths = [] if recursive: # here we need to consider the special case that `path` is # a dataset itself, if a recursion_limit is given (e.g. 
# `remove` will do that by default), we need to recurse # from the dataset itself, and not its parent to get things # right -- this will also avoid needless discovery of # unrelated subdatasets if path_props.get('type', None) == 'dataset': containing_ds = Dataset(path) else: # regular parent, we might have a dataset already containing_ds = Dataset( parent) if containing_ds is None else containing_ds for r in yield_recursive(containing_ds, path, action, recursion_limit): # capture reported paths r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] reported_paths[r['path']] = r if modified is not None: # we cannot yield right away, maybe it wasn't modified rec_paths.append(r) else: yield r if modified is not None and rec_paths: # replace the recursively discovered paths by those paths that # were actually modified underneath or at a requested location for r in get_modified_subpaths( rec_paths, refds=Dataset(refds_path), revision=modified, report_no_revision_change= force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit): res = get_status_dict(**dict(r, **res_kwargs)) reported_paths[res['path']] = res yield res return
def test_repo_diff(path, norepo): ds = Dataset(path).create() assert_repo_status(ds.path) assert_raises(ValueError, ds.repo.diff, fr='WTF', to='MIKE') if ds.repo.is_managed_branch(): fr_base = DEFAULT_BRANCH to = DEFAULT_BRANCH else: fr_base = "HEAD" to = None # no diff eq_(ds.repo.diff(fr_base, to), {}) # bogus path makes no difference eq_(ds.repo.diff(fr_base, to, paths=['THIS']), {}) # let's introduce a known change create_tree(ds.path, {'new': 'empty'}) ds.save(to_git=True) assert_repo_status(ds.path) eq_( ds.repo.diff(fr=fr_base + '~1', to=fr_base), { ut.Path(ds.repo.pathobj / 'new'): { 'state': 'added', 'type': 'file', 'bytesize': 5, 'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6' } }) # modify known file create_tree(ds.path, {'new': 'notempty'}) eq_( ds.repo.diff(fr='HEAD', to=None), { ut.Path(ds.repo.pathobj / 'new'): { 'state': 'modified', 'type': 'file', # the beast is modified, but no change in shasum -> not staged 'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6', 'prev_gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6' } }) # per path query gives the same result eq_(ds.repo.diff(fr=fr_base, to=to), ds.repo.diff(fr=fr_base, to=to, paths=['new'])) # also giving a directory as a constraint does the same eq_(ds.repo.diff(fr=fr_base, to=to), ds.repo.diff(fr=fr_base, to=to, paths=['.'])) # but if we give another path, it doesn't show up eq_(ds.repo.diff(fr=fr_base, to=to, paths=['other']), {}) # make clean ds.save() assert_repo_status(ds.path) # untracked stuff create_tree(ds.path, {'deep': {'down': 'untracked', 'down2': 'tobeadded'}}) # default is to report all files eq_( ds.repo.diff(fr='HEAD', to=None), { ut.Path(ds.repo.pathobj / 'deep' / 'down'): { 'state': 'untracked', 'type': 'file' }, ut.Path(ds.repo.pathobj / 'deep' / 'down2'): { 'state': 'untracked', 'type': 'file' } }) # but can be made more compact eq_( ds.repo.diff(fr='HEAD', to=None, untracked='normal'), { ut.Path(ds.repo.pathobj / 'deep'): { 'state': 'untracked', 'type': 'directory' } }) # again an unmatching path constraint will give an empty report eq_(ds.repo.diff(fr='HEAD', to=None, paths=['other']), {}) # perfect match and anything underneath will do eq_( ds.repo.diff(fr='HEAD', to=None, paths=['deep']), { ut.Path(ds.repo.pathobj / 'deep' / 'down'): { 'state': 'untracked', 'type': 'file' }, ut.Path(ds.repo.pathobj / 'deep' / 'down2'): { 'state': 'untracked', 'type': 'file' } })
def test_diff(path, norepo): with chpwd(norepo): assert_raises(NoDatasetFound, diff) ds = Dataset(path).create() assert_repo_status(ds.path) # reports stupid revision input assert_result_count(ds.diff(fr='WTF', on_failure='ignore', result_renderer=None), 1, status='impossible', message="Git reference 'WTF' invalid") # no diff assert_result_count(_dirty_results(ds.diff(result_renderer=None)), 0) assert_result_count( _dirty_results(ds.diff(fr='HEAD', result_renderer=None)), 0) # bogus path makes no difference assert_result_count( _dirty_results(ds.diff(path='THIS', fr='HEAD', result_renderer=None)), 0) # let's introduce a known change create_tree(ds.path, {'new': 'empty'}) ds.save(to_git=True) assert_repo_status(ds.path) if ds.repo.is_managed_branch(): fr_base = DEFAULT_BRANCH to = DEFAULT_BRANCH else: fr_base = "HEAD" to = None res = _dirty_results( ds.diff(fr=fr_base + '~1', to=to, result_renderer=None)) assert_result_count(res, 1) assert_result_count(res, 1, action='diff', path=op.join(ds.path, 'new'), state='added') # we can also find the diff without going through the dataset explicitly with chpwd(ds.path): assert_result_count(_dirty_results( diff(fr=fr_base + '~1', to=to, result_renderer=None)), 1, action='diff', path=op.join(ds.path, 'new'), state='added') # no diff against HEAD assert_result_count(_dirty_results(ds.diff(result_renderer=None)), 0) # modify known file create_tree(ds.path, {'new': 'notempty'}) res = _dirty_results(ds.diff(result_renderer=None)) assert_result_count(res, 1) assert_result_count(res, 1, action='diff', path=op.join(ds.path, 'new'), state='modified') # but if we give another path, it doesn't show up assert_result_count(ds.diff(path='otherpath', result_renderer=None), 0) # giving the right path must work though assert_result_count(ds.diff(path='new', result_renderer=None), 1, action='diff', path=op.join(ds.path, 'new'), state='modified') # stage changes ds.repo.add('.', git=True) # no change in diff, staged is not committed assert_result_count(_dirty_results(ds.diff(result_renderer=None)), 1) ds.save() assert_repo_status(ds.path) assert_result_count(_dirty_results(ds.diff(result_renderer=None)), 0) # untracked stuff create_tree(ds.path, {'deep': {'down': 'untracked', 'down2': 'tobeadded'}}) # a plain diff should report the untracked file # but not directly, because the parent dir is already unknown res = _dirty_results(ds.diff(result_renderer=None)) assert_result_count(res, 1) assert_result_count(res, 1, state='untracked', type='directory', path=op.join(ds.path, 'deep')) # report of individual files is also possible assert_result_count(ds.diff(untracked='all', result_renderer=None), 2, state='untracked', type='file') # an unmatching path will hide this result assert_result_count(ds.diff(path='somewhere', result_renderer=None), 0) # perfect match and anything underneath will do assert_result_count(ds.diff(path='deep', result_renderer=None), 1, state='untracked', path=op.join(ds.path, 'deep'), type='directory') assert_result_count(ds.diff(path='deep', result_renderer=None), 1, state='untracked', path=op.join(ds.path, 'deep')) ds.repo.add(op.join('deep', 'down2'), git=True) # now the remaining file is the only untracked one assert_result_count(ds.diff(result_renderer=None), 1, state='untracked', path=op.join(ds.path, 'deep', 'down'), type='file')
def test_diff_recursive(path): ds = Dataset(path).create() sub = ds.create('sub') # look at the last change, and confirm a dataset was added res = ds.diff(fr=DEFAULT_BRANCH + '~1', to=DEFAULT_BRANCH, result_renderer=None) assert_result_count(res, 1, action='diff', state='added', path=sub.path, type='dataset') # now recursive res = ds.diff(recursive=True, fr=DEFAULT_BRANCH + '~1', to=DEFAULT_BRANCH, result_renderer=None) # we also get the entire diff of the subdataset from scratch assert_status('ok', res) ok_(len(res) > 3) # one specific test assert_result_count(res, 1, action='diff', state='added', path=op.join(sub.path, '.datalad', 'config')) # now we add a file to just the parent create_tree(ds.path, { 'onefile': 'tobeadded', 'sub': { 'twofile': 'tobeadded' } }) res = ds.diff(recursive=True, untracked='all', result_renderer=None) assert_result_count(_dirty_results(res), 3) assert_result_count(res, 1, action='diff', state='untracked', path=op.join(ds.path, 'onefile'), type='file') assert_result_count(res, 1, action='diff', state='modified', path=sub.path, type='dataset') assert_result_count(res, 1, action='diff', state='untracked', path=op.join(sub.path, 'twofile'), type='file') # intentional save in two steps to make check below easier ds.save('sub', recursive=True) ds.save() assert_repo_status(ds.path) head_ref = DEFAULT_BRANCH if ds.repo.is_managed_branch() else 'HEAD' # look at the last change, only one file was added res = ds.diff(fr=head_ref + '~1', to=head_ref, result_renderer=None) assert_result_count(_dirty_results(res), 1) assert_result_count(res, 1, action='diff', state='added', path=op.join(ds.path, 'onefile'), type='file') # now the exact same thing with recursion, must not be different from the # call above res = ds.diff(recursive=True, fr=head_ref + '~1', to=head_ref, result_renderer=None) assert_result_count(_dirty_results(res), 1) # last change in parent assert_result_count(res, 1, action='diff', state='added', path=op.join(ds.path, 'onefile'), type='file') if ds.repo.is_managed_branch(): raise SkipTest( "Test assumption broken: https://github.com/datalad/datalad/issues/3818" ) # one further back brings in the modified subdataset, and the added file # within it res = ds.diff(recursive=True, fr=head_ref + '~2', to=head_ref, result_renderer=None) assert_result_count(_dirty_results(res), 3) assert_result_count(res, 1, action='diff', state='added', path=op.join(ds.path, 'onefile'), type='file') assert_result_count(res, 1, action='diff', state='added', path=op.join(sub.path, 'twofile'), type='file') assert_result_count(res, 1, action='diff', state='modified', path=sub.path, type='dataset')
def get_baseline(p): ds = Dataset(p).create() sub = create(str(ds.pathobj / 'sub')) assert_repo_status(ds.path, untracked=['sub']) return ds
def check_push(annex, src_path, dst_path): # prepare src src = Dataset(src_path).create(annex=annex) src_repo = src.repo # push should not add branches to the local dataset orig_branches = src_repo.get_branches() assert_not_in('synced/' + DEFAULT_BRANCH, orig_branches) res = src.push(on_failure='ignore') assert_result_count(res, 1) assert_in_results( res, status='impossible', message='No push target given, and none could be auto-detected, ' 'please specify via --to') eq_(orig_branches, src_repo.get_branches()) # target sibling target = mk_push_target(src, 'target', dst_path, annex=annex) eq_(orig_branches, src_repo.get_branches()) res = src.push(to="target") eq_(orig_branches, src_repo.get_branches()) assert_result_count(res, 2 if annex else 1) assert_in_results(res, action='publish', status='ok', target='target', refspec=DEFAULT_REFSPEC, operations=['new-branch']) assert_repo_status(src_repo, annex=annex) eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)), list(src_repo.get_branch_commits_(DEFAULT_BRANCH))) # configure a default merge/upstream target src.config.set('branch.{}.remote'.format(DEFAULT_BRANCH), 'target', where='local') src.config.set('branch.{}.merge'.format(DEFAULT_BRANCH), DEFAULT_BRANCH, where='local') # don't fail when doing it again, no explicit target specification # needed anymore res = src.push() eq_(orig_branches, src_repo.get_branches()) # and nothing is pushed assert_status('notneeded', res) assert_repo_status(src_repo, annex=annex) eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)), list(src_repo.get_branch_commits_(DEFAULT_BRANCH))) # some modification: (src.pathobj / 'test_mod_file').write_text("Some additional stuff.") src.save(to_git=True, message="Modified.") (src.pathobj / 'test_mod_annex_file').write_text("Heavy stuff.") src.save(to_git=not annex, message="Modified again.") assert_repo_status(src_repo, annex=annex) # we could say since='HEAD~2' to make things fast, or we are lazy # and say since='^' to indicate the state of the tracking remote # which is the same, because we made two commits since the last push.
res = src.push(to='target', since="^", jobs=2) assert_in_results( res, action='publish', status='ok', target='target', refspec=DEFAULT_REFSPEC, # we get to see what happened operations=['fast-forward']) if annex: # we got to see the copy result for the annexed files assert_in_results(res, action='copy', status='ok', path=str(src.pathobj / 'test_mod_annex_file')) # we published, so we can drop and reobtain ok_(src_repo.file_has_content('test_mod_annex_file')) src_repo.drop('test_mod_annex_file') ok_(not src_repo.file_has_content('test_mod_annex_file')) src_repo.get('test_mod_annex_file') ok_(src_repo.file_has_content('test_mod_annex_file')) ok_file_has_content(src_repo.pathobj / 'test_mod_annex_file', 'Heavy stuff.') eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)), list(src_repo.get_branch_commits_(DEFAULT_BRANCH))) if not (annex and src_repo.is_managed_branch()): # the following doesn't make sense in managed branches, because # a commit that could be amended is no longer the last commit # of a branch after a sync has happened (which did happen # during the last push above # amend and change commit msg in order to test for force push: src_repo.commit("amended", options=['--amend']) # push should be rejected (non-fast-forward): res = src.push(to='target', since='HEAD~2', on_failure='ignore') # fails before even touching the annex branch assert_in_results(res, action='publish', status='error', target='target', refspec=DEFAULT_REFSPEC, operations=['rejected', 'error']) # push with force=True works: res = src.push(to='target', since='HEAD~2', force='gitpush') assert_in_results(res, action='publish', status='ok', target='target', refspec=DEFAULT_REFSPEC, operations=['forced-update']) eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)), list(src_repo.get_branch_commits_(DEFAULT_BRANCH))) # we do not have more branches than we had in the beginning # in particular no 'synced/<default branch>' eq_(orig_branches, src_repo.get_branches())
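# A hedged illustration (not part of check_push above) of the `since` variants it uses;
# the dataset path and sibling name are placeholders.
ds = Dataset('/tmp/some-dataset')
ds.push(to='target', since='HEAD~2')     # only what is newer than two commits back
ds.push(to='target', since='^')          # only what the recorded state of 'target' lacks
ds.push(to='target', since='^', jobs=2)  # same selection, with parallel annex transfer jobs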
def test_save(path): ds = Dataset(path) with open(op.join(path, "new_file.tst"), "w") as f: f.write("something") ds.repo.add("new_file.tst", git=True) ok_(ds.repo.dirty) ds.save(message="add a new file") assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) with open(op.join(path, "new_file.tst"), "w") as f: f.write("modify") ok_(ds.repo.dirty) ds.save(message="modified new_file.tst") assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) # save works without ds and files given in the PWD with open(op.join(path, "new_file.tst"), "w") as f: f.write("rapunzel") with chpwd(path): save(message="love rapunzel") assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) # and also without `-a` when things are staged with open(op.join(path, "new_file.tst"), "w") as f: f.write("exotic") ds.repo.add("new_file.tst", git=True) with chpwd(path): save(message="love marsians") assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) files = ['one.txt', 'two.txt'] for fn in files: with open(op.join(path, fn), "w") as f: f.write(fn) ds.save([op.join(path, f) for f in files]) # superfluous call to save (all was saved already), should not fail # but report that nothing was saved assert_status('notneeded', ds.save(message="set of new files")) assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) # create subdataset subds = ds.create('subds') assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) # modify subds with open(op.join(subds.path, "some_file.tst"), "w") as f: f.write("something") subds.save() assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo)) # ensure modified subds is committed ds.save() assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) # now introduce a change downstairs subds.create('someotherds') assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo)) ok_(ds.repo.dirty) # and save via subdataset path ds.save('subds', version_tag='new_sub') assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo)) tags = ds.repo.get_tags() ok_(len(tags) == 1) eq_(tags[0], dict(hexsha=ds.repo.get_hexsha(), name='new_sub')) # fails when retagged, like git does res = ds.save(version_tag='new_sub', on_failure='ignore') assert_status('error', res) assert_result_count(res, 1, action='save', type='dataset', path=ds.path, message=('cannot tag this version: %s', "fatal: tag 'new_sub' already exists"))
def test_push_recursive(origin_path, src_path, dst_top, dst_sub, dst_subnoannex, dst_subsub): # dataset with two submodules and one subsubmodule origin = Dataset(origin_path).create() origin_subm1 = origin.create('sub m') origin_subm1.create('subsub m') origin.create('subm noannex', annex=False) origin.save() assert_repo_status(origin.path) # prepare src as a fresh clone with all subdatasets checked out recursively # running on a clone should make the test scenario more different than # test_push(), even for the pieces that should be identical top = Clone.__call__(source=origin.path, path=src_path) sub, subsub, subnoannex = top.get('.', recursive=True, get_data=False, result_xfm='datasets') target_top = mk_push_target(top, 'target', dst_top, annex=True) # subdatasets have no remote yet, so recursive publishing should fail: res = top.push(to="target", recursive=True, on_failure='ignore') assert_in_results(res, path=top.path, type='dataset', refspec=DEFAULT_REFSPEC, operations=['new-branch'], action='publish', status='ok', target='target') for d in (sub, subsub, subnoannex): assert_in_results(res, status='error', type='dataset', path=d.path, message=("Unknown target sibling '%s'.", 'target')) # now fix that and set up targets for the submodules target_sub = mk_push_target(sub, 'target', dst_sub, annex=True) target_subnoannex = mk_push_target(subnoannex, 'target', dst_subnoannex, annex=False) target_subsub = mk_push_target(subsub, 'target', dst_subsub, annex=True) # and same push call as above res = top.push(to="target", recursive=True) # topds skipped assert_in_results(res, path=top.path, type='dataset', action='publish', status='notneeded', target='target') # the rest pushed for d in (sub, subsub, subnoannex): assert_in_results(res, status='ok', type='dataset', path=d.path, refspec=DEFAULT_REFSPEC) # all corresponding branches match across all datasets for s, d in zip( (top, sub, subnoannex, subsub), (target_top, target_sub, target_subnoannex, target_subsub)): eq_(list(s.repo.get_branch_commits_(DEFAULT_BRANCH)), list(d.get_branch_commits_(DEFAULT_BRANCH))) if s != subnoannex: eq_(list(s.repo.get_branch_commits_("git-annex")), list(d.get_branch_commits_("git-annex"))) # rerun should not result in further pushes of the default branch res = top.push(to="target", recursive=True) assert_not_in_results(res, status='ok', refspec=DEFAULT_REFSPEC) assert_in_results(res, status='notneeded', refspec=DEFAULT_REFSPEC) if top.repo.is_managed_branch(): raise SkipTest( 'Save/status of subdataset with managed branches is still an ' 'unresolved issue') # now annex a file in subsub test_copy_file = subsub.pathobj / 'test_mod_annex_file' test_copy_file.write_text("Heavy stuff.") # save all the way up assert_status(('ok', 'notneeded'), top.save(message='subsub got something', recursive=True)) assert_repo_status(top.path) # publish straight up, should be smart by default res = top.push(to="target", recursive=True) # we see 3 out of 4 datasets pushed (sub noannex was left unchanged) for d in (top, sub, subsub): assert_in_results(res, status='ok', type='dataset', path=d.path, refspec=DEFAULT_REFSPEC) # file content copied too assert_in_results(res, action='copy', status='ok', path=str(test_copy_file)) # verify it is accessible, drop and bring back assert_status('ok', top.drop(str(test_copy_file))) ok_(not subsub.repo.file_has_content('test_mod_annex_file')) top.get(test_copy_file) ok_file_has_content(test_copy_file, 'Heavy stuff.') # make two modifications (sub.pathobj / 'test_mod_annex_file').write_text('annex')
(subnoannex.pathobj / 'test_mod_file').write_text('git') # save separately top.save(sub.pathobj, message='annexadd', recursive=True) top.save(subnoannex.pathobj, message='gitadd', recursive=True) # now only publish the latter one res = top.push(to="target", since='HEAD~1', recursive=True) # nothing copied, no reports on the other modification assert_not_in_results(res, action='copy') assert_not_in_results(res, path=sub.path) for d in (top, subnoannex): assert_in_results(res, status='ok', type='dataset', path=d.path, refspec=DEFAULT_REFSPEC) # an unconditional push should now pick up the remaining changes res = top.push(to="target", recursive=True) assert_in_results(res, action='copy', status='ok', path=str(sub.pathobj / 'test_mod_annex_file')) assert_in_results(res, status='ok', type='dataset', path=sub.path, refspec=DEFAULT_REFSPEC) for d in (top, subnoannex, subsub): assert_in_results(res, status='notneeded', type='dataset', path=d.path, refspec=DEFAULT_REFSPEC)
def test_bf1886(path): parent = Dataset(path).create() parent.create('sub') assert_repo_status(parent.path) # create a symlink pointing down to the subdataset, and add it os.symlink('sub', op.join(parent.path, 'down')) parent.save('down') assert_repo_status(parent.path) # now symlink pointing up os.makedirs(op.join(parent.path, 'subdir', 'subsubdir')) os.symlink(op.join(op.pardir, 'sub'), op.join(parent.path, 'subdir', 'up')) parent.save(op.join('subdir', 'up')) # 'all' to avoid the empty dir being listed assert_repo_status(parent.path, untracked_mode='all') # now symlink pointing 2xup, as in #1886 os.symlink(op.join(op.pardir, op.pardir, 'sub'), op.join(parent.path, 'subdir', 'subsubdir', 'upup')) parent.save(op.join('subdir', 'subsubdir', 'upup')) assert_repo_status(parent.path) # simultaneously add a subds and a symlink pointing to it # create subds, but don't register it create(op.join(parent.path, 'sub2')) os.symlink(op.join(op.pardir, op.pardir, 'sub2'), op.join(parent.path, 'subdir', 'subsubdir', 'upup2')) parent.save(['sub2', op.join('subdir', 'subsubdir', 'upup2')]) assert_repo_status(parent.path) # full replication of #1886: the above, but from within a subdir of the symlink # with no reference dataset create(op.join(parent.path, 'sub3')) os.symlink(op.join(op.pardir, op.pardir, 'sub3'), op.join(parent.path, 'subdir', 'subsubdir', 'upup3')) # need to use absolute paths with chpwd(op.join(parent.path, 'subdir', 'subsubdir')): save([ op.join(parent.path, 'sub3'), op.join(parent.path, 'subdir', 'subsubdir', 'upup3') ]) assert_repo_status(parent.path)
class ArchiveAnnexCustomRemote(AnnexCustomRemote): """Special custom remote allowing to obtain files from archives Archives must be under annex'ed themselves. """ CUSTOM_REMOTE_NAME = "archive" SUPPORTED_SCHEMES = ( AnnexCustomRemote._get_custom_scheme(CUSTOM_REMOTE_NAME), ) # Since we support only 1 scheme here URL_SCHEME = SUPPORTED_SCHEMES[0] URL_PREFIX = URL_SCHEME + ":" AVAILABILITY = "local" COST = 500 def __init__(self, annex, path=None, persistent_cache=True, **kwargs): super().__init__(annex) # MIH figure out what the following is all about # in particular path==None self.repo = Dataset(get_dataset_root(Path.cwd())).repo \ if not path \ else AnnexRepo(path, create=False, init=False) self.path = self.repo.path # annex requests load by KEY not but URL which it originally asked # about. So for a key we might get back multiple URLs and as a # heuristic let's use the most recently asked one self._last_url = None # for heuristic to choose among multiple URLs self._cache = ArchivesCache(self.path, persistent=persistent_cache) self._contentlocations = DictCache(size_limit=100) # TODO: config ? def stop(self, *args): """Stop communication with annex""" self._cache.clean() def get_file_url(self, archive_file=None, archive_key=None, file=None, size=None): """Given archive (file or a key) and a file -- compose URL for access Examples -------- dl+archive:SHA256E-s176--69...3e.tar.gz#path=1/d2/2d&size=123 when size of file within archive was known to be 123 dl+archive:SHA256E-s176--69...3e.tar.gz#path=1/d2/2d when size of file within archive was not provided Parameters ---------- size: int, optional Size of the file. If not provided, will simply be empty """ assert (file is not None) if archive_file is not None: if archive_key is not None: raise ValueError( "Provide archive_file or archive_key - not both") archive_key = self.repo.get_file_annexinfo(archive_file)['key'] assert (archive_key is not None) attrs = OrderedDict() # looking forward for more if file: attrs['path'] = file.lstrip('/') if size is not None: attrs['size'] = size return str( URL(scheme=self.URL_SCHEME, path=archive_key, fragment=attrs)) @property def cache(self): return self._cache def _parse_url(self, url): """Parse url and return archive key, file within archive and additional attributes (such as size)""" url = URL(url) assert (url.scheme == self.URL_SCHEME) fdict = url.fragment_dict if 'path' not in fdict: # must be old-style key/path#size= assert '/' in url.path, "must be of key/path format" key, path = url.path.split('/', 1) else: key, path = url.path, fdict.pop('path') if 'size' in fdict: fdict['size'] = int(fdict['size']) return key, path, fdict def _gen_akey_afiles(self, key, sorted=False, unique_akeys=True): """Given a key, yield akey, afile pairs if `sorted`, then first those which have extracted version in local cache will be yielded Gets determined based on urls for datalad archives Made "generators all the way" as an exercise but also to delay any checks etc until really necessary. 
""" # we will need all URLs anyways later on ATM, so lets list() them # Anyways here we have a single scheme (archive) so there is not # much optimization possible urls = list(self.gen_URLS(key)) akey_afiles = [ self._parse_url(url)[:2] # skip size for url in urls ] if unique_akeys: akey_afiles = unique(akey_afiles, key=itemgetter(0)) if not sorted: for pair in akey_afiles: yield pair return # Otherwise we will go through each one # multiple URLs are available so we need to figure out which one # would be most efficient to "deal with" akey_afile_paths = (((akey, afile), self.get_contentlocation(akey, absolute=True, verify_exists=False)) for akey, afile in akey_afiles) # by default get_contentlocation would return empty result for a key # which is not available locally. But we could still have extracted # archive in the cache. So we need pretty much get first all possible # and then only remove those which aren't present locally. So # verify_exists was added yielded = set() akey_afile_paths_ = [] # utilize cache to check which archives might already be present in the # cache for akey_afile, akey_path in akey_afile_paths: if akey_path and self.cache[akey_path].is_extracted: yield akey_afile yielded.add(akey_afile) akey_afile_paths_.append((akey_afile, akey_path)) # replace generators with already collected ones into a list. The idea # that in many cases we don't even need to create a full list and that # initial single yield would be enough, thus we don't need to check # locations etc for every possible hit akey_afile_paths = akey_afile_paths_ # if not present in the cache -- check which are present # locally and choose that one to use, so it would get extracted for akey_afile, akey_path in akey_afile_paths: if akey_path and op.exists(akey_path): yielded.add(akey_afile) yield akey_afile # So no archive is present either in the cache or originally under # annex XXX some kind of a heuristic I guess is to use last_url ;-) if self._last_url and self._last_url in urls \ and (len(urls) == len(akey_afiles)): akey_afile, _ = akey_afile_paths[urls.index(self._last_url)] yielded.add(akey_afile) yield akey_afile for akey_afile, _ in akey_afile_paths: if akey_afile not in yielded: yield akey_afile def get_contentlocation(self, key, absolute=False, verify_exists=True): """Return (relative to top or absolute) path to the file containing the key This is a wrapper around AnnexRepo.get_contentlocation which provides caching of the result (we are asking the location for the same archive key often) """ if key not in self._contentlocations: fpath = self.repo.get_contentlocation(key, batch=True) if fpath: # shouldn't store empty ones self._contentlocations[key] = fpath else: fpath = self._contentlocations[key] # but verify that it exists if verify_exists and not op.lexists(op.join(self.path, fpath)): # prune from cache del self._contentlocations[key] fpath = '' if absolute and fpath: return op.join(self.path, fpath) else: return fpath # Protocol implementation def checkurl(self, url): # TODO: what about those MULTI and list to be returned? # should we return all filenames or keys within archive? # might be way too many? # only if just archive portion of url is given or the one pointing # to specific file? 
lgr.debug("Current directory: %s, url: %s", os.getcwd(), url) akey, afile, attrs = self._parse_url(url) size = attrs.get('size', None) # But reply that present only if archive is present # TODO: this would throw exception if not present, so this statement is # kinda bogus akey_path = self.get_contentlocation(akey, absolute=True) if akey_path: # Extract via cache only if size is not yet known if size is None: # if for testing we want to force getting the archive extracted efile = self.cache[akey_path].get_extracted_filename(afile) efile = ensure_bytes(efile) if op.exists(efile): size = os.stat(efile).st_size # so it was a good successful one -- record self._last_url = url if size is None: return True else: # FIXME: providing filename causes annex to not even talk to # ask upon drop :-/ return [dict(size=size)] # , basename(afile)) else: # TODO: theoretically we should first check if key is available # from any remote to know if file is available return False def checkpresent(self, key): # TODO: so we need to maintain mapping from urls to keys. Then # we could even store the filename within archive # Otherwise it is unrealistic to even require to recompute key if we # knew the backend etc # The same content could be available from multiple locations within # the same archive, so let's not ask it twice since here we don't care # about "afile" for akey, _ in self._gen_akey_afiles(key, unique_akeys=True): if self.get_contentlocation(akey) \ or self.repo.is_available(akey, batch=True, key=True): return True # it is unclear to MIH why this must be UNKNOWN rather than FALSE # but this is how I found it raise RemoteError('Key not present') def remove(self, key): raise UnsupportedRequest('This special remote cannot remove content') # # TODO: proxy query to the underlying tarball under annex that if # # tarball was removed (not available at all) -- report success, # # otherwise failure (current the only one) # akey, afile = self._get_akey_afile(key) # if False: # # TODO: proxy, checking present of local tarball is not # # sufficient # # not exists(self.get_key_path(key)): # self.send("REMOVE-SUCCESS", akey) # else: # self.send("REMOVE-FAILURE", akey, # "Removal from file archives is not supported") def whereis(self, key): return False # although more logical is to report back success, it leads to imho # more confusing duplication. See # http://git-annex.branchable.com/design/external_special_remote_protocol/#comment-3f9588f6a972ae566347b6f467b53b54 # try: # key, file = self._get_akey_afile(key) # self.send("WHEREIS-SUCCESS", "file %s within archive %s" % (file, key)) # except ValueError: # self.send("WHEREIS-FAILURE") def transfer_retrieve(self, key, file): akeys_tried = [] # the same file could come from multiple files within the same archive # So far it doesn't make sense to "try all" of them since if one fails # it means the others would fail too, so it makes sense to immediately # prune the list so we keep only the ones from unique akeys. # May be whenever we support extraction directly from the tarballs # we should go through all and choose the one easiest to get or smth. for akey, afile in self._gen_akey_afiles(key, sorted=True, unique_akeys=True): if not akey: lgr.warning("Got an empty archive key %r for key %s. 
Skipping", akey, key) continue akeys_tried.append(akey) try: with lock_if_check_fails( check=(self.get_contentlocation, (akey, )), lock_path=(lambda k: op.join( self.repo.path, '.git', 'datalad-archives-%s' % k), (akey, )), operation="annex-get") as (akey_fpath, lock): if lock: assert not akey_fpath self._annex_get_archive_by_key(akey) akey_fpath = self.get_contentlocation(akey) if not akey_fpath: raise RuntimeError( "We were reported to fetch it alright but now can't " "get its location. Check logic") akey_path = op.join(self.repo.path, akey_fpath) assert op.exists(akey_path), \ "Key file %s is not present" % akey_path # Extract that bloody file from the bloody archive # TODO: implement/use caching, for now a simple one # actually patool doesn't support extraction of a single file # https://github.com/wummel/patool/issues/20 # so pwd = getpwd() lgr.debug("Getting file {afile} from {akey_path} " "while PWD={pwd}".format(**locals())) was_extracted = self.cache[akey_path].is_extracted apath = self.cache[akey_path].get_extracted_file(afile) link_file_load(apath, file) if not was_extracted and self.cache[akey_path].is_extracted: self.message( "%s special remote is using an extraction cache " "under %s. Remove it with DataLad's 'clean' " "command to save disk space." % (ARCHIVES_SPECIAL_REMOTE, self.cache[akey_path].path), type='info', ) return except Exception as exc: ce = CapturedException(exc) self.message( "Failed to fetch {akey} containing {key}: {msg}".format( akey=akey, key=key, # we need to get rid of any newlines, or we might # break the special remote protocol msg=str(ce).replace('\n', '|'))) continue raise RemoteError("Failed to fetch any archive containing {key}. " "Tried: {akeys_tried}".format(**locals())) def claimurl(self, url): scheme = urlparse(url).scheme if scheme in self.SUPPORTED_SCHEMES: return True else: return False def _annex_get_archive_by_key(self, akey): # TODO: make it more stringent? # Command could have fail to run if key was not present locally yet # Thus retrieve the key using annex # TODO: we need to report user somehow about this happening and # progress on the download from humanize import naturalsize from datalad.support.annexrepo import AnnexJsonProtocol akey_size = self.repo.get_size_from_key(akey) self.message( "To obtain some keys we need to fetch an archive " "of size %s" % (naturalsize(akey_size) if akey_size else "unknown"), type='info', ) try: self.repo._call_annex( ["get", "--json", "--json-progress", "--key", akey], protocol=AnnexJsonProtocol, ) except Exception: self.message(f'Failed to fetch archive with key {akey}') raise
def _get_procedure_implementation(name='*', ds=None): """get potential procedure path and configuration Order of consideration is user-level, system-level, dataset, datalad extensions, datalad. The first one found according to this order is the one to be returned. Therefore local definitions/configurations take precedence over ones that come from outside (via a datalad-extension or a dataset with its .datalad/config). If a dataset had precedence (as it was before), the addition (or just an update) of a (sub-)dataset would otherwise surprisingly cause you to execute code different from what you defined within ~/.gitconfig or your local repository's .git/config. So, local definitions take precedence over remote ones and more specific ones over more general ones. Returns ------- tuple path, name, format string, help message """ ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None # 1. check system and user account for procedure for loc in (cfg.obtain('datalad.locations.user-procedures'), cfg.obtain('datalad.locations.system-procedures')): for dir in assure_list(loc): for m, n in _get_file_match(dir, name): yield ( m, n, ) + _get_proc_config(n) # 2. check dataset for procedure if ds is not None and ds.is_installed(): # could be more than one dirs = assure_list( ds.config.obtain('datalad.locations.dataset-procedures')) for dir in dirs: # TODO `get` dirs if necessary for m, n in _get_file_match(op.join(ds.path, dir), name): yield ( m, n, ) + _get_proc_config(n, ds=ds) # 2.1. check subdatasets recursively for subds in ds.subdatasets(return_type='generator', result_xfm='datasets'): for m, n, f, h in _get_procedure_implementation(name=name, ds=subds): yield m, n, f, h # 3. check extensions for procedure # delay heavy import until here from pkg_resources import iter_entry_points from pkg_resources import resource_isdir from pkg_resources import resource_filename for entry_point in iter_entry_points('datalad.extensions'): # use of '/' here is OK wrt platform compatibility if resource_isdir(entry_point.module_name, 'resources/procedures'): for m, n in _get_file_match( resource_filename(entry_point.module_name, 'resources/procedures'), name): yield ( m, n, ) + _get_proc_config(n) # 4. at last check datalad itself for procedure for m, n in _get_file_match( resource_filename('datalad', 'resources/procedures'), name): yield ( m, n, ) + _get_proc_config(n)
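# A hedged usage sketch (not part of the original module) for the lookup generator above:
# the first yielded match is the most local definition per the documented ordering.
# The procedure name pattern and dataset path are placeholders.
ds = Dataset('/tmp/some-dataset')
for proc_path, proc_name, proc_format, proc_help in _get_procedure_implementation(
        name='cfg_*', ds=ds):
    print('would run %s from %s' % (proc_name, proc_path))
    break  # user/system definitions win over dataset, extension, and datalad-shipped ones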