def test_report_absent_keys(path=None):
    ds = Dataset(path).create()
    # create an annexed file
    testfile = ds.pathobj / 'dummy'
    testfile.write_text(u'nothing')
    ds.save()
    # present in a full report and in a partial report
    # based on worktree of HEAD ref
    for ai in (
            ds.repo.get_content_annexinfo(eval_availability=True),
            ds.repo.get_content_annexinfo(paths=['dummy'],
                                          eval_availability=True),
            ds.repo.get_content_annexinfo(ref='HEAD',
                                          eval_availability=True),
            ds.repo.get_content_annexinfo(ref='HEAD', paths=['dummy'],
                                          eval_availability=True)):
        assert_in(testfile, ai)
        assert_equal(ai[testfile]['has_content'], True)
    # drop the key, not available anywhere else
    ds.drop('dummy', reckless='kill')
    # does not change a thing, except the key is gone
    for ai in (
            ds.repo.get_content_annexinfo(eval_availability=True),
            ds.repo.get_content_annexinfo(paths=['dummy'],
                                          eval_availability=True),
            ds.repo.get_content_annexinfo(ref='HEAD',
                                          eval_availability=True),
            ds.repo.get_content_annexinfo(ref='HEAD', paths=['dummy'],
                                          eval_availability=True)):
        assert_in(testfile, ai)
        assert_equal(ai[testfile]['has_content'], False)
    # make sure files with URL keys are correctly reported:
    from datalad.conftest import test_http_server

    remote_file_name = 'imaremotefile.dat'
    local_file_name = 'mehasurlkey'
    (Path(test_http_server.path) / remote_file_name).write_text("weee")
    remote_file_url = f'{test_http_server.url}/{remote_file_name}'
    # we need to get a file with a URL key and check its local availability
    ds.repo.call_annex(
        ['addurl', '--relaxed', remote_file_url, '--file', local_file_name])
    ds.save("URL keys!")
    # should not be there
    res = ds.repo.get_file_annexinfo(local_file_name, eval_availability=True)
    assert_equal(res['has_content'], False)
    ds.get(local_file_name)
    # should be there
    res = ds.repo.get_file_annexinfo(local_file_name, eval_availability=True)
    assert_equal(res['has_content'], True)
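# A minimal sketch (not part of the original test) that factors the repeated
# four-way annexinfo availability check above into a single helper. It only
# assumes the AnnexRepo.get_content_annexinfo() keyword arguments already
# exercised in the test (`paths`, `ref`, `eval_availability`) and the same
# assert helpers used there.
def _assert_availability(repo, testfile, expected):
    # query the worktree and the HEAD ref, each with and without a path
    # constraint, and check the reported availability in every variant
    for kwargs in ({},
                   {'paths': ['dummy']},
                   {'ref': 'HEAD'},
                   {'ref': 'HEAD', 'paths': ['dummy']}):
        ai = repo.get_content_annexinfo(eval_availability=True, **kwargs)
        assert_in(testfile, ai)
        assert_equal(ai[testfile]['has_content'], expected)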
def test_copy_file(workdir=None, webdir=None, weburl=None):
    workdir = Path(workdir)
    webdir = Path(webdir)
    src_ds = Dataset(workdir / 'src').create()
    # put a file into the dataset by URL and drop it again
    src_ds.download_url('/'.join((weburl, 'webfile1')),
                        path='myfile1.txt')
    src_ds.download_url('/'.join((weburl, 'webfile2')),
                        path=opj('subdir', 'myfile2.txt'))
    ok_file_has_content(src_ds.pathobj / 'myfile1.txt', '123')
    # now create a fresh dataset
    dest_ds = Dataset(workdir / 'dest').create()
    if dest_ds.repo._check_version_kludges("fromkey-supports-unlocked") or \
       not dest_ds.repo.is_managed_branch():
        # unless we have a target ds on a crippled FS (where `annex fromkey`
        # doesn't work until after 8.20210428), we can even drop the file
        # content in the source repo
        src_ds.drop('myfile1.txt', reckless='kill')
        nok_(src_ds.repo.file_has_content('myfile1.txt'))
    # copy the file from the source dataset into it.
    # it must copy enough info to actually put datalad into the position
    # to obtain the file content from the original URL
    dest_ds.copy_file(src_ds.pathobj / 'myfile1.txt')
    dest_ds.get('myfile1.txt')
    ok_file_has_content(dest_ds.pathobj / 'myfile1.txt', '123')
    # purposefully pollute the employed tmp folder to check that we do not
    # trip over such a condition
    tmploc = dest_ds.pathobj / '.git' / 'tmp' / 'datalad-copy' / 'some'
    tmploc.parent.mkdir(parents=True)
    tmploc.touch()
    # copy again, but to a different target file name
    # (source+dest pair now)
    dest_ds.copy_file(
        [src_ds.pathobj / 'myfile1.txt', dest_ds.pathobj / 'renamed.txt'])
    ok_file_has_content(dest_ds.pathobj / 'renamed.txt', '123')
    # copying more than one at once
    dest_ds.copy_file([
        src_ds.pathobj / 'myfile1.txt',
        src_ds.pathobj / 'subdir' / 'myfile2.txt',
        dest_ds.pathobj
    ])
    # copy directly from a non-dataset location
    dest_ds.copy_file(webdir / 'webfile1')
    # copy from an annex dataset into a plain Git repo
    git_ds = Dataset(workdir / 'git').create(annex=False)
    git_ds.copy_file(src_ds.pathobj / 'subdir' / 'myfile2.txt')
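# For quick reference, a condensed recap (comments only, nothing beyond what
# the test above already demonstrates) of the copy_file() calling conventions
# it exercises; `src`, `src1`, `src2` are shorthand placeholders for source
# paths inside src_ds:
#
#   dest_ds.copy_file(src)                                  # single source path
#   dest_ds.copy_file([src, dest_ds.pathobj / 'renamed.txt'])  # source/dest pair
#   dest_ds.copy_file([src1, src2, dest_ds.pathobj])        # many sources + target dir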
def test_get_subdatasets(path):
    ds = Dataset(path)
    # one more subdataset with a name that could ruin config option parsing
    dots = text_type(Path('subdir') / '.lots.of.dots.')
    ds.create(dots)
    eq_(ds.subdatasets(recursive=True, fulfilled=False, result_xfm='relpaths'),
        ['sub dataset1'])
    ds.get('sub dataset1')
    eq_(ds.subdatasets(recursive=True, fulfilled=False, result_xfm='relpaths'),
        [
            'sub dataset1/2',
            'sub dataset1/sub sub dataset1',
            'sub dataset1/subm 1',
        ])
    # obtain key subdataset, so all leaf subdatasets are discoverable
    ds.get(opj('sub dataset1', 'sub sub dataset1'))
    eq_(ds.subdatasets(result_xfm='relpaths'), ['sub dataset1', dots])
    eq_([(r['parentds'], r['path']) for r in ds.subdatasets()],
        [(path, opj(path, 'sub dataset1')), (path, opj(path, dots))])
    eq_(ds.subdatasets(recursive=True, result_xfm='relpaths'),
        [
            'sub dataset1',
            'sub dataset1/2',
            'sub dataset1/sub sub dataset1',
            'sub dataset1/sub sub dataset1/2',
            'sub dataset1/sub sub dataset1/subm 1',
            'sub dataset1/subm 1',
            dots,
        ])
    # redo, but limit to specific paths
    eq_(
        ds.subdatasets(
            path=['sub dataset1/2', 'sub dataset1/sub sub dataset1'],
            recursive=True, result_xfm='relpaths'),
        [
            'sub dataset1/2',
            'sub dataset1/sub sub dataset1',
            'sub dataset1/sub sub dataset1/2',
            'sub dataset1/sub sub dataset1/subm 1',
        ])
    with chpwd(text_type(ds.pathobj / 'subdir')):
        # imitate cmdline invocation w/ no dataset argument
        # -> curdir limits the query, when no info is given
        eq_(subdatasets(dataset=None, path=[], recursive=True,
                        result_xfm='paths'),
            [text_type(ds.pathobj / dots)])
        # but with a dataset explicitly given, even if just as a path,
        # curdir does not limit the query
        eq_(subdatasets(dataset=os.pardir, path=None, recursive=True,
                        result_xfm='relpaths'),
            ['sub dataset1',
             'sub dataset1/2',
             'sub dataset1/sub sub dataset1',
             'sub dataset1/sub sub dataset1/2',
             'sub dataset1/sub sub dataset1/subm 1',
             'sub dataset1/subm 1',
             dots])
    # uses slow, flexible query
    eq_(ds.subdatasets(recursive=True, bottomup=True, result_xfm='relpaths'),
        [
            'sub dataset1/2',
            'sub dataset1/sub sub dataset1/2',
            'sub dataset1/sub sub dataset1/subm 1',
            'sub dataset1/sub sub dataset1',
            'sub dataset1/subm 1',
            'sub dataset1',
            dots,
        ])
    eq_(ds.subdatasets(recursive=True, fulfilled=True, result_xfm='relpaths'),
        [
            'sub dataset1',
            'sub dataset1/sub sub dataset1',
            dots,
        ])
    eq_([(relpath(r['parentds'], start=ds.path),
          relpath(r['path'], start=ds.path))
         for r in ds.subdatasets(recursive=True)],
        [
            (os.curdir, 'sub dataset1'),
            ('sub dataset1', 'sub dataset1/2'),
            ('sub dataset1', 'sub dataset1/sub sub dataset1'),
            ('sub dataset1/sub sub dataset1',
             'sub dataset1/sub sub dataset1/2'),
            ('sub dataset1/sub sub dataset1',
             'sub dataset1/sub sub dataset1/subm 1'),
            ('sub dataset1', 'sub dataset1/subm 1'),
            (os.curdir, dots),
        ])
    # uses slow, flexible query
    eq_(ds.subdatasets(recursive=True, recursion_limit=0), [])
    # uses slow, flexible query
    eq_(ds.subdatasets(recursive=True, recursion_limit=1,
                       result_xfm='relpaths'),
        ['sub dataset1', dots])
    # uses slow, flexible query
    eq_(ds.subdatasets(recursive=True, recursion_limit=2,
                       result_xfm='relpaths'),
        [
            'sub dataset1',
            'sub dataset1/2',
            'sub dataset1/sub sub dataset1',
            'sub dataset1/subm 1',
            dots,
        ])
    res = ds.subdatasets(recursive=True)
    assert_status('ok', res)
    for r in res:
        #for prop in ('gitmodule_url', 'state', 'revision', 'gitmodule_name'):
        for prop in ('gitmodule_url', 'revision', 'gitmodule_name'):
            assert_in(prop, r)
        # random property is unknown
        assert_not_in('mike', r)

    # now add info to all datasets
    res = ds.subdatasets(
        recursive=True,
        set_property=[('mike', 'slow'),
                      ('expansion', '<{refds_relname}>')])
    assert_status('ok', res)
    for r in res:
        eq_(r['gitmodule_mike'], 'slow')
        eq_(r['gitmodule_expansion'],
            relpath(r['path'], r['refds']).replace(os.sep, '-'))
    # plain query again to see if it got into the files
    res = ds.subdatasets(recursive=True)
    assert_status('ok', res)
    for r in res:
        eq_(r['gitmodule_mike'], 'slow')
        eq_(r['gitmodule_expansion'],
            relpath(r['path'], r['refds']).replace(os.sep, '-'))
    # and remove again
    res = ds.subdatasets(recursive=True, delete_property='mike')
    assert_status('ok', res)
    for r in res:
        for prop in ('gitmodule_mike',):
            assert_not_in(prop, r)
    # and again, because the above yields an on-the-fly edit
    res = ds.subdatasets(recursive=True)
    assert_status('ok', res)
    for r in res:
        for prop in ('gitmodule_mike',):
            assert_not_in(prop, r)

    #
    # test --contains
    #
    target_sub = 'sub dataset1/sub sub dataset1/subm 1'
    # give the closest direct subdataset
    eq_(ds.subdatasets(contains=opj(target_sub, 'something_inside'),
                       result_xfm='relpaths'),
        ['sub dataset1'])
    # should find the actual subdataset trail
    eq_(ds.subdatasets(recursive=True,
                       contains=opj(target_sub, 'something_inside'),
                       result_xfm='relpaths'),
        ['sub dataset1',
         'sub dataset1/sub sub dataset1',
         'sub dataset1/sub sub dataset1/subm 1'])
    # doesn't affect recursion limit
    eq_(ds.subdatasets(recursive=True, recursion_limit=2,
                       contains=opj(target_sub, 'something_inside'),
                       result_xfm='relpaths'),
        ['sub dataset1', 'sub dataset1/sub sub dataset1'])
    # for a direct dataset path match, return the matching dataset
    eq_(ds.subdatasets(recursive=True,
                       contains=target_sub,
                       result_xfm='relpaths'),
        ['sub dataset1',
         'sub dataset1/sub sub dataset1',
         'sub dataset1/sub sub dataset1/subm 1'])
    # but it has to be a subdataset, otherwise no match
    # which is what get_containing_subdataset() used to do
    eq_(ds.subdatasets(contains=ds.path), [])
    # no error if contains is bullshit
    eq_(ds.subdatasets(recursive=True,
                       contains='errrr_nope',
                       result_xfm='paths'),
        [])
    # TODO maybe add a courtesy bullshit detector some day
    eq_(ds.subdatasets(recursive=True,
                       contains=opj(pardir, 'errrr_nope'),
                       result_xfm='paths'),
        [])
    eq_(ds.subdatasets(
            recursive=True,
            contains=[target_sub, 'sub dataset1/2'],
            result_xfm='relpaths'),
        [
            'sub dataset1',
            'sub dataset1/2',
            'sub dataset1/sub sub dataset1',
            'sub dataset1/sub sub dataset1/subm 1',
        ])
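# A hedged verification sketch (not part of the original test): the persisted
# 'mike' property set via set_property above could also be read straight from
# .gitmodules with git-config. This assumes GitRepo.call_git() accepts a plain
# argument list, as it does elsewhere in DataLad; the helper name is made up
# for illustration.
def _dump_gitmodule_prop(ds, prop='mike'):
    # list all submodule sections in .gitmodules carrying the given property;
    # note that `git config --get-regexp` exits non-zero when nothing matches
    return ds.repo.call_git(
        ['config', '-f', '.gitmodules', '--get-regexp',
         r'submodule\..*\.{}'.format(prop)])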
def _update_ds_agginfo(refds_path, ds_path, subds_paths, agginfo_db, to_save):
    """Perform metadata aggregation for ds and a given list of subdataset paths

    Parameters
    ----------
    refds_path : str
      Absolute path to the reference dataset that aggregate_metadata() was
      called on.
    ds_path : str
      Absolute path to the dataset to have its aggregate info updated
    subds_paths : list(str)
      Sequence of absolute paths of subdatasets of the to-be-updated dataset,
      whose agginfo shall be updated within the to-be-updated dataset.
      Any subdataset that is not listed here is assumed to be gone (i.e. no
      longer a subdataset at all, not just not locally installed)
    agginfo_db : dict
      Dictionary with all information on aggregate metadata on all datasets.
      Keys are absolute paths of datasets.
    to_save : list
      List of paths to save eventually. This function will add new paths as
      necessary.
    """
    ds = Dataset(ds_path)
    # location info of aggregate metadata
    # aggregate.json
    agginfo_fpath = opj(ds.path, agginfo_relpath)
    # base path in which aggregate.json and objects are located
    agg_base_path = dirname(agginfo_fpath)
    # load existing aggregate info dict
    # TODO take from cache, once used in _get_dsinfo_from_aggmetadata()
    ds_agginfos = _load_json_object(agginfo_fpath)
    # object locations referenced initially
    objlocs_was = set(ai[k] for ai in ds_agginfos.values()
                      for k in location_keys
                      if k in ai)
    # track which objects need to be copied (each item is a from/to tuple)
    objs2copy = []
    # for each subdataset (any depth level)
    procds_paths = [ds_path] + subds_paths
    for dpath in procds_paths:
        ds_dbinfo = agginfo_db.get(dpath, {}).copy()
        # relative path of the current dataset within the dataset we are
        # updating
        drelpath = relpath(dpath, start=ds.path)
        for loclabel in location_keys:
            # TODO filepath_info is obsolete
            if loclabel == 'filepath_info' and drelpath == curdir:
                # do not write a file list into the dataset it is from
                if 'filepath_info' in ds_dbinfo:
                    del ds_dbinfo['filepath_info']
                continue
            # abspath to object
            objloc = ds_dbinfo.get(loclabel, None)
            if objloc is None:
                continue
            # XXX needs to change when layout of object store is changed
            # current is ./datalad/metadata/objects/{hash}/{hash}
            target_objrelpath = opj(*objloc.split(os.sep)[-3:])
            # make sure we copy the file from its current location to where
            # it is needed in this dataset
            target_objpath = opj(agg_base_path, target_objrelpath)
            objs2copy.append((objloc, target_objpath))
            # now build needed local relpath
            ds_dbinfo[loclabel] = target_objrelpath
        # (re)assign in case record is new
        ds_agginfos[drelpath] = ds_dbinfo
    # remove all entries for which we do not (or no longer) have a
    # corresponding subdataset to take care of
    ds_agginfos = {k: v for k, v in ds_agginfos.items()
                   if normpath(opj(ds_path, k)) in procds_paths}
    # set of metadata objects now referenced
    objlocs_is = set(
        ai[k]
        for sdsrpath, ai in ds_agginfos.items()
        for k in location_keys
        if k in ai)
    objs2add = objlocs_is

    # yoh: we apparently do need to filter the ones to remove - I did
    # "git reset --hard HEAD^" and
    # aggregate-metadata failed upon next run trying to remove
    # a file unknown to git.  I am yet to figure out why that
    # mattered (hopefully not that reflog is used somehow)
    objs2remove = []
    for obj in objlocs_was.difference(objlocs_is):
        obj_path = opj(agg_base_path, obj)
        if lexists(obj_path):
            objs2remove.append(obj_path)
        else:
            # not really a warning, we don't need it anymore, it is already
            # gone
            lgr.debug(
                "To-be-deleted metadata object not found, skip deletion (%s)",
                obj_path
            )

    # secretly remove obsolete object files, not really a result from a
    # user's perspective
    if objs2remove:
        ds.remove(
            objs2remove,
            # Don't use the misleading default commit message of `remove`:
            message='[DATALAD] Remove obsolete metadata object files',
            # we do not want to drop these files by default, because we would
            # lose them for other branches, and earlier tags
            # TODO evaluate whether this should be exposed as a switch
            # to run an explicit force-drop prior to calling remove()
            check=False,
            result_renderer=None,
            return_type=list)
        if not objs2add and not refds_path == ds_path:
            # this is not the base dataset, make sure to save removal in the
            # parentds -- not needed when objects get added, as removal itself
            # is already committed
            to_save.append(dict(path=ds_path, type='dataset', staged=True))

    # must copy object files to local target destination
    # make sure those objects are present
    ds.get([f for f, t in objs2copy], result_renderer='disabled')
    for copy_from, copy_to in objs2copy:
        if copy_to == copy_from:
            continue
        target_dir = dirname(copy_to)
        if not exists(target_dir):
            makedirs(target_dir)
        # TODO we could be more clever (later) and maybe `addurl` (or similar)
        # the file from another dataset
        if lexists(copy_to):
            # no need to unlock, just wipe out and replace
            os.remove(copy_to)
        shutil.copy(copy_from, copy_to)
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))

    if objs2add:
        # they are added the standard way, depending on the repo type
        ds.add(
            [opj(agg_base_path, p) for p in objs2add],
            save=False,
            result_renderer=None,
            return_type=list)
        # queue for save, and mark as staged
        to_save.extend(
            [dict(path=opj(agg_base_path, p), type='file', staged=True)
             for p in objs2add])
    # write aggregate info file
    if not ds_agginfos:
        return

    json_py.dump(ds_agginfos, agginfo_fpath)
    ds.add(agginfo_fpath, save=False, to_git=True,
           result_renderer=None, return_type=list)
    # queue for save, and mark as staged
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))
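# Worked example (comments only; the concrete path is hypothetical) of the
# object-store path mapping used in _update_ds_agginfo() above, following the
# layout noted there (.../metadata/objects/{hash}/{hash}):
#
#   objloc = '/tmp/ds/.datalad/metadata/objects/10/ab23cd'
#   opj(*objloc.split(os.sep)[-3:])   ->   'objects/10/ab23cd'
#
# i.e. only the last three path components are kept, yielding the
# dataset-local relative path that is stored in the aggregate info record
# and later resolved against agg_base_path when the object file is copied.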