def _store_agginfo_db(ds, db):
    # base path in which aggregate.json and objects is located
    agginfo_path, agg_base_path = get_ds_aggregate_db_locations(ds)
    # make DB paths on disk always relative
    json_py.dump(
        {
            op.relpath(p, start=ds.path):
            {k: op.relpath(v, start=agg_base_path) if k in location_keys else v
             for k, v in props.items()}
            for p, props in db.items()
        },
        agginfo_path
    )

def _store_agginfo_db(ds, db):
    # base path in which aggregate.json and objects is located
    agginfo_path, agg_base_path = get_ds_aggregate_db_locations(
        ds, warn_absent=False)
    # make DB paths on disk always relative
    json_py.dump(
        {
            op.relpath(p, start=ds.path):
            {k: op.relpath(v, start=agg_base_path) if k in location_keys else v
             for k, v in props.items()}
            for p, props in db.items()
        },
        agginfo_path
    )

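# -- Illustration (not part of the original module) -------------------------
# _store_agginfo_db() above relativizes two kinds of paths before dumping the
# DB to JSON: dataset keys become relative to the dataset root, and values
# stored under a "location key" become relative to the objects base path.
# A minimal, self-contained sketch of that transformation using only the
# standard library; `example_location_keys` and the sample paths are
# illustrative assumptions, not datalad's actual schema.
import json
import os.path as op

example_location_keys = ('dataset_info', 'content_info', 'filepath_info')


def relativize_db(db, ds_path, agg_base_path):
    """Return a copy of `db` with dataset keys relative to `ds_path` and
    object locations relative to `agg_base_path`."""
    return {
        op.relpath(p, start=ds_path): {
            k: op.relpath(v, start=agg_base_path)
            if k in example_location_keys else v
            for k, v in props.items()}
        for p, props in db.items()
    }


# Example: one dataset record with an absolute object location.
_db = {
    '/tmp/ds/subds': {
        'dataset_info': '/tmp/ds/.datalad/metadata/objects/ab/cd',
        'id': '0000',
    },
}
print(json.dumps(
    relativize_db(_db, '/tmp/ds', '/tmp/ds/.datalad/metadata'), indent=2))
# -> {"subds": {"dataset_info": "objects/ab/cd", "id": "0000"}}
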
def __call__(
        path=None,
        *,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        update_mode='target',
        incremental=False,
        force_extraction=False,
        save=True):
    refds_path = require_dataset(dataset)

    # it really doesn't work without a dataset
    ds = require_dataset(
        dataset, check_installed=True, purpose='metadata aggregation')
    path = ensure_list(path)
    if not path:
        # then current/reference dataset is "aggregated"
        # We should not add ds.path always since then --recursive would
        # also recurse current even if paths are given
        path.append(ds.path)

    agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(
        ds,
        # do not warn here, next call triggers the same warning
        warn_absent=False)
    agginfo_db = load_ds_aggregate_db(ds, abspath=True)

    to_save = []
    to_aggregate = set()

    paths_by_ds, errors = get_paths_by_ds(
        require_dataset(dataset), dataset, paths=ensure_list(path),
        subdsroot_mode='super')
    for ap in _minimal_annotate_paths(
            paths_by_ds,
            errors,
            action='aggregate_metadata',
            recursive=recursive,
            recursion_limit=recursion_limit):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        ap_type = ap.get('type', None)
        ap_state = ap.get('state', None)
        assert('parentds' in ap or ap_type == 'dataset')
        if ap_type == 'dataset' and ap_state != 'absent':
            # a present dataset, we can take directly from it
            aggsrc = ap['path']
            lgr.info('Aggregate metadata for dataset %s', aggsrc)
        else:
            # everything else needs to come from the parent
            aggsrc = ap['parentds']
            if ap_state == 'absent':
                lgr.info(
                    'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                    ap['path'], aggsrc)
            else:
                lgr.info(
                    'Aggregate metadata for %s from dataset at %s',
                    ap['path'], aggsrc)
        to_aggregate.add(aggsrc)

        if ap_state == 'absent':
            # key thought: recursive is done by path annotation, hence
            # once we hit an absent dataset, we are 100% certain that
            # there is nothing to recurse into on the file system
            # hence we only have to look into the aggregated metadata
            # of the last available dataset in the dataset tree edge
            #
            # if there is nothing at this path, we need to look into the
            # parentds and check if we know anything about this path
            # if we do, we need to grab all the info and objects
            # if not, we need to error
            res = _get_dsinfo_from_aggmetadata(
                aggsrc, ap['path'], recursive, agginfo_db)
            if not isinstance(res, list):
                yield get_status_dict(
                    status='impossible',
                    message=res,
                    action='aggregate_metadata',
                    path=ap['path'],
                    logger=lgr)
                continue
            # cue for aggregation
            to_aggregate.update(res)
        else:
            # actually aggregate metadata for this dataset, immediately place
            # generated objects into the aggregated or reference dataset,
            # and put info into DB to get it distributed to all datasets
            # that need to be updated
            errored = _dump_extracted_metadata(
                ds,
                Dataset(aggsrc),
                agginfo_db,
                to_save,
                force_extraction,
                agg_base_path)
            if errored:
                yield get_status_dict(
                    status='error',
                    message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                    action='aggregate_metadata',
                    path=aggsrc,
                    logger=lgr)

    # at this point we have dumped all aggregated metadata into object files
    # somewhere, we know what needs saving, but have not saved anything yet, and
    # we know about the states of all aggregated datasets in the DB
    # what remains to do is to update all datasets, so they have their own copy
    # of aggregated metadata and update their respective aggregate.json with
    # info on what states we just aggregated from

    # first, let's figure out which datasets need updating at all
    # get adjacency info of the dataset tree spanning the base to all leaf
    # datasets associated with the path arguments
    if update_mode == 'all':
        ds_adj = {}
        discover_dataset_trace_to_targets(
            ds.path, to_aggregate, [], ds_adj,
            # we know that to_aggregate only lists datasets, existing and
            # absent ones -- we want to aggregate all of them, either from
            # just extracted metadata, or from previously aggregated metadata
            # of the closest superdataset
            includeds=to_aggregate)
        # TODO we need to work in the info about datasets that we only got from
        # aggregated metadata, that had no trace on the file system, in here!!
        subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
    elif update_mode == 'target':
        subtrees = {ds.path: list(agginfo_db.keys())}
    else:
        raise ValueError(
            "unknown `update_mode` '%s' for metadata aggregation"
            % update_mode)

    # go over datasets in bottom-up fashion
    for parentds_path in sorted(subtrees, reverse=True):
        lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)
        _update_ds_agginfo(
            ds.path,
            parentds_path,
            subtrees[parentds_path],
            incremental,
            agginfo_db,
            to_save)
        # update complete
        res = get_status_dict(
            status='ok',
            action='aggregate_metadata',
            path=parentds_path,
            type='dataset',
            logger=lgr)
        res.update(agginfo_db.get(parentds_path, {}))
        yield res

    #
    # save potential modifications to dataset global metadata
    #
    if not to_save:
        return
    lgr.info('Attempting to save %i files/datasets', len(to_save))
    for res in Save.__call__(
            # save does not need any pre-annotated path hints
            path=[r['path'] for r in to_save],
            dataset=refds_path,
            message='[DATALAD] Dataset aggregate metadata update',
            return_type='generator',
            result_renderer='disabled',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res

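# -- Illustration (not part of the original module) -------------------------
# The "bottom-up" loop above leans on a property of absolute paths: a
# subdataset path always has its superdataset path as a prefix, so it sorts
# after it lexicographically, and sorted(..., reverse=True) therefore visits
# every subdataset before its superdataset. A tiny standalone demo (the paths
# are made up for the example):
example_subtrees = {
    '/data/super': ['/data/super/sub-a', '/data/super/sub-b'],
    '/data/super/sub-a': [],
    '/data/super/sub-b': ['/data/super/sub-b/grandchild'],
    '/data/super/sub-b/grandchild': [],
}
for parentds_path in sorted(example_subtrees, reverse=True):
    # deepest datasets are printed first, '/data/super' comes last
    print(parentds_path)
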
def _update_ds_agginfo(refds_path, ds_path, subds_paths, incremental, agginfo_db, to_save):
    """Perform metadata aggregation for ds and a given list of subdataset paths

    Parameters
    ----------
    refds_path : str
      Absolute path to the reference dataset that aggregate_metadata() was
      called on.
    ds_path : str
      Absolute path to the dataset to have its aggregate info updated
    subds_paths : list(str)
      Sequence of absolute paths of subdatasets of the to-be-updated dataset,
      whose agginfo shall be updated within the to-be-updated dataset.
      Any subdataset that is not listed here is assumed to be gone (i.e. no
      longer a subdataset at all, not just not locally installed)
    incremental : bool
      If set, the update will not remove any information on datasets not
      listed in subds_paths
    agginfo_db : dict
      Dictionary with all information on aggregate metadata on all datasets.
      Keys are absolute paths of datasets.
    to_save : list
      List of paths to save eventually. This function will add new paths as
      necessary.
    """
    ds = Dataset(ds_path)
    # load existing aggregate info dict
    # makes sure all file/dataset paths become absolute
    # TODO take from cache, once used in _get_dsinfo_from_aggmetadata()
    agginfo_fpath, agg_base_path = get_ds_aggregate_db_locations(ds)
    ds_agginfos = load_ds_aggregate_db(ds, abspath=True)
    # object locations referenced initially
    objlocs_was = set(ai[k]
                      for ai in ds_agginfos.values()
                      for k in location_keys
                      if k in ai)
    # track which objects need to be copied (each item is a from/to tuple)
    objs2copy = []
    # for each subdataset (any depth level)
    procds_paths = [ds.path] + subds_paths
    for dpath in procds_paths:
        ds_dbinfo = agginfo_db.get(dpath, {}).copy()
        # relative path of the current dataset within the dataset we are updating
        drelpath = op.relpath(dpath, start=ds.path)
        for loclabel in location_keys:
            # TODO filepath_info is obsolete
            if loclabel == 'filepath_info' and drelpath == op.curdir:
                # do not write a file list into the dataset it is from
                if 'filepath_info' in ds_dbinfo:
                    del ds_dbinfo['filepath_info']
                continue
            # abspath to object
            objloc = ds_dbinfo.get(loclabel, None)
            if objloc is None:
                continue
            # XXX needs to change when layout of object store is changed
            # current is ./datalad/metadata/objects/{hash}/{hash}
            target_objpath = op.join(agg_base_path, *objloc.split(os.sep)[-3:])
            # make sure we copy the file from its current location to where it is
            # needed in this dataset
            objs2copy.append((
                # this needs to turn into an absolute path
                # `dpath` will be relative to the reference dataset
                #op.normpath(op.join(ds.path, dpath, op.dirname(agginfo_relpath), objloc)),
                objloc,
                target_objpath))
            # now build needed local relpath
            ds_dbinfo[loclabel] = target_objpath
        # (re)assign in case record is new
        ds_agginfos[dpath] = ds_dbinfo
    # remove all entries for which we did not (no longer) have a corresponding
    # subdataset to take care of
    if not incremental:
        ds_agginfos = {k: v
                       for k, v in ds_agginfos.items()
                       if k in procds_paths}
    # set of metadata objects now referenced
    objlocs_is = set(
        ai[k]
        for sdsrpath, ai in ds_agginfos.items()
        for k in location_keys
        if k in ai)
    objs2add = objlocs_is

    # yoh: we apparently do need to filter the ones to remove - I did
    #      "git reset --hard HEAD^" and
    #      aggregate-metadata failed upon next run trying to remove
    #      a file unknown to git. I am yet to figure out why that
    #      mattered (hopefully not that reflog is used somehow)
    objs2remove = []
    for obj in objlocs_was.difference(objlocs_is):
        if op.lexists(obj):
            objs2remove.append(obj)
        else:
            # not really a warning, we don't need it anymore, it is already gone
            lgr.debug(
                "To-be-deleted metadata object not found, skip deletion (%s)",
                obj
            )

    # secretly remove obsolete object files, not really a result from a
    # user's perspective
    if not incremental and objs2remove:
        ds.remove(
            objs2remove,
            # Don't use the misleading default commit message of `remove`:
            message='[DATALAD] Remove obsolete metadata object files',
            # we do not want to drop these files by default, because we would
            # lose them for other branches, and earlier tags
            # TODO evaluate whether this should be exposed as a switch
            # to run an explicit force-drop prior to calling remove()
            reckless='kill',
            result_renderer='disabled',
            return_type='list')
        if not objs2add and not refds_path == ds_path:
            # this is not the base dataset, make sure to save removal in the
            # parentds -- not needed when objects get added, as removal itself
            # is already committed
            to_save.append(dict(path=ds_path, type='dataset', staged=True))

    objs2copy = [(f, t) for f, t in objs2copy if f != t]
    # must copy object files to local target destination
    # make sure those objects are present
    # use the reference dataset to resolve paths, as they might point to
    # any location in the dataset tree
    Dataset(refds_path).get(
        [f for f, t in objs2copy],
        result_renderer='disabled')
    for copy_from, copy_to in objs2copy:
        copy_from = op.join(agg_base_path, copy_from)
        copy_to = op.join(agg_base_path, copy_to)
        target_dir = op.dirname(copy_to)
        if not op.exists(target_dir):
            makedirs(target_dir)
        # TODO we could be more clever (later) and maybe `addurl` (or similar)
        # the file from another dataset
        if op.lexists(copy_to):
            # no need to unlock, just wipe out and replace
            os.remove(copy_to)
        shutil.copy(copy_from, copy_to)
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))

    if objs2add:
        # they are added standard way, depending on the repo type
        ds.repo.add([op.join(agg_base_path, p) for p in objs2add])
        # queue for save, and mark as staged
        to_save.extend(
            [dict(path=op.join(agg_base_path, p), type='file', staged=True)
             for p in objs2add])
    # write aggregate info file
    if not ds_agginfos:
        return

    _store_agginfo_db(ds, ds_agginfos)
    ds.repo.add(agginfo_fpath, git=True)
    # queue for save, and mark as staged
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))

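# -- Illustration (not part of the original module) -------------------------
# _update_ds_agginfo() above plans object-file updates with plain set
# arithmetic: every location referenced by the updated DB gets (re)added, and
# locations that were referenced before but no longer are become removal
# candidates, filtered by whether they still exist on disk. A small,
# self-contained sketch of that bookkeeping (paths are illustrative):
import os.path as op


def plan_object_updates(objlocs_was, objlocs_is):
    """Return (objs2add, objs2remove) mirroring the logic above."""
    objs2add = set(objlocs_is)
    objs2remove = [obj for obj in objlocs_was - objs2add if op.lexists(obj)]
    return objs2add, objs2remove


objs2add, objs2remove = plan_object_updates(
    objlocs_was={'/ds/.datalad/metadata/objects/aa/1',
                 '/ds/.datalad/metadata/objects/bb/2'},
    objlocs_is={'/ds/.datalad/metadata/objects/aa/1',
                '/ds/.datalad/metadata/objects/cc/3'})
# objs2add    -> both locations referenced by the new DB
# objs2remove -> ['/ds/.datalad/metadata/objects/bb/2'] only if that path
#                still exists on the local filesystem
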
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content,
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes(
        [('metadata/objects/**', {'annex.largefiles': '(nothing)'})],
        dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes", files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    # all commits out there:
    # dataset init, crawler init
    #   (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    #   - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    #   2*remove of obsolete metadata object files,
    #     see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])
    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents),
    #    (commits_l['incoming-processed'][1],
    #     commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents),
    #    (commits_l['incoming-processed'][3],  # also in master
    #     commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents), (commits_l['master'][2],
    #                                             commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents), (commits_l['master'][4],
    #                                             commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the
    # number of created object files in addition to that comparison
    eq_(
        set([f for f in all_files
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since there is no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')

    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the
    # number of created object files in addition to that comparison
    eq_(
        set([f for f in all_files_updated
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }
    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'], ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(
        total_stats,
        ActivityStats(
            files=8, skipped=5, downloaded=1, renamed=1, urls=6,
            add_annex=2,  # add_git=1,  # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely
            # in incoming-processed and merged into master -- new commits will come
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository", cml.out)
    eq_(len(out), 1)

    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)

    commits_hexsha_removed = {
        b: list(_get_branch_commits(repo, b)) for b in branches
    }
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming), 2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(
        set(dincoming.keys()),
        {
            repo.pathobj / '.datalad/crawl/statuses/incoming.json',
            repo.pathobj / 'ds666_R1.0.0.tar.gz'
        })
    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as well
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))

def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        update_mode='target',
        incremental=False,
        force_extraction=False,
        save=True):
    refds_path = Interface.get_refds_path(dataset)

    # it really doesn't work without a dataset
    ds = require_dataset(
        dataset, check_installed=True, purpose='metadata aggregation')
    path = assure_list(path)
    if not path:
        # then current/reference dataset is "aggregated"
        # We should not add ds.path always since then --recursive would
        # also recurse current even if paths are given
        path.append(ds.path)

    agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(ds)
    agginfo_db = load_ds_aggregate_db(ds, abspath=True)

    to_save = []
    to_aggregate = set()
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='aggregate_metadata',
            # uninstalled subdatasets could be queried via aggregated metadata
            # -> no 'error'
            unavailable_path_status='',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        ap_type = ap.get('type', None)
        ap_state = ap.get('state', None)
        assert('parentds' in ap or ap_type == 'dataset')
        if ap_type == 'dataset' and ap_state != 'absent':
            # a present dataset, we can take directly from it
            aggsrc = ap['path']
            lgr.info('Aggregate metadata for dataset %s', aggsrc)
        else:
            # everything else needs to come from the parent
            aggsrc = ap['parentds']
            if ap_state == 'absent':
                lgr.info(
                    'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                    ap['path'], aggsrc)
            else:
                lgr.info(
                    'Aggregate metadata for %s from dataset at %s',
                    ap['path'], aggsrc)
        to_aggregate.add(aggsrc)

        if ap_state == 'absent':
            # key thought: recursive is done by path annotation, hence
            # once we hit an absent dataset, we are 100% certain that
            # there is nothing to recurse into on the file system
            # hence we only have to look into the aggregated metadata
            # of the last available dataset in the dataset tree edge
            #
            # if there is nothing at this path, we need to look into the
            # parentds and check if we know anything about this path
            # if we do, we need to grab all the info and objects
            # if not, we need to error
            res = _get_dsinfo_from_aggmetadata(
                aggsrc, ap['path'], recursive, agginfo_db)
            if not isinstance(res, list):
                yield get_status_dict(
                    status='impossible',
                    message=res,
                    action='aggregate_metadata',
                    path=ap['path'],
                    logger=lgr)
                continue
            # cue for aggregation
            to_aggregate.update(res)
        else:
            # actually aggregate metadata for this dataset, immediately place
            # generated objects into the aggregated or reference dataset,
            # and put info into DB to get it distributed to all datasets
            # that need to be updated
            errored = _dump_extracted_metadata(
                ds,
                Dataset(aggsrc),
                agginfo_db,
                to_save,
                force_extraction,
                agg_base_path)
            if errored:
                yield get_status_dict(
                    status='error',
                    message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                    action='aggregate_metadata',
                    path=aggsrc,
                    logger=lgr)

    # at this point we have dumped all aggregated metadata into object files
    # somewhere, we know what needs saving, but have not saved anything yet, and
    # we know about the states of all aggregated datasets in the DB
    # what remains to do is to update all datasets, so they have their own copy
    # of aggregated metadata and update their respective aggregate.json with
    # info on what states we just aggregated from

    # first, let's figure out which datasets need updating at all
    # get adjacency info of the dataset tree spanning the base to all leaf
    # datasets associated with the path arguments
    if update_mode == 'all':
        ds_adj = {}
        discover_dataset_trace_to_targets(
            ds.path, to_aggregate, [], ds_adj,
            # we know that to_aggregate only lists datasets, existing and
            # absent ones -- we want to aggregate all of them, either from
            # just extracted metadata, or from previously aggregated metadata
            # of the closest superdataset
            includeds=to_aggregate)
        # TODO we need to work in the info about datasets that we only got from
        # aggregated metadata, that had no trace on the file system, in here!!
        subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
    elif update_mode == 'target':
        subtrees = {ds.path: list(agginfo_db.keys())}
    else:
        raise ValueError(
            "unknown `update_mode` '%s' for metadata aggregation"
            % update_mode)

    # go over datasets in bottom-up fashion
    for parentds_path in sorted(subtrees, reverse=True):
        lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)
        _update_ds_agginfo(
            ds.path,
            parentds_path,
            subtrees[parentds_path],
            incremental,
            agginfo_db,
            to_save)
        # update complete
        res = get_status_dict(
            status='ok',
            action='aggregate_metadata',
            path=parentds_path,
            type='dataset',
            logger=lgr)
        res.update(agginfo_db.get(parentds_path, {}))
        yield res

    #
    # save potential modifications to dataset global metadata
    #
    if not to_save:
        return
    lgr.info('Attempting to save %i files/datasets', len(to_save))
    for res in Save.__call__(
            path=to_save,
            dataset=refds_path,
            message='[DATALAD] Dataset aggregate metadata update',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res

def _update_ds_agginfo(refds_path, ds_path, subds_paths, incremental, agginfo_db, to_save):
    """Perform metadata aggregation for ds and a given list of subdataset paths

    Parameters
    ----------
    refds_path : str
      Absolute path to the reference dataset that aggregate_metadata() was
      called on.
    ds_path : str
      Absolute path to the dataset to have its aggregate info updated
    subds_paths : list(str)
      Sequence of absolute paths of subdatasets of the to-be-updated dataset,
      whose agginfo shall be updated within the to-be-updated dataset.
      Any subdataset that is not listed here is assumed to be gone (i.e. no
      longer a subdataset at all, not just not locally installed)
    incremental : bool
      If set, the update will not remove any information on datasets not
      listed in subds_paths
    agginfo_db : dict
      Dictionary with all information on aggregate metadata on all datasets.
      Keys are absolute paths of datasets.
    to_save : list
      List of paths to save eventually. This function will add new paths as
      necessary.
    """
    ds = Dataset(ds_path)
    # load existing aggregate info dict
    # makes sure all file/dataset paths become absolute
    # TODO take from cache, once used in _get_dsinfo_from_aggmetadata()
    agginfo_fpath, agg_base_path = get_ds_aggregate_db_locations(ds)
    ds_agginfos = load_ds_aggregate_db(ds, abspath=True)
    # object locations referenced initially
    objlocs_was = set(ai[k]
                      for ai in ds_agginfos.values()
                      for k in location_keys
                      if k in ai)
    # track which objects need to be copied (each item is a from/to tuple)
    objs2copy = []
    # for each subdataset (any depth level)
    procds_paths = [ds.path] + subds_paths
    for dpath in procds_paths:
        ds_dbinfo = agginfo_db.get(dpath, {}).copy()
        # relative path of the current dataset within the dataset we are updating
        drelpath = op.relpath(dpath, start=ds.path)
        for loclabel in location_keys:
            # TODO filepath_info is obsolete
            if loclabel == 'filepath_info' and drelpath == op.curdir:
                # do not write a file list into the dataset it is from
                if 'filepath_info' in ds_dbinfo:
                    del ds_dbinfo['filepath_info']
                continue
            # abspath to object
            objloc = ds_dbinfo.get(loclabel, None)
            if objloc is None:
                continue
            # XXX needs to change when layout of object store is changed
            # current is ./datalad/metadata/objects/{hash}/{hash}
            target_objpath = op.join(agg_base_path, *objloc.split(os.sep)[-3:])
            # make sure we copy the file from its current location to where it is
            # needed in this dataset
            objs2copy.append((
                # this needs to turn into an absolute path
                # `dpath` will be relative to the reference dataset
                #op.normpath(op.join(ds.path, dpath, op.dirname(agginfo_relpath), objloc)),
                objloc,
                target_objpath))
            # now build needed local relpath
            ds_dbinfo[loclabel] = target_objpath
        # (re)assign in case record is new
        ds_agginfos[dpath] = ds_dbinfo
    # remove all entries for which we did not (no longer) have a corresponding
    # subdataset to take care of
    if not incremental:
        ds_agginfos = {k: v
                       for k, v in ds_agginfos.items()
                       if k in procds_paths}
    # set of metadata objects now referenced
    objlocs_is = set(
        ai[k]
        for sdsrpath, ai in ds_agginfos.items()
        for k in location_keys
        if k in ai)
    objs2add = objlocs_is

    # yoh: we apparently do need to filter the ones to remove - I did
    #      "git reset --hard HEAD^" and
    #      aggregate-metadata failed upon next run trying to remove
    #      a file unknown to git. I am yet to figure out why that
    #      mattered (hopefully not that reflog is used somehow)
    objs2remove = []
    for obj in objlocs_was.difference(objlocs_is):
        if op.lexists(obj):
            objs2remove.append(obj)
        else:
            # not really a warning, we don't need it anymore, it is already gone
            lgr.debug(
                "To-be-deleted metadata object not found, skip deletion (%s)",
                obj
            )

    # secretly remove obsolete object files, not really a result from a
    # user's perspective
    if not incremental and objs2remove:
        ds.remove(
            objs2remove,
            # Don't use the misleading default commit message of `remove`:
            message='[DATALAD] Remove obsolete metadata object files',
            # we do not want to drop these files by default, because we would
            # lose them for other branches, and earlier tags
            # TODO evaluate whether this should be exposed as a switch
            # to run an explicit force-drop prior to calling remove()
            check=False,
            result_renderer=None, return_type=list)
        if not objs2add and not refds_path == ds_path:
            # this is not the base dataset, make sure to save removal in the
            # parentds -- not needed when objects get added, as removal itself
            # is already committed
            to_save.append(dict(path=ds_path, type='dataset', staged=True))

    objs2copy = [(f, t) for f, t in objs2copy if f != t]
    # must copy object files to local target destination
    # make sure those objects are present
    # use the reference dataset to resolve paths, as they might point to
    # any location in the dataset tree
    Dataset(refds_path).get(
        [f for f, t in objs2copy],
        result_renderer='disabled')
    for copy_from, copy_to in objs2copy:
        copy_from = op.join(agg_base_path, copy_from)
        copy_to = op.join(agg_base_path, copy_to)
        target_dir = op.dirname(copy_to)
        if not op.exists(target_dir):
            makedirs(target_dir)
        # TODO we could be more clever (later) and maybe `addurl` (or similar)
        # the file from another dataset
        if op.lexists(copy_to):
            # no need to unlock, just wipe out and replace
            os.remove(copy_to)
        shutil.copy(copy_from, copy_to)
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))

    if objs2add:
        # they are added standard way, depending on the repo type
        ds.add(
            [op.join(agg_base_path, p) for p in objs2add],
            save=False, result_renderer=None, return_type=list)
        # queue for save, and mark as staged
        to_save.extend(
            [dict(path=op.join(agg_base_path, p), type='file', staged=True)
             for p in objs2add])
    # write aggregate info file
    if not ds_agginfos:
        return

    _store_agginfo_db(ds, ds_agginfos)
    ds.add(agginfo_fpath, save=False, to_git=True,
           result_renderer=None, return_type=list)
    # queue for save, and mark as staged
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))