def __call__(path=None,
             message=None,
             dataset=None,
             version_tag=None,
             recursive=False,
             recursion_limit=None,
             updated=False,
             message_file=None,
             to_git=None,
             ):
    if message and message_file:
        raise ValueError(
            "Both a message and message file were specified for save()")

    path = assure_list(path)

    if message_file:
        with open(message_file) as mfh:
            message = mfh.read()

    # we want 'normal' to achieve the most compact argument list
    # for git calls
    # untracked_mode = 'no' if updated else 'normal'
    # TODO however, Repo.add() would refuse to add any dotfiles
    # in a directory that is itself untracked, hence the only
    # choice is to go with potentially crazy long lists
    # until https://github.com/datalad/datalad/issues/1454
    # has a resolution
    untracked_mode = 'no' if updated else 'all'

    # there are three basic scenarios:
    # 1. save modifications to any already tracked content
    # 2. save any content (including removal of deleted content)
    #    to bring things to a clean state
    # 3. like (2), but only operate on a given subset of content
    #    identified by paths
    # - all three have to work in conjunction with --recursive
    # - the difference between (1) and (2) should be no more
    #   than a switch from --untracked=no to --untracked=all
    #   in Repo.save()

    # we do not support
    # - simultaneous operations on multiple datasets from disjoint
    #   dataset hierarchies, hence a single reference dataset must be
    #   identifiable from either
    #   - curdir or
    #   - the `dataset` argument.
    #   This avoids complex annotation loops and hierarchy tracking.
    # - any modification upwards from the root dataset

    ds = require_dataset(dataset, check_installed=True, purpose='saving')

    # use status() to do all discovery and annotation of paths
    paths_by_ds = {}
    for s in Status()(
            # ATTN: it is vital to pass the `dataset` argument as is,
            # and not a dataset instance, in order to maintain the path
            # semantics between here and the status() call
            dataset=dataset,
            path=path,
            untracked=untracked_mode,
            recursive=recursive,
            recursion_limit=recursion_limit,
            result_renderer='disabled'):
        # fish out status dict for this parent dataset
        ds_status = paths_by_ds.get(s['parentds'], {})
        # reassemble path status info as repo.status() would have made it
        ds_status[ut.Path(s['path'])] = \
            {k: v for k, v in iteritems(s)
             if k not in (
                 'path', 'parentds', 'refds', 'status', 'action',
                 'logger')}
        paths_by_ds[s['parentds']] = ds_status

    lgr.debug('Determined %i datasets for saving from input arguments',
              len(paths_by_ds))
    # figure out what datasets to process, start with the ones containing
    # the paths that were given as arguments
    discovered_datasets = list(paths_by_ds.keys())
    if dataset:
        # if a reference dataset was given we want to save all the way up
        # to it, so let's throw it into the mix
        discovered_datasets.append(ds.path)
    # sort the datasets into (potentially) disjoint hierarchies,
    # or a single one, if a reference dataset was given
    dataset_hierarchies = get_tree_roots(discovered_datasets)
    for rootds, children in iteritems(dataset_hierarchies):
        edges = {}
        discover_dataset_trace_to_targets(
            rootds, children, [], edges, includeds=children)
        for superds, subdss in iteritems(edges):
            superds_status = paths_by_ds.get(superds, {})
            for subds in subdss:
                # TODO actually start from an entry that may already
                # exist in the status record
                superds_status[ut.Path(subds)] = dict(
                    # shot from the hip, some status config
                    # to trigger this specific super/sub
                    # relation to be saved
                    state='untracked',
                    type='dataset')
            paths_by_ds[superds] = superds_status

    # TODO parallelize, whenever we have multiple subdatasets of a single
    # dataset they can all be processed simultaneously
    # sort list of datasets to handle, starting with the ones deep down
    for pdspath in sorted(paths_by_ds, reverse=True):
        pds = Dataset(pdspath)
        # pop status for this dataset, we are not coming back to it
        pds_status = {
            # for handing over to the low-level code, we recode any
            # path relative to the real repo location, this avoids
            # cumbersome symlink handling without context in the
            # lower levels
            pds.repo.pathobj / p.relative_to(pdspath): props
            for p, props in iteritems(paths_by_ds.pop(pdspath))
        }
        start_commit = pds.repo.get_hexsha()
        if not all(p['state'] == 'clean' for p in pds_status.values()):
            for res in pds.repo.save_(
                    message=message,
                    # make sure to have the `path` arg be None, as we want
                    # to prevent and bypass any additional repo.status()
                    # calls
                    paths=None,
                    # prevent whining of GitRepo
                    git=True
                    if not hasattr(ds.repo, 'annexstatus')
                    else to_git,
                    # we are supplying the full status already, do not
                    # detect anything else
                    untracked='no',
                    _status=pds_status):
                # TODO remove stringification when datalad-core can handle
                # path objects, or when PY3.6 is the lowest supported
                # version
                for k in ('path', 'refds'):
                    if k in res:
                        res[k] = str(
                            # recode path back to dataset path anchor
                            pds.pathobj / res[k].relative_to(
                                pds.repo.pathobj))
                yield res
        # report on the dataset itself
        dsres = dict(
            action='save',
            type='dataset',
            path=pds.path,
            refds=ds.path,
            status='ok'
            if start_commit != pds.repo.get_hexsha()
            else 'notneeded',
            logger=lgr,
        )
        if not version_tag:
            yield dsres
            continue
        try:
            pds.repo.tag(version_tag)
            dsres.update(status='ok', version_tag=version_tag)
            yield dsres
        except CommandError as e:
            if dsres['status'] == 'ok':
                # first we yield the result for the actual save
                yield dsres.copy()
            # and now complain that tagging didn't work
            dsres.update(
                status='error',
                message=('cannot tag this version: %s',
                         e.stderr.strip()))
            yield dsres
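# --- Illustration (not part of the original implementation) ---
# A minimal, self-contained sketch of the hierarchy grouping the code
# above relies on. `_tree_roots` is a hypothetical stand-in for
# datalad's `get_tree_roots` helper, written only to show the assumed
# semantics: map each topmost dataset path to the paths underneath it.
def _tree_roots(paths):
    """Group `paths` under their topmost ancestors (illustration only)."""
    roots = {}
    for p in sorted(paths):
        # the shortest already-seen path that prefixes `p` is its root
        root = next(
            (r for r in roots
             if p == r or p.startswith(r.rstrip('/') + '/')),
            None)
        if root is None:
            roots[p] = []
        else:
            roots[root].append(p)
    return roots

# two disjoint hierarchies stay apart, nested paths collapse to one root
print(_tree_roots(['/a', '/a/sub1', '/a/sub1/subsub', '/b']))
# -> {'/a': ['/a/sub1', '/a/sub1/subsub'], '/b': []}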
def __call__(message=None, files=None, dataset=None,
             all_updated=True, all_changes=None, version_tag=None,
             recursive=False, recursion_limit=None,
             super_datasets=False):
    if all_changes is not None:
        from datalad.support.exceptions import DeprecatedError
        raise DeprecatedError(
            new="all_updated option where fits and/or datalad add",
            version="0.5.0",
            msg="RF: all_changes option passed to the save")
    if not dataset and not files:
        # we got nothing at all -> save what is staged in the repo in
        # "this" directory? we verify next that there is an actual repo
        dataset = abspath(curdir)
    refds_path = Interface.get_refds_path(dataset)

    to_process = []
    for ap in AnnotatePaths.__call__(
            path=files,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='save',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        # the next check should not be done during annotation, as it is
        # possibly expensive and not generally useful
        if ap.get('status', None) == 'impossible' and \
                ap.get('state', None) == 'absent' and \
                ap.get('parentds', None):
            # this is not here anymore, but it might actually have been
            # a deleted component
            if relpath(ap['path'], start=ap['parentds']) \
                    in Dataset(ap['parentds']).repo.get_deleted_files():
                # ok, this is a staged deletion that we want to save
                ap['status'] = ''
                del ap['message']
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        # for things like: `ds.save()`
        # or recursively discovered datasets
        if ap['path'] == refds_path or \
                (ap.get('type', None) == 'dataset' and
                 not ap.get('raw_input', False) and
                 not ap.get('state', None) == 'absent'):
            ap['process_content'] = True
            ap['process_updated_only'] = all_updated
        to_process.append(ap)

    if not to_process:
        # nothing left to do, potentially all errored before
        return

    if super_datasets:
        # search for the topmost superdatasets of any path
        dss = [
            Dataset(ap.get('parentds', ap['path']))
            for ap in to_process
        ]
        superdss = [ds.get_superdataset(topmost=True) for ds in dss]
        superdss = get_tree_roots(
            unique(ds.path for ds in dss + superdss if ds))
        if dataset:
            # need to adjust the reference to the new superds
            # if we had one ref before, we should still have exactly one
            assert len(superdss) <= 1
            dataset = list(superdss.keys())[0]
            refds_path = dataset
    elif refds_path:
        # there is a single superdataset
        superdss = {
            refds_path: unique(
                [ap['parentds'] for ap in to_process
                 if 'parentds' in ap])
        }
    else:
        # sort all datasets under their potential superdatasets
        # start from the top to get all subdatasets down the line
        # and collate them into as few superdatasets as possible
        # this is quick, just string operations
        superdss = get_tree_roots(
            unique(
                [ap['parentds'] for ap in to_process
                 if 'parentds' in ap]))

    # for each "superdataset" check the tree of subdatasets and make sure
    # we gather all datasets between the super and any subdataset,
    # so we can save them all bottom-up in order to be able to properly
    # save the superdataset
    # if this is called from e.g. `add` this is actually not necessary,
    # but in the general case we cannot avoid it
    # TODO maybe introduce a switch?
    discovered = {}
    for superds_path in superdss:
        target_subs = superdss[superds_path]
        discover_dataset_trace_to_targets(
            # from here
            superds_path,
            # to all
            target_subs,
            [],
            discovered)
    # create a new minimally annotated path for each discovered dataset
    discovered_added = set()
    for parentds in discovered:
        for subds in discovered[parentds]:
            to_process.append(
                dict(path=subds, parentds=parentds, type='dataset'))
            discovered_added.add(subds)
    # make sure we have an entry for each dataset, including those
    # that are just parents
    for parentds in discovered:
        if parentds not in discovered_added:
            to_process.append(
                dict(
                    path=parentds,
                    type='dataset',
                    # make sure we save content of superds later on
                    process_content=True))

    # now re-annotate all paths, this will be fast for already annotated
    # ones and will amend the annotation for others, deduplication
    # happens here too
    annotated_paths = AnnotatePaths.__call__(
        path=to_process,
        dataset=dataset,
        # no recursion, done already
        recursive=False,
        action='save',
        unavailable_path_status='',
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    # now sort into datasets so we can process them one by one
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path,
            path_only=False)
    assert not completed

    # iterate over all datasets, starting at the bottom
    for dspath in sorted(content_by_ds.keys(), reverse=True):
        ds = Dataset(dspath)
        res = get_status_dict('save', ds=ds, logger=lgr)
        if not ds.is_installed():
            # TODO This is likely impossible now
            res['status'] = 'impossible'
            res['message'] = ('dataset %s is not installed', ds)
            yield res
            continue
        saved_state = save_dataset(
            ds,
            content_by_ds[dspath],
            message=message,
            version_tag=version_tag)
        if saved_state:
            res['status'] = 'ok'
        else:
            res['status'] = 'notneeded'
        yield res
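# --- Illustration (not part of the original implementation) ---
# Hypothetical sketch of the "trace discovery" step used above: given a
# superdataset and a set of target subdatasets, record every dataset on
# the path between them so saving can proceed bottom-up. The real
# `discover_dataset_trace_to_targets` discovers subdatasets on disk;
# here a static `subdatasets` mapping keeps the sketch self-contained.
def _trace_to_targets(super_path, targets, subdatasets, discovered):
    """Record super -> sub edges from `super_path` down to `targets`."""
    hits = []
    for sub in subdatasets.get(super_path, []):
        if sub in targets or _trace_to_targets(
                sub, targets, subdatasets, discovered):
            discovered.setdefault(super_path, set()).add(sub)
            hits.append(sub)
    return hits

subs = {'/top': ['/top/mid'], '/top/mid': ['/top/mid/leaf']}
edges = {}
_trace_to_targets('/top', {'/top/mid/leaf'}, subs, edges)
print(edges)  # {'/top/mid': {'/top/mid/leaf'}, '/top': {'/top/mid'}}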
def __call__(path=None,
             message=None,
             dataset=None,
             version_tag=None,
             recursive=False,
             recursion_limit=None,
             updated=False,
             message_file=None,
             to_git=None,
             ):
    if message and message_file:
        raise ValueError(
            "Both a message and message file were specified for save()")

    path = assure_list(path)

    if message_file:
        with open(message_file) as mfh:
            message = mfh.read()

    # we want 'normal' to achieve the most compact argument list
    # for git calls
    # untracked_mode = 'no' if updated else 'normal'
    # TODO however, Repo.add() would refuse to add any dotfiles
    # in a directory that is itself untracked, hence the only
    # choice is to go with potentially crazy long lists
    # until https://github.com/datalad/datalad/issues/1454
    # has a resolution
    untracked_mode = 'no' if updated else 'all'

    # there are three basic scenarios:
    # 1. save modifications to any already tracked content
    # 2. save any content (including removal of deleted content)
    #    to bring things to a clean state
    # 3. like (2), but only operate on a given subset of content
    #    identified by paths
    # - all three have to work in conjunction with --recursive
    # - the difference between (1) and (2) should be no more
    #   than a switch from --untracked=no to --untracked=all
    #   in Repo.save()

    # we do not support
    # - simultaneous operations on multiple datasets from disjoint
    #   dataset hierarchies, hence a single reference dataset must be
    #   identifiable from either
    #   - curdir or
    #   - the `dataset` argument.
    #   This avoids complex annotation loops and hierarchy tracking.
    # - any modification upwards from the root dataset

    ds = require_dataset(dataset, check_installed=True, purpose='saving')

    # use status() to do all discovery and annotation of paths
    paths_by_ds = {}
    for s in Status()(
            # ATTN: it is vital to pass the `dataset` argument as is,
            # and not a dataset instance, in order to maintain the path
            # semantics between here and the status() call
            dataset=dataset,
            path=path,
            untracked=untracked_mode,
            recursive=recursive,
            recursion_limit=recursion_limit,
            result_renderer='disabled'):
        # fish out status dict for this parent dataset
        ds_status = paths_by_ds.get(s['parentds'], {})
        # reassemble path status info as repo.status() would have made it
        ds_status[ut.Path(s['path'])] = \
            {k: v for k, v in iteritems(s)
             if k not in (
                 'path', 'parentds', 'refds', 'status', 'action',
                 'logger')}
        paths_by_ds[s['parentds']] = ds_status

    lgr.debug('Determined %i datasets for saving from input arguments',
              len(paths_by_ds))
    # figure out what datasets to process, start with the ones containing
    # the paths that were given as arguments
    discovered_datasets = list(paths_by_ds.keys())
    if dataset:
        # if a reference dataset was given we want to save all the way up
        # to it, so let's throw it into the mix
        discovered_datasets.append(ds.path)
    # sort the datasets into (potentially) disjoint hierarchies,
    # or a single one, if a reference dataset was given
    dataset_hierarchies = get_tree_roots(discovered_datasets)
    for rootds, children in iteritems(dataset_hierarchies):
        edges = {}
        discover_dataset_trace_to_targets(
            rootds, children, [], edges, includeds=children)
        for superds, subdss in iteritems(edges):
            superds_status = paths_by_ds.get(superds, {})
            for subds in subdss:
                # TODO actually start from an entry that may already
                # exist in the status record
                superds_status[ut.Path(subds)] = dict(
                    # shot from the hip, some status config
                    # to trigger this specific super/sub
                    # relation to be saved
                    state='untracked',
                    type='dataset')
            paths_by_ds[superds] = superds_status

    # TODO parallelize, whenever we have multiple subdatasets of a single
    # dataset they can all be processed simultaneously
    # sort list of datasets to handle, starting with the ones deep down
    for pdspath in sorted(paths_by_ds, reverse=True):
        pds = Dataset(pdspath)
        # pop status for this dataset, we are not coming back to it
        pds_status = {
            # for handing over to the low-level code, we recode any
            # path relative to the real repo location, this avoids
            # cumbersome symlink handling without context in the
            # lower levels
            pds.repo.pathobj / p.relative_to(pdspath): props
            for p, props in iteritems(paths_by_ds.pop(pdspath))}
        start_commit = pds.repo.get_hexsha()
        if not all(p['state'] == 'clean' for p in pds_status.values()):
            for res in pds.repo.save_(
                    message=message,
                    # make sure to have the `path` arg be None, as we want
                    # to prevent and bypass any additional repo.status()
                    # calls
                    paths=None,
                    # prevent whining of GitRepo
                    git=True
                    if not hasattr(ds.repo, 'annexstatus')
                    else to_git,
                    # we are supplying the full status already, do not
                    # detect anything else
                    untracked='no',
                    _status=pds_status):
                # TODO remove stringification when datalad-core can handle
                # path objects, or when PY3.6 is the lowest supported
                # version
                for k in ('path', 'refds'):
                    if k in res:
                        res[k] = text_type(
                            # recode path back to dataset path anchor
                            pds.pathobj / res[k].relative_to(
                                pds.repo.pathobj))
                yield res
        # report on the dataset itself
        dsres = dict(
            action='save',
            type='dataset',
            path=pds.path,
            refds=ds.path,
            status='ok'
            if start_commit != pds.repo.get_hexsha()
            else 'notneeded',
            logger=lgr,
        )
        if not version_tag:
            yield dsres
            continue
        try:
            pds.repo.tag(version_tag)
            dsres.update(
                status='ok',
                version_tag=version_tag)
            yield dsres
        except CommandError as e:
            if dsres['status'] == 'ok':
                # first we yield the result for the actual save
                yield dsres.copy()
            # and now complain that tagging didn't work
            dsres.update(
                status='error',
                message=('cannot tag this version: %s',
                         e.stderr.strip()))
            yield dsres
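# --- Illustration (not part of the original implementation) ---
# The status dicts above recode each path from the dataset anchor to the
# resolved repo location before calling the low-level save, and back
# again for reporting. A pathlib sketch of that round-trip, with made-up
# example paths (the symlinked dataset root is assumed for illustration):
from pathlib import Path

ds_root = Path('/home/me/ds')       # dataset path as given by the user
repo_root = Path('/mnt/store/ds')   # resolved location of the repo
p = Path('/home/me/ds/sub/file.dat')

# recode relative to the real repo location for the low-level call
repo_path = repo_root / p.relative_to(ds_root)
assert repo_path == Path('/mnt/store/ds/sub/file.dat')

# and recode back to the dataset path anchor when yielding results
report_path = ds_root / repo_path.relative_to(repo_root)
assert report_path == p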
def __call__(message=None, path=None, dataset=None,
             all_updated=True, version_tag=None,
             recursive=False, recursion_limit=None,
             super_datasets=False, message_file=None):
    if not dataset and not path:
        # we got nothing at all -> save what is staged in the repo in
        # "this" directory?
        # make sure we don't treat this as a user-provided '.' argument
        path = [{'path': abspath(curdir), 'raw_input': False}]

    refds_path = Interface.get_refds_path(dataset)

    if message and message_file:
        raise ValueError("Both a message and message file were specified")

    if message_file:
        with open(message_file, "rb") as mfh:
            message = assure_unicode(mfh.read())

    to_process = []
    got_nothing = True
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='save',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            modified='HEAD' if not path and recursive else None,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('state', None) == 'untracked' and \
                not ap.get('raw_input', False):
            # this path was found untracked, but not explicitly given
            # to save; we will silently ignore it
            continue
        got_nothing = False
        # the next check should not be done during annotation, as it is
        # possibly expensive and not generally useful
        if ap.get('status', None) == 'impossible' and \
                ap.get('state', None) == 'absent' and \
                ap.get('parentds', None):
            # this is not here anymore, but it might actually have been
            # a deleted component
            if relpath(ap['path'], start=ap['parentds']) \
                    in Dataset(ap['parentds']).repo.get_deleted_files():
                # ok, this is a staged deletion that we want to save
                ap['status'] = ''
                del ap['message']
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        # for things like: `ds.save()`
        # or recursively discovered datasets
        if ap['path'] == refds_path or \
                (ap.get('type', None) == 'dataset' and
                 not ap.get('raw_input', False) and
                 not ap.get('state', None) == 'absent'):
            ap['process_content'] = True
            ap['process_updated_only'] = all_updated
        to_process.append(ap)
    lgr.log(2, "save, to_process=%r", to_process)
    if got_nothing and recursive and refds_path:
        # path annotation yielded nothing, most likely cause is that
        # nothing was found modified, we need to say something about
        # the reference dataset
        yield get_status_dict(
            'save',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    if not to_process:
        # nothing left to do, potentially all errored before
        return

    if super_datasets:
        # search for the topmost superdatasets of any path
        dss = [Dataset(ap.get('parentds', ap['path']))
               for ap in to_process]
        superdss = [ds.get_superdataset(topmost=True) for ds in dss]
        superdss = get_tree_roots(
            unique(ds.path for ds in dss + superdss if ds))
        if dataset:
            # need to adjust the reference to the new superds
            # if we had one ref before, we should still have exactly one
            assert len(superdss) <= 1
            dataset = list(superdss.keys())[0]
            refds_path = dataset
    elif refds_path:
        # there is a single superdataset
        superdss = {
            refds_path: unique([ap['parentds']
                                for ap in to_process
                                if 'parentds' in ap])}
    else:
        # sort all datasets under their potential superdatasets
        # start from the top to get all subdatasets down the line
        # and collate them into as few superdatasets as possible
        # this is quick, just string operations
        superdss = get_tree_roots(
            unique([ap['parentds']
                    for ap in to_process
                    if 'parentds' in ap]))

    # for each "superdataset" check the tree of subdatasets and make sure
    # we gather all datasets between the super and any subdataset,
    # so we can save them all bottom-up in order to be able to properly
    # save the superdataset
    # if this is called from e.g. `add` this is actually not necessary,
    # but in the general case we cannot avoid it
    # TODO maybe introduce a switch?
    discovered = {}
    for superds_path in superdss:
        target_subs = superdss[superds_path]
        discover_dataset_trace_to_targets(
            # from here
            superds_path,
            # to all
            target_subs,
            [],
            discovered)
    # create a new minimally annotated path for each discovered dataset
    discovered_added = set()
    for parentds in discovered:
        for subds in discovered[parentds]:
            to_process.append(dict(
                path=subds,
                parentds=parentds,
                type='dataset'))
            discovered_added.add(subds)
    # make sure we have an entry for each dataset, including those
    # that are just parents
    for parentds in discovered:
        if parentds not in discovered_added:
            to_process.append(dict(
                path=parentds,
                type='dataset',
                # make sure we save content of superds later on
                process_content=True,
                # but not do nasty things, like adding untracked content
                # just because we discovered this dataset
                process_updated_only=True))

    # now re-annotate all paths, this will be fast for already annotated
    # ones and will amend the annotation for others, deduplication
    # happens here too
    annotated_paths = AnnotatePaths.__call__(
        path=to_process,
        dataset=dataset,
        # no recursion, done already
        recursive=False,
        action='save',
        unavailable_path_status='',
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    # now sort into datasets so we can process them one by one
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert not completed

    # iterate over all datasets, starting at the bottom
    for dspath in sorted(content_by_ds.keys(), reverse=True):
        ds = Dataset(dspath)
        res = get_status_dict('save', ds=ds, logger=lgr)
        if not ds.is_installed():
            # TODO This is likely impossible now
            res['status'] = 'impossible'
            res['message'] = ('dataset %s is not installed', ds)
            yield res
            continue
        saved_state = save_dataset(
            ds,
            content_by_ds[dspath],
            message=message)
        res['status'] = 'ok' if saved_state else 'notneeded'
        # MIH: let's tag even if there was nothing to commit. I'd forget
        # this option too often...
        if version_tag:
            try:
                # TODO: check whether comment below is still true after
                # removing the log swallowing:
                # again cannot help but force-silence low-level code,
                # because it screams like a madman instead of allowing
                # top-level code an orderly error report
                ds.repo.tag(version_tag)
                # even if we haven't saved anything
                res['status'] = 'ok'
                yield res
            except CommandError as e:
                if saved_state:
                    # first we yield the result for the actual save
                    yield res
                # and now complain that tagging didn't work
                yield get_status_dict(
                    'save',
                    ds=ds,
                    logger=lgr,
                    status='error',
                    message=('cannot tag this version: %s',
                             e.stderr.strip()))
        else:
            yield res
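# --- Illustration (not part of the original implementation) ---
# The final loop above iterates "starting at the bottom" by sorting
# dataset paths in reverse lexicographic order: a subdataset path
# extends its superdataset's prefix, so it sorts after the parent and,
# reversed, is processed first. A quick demonstration:
paths = ['/top', '/top/sub', '/top/sub/subsub', '/top/other']
for p in sorted(paths, reverse=True):
    print(p)
# /top/sub/subsub
# /top/sub
# /top/other
# /top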
def __call__(path=None,
             message=None,
             dataset=None,
             version_tag=None,
             recursive=False,
             recursion_limit=None,
             updated=False,
             message_file=None,
             to_git=None,
             jobs=None,
             amend=False,
             ):
    if message and message_file:
        raise ValueError(
            "Both a message and message file were specified for save()")

    if amend and recursive:
        raise ValueError("Cannot amend a commit recursively.")

    path = ensure_list(path)

    if message_file:
        with open(message_file) as mfh:
            message = mfh.read()

    # we want 'normal' to achieve the most compact argument list
    # for git calls
    # untracked_mode = 'no' if updated else 'normal'
    # TODO however, Repo.add() would refuse to add any dotfiles
    # in a directory that is itself untracked, hence the only
    # choice is to go with potentially crazy long lists
    # until https://github.com/datalad/datalad/issues/1454
    # has a resolution
    untracked_mode = 'no' if updated else 'all'

    # there are three basic scenarios:
    # 1. save modifications to any already tracked content
    # 2. save any content (including removal of deleted content)
    #    to bring things to a clean state
    # 3. like (2), but only operate on a given subset of content
    #    identified by paths
    # - all three have to work in conjunction with --recursive
    # - the difference between (1) and (2) should be no more
    #   than a switch from --untracked=no to --untracked=all
    #   in Repo.save()

    # we do not support
    # - simultaneous operations on multiple datasets from disjoint
    #   dataset hierarchies, hence a single reference dataset must be
    #   identifiable from either
    #   - curdir or
    #   - the `dataset` argument.
    #   This avoids complex annotation loops and hierarchy tracking.
    # - any modification upwards from the root dataset

    ds = require_dataset(dataset, check_installed=True, purpose='saving')

    # use status() to do all discovery and annotation of paths
    paths_by_ds = {}
    for s in Status()(
            # ATTN: it is vital to pass the `dataset` argument as is,
            # and not a dataset instance, in order to maintain the path
            # semantics between here and the status() call
            dataset=dataset,
            path=path,
            untracked=untracked_mode,
            report_filetype=False,
            recursive=recursive,
            recursion_limit=recursion_limit,
            on_failure='ignore',
            # for save without recursion only commit matters
            eval_subdataset_state='full' if recursive else 'commit',
            result_renderer='disabled'):
        if s['status'] == 'error':
            # Downstream code can't do anything with these. Let the
            # caller decide their fate.
            yield s
            continue

        # fish out status dict for this parent dataset
        ds_status = paths_by_ds.get(s['parentds'], {})
        # reassemble path status info as repo.status() would have made it
        ds_status[ut.Path(s['path'])] = \
            {k: v for k, v in s.items()
             if k not in (
                 'path', 'parentds', 'refds', 'status', 'action',
                 'logger')}
        paths_by_ds[s['parentds']] = ds_status

    lgr.debug('Determined %i datasets for saving from input arguments',
              len(paths_by_ds))

    # figure out what datasets to process, start with the ones containing
    # the paths that were given as arguments
    discovered_datasets = list(paths_by_ds.keys())
    if dataset:
        # if a reference dataset was given we want to save all the way up
        # to it, so let's throw it into the mix
        discovered_datasets.append(ds.path)
    # sort the datasets into (potentially) disjoint hierarchies,
    # or a single one, if a reference dataset was given
    dataset_hierarchies = get_tree_roots(discovered_datasets)
    for rootds, children in dataset_hierarchies.items():
        edges = {}
        discover_dataset_trace_to_targets(
            rootds, children, [], edges, includeds=children)
        for superds, subdss in edges.items():
            superds_status = paths_by_ds.get(superds, {})
            for subds in subdss:
                subds_path = ut.Path(subds)
                sub_status = superds_status.get(subds_path, {})
                if not (sub_status.get("state") == "clean" and
                        sub_status.get("type") == "dataset"):
                    # TODO actually start from an entry that may already
                    # exist in the status record
                    superds_status[subds_path] = dict(
                        # shot from the hip, some status config
                        # to trigger this specific super/sub
                        # relation to be saved
                        state='untracked',
                        type='dataset')
            paths_by_ds[superds] = superds_status

    def save_ds(args, version_tag=None):
        pdspath, paths = args

        pds = Dataset(pdspath)
        pds_repo = pds.repo
        # pop status for this dataset, we are not coming back to it
        pds_status = {
            # for handing over to the low-level code, we recode any
            # path relative to the real repo location, this avoids
            # cumbersome symlink handling without context in the
            # lower levels
            pds_repo.pathobj / p.relative_to(pdspath): props
            for p, props in paths.items()
        }
        start_commit = pds_repo.get_hexsha()
        if not all(p['state'] == 'clean'
                   for p in pds_status.values()) or \
                (amend and message):
            for res in pds_repo.save_(
                    message=message,
                    # make sure to have the `path` arg be None, as we
                    # want to prevent and bypass any additional
                    # repo.status() calls
                    paths=None,
                    # prevent whining of GitRepo
                    git=True
                    if not hasattr(ds.repo, 'annexstatus')
                    else to_git,
                    # we are supplying the full status already, do not
                    # detect anything else
                    untracked='no',
                    _status=pds_status,
                    amend=amend):
                # TODO remove stringification when datalad-core can
                # handle path objects, or when PY3.6 is the lowest
                # supported version
                for k in ('path', 'refds'):
                    if k in res:
                        res[k] = str(
                            # recode path back to dataset path anchor
                            pds.pathobj / res[k].relative_to(
                                pds_repo.pathobj))
                yield res
        # report on the dataset itself
        dsres = dict(
            action='save',
            type='dataset',
            path=pds.path,
            refds=ds.path,
            status='ok'
            if start_commit != pds_repo.get_hexsha()
            else 'notneeded',
            logger=lgr,
        )
        if not version_tag:
            yield dsres
            return
        try:
            # method requires str
            version_tag = str(version_tag)
            pds_repo.tag(version_tag)
            dsres.update(status='ok', version_tag=version_tag)
            yield dsres
        except CommandError as e:
            if dsres['status'] == 'ok':
                # first we yield the result for the actual save
                # TODO: we will get a duplicate dataset/save record
                # obscuring progress reporting. yoh thought to decouple
                # "tag" from "save" messages but was worrying that
                # original authors would disagree
                yield dsres.copy()
            # and now complain that tagging didn't work
            dsres.update(
                status='error',
                message=('cannot tag this version: %s',
                         e.stderr.strip()))
            yield dsres

    if not paths_by_ds:
        # Special case: empty repo. There's either an empty commit only
        # or none at all. An empty one we can amend, otherwise there's
        # nothing to do.
        if amend and ds.repo.get_hexsha():
            yield from save_ds(
                (ds.pathobj, dict()), version_tag=version_tag)
        else:
            yield dict(
                action='save',
                type='dataset',
                path=ds.path,
                refds=ds.path,
                status='notneeded',
                logger=lgr)
        return

    # TODO: in principle logging could be improved to go not by a dataset
    # but by path(s) within subdatasets. That should provide a bit better
    # ETA and more "dynamic" feedback than a jumpy datasets count.
    # See addurls where it is implemented that way by providing agg and
    # another log_filter
    yield from ProducerConsumerProgressLog(
        sorted(paths_by_ds.items(), key=lambda v: v[0], reverse=True),
        partial(save_ds, version_tag=version_tag),
        safe_to_consume=no_subds_in_futures,
        producer_future_key=lambda ds_items: ds_items[0],
        jobs=jobs,
        log_filter=_log_filter_save_dataset,
        unit="datasets",
        lgr=lgr,
    )
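# --- Illustration (not part of the original implementation) ---
# Hypothetical sketch of the scheduling rule encoded by
# `safe_to_consume=no_subds_in_futures` above: a dataset may only be
# handed to a worker once no subdataset of it is still in flight, so a
# superdataset always sees its subdatasets' final states. The check
# below illustrates that rule; it is not datalad's implementation.
def _no_subds_in_futures(in_flight_paths, ds_path):
    """True if nothing under `ds_path` is still being saved."""
    prefix = ds_path.rstrip('/') + '/'
    return not any(p.startswith(prefix) for p in in_flight_paths)

# /top must wait while its subdataset is being saved...
assert not _no_subds_in_futures({'/top/sub'}, '/top')
# ...but an unrelated dataset can be saved in parallel
assert _no_subds_in_futures({'/top/sub'}, '/other')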
def __call__(message=None, path=None, dataset=None,
             all_updated=True, version_tag=None,
             recursive=False, recursion_limit=None,
             super_datasets=False, message_file=None):
    if not dataset and not path:
        # we got nothing at all -> save what is staged in the repo in
        # "this" directory?
        # make sure we don't treat this as a user-provided '.' argument
        path = [{'path': abspath(curdir), 'raw_input': False}]

    refds_path = Interface.get_refds_path(dataset)

    if message and message_file:
        yield get_status_dict(
            'save',
            status='error',
            path=refds_path,
            message="Both a message and message file were specified",
            logger=lgr)
        return

    if message_file:
        with open(message_file) as mfh:
            message = mfh.read()

    to_process = []
    got_nothing = True
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='save',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            modified='HEAD' if not path and recursive else None,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('state', None) == 'untracked' and \
                not ap.get('raw_input', False):
            # this path was found untracked, but not explicitly given
            # to save; we will silently ignore it
            continue
        got_nothing = False
        # the next check should not be done during annotation, as it is
        # possibly expensive and not generally useful
        if ap.get('status', None) == 'impossible' and \
                ap.get('state', None) == 'absent' and \
                ap.get('parentds', None):
            # this is not here anymore, but it might actually have been
            # a deleted component
            if relpath(ap['path'], start=ap['parentds']) \
                    in Dataset(ap['parentds']).repo.get_deleted_files():
                # ok, this is a staged deletion that we want to save
                ap['status'] = ''
                del ap['message']
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        # for things like: `ds.save()`
        # or recursively discovered datasets
        if ap['path'] == refds_path or \
                (ap.get('type', None) == 'dataset' and
                 not ap.get('raw_input', False) and
                 not ap.get('state', None) == 'absent'):
            ap['process_content'] = True
            ap['process_updated_only'] = all_updated
        to_process.append(ap)
    lgr.log(2, "save, to_process=%r", to_process)
    if got_nothing and recursive and refds_path:
        # path annotation yielded nothing, most likely cause is that
        # nothing was found modified, we need to say something about
        # the reference dataset
        yield get_status_dict(
            'save',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    if not to_process:
        # nothing left to do, potentially all errored before
        return

    if super_datasets:
        # search for the topmost superdatasets of any path
        dss = [
            Dataset(ap.get('parentds', ap['path']))
            for ap in to_process
        ]
        superdss = [ds.get_superdataset(topmost=True) for ds in dss]
        superdss = get_tree_roots(
            unique(ds.path for ds in dss + superdss if ds))
        if dataset:
            # need to adjust the reference to the new superds
            # if we had one ref before, we should still have exactly one
            assert len(superdss) <= 1
            dataset = list(superdss.keys())[0]
            refds_path = dataset
    elif refds_path:
        # there is a single superdataset
        superdss = {
            refds_path: unique(
                [ap['parentds'] for ap in to_process
                 if 'parentds' in ap])
        }
    else:
        # sort all datasets under their potential superdatasets
        # start from the top to get all subdatasets down the line
        # and collate them into as few superdatasets as possible
        # this is quick, just string operations
        superdss = get_tree_roots(
            unique(
                [ap['parentds'] for ap in to_process
                 if 'parentds' in ap]))

    # for each "superdataset" check the tree of subdatasets and make sure
    # we gather all datasets between the super and any subdataset,
    # so we can save them all bottom-up in order to be able to properly
    # save the superdataset
    # if this is called from e.g. `add` this is actually not necessary,
    # but in the general case we cannot avoid it
    # TODO maybe introduce a switch?
    discovered = {}
    for superds_path in superdss:
        target_subs = superdss[superds_path]
        discover_dataset_trace_to_targets(
            # from here
            superds_path,
            # to all
            target_subs,
            [],
            discovered)
    # create a new minimally annotated path for each discovered dataset
    discovered_added = set()
    for parentds in discovered:
        for subds in discovered[parentds]:
            to_process.append(
                dict(path=subds, parentds=parentds, type='dataset'))
            discovered_added.add(subds)
    # make sure we have an entry for each dataset, including those
    # that are just parents
    for parentds in discovered:
        if parentds not in discovered_added:
            to_process.append(
                dict(
                    path=parentds,
                    type='dataset',
                    # make sure we save content of superds later on
                    process_content=True,
                    # but not do nasty things, like adding untracked
                    # content just because we discovered this dataset
                    process_updated_only=True))

    # now re-annotate all paths, this will be fast for already annotated
    # ones and will amend the annotation for others, deduplication
    # happens here too
    annotated_paths = AnnotatePaths.__call__(
        path=to_process,
        dataset=dataset,
        # no recursion, done already
        recursive=False,
        action='save',
        unavailable_path_status='',
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    # now sort into datasets so we can process them one by one
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert not completed

    # iterate over all datasets, starting at the bottom
    for dspath in sorted(content_by_ds.keys(), reverse=True):
        ds = Dataset(dspath)
        res = get_status_dict('save', ds=ds, logger=lgr)
        if not ds.is_installed():
            # TODO This is likely impossible now
            res['status'] = 'impossible'
            res['message'] = ('dataset %s is not installed', ds)
            yield res
            continue
        saved_state = save_dataset(
            ds,
            content_by_ds[dspath],
            message=message)
        res['status'] = 'ok' if saved_state else 'notneeded'
        # MIH: let's tag even if there was nothing to commit. I'd forget
        # this option too often...
        if version_tag:
            try:
                # TODO: check whether comment below is still true after
                # removing the log swallowing:
                # again cannot help but force-silence low-level code,
                # because it screams like a madman instead of allowing
                # top-level code an orderly error report
                ds.repo.tag(version_tag)
                # even if we haven't saved anything
                res['status'] = 'ok'
                yield res
            except CommandError as e:
                if saved_state:
                    # first we yield the result for the actual save
                    yield res
                # and now complain that tagging didn't work
                yield get_status_dict(
                    'save',
                    ds=ds,
                    logger=lgr,
                    status='error',
                    message=('cannot tag this version: %s',
                             e.stderr.strip()))
        else:
            yield res
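# --- Illustration (not part of the original implementation) ---
# Sketch of how a caller might consume the result records yielded above,
# separating the save outcome from a failed tagging attempt. The dicts
# mirror the `get_status_dict` records produced by this code; the sample
# values are made up.
results = [
    {'action': 'save', 'status': 'ok', 'path': '/top'},
    {'action': 'save', 'status': 'error', 'path': '/top',
     'message': ('cannot tag this version: %s', 'tag already exists')},
]
for res in results:
    if res['status'] == 'error':
        # `message` is a (format, args) tuple, as in the code above
        print('error:', res['message'][0] % res['message'][1])
    else:
        print(res['status'], res['path'])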