def __call__(path=None, dataset=None, revision='HEAD', staged=False,
             ignore_subdatasets='none', recursive=False,
             recursion_limit=None):
    if not dataset and not path:
        # act on the whole dataset if nothing else was specified
        dataset = curdir
    refds_path = Interface.get_refds_path(dataset)
    if not (refds_path or path):
        raise InsufficientArgumentsError(
            "Neither dataset nor target path(s) provided")
    to_process = []
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='diff',
            # unavailable is OK, because we might query for a deleted file
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # we know what to report already
            yield ap
            continue
        if ap.get('type', None) == 'dataset':
            ap['process_content'] = True
        to_process.append(ap)
    # sort into datasets
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_process,
            refds_path=refds_path,
            path_only=False)
    assert not completed
    for ds_path in sorted(content_by_ds.keys()):
        for r in _parse_git_diff(
                ds_path,
                diff_thingie=revision,
                paths=content_by_ds[ds_path],
                ignore_submodules=ignore_subdatasets,
                staged=staged):
            r.update(dict(action='diff', refds=refds_path), logger=lgr)
            if 'status' not in r:
                r['status'] = 'ok'
            yield r
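# --- Hedged usage sketch (illustration only, not part of the implementation) ---
# The generator above yields plain result dicts that always carry at least
# 'action' and 'status', and usually 'path'. A caller could tally them like
# this; any record layout beyond those keys is an assumption.
def _tally_results_by_status(results):
    """Count result records per 'status' value (illustrative helper only)."""
    from collections import Counter
    return dict(Counter(r.get('status', 'unknown') for r in results))

# _tally_results_by_status([{'action': 'diff', 'status': 'ok', 'path': 'f.txt'}])
# -> {'ok': 1}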
def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=False,
        #git_opts=None,
        #annex_opts=None,
        #annex_get_opts=None,
        jobs='auto',
        verbose=False,
):
    # IMPLEMENTATION CONCEPT:
    #
    # 1. Sort the world into existing handles and the rest
    # 2. Try to locate missing handles (obtain subdatasets along the way)
    # 3. Expand into subdatasets with recursion enabled (potentially
    #    obtaining even more subdatasets)
    # 4. Hand the info of which handles to get in each subdataset over to
    #    git-annex, once at the very end

    refds_path = Interface.get_refds_path(dataset)
    if not (dataset or path):
        raise InsufficientArgumentsError(
            "Neither dataset nor target path(s) provided")
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path

    # remember which results we already reported, to avoid duplicates
    yielded_ds = []
    to_get = []
    unavailable_paths = []
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='get',
            # NOTE: Do not act upon unavailable paths yet! Done below after
            # testing which ones could be obtained
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # we know what to report already
            yield ap
            continue
        if ap.get('state', None) == 'absent' and ap.get('raw_input', False):
            # if this wasn't found, but directly requested, queue for further
            # exploration
            unavailable_paths.append(ap)
            continue
        if ap.get('type', None) == 'dataset' and \
                GitRepo.is_valid_repo(ap['path']) and \
                not ap['path'] == refds_path:
            # do not report what hasn't arrived yet
            # also do not report the base dataset that is already
            # present -- no surprise
            yield dict(ap, status='notneeded', logger=lgr,
                       message='already installed')
            yielded_ds.append(ap['path'])
            ap['process_content'] = get_data
        to_get.append(ap)

    # explore the unknown
    for ap in sorted(unavailable_paths, key=lambda x: x['path']):
        lgr.debug("Investigate yet unavailable path %s", ap)
        # how close can we get?
        dspath = ap.get('parentds', get_dataset_root(ap['path']))
        if dspath is None:
            # nothing we can do for this path
            continue
        lgr.debug("Found containing dataset %s for path %s",
                  dspath, ap['path'])
        ds = Dataset(dspath)
        # now actually obtain whatever is necessary to get to this path
        containing_ds = [dspath]
        for res in _install_necessary_subdatasets(
                ds, ap['path'], reckless, refds_path,
                description=description):
            # yield immediately so errors could be acted upon outside, before
            # we continue
            if not (res['type'] == 'dataset' and
                    res['path'] in yielded_ds):
                # unless we reported on this dataset before
                if res['type'] == 'dataset':
                    # make a record, recursive below might now want to report
                    # a 'notneeded'
                    yielded_ds.append(res['path'])
                yield res
            # update to the current innermost dataset
            containing_ds.append(res['path'])

        if len(containing_ds) < 2:
            # no subdataset was installed, hence if the path was unavailable
            # before it still is, no need to bother git annex
            ap.update(status='impossible', message='path does not exist')
            yield ap
            continue
        # important to only do the next for the innermost subdataset
        # as the `recursive` logic below relies on that!
        # set the correct parent, for a dataset this would be the second-last
        # reported subdataset
        ap.update(parentds=containing_ds[-1])
        if containing_ds[-1] == ap['path']:
            # the path actually refers to the last installed dataset
            ap.update(parentds=containing_ds[-2],
                      process_content=get_data,
                      type='dataset')
        to_get.append(ap)

    # results of recursive installation of yet undiscovered datasets
    rec_get = []
    if recursive and not recursion_limit == 'existing':
        # obtain any subdatasets underneath the paths given inside the
        # subdatasets that we know already exist
        # unless we do not want recursion into not-yet-installed datasets
        for ap in sorted(to_get, key=lambda x: x['path']):
            if ap['type'] not in ('dataset', 'directory') or \
                    not ap.get('raw_input', False):
                # a non-directory cannot have content underneath
                # also we do NOT want to recurse into anything that was not
                # explicitly requested, to avoid duplication
                continue
            subds = Dataset(ap['path'] if ap['type'] == 'dataset'
                            else ap['parentds'])
            lgr.info(
                "Installing %s%s recursively",
                subds,
                (" underneath %s" % ap['path']
                 if subds.path != ap['path'] else ""))
            for res in _recursive_install_subds_underneath(
                    subds,
                    # `ap['path']` was explicitly given as input
                    # we count recursions from the input, hence we
                    # can start with the full number
                    recursion_limit,
                    reckless,
                    start=ap['path'],
                    refds_path=refds_path,
                    description=description):
                # yield immediately so errors could be acted upon
                # outside, before we continue
                if not (res['type'] == 'dataset' and
                        res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record
                        yielded_ds.append(res['path'])
                    yield res
                if not (res['status'] == 'ok' and
                        res['type'] == 'dataset'):
                    # not a dataset that was just installed, we reported it
                    # upstairs already, and can ignore it from now on
                    continue
                # paranoia, so popular these days...
                assert GitRepo.is_valid_repo(res['path'])
                # keep a copy of the install record for `get` later on
                get_ap = {k: v for k, v in res.items()
                          if not k == 'status'}
                get_ap['process_content'] = get_data
                rec_get.append(get_ap)

    if not get_data:
        # done already
        return

    # merge the two AP lists
    to_get.extend(rec_get)

    # sort into datasets
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_get,
            refds_path=refds_path)
    assert not completed

    # hand over to git-annex, get files content,
    # report files in git as 'notneeded' to get
    for ds_path in sorted(content_by_ds.keys()):
        ds = Dataset(ds_path)
        # grab content, ignore subdataset entries
        content = [ap['path'] for ap in content_by_ds[ds_path]
                   if ap.get('type', None) != 'dataset' or
                   ap['path'] == ds.path]
        if not content:
            # cut this short should there be nothing
            continue
        # needs to be an annex to get content
        if not isinstance(ds.repo, AnnexRepo):
            for r in results_from_paths(
                    content, status='notneeded',
                    message="no dataset annex, content already present",
                    action='get', logger=lgr,
                    refds=refds_path):
                yield r
            continue
        respath_by_status = {}
        for res in ds.repo.get(
                content,
                options=['--from=%s' % source] if source else [],
                jobs=jobs):
            res = annexjson2result(res, ds, type='file', logger=lgr,
                                   refds=refds_path)
            success = success_status_map[res['status']]
            # TODO: in case of some failed commands (e.g. get) there might be
            # no path in the record. yoh has only a vague idea of the logic
            # here, so we just check for 'path'; but according to
            # results_from_annex_noinfo it would then be assumed that
            # `content` was acquired successfully, which is not the case
            if 'path' in res:
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
            yield res

        for r in results_from_annex_noinfo(
                ds,
                content,
                respath_by_status,
                dir_fail_msg='could not get some content in %s %s',
                noinfo_dir_msg='nothing to get from %s',
                noinfo_file_msg='already present',
                action='get', logger=lgr,
                refds=refds_path):
            yield r
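# --- Hedged illustration (plain dicts, not DataLad API) ---
# The `respath_by_status` bookkeeping above groups paths by whether git-annex
# reported success for them, so that results_from_annex_noinfo() can later
# account for files annex stayed silent about. A standalone approximation:
def _group_paths_by_success(records):
    """Group 'path' values of result records by a boolean success flag."""
    grouped = {}
    for rec in records:
        if 'path' not in rec:
            # some failed annex commands come back without a path
            continue
        grouped.setdefault(rec['success'], []).append(rec['path'])
    return grouped

# _group_paths_by_success([{'path': 'a', 'success': True},
#                          {'path': 'b', 'success': False}])
# -> {True: ['a'], False: ['b']}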
def __call__(
        path=None,
        dataset=None,
        revision=None,
        staged=False,
        ignore_subdatasets='none',
        report_untracked='normal',
        recursive=False,
        recursion_limit=None):
    if not dataset and not path:
        # act on the whole dataset if nothing else was specified
        dataset = curdir
    refds_path = Interface.get_refds_path(dataset)

    to_process = []
    # track which commit range we want to diff per dataset
    ds_diffies = {}
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='diff',
            # unavailable is OK, because we might query for a deleted file
            unavailable_path_status='',
            nondataset_path_status='impossible',
            # must not use `modified`, infinite loop otherwise
            modified=None,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # we know what to report already
            yield ap
            continue
        if ap.get('type', None) == 'dataset':
            ap['process_content'] = True
        if ap.get('raw_input', False) or ap['path'] == refds_path:
            # prepopulate the revision specs for all input paths
            ds_diffies[ap['path']
                       if ap.get('type', None) == 'dataset'
                       else ap['parentds']] = revision
        to_process.append(ap)

    # sort into datasets
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_process,
            refds_path=refds_path)
    assert not completed

    for ds_path in sorted(content_by_ds.keys()):
        if ds_path not in ds_diffies:
            # we don't know how to diff this dataset: it was neither an
            # input path, nor did we see it when diffing its parent
            continue
        content_paths = content_by_ds[ds_path]
        revision = ds_diffies[ds_path]
        for r in _parse_git_diff(
                ds_path,
                diff_thingie=ds_diffies[ds_path],
                paths=content_paths,
                ignore_submodules=ignore_subdatasets,
                staged=staged):
            r.update(dict(
                action='diff',
                logger=lgr))
            if refds_path:
                r['refds'] = refds_path
            if 'status' not in r:
                r['status'] = 'ok'
            if r.get('type', None) == 'dataset':
                # this is a subdataset report
                # we need to use the reported commit range to properly adjust
                # the query once we hit that subdataset
                from_rev = r.get('revision_src', '')
                to_rev = r.get('revision', '')
                subrev = '{}..{}'.format(
                    from_rev if from_rev else PRE_INIT_COMMIT_SHA,
                    to_rev if to_rev else '',
                )
                if from_rev and from_rev == to_rev:
                    # this is a special case, where the subdataset reported
                    # changes without a change in state/commit -- this is
                    # code for uncommitted changes in the subdataset
                    # (including staged ones). In such a case, we must not
                    # provide a diff range, but only the source commit we
                    # want to diff against
                    # XXX if this is changed, likely the same logic in
                    # annotate_paths needs changing too!
                    subrev = from_rev
                ds_diffies[r['path']] = subrev
            yield r
        if (revision and '..' in revision) or report_untracked == 'no':
            # don't look for untracked content, we got a revision range
            continue
        for r in _get_untracked_content(
                ds_path,
                report_untracked,
                paths=content_paths):
            r.update(dict(
                action='diff',
                logger=lgr))
            if refds_path:
                r['refds'] = refds_path
            if 'status' not in r:
                r['status'] = 'ok'
            yield r
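# --- Hedged sketch of the subdataset revision-range logic above ---
# Given the old and new subdataset commits reported by the parent's diff,
# compute the revision spec used when descending into that subdataset.
# PRE_INIT_COMMIT_SHA is the constant referenced above (a stand-in for
# "before the first commit"); the helper itself is illustrative only.
def _subdataset_revision_spec(from_rev, to_rev, pre_init_sha):
    if from_rev and from_rev == to_rev:
        # identical commits encode uncommitted/staged changes in the
        # subdataset: diff against that single commit, not a range
        return from_rev
    return '{}..{}'.format(from_rev if from_rev else pre_init_sha,
                           to_rev if to_rev else '')

# _subdataset_revision_spec('abc1', 'def2', '0' * 40) -> 'abc1..def2'
# _subdataset_revision_spec('abc1', 'abc1', '0' * 40) -> 'abc1'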
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        check=True,
        if_dirty='save-before'):
    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `drop`: requires at least a path or dataset")
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path
    to_drop = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='drop',
            # justification for status:
            # content need not be dropped where there is none
            unavailable_path_status='notneeded',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset' and \
                GitRepo.is_valid_repo(ap['path']) and \
                not ap['path'] == refds_path:
            ap['process_content'] = True
        if ap.get('registered_subds', False) and \
                ap.get('state', None) == 'absent':
            # nothing to drop in an absent subdataset, don't be annoying
            # and skip silently
            continue
        to_drop.append(ap)

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_drop,
            refds_path=refds_path)
    assert not completed

    # iterate over all datasets, order doesn't matter
    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        # TODO generator
        # this should yield what it did
        handle_dirty_dataset(ds, mode=if_dirty)
        # ignore submodule entries
        content = [ap['path'] for ap in content_by_ds[ds_path]
                   if ap.get('type', None) != 'dataset' or
                   ap['path'] == ds.path]
        if not content:
            continue
        for r in _drop_files(ds, content, check=check, **res_kwargs):
            yield r
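# --- Hedged illustration of the content filter used above ---
# Subdataset entries are skipped unless the entry is the containing dataset
# itself; only the remaining paths are handed to _drop_files(). Plain-dict
# sketch, not DataLad API.
def _droppable_paths(annotated_paths, ds_path):
    return [ap['path'] for ap in annotated_paths
            if ap.get('type', None) != 'dataset' or ap['path'] == ds_path]

# _droppable_paths([{'path': '/ds/file', 'type': 'file'},
#                   {'path': '/ds/sub', 'type': 'dataset'},
#                   {'path': '/ds', 'type': 'dataset'}], '/ds')
# -> ['/ds/file', '/ds']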
def __call__(message=None, files=None, dataset=None, all_updated=True, all_changes=None, version_tag=None, recursive=False, recursion_limit=None, super_datasets=False): if all_changes is not None: from datalad.support.exceptions import DeprecatedError raise DeprecatedError( new="all_updated option where fits and/or datalad add", version="0.5.0", msg="RF: all_changes option passed to the save") if not dataset and not files: # we got nothing at all -> save what is staged in the repo in "this" directory? # we verify that there is an actual repo next dataset = abspath(curdir) refds_path = Interface.get_refds_path(dataset) to_process = [] for ap in AnnotatePaths.__call__( path=files, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='save', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): # next check should not be done during annotation, as it is possibly expensive # and not generally useful if ap.get('status', None) == 'impossible' and \ ap.get('state', None) == 'absent' and \ ap.get('parentds', None): # this is not here anymore, but it might actually have been a deleted # component if relpath(ap['path'], start=ap['parentds']) \ in Dataset(ap['parentds']).repo.get_deleted_files(): # ok, this is a staged deletion that we want to save ap['status'] = '' del ap['message'] if ap.get('status', None): # this is done yield ap continue # for things like: `ds.save()` # or recursively discovered datasets if ap['path'] == refds_path or \ (ap.get('type', None) == 'dataset' and not ap.get('raw_input', False) and not ap.get('state', None) == 'absent'): ap['process_content'] = True ap['process_updated_only'] = all_updated to_process.append(ap) if not to_process: # nothing left to do, potentially all errored before return if super_datasets: # search for the topmost superdatasets of any path dss = [ Dataset(ap.get('parentds', ap['path'])) for ap in to_process ] superdss = [ds.get_superdataset(topmost=True) for ds in dss] superdss = get_tree_roots( unique(ds.path for ds in dss + superdss if ds)) if dataset: # need to adjust the reference to the new superds # if we had one ref before, we should still have exactly one assert len(superdss) <= 1 dataset = list(superdss.keys())[0] refds_path = dataset elif refds_path: # there is a single superdataset superdss = { refds_path: unique( [ap['parentds'] for ap in to_process if 'parentds' in ap]) } else: # sort all datasets under their potential superdatasets # start from the top to get all subdatasets down the line # and collate them into as few superdatasets as possible # this is quick, just string operations superdss = get_tree_roots( unique( [ap['parentds'] for ap in to_process if 'parentds' in ap])) # for each "superdataset" check the tree of subdatasets and make sure # we gather all datasets between the super and any subdataset # so we can save them all bottom-up in order to be able to properly # save the superdataset # if this is called from e.g. `add` this is actually not necessary, # but in the general case we cannot avoid it # TODO maybe introduce a switch? 
discovered = {} for superds_path in superdss: target_subs = superdss[superds_path] discover_dataset_trace_to_targets( # from here superds_path, # to all target_subs, [], discovered) # create a new minimally annotated path for each discovered dataset discovered_added = set() for parentds in discovered: for subds in discovered[parentds]: to_process.append( dict(path=subds, parentds=parentds, type='dataset')) discovered_added.add(subds) # make sure we have an entry for each dataset, including those # tha are just parents for parentds in discovered: if parentds not in discovered_added: to_process.append( dict( path=parentds, type='dataset', # make sure we save content of superds later on process_content=True)) # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, deduplication happens here too annotated_paths = AnnotatePaths.__call__( path=to_process, dataset=dataset, # never recursion, done already recursive=False, action='save', unavailable_path_status='', nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') # now sort into datasets so we can process them one by one content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path, path_only=False) assert (not completed) # iterate over all datasets, starting at the bottom for dspath in sorted(content_by_ds.keys(), reverse=True): ds = Dataset(dspath) res = get_status_dict('save', ds=ds, logger=lgr) if not ds.is_installed(): # TODO This is likely impossible now res['status'] = 'impossible' res['message'] = ('dataset %s is not installed', ds) yield res continue saved_state = save_dataset(ds, content_by_ds[dspath], message=message, version_tag=version_tag) if saved_state: res['status'] = 'ok' else: res['status'] = 'notneeded' yield res
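# --- Hedged illustration of the bottom-up save order used above ---
# Sorting dataset paths in reverse lexicographic order visits subdatasets
# before their superdatasets, so a parent always commits the already-saved
# state of its children. Standalone sketch with made-up paths.
def _bottom_up(dataset_paths):
    return sorted(dataset_paths, reverse=True)

# _bottom_up(['/ds', '/ds/sub', '/ds/sub/deeper'])
# -> ['/ds/sub/deeper', '/ds/sub', '/ds']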
def __call__( path=None, dataset=None, # support passing this through in a path by path basis to_git=None, save=True, message=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path" ) refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='"there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('added', 'modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) # TODO check if next isn't covered by discover_dataset_trace_to_targets already?? 
if dataset and ap.get('type', None) == 'dataset': # duplicates not possible, annotated_paths returns unique paths subds_to_add[ap['path']] = ap if got_nothing: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict('add', status='notneeded', path=refds_path, type='dataset', logger=lgr) return for subds in ds_to_annotate_from_recursion: if subds not in subds_to_add: # always prefer the already annotated path subds_to_add[subds] = ds_to_annotate_from_recursion[subds] if dataset: # we have a base dataset, discover any intermediate datasets between # the base and any already discovered dataset discovered = {} discover_dataset_trace_to_targets( # from here dataset.path, # to any dataset we are aware of subds_to_add.keys(), [], discovered) for parentds in discovered: for subds in discovered[parentds]: subds_to_add[subds] = subds_to_add.get( subds, dict(path=subds, parentds=parentds, type='dataset')) # merge custom paths and discovered dataset records, paths needs to go first, # because we know most about then, and subsequent annotation call we skip the # later duplicate ones to_add.extend(subds_to_add.values()) # and compact, this should be OK as all the info is in each ap dict to_add = unique(to_add, lambda x: x['path']) if not to_add: # nothing left to do, potentially all errored before return # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, it will also deduplicate annotated_paths = AnnotatePaths.__call__( path=to_add, dataset=dataset, # never recursion, done already recursive=False, action='add', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert (not completed) if not content_by_ds: # we should have complained about any inappropriate path argument # above, so if nothing is left, we can simply exit return # simple loop over datasets -- save happens later # start deep down to_save = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) torepoadd = {} respath_by_status = {} for ap in content_by_ds[ds_path]: # we have a new story ap.pop('status', None) torepoadd[ap['path']] = ap # skip anything that doesn't look like a wannabe subdataset if not ap.get('type', None) == 'dataset' or \ ap['path'] == ds_path: continue if ap.get('registered_subds', False): # subdataset that might be in this list because of the # need to save all the way up to a super dataset respath_by_status['success'] = \ respath_by_status.get('success', []) + [ap['path']] yield get_status_dict(status='notneeded', message="already known subdataset", **dict(common_report, **ap)) continue subds = Dataset(ap['path']) # check that the subds has a commit, and refuse # to operate on it otherwise, or we would get a bastard # submodule that cripples git operations if not subds.repo.get_hexsha(): yield get_status_dict( ds=subds, status='impossible', message='cannot add subdataset with no commits', **dict(common_report, **ap)) continue subds_relpath = relpath(ap['path'], ds_path) # make an attempt to configure a submodule source URL based on the # discovered remote configuration remote, branch = subds.repo.get_tracking_branch() subds_url = 
subds.repo.get_remote_url( remote) if remote else None # Register the repository in the repo tree as a submodule try: ds.repo.add_submodule(subds_relpath, url=subds_url, name=None) except CommandError as e: yield get_status_dict(ds=subds, status='error', message=e.stderr, **dict(common_report, **ap)) continue # queue for saving using the updated annotated path ap['registered_subds'] = True # I hope this is true in direct mode too # TODO this is disabled, because in some circumstances # staging just doesn't happen, and it is unclear when # exactly -- the case that prompted disabling was a submodule # that had no content except for other submodules was not staged, # whereas another submodule on the same level in the same # superdataset which also has one file in it was staged # disable to work correctly, while paying a little bit of # slow down #ap['staged'] = True to_save.append(ap) _fixup_submodule_dotgit_setup(ds, subds_relpath) # report added subdatasets -- `annex add` below won't do it yield get_status_dict(ds=subds, status='ok', message='added new subdataset', **dict(common_report, **ap)) # make sure that .gitmodules is added to the list of files gitmodules_path = opj(ds.path, '.gitmodules') # for git torepoadd[gitmodules_path] = dict(path=gitmodules_path) # and for save to_save.append( dict(path=gitmodules_path, parentds=ds_path, type='file')) # make sure any last minute additions make it to the saving stage # XXX? should content_by_ds become OrderedDict so that possible # super here gets processed last? lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd) is_annex = isinstance(ds.repo, AnnexRepo) add_kw = {'jobs': jobs} if is_annex and jobs else {} added = ds.repo.add(list(torepoadd.keys()), git=to_git if is_annex else True, **add_kw) for a in added: res = annexjson2result(a, ds, type='file', **common_report) success = success_status_map[res['status']] respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] # produce best possible path/result annotation if res['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report res = dict(torepoadd[res['path']], **res) # override this in all cases to be safe res['parentds'] = ds.path if success: # this was successfully added, queue for saving this very path # in the dataset ap = {k: v for k, v in res.items() if k != 'status'} ap['staged'] = True # strip any status and state info (e.g. save will refuse to save # stuff that is marked state='untracked' to_save.append({ k: v for k, v in res.items() if k not in ('status', 'state') }) if a['file'] == '.gitmodules': # filter out .gitmodules, because this is only included for # technical reasons and has nothing to do with the actual content continue if GitRepo.is_valid_repo(res['path']): # more accurate report in case of an added submodule # mountpoint. # XXX Actually not sure if this can really happen # (depends on what our low-level code would do) # but worst case is that we loose a little bit of # coverage... 
res['type'] = 'dataset' res['message'] = 'added new state as submodule' yield res for r in results_from_annex_noinfo( ds, torepoadd, respath_by_status, dir_fail_msg='could not add some content in %s %s', noinfo_dir_msg='nothing to add from %s', noinfo_file_msg='already included in the dataset', action='add', logger=lgr, refds=refds_path): if r['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report r = dict(r, **torepoadd[r['path']]) if r['status'] == 'notneeded': # this could be a file that was staged already, it doesn't need # to be added, but it should be saved/commited if so desired to_save.append({ k: v for k, v in r.items() if k not in ('status', 'state') }) # XXX something is fishy with the next one, rethink when sober.... if r['path'] == ds_path and r['status'] == 'ok': # this is for the entire dataset itself which was explicitly requested # make sure to save all r['type'] = 'dataset' r['process_content'] = True to_save.append( {k: v for k, v in r.items() if k != 'status'}) yield r if refds_path and ds_path != refds_path and len( respath_by_status.get('success', [])): # TODO XXX we have an issue here when with `add('.')` and annex ignores any # dotfiles. In this case we end up not saving a dataset completely, because # we rely on accurate reporting. there is an issue about this already # TODO look up the issue ID # if there is a base dataset, but we are below it, and we have anything done to this # dataset -> queue dataset itself for saving its state in the parent ds_ap = dict( path=ds.path, # we have to look for the parent here, as we must save the # subdataset in the parent and not the whole subdataset itself type='dataset') parentds = get_dataset_root(normpath(opj(ds.path, pardir))) if parentds: ds_ap['parentds'] = parentds if dataset: ds_ap['refds'] = refds_path to_save.append(ds_ap) if not save: lgr.debug('Not calling `save` as instructed') return # TODO tell save what was staged already! Set 'staged=True' for # respective annotated paths that are fed into `save` # do not reuse any of the sorting done in here for saving, but instead # pass on all the annotated paths to have `save` figure out what to do with # them -- this is costs something, but should be safer, and frankly is # more comprehensible for res in Save.__call__( # hand-selected annotated paths path=to_save, dataset=refds_path, message=message if message else '[DATALAD] added content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
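# --- Hedged illustration (plain dicts, not DataLad API) ---
# As done above for `torepoadd`, the original path annotation provides the
# context (type, parentds, ...) while the git-annex result overrides the
# status-related fields when the two records are merged.
def _merge_annotation_with_result(annotated, annex_result):
    return dict(annotated, **annex_result)

# _merge_annotation_with_result(
#     {'path': 'f', 'type': 'file', 'parentds': '/ds'},
#     {'path': 'f', 'status': 'ok'})
# -> {'path': 'f', 'type': 'file', 'parentds': '/ds', 'status': 'ok'}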
def __call__(message=None, path=None, dataset=None, all_updated=True, version_tag=None, recursive=False, recursion_limit=None, super_datasets=False, message_file=None): if not dataset and not path: # we got nothing at all -> save what is staged in the repo in "this" directory? # make sure we don't treat this as a user-provided '.' argument path = [{'path': abspath(curdir), 'raw_input': False}] refds_path = Interface.get_refds_path(dataset) if message and message_file: yield get_status_dict( 'save', status='error', path=refds_path, message="Both a message and message file were specified", logger=lgr) return if message_file: with open(message_file) as mfh: message = mfh.read() to_process = [] got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='save', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', modified='HEAD' if not path and recursive else None, return_type='generator', on_failure='ignore'): if ap.get('state', None) == 'untracked' and not ap.get('raw_input', False): # this path was found untracked, but not explicitly given to save # we will silently ignore this continue got_nothing = False # next check should not be done during annotation, as it is possibly expensive # and not generally useful if ap.get('status', None) == 'impossible' and \ ap.get('state', None) == 'absent' and \ ap.get('parentds', None): # this is not here anymore, but it might actually have been a deleted # component if relpath(ap['path'], start=ap['parentds']) \ in Dataset(ap['parentds']).repo.get_deleted_files(): # ok, this is a staged deletion that we want to save ap['status'] = '' del ap['message'] if ap.get('status', None): # this is done yield ap continue # for things like: `ds.save()` # or recursively discovered datasets if ap['path'] == refds_path or \ (ap.get('type', None) == 'dataset' and not ap.get('raw_input', False) and not ap.get('state', None) == 'absent'): ap['process_content'] = True ap['process_updated_only'] = all_updated to_process.append(ap) lgr.log(2, "save, to_process=%r", to_process) if got_nothing and recursive and refds_path: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict('save', status='notneeded', path=refds_path, type='dataset', logger=lgr) return if not to_process: # nothing left to do, potentially all errored before return if super_datasets: # search for the topmost superdatasets of any path dss = [ Dataset(ap.get('parentds', ap['path'])) for ap in to_process ] superdss = [ds.get_superdataset(topmost=True) for ds in dss] superdss = get_tree_roots( unique(ds.path for ds in dss + superdss if ds)) if dataset: # need to adjust the reference to the new superds # if we had one ref before, we should still have exactly one assert len(superdss) <= 1 dataset = list(superdss.keys())[0] refds_path = dataset elif refds_path: # there is a single superdataset superdss = { refds_path: unique( [ap['parentds'] for ap in to_process if 'parentds' in ap]) } else: # sort all datasets under their potential superdatasets # start from the top to get all subdatasets down the line # and collate them into as few superdatasets as possible # this is quick, just string operations superdss = get_tree_roots( unique( [ap['parentds'] for ap in to_process if 'parentds' in ap])) # for each "superdataset" check the tree of subdatasets and 
make sure # we gather all datasets between the super and any subdataset # so we can save them all bottom-up in order to be able to properly # save the superdataset # if this is called from e.g. `add` this is actually not necessary, # but in the general case we cannot avoid it # TODO maybe introduce a switch? discovered = {} for superds_path in superdss: target_subs = superdss[superds_path] discover_dataset_trace_to_targets( # from here superds_path, # to all target_subs, [], discovered) # create a new minimally annotated path for each discovered dataset discovered_added = set() for parentds in discovered: for subds in discovered[parentds]: to_process.append( dict(path=subds, parentds=parentds, type='dataset')) discovered_added.add(subds) # make sure we have an entry for each dataset, including those # tha are just parents for parentds in discovered: if parentds not in discovered_added: to_process.append( dict( path=parentds, type='dataset', # make sure we save content of superds later on process_content=True, # but not do nasty things, like adding untracked content # just because we discovered this dataset process_updated_only=True)) # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, deduplication happens here too annotated_paths = AnnotatePaths.__call__( path=to_process, dataset=dataset, # never recursion, done already recursive=False, action='save', unavailable_path_status='', nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') # now sort into datasets so we can process them one by one content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert (not completed) # iterate over all datasets, starting at the bottom for dspath in sorted(content_by_ds.keys(), reverse=True): ds = Dataset(dspath) res = get_status_dict('save', ds=ds, logger=lgr) if not ds.is_installed(): # TODO This is likely impossible now res['status'] = 'impossible' res['message'] = ('dataset %s is not installed', ds) yield res continue saved_state = save_dataset(ds, content_by_ds[dspath], message=message) res['status'] = 'ok' if saved_state else 'notneeded' # MIH: let's tag even if there was nothing commit. I'd forget this # option too often... if version_tag: try: # TODO: check whether comment below is still true after # removing the log swallowing: # again cannot help but force-silence low-level code, because # it screams like a made man instead of allowing top-level # code an orderly error report ds.repo.tag(version_tag) # even if we haven't saved anything res['status'] = 'ok' yield res except CommandError as e: if saved_state: # first we yield the result for the actual save yield res # and now complain that tagging didn't work yield get_status_dict( 'save', ds=ds, logger=lgr, status='error', message=('cannot tag this version: %s', e.stderr.strip())) else: yield res
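# --- Hedged sketch of the superdataset collation concept above ---
# Group dataset paths under their topmost ancestor within the given set. This
# approximates what the code relies on get_tree_roots() for; it is not the
# actual DataLad helper.
def _tree_roots(paths):
    roots = {}
    for p in sorted(paths):  # any ancestor in the set appears before p
        root = next((r for r in roots if p.startswith(r + '/')), None)
        if root is None:
            roots[p] = []
        else:
            roots[root].append(p)
    return roots

# _tree_roots(['/a', '/a/b', '/c']) -> {'/a': ['/a/b'], '/c': []}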
def __call__( path=None, dataset=None, recursive=False, check=True, save=True, message=None, if_dirty='save-before'): res_kwargs = dict(action='remove', logger=lgr) if not dataset and not path: raise InsufficientArgumentsError( "insufficient information for `remove`: requires at least a path or dataset") refds_path = Interface.get_refds_path(dataset) res_kwargs['refds'] = refds_path if refds_path and not path and not GitRepo.is_valid_repo(refds_path): # nothing here, nothing to remove yield get_status_dict(path=refds_path, status='notneeded', **res_kwargs) return if refds_path and not path: # act on the whole dataset if nothing else was specified # TODO i think that would happen automatically in annotation? path = refds_path to_process = [] for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, # we only ever want to discover immediate subdatasets, the rest # will happen in `uninstall` recursion_limit=1, action='remove', unavailable_path_status='', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('state', None) == 'absent' and \ ap.get('parentds', None) is None: # nothing exists at location, and there is no parent to # remove from ap['status'] = 'notneeded' ap['message'] = "path does not exist and is not in a dataset" yield ap continue if ap.get('raw_input', False) and ap.get('type', None) == 'dataset': # make sure dataset sorting yields a dedicted entry for this one ap['process_content'] = True to_process.append(ap) if not to_process: # nothing left to do, potentially all errored before return if path_is_under([ap['path'] for ap in to_process]): # behave like `rm` and refuse to remove where we are raise ValueError( "refusing to uninstall current or parent directory") # now sort into datasets so we can process them one by one content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert(not completed) # iterate over all datasets, starting at the bottom # to make the removal of dataset content known upstairs to_save = [] # track which submodules we have removed in the process, to avoid # failure in case we revisit them due to a subsequent path argument subm_removed = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) paths = content_by_ds[ds_path] to_reporemove = dict() # PLAN any dataset that was not raw_input, uninstall (passing recursive flag) # if dataset itself is in paths, skip any nondataset # sort reverse so we get subdatasets first for ap in sorted(paths, key=lambda x: x['path'], reverse=True): if ap.get('type', None) == 'dataset': # entire dataset needs to go, uninstall if present, pass recursive! 
uninstall_failed = False if ap['path'] == refds_path or \ (refds_path is None and ap.get('raw_input', False)): # top-level handling, cannot use regular uninstall call, as # it will refuse to uninstall a top-level dataset # and rightfully so, it is really a remove in that case # bypass all the safety by using low-level helper for r in _uninstall_dataset(ds, check=check, has_super=False, **res_kwargs): if r['status'] in ('impossible', 'error'): # we need to inspect if something went wrong, in order # to prevent failure from removing a non-empty dir below, # but at the same time allow for continued processing uninstall_failed = True r['refds'] = refds_path yield r # recheck that it wasn't removed during a previous iteration elif ap.get('state', None) != 'absent' and GitRepo.is_valid_repo(ap['path']): # anything that is not the top-level -> regular uninstall # this is for subdatasets of the to-be-removed dataset # we want to simply uninstall them in a regular manner for r in Uninstall.__call__( ap['path'], dataset=refds_path, recursive=recursive, check=check, if_dirty=if_dirty, result_xfm=None, result_filter=None, on_failure='ignore'): if r['status'] in ('impossible', 'error'): # we need to inspect if something went wrong, in order # to prevent failure from removing a non-empty dir below, # but at the same time allow for continued processing uninstall_failed = True yield r if not ap.get('raw_input', False): # we only ever want to actually unregister subdatasets that # were given explicitly continue if not uninstall_failed and \ not ap['path'] in subm_removed and \ refds_path and \ ap.get('parentds', None) and \ not (relpath(ap['path'], start=refds_path).startswith(pardir) or ap['path'] == refds_path) and \ ap.get('registered_subds', False): # strip from superdataset, but only if a dataset was given explcitly # as in "remove from this dataset", but not when just a path was given # as in "remove from the filesystem" subds_relpath = relpath(ap['path'], start=ap['parentds']) # remove submodule reference parentds = Dataset(ap['parentds']) # play safe, will fail on dirty parentds.repo.deinit_submodule(ap['path']) # remove now empty submodule link parentds.repo.remove(ap['path']) # make a record that we removed this already, should it be # revisited via another path argument, because do not reannotate # the paths after every removal subm_removed.append(ap['path']) yield dict(ap, status='ok', **res_kwargs) # need .gitmodules update in parent to_save.append(dict( path=opj(parentds.path, '.gitmodules'), parents=parentds.path, type='file')) # and the removal itself needs to be committed # inform `save` that it is OK that this path # doesn't exist on the filesystem anymore ap['unavailable_path_status'] = '' ap['process_content'] = False to_save.append(ap) if not uninstall_failed and exists(ap['path']): # could be an empty dir in case an already uninstalled subdataset # got removed rmdir(ap['path']) else: # anything that is not a dataset can simply be passed on to_reporemove[ap['path']] = ap # avoid unnecessary git calls when there is nothing to do if to_reporemove: if check and hasattr(ds.repo, 'drop'): for r in _drop_files(ds, list(to_reporemove), check=True): if r['status'] == 'error': # if drop errored on that path, we can't remove it to_reporemove.pop(r['path'], 'avoidKeyError') yield r if to_reporemove: for r in ds.repo.remove(list(to_reporemove), r=True): # these were removed, but we still need to save the # removal r_abs = opj(ds.path, r) if r_abs in to_reporemove: ap = to_reporemove[r_abs] else: 
ap = {'path': r_abs, 'parentds': ds.path, 'refds': refds_path } ap['unavailable_path_status'] = '' to_save.append(ap) yield get_status_dict( status='ok', path=r, **res_kwargs) if not to_save: # nothing left to do, potentially all errored before return if not save: lgr.debug('Not calling `save` as instructed') return for res in Save.__call__( path=[ap["path"] for ap in to_save], # we might have removed the reference dataset by now, recheck dataset=refds_path if (refds_path and GitRepo.is_valid_repo(refds_path)) else None, message=message if message else '[DATALAD] removed content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
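# --- Hedged approximation of the path_is_under() safety check used above ---
# Refuse to operate when the current working directory lies at or below any
# of the paths scheduled for removal (like `rm` refusing to delete '.').
def _cwd_is_under(paths, cwd=None):
    import os
    cwd = os.path.realpath(cwd or os.getcwd())
    for p in paths:
        p = os.path.realpath(p)
        if cwd == p or cwd.startswith(p + os.sep):
            return True
    return False

# _cwd_is_under(['/tmp/ds'], cwd='/tmp/ds/subdir') -> True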
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None):
    if path is None and dataset is None:
        raise InsufficientArgumentsError(
            "insufficient arguments for unlocking: needs at least "
            "a dataset or a path to unlock.")
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='unlock', logger=lgr, refds=refds_path)

    to_process = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='unlock',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist",
            nondataset_path_status='impossible',
            modified=None,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', 'dataset') == 'dataset':
            # this is a dataset
            ap['process_content'] = True
        to_process.append(ap)

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_process,
            refds_path=refds_path)
    assert not completed

    for ds_path in sorted(content_by_ds.keys()):
        ds = Dataset(ds_path)
        content = content_by_ds[ds_path]

        # no annex, no unlock:
        if not isinstance(ds.repo, AnnexRepo):
            for ap in content:
                ap['status'] = 'notneeded'
                ap['message'] = "not annex'ed, nothing to unlock"
                ap.update(res_kwargs)
                yield ap
            continue

        # direct mode, no unlock:
        elif ds.repo.is_direct_mode():
            for ap in content:
                ap['status'] = 'notneeded'
                ap['message'] = "direct mode, nothing to unlock"
                ap.update(res_kwargs)
                yield ap
            continue

        # only files in annex with their content present:
        files = [ap['path'] for ap in content]
        to_unlock = []
        for ap, under_annex, has_content in \
                zip(content,
                    ds.repo.is_under_annex(files),
                    ds.repo.file_has_content(files)):

            # TODO: what about directories? Make sure there is no situation
            # (e.g. no file beneath with content, or everything in git) that
            # leads to a CommandError
            # For now pass to annex:
            from os.path import isdir
            if isdir(ap['path']):
                to_unlock.append(ap)
                continue

            # Note that `file_has_content` is (planned to report) True on
            # files in git. Therefore order matters: first check for annex!
            if under_annex:
                if has_content:
                    to_unlock.append(ap)
                # no content, no unlock:
                else:
                    ap['status'] = 'impossible'
                    ap['message'] = "no content present, can't unlock"
                    ap.update(res_kwargs)
                    yield ap
            # file in git, no unlock:
            else:
                ap['status'] = 'notneeded'
                ap['message'] = "not controlled by annex, nothing to unlock"
                ap.update(res_kwargs)
                yield ap

        # don't call annex-unlock with no path, if that is the case because
        # nothing survived the filtering above
        if content and not to_unlock:
            continue

        for r in ds.repo.unlock([ap['path'] for ap in to_unlock]):
            yield get_status_dict(
                path=opj(ds.path, r),
                status='ok',
                type='file',
                **res_kwargs)
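# --- Hedged summary of the per-file unlock decision above ---
# Returns (should_unlock, status, message); purely illustrative, mirroring
# the branching on `under_annex` and `has_content`.
def _unlock_decision(under_annex, has_content):
    if under_annex and has_content:
        return True, None, None
    if under_annex:
        return False, 'impossible', "no content present, can't unlock"
    return False, 'notneeded', "not controlled by annex, nothing to unlock"

# _unlock_decision(True, False)
# -> (False, 'impossible', "no content present, can't unlock")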
def __call__(path=None, dataset=None, add=None, init=None, remove=None, reset=None, define_key=None, dataset_global=False, recursive=False, recursion_limit=None): # bring metadataset setter args in shape first untag, remove = _parse_argspec(remove) purge, reset = _parse_argspec(reset) tag_add, add = _parse_argspec(add) tag_init, init = _parse_argspec(init) define_key = dict(define_key) if define_key else None # merge all potential sources of tag specifications all_untag = remove.get('tag', []) + untag if all_untag: remove['tag'] = all_untag all_addtag = add.get('tag', []) + tag_add if all_addtag: add['tag'] = all_addtag all_inittag = init.get('tag', []) + tag_init if all_inittag: init['tag'] = all_inittag lgr.debug("Will 'init' metadata items: %s", init) lgr.debug("Will 'add' metadata items: %s", add) lgr.debug("Will 'remove' metadata items: %s", remove) lgr.debug("Will 'reset' metadata items: %s", reset) lgr.debug("Will 'purge' metadata items: %s", purge) refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(action='metadata', logger=lgr, refds=refds_path) to_process = [] for ap in AnnotatePaths.__call__(dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='metadata', unavailable_path_status='error', nondataset_path_status='error', force_subds_discovery=False, return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) == 'dataset': if ap.get('state', None) == 'absent': # just discovered via recursion, but not relevant here continue if GitRepo.is_valid_repo(ap['path']): ap['process_content'] = True to_process.append(ap) content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path, path_only=False) assert (not completed) # iterate over all datasets, order doesn't matter to_save = [] for ds_path in content_by_ds: # ignore submodule entries content = [ ap for ap in content_by_ds[ds_path] if ap.get('type', None) != 'dataset' or ap['path'] == ds_path ] if not content: # nothing other than subdatasets were given or discovered in # this dataset, ignore continue ds = Dataset(ds_path) if dataset_global or define_key: db_path = opj(ds.path, '.datalad', 'metadata', 'dataset.json') db = {} if exists(db_path): db_fp = open(db_path) # need to read manually, load() would puke on an empty file db_content = db_fp.read() # minimize time for collision db_fp.close() if db_content: db = json.loads(db_content) # TODO make manipulation order identical to what git-annex does for k, v in init.items() if init else []: if k not in db: db[k] = v for k in purge: if k in db: del db[k] for k, v in reset.items(): db[k] = v for k, v in add.items(): db[k] = sorted(unique(db.get(k, []) + v)) for k, v in remove.items(): existing_data = db.get(k, []) if isinstance(existing_data, dict): db[k] = { dk: existing_data[dk] for dk in set(existing_data).difference(v) } else: db[k] = list(set(existing_data).difference(v)) # wipe out if empty if not db[k]: del db[k] added_def = False if define_key: defs = db.get('definition', {}) for k, v in define_key.items(): if k in defs: if not defs[k] == v: yield get_status_dict( status='error', ds=ds, message= ("conflicting definition for key '%s': '%s' != '%s'", k, v, defs[k]), **res_kwargs) continue else: defs[k] = v added_def = True db['definition'] = defs # store, if there is anything if db: if not exists(dirname(db_path)): makedirs(dirname(db_path)) db_fp = open(db_path, 'w') # produce relatively compact, but also 
diff-friendly format json.dump(db, db_fp, indent=0, separators=(',', ':\n'), sort_keys=True) # minimize time for collision db_fp.close() # use add not save to also cover case of a fresh file ds.add(db_path, save=False) to_save.append( dict(path=db_path, parentds=ds.path, type='file')) elif exists(db_path): # no metadata left, kill file ds.remove(db_path) to_save.append(dict(path=ds.path, type='dataset')) if added_def or init or add or remove or reset or purge: # if anything happened or could have happened yield get_status_dict(status='ok', ds=ds, metadata=db, **res_kwargs) elif not isinstance(ds.repo, AnnexRepo): # report on all explicitly requested paths only for ap in [c for c in content if c.get('raw_input', False)]: yield dict( ap, status='impossible', message=( 'non-annex dataset %s has no file metadata support', ds), **res_kwargs) continue ds_paths = [p['path'] for p in content] if not dataset_global: if reset or purge or add or init or remove: # file metadata manipulation mod_paths = [] for mp in ds.repo.set_metadata( ds_paths, reset=reset, add=add, init=init, remove=remove, purge=purge, # we always go recursive # TODO is that a good thing? But how to otherwise distinguish # this kind of recursive from the one across datasets in # the API? recursive=True): if mp.get('success', False): mod_paths.append(mp['file']) else: yield get_status_dict( status='error', message='setting metadata failed', path=opj(ds.path, mp[0]), type='file', **res_kwargs) # query the actually modified paths only ds_paths = mod_paths # and lastly, query -- even if we set before -- there could # be side-effects from multiple set paths on an individual # path, hence we need to query to get the final result for file, meta in ds.repo.get_metadata(ds_paths): r = get_status_dict(status='ok', path=opj(ds.path, file), type='file', metadata=meta, **res_kwargs) yield r # save potential modifications to dataset global metadata if not to_save: return for res in Save.__call__(path=to_save, dataset=refds_path, message='[DATALAD] dataset metadata update', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
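# Illustrative sketch (not DataLad code): the dataset-level branch above edits
# a plain JSON dictionary in a fixed order: init, purge, reset, add, remove.
# The helper below replays that order on a plain dict for list-valued fields
# only (the real code also handles mapping-valued entries and key definitions).
# The name `apply_metadata_spec` and the argument layout are assumptions for
# illustration; the ordering mirrors the code above, not necessarily
# git-annex's own manipulation order.
def apply_metadata_spec(db, init=None, purge=None, reset=None, add=None, remove=None):
    for k, v in (init or {}).items():
        db.setdefault(k, v)                 # only set if not present yet
    for k in (purge or []):
        db.pop(k, None)                     # drop the key entirely
    for k, v in (reset or {}).items():
        db[k] = v                           # overwrite unconditionally
    for k, v in (add or {}).items():
        db[k] = sorted(set(db.get(k, [])) | set(v))
    for k, v in (remove or {}).items():
        left = [x for x in db.get(k, []) if x not in set(v)]
        if left:
            db[k] = left
        else:
            db.pop(k, None)                 # wipe out keys that became empty
    return db


if __name__ == '__main__':
    print(apply_metadata_spec(
        {'tag': ['raw']},
        init={'license': ['PDDL']},
        add={'tag': ['preprocessed']},
        remove={'tag': ['raw']}))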
def __call__( path=None, source=None, dataset=None, recursive=False, recursion_limit=None, get_data=True, description=None, reckless=False, #git_opts=None, #annex_opts=None, #annex_get_opts=None, jobs='auto', verbose=False, ): # IMPLEMENTATION CONCEPT: # # 1. Sort the world into existing handles and the rest # 2. Try locate missing handles (obtain subdatasets along the way) # 3. Expand into subdatasets with recursion enables (potentially # obtain even more subdatasets # 4. Shoot info of which handles to get in each subdataset to, # git-annex, once at the very end refds_path = Interface.get_refds_path(dataset) if not (dataset or path): raise InsufficientArgumentsError( "Neither dataset nor target path(s) provided") if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path # remember which results we already reported, to avoid duplicates yielded_ds = [] to_get = [] unavailable_paths = [] for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='get', # NOTE: Do not act upon unavailable paths yet! Done below after # testing which ones could be obtained unavailable_path_status='', nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): if ap.get('status', None): # we know what to report already yield ap continue if ap.get('state', None) == 'absent' and ap.get('raw_input', False): # if this wasn't found, but directly requested, queue for further # exploration unavailable_paths.append(ap) continue if ap.get('type', None) == 'dataset' and \ GitRepo.is_valid_repo(ap['path']) and \ not ap['path'] == refds_path: # do not report what hasn't arived yet # also do not report the base dataset that is already # present -- no surprise yield dict(ap, status='notneeded', logger=lgr, message='already installed') yielded_ds.append(ap['path']) ap['process_content'] = get_data to_get.append(ap) # explore the unknown for ap in sorted(unavailable_paths, key=lambda x: x['path']): lgr.debug("Investigate yet unavailable path %s", ap) # how close can we get? dspath = ap.get('parentds', get_dataset_root(ap['path'])) if dspath is None: # nothing we can do for this path continue lgr.debug("Found containing dataset %s for path %s", dspath, ap['path']) ds = Dataset(dspath) # now actually obtain whatever is necessary to get to this path containing_ds = [dspath] for res in _install_necessary_subdatasets( ds, ap['path'], reckless, refds_path, description=description): # yield immediately so errors could be acted upon outside, before # we continue if not (res['type'] == 'dataset' and res['path'] in yielded_ds): # unless we reported on this dataset before if res['type'] == 'dataset': # make a record, recursive below might now want to report # a 'notneeded' yielded_ds.append(res['path']) yield res # update to the current innermost dataset containing_ds.append(res['path']) if len(containing_ds) < 2: # no subdataset was installed, hence if the path was unavailable # before it still is, no need to bother git annex ap.update(status='impossible', message='path does not exist') yield ap continue # important to only do the next for the innermost subdataset # as the `recursive` logic below relies on that! 
# set the correct parent, for a dataset this would be the second-last # reported subdataset ap.update(parentds=containing_ds[-1]) if containing_ds[-1] == ap['path']: # the path actually refers to the last installed dataset ap.update(parentds=containing_ds[-2], process_content=get_data, type='dataset') to_get.append(ap) # results of recursive installation of yet undiscovered datasets rec_get = [] if recursive and not recursion_limit == 'existing': # obtain any subdatasets underneath the paths given inside the # subdatasets that we know already exist # unless we do not want recursion into not-yet-installed datasets for ap in sorted(to_get, key=lambda x: x['path']): if ap['type'] not in ('dataset', 'directory') or not ap.get('raw_input', False): # a non-directory cannot have content underneath # also we do NOT want to recurse into anything that was specifically # requested, to avoid duplication continue subds = Dataset(ap['path'] if ap['type'] == 'dataset' else ap['parentds']) lgr.info( "Installing %s%s recursively", subds, (" underneath %s" % ap['path'] if subds.path != ap['path'] else "")) for res in _recursive_install_subds_underneath( subds, # `ap['path']` was explicitly given as input # we count recursions from the input, hence we # can start with the full number recursion_limit, reckless, start=ap['path'], refds_path=refds_path, description=description): # yield immediately so errors could be acted upon # outside, before we continue if not (res['type'] == 'dataset' and res['path'] in yielded_ds): # unless we reported on this dataset before if res['type'] == 'dataset': # make a record yielded_ds.append(res['path']) yield res if not (res['status'] == 'ok' and res['type'] == 'dataset'): # not a dataset that was just installed, we just reported it # upstairs, and can ignore it from now on continue # paranoia, so popular these days... assert GitRepo.is_valid_repo(res['path']) # keep a copy of the install record for `get` later on get_ap = {k: v for k, v in res.items() if not k == 'status'} get_ap['process_content'] = get_data rec_get.append(get_ap) if not get_data: # done already return # merge the two AP lists to_get.extend(rec_get) # sort into datasets content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_get, refds_path=refds_path) assert(not completed) # hand over to git-annex, get files content, # report files in git as 'notneeded' to get for ds_path in sorted(content_by_ds.keys()): ds = Dataset(ds_path) # grab content, ignore subdataset entries content = [ap['path'] for ap in content_by_ds[ds_path] if ap.get('type', None) != 'dataset' or ap['path'] == ds.path] if not content: # cut this short should there be nothing continue # needs to be an annex to get content if not isinstance(ds.repo, AnnexRepo): for r in results_from_paths( content, status='notneeded', message="no dataset annex, content already present", action='get', logger=lgr, refds=refds_path): yield r continue respath_by_status = {} for res in ds.repo.get( content, options=['--from=%s' % source] if source else [], jobs=jobs): res = annexjson2result(res, ds, type='file', logger=lgr, refds=refds_path) success = success_status_map[res['status']] # TODO: in case of some failed commands (e.g. get) there might # be no path in the record. 
yoh has only a vague idea of the logic # here, so we just check for the presence of 'path'; according to # results_from_annex_noinfo, such a record would otherwise be assumed to mean that # `content` was acquired successfully, which is not the case if 'path' in res: respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] yield res for r in results_from_annex_noinfo( ds, content, respath_by_status, dir_fail_msg='could not get some content in %s %s', noinfo_dir_msg='nothing to get from %s', noinfo_file_msg='already present', action='get', logger=lgr, refds=refds_path): yield r
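# Illustrative sketch (not DataLad code): a standalone rendering of the
# bookkeeping at the end of `get`. Per-file results coming back from the annex
# call are sorted by success, and any requested path that annex reported
# nothing about is assumed to be present already and yielded as 'notneeded'.
# `summarize_get` and the plain dicts are assumptions for this sketch, not the
# actual helpers used above.
def summarize_get(requested, annex_results):
    """`annex_results` is an iterable of dicts with 'path' and 'status' keys."""
    respath_by_status = {}
    for res in annex_results:
        success = res['status'] == 'ok'
        respath_by_status.setdefault(success, []).append(res['path'])
        yield res
    reported = {p for paths in respath_by_status.values() for p in paths}
    for path in requested:
        if path not in reported:
            # no news from annex is taken to mean the content was already here
            yield {'path': path, 'status': 'notneeded',
                   'message': 'already present'}


if __name__ == '__main__':
    for r in summarize_get(
            ['d1/a.dat', 'd1/b.dat', 'd1/c.dat'],
            [{'path': 'd1/a.dat', 'status': 'ok'},
             {'path': 'd1/b.dat', 'status': 'error'}]):
        print(r)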
def __call__( path=None, dataset=None, recursive=False, check=True, save=True, message=None, if_dirty='save-before'): res_kwargs = dict(action='remove', logger=lgr) if not dataset and not path: raise InsufficientArgumentsError( "insufficient information for `remove`: requires at least a path or dataset") refds_path = Interface.get_refds_path(dataset) res_kwargs['refds'] = refds_path if refds_path and not path and not GitRepo.is_valid_repo(refds_path): # nothing here, nothing to remove yield get_status_dict(path=refds_path, status='notneeded', **res_kwargs) return if refds_path and not path: # act on the whole dataset if nothing else was specified # TODO i think that would happen automatically in annotation? path = refds_path to_process = [] for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, # we only ever want to discover immediate subdatasets, the rest # will happen in `uninstall` recursion_limit=1, action='remove', unavailable_path_status='', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('state', None) == 'absent' and \ ap.get('parentds', None) is None: # nothing exists at location, and there is no parent to # remove from ap['status'] = 'notneeded' ap['message'] = "path does not exist and is not in a dataset" yield ap continue if ap.get('raw_input', False) and ap.get('type', None) == 'dataset': # make sure dataset sorting yields a dedicted entry for this one ap['process_content'] = True to_process.append(ap) if not to_process: # nothing left to do, potentially all errored before return if path_is_under([ap['path'] for ap in to_process]): # behave like `rm` and refuse to remove where we are raise ValueError( "refusing to uninstall current or parent directory") # now sort into datasets so we can process them one by one content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert(not completed) # iterate over all datasets, starting at the bottom # to make the removal of dataset content known upstairs to_save = [] # track which submodules we have removed in the process, to avoid # failure in case we revisit them due to a subsequent path argument subm_removed = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) paths = content_by_ds[ds_path] to_reporemove = dict() # PLAN any dataset that was not raw_input, uninstall (passing recursive flag) # if dataset itself is in paths, skip any nondataset # sort reverse so we get subdatasets first for ap in sorted(paths, key=lambda x: x['path'], reverse=True): if ap.get('type', None) == 'dataset': # entire dataset needs to go, uninstall if present, pass recursive! 
uninstall_failed = False if ap['path'] == refds_path or \ (refds_path is None and ap.get('raw_input', False)): # top-level handling, cannot use regular uninstall call, as # it will refuse to uninstall a top-level dataset # and rightfully so, it is really a remove in that case # bypass all the safety by using low-level helper for r in _uninstall_dataset(ds, check=check, has_super=False, **res_kwargs): if r['status'] in ('impossible', 'error'): # we need to inspect if something went wrong, in order # to prevent failure from removing a non-empty dir below, # but at the same time allow for continued processing uninstall_failed = True r['refds'] = refds_path yield r # recheck that it wasn't removed during a previous iteration elif ap.get('state', None) != 'absent' and GitRepo.is_valid_repo(ap['path']): # anything that is not the top-level -> regular uninstall # this is for subdatasets of the to-be-removed dataset # we want to simply uninstall them in a regular manner for r in Uninstall.__call__( # use annotate path as input, but pass a copy because # we cannot rely on it being unaltered by reannotation # TODO maybe adjust annotate_path to do that [ap.copy()], dataset=refds_path, recursive=recursive, check=check, if_dirty=if_dirty, result_xfm=None, result_filter=None, on_failure='ignore'): if r['status'] in ('impossible', 'error'): # we need to inspect if something went wrong, in order # to prevent failure from removing a non-empty dir below, # but at the same time allow for continued processing uninstall_failed = True yield r if not ap.get('raw_input', False): # we only ever want to actually unregister subdatasets that # were given explicitly continue if not uninstall_failed and \ not ap['path'] in subm_removed and \ refds_path and \ ap.get('parentds', None) and \ not (relpath(ap['path'], start=refds_path).startswith(pardir) or ap['path'] == refds_path) and \ ap.get('registered_subds', False): # strip from superdataset, but only if a dataset was given explcitly # as in "remove from this dataset", but not when just a path was given # as in "remove from the filesystem" subds_relpath = relpath(ap['path'], start=ap['parentds']) # remove submodule reference parentds = Dataset(ap['parentds']) # play safe, will fail on dirty parentds.repo.deinit_submodule(ap['path']) # remove now empty submodule link parentds.repo.remove(ap['path']) # make a record that we removed this already, should it be # revisited via another path argument, because do not reannotate # the paths after every removal subm_removed.append(ap['path']) yield dict(ap, status='ok', **res_kwargs) # need .gitmodules update in parent to_save.append(dict( path=opj(parentds.path, '.gitmodules'), parents=parentds.path, type='file')) # and the removal itself needs to be committed # inform `save` that it is OK that this path # doesn't exist on the filesystem anymore ap['unavailable_path_status'] = '' ap['process_content'] = False to_save.append(ap) if not uninstall_failed and exists(ap['path']): # could be an empty dir in case an already uninstalled subdataset # got removed rmdir(ap['path']) else: # anything that is not a dataset can simply be passed on to_reporemove[ap['path']] = ap # avoid unnecessary git calls when there is nothing to do if to_reporemove: if check and hasattr(ds.repo, 'drop'): for r in _drop_files(ds, list(to_reporemove), check=True): if r['status'] == 'error': # if drop errored on that path, we can't remove it to_reporemove.pop(r['path'], 'avoidKeyError') yield r if to_reporemove: for r in ds.repo.remove(list(to_reporemove), 
r=True): # these were removed, but we still need to save the # removal r_abs = opj(ds.path, r) if r_abs in to_reporemove: ap = to_reporemove[r_abs] else: ap = {'path': r_abs, 'parentds': ds.path, 'refds': refds_path } ap['unavailable_path_status'] = '' to_save.append(ap) yield get_status_dict( status='ok', path=r, **res_kwargs) if not to_save: # nothing left to do, potentially all errored before return if not save: lgr.debug('Not calling `save` as instructed') return for res in Save.__call__( # TODO compose hand-selected annotated paths path=to_save, # we might have removed the reference dataset by now, recheck dataset=refds_path if (refds_path and GitRepo.is_valid_repo(refds_path)) else None, message=message if message else '[DATALAD] removed content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
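# Illustrative sketch (not DataLad code): `remove` walks the affected datasets
# bottom-up by sorting their paths in reverse lexicographic order, so every
# subdataset is processed before the dataset that contains it and the removal
# can then be saved in the parent. The paths below are made up.
paths = ['/data/ds', '/data/ds/sub1', '/data/ds/sub1/deep', '/data/ds/sub2']
for p in sorted(paths, reverse=True):
    print('processing', p)    # deepest datasets come out first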
def __call__(path=None, dataset=None, recursive=False, recursion_limit=None): if path is None and dataset is None: raise InsufficientArgumentsError( "insufficient arguments for unlocking: needs at least " "a dataset or a path to unlock.") refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(action='unlock', logger=lgr, refds=refds_path) to_process = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='unlock', unavailable_path_status='impossible', unavailable_path_msg="path does not exist", nondataset_path_status='impossible', modified=None, return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', 'dataset') == 'dataset': # this is a dataset ap['process_content'] = True to_process.append(ap) content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path, path_only=False) assert (not completed) for ds_path in sorted(content_by_ds.keys()): ds = Dataset(ds_path) content = content_by_ds[ds_path] if not isinstance(ds.repo, AnnexRepo): for ap in content: ap['status'] = 'notneeded' ap['message'] = "not annex'ed, nothing to unlock" ap.update(res_kwargs) yield ap continue files = [ap['path'] for ap in content] for r in ds.repo.unlock(files): yield get_status_dict(path=r, status='ok', type='file', **res_kwargs)
def __call__(path=None, dataset=None, to=None, since=None, missing='fail', force=False, transfer_data='auto', recursive=False, recursion_limit=None, git_opts=None, annex_opts=None, annex_copy_opts=None, jobs=None): # if ever we get a mode, for "with-data" we would need this #if dataset and not path: # # act on the whole dataset if nothing else was specified # path = dataset.path if isinstance(dataset, Dataset) else dataset if not dataset and not path: # try to find a dataset in PWD dataset = require_dataset(None, check_installed=True, purpose='publishing') if since and not dataset: raise InsufficientArgumentsError( 'Modification detection (--since) without a base dataset ' 'is not supported') if dataset and since == '': # only update since last update so we figure out what was the last update active_branch = dataset.repo.get_active_branch() if to: # XXX here we assume one to one mapping of names from local branches # to the remote since = '%s/%s' % (to, active_branch) else: # take tracking remote for the active branch tracked_remote, tracked_refspec = dataset.repo.get_tracking_branch( ) if tracked_remote: if tracked_refspec.startswith('refs/heads/'): tracked_refspec = tracked_refspec[len('refs/heads/'):] #to = tracked_remote since = '%s/%s' % (tracked_remote, tracked_refspec) else: lgr.info( "No tracked remote for %s. since option is of no effect", active_branch) since = None # here is the plan # 1. figure out remote to publish to # 2. figure out which content needs to be published to this remote # 3. look for any pre-publication dependencies of that remote # (i.e. remotes that need to be published to before) # 4. publish the content needed to go to the primary remote to # the dependencies first, and to the primary afterwards ds_remote_info = {} refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(refds=refds_path, logger=lgr, action='publish') to_process = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='publish', unavailable_path_status='impossible', nondataset_path_status='error', modified=since, return_type='generator', on_failure='ignore', force_no_revision_change_discovery= False, # we cannot publish what was not committed force_untracked_discovery=False # we cannot publish untracked ): if ap.get('status', None): # this is done yield ap continue remote_info_result = None if ap.get('type', ap.get('type_src', 'dataset')) != 'dataset': # for everything that is not a dataset get the remote info # for the parent parentds = ap.get('parentds', None) if parentds and parentds not in ds_remote_info: remote_info_result = _get_remote_info( parentds, ds_remote_info, to, missing) else: # this is a dataset if ap.get('state', None) == 'absent': continue # get the remote info for itself remote_info_result = _get_remote_info(ap['path'], ds_remote_info, to, missing) ap['process_content'] = True if remote_info_result is not None: ap['status'] = remote_info_result[0] ap['message'] = remote_info_result[1] yield ap continue to_process.append(ap) content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert (not completed) lgr.debug("Evaluating %i dataset publication candidate(s)", len(content_by_ds)) # TODO: fancier sorting, so we still follow somewhat the hierarchy # in sorted order, e.g. 
# d1/sub1/sub1 # d1/sub1 # d1 # d2/sub1 # d2 content_by_ds = OrderedDict( (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True)) lgr.debug("Attempt to publish %i datasets", len(content_by_ds)) for ds_path in content_by_ds: remote_info = ds_remote_info.get(ds_path, None) if remote_info is None: # maybe this dataset wasn't annotated above, try to get info # MIH: I think this entire if-branch is practically impossible # to reach. It is certainly untested, but I think this is due # to mutually exclusive conditions during remote_info detection remote_info_result = _get_remote_info(ds_path, ds_remote_info, to, missing) if remote_info_result is not None: yield get_status_dict(type='dataset', path=ds_path, status=remote_info_result[0], message=remote_info_result[1], **res_kwargs) continue # continue with freshly obtained info remote_info = ds_remote_info[ds_path] # condition above must catch all other cases assert remote_info # and publish ds = Dataset(ds_path) for r in _publish_dataset( ds, remote=remote_info['remote'], refspec=remote_info.get('refspec', None), # only send paths that were explicitly requested paths= [ p for p in content_by_ds[ds_path] # do not feed (sub)dataset paths into the beast # makes no sense to try to annex copy them # for the base dataset itself let `transfer_data` # decide if p.get('type', None) != 'dataset' ], annex_copy_options=annex_copy_opts, force=force, jobs=jobs, transfer_data=transfer_data, **res_kwargs): yield r
def __call__( path=None, dataset=None, to=None, since=None, missing='fail', force=False, transfer_data='auto', recursive=False, recursion_limit=None, git_opts=None, annex_opts=None, annex_copy_opts=None, jobs=None ): # if ever we get a mode, for "with-data" we would need this #if dataset and not path: # # act on the whole dataset if nothing else was specified # path = dataset.path if isinstance(dataset, Dataset) else dataset if not dataset and not path: # try to find a dataset in PWD dataset = require_dataset( None, check_installed=True, purpose='publishing') if since and not dataset: raise InsufficientArgumentsError( 'Modification detection (--since) without a base dataset ' 'is not supported') if dataset and since == '': # only update since last update so we figure out what was the last update active_branch = dataset.repo.get_active_branch() if to: # XXX here we assume one to one mapping of names from local branches # to the remote since = '%s/%s' % (to, active_branch) else: # take tracking remote for the active branch tracked_remote, tracked_refspec = dataset.repo.get_tracking_branch() if tracked_remote: if tracked_refspec.startswith('refs/heads/'): tracked_refspec = tracked_refspec[len('refs/heads/'):] #to = tracked_remote since = '%s/%s' % (tracked_remote, tracked_refspec) else: lgr.info( "No tracked remote for %s. since option is of no effect", active_branch ) since = None # here is the plan # 1. figure out remote to publish to # 2. figure out which content needs to be published to this remote # 3. look for any pre-publication dependencies of that remote # (i.e. remotes that need to be published to before) # 4. publish the content needed to go to the primary remote to # the dependencies first, and to the primary afterwards ds_remote_info = {} refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(refds=refds_path, logger=lgr, action='publish') to_process = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='publish', unavailable_path_status='impossible', nondataset_path_status='error', modified=since, return_type='generator', on_failure='ignore', force_no_revision_change_discovery=False, # we cannot publish what was not committed force_untracked_discovery=False # we cannot publish untracked ): if ap.get('status', None): # this is done yield ap continue remote_info_result = None if ap.get('type', ap.get('type_src', 'dataset')) != 'dataset': # for everything that is not a dataset get the remote info # for the parent parentds = ap.get('parentds', None) if parentds and parentds not in ds_remote_info: remote_info_result = _get_remote_info( parentds, ds_remote_info, to, missing) else: # this is a dataset if ap.get('state', None) == 'absent': continue # get the remote info for itself remote_info_result = _get_remote_info( ap['path'], ds_remote_info, to, missing) ap['process_content'] = True if remote_info_result is not None: ap['status'] = remote_info_result[0] ap['message'] = remote_info_result[1] yield ap continue to_process.append(ap) content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert(not completed) lgr.debug( "Evaluating %i dataset publication candidate(s)", len(content_by_ds)) # TODO: fancier sorting, so we still follow somewhat the hierarchy # in sorted order, e.g. 
# d1/sub1/sub1 # d1/sub1 # d1 # d2/sub1 # d2 content_by_ds = OrderedDict( (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True) ) lgr.debug("Attempt to publish %i datasets", len(content_by_ds)) for ds_path in content_by_ds: remote_info = ds_remote_info.get(ds_path, None) if remote_info is None: # maybe this dataset wasn't annotated above, try to get info # MIH: I think this entire if-branch is practically impossible # to reach. It is certainly untested, but I think this is due # to mutually exclusive conditions during remote_info detection remote_info_result = _get_remote_info( ds_path, ds_remote_info, to, missing) if remote_info_result is not None: yield get_status_dict( type='dataset', path=ds_path, status=remote_info_result[0], message=remote_info_result[1], **res_kwargs) continue # continue with freshly obtained info remote_info = ds_remote_info[ds_path] # condition above must catch all other cases assert remote_info # and publish ds = Dataset(ds_path) for r in _publish_dataset( ds, remote=remote_info['remote'], refspec=remote_info.get('refspec', None), # only send paths that were explicitly requested paths=[p for p in content_by_ds[ds_path] # do not feed (sub)dataset paths into the beast # makes no sense to try to annex copy them # for the base dataset itself let `transfer_data` # decide if p.get('type', None) != 'dataset'], annex_copy_options=annex_copy_opts, force=force, jobs=jobs, transfer_data=transfer_data, **res_kwargs): yield r
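# Illustrative sketch (not DataLad code): when `--since` is given as an empty
# string, the publish code derives a default comparison reference, namely
# '<to>/<active branch>' if a target remote was named, otherwise the tracking
# remote of the active branch. The helper below is an assumption for
# illustration; in the real command the branch and tracking information come
# from the repository itself.
def default_since(active_branch, to=None, tracked_remote=None, tracked_refspec=None):
    if to:
        # assumes a one-to-one mapping of local branch names on the remote
        return '%s/%s' % (to, active_branch)
    if tracked_remote and tracked_refspec:
        if tracked_refspec.startswith('refs/heads/'):
            tracked_refspec = tracked_refspec[len('refs/heads/'):]
        return '%s/%s' % (tracked_remote, tracked_refspec)
    return None    # nothing usable, modification detection is disabled


if __name__ == '__main__':
    print(default_since('master', to='origin'))
    print(default_since('master', tracked_remote='upstream',
                        tracked_refspec='refs/heads/master'))
    print(default_since('master'))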
def __call__( path=None, dataset=None, # support passing this through in a path by path basis to_git=None, save=True, message=None, message_file=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path") refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) if message and message_file: raise ValueError("Both a message and message file were specified") if message_file: with open(message_file, "rb") as mfh: message = assure_unicode(mfh.read()) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='"there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('added', 'modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) if got_nothing: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict( 'add', status='notneeded', path=refds_path, type='dataset', logger=lgr) return for subds in ds_to_annotate_from_recursion: if subds not in subds_to_add: # always prefer the already annotated path subds_to_add[subds] = ds_to_annotate_from_recursion[subds] if dataset: # we have a base dataset, discover any intermediate datasets between # the base and any already discovered dataset discovered = {} discover_dataset_trace_to_targets( # from here dataset.path, # to any dataset we are aware of subds_to_add.keys(), [], 
discovered) for parentds in discovered: for subds in discovered[parentds]: subds_to_add[subds] = subds_to_add.get( subds, dict(path=subds, parentds=parentds, type='dataset')) # merge custom paths and discovered dataset records, paths needs to go first, # because we know most about then, and subsequent annotation call we skip the # later duplicate ones to_add.extend(subds_to_add.values()) # and compact, this should be OK as all the info is in each ap dict to_add = unique(to_add, lambda x: x['path']) if not to_add: # nothing left to do, potentially all errored before return # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, it will also deduplicate annotated_paths = AnnotatePaths.__call__( path=to_add, dataset=dataset, # never recursion, done already recursive=False, action='add', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert(not completed) if not content_by_ds: # we should have complained about any inappropriate path argument # above, so if nothing is left, we can simply exit return # simple loop over datasets -- save happens later # start deep down to_save = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) torepoadd = {} respath_by_status = {} for ap in content_by_ds[ds_path]: # we have a new story ap.pop('status', None) torepoadd[ap['path']] = ap # skip anything that doesn't look like a wannabe subdataset if not ap.get('type', None) == 'dataset' or \ ap['path'] == ds_path: continue if ap.get('registered_subds', False): # subdataset that might be in this list because of the # need to save all the way up to a super dataset respath_by_status['success'] = \ respath_by_status.get('success', []) + [ap['path']] yield get_status_dict( status='notneeded', message="already known subdataset", **dict(common_report, **ap)) continue subds = Dataset(ap['path']) subds_relpath = relpath(ap['path'], ds_path) # Register the repository in the repo tree as a submodule try: ds.repo.add_submodule(subds_relpath, url=None, name=None) except (CommandError, InvalidGitRepositoryError) as e: yield get_status_dict( ds=subds, status='error', message=e.stderr, **dict(common_report, **ap)) continue # queue for saving using the updated annotated path ap['registered_subds'] = True # I hope this is true in direct mode too # TODO this is disabled, because in some circumstances # staging just doesn't happen, and it is unclear when # exactly -- the case that prompted disabling was a submodule # that had no content except for other submodules was not staged, # whereas another submodule on the same level in the same # superdataset which also has one file in it was staged # disable to work correctly, while paying a little bit of # slow down #ap['staged'] = True to_save.append(ap) # report added subdatasets -- `annex add` below won't do it yield get_status_dict( ds=subds, status='ok', message='added new subdataset', **dict(common_report, **ap)) # make sure that .gitmodules is added to the list of files gitmodules_path = opj(ds.path, '.gitmodules') # for git torepoadd[gitmodules_path] = dict(path=gitmodules_path) # and for save to_save.append(dict( path=gitmodules_path, parentds=ds_path, type='file')) # make sure any last minute 
additions make it to the saving stage # XXX? should content_by_ds become OrderedDict so that possible # super here gets processed last? lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd) is_annex = isinstance(ds.repo, AnnexRepo) add_kw = {'jobs': jobs} if is_annex and jobs else {} added = ds.repo.add_( list(torepoadd.keys()), git=to_git if is_annex else True, **add_kw ) for a in added: res = annexjson2result(a, ds, type='file', **common_report) success = success_status_map[res['status']] respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] # produce best possible path/result annotation if res['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report res = dict(torepoadd[res['path']], **res) # override this in all cases to be safe res['parentds'] = ds.path if success: # this was successfully added, queue for saving this very path # in the dataset ap = {k: v for k, v in res.items() if k != 'status'} ap['staged'] = True # strip any status and state info (e.g. save will refuse to save # stuff that is marked state='untracked' to_save.append({k: v for k, v in res.items() if k not in ('status', 'state')}) if a['file'] == '.gitmodules': # filter out .gitmodules, because this is only included for # technical reasons and has nothing to do with the actual content continue if GitRepo.is_valid_repo(res['path']): # more accurate report in case of an added submodule # mountpoint. # XXX Actually not sure if this can really happen # (depends on what our low-level code would do) # but worst case is that we loose a little bit of # coverage... res['type'] = 'dataset' res['message'] = 'added new state as submodule' yield res for r in results_from_annex_noinfo( ds, torepoadd, respath_by_status, dir_fail_msg='could not add some content in %s %s', noinfo_dir_msg='nothing to add from %s', noinfo_file_msg='already included in the dataset', action='add', logger=lgr, refds=refds_path): if r['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report r = dict(r, **torepoadd[r['path']]) if r['status'] == 'notneeded': # this could be a file that was staged already, it doesn't need # to be added, but it should be saved/commited if so desired to_save.append({k: v for k, v in r.items() if k not in ('status', 'state')}) # XXX something is fishy with the next one, rethink when sober.... if r['path'] == ds_path and r['status'] == 'ok': # this is for the entire dataset itself which was explicitly requested # make sure to save all r['type'] = 'dataset' r['process_content'] = True to_save.append({k: v for k, v in r.items() if k != 'status'}) yield r if refds_path and ds_path != refds_path and len(respath_by_status.get('success', [])): # TODO XXX we have an issue here when with `add('.')` and annex ignores any # dotfiles. In this case we end up not saving a dataset completely, because # we rely on accurate reporting. 
there is an issue about this already # TODO look up the issue ID # if there is a base dataset, but we are below it, and we have anything done to this # dataset -> queue dataset itself for saving its state in the parent ds_ap = dict( path=ds.path, # we have to look for the parent here, as we must save the # subdataset in the parent and not the whole subdataset itself type='dataset') parentds = get_dataset_root(normpath(opj(ds.path, pardir))) if parentds: ds_ap['parentds'] = parentds if dataset: ds_ap['refds'] = refds_path to_save.append(ds_ap) if not save: lgr.debug('Not calling `save` as instructed') return # TODO tell save what was staged already! Set 'staged=True' for # respective annotated paths that are fed into `save` # do not reuse any of the sorting done in here for saving, but instead # pass on all the annotated paths to have `save` figure out what to do with # them -- this costs something, but should be safer, and frankly is # more comprehensible for res in Save.__call__( # hand-selected annotated paths path=to_save, dataset=refds_path, message=message if message else '[DATALAD] added content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
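# Illustrative sketch (not DataLad code): the result enrichment used by `add`.
# A raw per-file report coming back from the repository call is merged onto
# the original annotated-path record (when one exists), so the yielded result
# keeps the annotation (parent dataset, type, raw_input, ...) while the fresh
# status wins. Plain dicts stand in for the real records; `enrich_result` is a
# name made up for this sketch.
def enrich_result(raw, annotated_by_path):
    ap = annotated_by_path.get(raw['path'])
    return dict(ap, **raw) if ap else dict(raw)


if __name__ == '__main__':
    annotated = {'/ds/file.dat': {'path': '/ds/file.dat', 'parentds': '/ds',
                                  'type': 'file', 'raw_input': True}}
    print(enrich_result({'path': '/ds/file.dat', 'status': 'ok'}, annotated))
    print(enrich_result({'path': '/ds/new.dat', 'status': 'error'}, annotated))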
def __call__(message=None, path=None, dataset=None, all_updated=True, version_tag=None, recursive=False, recursion_limit=None, super_datasets=False, message_file=None ): if not dataset and not path: # we got nothing at all -> save what is staged in the repo in "this" directory? # make sure we don't treat this as a user-provided '.' argument path = [{'path': abspath(curdir), 'raw_input': False}] refds_path = Interface.get_refds_path(dataset) if message and message_file: raise ValueError("Both a message and message file were specified") if message_file: with open(message_file, "rb") as mfh: message = assure_unicode(mfh.read()) to_process = [] got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='save', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', modified='HEAD' if not path and recursive else None, return_type='generator', on_failure='ignore'): if ap.get('state', None) == 'untracked' and not ap.get('raw_input', False): # this path was found untracked, but not explicitly given to save # we will silently ignore this continue got_nothing = False # next check should not be done during annotation, as it is possibly expensive # and not generally useful if ap.get('status', None) == 'impossible' and \ ap.get('state', None) == 'absent' and \ ap.get('parentds', None): # this is not here anymore, but it might actually have been a deleted # component if relpath(ap['path'], start=ap['parentds']) \ in Dataset(ap['parentds']).repo.get_deleted_files(): # ok, this is a staged deletion that we want to save ap['status'] = '' del ap['message'] if ap.get('status', None): # this is done yield ap continue # for things like: `ds.save()` # or recursively discovered datasets if ap['path'] == refds_path or \ (ap.get('type', None) == 'dataset' and not ap.get('raw_input', False) and not ap.get('state', None) == 'absent'): ap['process_content'] = True ap['process_updated_only'] = all_updated to_process.append(ap) lgr.log(2, "save, to_process=%r", to_process) if got_nothing and recursive and refds_path: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict( 'save', status='notneeded', path=refds_path, type='dataset', logger=lgr) return if not to_process: # nothing left to do, potentially all errored before return if super_datasets: # search for the topmost superdatasets of any path dss = [Dataset(ap.get('parentds', ap['path'])) for ap in to_process] superdss = [ds.get_superdataset(topmost=True) for ds in dss] superdss = get_tree_roots( unique(ds.path for ds in dss + superdss if ds)) if dataset: # need to adjust the reference to the new superds # if we had one ref before, we should still have exactly one assert len(superdss) <= 1 dataset = list(superdss.keys())[0] refds_path = dataset elif refds_path: # there is a single superdataset superdss = { refds_path: unique([ap['parentds'] for ap in to_process if 'parentds' in ap])} else: # sort all datasets under their potential superdatasets # start from the top to get all subdatasets down the line # and collate them into as few superdatasets as possible # this is quick, just string operations superdss = get_tree_roots( unique([ap['parentds'] for ap in to_process if 'parentds' in ap])) # for each "superdataset" check the tree of subdatasets and make sure # we gather all datasets between the super and 
any subdataset # so we can save them all bottom-up in order to be able to properly # save the superdataset # if this is called from e.g. `add` this is actually not necessary, # but in the general case we cannot avoid it # TODO maybe introduce a switch? discovered = {} for superds_path in superdss: target_subs = superdss[superds_path] discover_dataset_trace_to_targets( # from here superds_path, # to all target_subs, [], discovered) # create a new minimally annotated path for each discovered dataset discovered_added = set() for parentds in discovered: for subds in discovered[parentds]: to_process.append(dict( path=subds, parentds=parentds, type='dataset')) discovered_added.add(subds) # make sure we have an entry for each dataset, including those # tha are just parents for parentds in discovered: if parentds not in discovered_added: to_process.append(dict( path=parentds, type='dataset', # make sure we save content of superds later on process_content=True, # but not do nasty things, like adding untracked content # just because we discovered this dataset process_updated_only=True)) # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, deduplication happens here too annotated_paths = AnnotatePaths.__call__( path=to_process, dataset=dataset, # never recursion, done already recursive=False, action='save', unavailable_path_status='', nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') # now sort into datasets so we can process them one by one content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert(not completed) # iterate over all datasets, starting at the bottom for dspath in sorted(content_by_ds.keys(), reverse=True): ds = Dataset(dspath) res = get_status_dict('save', ds=ds, logger=lgr) if not ds.is_installed(): # TODO This is likely impossible now res['status'] = 'impossible' res['message'] = ('dataset %s is not installed', ds) yield res continue saved_state = save_dataset( ds, content_by_ds[dspath], message=message) res['status'] = 'ok' if saved_state else 'notneeded' # MIH: let's tag even if there was nothing commit. I'd forget this # option too often... if version_tag: try: # TODO: check whether comment below is still true after # removing the log swallowing: # again cannot help but force-silence low-level code, because # it screams like a made man instead of allowing top-level # code an orderly error report ds.repo.tag(version_tag) # even if we haven't saved anything res['status'] = 'ok' yield res except CommandError as e: if saved_state: # first we yield the result for the actual save yield res # and now complain that tagging didn't work yield get_status_dict( 'save', ds=ds, logger=lgr, status='error', message=( 'cannot tag this version: %s', e.stderr.strip())) else: yield res
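# Illustrative sketch (not DataLad code): the superdataset handling in `save`
# collates datasets under their topmost root before anything is committed, so
# intermediate datasets can be discovered and saved bottom-up. The purely
# string-based grouping below plays the role of `get_tree_roots` as used
# above; the function name, return layout, and example paths are assumptions
# for illustration.
import os


def tree_roots(paths):
    roots = {}
    for p in sorted(paths):                  # parents sort before their children
        for root in roots:
            if p == root or p.startswith(root + os.sep):
                roots[root].append(p)
                break
        else:
            roots[p] = []                    # a new topmost dataset
    return roots


if __name__ == '__main__':
    print(tree_roots(['/ds', '/ds/sub1', '/ds/sub1/deep', '/other/ds']))
    # -> {'/ds': ['/ds/sub1', '/ds/sub1/deep'], '/other/ds': []}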