def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        check=True,
        if_dirty='save-before'):
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = dataset.path if isinstance(dataset, Dataset) else dataset
    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive,
        recursion_limit=recursion_limit)
    handle_dirty_datasets(
        content_by_ds.keys(), mode=if_dirty, base=dataset)
    results = []
    # iterate over all datasets, order doesn't matter
    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        paths = content_by_ds[ds_path]
        res = _drop_files(ds, paths, check=check)
        results.extend(res)
    # there is nothing to save at the end
    return results
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        check=True,
        if_dirty='save-before'):
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = dataset.path if isinstance(dataset, Dataset) else dataset
    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive)
    if unavailable_paths:
        lgr.warning('ignored non-installed paths: %s', unavailable_paths)
    # upfront sanity and compliance checks
    if path_is_under(content_by_ds.keys()):
        # behave like `rm` and refuse to remove where we are
        raise ValueError(
            "refusing to uninstall current or parent directory")
    # check that we have no top-level datasets and no files to process
    args_ok = True
    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        paths = content_by_ds[ds_path]
        if ds_path not in paths:
            lgr.error(
                "will not act on files at %s (consider the `drop` command)",
                paths)
            args_ok = False
        if not ds.get_superdataset(
                datalad_only=False,
                topmost=False):
            lgr.error(
                "will not uninstall top-level dataset at %s (consider the `remove` command)",
                ds.path)
            args_ok = False
    if not args_ok:
        raise ValueError(
            'inappropriate arguments, see previous error message(s)')
    handle_dirty_datasets(
        content_by_ds, mode=if_dirty, base=dataset)
    results = []
    # iterate over all datasets, starting at the bottom
    # to deinit contained submodules first
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        paths = content_by_ds[ds_path]
        results.extend(
            # we confirmed the super dataset presence above
            _uninstall_dataset(ds, check=check, has_super=True))
    # there is nothing to save at the end
    return results
def __call__(
        path=None,
        name=None,
        merge=False,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        fetch_all=False,
        reobtain_data=False):
    """
    """
    if reobtain_data:
        # TODO: properly define, what to do
        raise NotImplementedError("TODO: Option '--reobtain-data' not "
                                  "implemented yet.")
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = dataset.path if isinstance(dataset, Dataset) else dataset
    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive,
        recursion_limit=recursion_limit)
    # TODO: check parsed inputs if any paths within a dataset were given
    # and issue a message that we will update the associated dataset as a whole
    # or fail -- see #1185 for a potential discussion
    results = []
    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        repo = ds.repo
        # get all remotes which have references (would exclude
        # special remotes)
        remotes = repo.get_remotes(with_refs_only=True)
        if not remotes:
            lgr.debug("No siblings known to dataset at %s\nSkipping",
                      repo.path)
            continue
        if name and name not in remotes:
            lgr.warning("'%s' not known to dataset %s\nSkipping",
                        name, repo.path)
            continue
        # Currently '--merge' works for single remote only:
        # TODO: - condition still incomplete
        #       - We can merge if a remote was given or there is a
        #         tracking branch
        #       - we also can fetch all remotes independently of whether or
        #         not we merge a certain remote
        if not name and len(remotes) > 1 and merge:
            lgr.debug("Found multiple remotes:\n%s" % remotes)
            raise NotImplementedError("No merge strategy for multiple "
                                      "remotes implemented yet.")
        lgr.info("Updating dataset '%s' ..." % repo.path)
        _update_repo(repo, name, merge, fetch_all)
def __call__(
        path=None,
        dataset=None,
        revision='HEAD',
        staged=False,
        ignore_subdatasets='none',
        recursive=False,
        recursion_limit=None):
    if not dataset and not path:
        # act on the whole dataset if nothing else was specified
        dataset = curdir
    refds_path = Interface.get_refds_path(dataset)
    if not (refds_path or path):
        raise InsufficientArgumentsError(
            "Neither dataset nor target path(s) provided")
    to_process = []
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='diff',
            # unavailable is OK, because we might query for a deleted file
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # we know what to report already
            yield ap
            continue
        if ap.get('type', None) == 'dataset':
            ap['process_content'] = True
        to_process.append(ap)
    # sort into datasets
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_process,
            refds_path=refds_path,
            path_only=False)
    assert not completed
    for ds_path in sorted(content_by_ds.keys()):
        for r in _parse_git_diff(
                ds_path,
                diff_thingie=revision,
                paths=content_by_ds[ds_path],
                ignore_submodules=ignore_subdatasets,
                staged=staged):
            r.update(dict(action='diff', refds=refds_path), logger=lgr)
            if 'status' not in r:
                r['status'] = 'ok'
            yield r
def __call__(path=None, dataset=None, get_aggregates=False,
             reporton='all', recursive=False):
    # prep results
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='metadata', logger=lgr)
    if refds_path:
        res_kwargs['refds'] = refds_path

    if get_aggregates:
        # yield all datasets for which we have aggregated metadata as results
        # these are actual dataset results, so we can turn them into dataset
        # instances using generic top-level code if desired
        ds = require_dataset(
            refds_path,
            check_installed=True,
            purpose='aggregate metadata query')
        info_fpath = opj(ds.path, agginfo_relpath)
        if not exists(info_fpath):
            # if there has ever been an aggregation run, this file would
            # exist, hence there has not been and we need to tell this
            # to people
            yield get_status_dict(
                ds=ds,
                status='impossible',
                action='metadata',
                logger=lgr,
                message='metadata aggregation has never been performed in this dataset')
            return
        agginfos = _load_json_object(info_fpath)
        parentds = []
        for sd in sorted(agginfos):
            info = agginfos[sd]
            dspath = normpath(opj(ds.path, sd))
            if parentds and not path_is_subpath(dspath, parentds[-1]):
                parentds.pop()
            info.update(
                path=dspath,
                type='dataset',
                status='ok',
            )
            if sd == curdir:
                info['layout_version'] = aggregate_layout_version
            if parentds:
                info['parentds'] = parentds[-1]
            yield dict(info, **res_kwargs)
            parentds.append(dspath)
        return

    if not dataset and not path:
        # makes no sense to have no dataset, go with "here"
        # error generation happens during annotation
        path = curdir

    content_by_ds = OrderedDict()
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            # MIH: we are querying the aggregated metadata anyways, and that
            # mechanism has its own, faster way to go down the hierarchy
            #recursive=recursive,
            #recursion_limit=recursion_limit,
            action='metadata',
            # uninstalled subdatasets could be queried via aggregated metadata
            # -> no 'error'
            unavailable_path_status='',
            nondataset_path_status='error',
            # we need to know when to look into aggregated data
            force_subds_discovery=True,
            force_parentds_discovery=True,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
            ap['process_content'] = True
        to_query = None
        if ap.get('state', None) == 'absent' or \
                ap.get('type', 'dataset') != 'dataset':
            # this is a lonely absent dataset/file or content in a present dataset
            # -> query through parent
            # there must be a parent, otherwise this would be a non-dataset path
            # and would have errored during annotation
            to_query = ap['parentds']
        else:
            to_query = ap['path']
        if to_query:
            pcontent = content_by_ds.get(to_query, [])
            pcontent.append(ap)
            content_by_ds[to_query] = pcontent

    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        query_agg = [
            ap for ap in content_by_ds[ds_path]
            # this is an available subdataset, will be processed in another
            # iteration
            if ap.get('state', None) == 'absent'
            or not (ap.get('type', None) == 'dataset' and ap['path'] != ds_path)]
        if not query_agg:
            continue
        # report from aggregated metadata
        for r in query_aggregated_metadata(
                reporton,
                # by default query the reference dataset, only if there is none
                # try our luck in the dataset that contains the queried path
                # this is consistent with e.g. `get_aggregates` reporting the
                # situation in the reference dataset only
                Dataset(refds_path) if refds_path else ds,
                query_agg,
                # recursion above could only recurse into datasets
                # on the filesystem, but there might be any number of
                # uninstalled datasets underneath the last installed one
                # for which we might have metadata
                recursive=recursive,
                **res_kwargs):
            yield r
    return
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        check=True,
        if_dirty='save-before'):
    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `drop`: requires at least a path or dataset")
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path
    to_drop = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='drop',
            # justification for status:
            # content need not be dropped where there is none
            unavailable_path_status='notneeded',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset' and \
                GitRepo.is_valid_repo(ap['path']) and \
                not ap['path'] == refds_path:
            ap['process_content'] = True
        if ap.get('registered_subds', False) and ap.get('state', None) == 'absent':
            # nothing to drop in an absent subdataset, don't be annoying
            # and skip silently
            continue
        to_drop.append(ap)
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_drop,
            refds_path=refds_path)
    assert not completed
    # iterate over all datasets, order doesn't matter
    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        # TODO generator
        # this should yield what it did
        handle_dirty_dataset(ds, mode=if_dirty)
        # ignore submodule entries
        content = [
            ap['path'] for ap in content_by_ds[ds_path]
            if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
        if not content:
            continue
        for r in _drop_files(ds, content, check=check, **res_kwargs):
            yield r
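# NOTE (editorial sketch, not part of the original module): the generators
# above yield plain dict result records which, as constructed via
# get_status_dict()/dict(ap, ...), carry at least 'status', 'action', and
# 'path' keys. A minimal, hypothetical helper showing how a caller might
# tally such records by status:
from collections import Counter

def tally_result_statuses(results):
    """Count result records per status ('ok', 'notneeded', 'impossible', 'error')."""
    counts = Counter()
    for res in results:
        counts[res.get('status', 'unknown')] += 1
    return counts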
def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=False,
        #git_opts=None,
        #annex_opts=None,
        #annex_get_opts=None,
        jobs='auto',
        verbose=False,
):
    # IMPLEMENTATION CONCEPT:
    #
    # 1. Sort the world into existing handles and the rest
    # 2. Try to locate missing handles (obtain subdatasets along the way)
    # 3. Expand into subdatasets with recursion enabled (potentially
    #    obtain even more subdatasets)
    # 4. Shoot info of which handles to get in each subdataset to
    #    git-annex, once at the very end
    refds_path = Interface.get_refds_path(dataset)
    if not (dataset or path):
        raise InsufficientArgumentsError(
            "Neither dataset nor target path(s) provided")
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path
    # remember which results we already reported, to avoid duplicates
    yielded_ds = []
    to_get = []
    unavailable_paths = []
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='get',
            # NOTE: Do not act upon unavailable paths yet! Done below after
            # testing which ones could be obtained
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # we know what to report already
            yield ap
            continue
        if ap.get('state', None) == 'absent' and ap.get('raw_input', False):
            # if this wasn't found, but directly requested, queue for further
            # exploration
            unavailable_paths.append(ap)
            continue
        if ap.get('type', None) == 'dataset' and \
                GitRepo.is_valid_repo(ap['path']) and \
                not ap['path'] == refds_path:
            # do not report what hasn't arrived yet
            # also do not report the base dataset that is already
            # present -- no surprise
            yield dict(ap, status='notneeded', logger=lgr,
                       message='already installed')
            yielded_ds.append(ap['path'])
            ap['process_content'] = get_data
        to_get.append(ap)
    # explore the unknown
    for ap in sorted(unavailable_paths, key=lambda x: x['path']):
        lgr.debug("Investigate yet unavailable path %s", ap)
        # how close can we get?
        dspath = ap.get('parentds', get_dataset_root(ap['path']))
        if dspath is None:
            # nothing we can do for this path
            continue
        lgr.debug("Found containing dataset %s for path %s", dspath, ap['path'])
        ds = Dataset(dspath)
        # now actually obtain whatever is necessary to get to this path
        containing_ds = [dspath]
        for res in _install_necessary_subdatasets(
                ds, ap['path'], reckless, refds_path, description=description):
            # yield immediately so errors could be acted upon outside, before
            # we continue
            if not (res['type'] == 'dataset' and res['path'] in yielded_ds):
                # unless we reported on this dataset before
                if res['type'] == 'dataset':
                    # make a record, recursive below might now want to report
                    # a 'notneeded'
                    yielded_ds.append(res['path'])
                yield res
            # update to the current innermost dataset
            containing_ds.append(res['path'])
        if len(containing_ds) < 2:
            # no subdataset was installed, hence if the path was unavailable
            # before it still is, no need to bother git annex
            ap.update(status='impossible', message='path does not exist')
            yield ap
            continue
        # important to only do the next for the innermost subdataset
        # as the `recursive` logic below relies on that!
        # set the correct parent, for a dataset this would be the second-last
        # reported subdataset
        ap.update(parentds=containing_ds[-1])
        if containing_ds[-1] == ap['path']:
            # the path actually refers to the last installed dataset
            ap.update(parentds=containing_ds[-2],
                      process_content=get_data,
                      type='dataset')
        to_get.append(ap)
    # results of recursive installation of yet undiscovered datasets
    rec_get = []
    if recursive and not recursion_limit == 'existing':
        # obtain any subdatasets underneath the paths given inside the
        # subdatasets that we know already exist
        # unless we do not want recursion into not-yet-installed datasets
        for ap in sorted(to_get, key=lambda x: x['path']):
            if ap['type'] not in ('dataset', 'directory') or not ap.get('raw_input', False):
                # a non-directory cannot have content underneath
                # also we do NOT want to recurse into anything that was specifically
                # requested, to avoid duplication
                continue
            subds = Dataset(ap['path'] if ap['type'] == 'dataset' else ap['parentds'])
            lgr.info(
                "Installing %s%s recursively",
                subds,
                (" underneath %s" % ap['path']
                 if subds.path != ap['path'] else ""))
            for res in _recursive_install_subds_underneath(
                    subds,
                    # `ap['path']` was explicitly given as input
                    # we count recursions from the input, hence we
                    # can start with the full number
                    recursion_limit,
                    reckless,
                    start=ap['path'],
                    refds_path=refds_path,
                    description=description):
                # yield immediately so errors could be acted upon
                # outside, before we continue
                if not (res['type'] == 'dataset' and res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record
                        yielded_ds.append(res['path'])
                    yield res
                if not (res['status'] == 'ok' and res['type'] == 'dataset'):
                    # not a dataset that was just installed, we just reported it
                    # upstairs, and can ignore it from now on
                    continue
                # paranoia, so popular these days...
                assert GitRepo.is_valid_repo(res['path'])
                # keep a copy of the install record for `get` later on
                get_ap = {k: v for k, v in res.items() if not k == 'status'}
                get_ap['process_content'] = get_data
                rec_get.append(get_ap)
    if not get_data:
        # done already
        return
    # merge the two AP lists
    to_get.extend(rec_get)
    # sort into datasets
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_get,
            refds_path=refds_path)
    assert not completed
    # hand over to git-annex, get files content,
    # report files in git as 'notneeded' to get
    for ds_path in sorted(content_by_ds.keys()):
        ds = Dataset(ds_path)
        # grab content, ignore subdataset entries
        content = [ap['path'] for ap in content_by_ds[ds_path]
                   if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
        if not content:
            # cut this short should there be nothing
            continue
        # needs to be an annex to get content
        if not isinstance(ds.repo, AnnexRepo):
            for r in results_from_paths(
                    content, status='notneeded',
                    message="no dataset annex, content already present",
                    action='get', logger=lgr,
                    refds=refds_path):
                yield r
            continue
        respath_by_status = {}
        for res in ds.repo.get(
                content,
                options=['--from=%s' % source] if source else [],
                jobs=jobs):
            res = annexjson2result(res, ds, type='file', logger=lgr,
                                   refds=refds_path)
            success = success_status_map[res['status']]
            # TODO: in case of some failed commands (e.g. get) there might
            # be no path in the record. yoh has only a vague idea of the logic
            # here, so just check for having 'path'; but according to
            # results_from_annex_noinfo, it would then be assumed that
            # `content` was acquired successfully, which is not the case
            if 'path' in res:
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
            yield res
        for r in results_from_annex_noinfo(
                ds, content, respath_by_status,
                dir_fail_msg='could not get some content in %s %s',
                noinfo_dir_msg='nothing to get from %s',
                noinfo_file_msg='already present',
                action='get', logger=lgr,
                refds=refds_path):
            yield r
def __call__(
        path=None,
        dataset=None,
        to=None,
        since=None,
        missing='fail',
        force=False,
        recursive=False,
        recursion_limit=None,
        git_opts=None,
        annex_opts=None,
        annex_copy_opts=None,
        jobs=None):
    # if ever we get a mode, for "with-data" we would need this
    #if dataset and not path:
    #    # act on the whole dataset if nothing else was specified
    #    path = dataset.path if isinstance(dataset, Dataset) else dataset
    if not dataset and not path:
        # try to find a dataset in PWD
        dataset = require_dataset(
            None, check_installed=True, purpose='publishing')
    if since and not dataset:
        raise InsufficientArgumentsError(
            'Modification detection (--since) without a base dataset '
            'is not supported')

    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive,
        recursion_limit=recursion_limit,
        # we do not want this command to state that we want to publish
        # content by default by assigning paths for each sub-dataset
        # automagically. But if paths were provided -- sorting would
        # happen to point only to the submodules under those paths, and
        # then to stay consistent we want to copy those paths' data
        sub_paths=bool(path))
    if unavailable_paths:
        raise ValueError(
            'cannot publish content that is not available locally: %s'
            % ', '.join(unavailable_paths))

    # here is the plan
    # 1. figure out remote to publish to
    # 2. figure out which content needs to be published to this remote
    # 3. look for any pre-publication dependencies of that remote
    #    (i.e. remotes that need to be published to before)
    # 4. publish the content needed to go to the primary remote to
    #    the dependencies first, and to the primary afterwards
    ds_remote_info = {}

    lgr.debug("Evaluating %i dataset publication candidate(s)",
              len(content_by_ds))
    # TODO: fancier sorting, so we still follow somewhat the hierarchy
    #       in sorted order, e.g.
    #  d1/sub1/sub1
    #  d1/sub1
    #  d1
    #  d2/sub1
    #  d2
    content_by_ds = OrderedDict(
        (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True))

    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        if to is None:
            # we need an upstream remote, if there's none given. We could
            # wait for git push to complain, but we need to explicitly
            # figure it out for pushing annex branch anyway and we might as
            # well fail right here.
            track_remote, track_refspec = ds.repo.get_tracking_branch()
            if not track_remote:
                # no tracking remote configured, but let's try once more:
                # if we only have one remote, and it has a push target
                # configured that is "good enough" for us
                cand_remotes = [r for r in ds.repo.get_remotes()
                                if 'remote.{}.push'.format(r) in ds.config]
                if len(cand_remotes) > 1:
                    lgr.warning(
                        'Target sibling ambiguous, please specify via --to')
                elif len(cand_remotes) == 1:
                    track_remote = cand_remotes[0]
                else:
                    lgr.warning(
                        'No target sibling configured for default publication, '
                        'please specify via --to')
            if track_remote:
                ds_remote_info[ds_path] = dict(
                    zip(('remote', 'refspec'),
                        (track_remote, track_refspec)))
            elif missing == 'skip':
                lgr.warning(
                    'Cannot determine target sibling, skipping %s', ds)
                ds_remote_info[ds_path] = None
            else:
                # we have no remote given and no upstream => fail
                raise InsufficientArgumentsError(
                    'Cannot determine target sibling for %s' % (ds,))
        elif to not in ds.repo.get_remotes():
            # unknown given remote
            if missing == 'skip':
                lgr.warning(
                    "Unknown target sibling '%s', skipping %s", to, ds)
                ds_remote_info[ds_path] = None
            elif missing == 'inherit':
                superds = ds.get_superdataset()
                if not superds:
                    raise RuntimeError(
                        "%s has no super-dataset to inherit settings for the remote %s"
                        % (ds, to))
                # XXX due to difference between create-sibling and create-sibling-github
                # would not be as transparent to inherit for -github
                lgr.info(
                    "Will try to create a sibling inheriting settings from %s",
                    superds)
                # XXX explicit None as sshurl for now
                ds.create_sibling(None, name=to, inherit=True)
                ds_remote_info[ds_path] = {'remote': to}
            else:
                raise ValueError(
                    "Unknown target sibling '%s' for %s" % (to, ds))
        else:
            # all good: remote given and is known
            ds_remote_info[ds_path] = {'remote': to}

    if dataset and since:
        # remove all unmodified components from the spec
        lgr.debug("Testing %i dataset(s) for modifications since '%s'",
                  len(content_by_ds), since)
        content_by_ds = filter_unmodified(content_by_ds, dataset, since)

    lgr.debug("Attempt to publish %i datasets", len(content_by_ds))
    published, skipped = [], []
    for ds_path in content_by_ds:
        remote_info = ds_remote_info[ds_path]
        if not remote_info:
            # in case we are skipping
            lgr.debug("Skipping dataset at '%s'", ds_path)
            continue
        # and publish
        ds = Dataset(ds_path)
        pblsh, skp = _publish_dataset(
            ds,
            remote=remote_info['remote'],
            refspec=remote_info.get('refspec', None),
            paths=content_by_ds[ds_path],
            annex_copy_options=annex_copy_opts,
            force=force,
            jobs=jobs)
        published.extend(pblsh)
        skipped.extend(skp)
    return published, skipped
def __call__(path=None, sibling=None, merge=False, dataset=None,
             recursive=False, recursion_limit=None, fetch_all=False,
             reobtain_data=False):
    """
    """
    if not dataset and not path:
        # try to find a dataset in PWD
        dataset = require_dataset(
            None, check_installed=True, purpose='updating')
    refds_path = Interface.get_refds_path(dataset)
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='update',
            unavailable_path_status='impossible',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if not ap.get('type', None) == 'dataset':
            ap.update(
                status='impossible',
                message="can only update datasets")
            yield ap
            continue
        # this is definitely a dataset from here on
        ds = Dataset(ap['path'])
        if not ds.is_installed():
            lgr.debug("Skipping update since not installed %s", ds)
            continue
        repo = ds.repo
        # prepare return value
        # TODO reuse AP for return props
        res = get_status_dict('update', ds=ds, logger=lgr, refds=refds_path)
        # get all remotes which have references (would exclude
        # special remotes)
        remotes = repo.get_remotes(
            **({'exclude_special_remotes': True}
               if isinstance(repo, AnnexRepo) else {}))
        if not remotes:
            res['message'] = ("No siblings known to dataset at %s\nSkipping",
                              repo.path)
            res['status'] = 'notneeded'
            yield res
            continue
        if not sibling:
            # nothing given, look for tracking branch
            sibling_ = repo.get_tracking_branch()[0]
        else:
            sibling_ = sibling
        if sibling_ and sibling_ not in remotes:
            res['message'] = ("'%s' not known to dataset %s\nSkipping",
                              sibling_, repo.path)
            res['status'] = 'impossible'
            yield res
            continue
        if not sibling_ and len(remotes) == 1:
            # there is only one remote, must be this one
            sibling_ = remotes[0]
        if not sibling_ and len(remotes) > 1 and merge:
            lgr.debug("Found multiple siblings:\n%s" % remotes)
            res['status'] = 'impossible'
            res['error'] = NotImplementedError(
                "Multiple siblings, please specify from which to update.")
            yield res
            continue
        lgr.info("Updating dataset '%s' ..." % repo.path)
        # fetch remote
        fetch_kwargs = dict(
            remote=None if fetch_all else sibling_,
            all_=fetch_all,
            prune=True)  # prune to not accumulate a mess over time
        try:
            repo.fetch(**fetch_kwargs)
        except BadName:  # pragma: no cover
            # Workaround for
            # https://github.com/gitpython-developers/GitPython/issues/768
            # also see https://github.com/datalad/datalad/issues/2550
            # Let's try to precommit (to flush anything flushable) and do
            # it again
            repo.precommit()
            repo.fetch(**fetch_kwargs)
        # NOTE if any further access to `repo` is needed, reevaluate
        # ds.repo again, as it might have been converted from a GitRepo
        # to an AnnexRepo
        if merge:
            for fr in _update_repo(ds, sibling_, reobtain_data):
                yield fr
        res['status'] = 'ok'
        yield res
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        check=True,
        save=True,
        message=None,
        if_dirty='save-before'):
    res_kwargs = dict(action='remove', logger=lgr)
    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `remove`: requires at least a path or dataset")
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs['refds'] = refds_path
    if refds_path and not path and not GitRepo.is_valid_repo(refds_path):
        # nothing here, nothing to remove
        yield get_status_dict(path=refds_path, status='notneeded', **res_kwargs)
        return
    if refds_path and not path:
        # act on the whole dataset if nothing else was specified
        # TODO i think that would happen automatically in annotation?
        path = refds_path

    to_process = []

    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            # we only ever want to discover immediate subdatasets, the rest
            # will happen in `uninstall`
            recursion_limit=1,
            action='remove',
            unavailable_path_status='',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('state', None) == 'absent' and \
                ap.get('parentds', None) is None:
            # nothing exists at location, and there is no parent to
            # remove from
            ap['status'] = 'notneeded'
            ap['message'] = "path does not exist and is not in a dataset"
            yield ap
            continue
        if ap.get('raw_input', False) and ap.get('type', None) == 'dataset':
            # make sure dataset sorting yields a dedicated entry for this one
            ap['process_content'] = True
        to_process.append(ap)

    if not to_process:
        # nothing left to do, potentially all errored before
        return

    if path_is_under([ap['path'] for ap in to_process]):
        # behave like `rm` and refuse to remove where we are
        raise ValueError(
            "refusing to uninstall current or parent directory")

    # now sort into datasets so we can process them one by one
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_process,
            refds_path=refds_path)
    assert not completed

    # iterate over all datasets, starting at the bottom
    # to make the removal of dataset content known upstairs
    to_save = []
    # track which submodules we have removed in the process, to avoid
    # failure in case we revisit them due to a subsequent path argument
    subm_removed = []
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        paths = content_by_ds[ds_path]
        to_reporemove = dict()
        # PLAN any dataset that was not raw_input, uninstall (passing recursive flag)
        #      if dataset itself is in paths, skip any nondataset
        # sort reverse so we get subdatasets first
        for ap in sorted(paths, key=lambda x: x['path'], reverse=True):
            if ap.get('type', None) == 'dataset':
                # entire dataset needs to go, uninstall if present, pass recursive!
                uninstall_failed = False
                if ap['path'] == refds_path or \
                        (refds_path is None and ap.get('raw_input', False)):
                    # top-level handling, cannot use regular uninstall call, as
                    # it will refuse to uninstall a top-level dataset
                    # and rightfully so, it is really a remove in that case
                    # bypass all the safety by using low-level helper
                    for r in _uninstall_dataset(ds, check=check, has_super=False,
                                                **res_kwargs):
                        if r['status'] in ('impossible', 'error'):
                            # we need to inspect if something went wrong, in order
                            # to prevent failure from removing a non-empty dir below,
                            # but at the same time allow for continued processing
                            uninstall_failed = True
                        r['refds'] = refds_path
                        yield r
                # recheck that it wasn't removed during a previous iteration
                elif ap.get('state', None) != 'absent' and GitRepo.is_valid_repo(ap['path']):
                    # anything that is not the top-level -> regular uninstall
                    # this is for subdatasets of the to-be-removed dataset
                    # we want to simply uninstall them in a regular manner
                    for r in Uninstall.__call__(
                            ap['path'], dataset=refds_path, recursive=recursive,
                            check=check, if_dirty=if_dirty,
                            result_xfm=None, result_filter=None,
                            on_failure='ignore'):
                        if r['status'] in ('impossible', 'error'):
                            # we need to inspect if something went wrong, in order
                            # to prevent failure from removing a non-empty dir below,
                            # but at the same time allow for continued processing
                            uninstall_failed = True
                        yield r
                if not ap.get('raw_input', False):
                    # we only ever want to actually unregister subdatasets that
                    # were given explicitly
                    continue
                if not uninstall_failed and \
                        not ap['path'] in subm_removed and \
                        refds_path and \
                        ap.get('parentds', None) and \
                        not (relpath(ap['path'], start=refds_path).startswith(pardir) or
                             ap['path'] == refds_path) and \
                        ap.get('registered_subds', False):
                    # strip from superdataset, but only if a dataset was given explicitly
                    # as in "remove from this dataset", but not when just a path was given
                    # as in "remove from the filesystem"
                    subds_relpath = relpath(ap['path'], start=ap['parentds'])
                    # remove submodule reference
                    parentds = Dataset(ap['parentds'])
                    # play safe, will fail on dirty
                    parentds.repo.deinit_submodule(ap['path'])
                    # remove now empty submodule link
                    parentds.repo.remove(ap['path'])
                    # make a record that we removed this already, should it be
                    # revisited via another path argument, because we do not reannotate
                    # the paths after every removal
                    subm_removed.append(ap['path'])
                    yield dict(ap, status='ok', **res_kwargs)
                    # need .gitmodules update in parent
                    to_save.append(dict(
                        path=opj(parentds.path, '.gitmodules'),
                        parents=parentds.path,
                        type='file'))
                    # and the removal itself needs to be committed
                    # inform `save` that it is OK that this path
                    # doesn't exist on the filesystem anymore
                    ap['unavailable_path_status'] = ''
                    ap['process_content'] = False
                    to_save.append(ap)
                if not uninstall_failed and exists(ap['path']):
                    # could be an empty dir in case an already uninstalled subdataset
                    # got removed
                    rmdir(ap['path'])
            else:
                # anything that is not a dataset can simply be passed on
                to_reporemove[ap['path']] = ap
        # avoid unnecessary git calls when there is nothing to do
        if to_reporemove:
            if check and hasattr(ds.repo, 'drop'):
                for r in _drop_files(ds, list(to_reporemove), check=True):
                    if r['status'] == 'error':
                        # if drop errored on that path, we can't remove it
                        to_reporemove.pop(r['path'], 'avoidKeyError')
                    yield r
            if to_reporemove:
                for r in ds.repo.remove(list(to_reporemove), r=True):
                    # these were removed, but we still need to save the
                    # removal
                    r_abs = opj(ds.path, r)
                    if r_abs in to_reporemove:
                        ap = to_reporemove[r_abs]
                    else:
                        ap = {'path': r_abs,
                              'parentds': ds.path,
                              'refds': refds_path}
                    ap['unavailable_path_status'] = ''
                    to_save.append(ap)
                    yield get_status_dict(
                        status='ok',
                        path=r,
                        **res_kwargs)

    if not to_save:
        # nothing left to do, potentially all errored before
        return
    if not save:
        lgr.debug('Not calling `save` as instructed')
        return
    for res in Save.__call__(
            path=[ap["path"] for ap in to_save],
            # we might have removed the reference dataset by now, recheck
            dataset=refds_path
            if (refds_path and GitRepo.is_valid_repo(refds_path))
            else None,
            message=message if message else '[DATALAD] removed content',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=None,
        jobs='auto',
):
    refds_path = Interface.get_refds_path(dataset)
    if not (dataset or path):
        raise InsufficientArgumentsError(
            "Neither dataset nor target path(s) provided")
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path
    # we have to have a single dataset to operate on
    refds = require_dataset(
        dataset, check_installed=True, purpose='get content')

    content_by_ds = {}
    # use subdatasets() to discover any relevant content that is not
    # already present in the root dataset (refds)
    for sdsres in Subdatasets.__call__(
            contains=path,
            # maintain path argument semantics and pass in dataset arg
            # as is
            dataset=dataset,
            # always come from the top to get sensible generator behavior
            bottomup=False,
            # when paths are given, they will constrain the recursion
            # automatically, and we need to enable recursion so we can
            # locate paths in subdatasets several levels down
            recursive=True if path else recursive,
            recursion_limit=None if path else recursion_limit,
            return_type='generator',
            on_failure='ignore'):
        if sdsres.get('type', None) != 'dataset':
            # if it is not about a 'dataset' it is likely content in
            # the root dataset
            if sdsres.get('status', None) == 'impossible' and \
                    sdsres.get('message', None) == \
                    'path not contained in any matching subdataset':
                target_path = Path(sdsres['path'])
                if refds.pathobj != target_path and \
                        refds.pathobj not in target_path.parents:
                    yield dict(
                        action='get',
                        path=str(target_path),
                        status='error',
                        message=('path not associated with dataset %s', refds),
                    )
                    continue
                # check if we need to obtain anything underneath this path
                # the subdataset() call above will only look _until_ it
                # hits the targetpath
                for res in _install_targetpath(
                        refds,
                        Path(sdsres['path']),
                        recursive,
                        recursion_limit,
                        reckless,
                        refds_path,
                        description,
                        jobs=jobs,
                ):
                    # fish out the datasets that 'contains' a targetpath
                    # and store them for later
                    if res.get('status', None) in ('ok', 'notneeded') and \
                            'contains' in res:
                        dsrec = content_by_ds.get(res['path'], set())
                        dsrec.update(res['contains'])
                        content_by_ds[res['path']] = dsrec
                    if res.get('status', None) != 'notneeded':
                        # all those messages on not having installed anything
                        # are a bit pointless
                        # "notneeded" for annex get comes below
                        yield res
            else:
                # dunno what this is, send upstairs
                yield sdsres
            # must continue for both conditional branches above
            # the rest is about stuff in real subdatasets
            continue
        # instance of the closest existing dataset for this result
        ds = Dataset(sdsres['parentds']
                     if sdsres.get('state', None) == 'absent'
                     else sdsres['path'])
        assert 'contains' in sdsres
        # explore the unknown
        for target_path in sdsres.get('contains', []):
            # essentially the same as done above for paths in the root
            # dataset, but here we are starting from the closest
            # discovered subdataset
            for res in _install_targetpath(
                    ds,
                    Path(target_path),
                    recursive,
                    recursion_limit,
                    reckless,
                    refds_path,
                    description,
                    jobs=jobs,
            ):
                known_ds = res['path'] in content_by_ds
                if res.get('status', None) in ('ok', 'notneeded') and \
                        'contains' in res:
                    dsrec = content_by_ds.get(res['path'], set())
                    dsrec.update(res['contains'])
                    content_by_ds[res['path']] = dsrec
                # prevent double-reporting of datasets that have been
                # installed by explorative installation to get to target
                # paths, prior in this loop
                if res.get('status', None) != 'notneeded' or not known_ds:
                    yield res

    if not get_data:
        # done already
        return

    # and now annex-get, this could all be done in parallel now
    for ds, content in content_by_ds.items():
        for res in _get_targetpaths(
                Dataset(ds),
                content,
                refds.path,
                source,
                jobs):
            if res['path'] not in content_by_ds:
                # we had reports on datasets and subdatasets already
                # before the annex stage
                yield res
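# NOTE (editorial sketch, not part of the original module): the `content_by_ds`
# bookkeeping above maps a dataset path to the set of target paths it contains,
# so each dataset is handed to the annex stage only once. A hypothetical,
# self-contained illustration of that accumulation pattern:
def record_contained(content_by_ds, ds_path, contained_paths):
    """Accumulate target paths per containing dataset (dict of sets)."""
    rec = content_by_ds.setdefault(ds_path, set())
    rec.update(contained_paths)
    return content_by_ds

# example usage with made-up paths
_demo = {}
record_contained(_demo, '/tmp/ds', ['/tmp/ds/file1', '/tmp/ds/sub/file2'])
record_contained(_demo, '/tmp/ds', ['/tmp/ds/file1'])  # deduplicated via the set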
def __call__(path=None, dataset=None, recursive=False, check=True,
             if_dirty='save-before'):
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='uninstall', logger=lgr, refds=refds_path)
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path
    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `uninstall`: requires at least a path or dataset")
    to_uninstall = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            action='uninstall',
            # justification for status:
            # content need not be uninstalled where there is none
            unavailable_path_status='notneeded',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        # upfront sanity and compliance checks
        # check that we have no top-level datasets and no files to process
        if ap.get('type') == 'dataset' and \
                not ap.get('state', None) == 'absent' and \
                path_is_under([ap['path']]):  # wants a sequence!
            ap.update(
                status='error',
                message="refusing to uninstall current or parent directory")
            yield ap
            continue
        if not ap.get('type', None) == 'dataset':
            ap.update(
                status='impossible',
                message="can only uninstall datasets (consider the `drop` command)")
            yield ap
            continue
        # we only have datasets from here on
        if not ap.get('parentds', None):
            ap.update(
                status='error',
                message="will not uninstall top-level dataset (consider `remove` command)")
            yield ap
            continue
        if not ap['path'] == refds_path:
            ap['process_content'] = True
        to_uninstall.append(ap)
    # iterate over all datasets, starting at the bottom
    # to deinit contained submodules first
    for ap in sorted(to_uninstall, key=lambda x: x['path'], reverse=True):
        if ap.get('state', None) == 'absent':
            # already gone
            continue
        ds = Dataset(ap['path'])
        # TODO generator
        # this should yield what it did
        handle_dirty_dataset(ds, mode=if_dirty)
        # we confirmed the super dataset presence above
        for r in _uninstall_dataset(ds, check=check, has_super=True,
                                    **res_kwargs):
            yield r
def __call__(path=None, dataset=None,
             add=None, init=None, remove=None, reset=None,
             define_key=None, dataset_global=False,
             recursive=False, recursion_limit=None):
    # bring metadata setter args in shape first
    untag, remove = _parse_argspec(remove)
    purge, reset = _parse_argspec(reset)
    tag_add, add = _parse_argspec(add)
    tag_init, init = _parse_argspec(init)
    define_key = dict(define_key) if define_key else None
    # merge all potential sources of tag specifications
    all_untag = remove.get('tag', []) + untag
    if all_untag:
        remove['tag'] = all_untag
    all_addtag = add.get('tag', []) + tag_add
    if all_addtag:
        add['tag'] = all_addtag
    all_inittag = init.get('tag', []) + tag_init
    if all_inittag:
        init['tag'] = all_inittag

    lgr.debug("Will 'init' metadata items: %s", init)
    lgr.debug("Will 'add' metadata items: %s", add)
    lgr.debug("Will 'remove' metadata items: %s", remove)
    lgr.debug("Will 'reset' metadata items: %s", reset)
    lgr.debug("Will 'purge' metadata items: %s", purge)

    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='metadata', logger=lgr, refds=refds_path)

    to_process = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='metadata',
            unavailable_path_status='error',
            nondataset_path_status='error',
            force_subds_discovery=False,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset':
            if ap.get('state', None) == 'absent':
                # just discovered via recursion, but not relevant here
                continue
            if GitRepo.is_valid_repo(ap['path']):
                ap['process_content'] = True
        to_process.append(ap)

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_process,
            refds_path=refds_path,
            path_only=False)
    assert not completed

    # iterate over all datasets, order doesn't matter
    to_save = []
    for ds_path in content_by_ds:
        # ignore submodule entries
        content = [ap for ap in content_by_ds[ds_path]
                   if ap.get('type', None) != 'dataset' or ap['path'] == ds_path]
        if not content:
            # nothing other than subdatasets were given or discovered in
            # this dataset, ignore
            continue
        ds = Dataset(ds_path)
        if dataset_global or define_key:
            db_path = opj(ds.path, '.datalad', 'metadata', 'dataset.json')
            db = {}
            if exists(db_path):
                db_fp = open(db_path)
                # need to read manually, load() would puke on an empty file
                db_content = db_fp.read()
                # minimize time for collision
                db_fp.close()
                if db_content:
                    db = json.loads(db_content)
            # TODO make manipulation order identical to what git-annex does
            for k, v in init.items() if init else []:
                if k not in db:
                    db[k] = v
            for k in purge:
                if k in db:
                    del db[k]
            for k, v in reset.items():
                db[k] = v
            for k, v in add.items():
                db[k] = sorted(unique(db.get(k, []) + v))
            for k, v in remove.items():
                existing_data = db.get(k, [])
                if isinstance(existing_data, dict):
                    db[k] = {dk: existing_data[dk]
                             for dk in set(existing_data).difference(v)}
                else:
                    db[k] = list(set(existing_data).difference(v))
                # wipe out if empty
                if not db[k]:
                    del db[k]
            added_def = False
            if define_key:
                defs = db.get('definition', {})
                for k, v in define_key.items():
                    if k in defs:
                        if not defs[k] == v:
                            yield get_status_dict(
                                status='error',
                                ds=ds,
                                message=("conflicting definition for key '%s': '%s' != '%s'",
                                         k, v, defs[k]),
                                **res_kwargs)
                            continue
                    else:
                        defs[k] = v
                        added_def = True
                db['definition'] = defs
            # store, if there is anything
            if db:
                if not exists(dirname(db_path)):
                    makedirs(dirname(db_path))
                db_fp = open(db_path, 'w')
                # produce relatively compact, but also diff-friendly format
                json.dump(
                    db,
                    db_fp,
                    indent=0,
                    separators=(',', ':\n'),
                    sort_keys=True)
                # minimize time for collision
                db_fp.close()
                # use add not save to also cover case of a fresh file
                ds.add(db_path, save=False)
                to_save.append(dict(
                    path=db_path,
                    parentds=ds.path,
                    type='file'))
            elif exists(db_path):
                # no metadata left, kill file
                ds.remove(db_path)
                to_save.append(dict(
                    path=ds.path,
                    type='dataset'))
            if added_def or init or add or remove or reset or purge:
                # if anything happened or could have happened
                yield get_status_dict(
                    status='ok',
                    ds=ds,
                    metadata=db,
                    **res_kwargs)
        elif not isinstance(ds.repo, AnnexRepo):
            # report on all explicitly requested paths only
            for ap in [c for c in content if c.get('raw_input', False)]:
                yield dict(
                    ap,
                    status='impossible',
                    message=('non-annex dataset %s has no file metadata support', ds),
                    **res_kwargs)
            continue
        ds_paths = [p['path'] for p in content]
        if not dataset_global:
            if reset or purge or add or init or remove:
                # file metadata manipulation
                mod_paths = []
                for mp in ds.repo.set_metadata(
                        ds_paths,
                        reset=reset,
                        add=add,
                        init=init,
                        remove=remove,
                        purge=purge,
                        # we always go recursive
                        # TODO is that a good thing? But how to otherwise distinguish
                        # this kind of recursive from the one across datasets in
                        # the API?
                        recursive=True):
                    if mp.get('success', False):
                        mod_paths.append(mp['file'])
                    else:
                        yield get_status_dict(
                            status='error',
                            message='setting metadata failed',
                            path=opj(ds.path, mp[0]),
                            type='file',
                            **res_kwargs)
                # query the actually modified paths only
                ds_paths = mod_paths
            # and lastly, query -- even if we set before -- there could
            # be side-effect from multiple set paths on an individual
            # path, hence we need to query to get the final result
            for file, meta in ds.repo.get_metadata(ds_paths):
                r = get_status_dict(
                    status='ok',
                    path=opj(ds.path, file),
                    type='file',
                    metadata=meta,
                    **res_kwargs)
                yield r
    # save potential modifications to dataset global metadata
    if not to_save:
        return
    for res in Save.__call__(
            path=to_save,
            dataset=refds_path,
            message='[DATALAD] dataset metadata update',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
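# NOTE (editorial sketch, not part of the original module): a standalone,
# hypothetical condensation of the dataset-level metadata manipulation order
# used above on the dataset.json dict: init -> purge -> reset -> add -> remove.
# Plain sorted(set(...)) stands in for datalad's unique(); the dict-valued
# 'remove' case handled above is omitted for brevity.
def apply_metadata_ops(db, init=None, purge=None, reset=None, add=None, remove=None):
    for k, v in (init or {}).items():
        db.setdefault(k, v)          # only set keys that are not present yet
    for k in (purge or []):
        db.pop(k, None)              # drop the key entirely
    for k, v in (reset or {}).items():
        db[k] = v                    # overwrite unconditionally
    for k, v in (add or {}).items():
        db[k] = sorted(set(db.get(k, []) + v))
    for k, v in (remove or {}).items():
        left = [x for x in db.get(k, []) if x not in set(v)]
        if left:
            db[k] = left
        else:
            db.pop(k, None)          # wipe out if empty
    return db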
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        check=True,
        if_dirty='save-before'):
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='uninstall', logger=lgr, refds=refds_path)
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path
    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `uninstall`: requires at least a path or dataset")
    to_uninstall = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            action='uninstall',
            # justification for status:
            # content need not be uninstalled where there is none
            unavailable_path_status='notneeded',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        # upfront sanity and compliance checks
        # check that we have no top-level datasets and no files to process
        if ap.get('type') == 'dataset' and \
                not ap.get('state', None) == 'absent' and \
                path_is_under([ap['path']]):  # wants a sequence!
            ap.update(
                status='error',
                message="refusing to uninstall current or parent directory")
            yield ap
            continue
        if not ap.get('type', None) == 'dataset':
            ap.update(
                status='impossible',
                message="can only uninstall datasets (consider the `drop` command)")
            yield ap
            continue
        # we only have datasets from here on
        if not ap.get('parentds', None):
            # this could be a side-effect of the specific call semantics.
            # As stated in #1714, we are not really interested in whether
            # a superdataset was obvious in the call, but only whether there
            # is a superdataset at all. So let's look for one, and only barf
            # when there really isn't
            parentds = Dataset(ap['path']).get_superdataset(
                datalad_only=False,
                topmost=False,
                # unless it is properly registered we have no way of
                # reinstalling it
                registered_only=True)
            if parentds is None:
                ap.update(
                    status='error',
                    message="will not uninstall top-level dataset (consider `remove` command)")
                yield ap
                continue
            ap['parentds'] = parentds.path
        if not ap['path'] == refds_path:
            ap['process_content'] = True
        to_uninstall.append(ap)
    # iterate over all datasets, starting at the bottom
    # to deinit contained submodules first
    for ap in sorted(to_uninstall, key=lambda x: x['path'], reverse=True):
        if ap.get('state', None) == 'absent':
            # already gone
            continue
        ds = Dataset(ap['path'])
        # TODO generator
        # this should yield what it did
        handle_dirty_dataset(ds, mode=if_dirty)
        # we confirmed the super dataset presence above
        for r in _uninstall_dataset(ds, check=check, has_super=True,
                                    **res_kwargs):
            yield r
def __call__( path=None, dataset=None, get_aggregates=False, reporton='all', recursive=False): # prep results refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(action='metadata', logger=lgr) if refds_path: res_kwargs['refds'] = refds_path if get_aggregates: # yield all datasets for which we have aggregated metadata as results # the get actual dataset results, so we can turn them into dataset # instances using generic top-level code if desired ds = require_dataset( refds_path, check_installed=True, purpose='aggregate metadata query') agginfos = load_ds_aggregate_db( ds, version=str(aggregate_layout_version), abspath=True ) if not agginfos: # if there has ever been an aggregation run, this file would # exist, hence there has not been and we need to tell this # to people yield get_status_dict( ds=ds, status='impossible', action='metadata', logger=lgr, message='metadata aggregation has never been performed in this dataset') return parentds = [] for dspath in sorted(agginfos): info = agginfos[dspath] if parentds and not path_is_subpath(dspath, parentds[-1]): parentds.pop() info.update( path=dspath, type='dataset', status='ok', ) if dspath == ds.path: info['layout_version'] = aggregate_layout_version if parentds: info['parentds'] = parentds[-1] yield dict( info, **res_kwargs ) parentds.append(dspath) return if not dataset and not path: # makes no sense to have no dataset, go with "here" # error generation happens during annotation path = op.curdir content_by_ds = OrderedDict() for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, # MIH: we are querying the aggregated metadata anyways, and that # mechanism has its own, faster way to go down the hierarchy #recursive=recursive, #recursion_limit=recursion_limit, action='metadata', # uninstalled subdatasets could be queried via aggregated metadata # -> no 'error' unavailable_path_status='', nondataset_path_status='error', # we need to know when to look into aggregated data force_subds_discovery=True, force_parentds_discovery=True, return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']): ap['process_content'] = True to_query = None if ap.get('state', None) == 'absent' or \ ap.get('type', 'dataset') != 'dataset': # this is a lonely absent dataset/file or content in a present dataset # -> query through parent # there must be a parent, otherwise this would be a non-dataset path # and would have errored during annotation to_query = ap['parentds'] else: to_query = ap['path'] if to_query: pcontent = content_by_ds.get(to_query, []) pcontent.append(ap) content_by_ds[to_query] = pcontent for ds_path in content_by_ds: ds = Dataset(ds_path) query_agg = [ap for ap in content_by_ds[ds_path] # this is an available subdataset, will be processed in another # iteration if ap.get('state', None) == 'absent' or not(ap.get('type', None) == 'dataset' and ap['path'] != ds_path)] if not query_agg: continue # report from aggregated metadata for r in query_aggregated_metadata( reporton, # by default query the reference dataset, only if there is none # try our luck in the dataset that contains the queried path # this is consistent with e.g. 
`get_aggregates` reporting the # situation in the reference dataset only Dataset(refds_path) if refds_path else ds, query_agg, # recursion above could only recurse into datasets # on the filesystem, but there might be any number of # uninstalled datasets underneath the last installed one # for which we might have metadata recursive=recursive, **res_kwargs): yield r return
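# Editorial note (illustrative sketch only): the `content_by_ds` mapping built above is
# essentially a grouping of annotated-path records by the dataset that should be queried for
# them; absent subdatasets and non-dataset paths fall back to their parent dataset. A
# stand-alone version of that grouping, assuming records shaped like the annotated paths used
# in this code (keys 'path', 'parentds', 'state', 'type'):
from collections import OrderedDict

def group_records_by_dataset(records):
    grouped = OrderedDict()
    for rec in records:
        if rec.get('state') == 'absent' or rec.get('type', 'dataset') != 'dataset':
            key = rec['parentds']   # query through the parent dataset
        else:
            key = rec['path']       # query the dataset itself
        grouped.setdefault(key, []).append(rec)
    return grouped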
def __call__( path=None, sibling=None, merge=False, dataset=None, recursive=False, recursion_limit=None, fetch_all=None, reobtain_data=False): """ """ if fetch_all is not None: lgr.warning('update(fetch_all=...) called. Option has no effect, and will be removed') if not dataset and not path: # try to find a dataset in PWD dataset = require_dataset( None, check_installed=True, purpose='updating') refds_path = Interface.get_refds_path(dataset) if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path save_paths = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='update', unavailable_path_status='impossible', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if not ap.get('type', None) == 'dataset': ap.update( status='impossible', message="can only update datasets") yield ap continue # this is definitely as dataset from here on ds = Dataset(ap['path']) if not ds.is_installed(): lgr.debug("Skipping update since not installed %s", ds) continue repo = ds.repo # prepare return value # TODO reuse AP for return props res = get_status_dict('update', ds=ds, logger=lgr, refds=refds_path) # get all remotes which have references (would exclude # special remotes) remotes = repo.get_remotes( **({'exclude_special_remotes': True} if isinstance(repo, AnnexRepo) else {})) if not remotes and not sibling: res['message'] = ("No siblings known to dataset at %s\nSkipping", repo.path) res['status'] = 'notneeded' yield res continue if not sibling and len(remotes) == 1: # there is only one remote, must be this one sibling_ = remotes[0] elif not sibling: # nothing given, look for tracking branch sibling_ = repo.get_tracking_branch()[0] else: sibling_ = sibling if sibling_ and sibling_ not in remotes: res['message'] = ("'%s' not known to dataset %s\nSkipping", sibling_, repo.path) res['status'] = 'impossible' yield res continue if not sibling_ and len(remotes) > 1 and merge: lgr.debug("Found multiple siblings:\n%s" % remotes) res['status'] = 'impossible' res['message'] = "Multiple siblings, please specify from which to update." yield res continue lgr.info("Fetching updates for %s", ds) # fetch remote fetch_kwargs = dict( # test against user-provided value! remote=None if sibling is None else sibling_, all_=sibling is None, # required to not trip over submodules that # were removed in the origin clone recurse_submodules="no", prune=True) # prune to not accumulate a mess over time repo.fetch(**fetch_kwargs) # NOTE if any further acces to `repo` is needed, reevaluate # ds.repo again, as it might have be converted from an GitRepo # to an AnnexRepo if merge: for fr in _update_repo(ds, sibling_, reobtain_data): yield fr res['status'] = 'ok' yield res save_paths.append(ap['path']) if recursive: save_paths = [p for p in save_paths if p != refds_path] if not save_paths: return lgr.debug( 'Subdatasets where updated state may need to be ' 'saved in the parent dataset: %s', save_paths) for r in Dataset(refds_path).save( path=save_paths, recursive=False, message='[DATALAD] Save updated subdatasets'): yield r
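# Editorial note (toy example, not DataLad API): the update logic above reports progress by
# yielding plain dictionaries ("status dicts") with keys such as 'action', 'path', 'status',
# and 'refds'. The producer/consumer pattern looks roughly like this:

def toy_results(paths):
    for p in paths:
        yield dict(action='update', path=p, status='ok', refds=None)

updated = [r['path'] for r in toy_results(['/ds/a', '/ds/b'])
           if r['status'] in ('ok', 'notneeded')]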
def __call__( path=None, source=None, dataset=None, get_data=False, description=None, recursive=False, recursion_limit=None, save=True, reckless=False, # git_opts=None, # git_clone_opts=None, # annex_opts=None, # annex_init_opts=None, jobs="auto"): # normalize path argument to be equal when called from cmdline and # python and nothing was passed into `path` path = assure_list(path) if not source and not path: raise InsufficientArgumentsError( "Please provide at least a source or a path") # Common kwargs to pass to underlying git/install calls. # They might need adjustments (e.g. for recursion_limit, but # otherwise would be applicable throughout # # There should have been more of common options! # since underneath get could do similar installs common_kwargs = dict( get_data=get_data, recursive=recursive, recursion_limit=recursion_limit, # git_opts=git_opts, # annex_opts=annex_opts, reckless=reckless, jobs=jobs, ) #installed_items = [] #failed_items = [] # did we explicitly get a dataset to install into? # if we got a dataset, path will be resolved against it. # Otherwise path will be resolved first. ds = None if dataset is not None: ds = require_dataset(dataset, check_installed=True, purpose='installation') common_kwargs['dataset'] = dataset # switch into the two scenarios without --source: # 1. list of URLs # 2. list of (sub)dataset content if source is None: # we need to collect URLs and paths to_install = [] to_get = [] # TODO: this approach is problematic, it disrupts the order of input args. # consequently results will be returned in an unexpected order when a # mixture of source URL and paths is given. Reordering is only possible when # everything in here is fully processed before any results can be yielded. # moreover, I think the semantics of the status quo implementation are a # bit complicated: in a mixture list a source URL will lead to a new dataset # at a generated default location, but a path will lead to a subdataset # at that exact location for urlpath in path: ri = RI(urlpath) (to_get if isinstance(ri, PathRI) else to_install).append(urlpath) # 1. multiple source URLs for s in to_install: lgr.debug("Install passes into install source=%s", s) for r in Install.__call__( source=s, description=description, save=save, # git_clone_opts=git_clone_opts, # annex_init_opts=annex_init_opts, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, result_filter=None, **common_kwargs): # no post-processing of the installed content on disk # should be necessary here, all done by code further # down that deals with an install from an actuall `source` # any necessary fixes should go there too! # TODO generator: possibly adjust refds yield r # 2. one or more dataset content paths if to_get: lgr.debug("Install passes into get %d items", len(to_get)) # all commented out hint on inability to pass those options # into underlying install-related calls. 
# Also need to pass from get: # annex_get_opts for r in Get.__call__( to_get, # TODO should pass-through description, not sure why disabled # description=description, # save=save, # git_clone_opts=git_clone_opts, # annex_init_opts=annex_init_opts, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, result_filter=None, **common_kwargs): # no post-processing of get'ed content on disk should be # necessary here, this is the responsibility of `get` # (incl. adjusting parent's gitmodules when submodules end # up in an "updated" state (done in get helpers) # any required fixes should go there! yield r # we are done here # the rest is about install from a `source` return # an actual `source` was given if source and path and len(path) > 1: # exception is ok here, if this fails it is either direct user error # or we f****d up one of our internal calls raise ValueError( "install needs a single PATH when source is provided. " "Was given mutliple PATHs: %s" % str(path)) # parameter constraints: if not source: # exception is ok here, if this fails it is either direct user error # or we f****d up one of our internal calls raise InsufficientArgumentsError( "a `source` is required for installation") # code below deals with a single path only path = path[0] if path else None # pre-compute for results below refds_path = Interface.get_refds_path(ds) if source == path: # even if they turn out to be identical after resolving symlinks # and more sophisticated witchcraft, it would still happily say # "it appears to be already installed", so we just catch an # obviously pointless input combination yield get_status_dict( 'install', path=path, status='impossible', logger=lgr, source_url=source, refds=refds_path, message="installation `source` and destination `path` are identical. " "If you are trying to add a subdataset simply use the `add` command") return # resolve the target location (if local) against the provided dataset # or CWD: if path is not None: # MIH everything in here is highly similar to what common # interface helpers do (or should/could do), but at the same # is very much tailored to just apply to `install` -- I guess # it has to stay special # Should work out just fine for regular paths, so no additional # conditioning is necessary try: path_ri = RI(path) except Exception as e: raise ValueError( "invalid path argument {}: ({})".format(path, exc_str(e))) try: # Wouldn't work for SSHRI ATM, see TODO within SSHRI # yoh: path should be a local path, and mapping note within # SSHRI about mapping localhost:path to path is kinda # a peculiar use-case IMHO path = resolve_path(path_ri.localpath, dataset) # any `path` argument that point to something local now # resolved and is no longer a URL except ValueError: # `path` is neither a valid source nor a local path. # TODO: The only thing left is a known subdataset with a # name, that is not a path; Once we correctly distinguish # between path and name of a submodule, we need to consider # this. # For now: Just raise raise ValueError("Invalid path argument {0}".format(path)) # `path` resolved, if there was any. 
# clone dataset, will also take care of adding to superdataset, if one # is given res = Clone.__call__( source, path, dataset=ds, description=description, reckless=reckless, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" result_xfm=None, return_type='generator', result_filter=None, on_failure='ignore') # helper as_ds = YieldDatasets() destination_dataset = None for r in res: if r['action'] == 'install' and r['type'] == 'dataset': # make sure logic below is valid, only one dataset result is # coming back assert(destination_dataset is None) destination_dataset = as_ds(r) yield r assert(destination_dataset) # Now, recursive calls: if recursive or get_data: # dataset argument must not be passed inside since we use bound .get # It is ok to do "inplace" as long as we still return right # after the loop ends common_kwargs.pop('dataset', '') for r in destination_dataset.get( curdir, description=description, # TODO expose this # yoh: exactly! #annex_get_opts=annex_get_opts, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, **common_kwargs): yield r # at this point no futher post-processing should be necessary, # `clone` and `get` must have done that (incl. parent handling) # if not, bugs should be fixed in those commands return
def __call__( path=None, dataset=None, recursive=False, check=True, save=True, message=None, if_dirty='save-before'): res_kwargs = dict(action='remove', logger=lgr) if not dataset and not path: raise InsufficientArgumentsError( "insufficient information for `remove`: requires at least a path or dataset") refds_path = Interface.get_refds_path(dataset) res_kwargs['refds'] = refds_path if refds_path and not path and not GitRepo.is_valid_repo(refds_path): # nothing here, nothing to remove yield get_status_dict(path=refds_path, status='notneeded', **res_kwargs) return if refds_path and not path: # act on the whole dataset if nothing else was specified # TODO i think that would happen automatically in annotation? path = refds_path to_process = [] for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, # we only ever want to discover immediate subdatasets, the rest # will happen in `uninstall` recursion_limit=1, action='remove', unavailable_path_status='', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('state', None) == 'absent' and \ ap.get('parentds', None) is None: # nothing exists at location, and there is no parent to # remove from ap['status'] = 'notneeded' ap['message'] = "path does not exist and is not in a dataset" yield ap continue if ap.get('raw_input', False) and ap.get('type', None) == 'dataset': # make sure dataset sorting yields a dedicted entry for this one ap['process_content'] = True to_process.append(ap) if not to_process: # nothing left to do, potentially all errored before return if path_is_under([ap['path'] for ap in to_process]): # behave like `rm` and refuse to remove where we are raise ValueError( "refusing to uninstall current or parent directory") # now sort into datasets so we can process them one by one content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert(not completed) # iterate over all datasets, starting at the bottom # to make the removal of dataset content known upstairs to_save = [] # track which submodules we have removed in the process, to avoid # failure in case we revisit them due to a subsequent path argument subm_removed = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) paths = content_by_ds[ds_path] to_reporemove = dict() # PLAN any dataset that was not raw_input, uninstall (passing recursive flag) # if dataset itself is in paths, skip any nondataset # sort reverse so we get subdatasets first for ap in sorted(paths, key=lambda x: x['path'], reverse=True): if ap.get('type', None) == 'dataset': # entire dataset needs to go, uninstall if present, pass recursive! 
uninstall_failed = False if ap['path'] == refds_path or \ (refds_path is None and ap.get('raw_input', False)): # top-level handling, cannot use regular uninstall call, as # it will refuse to uninstall a top-level dataset # and rightfully so, it is really a remove in that case # bypass all the safety by using low-level helper for r in _uninstall_dataset(ds, check=check, has_super=False, **res_kwargs): if r['status'] in ('impossible', 'error'): # we need to inspect if something went wrong, in order # to prevent failure from removing a non-empty dir below, # but at the same time allow for continued processing uninstall_failed = True r['refds'] = refds_path yield r # recheck that it wasn't removed during a previous iteration elif ap.get('state', None) != 'absent' and GitRepo.is_valid_repo(ap['path']): # anything that is not the top-level -> regular uninstall # this is for subdatasets of the to-be-removed dataset # we want to simply uninstall them in a regular manner for r in Uninstall.__call__( # use annotate path as input, but pass a copy because # we cannot rely on it being unaltered by reannotation # TODO maybe adjust annotate_path to do that [ap.copy()], dataset=refds_path, recursive=recursive, check=check, if_dirty=if_dirty, result_xfm=None, result_filter=None, on_failure='ignore'): if r['status'] in ('impossible', 'error'): # we need to inspect if something went wrong, in order # to prevent failure from removing a non-empty dir below, # but at the same time allow for continued processing uninstall_failed = True yield r if not ap.get('raw_input', False): # we only ever want to actually unregister subdatasets that # were given explicitly continue if not uninstall_failed and \ not ap['path'] in subm_removed and \ refds_path and \ ap.get('parentds', None) and \ not (relpath(ap['path'], start=refds_path).startswith(pardir) or ap['path'] == refds_path) and \ ap.get('registered_subds', False): # strip from superdataset, but only if a dataset was given explcitly # as in "remove from this dataset", but not when just a path was given # as in "remove from the filesystem" subds_relpath = relpath(ap['path'], start=ap['parentds']) # remove submodule reference parentds = Dataset(ap['parentds']) # play safe, will fail on dirty parentds.repo.deinit_submodule(ap['path']) # remove now empty submodule link parentds.repo.remove(ap['path']) # make a record that we removed this already, should it be # revisited via another path argument, because do not reannotate # the paths after every removal subm_removed.append(ap['path']) yield dict(ap, status='ok', **res_kwargs) # need .gitmodules update in parent to_save.append(dict( path=opj(parentds.path, '.gitmodules'), parents=parentds.path, type='file')) # and the removal itself needs to be committed # inform `save` that it is OK that this path # doesn't exist on the filesystem anymore ap['unavailable_path_status'] = '' ap['process_content'] = False to_save.append(ap) if not uninstall_failed and exists(ap['path']): # could be an empty dir in case an already uninstalled subdataset # got removed rmdir(ap['path']) else: # anything that is not a dataset can simply be passed on to_reporemove[ap['path']] = ap # avoid unnecessary git calls when there is nothing to do if to_reporemove: if check and hasattr(ds.repo, 'drop'): for r in _drop_files(ds, list(to_reporemove), check=True): if r['status'] == 'error': # if drop errored on that path, we can't remove it to_reporemove.pop(r['path'], 'avoidKeyError') yield r if to_reporemove: for r in ds.repo.remove(list(to_reporemove), 
r=True): # these were removed, but we still need to save the # removal r_abs = opj(ds.path, r) if r_abs in to_reporemove: ap = to_reporemove[r_abs] else: ap = {'path': r_abs, 'parentds': ds.path, 'refds': refds_path } ap['unavailable_path_status'] = '' to_save.append(ap) yield get_status_dict( status='ok', path=r, **res_kwargs) if not to_save: # nothing left to do, potentially all errored before return if not save: lgr.debug('Not calling `save` as instructed') return for res in Save.__call__( # TODO compose hand-selected annotated paths path=to_save, # we might have removed the reference dataset by now, recheck dataset=refds_path if (refds_path and GitRepo.is_valid_repo(refds_path)) else None, message=message if message else '[DATALAD] removed content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
def __call__( path=None, sibling=None, merge=False, dataset=None, recursive=False, recursion_limit=None, fetch_all=None, reobtain_data=False): """ """ if fetch_all is not None: lgr.warning('update(fetch_all=...) called. Option has no effect, and will be removed') if not dataset and not path: # try to find a dataset in PWD dataset = require_dataset( None, check_installed=True, purpose='updating') refds_path = Interface.get_refds_path(dataset) if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path save_paths = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='update', unavailable_path_status='impossible', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if not ap.get('type', None) == 'dataset': ap.update( status='impossible', message="can only update datasets") yield ap continue # this is definitely as dataset from here on ds = Dataset(ap['path']) if not ds.is_installed(): lgr.debug("Skipping update since not installed %s", ds) continue repo = ds.repo # prepare return value # TODO reuse AP for return props res = get_status_dict('update', ds=ds, logger=lgr, refds=refds_path) # get all remotes which have references (would exclude # special remotes) remotes = repo.get_remotes( **({'exclude_special_remotes': True} if isinstance(repo, AnnexRepo) else {})) if not remotes and not sibling: res['message'] = ("No siblings known to dataset at %s\nSkipping", repo.path) res['status'] = 'notneeded' yield res continue if not sibling and len(remotes) == 1: # there is only one remote, must be this one sibling_ = remotes[0] elif not sibling: # nothing given, look for tracking branch sibling_ = repo.get_tracking_branch()[0] else: sibling_ = sibling if sibling_ and sibling_ not in remotes: res['message'] = ("'%s' not known to dataset %s\nSkipping", sibling_, repo.path) res['status'] = 'impossible' yield res continue if not sibling_ and len(remotes) > 1 and merge: lgr.debug("Found multiple siblings:\n%s" % remotes) res['status'] = 'impossible' res['message'] = "Multiple siblings, please specify from which to update." yield res continue lgr.info("Fetching updates for %s", ds) # fetch remote fetch_kwargs = dict( # test against user-provided value! remote=None if sibling is None else sibling_, all_=sibling is None, # required to not trip over submodules that # were removed in the origin clone recurse_submodules="no", prune=True) # prune to not accumulate a mess over time repo.fetch(**fetch_kwargs) # NOTE if any further acces to `repo` is needed, reevaluate # ds.repo again, as it might have be converted from an GitRepo # to an AnnexRepo if merge: for fr in _update_repo(ds, sibling_, reobtain_data): yield fr res['status'] = 'ok' yield res save_paths.append(ap['path']) if recursive: save_paths = [p for p in save_paths if p != refds_path] if not save_paths: return lgr.debug( 'Subdatasets where updated state may need to be ' 'saved in the parent dataset: %s', save_paths) for r in Dataset(refds_path).add( path=save_paths, recursive=False, message='[DATALAD] Save updated subdatasets'): yield r
def __call__(path=None, dataset=None, recursive=False, recursion_limit=None, check=True, if_dirty='save-before'): if not dataset and not path: raise InsufficientArgumentsError( "insufficient information for `drop`: requires at least a path or dataset" ) refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(action='drop', logger=lgr, refds=refds_path) # this try-except dance is only to maintain a previous behavior of `drop` # where it did not raise a ValueError, but yielded an error status try: ds = require_dataset(dataset, check_installed=True, purpose='dropping content') except ValueError as e: yield dict( status='error', message=str(e), **res_kwargs, ) return if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path content_by_ds = {} for st in Status.__call__( # do not use `ds` to preserve path semantics dataset=dataset, path=path, annex=None, untracked='no', recursive=recursive, recursion_limit=recursion_limit, eval_subdataset_state='no', report_filetype='raw', return_type='generator', result_renderer=None, # yield errors and let caller decide on_failure='ignore'): if st['status'] == 'error': # Downstream code can't do anything with these. Let the caller # decide their fate. yield st continue # ignore submodule entries if st.get('type') == 'dataset': if not Dataset(st['path']).is_installed(): continue parentds = st['path'] else: parentds = st['parentds'] cbd = content_by_ds.get(parentds, []) cbd.append(st['path']) content_by_ds[parentds] = cbd # iterate over all datasets, order doesn't matter for ds_path in content_by_ds: ds = Dataset(ds_path) # TODO generator # this should yield what it did handle_dirty_dataset(ds, mode=if_dirty) for r in _drop_files(ds, content_by_ds[ds_path], check=check, **res_kwargs): yield r
def __call__( path=None, dataset=None, to_git=False, save=True, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path") # never recursion, need to handle manually below to be able to # discover untracked content content_by_ds, unavailable_paths = Interface._prep( path=path, dataset=dataset, recursive=False) if unavailable_paths: lgr.warning("ignoring non-existent path(s): %s", unavailable_paths) if recursive: # with --recursive for each input path traverse the directory # tree, when we find a dataset, add it to the spec, AND add it as # a path to the spec of the parent # MIH: wrap in list() to avoid exception, because dict size might # change, but we want to loop over all that are in at the start # only for d in list(content_by_ds.keys()): for p in content_by_ds[d]: _discover_subdatasets_recursively( p, [d], content_by_ds, recursion_limit) if not content_by_ds: raise InsufficientArgumentsError( "no existing content given to add") if dataset: # remeber the datasets associated with actual inputs input_ds = list(content_by_ds.keys()) # forge chain from base dataset to any leaf dataset _discover_trace_to_known(dataset.path, [], content_by_ds) if ds2super: # now check all dataset entries corresponding to the original # input to see if they contain their own paths and remove them for inpds in input_ds: content_by_ds[inpds] = [p for p in content_by_ds[inpds] if not p == inpds] # and lastly remove all entries that contain no path to avoid # saving any staged content in the final step content_by_ds = {d: v for d, v in content_by_ds.items() if v} results = [] # simple loop over datasets -- save happens later # start deep down for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) toadd = list(set(content_by_ds[ds_path])) # handle anything that looks like a wannabe subdataset for subds_path in [d for d in toadd if GitRepo.is_valid_repo(d) and d != ds_path and d not in ds.get_subdatasets( recursive=False, absolute=True, fulfilled=True)]: # TODO add check that the subds has a commit, and refuse # to operate on it otherwise, or we would get a bastard # submodule that cripples git operations _install_subds_inplace( ds=ds, path=subds_path, relativepath=relpath(subds_path, ds_path)) # make sure that .gitmodules is added to the list of files toadd.append(opj(ds.path, '.gitmodules')) # report added subdatasets -- add below won't do it results.append({ 'success': True, 'file': Dataset(subds_path)}) # make sure any last minute additions make it to the saving stage content_by_ds[ds_path] = toadd added = ds.repo.add( toadd, git=to_git if isinstance(ds.repo, AnnexRepo) else True, commit=False) for a in added: a['file'] = opj(ds_path, a['file']) results.extend(added) if results and save: save_dataset_hierarchy( content_by_ds, base=dataset.path if dataset and dataset.is_installed() else None, message='[DATALAD] added content') return results
def __call__( path=None, dataset=None, to=None, since=None, missing='fail', force=False, transfer_data='auto', recursive=False, recursion_limit=None, git_opts=None, annex_opts=None, annex_copy_opts=None, jobs=None ): # if ever we get a mode, for "with-data" we would need this #if dataset and not path: # # act on the whole dataset if nothing else was specified # path = dataset.path if isinstance(dataset, Dataset) else dataset if not dataset and not path: # try to find a dataset in PWD dataset = require_dataset( None, check_installed=True, purpose='publishing') if since and not dataset: raise InsufficientArgumentsError( 'Modification detection (--since) without a base dataset ' 'is not supported') if dataset and since == '': # only update since last update so we figure out what was the last update active_branch = dataset.repo.get_active_branch() if to: # XXX here we assume one to one mapping of names from local branches # to the remote since = '%s/%s' % (to, active_branch) else: # take tracking remote for the active branch tracked_remote, tracked_refspec = dataset.repo.get_tracking_branch() if tracked_remote: if tracked_refspec.startswith('refs/heads/'): tracked_refspec = tracked_refspec[len('refs/heads/'):] #to = tracked_remote since = '%s/%s' % (tracked_remote, tracked_refspec) else: lgr.info( "No tracked remote for %s. since option is of no effect", active_branch ) since = None # here is the plan # 1. figure out remote to publish to # 2. figure out which content needs to be published to this remote # 3. look for any pre-publication dependencies of that remote # (i.e. remotes that need to be published to before) # 4. publish the content needed to go to the primary remote to # the dependencies first, and to the primary afterwards ds_remote_info = {} refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(refds=refds_path, logger=lgr, action='publish') to_process = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='publish', unavailable_path_status='impossible', nondataset_path_status='error', modified=since, return_type='generator', on_failure='ignore', force_no_revision_change_discovery=False, # we cannot publish what was not committed force_untracked_discovery=False # we cannot publish untracked ): if ap.get('status', None): # this is done yield ap continue remote_info_result = None if ap.get('type', ap.get('type_src', 'dataset')) != 'dataset': # for everything that is not a dataset get the remote info # for the parent parentds = ap.get('parentds', None) if parentds and parentds not in ds_remote_info: remote_info_result = _get_remote_info( parentds, ds_remote_info, to, missing) else: # this is a dataset if ap.get('state', None) == 'absent': continue # get the remote info for itself remote_info_result = _get_remote_info( ap['path'], ds_remote_info, to, missing) ap['process_content'] = True if remote_info_result is not None: ap['status'] = remote_info_result[0] ap['message'] = remote_info_result[1] yield ap continue to_process.append(ap) content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert(not completed) lgr.debug( "Evaluating %i dataset publication candidate(s)", len(content_by_ds)) # TODO: fancier sorting, so we still follow somewhat the hierarchy # in sorted order, e.g. 
# d1/sub1/sub1 # d1/sub1 # d1 # d2/sub1 # d2 content_by_ds = OrderedDict( (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True) ) lgr.debug("Attempt to publish %i datasets", len(content_by_ds)) for ds_path in content_by_ds: remote_info = ds_remote_info.get(ds_path, None) if remote_info is None: # maybe this dataset wasn't annotated above, try to get info # MIH: I think this entire if-branch is practically impossible # to reach. It is certainly untested, but I think this is due # to mutually exclusive conditions during remote_info detection remote_info_result = _get_remote_info( ds_path, ds_remote_info, to, missing) if remote_info_result is not None: yield get_status_dict( type='dataset', path=ds_path, status=remote_info_result[0], message=remote_info_result[1], **res_kwargs) continue # continue with freshly obtained info remote_info = ds_remote_info[ds_path] # condition above must catch all other cases assert remote_info # and publish ds = Dataset(ds_path) for r in _publish_dataset( ds, remote=remote_info['remote'], refspec=remote_info.get('refspec', None), # only send paths that were explicitly requested paths=[p for p in content_by_ds[ds_path] # do not feed (sub)dataset paths into the beast # makes no sense to try to annex copy them # for the base dataset itself let `transfer_data` # decide if p.get('type', None) != 'dataset'], annex_copy_options=annex_copy_opts, force=force, jobs=jobs, transfer_data=transfer_data, **res_kwargs): yield r
def __call__( path=None, source=None, dataset=None, recursive=False, recursion_limit=None, get_data=True, reckless=False, git_opts=None, annex_opts=None, annex_get_opts=None, jobs=None, verbose=False, # internal -- instead of returning 'get'ed items, return final # content_by_ds, unavailable_paths. To be used by the call from # Install.__call__ and done so to avoid creating another reusable # function which would need to duplicate all this heavy list of # kwargs _return_datasets=False ): # IMPLEMENTATION CONCEPT: # # 1. Sort the world into existing handles and the rest # 2. Try locate missing handles (obtain subdatasets along the way) # 3. Expand into subdatasets with recursion enables (potentially # obtain even more subdatasets # 4. Shoot info of which handles to get in each subdataset to, # git-annex, once at the very end dataset_path = dataset.path if isinstance(dataset, Dataset) else dataset if not (dataset or path): raise InsufficientArgumentsError( "Neither dataset nor target path(s) provided") if dataset and not path: # act on the whole dataset if nothing else was specified path = dataset_path # use lookup cache -- we need that info further down dir_lookup = {} content_by_ds, unavailable_paths = Interface._prep( path=path, dataset=dataset, recursive=recursive, recursion_limit=recursion_limit, dir_lookup=dir_lookup) # explore the unknown for path in sorted(unavailable_paths): # how close can we get? dspath = get_dataset_root(path) if dspath is None: # nothing we can do for this path continue ds = Dataset(dspath) # must always yield a dataset -- we sorted out the ones outside # any dataset at the very top assert ds.is_installed() # now actually obtain whatever is necessary to get to this path containing_ds = install_necessary_subdatasets(ds, path, reckless) if containing_ds.path != ds.path: lgr.debug("Installed %s to fulfill request for content for " "path %s", containing_ds, path) # mark resulting dataset as auto-installed if containing_ds.path == path: # we had to get the entire dataset, not something within # mark that it just appeared content_by_ds[path] = [curdir] else: # we need to get content within content_by_ds[path] = [path] if recursive and not recursion_limit == 'existing': # obtain any subdatasets underneath the paths given inside the # subdatasets that we know already exist # unless we do not want recursion into not-yet-installed datasets for subdspath in sorted(content_by_ds.keys()): for content_path in content_by_ds[subdspath]: if not isdir(content_path): # a non-directory cannot have content underneath continue subds = Dataset(subdspath) lgr.info( "Obtaining %s %s recursively", subds, ("underneath %s" % content_path if subds.path != content_path else "")) cbysubds = _recursive_install_subds_underneath( subds, # `content_path` was explicitly given as input # we count recursions from the input, hence we # can start with the full number recursion_limit, reckless, # protect against magic marker misinterpretation # only relevant for _get, hence replace here start=content_path if content_path != curdir else None) # gets file content for all freshly installed subdatasets content_by_ds.update(cbysubds) ## we have now done everything we could to obtain whatever subdataset ## to get something on the file system for previously unavailable paths ## check and sort one last content_by_ds, unavailable_paths, nondataset_paths = \ get_paths_by_dataset( unavailable_paths, recursive=recursive, recursion_limit=recursion_limit, out=content_by_ds, dir_lookup=dir_lookup) if 
nondataset_paths: # XXX likely can never get here lgr.warning( "ignored paths that do not belong to any dataset: %s", nondataset_paths) if unavailable_paths: lgr.warning('ignored non-existent paths: %s', unavailable_paths) # hand over to git-annex results = list(chain.from_iterable( _get(content_by_ds, refpath=dataset_path, source=source, jobs=jobs, get_data=get_data))) # ??? should we, in the _return_datasets case, just return both content_by_ds # and unavailable_paths, so that we provide output consistent across runs, # and then raise a similar IncompleteResultsError outside? if unavailable_paths: # and likely other error flags if _return_datasets: results = sorted(set(content_by_ds).difference(unavailable_paths)) raise IncompleteResultsError(results, failed=unavailable_paths) else: return sorted(content_by_ds) if _return_datasets else results
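# Editorial note (approximation with a stand-in exception class; the real code raises
# IncompleteResultsError from DataLad's support code): the error handling above follows a
# "collect everything, then fail once at the end" pattern:

class IncompleteResults(Exception):
    def __init__(self, results, failed):
        super(IncompleteResults, self).__init__(
            "%d path(s) could not be obtained" % len(failed))
        self.results = results
        self.failed = failed

def finalize(results, unavailable_paths):
    if unavailable_paths:
        raise IncompleteResults(results, failed=unavailable_paths)
    return results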
def __call__( path=None, dataset=None, recursive=False, recursion_limit=None, action=None, unavailable_path_status='', unavailable_path_msg=None, nondataset_path_status='error', force_parentds_discovery=True, force_subds_discovery=True, force_no_revision_change_discovery=True, force_untracked_discovery=True, modified=None): # upfront check for the fastest possible response if not path and dataset is None: # nothing given, try "here", but do not use `require_dataset`, as # it will determine the root dataset of `curdir` and further down # lead to path annotation of upstairs directories dataset = curdir if force_subds_discovery and not force_parentds_discovery: raise ValueError( 'subdataset discovery requires parent dataset discovery') # CONCEPT: yield with no status to indicate further processing # everything in one big loop to be able too yield as fast a possible # without any precomputing for all paths refds_path = Interface.get_refds_path(dataset) if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)): raise ValueError( "modification detection only works with a base dataset (non-given or found)") # prep common result props res_kwargs = dict( action=action if action else 'annotate_path', refds=refds_path, logger=lgr) # handle the case of recursion into a single dataset without any # extra fancy processing first -- full recursion can be done # faster than manual recursion, hence we gain quite some speed # from these few lines of extra code if not modified and not path and refds_path: if not GitRepo.is_valid_repo(refds_path): yield get_status_dict( # doesn't matter if the path is in another dataset # it was given as reference dataset status=nondataset_path_status, message='given reference dataset is not a dataset', path=refds_path, **res_kwargs) return refds = Dataset(refds_path) path = [] # yield the dataset itself r = get_status_dict(ds=refds, status='', **res_kwargs) yield r if recursive: # if we have nothing given, but need recursion, we need to feed # the dataset path itself for r in yield_recursive( refds, refds_path, action, recursion_limit): r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] yield r return # goal: structure in a way that makes most information on any path # available in a single pass, at the cheapest possible cost reported_paths = {} requested_paths = assure_list(path) if modified is not None: # modification detection would silently kill all nondataset paths # but we have to complain about them, hence doing it here if requested_paths and refds_path: for r in requested_paths: p = r['path'] if isinstance(r, dict) else r p = resolve_path(p, ds=refds_path) if path_startswith(p, refds_path): # all good continue # not the refds path_props = r if isinstance(r, dict) else {} res = get_status_dict( **dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with reference dataset' reported_paths[r] = res yield res # preserve non-existing paths to be silently killed by modification # detection and append them to requested_paths again after detection. # TODO: This might be melted in with treatment of non dataset paths # above. Re-appending those paths seems to be better than yielding # directly to avoid code duplication, since both cases later on are # dealt with again. 
preserved_paths = [] if requested_paths: [preserved_paths.append(r) for r in requested_paths if not lexists(r['path'] if isinstance(r, dict) else r)] # replace the requested paths by those paths that were actually # modified underneath or at a requested location requested_paths = get_modified_subpaths( # either the request, or the base dataset, if there was no request requested_paths if requested_paths else [refds_path], refds=Dataset(refds_path), revision=modified, report_no_revision_change=force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit) from itertools import chain # re-append the preserved paths: requested_paths = chain(requested_paths, iter(preserved_paths)) # do not loop over unique(), this could be a list of dicts # we avoid duplicates manually below via `reported_paths` for path in requested_paths: if not isinstance(path, dict): path = rawpath2ap(path, refds_path) # this is now an annotated path! path_props = path path = path['path'] # we need to mark our territory, who knows where this has been path_props.update(res_kwargs) if path in reported_paths: # we already recorded this path in the output # this can happen, whenever `path` is a subdataset, that was # discovered via recursive processing of another path before continue # the path exists in some shape or form # TODO if we have path_props already we could skip this test if isdir(path): # keep any existing type info, previously a more expensive run # could have discovered an uninstalled 'dataset', and we don't # want it to be relabeled to a directory path_props['type'] = \ path_props.get( 'type', 'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory') # this could contain all types of additional content containing_dir = path if not islink(path) else normpath(opj(path, pardir)) else: if lexists(path): path_props['type'] = 'file' else: path_props['state'] = 'absent' # for everything else we are interested in the container containing_dir = dirname(path) if not containing_dir: containing_dir = curdir dspath = parent = get_dataset_root(containing_dir) if dspath: if path_props.get('type', None) == 'dataset': # for a dataset the root is not the parent, for anything else # it is parent = path_props.get('parentds', None) oneupdir = normpath(opj(containing_dir, pardir)) if parent is None and (force_parentds_discovery or ( refds_path and _with_sep(oneupdir).startswith( _with_sep(refds_path)))): # either forced, or only if we have a reference dataset, and # only if we stay within this refds when searching for the # parent parent = get_dataset_root(normpath(opj(containing_dir, pardir))) # NOTE the `and refds_path` is critical, as it will determine # whether a top-level dataset that was discovered gets the # parent property or not, it won't get it without a common # base dataset, and that is how we always rolled if parent and refds_path: path_props['parentds'] = parent # don't check whether this is actually a true subdataset of the # parent, done further down else: # set parent, but prefer existing property path_props['parentds'] = path_props.get('parentds', dspath) # test for `dspath` not `parent`, we only need to know whether there is # ANY dataset, not which one is the true parent, logic below relies on # the fact that we end here, if there is no dataset at all if not dspath: # not in any dataset res = get_status_dict( **dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with 
any dataset' reported_paths[path] = res yield res continue # check that we only got SUBdatasets if refds_path and not path_startswith(dspath, refds_path): res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = \ ('path not part of the reference dataset at %s', refds_path) reported_paths[path] = res yield res continue if path_props.get('type', None) == 'file': # nothing else we can learn about this res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res continue containing_ds = None path_type = path_props.get('type', None) if parent and force_subds_discovery and ( (path_type == 'dataset' and 'registered_subds' not in path_props) or path_type == 'directory' or not lexists(path)): # if the path doesn't exist, or is labeled a directory, or a dataset even # a dataset (without this info) -> record whether this is a known subdataset # to its parent containing_ds = Dataset(parent) subdss = containing_ds.subdatasets( fulfilled=None, recursive=False, result_xfm=None, result_filter=None, return_type='list') if path in [s['path'] for s in subdss]: if path_type == 'directory' or not lexists(path): # first record that it isn't here, if just a dir or not here at all path_props['state'] = 'absent' # this must be a directory, and it is not installed path_props['type'] = 'dataset' path_props['registered_subds'] = True if not lexists(path) or \ (path_props.get('type', None) == 'dataset' and path_props.get('state', None) == 'absent'): # not there (yet) message = unavailable_path_msg if unavailable_path_msg else None if message and '%s' in message: message = (message, path) path_props['message'] = message res = get_status_dict(**dict(res_kwargs, **path_props)) # assign given status, but only if the props don't indicate a status # already res['status'] = path_props.get( 'status', unavailable_path_status) reported_paths[path] = res yield res continue # we know everything we can, report res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res rec_paths = [] if recursive: # here we need to consider the special case that `path` is # a dataset itself, if a recursion_limit is given (e.g. # `remove` will do that by default), we need to recurse # from the dataset itself, and not its parent to get things # right -- this will also avoid needless discovery of # unrelated subdatasets if path_props.get('type', None) == 'dataset': containing_ds = Dataset(path) else: # regular parent, we might have a dataset already containing_ds = Dataset(parent) if containing_ds is None else containing_ds for r in yield_recursive(containing_ds, path, action, recursion_limit): # capture reported paths r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] reported_paths[r['path']] = r if modified is not None: # we cannot yield right away, maybe it wasn't modified rec_paths.append(r) else: yield r if modified is not None and rec_paths: # replace the recursively discovered paths by those paths that # were actually modified underneath or at a requested location for r in get_modified_subpaths( rec_paths, refds=Dataset(refds_path), revision=modified, report_no_revision_change=force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit): res = get_status_dict(**dict(r, **res_kwargs)) reported_paths[res['path']] = res yield res return
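# Editorial note (simplified helper, not the original implementation): several checks above
# (path_startswith, path_is_under) boil down to "is this path the same as, or contained in,
# that directory". A portable standard-library version:
import os.path as op

def is_under(path, prefix):
    """True if `path` equals `prefix` or lies somewhere below it."""
    path, prefix = op.normpath(path), op.normpath(prefix)
    return path == prefix or path.startswith(prefix + op.sep)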
def __call__( path=None, dataset=None, recursive=False, check=True, if_dirty='save-before'): if dataset: dataset = require_dataset( dataset, check_installed=False, purpose='removal') if not dataset.is_installed() and not path: # all done already return [] if not path: # act on the whole dataset if nothing else was specified path = dataset.path if isinstance(dataset, Dataset) else dataset content_by_ds, unavailable_paths = Interface._prep( path=path, dataset=dataset, recursive=recursive) nonexistent_paths = [] for p in unavailable_paths: # we need to check whether any of these correspond # to a known subdataset, and add those to the list of # things to be removed toppath = get_dataset_root(p) if not toppath: nonexistent_paths.append(p) continue if p in Dataset(toppath).get_subdatasets( recursive=False, absolute=True): # this is a known subdataset that needs to be removed pl = content_by_ds.get(p, []) pl.append(p) content_by_ds[p] = pl if nonexistent_paths: lgr.warning("ignoring non-existent path(s): %s", nonexistent_paths) if path_is_under(content_by_ds): # behave like `rm` and refuse to remove where we are raise ValueError( "refusing to uninstall current or parent directory") handle_dirty_datasets( content_by_ds, mode=if_dirty, base=dataset) ds2save = set() results = [] # iterate over all datasets, starting at the bottom # to make the removal of dataset content known upstairs for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) paths = content_by_ds[ds_path] if ds_path in paths: # entire dataset needs to go superds = ds.get_superdataset( datalad_only=False, topmost=False) res = _uninstall_dataset(ds, check=check, has_super=False) results.extend(res) if ds.path in ds2save: # we just uninstalled it, no need to save anything ds2save.discard(ds.path) if not superds: continue subds_relpath = relpath(ds_path, start=superds.path) # remove submodule reference submodule = [sm for sm in superds.repo.repo.submodules if sm.path == subds_relpath] # there can only be one! assert(len(submodule) == 1) submodule = submodule[0] submodule.remove() if exists(ds_path): # could be an empty dir in case an already uninstalled subdataset # got removed os.rmdir(ds_path) # need to save changes to .gitmodules later content_by_ds[superds.path] = \ content_by_ds.get(superds.path, []) \ + [opj(superds.path, '.gitmodules'), ds_path] ds2save.add(superds.path) else: if check and hasattr(ds.repo, 'drop'): _drop_files(ds, paths, check=True) results.extend(ds.repo.remove(paths, r=True)) ds2save.add(ds.path) if dataset and dataset.is_installed(): # forge chain from base dataset to any leaf dataset # in order to save state changes all the way up _discover_trace_to_known(dataset.path, [], content_by_ds) save_dataset_hierarchy( content_by_ds, base=dataset.path if dataset and dataset.is_installed() else None, message='[DATALAD] removed content') return results
def __call__(path=None, dataset=None, revision=None, staged=False, ignore_subdatasets='none', report_untracked='normal', recursive=False, recursion_limit=None): if not dataset and not path: # act on the whole dataset if nothing else was specified dataset = curdir refds_path = Interface.get_refds_path(dataset) to_process = [] # track which commit ranges we want to diff per dataset ds_diffies = {} for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='diff', # unavailable is OK, because we might query for a deleted file unavailable_path_status='', nondataset_path_status='impossible', # must not use `modified`, infinite loop otherwise modified=None, return_type='generator', on_failure='ignore'): if ap.get('status', None): # we know what to report already yield ap continue if ap.get('type', None) == 'dataset': ap['process_content'] = True if ap.get('raw_input', False) or ap['path'] == refds_path: # prepopulate the revision specs for all input paths ds_diffies[ap['path'] if ap.get('type', None) == 'dataset' else ap['parentds']] = revision to_process.append(ap) # sort into datasets content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert (not completed) for ds_path in sorted(content_by_ds.keys()): if ds_path not in ds_diffies: # we don't know how to diff # this was neither an input path, nor did we see it # when diffing its parent continue content_paths = content_by_ds[ds_path] revision = ds_diffies[ds_path] for r in _parse_git_diff(ds_path, diff_thingie=ds_diffies[ds_path], paths=content_paths, ignore_submodules=ignore_subdatasets, staged=staged): r.update(dict(action='diff', logger=lgr)) if refds_path: r['refds'] = refds_path if 'status' not in r: r['status'] = 'ok' if r.get('type', None) == 'dataset': # this is a subdataset report # we need to use the reported commit range to properly adjust the # query once we hit that subdataset from_rev = r.get('revision_src', '') to_rev = r.get('revision', '') subrev = '{}..{}'.format( from_rev if from_rev else PRE_INIT_COMMIT_SHA, to_rev if to_rev else '', ) if from_rev and from_rev == to_rev: # this is a special case, where a subdataset reported changes without # a change in state/commit -- this is code for uncommitted changes # in the subdataset (including staged ones). In such a case, we # must not provide a diff range, but only the source commit we want # to diff against # XXX if this is changed, likely the same logic in annotate_paths needs # changing too! subrev = from_rev ds_diffies[r['path']] = subrev yield r if (revision and '..' in revision) or report_untracked == 'no': # don't look for untracked content, we got a revision range continue for r in _get_untracked_content(ds_path, report_untracked, paths=content_paths): r.update(dict(action='diff', logger=lgr)) if refds_path: r['refds'] = refds_path if 'status' not in r: r['status'] = 'ok' yield r
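# Editorial note (sketch; EMPTY_TREE_SHA below is a stand-in for the PRE_INIT_COMMIT_SHA
# marker used above): the subdataset handling re-derives the revision range to diff from the
# commit pair reported for the submodule. The range construction itself is just string
# assembly:
EMPTY_TREE_SHA = '4b825dc642cb6eb9a060e54bf8d69288fbee4904'  # git's empty tree object

def subdataset_diff_range(from_rev, to_rev):
    if from_rev and from_rev == to_rev:
        # no commit change: diff the working tree against that single commit
        return from_rev
    return '{}..{}'.format(from_rev or EMPTY_TREE_SHA, to_rev or '')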
def __call__( path=None, dataset=None, # support passing this through in a path by path basis to_git=None, save=True, message=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path" ) refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='"there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('added', 'modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) # TODO check if next isn't covered by discover_dataset_trace_to_targets already?? 
if dataset and ap.get('type', None) == 'dataset': # duplicates not possible, annotated_paths returns unique paths subds_to_add[ap['path']] = ap if got_nothing: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict('add', status='notneeded', path=refds_path, type='dataset', logger=lgr) return for subds in ds_to_annotate_from_recursion: if subds not in subds_to_add: # always prefer the already annotated path subds_to_add[subds] = ds_to_annotate_from_recursion[subds] if dataset: # we have a base dataset, discover any intermediate datasets between # the base and any already discovered dataset discovered = {} discover_dataset_trace_to_targets( # from here dataset.path, # to any dataset we are aware of subds_to_add.keys(), [], discovered) for parentds in discovered: for subds in discovered[parentds]: subds_to_add[subds] = subds_to_add.get( subds, dict(path=subds, parentds=parentds, type='dataset')) # merge custom paths and discovered dataset records, paths needs to go first, # because we know most about then, and subsequent annotation call we skip the # later duplicate ones to_add.extend(subds_to_add.values()) # and compact, this should be OK as all the info is in each ap dict to_add = unique(to_add, lambda x: x['path']) if not to_add: # nothing left to do, potentially all errored before return # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, it will also deduplicate annotated_paths = AnnotatePaths.__call__( path=to_add, dataset=dataset, # never recursion, done already recursive=False, action='add', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert (not completed) if not content_by_ds: # we should have complained about any inappropriate path argument # above, so if nothing is left, we can simply exit return # simple loop over datasets -- save happens later # start deep down to_save = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) torepoadd = {} respath_by_status = {} for ap in content_by_ds[ds_path]: # we have a new story ap.pop('status', None) torepoadd[ap['path']] = ap # skip anything that doesn't look like a wannabe subdataset if not ap.get('type', None) == 'dataset' or \ ap['path'] == ds_path: continue if ap.get('registered_subds', False): # subdataset that might be in this list because of the # need to save all the way up to a super dataset respath_by_status['success'] = \ respath_by_status.get('success', []) + [ap['path']] yield get_status_dict(status='notneeded', message="already known subdataset", **dict(common_report, **ap)) continue subds = Dataset(ap['path']) # check that the subds has a commit, and refuse # to operate on it otherwise, or we would get a bastard # submodule that cripples git operations if not subds.repo.get_hexsha(): yield get_status_dict( ds=subds, status='impossible', message='cannot add subdataset with no commits', **dict(common_report, **ap)) continue subds_relpath = relpath(ap['path'], ds_path) # make an attempt to configure a submodule source URL based on the # discovered remote configuration remote, branch = subds.repo.get_tracking_branch() subds_url = 
subds.repo.get_remote_url( remote) if remote else None # Register the repository in the repo tree as a submodule try: ds.repo.add_submodule(subds_relpath, url=subds_url, name=None) except CommandError as e: yield get_status_dict(ds=subds, status='error', message=e.stderr, **dict(common_report, **ap)) continue # queue for saving using the updated annotated path ap['registered_subds'] = True # I hope this is true in direct mode too # TODO this is disabled, because in some circumstances # staging just doesn't happen, and it is unclear when # exactly -- the case that prompted disabling was a submodule # that had no content except for other submodules was not staged, # whereas another submodule on the same level in the same # superdataset which also has one file in it was staged # disable to work correctly, while paying a little bit of # slow down #ap['staged'] = True to_save.append(ap) _fixup_submodule_dotgit_setup(ds, subds_relpath) # report added subdatasets -- `annex add` below won't do it yield get_status_dict(ds=subds, status='ok', message='added new subdataset', **dict(common_report, **ap)) # make sure that .gitmodules is added to the list of files gitmodules_path = opj(ds.path, '.gitmodules') # for git torepoadd[gitmodules_path] = dict(path=gitmodules_path) # and for save to_save.append( dict(path=gitmodules_path, parentds=ds_path, type='file')) # make sure any last minute additions make it to the saving stage # XXX? should content_by_ds become OrderedDict so that possible # super here gets processed last? lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd) is_annex = isinstance(ds.repo, AnnexRepo) add_kw = {'jobs': jobs} if is_annex and jobs else {} added = ds.repo.add(list(torepoadd.keys()), git=to_git if is_annex else True, **add_kw) for a in added: res = annexjson2result(a, ds, type='file', **common_report) success = success_status_map[res['status']] respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] # produce best possible path/result annotation if res['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report res = dict(torepoadd[res['path']], **res) # override this in all cases to be safe res['parentds'] = ds.path if success: # this was successfully added, queue for saving this very path # in the dataset ap = {k: v for k, v in res.items() if k != 'status'} ap['staged'] = True # strip any status and state info (e.g. save will refuse to save # stuff that is marked state='untracked' to_save.append({ k: v for k, v in res.items() if k not in ('status', 'state') }) if a['file'] == '.gitmodules': # filter out .gitmodules, because this is only included for # technical reasons and has nothing to do with the actual content continue if GitRepo.is_valid_repo(res['path']): # more accurate report in case of an added submodule # mountpoint. # XXX Actually not sure if this can really happen # (depends on what our low-level code would do) # but worst case is that we loose a little bit of # coverage... 
res['type'] = 'dataset' res['message'] = 'added new state as submodule' yield res for r in results_from_annex_noinfo( ds, torepoadd, respath_by_status, dir_fail_msg='could not add some content in %s %s', noinfo_dir_msg='nothing to add from %s', noinfo_file_msg='already included in the dataset', action='add', logger=lgr, refds=refds_path): if r['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report r = dict(r, **torepoadd[r['path']]) if r['status'] == 'notneeded': # this could be a file that was staged already, it doesn't need # to be added, but it should be saved/committed if so desired to_save.append({ k: v for k, v in r.items() if k not in ('status', 'state') }) # XXX something is fishy with the next one, rethink when sober.... if r['path'] == ds_path and r['status'] == 'ok': # this is for the entire dataset itself which was explicitly requested # make sure to save all r['type'] = 'dataset' r['process_content'] = True to_save.append( {k: v for k, v in r.items() if k != 'status'}) yield r if refds_path and ds_path != refds_path and len( respath_by_status.get('success', [])): # TODO XXX we have an issue here when, with `add('.')`, annex ignores any # dotfiles. In this case we end up not saving a dataset completely, because # we rely on accurate reporting. there is an issue about this already # TODO look up the issue ID # if there is a base dataset, but we are below it, and we have anything done to this # dataset -> queue dataset itself for saving its state in the parent ds_ap = dict( path=ds.path, # we have to look for the parent here, as we must save the # subdataset in the parent and not the whole subdataset itself type='dataset') parentds = get_dataset_root(normpath(opj(ds.path, pardir))) if parentds: ds_ap['parentds'] = parentds if dataset: ds_ap['refds'] = refds_path to_save.append(ds_ap) if not save: lgr.debug('Not calling `save` as instructed') return # TODO tell save what was staged already! Set 'staged=True' for # respective annotated paths that are fed into `save` # do not reuse any of the sorting done in here for saving, but instead # pass on all the annotated paths to have `save` figure out what to do with # them -- this costs something, but should be safer, and frankly is # more comprehensible for res in Save.__call__( # hand-selected annotated paths path=to_save, dataset=refds_path, message=message if message else '[DATALAD] added content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
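# --- Illustrative sketch (not part of the original command) ---
# The generator above yields plain result dictionaries with at least
# 'path' and 'status' keys (and usually 'type'). A hypothetical consumer
# tallying outcomes could look like the helper below; the `Add` class
# name in the usage note is an assumption mirroring the Save/Get/Install
# call pattern used elsewhere in this code.
from collections import Counter

def summarize_add_results(results):
    """Count result records per status ('ok', 'notneeded', 'error', ...)."""
    tally = Counter()
    for res in results:
        tally[res.get('status', 'unknown')] += 1
    return dict(tally)

# e.g. summarize_add_results(Add.__call__(path=['file1.dat'], return_type='generator'))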
def __call__(path=None, source=None, dataset=None, get_data=False, description=None, recursive=False, recursion_limit=None, reckless=None, jobs="auto"): # normalize path argument to be equal when called from cmdline and # python and nothing was passed into `path` path = ensure_list(path) if not source and not path: raise InsufficientArgumentsError( "Please provide at least a source or a path") # Common kwargs to pass to underlying git/install calls. # They might need adjustments (e.g. for recursion_limit, but # otherwise would be applicable throughout # # There should have been more of common options! # since underneath get could do similar installs common_kwargs = dict( get_data=get_data, recursive=recursive, recursion_limit=recursion_limit, # git_opts=git_opts, # annex_opts=annex_opts, reckless=reckless, jobs=jobs, ) # did we explicitly get a dataset to install into? # if we got a dataset, path will be resolved against it. # Otherwise path will be resolved first. ds = None if dataset is not None: ds = require_dataset(dataset, check_installed=True, purpose='installation') common_kwargs['dataset'] = dataset # pre-compute for results below refds_path = Interface.get_refds_path(ds) # switch into the two scenarios without --source: # 1. list of URLs # 2. list of (sub)dataset content if source is None: # we need to collect URLs and paths to_install = [] to_get = [] # TODO: this approach is problematic, it disrupts the order of input args. # consequently results will be returned in an unexpected order when a # mixture of source URL and paths is given. Reordering is only possible when # everything in here is fully processed before any results can be yielded. # moreover, I think the semantics of the status quo implementation are a # bit complicated: in a mixture list a source URL will lead to a new dataset # at a generated default location, but a path will lead to a subdataset # at that exact location for urlpath in path: ri = RI(urlpath) (to_get if isinstance(ri, PathRI) else to_install).append(urlpath) # 1. multiple source URLs for s in to_install: lgr.debug("Install passes into install source=%s", s) for r in Install.__call__( source=s, description=description, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, result_filter=None, **common_kwargs): # no post-processing of the installed content on disk # should be necessary here, all done by code further # down that deals with an install from an actuall `source` # any necessary fixes should go there too! r['refds'] = refds_path yield r # 2. one or more dataset content paths if to_get: lgr.debug("Install passes into get %d items", len(to_get)) # all commented out hint on inability to pass those options # into underlying install-related calls. # Also need to pass from get: # annex_get_opts for r in Get.__call__( to_get, # TODO should pass-through description, not sure why disabled # description=description, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, result_filter=None, **common_kwargs): # no post-processing of get'ed content on disk should be # necessary here, this is the responsibility of `get` # (incl. 
adjusting parent's gitmodules when submodules end # up in an "updated" state (done in get helpers) # any required fixes should go there! r['refds'] = refds_path yield r # we are done here # the rest is about install from a `source` return # an actual `source` was given if source and path and len(path) > 1: # exception is ok here, if this fails it is either direct user error # or we f****d up one of our internal calls raise ValueError( "install needs a single PATH when source is provided. " "Was given multiple PATHs: %s" % str(path)) # parameter constraints: if not source: # exception is ok here, if this fails it is either direct user error # or we f****d up one of our internal calls raise InsufficientArgumentsError( "a `source` is required for installation") # code below deals with a single path only path = path[0] if path else None if source == path: # even if they turn out to be identical after resolving symlinks # and more sophisticated witchcraft, it would still happily say # "it appears to be already installed", so we just catch an # obviously pointless input combination yield get_status_dict( 'install', path=path, status='impossible', logger=lgr, source_url=source, refds=refds_path, message= "installation `source` and destination `path` are identical. " "If you are trying to add a subdataset simply use the `save` command" ) return # resolve the target location (if local) against the provided dataset # or CWD: if path is not None: # MIH everything in here is highly similar to what common # interface helpers do (or should/could do), but at the same time # is very much tailored to just apply to `install` -- I guess # it has to stay special # Should work out just fine for regular paths, so no additional # conditioning is necessary try: path_ri = RI(path) except Exception as e: raise ValueError("invalid path argument {}: ({})".format( path, exc_str(e))) try: # Wouldn't work for SSHRI ATM, see TODO within SSHRI # yoh: path should be a local path, and mapping note within # SSHRI about mapping localhost:path to path is kinda # a peculiar use-case IMHO # TODO Stringification can be removed once PY35 is no longer # supported path = str(resolve_path(path_ri.localpath, dataset)) # any `path` argument that points to something local is now # resolved and no longer a URL except ValueError: # `path` is neither a valid source nor a local path. # TODO: The only thing left is a known subdataset with a # name, that is not a path; Once we correctly distinguish # between path and name of a submodule, we need to consider # this. # For now: Just raise raise ValueError("Invalid path argument {0}".format(path)) # `path` resolved, if there was any.
# clone dataset, will also take care of adding to superdataset, if one # is given res = Clone.__call__( source, path, dataset=ds, description=description, reckless=reckless, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" result_xfm=None, return_type='generator', result_filter=None, on_failure='ignore') # helper as_ds = YieldDatasets() destination_dataset = None for r in res: if r['action'] == 'install' and r['type'] == 'dataset': # make sure logic below is valid, only one dataset result is # coming back assert (destination_dataset is None) destination_dataset = as_ds(r) r['refds'] = refds_path yield r assert (destination_dataset) # Now, recursive calls: if recursive or get_data: # dataset argument must not be passed inside since we use bound .get # It is ok to do "inplace" as long as we still return right # after the loop ends common_kwargs.pop('dataset', '') for r in destination_dataset.get( curdir, description=description, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, **common_kwargs): r['refds'] = refds_path yield r # at this point no further post-processing should be necessary, # `clone` and `get` must have done that (incl. parent handling) # if not, bugs should be fixed in those commands return
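# --- Illustrative sketch (not part of the original command) ---
# The no-`source` branch above routes each input either to `install`
# (URL-like) or to `get` (path-like) using the RI/PathRI classes already
# used in this code. The splitter below restates that rule stand-alone;
# the helper name is made up, and RI/PathRI are assumed to be the same
# classes imported by the surrounding module.
def split_install_targets(paths):
    to_install, to_get = [], []
    for urlpath in paths:
        ri = RI(urlpath)
        (to_get if isinstance(ri, PathRI) else to_install).append(urlpath)
    return to_install, to_get

# e.g. split_install_targets(['https://example.com/ds', 'subds/data'])
# -> (['https://example.com/ds'], ['subds/data'])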
def __call__( path=None, dataset=None, recursive=False, recursion_limit=None, update_mode='target', incremental=False, force_extraction=False, save=True): refds_path = Interface.get_refds_path(dataset) # it really doesn't work without a dataset ds = require_dataset( dataset, check_installed=True, purpose='metadata aggregation') path = assure_list(path) if not path: # then current/reference dataset is "aggregated" # We should not add ds.path always since then --recursive would # also recurse current even if paths are given path.append(ds.path) agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(ds) agginfo_db = load_ds_aggregate_db(ds, abspath=True) to_save = [] to_aggregate = set() for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='aggregate_metadata', # uninstalled subdatasets could be queried via aggregated metadata # -> no 'error' unavailable_path_status='', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue ap_type = ap.get('type', None) ap_state = ap.get('state', None) assert('parentds' in ap or ap_type == 'dataset') if ap_type == 'dataset' and ap_state != 'absent': # a present dataset, we can take directly from it aggsrc = ap['path'] lgr.info('Aggregate metadata for dataset %s', aggsrc) else: # everything else needs to come from the parent aggsrc = ap['parentds'] if ap_state == 'absent': lgr.info( 'Attempt to use pre-aggregate metadata for absent %s from dataset at %s', ap['path'], aggsrc) else: lgr.info( 'Aggregate metadata for %s from dataset at %s', ap['path'], aggsrc) to_aggregate.add(aggsrc) if ap_state == 'absent': # key thought: recursive is done by path annotation, hence # once we hit an absent dataset, we are 100% certain that # there is nothing to recurse into on the file system # hence we only have to look into the aggregated metadata # of the last available dataset in the dataset tree edge # # if there is nothing at this path, we need to look into the # parentds and check if we know anything about this path # if we do, we need to grab all the info and objects # if not, we need to error res = _get_dsinfo_from_aggmetadata( aggsrc, ap['path'], recursive, agginfo_db) if not isinstance(res, list): yield get_status_dict( status='impossible', message=res, action='aggregate_metadata', path=ap['path'], logger=lgr) continue # cue for aggregation to_aggregate.update(res) else: # actually aggregate metadata for this dataset, immediately place # generated objects into the aggregated or reference dataset, # and put info into DB to get the distributed to all datasets # that need to be updated errored = _dump_extracted_metadata( ds, Dataset(aggsrc), agginfo_db, to_save, force_extraction, agg_base_path) if errored: yield get_status_dict( status='error', message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)', action='aggregate_metadata', path=aggsrc, logger=lgr) # at this point we have dumped all aggregated metadata into object files # somewhere, we know what needs saving, but having saved anything, and # we know about the states of all aggregated dataset in the DB # what remains to do is to update all dataset, so they have there own copy # of aggregated metadata and update their respective aggregate.json with # info on what states we just aggregated from # first, let's figure out what dataset need updating at all # get adjencency info of the 
dataset tree spanning the base to all leaf datasets # associated with the path arguments if update_mode == 'all': ds_adj = {} discover_dataset_trace_to_targets( ds.path, to_aggregate, [], ds_adj, # we know that to_aggregate only lists datasets, existing and # absent ones -- we want to aggregate all of them, either from # just extracted metadata, or from previously aggregated metadata # of the closest superdataset includeds=to_aggregate) # TODO we need to work in the info about datasets that we only got from # aggregated metadata, that had no trace on the file system in here!! subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate) elif update_mode == 'target': subtrees = {ds.path: list(agginfo_db.keys())} else: raise ValueError( "unknown `update_mode` '%s' for metadata aggregation" % update_mode) # go over datasets in bottom-up fashion for parentds_path in sorted(subtrees, reverse=True): lgr.info('Update aggregate metadata in dataset at: %s', parentds_path) _update_ds_agginfo( ds.path, parentds_path, subtrees[parentds_path], incremental, agginfo_db, to_save) # update complete res = get_status_dict( status='ok', action='aggregate_metadata', path=parentds_path, type='dataset', logger=lgr) res.update(agginfo_db.get(parentds_path, {})) yield res # # save potential modifications to dataset global metadata # if not to_save: return lgr.info('Attempting to save %i files/datasets', len(to_save)) for res in Save.__call__( path=to_save, dataset=refds_path, message='[DATALAD] Dataset aggregate metadata update', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
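# --- Illustrative sketch (not part of the original command) ---
# Several loops in this code process datasets "bottom-up" by sorting
# their paths in reverse lexicographic order, so that subdatasets are
# handled before their superdatasets. A tiny self-contained
# demonstration of that ordering (the paths are made up):
subtrees_demo = {
    '/data/study': [],
    '/data/study/sub-01': [],
    '/data/study/sub-01/anat': [],
}
for p in sorted(subtrees_demo, reverse=True):
    # visits .../sub-01/anat first, then .../sub-01, then /data/study last
    print(p)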
def __call__( path=None, dataset=None, revision=None, staged=False, ignore_subdatasets='none', report_untracked='normal', recursive=False, recursion_limit=None): if not dataset and not path: # act on the whole dataset if nothing else was specified dataset = curdir refds_path = Interface.get_refds_path(dataset) to_process = [] # tracks which commit ranges we want to diff per dataset ds_diffies = {} for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='diff', # unavailable is OK, because we might query for a deleted file unavailable_path_status='', nondataset_path_status='impossible', # must not use `modified`, infinite loop otherwise modified=None, return_type='generator', on_failure='ignore'): if ap.get('status', None): # we know what to report already yield ap continue if ap.get('type', None) == 'dataset': ap['process_content'] = True if ap.get('raw_input', False) or ap['path'] == refds_path: # prepopulate the revision specs for all input paths ds_diffies[ap['path'] if ap.get('type', None) == 'dataset' else ap['parentds']] = revision to_process.append(ap) # sort into datasets content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert(not completed) for ds_path in sorted(content_by_ds.keys()): if ds_path not in ds_diffies: # we don't know how to diff; # this was neither an input path, nor did we see it # when diffing its parent continue content_paths = content_by_ds[ds_path] revision = ds_diffies[ds_path] for r in _parse_git_diff( ds_path, diff_thingie=ds_diffies[ds_path], paths=content_paths, ignore_submodules=ignore_subdatasets, staged=staged): r.update(dict( action='diff', logger=lgr)) if refds_path: r['refds'] = refds_path if 'status' not in r: r['status'] = 'ok' if r.get('type', None) == 'dataset': # this is a subdataset report # we need to use the reported commit range to properly adjust the # query once we hit that subdataset from_rev = r.get('revision_src', '') to_rev = r.get('revision', '') subrev = '{}..{}'.format( from_rev if from_rev else PRE_INIT_COMMIT_SHA, to_rev if to_rev else '', ) if from_rev and from_rev == to_rev: # this is a special case, where a subdataset reported changes without # a change in state/commit -- this is code for uncommitted changes # in the subdataset (including staged ones). In such a case, we # must not provide a diff range, but only the source commit we want # to diff against # XXX if this is changed, likely the same logic in annotate_paths needs # changing too! subrev = from_rev ds_diffies[r['path']] = subrev yield r if (revision and '..' in revision) or report_untracked == 'no': # don't look for untracked content, we got a revision range continue for r in _get_untracked_content( ds_path, report_untracked, paths=content_paths): r.update(dict( action='diff', logger=lgr)) if refds_path: r['refds'] = refds_path if 'status' not in r: r['status'] = 'ok' yield r
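# --- Illustrative sketch (not part of the original command) ---
# The subdataset handling above builds a git revision range
# '<from>..<to>', falling back to a pre-init sentinel when the old state
# is unknown, and to a single commit when old and new state are equal
# (i.e. only uncommitted changes). A minimal restatement of that rule;
# the default sentinel below is a stand-in, not the real
# PRE_INIT_COMMIT_SHA.
def subdataset_diff_spec(from_rev, to_rev, pre_init_sha='0' * 40):
    if from_rev and from_rev == to_rev:
        # uncommitted/staged changes only: diff against the single commit
        return from_rev
    return '{}..{}'.format(from_rev or pre_init_sha, to_rev or '')

# subdataset_diff_spec('abc123', 'def456') -> 'abc123..def456'
# subdataset_diff_spec('abc123', 'abc123') -> 'abc123'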
def __call__(path, dataset=None, spec_file=None, properties=None, replace=False): # TODO: message dataset = require_dataset(dataset, check_installed=True, purpose="hirni spec4anything") path = assure_list(path) path = [resolve_path(p, dataset) for p in path] res_kwargs = dict(action='hirni spec4anything', logger=lgr) res_kwargs['refds'] = Interface.get_refds_path(dataset) # ### This might become superfluous. See datalad-gh-2653 ds_path = PathRI(dataset.path) # ### updated_files = [] paths = [] for ap in AnnotatePaths.__call__( dataset=dataset, path=path, action='hirni spec4anything', unavailable_path_status='impossible', nondataset_path_status='error', return_type='generator', # TODO: Check this one out: on_failure='ignore', # Note/TODO: Not sure yet whether and when we need those. # Generally we want to be able to create a spec for subdatasets, # too: # recursive=recursive, # recursion_limit=recursion_limit, # force_subds_discovery=True, # force_parentds_discovery=True, ): if ap.get('status', None) in ['error', 'impossible']: yield ap continue # ### This might become superfluous. See datalad-gh-2653 ap_path = PathRI(ap['path']) # ### # find acquisition and respective specification file: rel_path = posixpath.relpath(ap_path.posixpath, ds_path.posixpath) path_parts = rel_path.split('/') # TODO: Note: Outcommented this warning for now. We used to not have # a spec file at the toplevel of the study dataset, but now we do. # The logic afterwards works, but should be revisited. At least, # `acq` should be called differently now. # if len(path_parts) < 2: # lgr.warning("Not within an acquisition") acq = path_parts[0] # TODO: spec file specifiable or fixed path? # if we want the former, what we actually need is an # association of acquisition and its spec path # => prob. not an option but a config spec_path = spec_file if spec_file \ else posixpath.join(ds_path.posixpath, acq, dataset.config.get("datalad.hirni.studyspec.filename", "studyspec.json")) spec = [r for r in json_py.load_stream(spec_path)] \ if posixpath.exists(spec_path) else list() lgr.debug("Add specification snippet for %s", ap['path']) # XXX 'add' does not seem to be the thing we want to do # rather 'set', so we have to check whether a spec for a location # is already known and fail or replace it (maybe with --force) # go through all existing specs and extract unique value # and also assign them to the new record (subjects, ...), but only # editable fields!! uniques = dict() for s in spec: for k in s: if isinstance(s[k], dict) and 'value' in s[k]: if k not in uniques: uniques[k] = set() uniques[k].add(s[k]['value']) overrides = dict() for k in uniques: if len(uniques[k]) == 1: overrides[k] = _get_edit_dict(value=uniques[k].pop(), approved=False) if properties: # TODO: This entire reading of properties needs to be RF'd # into proper generalized functions. # spec got more complex. update() prob. can't simply override # (think: 'procedures' and 'tags' prob. need to be appended # instead) # load from file or json string if isinstance(properties, dict): props = properties elif op.exists(properties): props = json_py.load(properties) else: props = json_py.loads(properties) # turn into editable, pre-approved records spec_props = { k: dict(value=v, approved=True) for k, v in props.items() if k not in non_editables + ['tags', 'procedures'] } spec_props.update({ k: v for k, v in props.items() if k in non_editables + ['tags'] }) # TODO: still wrong. It's a list. Append or override? How to decide? 
spec_props.update({ o_k: [{ i_k: dict(value=i_v, approved=True) for i_k, i_v in o_v.items() }] for o_k, o_v in props.items() if o_k in ['procedures'] }) overrides.update(spec_props) # TODO: It's probably wrong to use uniques for overwriting! At least # they cannot be used to overwrite values explicitly set in # _add_to_spec like "location", "type", etc. # # But then: This should concern non-editable fields only, right? spec = _add_to_spec(spec, posixpath.split(spec_path)[0], ap, dataset, overrides=overrides, replace=replace) # Note: Not sure whether we really want one commit per snippet. # If not - consider: # - What if we fail amidst? => Don't write to file yet. # - What about input paths from different acquisitions? # => store specs per acquisition in memory # MIH: One commit per line seems silly. why not update all files # collect paths of updated files, and give them to a single `add` # at the very end? # MIH: if we fail, we fail and nothing is committed from datalad_hirni.support.spec_helpers import sort_spec json_py.dump2stream(sorted(spec, key=lambda x: sort_spec(x)), spec_path) updated_files.append(spec_path) yield get_status_dict(status='ok', type=ap['type'], path=ap['path'], **res_kwargs) paths.append(ap) from datalad.dochelpers import single_or_plural from os import linesep message = "[HIRNI] Add specification {n_snippets} for: {paths}".format( n_snippets=single_or_plural("snippet", "snippets", len(paths)), paths=linesep.join(" - " + op.relpath(p['path'], dataset.path) for p in paths) if len(paths) > 1 else op.relpath(paths[0]['path'], dataset.path)) for r in dataset.save(updated_files, to_git=True, message=message, return_type='generator', result_renderer='disabled'): yield r
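# --- Illustrative sketch (not part of the original command) ---
# The `uniques`/`overrides` logic above carries a field over into a new
# spec snippet only if every existing snippet agrees on its value. A
# small stand-alone version of that rule; the record layout mirrors the
# dict(value=..., approved=...) entries used in this code.
def shared_field_values(spec_snippets):
    seen = {}
    for snippet in spec_snippets:
        for key, val in snippet.items():
            if isinstance(val, dict) and 'value' in val:
                seen.setdefault(key, set()).add(val['value'])
    # keep only fields with exactly one distinct value across all snippets
    return {k: v.pop() for k, v in seen.items() if len(v) == 1}

# shared_field_values([{'subject': {'value': '01', 'approved': True}},
#                      {'subject': {'value': '01', 'approved': False}}])
# -> {'subject': '01'}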
def __call__(path=None, dataset=None, recursive=False, recursion_limit=None, update_mode='target', incremental=False, force_extraction=False, save=True): refds_path = Interface.get_refds_path(dataset) # it really doesn't work without a dataset ds = require_dataset(dataset, check_installed=True, purpose='metadata aggregation') path = assure_list(path) if not path: # then current/reference dataset is "aggregated" # We should not add ds.path always since then --recursive would # also recurse current even if paths are given path.append(ds.path) agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations( ds, # do not warn here, next call triggers the same warning warn_absent=False) agginfo_db = load_ds_aggregate_db(ds, abspath=True) to_save = [] to_aggregate = set() for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='aggregate_metadata', # uninstalled subdatasets could be queried via aggregated metadata # -> no 'error' unavailable_path_status='', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue ap_type = ap.get('type', None) ap_state = ap.get('state', None) assert ('parentds' in ap or ap_type == 'dataset') if ap_type == 'dataset' and ap_state != 'absent': # a present dataset, we can take directly from it aggsrc = ap['path'] lgr.info('Aggregate metadata for dataset %s', aggsrc) else: # everything else needs to come from the parent aggsrc = ap['parentds'] if ap_state == 'absent': lgr.info( 'Attempt to use pre-aggregate metadata for absent %s from dataset at %s', ap['path'], aggsrc) else: lgr.info('Aggregate metadata for %s from dataset at %s', ap['path'], aggsrc) to_aggregate.add(aggsrc) if ap_state == 'absent': # key thought: recursive is done by path annotation, hence # once we hit an absent dataset, we are 100% certain that # there is nothing to recurse into on the file system # hence we only have to look into the aggregated metadata # of the last available dataset in the dataset tree edge # # if there is nothing at this path, we need to look into the # parentds and check if we know anything about this path # if we do, we need to grab all the info and objects # if not, we need to error res = _get_dsinfo_from_aggmetadata(aggsrc, ap['path'], recursive, agginfo_db) if not isinstance(res, list): yield get_status_dict(status='impossible', message=res, action='aggregate_metadata', path=ap['path'], logger=lgr) continue # cue for aggregation to_aggregate.update(res) else: # actually aggregate metadata for this dataset, immediately place # generated objects into the aggregated or reference dataset, # and put info into DB to get the distributed to all datasets # that need to be updated errored = _dump_extracted_metadata(ds, Dataset(aggsrc), agginfo_db, to_save, force_extraction, agg_base_path) if errored: yield get_status_dict( status='error', message= 'Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)', action='aggregate_metadata', path=aggsrc, logger=lgr) # at this point we have dumped all aggregated metadata into object files # somewhere, we know what needs saving, but having saved anything, and # we know about the states of all aggregated dataset in the DB # what remains to do is to update all dataset, so they have there own copy # of aggregated metadata and update their respective aggregate.json with # info on what states we just aggregated from # first, let's 
figure out which datasets need updating at all # get adjacency info of the dataset tree spanning the base to all leaf datasets # associated with the path arguments if update_mode == 'all': ds_adj = {} discover_dataset_trace_to_targets( ds.path, to_aggregate, [], ds_adj, # we know that to_aggregate only lists datasets, existing and # absent ones -- we want to aggregate all of them, either from # just extracted metadata, or from previously aggregated metadata # of the closest superdataset includeds=to_aggregate) # TODO we need to work in the info about datasets that we only got from # aggregated metadata, that had no trace on the file system in here!! subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate) elif update_mode == 'target': subtrees = {ds.path: list(agginfo_db.keys())} else: raise ValueError( "unknown `update_mode` '%s' for metadata aggregation" % update_mode) # go over datasets in bottom-up fashion for parentds_path in sorted(subtrees, reverse=True): lgr.info('Update aggregate metadata in dataset at: %s', parentds_path) _update_ds_agginfo(ds.path, parentds_path, subtrees[parentds_path], incremental, agginfo_db, to_save) # update complete res = get_status_dict(status='ok', action='aggregate_metadata', path=parentds_path, type='dataset', logger=lgr) res.update(agginfo_db.get(parentds_path, {})) yield res # # save potential modifications to dataset global metadata # if not to_save: return lgr.info('Attempting to save %i files/datasets', len(to_save)) for res in Save.__call__( # save does not need any pre-annotated path hints path=[r['path'] for r in to_save], dataset=refds_path, message='[DATALAD] Dataset aggregate metadata update', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
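# --- Illustrative sketch (not part of the original command) ---
# `update_mode` decides which datasets receive refreshed aggregate
# metadata: 'target' touches only the reference dataset, while 'all'
# expands the adjacency info to every dataset on the way to the
# aggregated ones. A hypothetical, simplified dispatcher expressing the
# same choice; the '_adj2subtrees'-style expansion is passed in as a
# callable rather than reimplemented.
def plan_agg_updates(update_mode, ref_path, agg_paths, expand_all):
    if update_mode == 'target':
        return {ref_path: list(agg_paths)}
    if update_mode == 'all':
        return expand_all(ref_path, agg_paths)
    raise ValueError("unknown `update_mode` %r" % update_mode)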
def __call__( path=None, dataset=None, recursive=False, recursion_limit=None, action=None, unavailable_path_status='', unavailable_path_msg=None, nondataset_path_status='error', force_parentds_discovery=True, force_subds_discovery=True, force_no_revision_change_discovery=True, force_untracked_discovery=True, modified=None): # upfront check for the fastest possible response if not path and dataset is None: # nothing given, try "here", but do not use `require_dataset`, as # it will determine the root dataset of `curdir` and further down # lead to path annotation of upstairs directories dataset = curdir if force_subds_discovery and not force_parentds_discovery: raise ValueError( 'subdataset discovery requires parent dataset discovery') # CONCEPT: yield with no status to indicate further processing # everything in one big loop to be able too yield as fast a possible # without any precomputing for all paths refds_path = Interface.get_refds_path(dataset) if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)): raise ValueError( "modification detection only works with a base dataset (non-given or found)") # prep common result props res_kwargs = dict( action=action if action else 'annotate_path', refds=refds_path, logger=lgr) # handle the case of recursion into a single dataset without any # extra fancy processing first -- full recursion can be done # faster than manual recursion, hence we gain quite some speed # from these few lines of extra code if not modified and not path and refds_path: if not GitRepo.is_valid_repo(refds_path): yield get_status_dict( # doesn't matter if the path is in another dataset # it was given as reference dataset status=nondataset_path_status, message='given reference dataset is not a dataset', path=refds_path, **res_kwargs) return refds = Dataset(refds_path) path = [] # yield the dataset itself r = get_status_dict(ds=refds, status='', **res_kwargs) yield r if recursive: # if we have nothing given, but need recursion, we need to feed # the dataset path itself for r in yield_recursive( refds, refds_path, action, recursion_limit): r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] yield r return # goal: structure in a way that makes most information on any path # available in a single pass, at the cheapest possible cost reported_paths = {} requested_paths = assure_list(path) if modified is not None: # modification detection would silently kill all nondataset paths # but we have to complain about them, hence doing it here if requested_paths and refds_path: for r in requested_paths: p = r['path'] if isinstance(r, dict) else r p = resolve_path(p, ds=refds_path) if _with_sep(p).startswith(_with_sep(refds_path)): # all good continue # not the refds path_props = r if isinstance(r, dict) else {} res = get_status_dict( **dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with reference dataset' reported_paths[r] = res yield res # preserve non-existing paths to be silently killed by modification # detection and append them to requested_paths again after detection. # TODO: This might be melted in with treatment of non dataset paths # above. Re-appending those paths seems to be better than yielding # directly to avoid code duplication, since both cases later on are # dealt with again. 
preserved_paths = [] if requested_paths: [preserved_paths.append(r) for r in requested_paths if not lexists(r['path'] if isinstance(r, dict) else r)] # replace the requested paths by those paths that were actually # modified underneath or at a requested location requested_paths = get_modified_subpaths( # either the request, or the base dataset, if there was no request requested_paths if requested_paths else [refds_path], refds=Dataset(refds_path), revision=modified, report_no_revision_change=force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit) from itertools import chain # re-append the preserved paths: requested_paths = chain(requested_paths, iter(preserved_paths)) # do not loop over unique(), this could be a list of dicts # we avoid duplicates manually below via `reported_paths` for path in requested_paths: if not isinstance(path, dict): path = rawpath2ap(path, refds_path) # this is now an annotated path! path_props = path path = path['path'] # we need to mark our territory, who knows where this has been path_props.update(res_kwargs) if path in reported_paths: # we already recorded this path in the output # this can happen, whenever `path` is a subdataset, that was # discovered via recursive processing of another path before continue # the path exists in some shape or form # TODO if we have path_props already we could skip this test if isdir(path): # keep any existing type info, previously a more expensive run # could have discovered an uninstalled 'dataset', and we don't # want it to be relabeled to a directory path_props['type'] = \ path_props.get( 'type', 'dataset' if GitRepo.is_valid_repo(path) else 'directory') # this could contain all types of additional content containing_dir = path else: if lexists(path): path_props['type'] = 'file' else: path_props['state'] = 'absent' # for everything else we are interested in the container containing_dir = dirname(path) if not containing_dir: containing_dir = curdir dspath = parent = get_dataset_root(containing_dir) if dspath: if path_props.get('type', None) == 'dataset': # for a dataset the root is not the parent, for anything else # it is parent = path_props.get('parentds', None) oneupdir = normpath(opj(containing_dir, pardir)) if parent is None and (force_parentds_discovery or ( refds_path and _with_sep(oneupdir).startswith( _with_sep(refds_path)))): # either forced, or only if we have a reference dataset, and # only if we stay within this refds when searching for the # parent parent = get_dataset_root(normpath(opj(containing_dir, pardir))) # NOTE the `and refds_path` is critical, as it will determine # whether a top-level dataset that was discovered gets the # parent property or not, it won't get it without a common # base dataset, and that is how we always rolled if parent and refds_path: path_props['parentds'] = parent # don't check whether this is actually a true subdataset of the # parent, done further down else: # set parent, but prefer existing property path_props['parentds'] = path_props.get('parentds', dspath) # test for `dspath` not `parent`, we only need to know whether there is # ANY dataset, not which one is the true parent, logic below relies on # the fact that we end here, if there is no dataset at all if not dspath: # not in any dataset res = get_status_dict( **dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with any dataset' reported_paths[path] = res yield res continue # check that we 
only got SUBdatasets if refds_path and not _with_sep(dspath).startswith(_with_sep(refds_path)): res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = \ ('path not part of the reference dataset at %s', refds_path) reported_paths[path] = res yield res continue if path_props.get('type', None) == 'file': # nothing else we can learn about this res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res continue containing_ds = None path_type = path_props.get('type', None) if parent and force_subds_discovery and ( (path_type == 'dataset' and 'registered_subds' not in path_props) or path_type == 'directory' or not lexists(path)): # if the path doesn't exist, or is labeled a directory, or a dataset even # a dataset (without this info) -> record whether this is a known subdataset # to its parent containing_ds = Dataset(parent) subdss = containing_ds.subdatasets( fulfilled=None, recursive=False, result_xfm=None, result_filter=None, return_type='list') if path in [s['path'] for s in subdss]: if path_type == 'directory' or not lexists(path): # first record that it isn't here, if just a dir or not here at all path_props['state'] = 'absent' # this must be a directory, and it is not installed path_props['type'] = 'dataset' path_props['registered_subds'] = True if not lexists(path) or \ (path_props.get('type', None) == 'dataset' and path_props.get('state', None) == 'absent'): # not there (yet) message = unavailable_path_msg if unavailable_path_msg else None if message and '%s' in message: message = (message, path) path_props['message'] = message res = get_status_dict(**dict(res_kwargs, **path_props)) # assign given status, but only if the props don't indicate a status # already res['status'] = path_props.get( 'status', unavailable_path_status) reported_paths[path] = res yield res continue # we know everything we can, report res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res rec_paths = [] if recursive: # here we need to consider the special case that `path` is # a dataset itself, if a recursion_limit is given (e.g. # `remove` will do that by default), we need to recurse # from the dataset itself, and not its parent to get things # right -- this will also avoid needless discovery of # unrelated subdatasets if path_props.get('type', None) == 'dataset': containing_ds = Dataset(path) else: # regular parent, we might have a dataset already containing_ds = Dataset(parent) if containing_ds is None else containing_ds for r in yield_recursive(containing_ds, path, action, recursion_limit): # capture reported paths r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] reported_paths[r['path']] = r if modified is not None: # we cannot yield right away, maybe it wasn't modified rec_paths.append(r) else: yield r if modified is not None and rec_paths: # replace the recursively discovered paths by those paths that # were actually modified underneath or at a requested location for r in get_modified_subpaths( rec_paths, refds=Dataset(refds_path), revision=modified, report_no_revision_change=force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit): res = get_status_dict(**dict(r, **res_kwargs)) reported_paths[res['path']] = res yield res return
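# --- Illustrative sketch (not part of the original command) ---
# Path containment above is decided by prefix-matching after appending
# the path separator, so that '/data/study2' is not mistaken for a path
# inside '/data/study'. Below is a minimal stand-in for the `_with_sep`
# based check used in this code; the real helper may differ in detail.
import os.path as op

def is_under(path, base, sep=op.sep):
    path = path if path.endswith(sep) else path + sep
    base = base if base.endswith(sep) else base + sep
    return path.startswith(base)

# is_under('/data/study/sub-01', '/data/study')  -> True
# is_under('/data/study2', '/data/study')        -> False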
def __call__(path=None, dataset=None, to=None, since=None, missing='fail', force=False, transfer_data='auto', recursive=False, recursion_limit=None, git_opts=None, annex_opts=None, annex_copy_opts=None, jobs=None): # if ever we get a mode, for "with-data" we would need this #if dataset and not path: # # act on the whole dataset if nothing else was specified # path = dataset.path if isinstance(dataset, Dataset) else dataset if not dataset and not path: # try to find a dataset in PWD dataset = require_dataset(None, check_installed=True, purpose='publishing') if since and not dataset: raise InsufficientArgumentsError( 'Modification detection (--since) without a base dataset ' 'is not supported') if dataset and since == '': # only update since last update so we figure out what was the last update active_branch = dataset.repo.get_active_branch() if to: # XXX here we assume one to one mapping of names from local branches # to the remote since = '%s/%s' % (to, active_branch) else: # take tracking remote for the active branch tracked_remote, tracked_refspec = dataset.repo.get_tracking_branch( ) if tracked_remote: if tracked_refspec.startswith('refs/heads/'): tracked_refspec = tracked_refspec[len('refs/heads/'):] #to = tracked_remote since = '%s/%s' % (tracked_remote, tracked_refspec) else: lgr.info( "No tracked remote for %s. since option is of no effect", active_branch) since = None # here is the plan # 1. figure out remote to publish to # 2. figure out which content needs to be published to this remote # 3. look for any pre-publication dependencies of that remote # (i.e. remotes that need to be published to before) # 4. publish the content needed to go to the primary remote to # the dependencies first, and to the primary afterwards ds_remote_info = {} refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(refds=refds_path, logger=lgr, action='publish') to_process = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='publish', unavailable_path_status='impossible', nondataset_path_status='error', modified=since, return_type='generator', on_failure='ignore', force_no_revision_change_discovery= False, # we cannot publish what was not committed force_untracked_discovery=False # we cannot publish untracked ): if ap.get('status', None): # this is done yield ap continue remote_info_result = None if ap.get('type', ap.get('type_src', 'dataset')) != 'dataset': # for everything that is not a dataset get the remote info # for the parent parentds = ap.get('parentds', None) if parentds and parentds not in ds_remote_info: remote_info_result = _get_remote_info( parentds, ds_remote_info, to, missing) else: # this is a dataset if ap.get('state', None) == 'absent': continue # get the remote info for itself remote_info_result = _get_remote_info(ap['path'], ds_remote_info, to, missing) ap['process_content'] = True if remote_info_result is not None: ap['status'] = remote_info_result[0] ap['message'] = remote_info_result[1] yield ap continue to_process.append(ap) content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert (not completed) lgr.debug("Evaluating %i dataset publication candidate(s)", len(content_by_ds)) # TODO: fancier sorting, so we still follow somewhat the hierarchy # in sorted order, e.g. 
# d1/sub1/sub1 # d1/sub1 # d1 # d2/sub1 # d2 content_by_ds = OrderedDict( (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True)) lgr.debug("Attempt to publish %i datasets", len(content_by_ds)) for ds_path in content_by_ds: remote_info = ds_remote_info.get(ds_path, None) if remote_info is None: # maybe this dataset wasn't annotated above, try to get info # MIH: I think this entire if-branch is practically impossible # to reach. It is certainly untested, but I think this is due # to mutually exclusive conditions during remote_info detection remote_info_result = _get_remote_info(ds_path, ds_remote_info, to, missing) if remote_info_result is not None: yield get_status_dict(type='dataset', path=ds_path, status=remote_info_result[0], message=remote_info_result[1], **res_kwargs) continue # continue with freshly obtained info remote_info = ds_remote_info[ds_path] # condition above must catch all other cases assert remote_info # and publish ds = Dataset(ds_path) for r in _publish_dataset( ds, remote=remote_info['remote'], refspec=remote_info.get('refspec', None), # only send paths that were explicitly requested paths= [ p for p in content_by_ds[ds_path] # do not feed (sub)dataset paths into the beast # makes no sense to try to annex copy them # for the base dataset itself let `transfer_data` # decide if p.get('type', None) != 'dataset' ], annex_copy_options=annex_copy_opts, force=force, jobs=jobs, transfer_data=transfer_data, **res_kwargs): yield r
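# --- Illustrative sketch (not part of the original command) ---
# With --since='' the code above derives the comparison point from the
# target sibling (or the tracking remote) and the active branch, e.g.
# 'origin/master'. A minimal restatement of that string construction;
# the helper name is made up and the refs/heads/ stripping mirrors the
# code above.
def default_since(to_remote, active_branch, tracked_remote=None, tracked_refspec=None):
    if to_remote:
        return '%s/%s' % (to_remote, active_branch)
    if tracked_remote and tracked_refspec:
        if tracked_refspec.startswith('refs/heads/'):
            tracked_refspec = tracked_refspec[len('refs/heads/'):]
        return '%s/%s' % (tracked_remote, tracked_refspec)
    return None

# default_since('origin', 'master') -> 'origin/master'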
def __call__( path=None, dataset=None, recursive=False, recursion_limit=None, check=True, if_dirty='save-before'): if not dataset and not path: raise InsufficientArgumentsError( "insufficient information for `drop`: requires at least a path or dataset") refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(action='drop', logger=lgr, refds=refds_path) if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path to_drop = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='drop', # justification for status: # content need not be dropped where there is none unavailable_path_status='notneeded', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) == 'dataset' and \ GitRepo.is_valid_repo(ap['path']) and \ not ap['path'] == refds_path: ap['process_content'] = True if ap.get('registered_subds', False) and ap.get('state', None) == 'absent': # nothing to drop in an absent subdataset, don't be annoying # and skip silently continue to_drop.append(ap) content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_drop, refds_path=refds_path) assert(not completed) # iterate over all datasets, order doesn't matter for ds_path in content_by_ds: ds = Dataset(ds_path) # TODO generator # this should yield what it did handle_dirty_dataset(ds, mode=if_dirty) # ignore submodule entries content = [ap['path'] for ap in content_by_ds[ds_path] if ap.get('type', None) != 'dataset' or ap['path'] == ds.path] if not content: continue for r in _drop_files(ds, content, check=check, **res_kwargs): yield r
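# --- Illustrative sketch (not part of the original command) ---
# `drop` above keeps only non-dataset paths per dataset (plus the
# dataset path itself) before handing them to _drop_files, so submodule
# entries are never passed through. The same filter, stand-alone, over
# made-up annotated-path records:
def droppable_content(annotated_paths, ds_path):
    return [ap['path'] for ap in annotated_paths
            if ap.get('type', None) != 'dataset' or ap['path'] == ds_path]

# droppable_content(
#     [{'path': '/ds/file.dat', 'type': 'file'},
#      {'path': '/ds/subds', 'type': 'dataset'},
#      {'path': '/ds', 'type': 'dataset'}],
#     '/ds')
# -> ['/ds/file.dat', '/ds']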
def __call__( path=None, dataset=None, # support passing this through in a path by path basis to_git=None, save=True, message=None, message_file=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path") refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) if message and message_file: raise ValueError("Both a message and message file were specified") if message_file: with open(message_file, "rb") as mfh: message = assure_unicode(mfh.read()) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='"there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('added', 'modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) if got_nothing: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict( 'add', status='notneeded', path=refds_path, type='dataset', logger=lgr) return for subds in ds_to_annotate_from_recursion: if subds not in subds_to_add: # always prefer the already annotated path subds_to_add[subds] = ds_to_annotate_from_recursion[subds] if dataset: # we have a base dataset, discover any intermediate datasets between # the base and any already discovered dataset discovered = {} discover_dataset_trace_to_targets( # from here dataset.path, # to any dataset we are aware of subds_to_add.keys(), [], 
        discovered)
    for parentds in discovered:
        for subds in discovered[parentds]:
            subds_to_add[subds] = subds_to_add.get(
                subds,
                dict(path=subds, parentds=parentds, type='dataset'))
    # merge custom paths and discovered dataset records; paths need to go
    # first, because we know most about them, and in the subsequent
    # annotation call we skip the later duplicate ones
    to_add.extend(subds_to_add.values())
    # and compact, this should be OK as all the info is in each ap dict
    to_add = unique(to_add, lambda x: x['path'])

    if not to_add:
        # nothing left to do, potentially all errored before
        return

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, it will also deduplicate
    annotated_paths = AnnotatePaths.__call__(
        path=to_add,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='add',
        unavailable_path_status='impossible',
        unavailable_path_msg="path does not exist: %s",
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert(not completed)

    if not content_by_ds:
        # we should have complained about any inappropriate path argument
        # above, so if nothing is left, we can simply exit
        return

    # simple loop over datasets -- save happens later
    # start deep down
    to_save = []
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        torepoadd = {}
        respath_by_status = {}
        for ap in content_by_ds[ds_path]:
            # we have a new story
            ap.pop('status', None)
            torepoadd[ap['path']] = ap

            # skip anything that doesn't look like a wannabe subdataset
            if not ap.get('type', None) == 'dataset' or \
                    ap['path'] == ds_path:
                continue

            if ap.get('registered_subds', False):
                # subdataset that might be in this list because of the
                # need to save all the way up to a super dataset
                respath_by_status['success'] = \
                    respath_by_status.get('success', []) + [ap['path']]
                yield get_status_dict(
                    status='notneeded',
                    message="already known subdataset",
                    **dict(common_report, **ap))
                continue

            subds = Dataset(ap['path'])
            subds_relpath = relpath(ap['path'], ds_path)
            # register the repository in the repo tree as a submodule
            try:
                ds.repo.add_submodule(subds_relpath, url=None, name=None)
            except (CommandError, InvalidGitRepositoryError) as e:
                yield get_status_dict(
                    ds=subds,
                    status='error',
                    message=e.stderr,
                    **dict(common_report, **ap))
                continue

            # queue for saving using the updated annotated path
            ap['registered_subds'] = True
            # I hope this is true in direct mode too
            # TODO this is disabled, because in some circumstances
            # staging just doesn't happen, and it is unclear when
            # exactly -- the case that prompted disabling was a submodule
            # that had no content except for other submodules and was not
            # staged, whereas another submodule on the same level in the same
            # superdataset which also has one file in it was staged
            # disabled so that things work correctly, at the cost of a bit of
            # a slow down
            #ap['staged'] = True
            to_save.append(ap)
            # report added subdatasets -- `annex add` below won't do it
            yield get_status_dict(
                ds=subds,
                status='ok',
                message='added new subdataset',
                **dict(common_report, **ap))
            # make sure that .gitmodules is added to the list of files
            gitmodules_path = opj(ds.path, '.gitmodules')
            # for git
            torepoadd[gitmodules_path] = dict(path=gitmodules_path)
            # and for save
            to_save.append(dict(
                path=gitmodules_path,
                parentds=ds_path,
                type='file'))

        # make sure any last minute additions make it to the saving stage
        # XXX? should content_by_ds become an OrderedDict so that a possible
        # super here gets processed last?
        lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
        is_annex = isinstance(ds.repo, AnnexRepo)
        add_kw = {'jobs': jobs} if is_annex and jobs else {}
        added = ds.repo.add_(
            list(torepoadd.keys()),
            git=to_git if is_annex else True,
            **add_kw
        )
        for a in added:
            res = annexjson2result(a, ds, type='file', **common_report)
            success = success_status_map[res['status']]
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
            # produce the best possible path/result annotation
            if res['path'] in torepoadd:
                # pull out the correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                res = dict(torepoadd[res['path']], **res)
            # override this in all cases to be safe
            res['parentds'] = ds.path
            if success:
                # this was successfully added, queue this very path for
                # saving in the dataset
                ap = {k: v for k, v in res.items() if k != 'status'}
                ap['staged'] = True
                # strip any status and state info (e.g. save will refuse to
                # save stuff that is marked state='untracked')
                to_save.append(
                    {k: v for k, v in res.items()
                     if k not in ('status', 'state')})
            if a['file'] == '.gitmodules':
                # filter out .gitmodules, because this is only included for
                # technical reasons and has nothing to do with the actual
                # content
                continue
            if GitRepo.is_valid_repo(res['path']):
                # more accurate report in case of an added submodule
                # mountpoint.
                # XXX actually not sure if this can really happen
                # (depends on what our low-level code would do),
                # but worst case is that we lose a little bit of coverage...
                res['type'] = 'dataset'
                res['message'] = 'added new state as submodule'
            yield res

        for r in results_from_annex_noinfo(
                ds, torepoadd, respath_by_status,
                dir_fail_msg='could not add some content in %s %s',
                noinfo_dir_msg='nothing to add from %s',
                noinfo_file_msg='already included in the dataset',
                action='add',
                logger=lgr,
                refds=refds_path):
            if r['path'] in torepoadd:
                # pull out the correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                r = dict(r, **torepoadd[r['path']])
            if r['status'] == 'notneeded':
                # this could be a file that was staged already, it doesn't
                # need to be added, but it should be saved/committed if so
                # desired
                to_save.append(
                    {k: v for k, v in r.items()
                     if k not in ('status', 'state')})
            # XXX something is fishy with the next one, rethink when sober...
            if r['path'] == ds_path and r['status'] == 'ok':
                # this is for the entire dataset itself which was explicitly
                # requested -- make sure to save all
                r['type'] = 'dataset'
                r['process_content'] = True
                to_save.append({k: v for k, v in r.items() if k != 'status'})
            yield r

        if refds_path and ds_path != refds_path and \
                len(respath_by_status.get('success', [])):
            # TODO XXX we have an issue here when, with `add('.')`, annex
            # ignores any dotfiles. In this case we end up not saving a
            # dataset completely, because we rely on accurate reporting.
            # there is an issue about this already
            # TODO look up the issue ID
            # if there is a base dataset, but we are below it, and we have
            # anything done to this dataset -> queue the dataset itself for
            # saving its state in the parent
            ds_ap = dict(
                path=ds.path,
                # we have to look for the parent here, as we must save the
                # subdataset in the parent and not the whole subdataset itself
                type='dataset')
            parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
            if parentds:
                ds_ap['parentds'] = parentds
            if dataset:
                ds_ap['refds'] = refds_path
            to_save.append(ds_ap)

    if not save:
        lgr.debug('Not calling `save` as instructed')
        return

    # TODO tell save what was staged already! Set 'staged=True' for
    # respective annotated paths that are fed into `save`

    # do not reuse any of the sorting done in here for saving, but instead
    # pass on all the annotated paths to have `save` figure out what to do
    # with them -- this costs something, but should be safer, and frankly is
    # more comprehensible
    for res in Save.__call__(
            # hand-selected annotated paths
            path=to_save,
            dataset=refds_path,
            message=message if message else '[DATALAD] added content',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
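# Usage sketch (not part of the original source): one way the generator-based
# implementation above might be consumed. The command class name `Add`, the
# helper name, and the example paths are illustrative assumptions; the
# result-record keys ('status', 'path') mirror what get_status_dict() yields
# in the code above.
def _example_collect_failures():
    failed = []
    for res in Add.__call__(
            path=['data/file1.csv'],   # hypothetical input path
            dataset='/tmp/myds',       # hypothetical dataset location
            return_type='generator',
            on_failure='ignore'):
        if res.get('status') in ('error', 'impossible'):
            failed.append(res['path'])
    return failed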
def __call__(
        path=None,
        dataset=None,
        to_git=False,
        save=True,
        recursive=False,
        recursion_limit=None,
        ds2super=False,
        git_opts=None,
        annex_opts=None,
        annex_add_opts=None,
        jobs=None):
    # parameter constraints:
    if not path:
        raise InsufficientArgumentsError(
            "insufficient information for adding: requires at least a path")
    # never recursion, need to handle it manually below to be able to
    # discover untracked content
    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=False)
    if unavailable_paths:
        lgr.warning("ignoring non-existent path(s): %s", unavailable_paths)
    if recursive:
        # with --recursive for each input path traverse the directory
        # tree, when we find a dataset, add it to the spec, AND add it as
        # a path to the spec of the parent
        # MIH: wrap in list() to avoid exception, because dict size might
        # change, but we want to loop over all that are in at the start
        # only
        for d in list(content_by_ds.keys()):
            for p in content_by_ds[d]:
                _discover_subdatasets_recursively(
                    p, [d], content_by_ds, recursion_limit)
    if not content_by_ds:
        raise InsufficientArgumentsError(
            "no existing content given to add")

    if dataset:
        # remember the datasets associated with actual inputs
        input_ds = list(content_by_ds.keys())
        # forge chain from base dataset to any leaf dataset
        _discover_trace_to_known(dataset.path, [], content_by_ds)
        if ds2super:
            # now check all dataset entries corresponding to the original
            # input to see if they contain their own paths and remove them
            for inpds in input_ds:
                content_by_ds[inpds] = [
                    p for p in content_by_ds[inpds] if not p == inpds]
            # and lastly remove all entries that contain no path to avoid
            # saving any staged content in the final step
            content_by_ds = {d: v for d, v in content_by_ds.items() if v}

    results = []
    # simple loop over datasets -- save happens later
    # start deep down
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        toadd = list(set(content_by_ds[ds_path]))
        # handle anything that looks like a wannabe subdataset
        for subds_path in [d for d in toadd
                           if GitRepo.is_valid_repo(d) and
                           d != ds_path and
                           d not in ds.get_subdatasets(
                               recursive=False,
                               absolute=True,
                               fulfilled=True)]:
            # TODO add a check that the subds has a commit, and refuse
            # to operate on it otherwise, or we would get a bastard
            # submodule that cripples git operations
            _install_subds_inplace(
                ds=ds,
                path=subds_path,
                relativepath=relpath(subds_path, ds_path))
            # make sure that .gitmodules is added to the list of files
            toadd.append(opj(ds.path, '.gitmodules'))
            # report added subdatasets -- add below won't do it
            results.append({'success': True, 'file': Dataset(subds_path)})
        # make sure any last minute additions make it to the saving stage
        content_by_ds[ds_path] = toadd
        added = ds.repo.add(
            toadd,
            git=to_git if isinstance(ds.repo, AnnexRepo) else True,
            commit=False)
        for a in added:
            a['file'] = opj(ds_path, a['file'])
        results.extend(added)

    if results and save:
        save_dataset_hierarchy(
            content_by_ds,
            base=dataset.path if dataset and dataset.is_installed() else None,
            message='[DATALAD] added content')
    return results
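# Usage sketch (not part of the original source): the older variant above
# returns a plain list of per-file records instead of yielding result dicts.
# The `Add` class name and the helper name are assumptions; the
# 'success'/'file' keys are taken from the records built in the loop above.
def _example_count_added():
    results = Add.__call__(path=['code/script.py'],  # hypothetical input path
                           to_git=True,
                           save=True)
    # count records that report a successful add
    return len([r for r in results if r.get('success')])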