def _get_flexible_source_candidates_for_submodule(ds, sm_path, sm_url=None):
    """Retrieve candidates from where to install the submodule

    Even if a URL for the submodule is provided explicitly -- first try URLs
    under the remote of the parent dataset's tracking branch.
    """
    clone_urls = []

    # should be our first candidate
    tracking_remote, tracking_branch = ds.repo.get_tracking_branch()
    candidate_remotes = [tracking_remote] if tracking_remote else []

    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    try:
        last_commit = next(ds.repo._get_files_history(sm_path)).hexsha
        # ideally should also give preference to the remotes which have
        # the same branch checked out I guess
        candidate_remotes += list(
            ds.repo._get_remotes_having_commit(last_commit))
    except StopIteration:
        # no commit for it known yet, ... oh well
        pass

    for remote in unique(candidate_remotes):
        remote_url = ds.repo.get_remote_url(remote, push=False)

        # Directly on parent's ds url
        if remote_url:
            # attempt: submodule checkout at parent remote URL
            # We might need to quote sm_path portion, e.g. for spaces etc
            if isinstance(RI(remote_url), URL):
                sm_path_url = urlquote(sm_path)
            else:
                sm_path_url = sm_path

            clone_urls.extend(
                _get_flexible_source_candidates(
                    # alternate suffixes are tested by `clone` anyways
                    sm_path_url, remote_url, alternate_suffix=False))

        # attempt: provided (configured?) submodule URL
        # TODO: consider supporting DataLadRI here? or would confuse
        # git and we wouldn't want that (i.e. not allow pure git clone
        # --recursive)
        if sm_url:
            clone_urls += _get_flexible_source_candidates(
                sm_url,
                remote_url,
                alternate_suffix=False
            )

    # Do based on the ds.path as the last resort
    if sm_url:
        clone_urls += _get_flexible_source_candidates(
            sm_url,
            ds.path,
            alternate_suffix=False)

    return unique(clone_urls)
def to_str(self, include_output=True):
    from datalad.utils import (
        ensure_unicode,
        join_cmdline,
    )
    to_str = "{}: ".format(self.__class__.__name__)
    cmd = self.cmd
    if cmd:
        to_str += "'{}'".format(
            # go for a compact, normal looking, properly quoted
            # command rendering if the command is in list form
            join_cmdline(cmd) if isinstance(cmd, list) else cmd)
    if self.code:
        to_str += " failed with exitcode {}".format(self.code)
    if self.cwd:
        # only if not under standard PWD
        to_str += " under {}".format(self.cwd)
    if self.msg:
        # typically a command error has no specific message
        to_str += " [{}]".format(ensure_unicode(self.msg))

    if not include_output:
        return to_str

    if self.stdout:
        to_str += " [out: '{}']".format(
            ensure_unicode(self.stdout).strip())
    if self.stderr:
        to_str += " [err: '{}']".format(
            ensure_unicode(self.stderr).strip())
    if self.kwargs:
        if 'stdout_json' in self.kwargs:
            src_keys = ('note', 'error-messages')
            from datalad.utils import unique
            json_errors = unique(
                '; '.join(str(m[key]) for key in src_keys if m.get(key))
                for m in self.kwargs['stdout_json']
                if any(m.get(k) for k in src_keys))
            if json_errors:
                to_str += " [errors from JSON records: {}]".format(
                    json_errors)
        to_str += " [info keys: {}]".format(
            ', '.join(self.kwargs.keys()))
    return to_str
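# A minimal usage sketch of the rendering above (illustrative, not from the
# source). It assumes datalad's CommandError accepts `cmd`, `code`, and
# `stderr` as constructor keywords matching the attributes that to_str()
# reads; the exact constructor signature is an assumption here.
from datalad.support.exceptions import CommandError

err = CommandError(
    cmd=['git', 'annex', 'whereis', 'file.dat'],  # rendered via join_cmdline
    code=1,
    stderr='git-annex: file.dat not found')
print(err.to_str())
# expected along the lines of:
# CommandError: 'git annex whereis file.dat' failed with exitcode 1
#   [err: 'git-annex: file.dat not found']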
def _get_flexible_source_candidates_for_submodule(ds, sm_path, sm_url=None):
    """Retrieve candidates from where to install the submodule

    Even if a URL for the submodule is provided explicitly -- first try URLs
    under the remote of the parent dataset's tracking branch.
    """
    clone_urls = []

    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    remote_name, remote_url = _get_tracking_source(ds)

    # Directly on parent's ds url
    if remote_url:
        # attempt: submodule checkout at parent remote URL
        # We might need to quote sm_path portion, e.g. for spaces etc
        if isinstance(RI(remote_url), URL):
            sm_path_url = urlquote(sm_path)
        else:
            sm_path_url = sm_path

        clone_urls.extend(
            _get_flexible_source_candidates(
                # alternate suffixes are tested by `clone` anyways
                sm_path_url, remote_url, alternate_suffix=False))

    # attempt: provided (configured?) submodule URL
    # TODO: consider supporting DataLadRI here? or would confuse
    # git and we wouldn't want that (i.e. not allow pure git clone
    # --recursive)
    if sm_url:
        clone_urls += _get_flexible_source_candidates(
            sm_url,
            remote_url if remote_url else ds.path,
            alternate_suffix=False)

    return unique(clone_urls)
def _the_same_across_datasets(relpath, *dss):
    """Check if a file (with content present or not) is identical across datasets

    Compares files by content if under git, or by key checksum if under annex.

    Parameters
    ----------
    relpath: str
      path within the datasets
    *dss: Datasets

    Returns
    -------
    bool or None
      True if identical, False if not, None if it cannot be decided
      (e.g. different git-annex backends used)
    """
    import os.path as op
    from datalad.utils import all_same, md5sum, split_ext, unique
    from datalad.support.exceptions import FileInGitError
    from datalad.support.digests import Digester

    paths = [op.join(ds.path, relpath) for ds in dss]

    # The simplest check first -- exist in both and content is the same.
    # Even if content is just a symlink file on windows, the same content
    # condition would be correct
    if all(map(op.exists, paths)) and all_same(map(md5sum, paths)):
        return True

    # We first need to find problematic ones which are annexed and
    # have no content locally, and take their keys
    keys = []
    backends = []
    presents = []
    for ds in dss:
        repo = ds.repo
        key = None
        present = True
        if isinstance(repo, AnnexRepo):
            annexprops = repo.get_file_annexinfo(
                relpath, eval_availability=True)
            if 'key' not in annexprops:
                continue
            key = annexprops['key']
            # For now the rest (e.g. not tracked) remains an error
            if not annexprops['has_content']:
                present = False
                backends.append(repo.get_key_backend(key))
        keys.append(key)
        presents.append(present)

    if all(presents):
        return all_same(map(md5sum, paths))

    backends = unique(backends)
    assert backends, \
        "Since not all are present - some must be under annex, " \
        "and thus must have a backend!"
    # so some files are missing!
    assert not all(presents)
    NeedContentError = RuntimeError
    if len(backends) > 1:
        # TODO: or signal otherwise somehow that we just need to get at
        # least some of those files to do the check!...
        raise NeedContentError(
            "Following paths are missing content and have different annex "
            "backends: %s. Cannot determine here if the same or not!"
            % ", ".join(p for (p, present) in zip(paths, presents)
                        if not present)
        )

    backend = backends[0].lower()
    if backend.endswith('e'):
        # an "E" backend embeds the file extension in the key -- strip the
        # trailing 'e' to get the plain digest name
        backend = backend[:-1]
    if backend not in Digester.DEFAULT_DIGESTS:
        raise NeedContentError(
            "Do not know how to figure out content check for backend %s"
            % backend
        )

    checksums = [
        # strip any extension an "E" backend embeds in the key before
        # taking the checksum portion
        split_ext(key)[0].split('--', 1)[1] if key else key
        for key in keys
    ]
    # collect the key checksums of the files that are missing content --
    # these are what the locally present files must be verified against
    thechecksum = set(
        checksum
        for present, checksum in zip(presents, checksums)
        if not present
    )
    if len(thechecksum) > 1:
        # Different checksum (with the same backend)
        return False
    elif not thechecksum:
        raise RuntimeError(
            "We must have had at least one key since prior logic"
            " showed that not all files have content here")
    thechecksum = thechecksum.pop()  # a set does not support indexing
    if any(presents):
        # We do need to extract checksum from the key and check the present
        # files' content to match
        digester = Digester([backend])
        for present, path in zip(presents, paths):
            if present and digester(path)[backend] != thechecksum:
                return False
        return True
    return False
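# Worked example (illustrative only; `_checksum_from_key` is a hypothetical
# helper, not part of the source) of the key-to-checksum extraction above:
# keys of "E" backends embed the file extension, which must be stripped
# before comparing against a freshly computed digest.
def _checksum_from_key(key):
    backend = key.split('-', 1)[0]    # e.g. 'MD5E'
    checksum = key.split('--', 1)[1]  # e.g. 'd41d8...427e.dat'
    if backend.endswith('E'):
        # strip everything from the first dot; split_ext() above similarly
        # handles multi-part extensions such as '.tar.gz'
        checksum = checksum.split('.', 1)[0]
    return backend, checksum

assert _checksum_from_key('MD5E-s5--d41d8cd98f00b204e9800998ecf8427e.dat') \
    == ('MD5E', 'd41d8cd98f00b204e9800998ecf8427e')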
def __call__(message=None, files=None, dataset=None,
             all_updated=True, all_changes=None, version_tag=None,
             recursive=False, recursion_limit=None, super_datasets=False):
    if all_changes is not None:
        from datalad.support.exceptions import DeprecatedError
        raise DeprecatedError(
            new="all_updated option where fits and/or datalad add",
            version="0.5.0",
            msg="RF: all_changes option passed to the save")
    if not dataset and not files:
        # we got nothing at all -> save what is staged in the repo in "this"
        # directory?  we verify next that there is an actual repo
        dataset = abspath(curdir)
    refds_path = Interface.get_refds_path(dataset)

    to_process = []
    for ap in AnnotatePaths.__call__(
            path=files,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='save',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        # next check should not be done during annotation, as it is possibly
        # expensive and not generally useful
        if ap.get('status', None) == 'impossible' and \
                ap.get('state', None) == 'absent' and \
                ap.get('parentds', None):
            # this is not here anymore, but it might actually have been a
            # deleted component
            if relpath(ap['path'], start=ap['parentds']) \
                    in Dataset(ap['parentds']).repo.get_deleted_files():
                # ok, this is a staged deletion that we want to save
                ap['status'] = ''
                del ap['message']
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        # for things like: `ds.save()`
        # or recursively discovered datasets
        if ap['path'] == refds_path or \
                (ap.get('type', None) == 'dataset' and
                 not ap.get('raw_input', False) and
                 not ap.get('state', None) == 'absent'):
            ap['process_content'] = True
            ap['process_updated_only'] = all_updated
        to_process.append(ap)

    if not to_process:
        # nothing left to do, potentially all errored before
        return

    if super_datasets:
        # search for the topmost superdatasets of any path
        dss = [Dataset(ap.get('parentds', ap['path'])) for ap in to_process]
        superdss = [ds.get_superdataset(topmost=True) for ds in dss]
        superdss = get_tree_roots(
            unique(ds.path for ds in dss + superdss if ds))
        if dataset:
            # need to adjust the reference to the new superds
            # if we had one ref before, we should still have exactly one
            assert len(superdss) <= 1
            dataset = list(superdss.keys())[0]
            refds_path = dataset
    elif refds_path:
        # there is a single superdataset
        superdss = {
            refds_path: unique(
                [ap['parentds'] for ap in to_process if 'parentds' in ap])
        }
    else:
        # sort all datasets under their potential superdatasets
        # start from the top to get all subdatasets down the line
        # and collate them into as few superdatasets as possible
        # this is quick, just string operations
        superdss = get_tree_roots(
            unique(
                [ap['parentds'] for ap in to_process if 'parentds' in ap]))

    # for each "superdataset" check the tree of subdatasets and make sure
    # we gather all datasets between the super and any subdataset
    # so we can save them all bottom-up in order to be able to properly
    # save the superdataset
    # if this is called from e.g. `add` this is actually not necessary,
    # but in the general case we cannot avoid it
    # TODO maybe introduce a switch?
    discovered = {}
    for superds_path in superdss:
        target_subs = superdss[superds_path]
        discover_dataset_trace_to_targets(
            # from here
            superds_path,
            # to all
            target_subs,
            [],
            discovered)
    # create a new minimally annotated path for each discovered dataset
    discovered_added = set()
    for parentds in discovered:
        for subds in discovered[parentds]:
            to_process.append(
                dict(path=subds, parentds=parentds, type='dataset'))
            discovered_added.add(subds)
    # make sure we have an entry for each dataset, including those
    # that are just parents
    for parentds in discovered:
        if parentds not in discovered_added:
            to_process.append(
                dict(
                    path=parentds,
                    type='dataset',
                    # make sure we save content of superds later on
                    process_content=True))

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, deduplication happens here too
    annotated_paths = AnnotatePaths.__call__(
        path=to_process,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='save',
        unavailable_path_status='',
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    # now sort into datasets so we can process them one by one
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path,
            path_only=False)
    assert not completed

    # iterate over all datasets, starting at the bottom
    for dspath in sorted(content_by_ds.keys(), reverse=True):
        ds = Dataset(dspath)
        res = get_status_dict('save', ds=ds, logger=lgr)
        if not ds.is_installed():
            # TODO This is likely impossible now
            res['status'] = 'impossible'
            res['message'] = ('dataset %s is not installed', ds)
            yield res
            continue
        saved_state = save_dataset(
            ds,
            content_by_ds[dspath],
            message=message,
            version_tag=version_tag)
        if saved_state:
            res['status'] = 'ok'
        else:
            res['status'] = 'notneeded'
        yield res
def test_unique():
    # eq_, unique, and itemgetter as conventionally available in datalad's
    # test modules (these import lines are an addition here; the original
    # snippet assumes them at module level)
    from operator import itemgetter
    from datalad.tests.utils import eq_
    from datalad.utils import unique

    eq_(unique(range(3)), [0, 1, 2])
    eq_(unique(range(3), reverse=True), [0, 1, 2])
    eq_(unique((1, 0, 1, 3, 2, 0, 1)), [1, 0, 3, 2])
    eq_(unique((1, 0, 1, 3, 2, 0, 1), reverse=True), [3, 2, 0, 1])
    eq_(unique([]), [])
    eq_(unique([], reverse=True), [])
    eq_(unique([(1, 2), (1,), (1, 2), (0, 3)]), [(1, 2), (1,), (0, 3)])
    eq_(unique([(1, 2), (1,), (1, 2), (0, 3)], reverse=True),
        [(1,), (1, 2), (0, 3)])

    # with a key now
    eq_(unique([(1, 2), (1,), (1, 2), (0, 3)], key=itemgetter(0)),
        [(1, 2), (0, 3)])
    eq_(unique([(1, 2), (1,), (1, 2), (0, 3)], key=itemgetter(0),
               reverse=True),
        [(1, 2), (0, 3)])
    eq_(unique([(1, 2), (1, 3), (1, 2), (0, 3)], key=itemgetter(1)),
        [(1, 2), (1, 3)])
    eq_(unique([(1, 2), (1, 3), (1, 2), (0, 3)], key=itemgetter(1),
               reverse=True),
        [(1, 2), (0, 3)])
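# A minimal sketch of an order-preserving `unique` consistent with the tests
# above (`unique_sketch` is a hypothetical stand-in; the real implementation
# lives in datalad.utils): the first occurrence of a key wins, unless
# reverse=True, in which case the last occurrence wins.
def unique_sketch(seq, key=None, reverse=False):
    seen = set()
    items = list(seq)
    if reverse:
        # scan from the tail so that last occurrences are kept
        items = items[::-1]
    out = []
    for item in items:
        k = key(item) if key else item
        if k not in seen:
            seen.add(k)
            out.append(item)
    # restore original relative order for the reverse case
    return out[::-1] if reverse else out

assert unique_sketch((1, 0, 1, 3, 2, 0, 1)) == [1, 0, 3, 2]
assert unique_sketch((1, 0, 1, 3, 2, 0, 1), reverse=True) == [3, 2, 0, 1]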
def __call__(message=None, path=None, dataset=None,
             all_updated=True, version_tag=None,
             recursive=False, recursion_limit=None, super_datasets=False,
             message_file=None):
    if not dataset and not path:
        # we got nothing at all -> save what is staged in the repo in "this"
        # directory?  make sure we don't treat this as a user-provided '.'
        # argument
        path = [{'path': abspath(curdir), 'raw_input': False}]

    refds_path = Interface.get_refds_path(dataset)

    if message and message_file:
        raise ValueError("Both a message and message file were specified")

    if message_file:
        with open(message_file, "rb") as mfh:
            message = assure_unicode(mfh.read())

    to_process = []
    got_nothing = True
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='save',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            modified='HEAD' if not path and recursive else None,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('state', None) == 'untracked' and \
                not ap.get('raw_input', False):
            # this path was found untracked, but not explicitly given to save
            # we will silently ignore this
            continue
        got_nothing = False
        # next check should not be done during annotation, as it is possibly
        # expensive and not generally useful
        if ap.get('status', None) == 'impossible' and \
                ap.get('state', None) == 'absent' and \
                ap.get('parentds', None):
            # this is not here anymore, but it might actually have been a
            # deleted component
            if relpath(ap['path'], start=ap['parentds']) \
                    in Dataset(ap['parentds']).repo.get_deleted_files():
                # ok, this is a staged deletion that we want to save
                ap['status'] = ''
                del ap['message']
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        # for things like: `ds.save()`
        # or recursively discovered datasets
        if ap['path'] == refds_path or \
                (ap.get('type', None) == 'dataset' and
                 not ap.get('raw_input', False) and
                 not ap.get('state', None) == 'absent'):
            ap['process_content'] = True
            ap['process_updated_only'] = all_updated
        to_process.append(ap)
    lgr.log(2, "save, to_process=%r", to_process)
    if got_nothing and recursive and refds_path:
        # path annotation yielded nothing, most likely cause is that nothing
        # was found modified, we need to say something about the reference
        # dataset
        yield get_status_dict(
            'save',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    if not to_process:
        # nothing left to do, potentially all errored before
        return

    if super_datasets:
        # search for the topmost superdatasets of any path
        dss = [Dataset(ap.get('parentds', ap['path'])) for ap in to_process]
        superdss = [ds.get_superdataset(topmost=True) for ds in dss]
        superdss = get_tree_roots(
            unique(ds.path for ds in dss + superdss if ds))
        if dataset:
            # need to adjust the reference to the new superds
            # if we had one ref before, we should still have exactly one
            assert len(superdss) <= 1
            dataset = list(superdss.keys())[0]
            refds_path = dataset
    elif refds_path:
        # there is a single superdataset
        superdss = {
            refds_path: unique([ap['parentds']
                                for ap in to_process if 'parentds' in ap])}
    else:
        # sort all datasets under their potential superdatasets
        # start from the top to get all subdatasets down the line
        # and collate them into as few superdatasets as possible
        # this is quick, just string operations
        superdss = get_tree_roots(
            unique([ap['parentds']
                    for ap in to_process if 'parentds' in ap]))
    # for each "superdataset" check the tree of subdatasets and make sure
    # we gather all datasets between the super and any subdataset
    # so we can save them all bottom-up in order to be able to properly
    # save the superdataset
    # if this is called from e.g. `add` this is actually not necessary,
    # but in the general case we cannot avoid it
    # TODO maybe introduce a switch?
    discovered = {}
    for superds_path in superdss:
        target_subs = superdss[superds_path]
        discover_dataset_trace_to_targets(
            # from here
            superds_path,
            # to all
            target_subs,
            [],
            discovered)
    # create a new minimally annotated path for each discovered dataset
    discovered_added = set()
    for parentds in discovered:
        for subds in discovered[parentds]:
            to_process.append(dict(
                path=subds, parentds=parentds, type='dataset'))
            discovered_added.add(subds)
    # make sure we have an entry for each dataset, including those
    # that are just parents
    for parentds in discovered:
        if parentds not in discovered_added:
            to_process.append(dict(
                path=parentds,
                type='dataset',
                # make sure we save content of superds later on
                process_content=True,
                # but not do nasty things, like adding untracked content
                # just because we discovered this dataset
                process_updated_only=True))

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, deduplication happens here too
    annotated_paths = AnnotatePaths.__call__(
        path=to_process,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='save',
        unavailable_path_status='',
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    # now sort into datasets so we can process them one by one
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert not completed

    # iterate over all datasets, starting at the bottom
    for dspath in sorted(content_by_ds.keys(), reverse=True):
        ds = Dataset(dspath)
        res = get_status_dict('save', ds=ds, logger=lgr)
        if not ds.is_installed():
            # TODO This is likely impossible now
            res['status'] = 'impossible'
            res['message'] = ('dataset %s is not installed', ds)
            yield res
            continue
        saved_state = save_dataset(
            ds,
            content_by_ds[dspath],
            message=message)
        res['status'] = 'ok' if saved_state else 'notneeded'
        # MIH: let's tag even if there was nothing committed. I'd forget this
        # option too often...
        if version_tag:
            try:
                # TODO: check whether the comment below is still true after
                # removing the log swallowing:
                # again cannot help but force-silence low-level code, because
                # it screams like a made man instead of allowing top-level
                # code an orderly error report
                ds.repo.tag(version_tag)
                # even if we haven't saved anything
                res['status'] = 'ok'
                yield res
            except CommandError as e:
                if saved_state:
                    # first we yield the result for the actual save
                    yield res
                # and now complain that tagging didn't work
                yield get_status_dict(
                    'save',
                    ds=ds,
                    logger=lgr,
                    status='error',
                    message=('cannot tag this version: %s',
                             e.stderr.strip()))
        else:
            yield res
def _get_flexible_source_candidates_for_submodule(ds, sm):
    """Assemble candidate locations from where to clone a submodule

    The following location candidates are considered. For each candidate a
    cost is given in parenthesis; higher values indicate higher cost, and
    thus lower priority:

    - URL of any configured superdataset remote that is known to have the
      desired submodule commit, with the submodule path appended to it.
      There can be more than one candidate (cost 500).

    - A URL or absolute path recorded in `.gitmodules` (cost 600).

    - In case `.gitmodules` contains a relative path instead of a URL, the
      URL of any configured superdataset remote that is known to have the
      desired submodule commit, with this relative path appended to it.
      There can be more than one candidate (cost 500).

    - In case `.gitmodules` contains a relative path as a URL, the absolute
      path of the superdataset, appended with this relative path (cost 900).

    Additional candidate URLs can be generated based on templates specified
    as configuration variables with the pattern
    `datalad.get.subdataset-source-candidate-<name>`, where `name` is an
    arbitrary identifier. If `name` starts with three digits (e.g.
    '400myserver') these will be interpreted as a cost, and the respective
    candidate will be sorted into the generated candidate list according to
    this cost. If no cost is given, a default of 700 is used.

    A template string assigned to such a variable can utilize the Python
    format mini language and may reference a number of properties that are
    inferred from the parent dataset's knowledge about the target
    subdataset. Properties include any submodule property specified in the
    respective `.gitmodules` record. For convenience, an existing
    `datalad-id` record is made available under the shortened name `id`.
    Additionally, the URL of any configured remote that contains the
    respective submodule commit is available as a `remoteurl-<name>`
    property, where `name` is the configured remote name.

    Lastly, all candidates are sorted according to their cost (lower values
    first), and duplicate URLs are stripped, while preserving the first item
    in the candidate list.

    Parameters
    ----------
    ds : Dataset
      Parent dataset of the to-be-installed subdataset.
    sm : dict
      Submodule record as produced by `subdatasets()`.

    Returns
    -------
    list of dict
      Where each dict has keys 'cost' (int), 'name' (str), 'url' (str).
      Names are not unique and either derived from the name of the
      respective remote, template configuration variable, or 'local'.
    """
    # short cuts
    ds_repo = ds.repo
    sm_url = sm.get('gitmodule_url', None)
    sm_path = op.relpath(sm['path'], start=sm['parentds'])

    clone_urls = []

    # CANDIDATE: tracking remote of the current branch
    tracking_remote, tracking_branch = ds_repo.get_tracking_branch()
    candidate_remotes = [tracking_remote] if tracking_remote else []

    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    last_commit = ds_repo.get_last_commit_hexsha(sm_path)
    if last_commit:
        # CANDIDATE: any remote that has the commit when the submodule was
        # last modified
        # ideally should also give preference to the remotes which have
        # the same branch checked out I guess
        candidate_remotes += list(
            _get_remotes_having_commit(ds_repo, last_commit))

    # prepare a dict to generate URL candidates from templates
    sm_candidate_props = {
        k[10:].replace('datalad-id', 'id'): v
        for k, v in sm.items()
        if k.startswith('gitmodule_')
    }

    for remote in unique(candidate_remotes):
        remote_url = ds_repo.get_remote_url(remote, push=False)

        # Directly on parent's ds url
        if remote_url:
            # make remotes and their URLs available to template rendering
            sm_candidate_props['remoteurl-{}'.format(remote)] = remote_url
            # attempt: submodule checkout at parent remote URL
            # We might need to quote sm_path portion, e.g. for spaces etc
            if isinstance(RI(remote_url), URL):
                sm_path_url = urlquote(sm_path)
            else:
                sm_path_url = sm_path

            clone_urls.extend(
                dict(cost=500, name=remote, url=url)
                for url in _get_flexible_source_candidates(
                    # alternate suffixes are tested by `clone` anyways
                    sm_path_url, remote_url, alternate_suffix=False))

        # attempt: provided (configured?) submodule URL
        # TODO: consider supporting DataLadRI here? or would confuse
        # git and we wouldn't want that (i.e. not allow pure git clone
        # --recursive)
        if sm_url:
            clone_urls.extend(
                dict(cost=600, name=remote, url=url)
                for url in _get_flexible_source_candidates(
                    sm_url, remote_url, alternate_suffix=False))

    cost_candidate_expr = re.compile('[0-9][0-9][0-9].*')
    candcfg_prefix = 'datalad.get.subdataset-source-candidate-'
    for name, tmpl in [(c[len(candcfg_prefix):], ds_repo.config[c])
                       for c in ds_repo.config.keys()
                       if c.startswith(candcfg_prefix)]:
        url = tmpl.format(**sm_candidate_props)
        # we don't want "flexible_source_candidates" here, this is
        # configuration that can be made arbitrarily precise from the
        # outside. Additional guesswork can only make it slower
        has_cost = cost_candidate_expr.match(name) is not None
        clone_urls.append(
            # assign a default cost, if a config doesn't have one
            dict(
                cost=int(name[:3]) if has_cost else 700,
                name=name[3:] if has_cost else name,
                url=url,
                from_config=True,
            ))

    # CANDIDATE: the actual configured gitmodule URL
    if sm_url:
        clone_urls.extend(
            dict(cost=900, name='local', url=url)
            for url in _get_flexible_source_candidates(
                sm_url, ds.path, alternate_suffix=False)
            # avoid inclusion of submodule location itself
            if url != sm['path'])

    # sort all candidates by their cost, thereby allowing a candidate
    # provided by configuration to purposefully sort before or after
    # automatically generated candidates
    clone_urls = sorted(clone_urls, key=lambda x: x['cost'])
    # take out any duplicate source candidates
    # unique() drops the duplicates at the tail end
    clone_urls = unique(clone_urls, lambda x: x['url'])
    return clone_urls
def get_paths_by_dataset(paths, recursive=False, recursion_limit=None,
                         out=None, dir_lookup=None, sub_paths=True):
    """Sort a list of paths per dataset they are contained in.

    Any paths that are not part of a dataset, or presently unavailable, are
    reported.

    Parameters
    ----------
    paths : sequence
      A sequence of path specifications to sort.
    recursive : bool
      Flag whether to report subdatasets under any of the given paths
    recursion_limit :
      Depth constraint for recursion. See `subdatasets()` for more
      information.
    out : dict or None
      By default a new output dictionary is created, however an existing one
      can be provided via this argument to enable incremental processing.
    dir_lookup : dict or None, optional
      Optional lookup cache that maps paths to previously determined
      datasets. This can speed up repeated processing.
    sub_paths : bool, optional
      Provide a list containing the sub-dataset path, as the entry for that
      sub-dataset. If False, an empty list is assigned.

    Returns
    -------
    Tuple(dict, list, list)
      Dict of `existing dataset path`: `path` mappings, the list of
      currently non-existing paths (possibly matching currently uninstalled
      datasets), and any paths that are not part of any dataset.
    """
    # sort paths into the respective datasets
    if dir_lookup is None:
        dir_lookup = {}
    if out is None:
        out = {}
    # paths that don't exist (yet)
    unavailable_paths = []
    nondataset_paths = []
    for path in unique(paths):
        if not lexists(path):
            # not there yet, impossible to say which ds it will actually
            # be in, if any
            unavailable_paths.append(path)
            continue
        # the path exists in some shape or form
        if isdir(path):
            # this could contain all types of additional content
            d = path
        else:
            # for everything else we are interested in the container
            d = dirname(path)
            if not d:
                d = curdir
        dspath = dir_lookup.get(d, None)
        if dspath:
            _ds_looked_up = True
        else:
            _ds_looked_up = False
            # this could be `None` if there is no git repo
            dspath = get_dataset_root(d)
            dir_lookup[d] = dspath
        if not dspath:
            nondataset_paths.append(path)
            continue
        if path in out.get(dspath, []):
            # we already recorded this path in the output
            # this can happen, whenever `path` is a subdataset, that was
            # discovered via recursive processing of another path before
            continue
        if isdir(path):
            ds = Dataset(dspath)
            # we need to doublecheck that this is not a subdataset mount
            # point, in which case get_dataset_root() would point to the
            # parent
            if not _ds_looked_up:
                # we didn't deal with it before
                # TODO this is a slow call, no need for dedicated RF, will
                # vanish together with the entire function
                smpath = ds.get_containing_subdataset(
                    path, recursion_limit=1).path
                if smpath != dspath:
                    # fix entry
                    dir_lookup[d] = smpath
                    # submodule still needs to be obtained
                    unavailable_paths.append(path)
                    continue
            else:
                # we figured out the dataset previously, so we can spare some
                # effort by not calling ds.subdatasets or
                # ds.get_containing_subdataset. Instead we just need
                # get_dataset_root, which is cheaper
                if dspath != get_dataset_root(dspath):
                    # if the looked up path isn't the default value,
                    # it's a 'fixed' entry for an unavailable dataset
                    # (see above)
                    unavailable_paths.append(path)
                    continue
            if recursive:
                # make sure we get everything relevant in all _checked out_
                # subdatasets, obtaining of previously unavailable subdatasets
                # is done elsewhere
                for subdspath in ds.subdatasets(
                        fulfilled=True,
                        recursive=recursive,
                        recursion_limit=recursion_limit,
                        result_xfm='paths'):
                    if subdspath.startswith(_with_sep(path)):
                        # this subdataset is underneath the search path
                        # be careful to not overwrite anything, in case
                        # this subdataset has been processed before
                        out[subdspath] = out.get(
                            subdspath,
                            [subdspath] if sub_paths else [])
        out[dspath] = out.get(dspath, []) + [path]
    return out, unavailable_paths, nondataset_paths
def __call__(message=None, path=None, dataset=None,
             all_updated=True, version_tag=None,
             recursive=False, recursion_limit=None, super_datasets=False,
             message_file=None):
    if not dataset and not path:
        # we got nothing at all -> save what is staged in the repo in "this"
        # directory?  make sure we don't treat this as a user-provided '.'
        # argument
        path = [{'path': abspath(curdir), 'raw_input': False}]

    refds_path = Interface.get_refds_path(dataset)

    if message and message_file:
        yield get_status_dict(
            'save',
            status='error',
            path=refds_path,
            message="Both a message and message file were specified",
            logger=lgr)
        return

    if message_file:
        with open(message_file) as mfh:
            message = mfh.read()

    to_process = []
    got_nothing = True
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='save',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            modified='HEAD' if not path and recursive else None,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('state', None) == 'untracked' and \
                not ap.get('raw_input', False):
            # this path was found untracked, but not explicitly given to save
            # we will silently ignore this
            continue
        got_nothing = False
        # next check should not be done during annotation, as it is possibly
        # expensive and not generally useful
        if ap.get('status', None) == 'impossible' and \
                ap.get('state', None) == 'absent' and \
                ap.get('parentds', None):
            # this is not here anymore, but it might actually have been a
            # deleted component
            if relpath(ap['path'], start=ap['parentds']) \
                    in Dataset(ap['parentds']).repo.get_deleted_files():
                # ok, this is a staged deletion that we want to save
                ap['status'] = ''
                del ap['message']
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        # for things like: `ds.save()`
        # or recursively discovered datasets
        if ap['path'] == refds_path or \
                (ap.get('type', None) == 'dataset' and
                 not ap.get('raw_input', False) and
                 not ap.get('state', None) == 'absent'):
            ap['process_content'] = True
            ap['process_updated_only'] = all_updated
        to_process.append(ap)
    lgr.log(2, "save, to_process=%r", to_process)
    if got_nothing and recursive and refds_path:
        # path annotation yielded nothing, most likely cause is that nothing
        # was found modified, we need to say something about the reference
        # dataset
        yield get_status_dict(
            'save',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    if not to_process:
        # nothing left to do, potentially all errored before
        return

    if super_datasets:
        # search for the topmost superdatasets of any path
        dss = [Dataset(ap.get('parentds', ap['path'])) for ap in to_process]
        superdss = [ds.get_superdataset(topmost=True) for ds in dss]
        superdss = get_tree_roots(
            unique(ds.path for ds in dss + superdss if ds))
        if dataset:
            # need to adjust the reference to the new superds
            # if we had one ref before, we should still have exactly one
            assert len(superdss) <= 1
            dataset = list(superdss.keys())[0]
            refds_path = dataset
    elif refds_path:
        # there is a single superdataset
        superdss = {
            refds_path: unique(
                [ap['parentds'] for ap in to_process if 'parentds' in ap])
        }
    else:
        # sort all datasets under their potential superdatasets
        # start from the top to get all subdatasets down the line
        # and collate them into as few superdatasets as possible
        # this is quick, just string operations
        superdss = get_tree_roots(
            unique(
                [ap['parentds'] for ap in to_process if 'parentds' in ap]))
    # for each "superdataset" check the tree of subdatasets and make sure
    # we gather all datasets between the super and any subdataset
    # so we can save them all bottom-up in order to be able to properly
    # save the superdataset
    # if this is called from e.g. `add` this is actually not necessary,
    # but in the general case we cannot avoid it
    # TODO maybe introduce a switch?
    discovered = {}
    for superds_path in superdss:
        target_subs = superdss[superds_path]
        discover_dataset_trace_to_targets(
            # from here
            superds_path,
            # to all
            target_subs,
            [],
            discovered)
    # create a new minimally annotated path for each discovered dataset
    discovered_added = set()
    for parentds in discovered:
        for subds in discovered[parentds]:
            to_process.append(
                dict(path=subds, parentds=parentds, type='dataset'))
            discovered_added.add(subds)
    # make sure we have an entry for each dataset, including those
    # that are just parents
    for parentds in discovered:
        if parentds not in discovered_added:
            to_process.append(
                dict(
                    path=parentds,
                    type='dataset',
                    # make sure we save content of superds later on
                    process_content=True,
                    # but not do nasty things, like adding untracked content
                    # just because we discovered this dataset
                    process_updated_only=True))

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, deduplication happens here too
    annotated_paths = AnnotatePaths.__call__(
        path=to_process,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='save',
        unavailable_path_status='',
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    # now sort into datasets so we can process them one by one
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert not completed

    # iterate over all datasets, starting at the bottom
    for dspath in sorted(content_by_ds.keys(), reverse=True):
        ds = Dataset(dspath)
        res = get_status_dict('save', ds=ds, logger=lgr)
        if not ds.is_installed():
            # TODO This is likely impossible now
            res['status'] = 'impossible'
            res['message'] = ('dataset %s is not installed', ds)
            yield res
            continue
        saved_state = save_dataset(
            ds,
            content_by_ds[dspath],
            message=message)
        res['status'] = 'ok' if saved_state else 'notneeded'
        # MIH: let's tag even if there was nothing committed. I'd forget this
        # option too often...
        if version_tag:
            try:
                # TODO: check whether the comment below is still true after
                # removing the log swallowing:
                # again cannot help but force-silence low-level code, because
                # it screams like a made man instead of allowing top-level
                # code an orderly error report
                ds.repo.tag(version_tag)
                # even if we haven't saved anything
                res['status'] = 'ok'
                yield res
            except CommandError as e:
                if saved_state:
                    # first we yield the result for the actual save
                    yield res
                # and now complain that tagging didn't work
                yield get_status_dict(
                    'save',
                    ds=ds,
                    logger=lgr,
                    status='error',
                    message=('cannot tag this version: %s',
                             e.stderr.strip()))
        else:
            yield res
def __call__(path=None, dataset=None,
             add=None, init=None, remove=None, reset=None,
             define_key=None, dataset_global=False,
             recursive=False, recursion_limit=None):
    # bring metadata setter args in shape first
    untag, remove = _parse_argspec(remove)
    purge, reset = _parse_argspec(reset)
    tag_add, add = _parse_argspec(add)
    tag_init, init = _parse_argspec(init)
    define_key = dict(define_key) if define_key else None
    # merge all potential sources of tag specifications
    all_untag = remove.get('tag', []) + untag
    if all_untag:
        remove['tag'] = all_untag
    all_addtag = add.get('tag', []) + tag_add
    if all_addtag:
        add['tag'] = all_addtag
    all_inittag = init.get('tag', []) + tag_init
    if all_inittag:
        init['tag'] = all_inittag

    lgr.debug("Will 'init' metadata items: %s", init)
    lgr.debug("Will 'add' metadata items: %s", add)
    lgr.debug("Will 'remove' metadata items: %s", remove)
    lgr.debug("Will 'reset' metadata items: %s", reset)
    lgr.debug("Will 'purge' metadata items: %s", purge)

    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='metadata', logger=lgr, refds=refds_path)

    to_process = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='metadata',
            unavailable_path_status='error',
            nondataset_path_status='error',
            force_subds_discovery=False,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset':
            if ap.get('state', None) == 'absent':
                # just discovered via recursion, but not relevant here
                continue
            if GitRepo.is_valid_repo(ap['path']):
                ap['process_content'] = True
        to_process.append(ap)

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_process,
            refds_path=refds_path,
            path_only=False)
    assert not completed

    # iterate over all datasets, order doesn't matter
    to_save = []
    for ds_path in content_by_ds:
        # ignore submodule entries
        content = [
            ap for ap in content_by_ds[ds_path]
            if ap.get('type', None) != 'dataset' or ap['path'] == ds_path
        ]
        if not content:
            # nothing other than subdatasets were given or discovered in
            # this dataset, ignore
            continue
        ds = Dataset(ds_path)
        if dataset_global or define_key:
            db_path = opj(ds.path, '.datalad', 'metadata', 'dataset.json')
            db = {}
            if exists(db_path):
                db_fp = open(db_path)
                # need to read manually, load() would puke on an empty file
                db_content = db_fp.read()
                # minimize time for collision
                db_fp.close()
                if db_content:
                    db = json.loads(db_content)
            # TODO make manipulation order identical to what git-annex does
            for k, v in (init.items() if init else []):
                if k not in db:
                    db[k] = v
            for k in purge:
                if k in db:
                    del db[k]
            for k, v in reset.items():
                db[k] = v
            for k, v in add.items():
                db[k] = sorted(unique(db.get(k, []) + v))
            for k, v in remove.items():
                existing_data = db.get(k, [])
                if isinstance(existing_data, dict):
                    db[k] = {
                        dk: existing_data[dk]
                        for dk in set(existing_data).difference(v)
                    }
                else:
                    db[k] = list(set(existing_data).difference(v))
                # wipe out if empty
                if not db[k]:
                    del db[k]

            added_def = False
            if define_key:
                defs = db.get('definition', {})
                for k, v in define_key.items():
                    if k in defs:
                        if not defs[k] == v:
                            yield get_status_dict(
                                status='error',
                                ds=ds,
                                message=(
                                    "conflicting definition for key '%s': "
                                    "'%s' != '%s'",
                                    k, v, defs[k]),
                                **res_kwargs)
                            continue
                    else:
                        defs[k] = v
                        added_def = True
                db['definition'] = defs

            # store, if there is anything
            if db:
                if not exists(dirname(db_path)):
                    makedirs(dirname(db_path))
                db_fp = open(db_path, 'w')
                # produce relatively compact, but also diff-friendly format
                json.dump(
                    db,
                    db_fp,
                    indent=0,
                    separators=(',', ':\n'),
                    sort_keys=True)
                # minimize time for collision
                db_fp.close()
                # use add not save to also cover case of a fresh file
                ds.add(db_path, save=False)
                to_save.append(
                    dict(path=db_path, parentds=ds.path, type='file'))
            elif exists(db_path):
                # no metadata left, kill file
                ds.remove(db_path)
                to_save.append(dict(path=ds.path, type='dataset'))
            if added_def or init or add or remove or reset or purge:
                # if anything happened or could have happened
                yield get_status_dict(
                    status='ok', ds=ds, metadata=db, **res_kwargs)
        elif not isinstance(ds.repo, AnnexRepo):
            # report on all explicitly requested paths only
            for ap in [c for c in content if c.get('raw_input', False)]:
                yield dict(
                    ap,
                    status='impossible',
                    message=(
                        'non-annex dataset %s has no file metadata support',
                        ds),
                    **res_kwargs)
            continue
        ds_paths = [p['path'] for p in content]
        if not dataset_global:
            if reset or purge or add or init or remove:
                # file metadata manipulation
                mod_paths = []
                for mp in ds.repo.set_metadata(
                        ds_paths,
                        reset=reset,
                        add=add,
                        init=init,
                        remove=remove,
                        purge=purge,
                        # we always go recursive
                        # TODO is that a good thing? But how to otherwise
                        # distinguish this kind of recursive from the one
                        # across datasets in the API?
                        recursive=True):
                    if mp.get('success', False):
                        mod_paths.append(mp['file'])
                    else:
                        yield get_status_dict(
                            status='error',
                            message='setting metadata failed',
                            path=opj(ds.path, mp[0]),
                            type='file',
                            **res_kwargs)
                # query the actually modified paths only
                ds_paths = mod_paths
            # and lastly, query -- even if we set before -- there could
            # be a side-effect from multiple set paths on an individual
            # path, hence we need to query to get the final result
            for file, meta in ds.repo.get_metadata(ds_paths):
                r = get_status_dict(
                    status='ok',
                    path=opj(ds.path, file),
                    type='file',
                    metadata=meta,
                    **res_kwargs)
                yield r
    # save potential modifications to dataset global metadata
    if not to_save:
        return
    for res in Save.__call__(
            path=to_save,
            dataset=refds_path,
            message='[DATALAD] dataset metadata update',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
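# Tiny standalone re-run (illustrative; plain dict/set operations instead of
# datalad's unique()) of the dataset-global manipulation order above:
# init only fills absent keys, purge deletes, reset overwrites, add merges
# and sorts, remove subtracts and drops keys that become empty.
db_demo = {'tag': ['a'], 'obsolete': ['x']}
init_demo = {'tag': ['seed']}
purge_demo = ['obsolete']
reset_demo = {'name': ['demo']}
add_demo = {'tag': ['b']}
remove_demo = {'tag': ['a']}
for k, v in init_demo.items():
    if k not in db_demo:
        db_demo[k] = v
for k in purge_demo:
    db_demo.pop(k, None)
db_demo.update(reset_demo)
for k, v in add_demo.items():
    db_demo[k] = sorted(set(db_demo.get(k, []) + v))
for k, v in remove_demo.items():
    db_demo[k] = sorted(set(db_demo.get(k, [])) - set(v))
    if not db_demo[k]:
        del db_demo[k]
assert db_demo == {'tag': ['b'], 'name': ['demo']}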
def _get_flexible_source_candidates_for_submodule(ds, sm):
    """Assemble candidates from where to install a submodule

    Even if a URL for the submodule is provided explicitly -- first try URLs
    under the remote of the parent dataset's tracking branch.

    Additional candidate URLs can be generated based on templates specified
    as configuration variables with the pattern
    `datalad.get.subdataset-source-candidate-<name>`, where `name` is an
    arbitrary identifier.

    A template string assigned to such a variable can utilize the Python
    format mini language and may reference a number of properties that are
    inferred from the parent dataset's knowledge about the target
    subdataset. Properties include any submodule property specified in the
    respective `.gitmodules` record. For convenience, an existing
    `datalad-id` record is made available under the shortened name `id`.
    Additionally, the URL of any configured remote that contains the
    respective submodule commit is available as a `remoteurl-<name>`
    property, where `name` is the configured remote name.

    Parameters
    ----------
    ds : Dataset
      Parent dataset of the to-be-installed subdataset.
    sm : dict
      Submodule record as produced by `subdatasets()`.

    Returns
    -------
    list of tuples
      Where each tuple consists of a name and a URL. Names are not unique
      and either derived from the name of the respective remote, template
      configuration variable, or 'local' for the candidate URL that was
      obtained from the `.gitmodules` record.
    """
    # short cuts
    ds_repo = ds.repo
    sm_url = sm.get('gitmodule_url', None)
    sm_path = op.relpath(sm['path'], start=sm['parentds'])

    clone_urls = []

    # CANDIDATE: tracking remote of the current branch
    tracking_remote, tracking_branch = ds_repo.get_tracking_branch()
    candidate_remotes = [tracking_remote] if tracking_remote else []

    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    last_commit = ds_repo.get_last_commit_hexsha(sm_path)
    if last_commit:
        # CANDIDATE: any remote that has the commit when the submodule was
        # last modified
        # ideally should also give preference to the remotes which have
        # the same branch checked out I guess
        candidate_remotes += list(
            _get_remotes_having_commit(ds_repo, last_commit))

    # prepare a dict to generate URL candidates from templates
    sm_candidate_props = {
        k[10:].replace('datalad-id', 'id'): v
        for k, v in sm.items()
        if k.startswith('gitmodule_')
    }

    for remote in unique(candidate_remotes):
        remote_url = ds_repo.get_remote_url(remote, push=False)

        # Directly on parent's ds url
        if remote_url:
            # make remotes and their URLs available to template rendering
            sm_candidate_props['remoteurl-{}'.format(remote)] = remote_url
            # attempt: submodule checkout at parent remote URL
            # We might need to quote sm_path portion, e.g. for spaces etc
            if isinstance(RI(remote_url), URL):
                sm_path_url = urlquote(sm_path)
            else:
                sm_path_url = sm_path

            clone_urls.extend(
                (remote, url)
                for url in _get_flexible_source_candidates(
                    # alternate suffixes are tested by `clone` anyways
                    sm_path_url, remote_url, alternate_suffix=False))

        # attempt: provided (configured?) submodule URL
        # TODO: consider supporting DataLadRI here? or would confuse
        # git and we wouldn't want that (i.e. not allow pure git clone
        # --recursive)
        if sm_url:
            clone_urls.extend(
                (remote, url)
                for url in _get_flexible_source_candidates(
                    sm_url, remote_url, alternate_suffix=False))

    candcfg_prefix = 'datalad.get.subdataset-source-candidate-'
    for name, tmpl in [(c[len(candcfg_prefix):], ds_repo.config[c])
                       for c in ds_repo.config.keys()
                       if c.startswith(candcfg_prefix)]:
        url = tmpl.format(**sm_candidate_props)
        # we don't want "flexible_source_candidates" here, this is
        # configuration that can be made arbitrarily precise from the
        # outside. Additional guesswork can only make it slower
        clone_urls.append((name, url))

    # CANDIDATE: the actual configured gitmodule URL
    if sm_url:
        clone_urls.extend(
            ('local', url)
            for url in _get_flexible_source_candidates(
                sm_url, ds.path, alternate_suffix=False)
            # avoid inclusion of submodule location itself
            if url != sm['path'])

    return unique(clone_urls, lambda x: x[1])
def _gen_akey_afiles(self, key, sorted=False, unique_akeys=True):
    """Given a key, yield (akey, afile) pairs

    If `sorted`, those pairs which have an extracted version in the local
    cache are yielded first.

    Gets determined based on the URLs for datalad archives.

    Made "generators all the way" as an exercise but also to delay any
    checks etc until really necessary.
    """
    # we will need all URLs anyways later on ATM, so lets list() them
    # Anyways here we have a single scheme (archive) so there is not
    # much optimization possible
    urls = list(self.gen_URLS(key))

    akey_afiles = [
        self._parse_url(url)[:2]  # skip size
        for url in urls
    ]

    if unique_akeys:
        akey_afiles = unique(akey_afiles, key=itemgetter(0))

    if not sorted:
        for pair in akey_afiles:
            yield pair
        return

    # Otherwise we will go through each one

    # multiple URLs are available so we need to figure out which one
    # would be most efficient to "deal with"
    akey_afile_paths = (
        ((akey, afile),
         self.get_contentlocation(
             akey, absolute=True, verify_exists=False))
        for akey, afile in akey_afiles
    )
    # by default get_contentlocation would return an empty result for a key
    # which is not available locally. But we could still have an extracted
    # archive in the cache. So we need to pretty much get all possible ones
    # first and then only remove those which aren't present locally. So
    # verify_exists was added

    yielded = set()
    akey_afile_paths_ = []
    # utilize cache to check which archives might already be present in the
    # cache
    for akey_afile, akey_path in akey_afile_paths:
        if akey_path and self.cache[akey_path].is_extracted:
            yield akey_afile
            yielded.add(akey_afile)
        akey_afile_paths_.append((akey_afile, akey_path))
    # replace the generator with the already collected list. The idea is
    # that in many cases we don't even need to create a full list and that
    # an initial single yield would be enough, thus we don't need to check
    # locations etc for every possible hit
    akey_afile_paths = akey_afile_paths_

    # if not present in the cache -- check which are present
    # locally and choose that one to use, so it would get extracted
    for akey_afile, akey_path in akey_afile_paths:
        if akey_path and op.exists(akey_path):
            yielded.add(akey_afile)
            yield akey_afile

    # So no archive is present either in the cache or originally under
    # annex.  XXX some kind of a heuristic I guess is to use last_url ;-)
    if self._last_url and self._last_url in urls \
            and (len(urls) == len(akey_afiles)):
        akey_afile, _ = akey_afile_paths[urls.index(self._last_url)]
        yielded.add(akey_afile)
        yield akey_afile

    for akey_afile, _ in akey_afile_paths:
        if akey_afile not in yielded:
            yield akey_afile
def __call__( path=None, dataset=None, # support passing this through in a path by path basis to_git=None, save=True, message=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path" ) refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='"there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('added', 'modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) # TODO check if next isn't covered by discover_dataset_trace_to_targets already?? 
        if dataset and ap.get('type', None) == 'dataset':
            # duplicates not possible, annotated_paths returns unique paths
            subds_to_add[ap['path']] = ap
    if got_nothing:
        # path annotation yielded nothing, most likely cause is that nothing
        # was found modified, we need to say something about the reference
        # dataset
        yield get_status_dict(
            'add',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    for subds in ds_to_annotate_from_recursion:
        if subds not in subds_to_add:
            # always prefer the already annotated path
            subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

    if dataset:
        # we have a base dataset, discover any intermediate datasets between
        # the base and any already discovered dataset
        discovered = {}
        discover_dataset_trace_to_targets(
            # from here
            dataset.path,
            # to any dataset we are aware of
            subds_to_add.keys(),
            [],
            discovered)
        for parentds in discovered:
            for subds in discovered[parentds]:
                subds_to_add[subds] = subds_to_add.get(
                    subds,
                    dict(path=subds, parentds=parentds, type='dataset'))

    # merge custom paths and discovered dataset records; paths need to go
    # first, because we know most about them, and in the subsequent
    # annotation call the later duplicates are skipped
    to_add.extend(subds_to_add.values())
    # and compact, this should be OK as all the info is in each ap dict
    to_add = unique(to_add, lambda x: x['path'])

    if not to_add:
        # nothing left to do, potentially all errored before
        return

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, it will also deduplicate
    annotated_paths = AnnotatePaths.__call__(
        path=to_add,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='add',
        unavailable_path_status='impossible',
        unavailable_path_msg="path does not exist: %s",
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert not completed

    if not content_by_ds:
        # we should have complained about any inappropriate path argument
        # above, so if nothing is left, we can simply exit
        return

    # simple loop over datasets -- save happens later
    # start deep down
    to_save = []
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        torepoadd = {}
        respath_by_status = {}
        for ap in content_by_ds[ds_path]:
            # we have a new story
            ap.pop('status', None)
            torepoadd[ap['path']] = ap

            # skip anything that doesn't look like a wannabe subdataset
            if not ap.get('type', None) == 'dataset' or \
                    ap['path'] == ds_path:
                continue

            if ap.get('registered_subds', False):
                # subdataset that might be in this list because of the
                # need to save all the way up to a super dataset
                respath_by_status['success'] = \
                    respath_by_status.get('success', []) + [ap['path']]
                yield get_status_dict(
                    status='notneeded',
                    message="already known subdataset",
                    **dict(common_report, **ap))
                continue
            subds = Dataset(ap['path'])
            # check that the subds has a commit, and refuse
            # to operate on it otherwise, or we would get a bastard
            # submodule that cripples git operations
            if not subds.repo.get_hexsha():
                yield get_status_dict(
                    ds=subds, status='impossible',
                    message='cannot add subdataset with no commits',
                    **dict(common_report, **ap))
                continue
            subds_relpath = relpath(ap['path'], ds_path)
            # make an attempt to configure a submodule source URL based on the
            # discovered remote configuration
            remote, branch = subds.repo.get_tracking_branch()
            subds_url = subds.repo.get_remote_url(remote) if remote else None
            # Register the repository in the repo tree as a submodule
            try:
                ds.repo.add_submodule(subds_relpath, url=subds_url, name=None)
            except CommandError as e:
                yield get_status_dict(
                    ds=subds, status='error', message=e.stderr,
                    **dict(common_report, **ap))
                continue
            # queue for saving using the updated annotated path
            ap['registered_subds'] = True  # I hope this is true in direct mode too
            # TODO this is disabled, because in some circumstances
            # staging just doesn't happen, and it is unclear when
            # exactly -- the case that prompted disabling was a submodule
            # that had no content except for other submodules and was not
            # staged, whereas another submodule on the same level in the same
            # superdataset which also has one file in it was staged
            # disabled so this works correctly, at the cost of a little
            # slowdown
            #ap['staged'] = True
            to_save.append(ap)
            _fixup_submodule_dotgit_setup(ds, subds_relpath)
            # report added subdatasets -- `annex add` below won't do it
            yield get_status_dict(
                ds=subds,
                status='ok',
                message='added new subdataset',
                **dict(common_report, **ap))
            # make sure that .gitmodules is added to the list of files
            gitmodules_path = opj(ds.path, '.gitmodules')
            # for git
            torepoadd[gitmodules_path] = dict(path=gitmodules_path)
            # and for save
            to_save.append(
                dict(path=gitmodules_path,
                     parentds=ds_path,
                     type='file'))
        # make sure any last minute additions make it to the saving stage
        # XXX? should content_by_ds become OrderedDict so that possible
        # super here gets processed last?
        lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
        is_annex = isinstance(ds.repo, AnnexRepo)
        add_kw = {'jobs': jobs} if is_annex and jobs else {}
        added = ds.repo.add(
            list(torepoadd.keys()),
            git=to_git if is_annex else True,
            **add_kw)
        for a in added:
            res = annexjson2result(a, ds, type='file', **common_report)
            success = success_status_map[res['status']]
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
            # produce best possible path/result annotation
            if res['path'] in torepoadd:
                # pull out correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                res = dict(torepoadd[res['path']], **res)
            # override this in all cases to be safe
            res['parentds'] = ds.path
            if success:
                # this was successfully added, queue for saving this very path
                # in the dataset
                ap = {k: v for k, v in res.items() if k != 'status'}
                ap['staged'] = True
                # strip any status and state info (e.g. save will refuse to
                # save stuff that is marked state='untracked')
                to_save.append({
                    k: v for k, v in res.items()
                    if k not in ('status', 'state')})
            if a['file'] == '.gitmodules':
                # filter out .gitmodules, because this is only included for
                # technical reasons and has nothing to do with the actual
                # content
                continue
            if GitRepo.is_valid_repo(res['path']):
                # more accurate report in case of an added submodule
                # mountpoint.
                # XXX Actually not sure if this can really happen
                # (depends on what our low-level code would do)
                # but worst case is that we lose a little bit of
                # coverage...
                res['type'] = 'dataset'
                res['message'] = 'added new state as submodule'
            yield res

        for r in results_from_annex_noinfo(
                ds, torepoadd, respath_by_status,
                dir_fail_msg='could not add some content in %s %s',
                noinfo_dir_msg='nothing to add from %s',
                noinfo_file_msg='already included in the dataset',
                action='add',
                logger=lgr,
                refds=refds_path):
            if r['path'] in torepoadd:
                # pull out correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                r = dict(r, **torepoadd[r['path']])

            if r['status'] == 'notneeded':
                # this could be a file that was staged already, it doesn't
                # need to be added, but it should be saved/committed if so
                # desired
                to_save.append({
                    k: v for k, v in r.items()
                    if k not in ('status', 'state')})

            # XXX something is fishy with the next one, rethink when sober....
            if r['path'] == ds_path and r['status'] == 'ok':
                # this is for the entire dataset itself which was explicitly
                # requested; make sure to save all
                r['type'] = 'dataset'
                r['process_content'] = True
                to_save.append(
                    {k: v for k, v in r.items() if k != 'status'})
            yield r

        if refds_path and ds_path != refds_path and \
                len(respath_by_status.get('success', [])):
            # TODO XXX we have an issue here when `add('.')` is used and annex
            # ignores any dotfiles. In this case we end up not saving a
            # dataset completely, because we rely on accurate reporting.
            # there is an issue about this already
            # TODO look up the issue ID
            # if there is a base dataset, but we are below it, and we have
            # anything done to this dataset -> queue dataset itself for
            # saving its state in the parent
            ds_ap = dict(
                path=ds.path,
                # we have to look for the parent here, as we must save the
                # subdataset in the parent and not the whole subdataset itself
                type='dataset')
            parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
            if parentds:
                ds_ap['parentds'] = parentds
            if dataset:
                ds_ap['refds'] = refds_path
            to_save.append(ds_ap)

    if not save:
        lgr.debug('Not calling `save` as instructed')
        return

    # TODO tell save what was staged already! Set 'staged=True' for
    # respective annotated paths that are fed into `save`

    # do not reuse any of the sorting done in here for saving, but instead
    # pass on all the annotated paths to have `save` figure out what to do
    # with them -- this costs something, but should be safer, and frankly is
    # more comprehensible
    for res in Save.__call__(
            # hand-selected annotated paths
            path=to_save,
            dataset=refds_path,
            message=message if message else '[DATALAD] added content',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
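# A minimal, self-contained sketch of the submodule-URL discovery idea used
# above: find the remote that the current branch tracks and read its URL.
# This calls plain `git` via subprocess instead of datalad's GitRepo wrapper,
# so it illustrates the approach rather than reproducing the implementation;
# `repo_path` is a hypothetical path to any local clone.
import subprocess


def tracking_remote_url(repo_path):
    """Return the URL of the remote tracked by HEAD's branch, or None"""
    try:
        # e.g. 'origin/master' for a branch tracking the 'origin' remote
        upstream = subprocess.check_output(
            ['git', '-C', repo_path, 'rev-parse', '--abbrev-ref',
             '--symbolic-full-name', '@{u}'],
            stderr=subprocess.DEVNULL).decode().strip()
    except subprocess.CalledProcessError:
        # no upstream configured -- mirrors the `if remote else None` above
        return None
    remote = upstream.split('/', 1)[0]
    try:
        return subprocess.check_output(
            ['git', '-C', repo_path, 'config', '--get',
             'remote.{}.url'.format(remote)],
            stderr=subprocess.DEVNULL).decode().strip()
    except subprocess.CalledProcessError:
        # remote exists but has no URL configured
        return None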
def __call__(
        path=None,
        dataset=None,
        # support passing this through in a path by path basis
        to_git=None,
        save=True,
        message=None,
        message_file=None,
        recursive=False,
        recursion_limit=None,
        ds2super=False,
        git_opts=None,
        annex_opts=None,
        annex_add_opts=None,
        jobs=None):
    # parameter constraints:
    if not path:
        raise InsufficientArgumentsError(
            "insufficient information for adding: requires at least a path")
    refds_path = Interface.get_refds_path(dataset)
    common_report = dict(action='add', logger=lgr, refds=refds_path)

    if message and message_file:
        raise ValueError("Both a message and message file were specified")

    if message_file:
        with open(message_file, "rb") as mfh:
            message = assure_unicode(mfh.read())

    to_add = []
    subds_to_add = {}
    ds_to_annotate_from_recursion = {}
    got_nothing = True
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=dataset,
            # never recursion, need to handle manually below to be able to
            # discover untracked content
            recursive=False,
            action='add',
            # speed things up by using Git's modification detection, if there
            # is a repo with at least one commit
            modified='HEAD'
            if dataset and GitRepo.is_valid_repo(refds_path)
            and GitRepo(refds_path).get_hexsha()
            else None,
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        got_nothing = False
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('parentds', None) is None and \
                ap.get('type', None) != 'dataset':
            yield get_status_dict(
                status='impossible',
                message='there is no dataset to add this path to',
                **dict(common_report, **ap))
            continue
        if ap.get('type', None) == 'directory' and \
                ap.get('state', None) == 'untracked' and \
                GitRepo.is_valid_repo(ap['path']):
            # this is an untracked wannabe subdataset in disguise
            ap['type'] = 'dataset'
        if recursive and \
                (ap.get('raw_input', False) or
                 ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
            # this was an actually requested input path, or a path that was
            # found modified by path annotation, based on an input argument
            # we need to recurse into all subdirs to find potentially
            # unregistered subdatasets
            # but only if this path has a parent, or is itself a dataset
            # otherwise there is nothing to add to
            _discover_subdatasets_recursively(
                ds_to_annotate_from_recursion,
                ap['path'],
                [ap['parentds'] if 'parentds' in ap else ap['path']],
                recursion_limit)
            # get the file content of the root dataset of this search added
            # too, but be careful with extreme recursion_limit settings
            if recursion_limit is None or recursion_limit > 0:
                ap['process_content'] = True
        # record for further processing
        if ap['path'] not in ds_to_annotate_from_recursion:
            # if it was somehow already discovered
            to_add.append(ap)
    if got_nothing:
        # path annotation yielded nothing, most likely cause is that nothing
        # was found modified, we need to say something about the reference
        # dataset
        yield get_status_dict(
            'add',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    for subds in ds_to_annotate_from_recursion:
        if subds not in subds_to_add:
            # always prefer the already annotated path
            subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

    if dataset:
        # we have a base dataset, discover any intermediate datasets between
        # the base and any already discovered dataset
        discovered = {}
        discover_dataset_trace_to_targets(
            # from here
            dataset.path,
            # to any dataset we are aware of
            subds_to_add.keys(),
            [],
            discovered)
        for parentds in discovered:
            for subds in discovered[parentds]:
                subds_to_add[subds] = subds_to_add.get(
                    subds,
                    dict(path=subds, parentds=parentds, type='dataset'))

    # merge custom paths and discovered dataset records; paths need to go
    # first, because we know most about them, and in the subsequent
    # annotation call the later duplicates are skipped
    to_add.extend(subds_to_add.values())
    # and compact, this should be OK as all the info is in each ap dict
    to_add = unique(to_add, lambda x: x['path'])

    if not to_add:
        # nothing left to do, potentially all errored before
        return

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, it will also deduplicate
    annotated_paths = AnnotatePaths.__call__(
        path=to_add,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='add',
        unavailable_path_status='impossible',
        unavailable_path_msg="path does not exist: %s",
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert not completed

    if not content_by_ds:
        # we should have complained about any inappropriate path argument
        # above, so if nothing is left, we can simply exit
        return

    # simple loop over datasets -- save happens later
    # start deep down
    to_save = []
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        torepoadd = {}
        respath_by_status = {}
        for ap in content_by_ds[ds_path]:
            # we have a new story
            ap.pop('status', None)
            torepoadd[ap['path']] = ap

            # skip anything that doesn't look like a wannabe subdataset
            if not ap.get('type', None) == 'dataset' or \
                    ap['path'] == ds_path:
                continue

            if ap.get('registered_subds', False):
                # subdataset that might be in this list because of the
                # need to save all the way up to a super dataset
                respath_by_status['success'] = \
                    respath_by_status.get('success', []) + [ap['path']]
                yield get_status_dict(
                    status='notneeded',
                    message="already known subdataset",
                    **dict(common_report, **ap))
                continue
            subds = Dataset(ap['path'])
            subds_relpath = relpath(ap['path'], ds_path)
            # Register the repository in the repo tree as a submodule
            try:
                ds.repo.add_submodule(subds_relpath, url=None, name=None)
            except (CommandError, InvalidGitRepositoryError) as e:
                yield get_status_dict(
                    ds=subds, status='error', message=e.stderr,
                    **dict(common_report, **ap))
                continue
            # queue for saving using the updated annotated path
            ap['registered_subds'] = True  # I hope this is true in direct mode too
            # TODO this is disabled, because in some circumstances
            # staging just doesn't happen, and it is unclear when
            # exactly -- the case that prompted disabling was a submodule
            # that had no content except for other submodules and was not
            # staged, whereas another submodule on the same level in the same
            # superdataset which also has one file in it was staged
            # disabled so this works correctly, at the cost of a little
            # slowdown
            #ap['staged'] = True
            to_save.append(ap)
            # report added subdatasets -- `annex add` below won't do it
            yield get_status_dict(
                ds=subds,
                status='ok',
                message='added new subdataset',
                **dict(common_report, **ap))
            # make sure that .gitmodules is added to the list of files
            gitmodules_path = opj(ds.path, '.gitmodules')
            # for git
            torepoadd[gitmodules_path] = dict(path=gitmodules_path)
            # and for save
            to_save.append(dict(
                path=gitmodules_path,
                parentds=ds_path,
                type='file'))
        # make sure any last minute additions make it to the saving stage
        # XXX? should content_by_ds become OrderedDict so that possible
        # super here gets processed last?
        lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
        is_annex = isinstance(ds.repo, AnnexRepo)
        add_kw = {'jobs': jobs} if is_annex and jobs else {}
        added = ds.repo.add_(
            list(torepoadd.keys()),
            git=to_git if is_annex else True,
            **add_kw
        )
        for a in added:
            res = annexjson2result(a, ds, type='file', **common_report)
            success = success_status_map[res['status']]
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
            # produce best possible path/result annotation
            if res['path'] in torepoadd:
                # pull out correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                res = dict(torepoadd[res['path']], **res)
            # override this in all cases to be safe
            res['parentds'] = ds.path
            if success:
                # this was successfully added, queue for saving this very path
                # in the dataset
                ap = {k: v for k, v in res.items() if k != 'status'}
                ap['staged'] = True
                # strip any status and state info (e.g. save will refuse to
                # save stuff that is marked state='untracked')
                to_save.append({
                    k: v for k, v in res.items()
                    if k not in ('status', 'state')})
            if a['file'] == '.gitmodules':
                # filter out .gitmodules, because this is only included for
                # technical reasons and has nothing to do with the actual
                # content
                continue
            if GitRepo.is_valid_repo(res['path']):
                # more accurate report in case of an added submodule
                # mountpoint.
                # XXX Actually not sure if this can really happen
                # (depends on what our low-level code would do)
                # but worst case is that we lose a little bit of
                # coverage...
                res['type'] = 'dataset'
                res['message'] = 'added new state as submodule'
            yield res

        for r in results_from_annex_noinfo(
                ds, torepoadd, respath_by_status,
                dir_fail_msg='could not add some content in %s %s',
                noinfo_dir_msg='nothing to add from %s',
                noinfo_file_msg='already included in the dataset',
                action='add',
                logger=lgr,
                refds=refds_path):
            if r['path'] in torepoadd:
                # pull out correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                r = dict(r, **torepoadd[r['path']])

            if r['status'] == 'notneeded':
                # this could be a file that was staged already, it doesn't
                # need to be added, but it should be saved/committed if so
                # desired
                to_save.append({
                    k: v for k, v in r.items()
                    if k not in ('status', 'state')})

            # XXX something is fishy with the next one, rethink when sober....
            if r['path'] == ds_path and r['status'] == 'ok':
                # this is for the entire dataset itself which was explicitly
                # requested; make sure to save all
                r['type'] = 'dataset'
                r['process_content'] = True
                to_save.append(
                    {k: v for k, v in r.items() if k != 'status'})
            yield r

        if refds_path and ds_path != refds_path and \
                len(respath_by_status.get('success', [])):
            # TODO XXX we have an issue here when `add('.')` is used and annex
            # ignores any dotfiles. In this case we end up not saving a
            # dataset completely, because we rely on accurate reporting.
            # there is an issue about this already
            # TODO look up the issue ID
            # if there is a base dataset, but we are below it, and we have
            # anything done to this dataset -> queue dataset itself for
            # saving its state in the parent
            ds_ap = dict(
                path=ds.path,
                # we have to look for the parent here, as we must save the
                # subdataset in the parent and not the whole subdataset itself
                type='dataset')
            parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
            if parentds:
                ds_ap['parentds'] = parentds
            if dataset:
                ds_ap['refds'] = refds_path
            to_save.append(ds_ap)

    if not save:
        lgr.debug('Not calling `save` as instructed')
        return

    # TODO tell save what was staged already! Set 'staged=True' for
    # respective annotated paths that are fed into `save`

    # do not reuse any of the sorting done in here for saving, but instead
    # pass on all the annotated paths to have `save` figure out what to do
    # with them -- this costs something, but should be safer, and frankly is
    # more comprehensible
    for res in Save.__call__(
            # hand-selected annotated paths
            path=to_save,
            dataset=refds_path,
            message=message if message else '[DATALAD] added content',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
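# A hedged usage sketch for the `__call__` above, assuming this Interface is
# exposed through the Python API as `datalad.api.add` (the conventional
# DataLad wiring); the path and dataset values are hypothetical.
from datalad.api import add

for res in add(path=['data/raw.dat'],     # hypothetical file to add
               dataset='.',               # dataset at the current directory
               to_git=False,              # keep content under annex control
               recursive=True,            # discover unregistered subdatasets
               save=True,                 # commit what was added afterwards
               return_type='generator'):  # stream result records as they come
    # each record is a status dict as yielded by the implementation above
    print(res['status'], res['path'], res.get('message', ''))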
def _the_same_across_datasets(relpath, *dss):
    """Check if the file (present content or not) is identical across datasets

    Compares files by content if under git, or by checksum if under annex

    Parameters
    ----------
    relpath: str
      path within the datasets
    *dss: Dataset
      datasets to compare across

    Returns
    -------
    bool or None
      True if identical, False if not, None if it cannot be decided
      (e.g. different git-annex backends used)
    """
    from datalad.utils import all_same, md5sum, unique
    from datalad.support.exceptions import FileInGitError
    from datalad.support.digests import Digester

    paths = [op.join(ds.path, relpath) for ds in dss]
    # The simplest check first -- exist in both and content is the same.
    # Even if content is just a symlink file on windows, the same content
    # condition would be correct
    if all(map(op.exists, paths)) and all_same(map(md5sum, paths)):
        return True

    # We first need to find problematic ones which are annexed and
    # have no content locally, and take their keys
    keys = []
    backends = []
    presents = []
    for ds in dss:
        repo = ds.repo
        key = None
        present = True
        if isinstance(repo, AnnexRepo):
            try:
                key = repo.get_file_key(relpath)
            except FileInGitError:
                continue
            if not key:
                raise ValueError(
                    "Must have got a key, unexpectedly got %r for %s within %s"
                    % (key, relpath, ds)
                )
            # For now the rest (e.g. not tracked) remains an error
            if not repo.file_has_content(relpath):
                present = False
                backends.append(repo.get_key_backend(key))
        keys.append(key)
        presents.append(present)

    if all(presents):
        return all_same(map(md5sum, paths))

    backends = unique(backends)
    assert backends, \
        "Since not all are present - some must be under annex, " \
        "and thus must have a backend!"
    # so some files are missing!
    assert not all(presents)
    NeedContentError = RuntimeError
    if len(backends) > 1:
        # TODO: or signal otherwise somehow that we just need to get at least
        # some of those files to do the check!...
        raise NeedContentError(
            "Following paths are missing content and have different annex "
            "backends: %s. Cannot determine here if the same or not!"
            % ", ".join(
                p for p, present in zip(paths, presents) if not present)
        )
    backend = backends[0].lower()
    if backend.endswith('e'):
        # strip the trailing E of extension-aware backends, e.g. MD5E -> md5
        backend = backend[:-1]
    if backend not in Digester.DEFAULT_DIGESTS:
        raise NeedContentError(
            "Do not know how to figure out content check for backend %s"
            % backend
        )

    checksums = [
        split_ext(key)[0].split('--', 1)[1] if key else key
        for key in keys
    ]
    thechecksum = set(
        checksum
        for present, checksum in zip(presents, checksums)
        if present
    )
    if len(thechecksum) > 1:
        # Different checksums (with the same backend)
        return False
    elif not thechecksum:
        raise RuntimeError(
            "We must have had at least one key since prior logic"
            " showed that not all files have content here")
    thechecksum = thechecksum.pop()
    if any(presents):
        # We do need to extract checksum from the key and check the present
        # files' content to match
        digester = Digester([backend])
        for present, path in zip(presents, paths):
            if present and digester(path)[backend] != thechecksum:
                return False
        return True
    return False
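# Two minimal sketches of the building blocks used above, standard library
# only: a streaming MD5 comparison (standing in for datalad's md5sum/Digester
# machinery) and checksum extraction from a git-annex key. The key layout
# assumed is the documented 'BACKEND-sSIZE--CHECKSUM[.ext]' format; the
# function names here are illustrative, not datalad API.
import hashlib


def files_identical(path_a, path_b, chunk_size=65536):
    """Compare two files by streaming MD5 digests"""
    digests = []
    for path in (path_a, path_b):
        h = hashlib.md5()
        with open(path, 'rb') as f:
            # read in chunks so large annexed files don't need to fit in RAM
            for chunk in iter(lambda: f.read(chunk_size), b''):
                h.update(chunk)
        digests.append(h.hexdigest())
    return digests[0] == digests[1]


def checksum_from_key(key):
    """Extract the checksum part of a git-annex key

    E.g. 'MD5E-s1024--d41d8cd98f00b204e9800998ecf8427e.txt' yields
    'd41d8cd98f00b204e9800998ecf8427e'; extension-aware *E backends
    append the original file extension, which is stripped here.
    """
    checksum = key.split('--', 1)[1]
    # drop any trailing extension carried by *E backends
    return checksum.split('.', 1)[0]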