def test_add_files(path):
    # Exercise `Dataset.save()` with a mix of arguments: a single file,
    # a list of files, and a whole directory, toggling annex vs. plain-git
    # storage via `to_git`.
    ds = Dataset(path).create(force=True)

    test_list_1 = ['test_annex.txt']
    test_list_2 = ['test.txt']
    test_list_3 = ['test1.dat', 'test2.dat']
    test_list_4 = [
        op.join('dir', 'testindir'),
        op.join('dir', OBSCURE_FILENAME)
    ]

    # each tuple is (paths to save, to_git flag)
    for arg in [(test_list_1[0], False), (test_list_2[0], True),
                (test_list_3, False), (test_list_4, False)]:
        # special case 4: give the dir:
        if arg[0] == test_list_4:
            result = ds.save('dir', to_git=arg[1])
            status = ds.repo.annexstatus(['dir'])
        else:
            result = ds.save(arg[0], to_git=arg[1])
            # one 'save' result per requested path is expected
            for a in assure_list(arg[0]):
                assert_result_count(result, 1, path=text_type(ds.pathobj / a))
            status = ds.repo.get_content_annexinfo(
                ut.Path(p) for p in assure_list(arg[0]))
        for f, p in iteritems(status):
            if arg[1]:
                # stored in git: must not have an annex key
                assert p.get('key', None) is None, f
            else:
                # annexed: must have an annex key
                assert p.get('key', None) is not None, f
def test_add_files(path):
    # Duplicate variant of the `Dataset.save()` smoke test: save single
    # files, file lists, and a directory, checking annex vs. git storage.
    ds = Dataset(path).create(force=True)

    test_list_1 = ['test_annex.txt']
    test_list_2 = ['test.txt']
    test_list_3 = ['test1.dat', 'test2.dat']
    test_list_4 = [op.join('dir', 'testindir'),
                   op.join('dir', OBSCURE_FILENAME)]

    # each tuple is (paths to save, to_git flag)
    for arg in [(test_list_1[0], False), (test_list_2[0], True),
                (test_list_3, False), (test_list_4, False)]:
        # special case 4: give the dir:
        if arg[0] == test_list_4:
            result = ds.save('dir', to_git=arg[1])
            status = ds.repo.annexstatus(['dir'])
        else:
            result = ds.save(arg[0], to_git=arg[1])
            # one 'save' result per requested path is expected
            for a in assure_list(arg[0]):
                assert_result_count(result, 1, path=text_type(ds.pathobj / a))
            status = ds.repo.get_content_annexinfo(
                ut.Path(p) for p in assure_list(arg[0]))
        for f, p in iteritems(status):
            if arg[1]:
                # stored in git: must not have an annex key
                assert p.get('key', None) is None, f
            else:
                # annexed: must have an annex key
                assert p.get('key', None) is not None, f
def _get_procedure_implementation(name='*', ds=None): ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None # 1. check dataset for procedure if ds is not None and ds.is_installed(): # could be more than one dirs = assure_list(ds.config.obtain('datalad.locations.dataset-procedures')) for dir in dirs: # TODO `get` dirs if necessary for m in _get_file_match(op.join(ds.path, dir), name): yield m # 2. check system and user account for procedure for loc in (cfg.obtain('datalad.locations.user-procedures'), cfg.obtain('datalad.locations.system-procedures')): for dir in assure_list(loc): for m in _get_file_match(dir, name): yield m # 3. check extensions for procedure # delay heavy import until here from pkg_resources import iter_entry_points from pkg_resources import resource_isdir from pkg_resources import resource_filename for entry_point in iter_entry_points('datalad.extensions'): # use of '/' here is OK wrt to platform compatibility if resource_isdir(entry_point.module_name, 'resources/procedures'): for m in _get_file_match( resource_filename( entry_point.module_name, 'resources/procedures'), name): yield m # 4. at last check datalad itself for procedure for m in _get_file_match( resource_filename('datalad', 'resources/procedures'), name): yield m
def _get_procedure_implementation(name='*', ds=None):
    """get potential procedure path and configuration

    Order of consideration is user-level, system-level, dataset,
    datalad extensions, datalad. First one found according to this order
    is the one to be returned. Therefore local definitions/configurations
    take precedence over ones, that come from outside (via a
    datalad-extension or a dataset with its .datalad/config). If a dataset
    had precedence (as it was before), the addition (or just an update) of
    a (sub-)dataset would otherwise surprisingly cause you do execute code
    different from what you defined within ~/.gitconfig or your local
    repository's .git/config. So, local definitions take precedence over
    remote ones and more specific ones over more general ones.

    Returns
    -------
    tuple
      path, format string, help message
    """
    ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None

    # 1. check system and user account for procedure
    for loc in (cfg.obtain('datalad.locations.user-procedures'),
                cfg.obtain('datalad.locations.system-procedures')):
        for dir in assure_list(loc):
            for m, n in _get_file_match(dir, name):
                yield (m, ) + _get_proc_config(n)
    # 2. check dataset for procedure
    if ds is not None and ds.is_installed():
        # could be more than one
        dirs = assure_list(
            ds.config.obtain('datalad.locations.dataset-procedures'))
        for dir in dirs:
            # TODO `get` dirs if necessary
            for m, n in _get_file_match(op.join(ds.path, dir), name):
                yield (m, ) + _get_proc_config(n, ds=ds)
        # 2.1. check subdatasets recursively
        for subds in ds.subdatasets(return_type='generator',
                                    result_xfm='datasets'):
            for m, f, h in _get_procedure_implementation(name=name, ds=subds):
                yield m, f, h
    # 3. check extensions for procedure
    # delay heavy import until here
    from pkg_resources import iter_entry_points
    from pkg_resources import resource_isdir
    from pkg_resources import resource_filename
    for entry_point in iter_entry_points('datalad.extensions'):
        # use of '/' here is OK wrt to platform compatibility
        if resource_isdir(entry_point.module_name, 'resources/procedures'):
            for m, n in _get_file_match(
                    resource_filename(entry_point.module_name,
                                      'resources/procedures'),
                    name):
                yield (m, ) + _get_proc_config(n)
    # 4. at last check datalad itself for procedure
    for m, n in _get_file_match(
            resource_filename('datalad', 'resources/procedures'),
            name):
        yield (m, ) + _get_proc_config(n)
def save_dataset_hierarchy(
        info,
        base=None,
        message='[DATALAD] saved changes',
        version_tag=None):
    """Save (disjoint) hierarchies of datasets.

    Saving is done in an order that guarantees that all to be saved
    datasets reflect any possible change of any other to be saved
    subdataset, before they are saved themselves.

    Parameters
    ----------
    info : dict
      Absolute paths of datasets to be saved are the keys,
      and paths in each dataset to be saved are the values
    base : path or None, optional
      Common super dataset that should also be saved.
    message : str
      Message to be used for saving individual datasets
    version_tag : str, optional
      Tag to pass through to `save_dataset` for each saved dataset.

    Returns
    -------
    list
      Instances of saved datasets, in the order in which they
      where saved.
    """
    if not isinstance(info, dict):
        # normalize a plain sequence of dataset paths into the dict form:
        # each dataset is to be saved at its own root
        info = assure_list(info)
        info = dict(zip(info, [[i] for i in info]))
    dpaths = info.keys()
    if base:
        # just a convenience...
        dpaths = assure_list(dpaths)
        dpaths.append(base.path if isinstance(base, Dataset) else base)
    # sort all datasets under their potential superdatasets
    # start from the top to get all subdatasets down the line
    # and collate them into as few superdatasets as possible
    superdss = get_tree_roots(dpaths)
    # for each "superdataset" check the tree of subdatasets and make sure
    # we gather all datasets between the super and any subdataset
    # so we can save them all bottom-up in order to be able to properly
    # save the superdataset
    for superds_path in superdss:
        target_subs = superdss[superds_path]
        sort_paths_into_subdatasets(superds_path, target_subs, info)
    # iterate over all datasets, starting at the bottom
    # (reverse-sorted paths put deeper datasets first)
    saved = []
    for dpath in sorted(info.keys(), reverse=True):
        ds = Dataset(dpath)
        if ds.is_installed():
            saved_state = save_dataset(
                ds,
                info[dpath],
                message=message,
                version_tag=version_tag)
            if saved_state:
                saved.append(ds)
    return saved
def __call__(path=None, dataset=None, fulfilled=None, recursive=False,
             recursion_limit=None, contains=None, bottomup=False,
             set_property=None, delete_property=None):
    # Query (and optionally modify properties of) subdatasets of a dataset;
    # yields one result record per matching subdataset.
    # no constraints given -> query subdatasets under curdir
    if not path and dataset is None:
        path = os.curdir
    paths = [rev_resolve_path(p, dataset) for p in assure_list(path)] \
        if path else None
    ds = require_dataset(dataset, check_installed=False,
                         purpose='subdataset reporting/modification')
    lgr.debug('Query subdatasets of %s', dataset)
    if paths is not None:
        lgr.debug('Query subdatasets underneath paths: %s', paths)
    refds_path = ds.path
    # XXX this seems strange, but is tested to be the case -- I'd rather set
    # `check_installed` to true above and fail
    if not GitRepo.is_valid_repo(refds_path):
        return
    # return as quickly as possible
    if isinstance(recursion_limit, int) and (recursion_limit <= 0):
        return
    if set_property:
        # validate keys before touching anything
        for k, v in set_property:
            if valid_key.match(k) is None:
                raise ValueError(
                    "key '%s' is invalid (alphanumeric plus '-' only, must "
                    "start with a letter)" % k)
    if contains:
        contains = [
            rev_resolve_path(c, dataset) for c in assure_list(contains)
        ]
    for r in _get_submodules(ds, paths, fulfilled, recursive, recursion_limit,
                             contains, bottomup, set_property,
                             delete_property, refds_path):
        # a boat-load of ancient code consumes this and is ignorant of
        # Path objects
        r['path'] = text_type(r['path'])
        # without the refds_path cannot be rendered/converted relative
        # in the eval_results decorator
        r['refds'] = refds_path
        yield r
def __call__(
        path=None, dataset=None, fulfilled=None, recursive=False,
        recursion_limit=None, contains=None, bottomup=False,
        set_property=None, delete_property=None):
    # Duplicate variant: query (and optionally modify properties of)
    # subdatasets; yields one result record per matching subdataset.
    # no constraints given -> query subdatasets under curdir
    if not path and dataset is None:
        path = os.curdir
    paths = [rev_resolve_path(p, dataset) for p in assure_list(path)] \
        if path else None
    ds = require_dataset(
        dataset, check_installed=False,
        purpose='subdataset reporting/modification')
    lgr.debug('Query subdatasets of %s', dataset)
    if paths is not None:
        lgr.debug('Query subdatasets underneath paths: %s', paths)
    refds_path = ds.path
    # XXX this seems strange, but is tested to be the case -- I'd rather set
    # `check_installed` to true above and fail
    if not GitRepo.is_valid_repo(refds_path):
        return
    # return as quickly as possible
    if isinstance(recursion_limit, int) and (recursion_limit <= 0):
        return
    if set_property:
        # validate keys before touching anything
        for k, v in set_property:
            if valid_key.match(k) is None:
                raise ValueError(
                    "key '%s' is invalid (alphanumeric plus '-' only, must "
                    "start with a letter)" % k)
    if contains:
        contains = [rev_resolve_path(c, dataset)
                    for c in assure_list(contains)]
    for r in _get_submodules(
            ds, paths, fulfilled, recursive, recursion_limit, contains,
            bottomup, set_property, delete_property, refds_path):
        # a boat-load of ancient code consumes this and is ignorant of
        # Path objects
        r['path'] = text_type(r['path'])
        # without the refds_path cannot be rendered/converted relative
        # in the eval_results decorator
        r['refds'] = refds_path
        yield r
def get_query(self, query):
    """Compile query expression(s) into regex matchers.

    Each element may carry a `field:pattern` (or `'quoted field':pattern`)
    prefix to constrain matching to a metadata field. The queried field
    names are recorded in `self._queried_keys` as a side effect (with a
    `None` entry if any element had no field specification).
    """
    query = assure_list(query)
    simple_fieldspec = re.compile(r"(?P<field>\S*?):(?P<query>.*)")
    quoted_fieldspec = re.compile(r"'(?P<field>[^']+?)':(?P<query>.*)")
    # each element becomes either a regex match object (field spec found)
    # or stays a plain string
    query_rec_matches = [
        simple_fieldspec.match(q) or
        quoted_fieldspec.match(q) or
        q
        for q in query]
    query_group_dicts_only = [
        q.groupdict() for q in query_rec_matches if hasattr(q, 'groupdict')
    ]
    self._queried_keys = [
        qgd['field']
        for qgd in query_group_dicts_only
        if ('field' in qgd and qgd['field'])
    ]
    if len(query_group_dicts_only) != len(query_rec_matches):
        # we had a query element without field specification add
        # None as an indicator of that
        self._queried_keys.append(None)
    # expand matches, compile expressions
    query = [
        {k: re.compile(self._xfm_query(v))
         for k, v in q.groupdict().items()}
        if hasattr(q, 'groupdict')
        else re.compile(self._xfm_query(q))
        for q in query_rec_matches
    ]
    # turn "empty" field specs into simple queries
    # this is used to forcibly disable field-based search
    # e.g. when searching for a value
    query = [q['query']
             if isinstance(q, dict) and q['field'].pattern == '' else q
             for q in query]
    return query
def test_recurse_existing(src, path): origin_ds = _make_dataset_hierarchy(src) # make sure recursion_limit works as expected across a range of depths for depth in range(len(origin_ds)): datasets = assure_list( install(path, source=src, recursive=True, recursion_limit=depth)) # we expect one dataset per level eq_(len(datasets), depth + 1) rmtree(path) # now install all but the last two levels, no data root, sub1, sub2 = install(path, source=src, recursive=True, recursion_limit=2) ok_(sub2.repo.file_has_content('file_in_annex.txt') is False) sub3 = Dataset(opj(sub2.path, 'sub3')) ok_(not sub3.is_installed()) # now get all content in all existing datasets, no new datasets installed # in the process files = root.get(curdir, recursive=True, recursion_limit='existing') eq_(len(files), 1) ok_(sub2.repo.file_has_content('file_in_annex.txt') is True) ok_(not sub3.is_installed()) # now pull down all remaining datasets, no data sub3, sub4 = root.get(curdir, recursive=True, get_data=False) ok_(sub4.is_installed()) ok_(sub3.repo.file_has_content('file_in_annex.txt') is False) # aaannd all data files = root.get(curdir, recursive=True) eq_(len(files), 1) ok_(sub3.repo.file_has_content('file_in_annex.txt') is True)
def get_normalized_path_arguments(paths, dataset=None, default=None):
    """Apply standard resolution to path arguments

    This is nothing more than a helper to standardize path argument
    preprocessing.

    Parameters
    ----------
    paths : sequence or single path
      Path(s) to normalize
    dataset : path or Dataset or None
      Optional dataset identifying something against which to resolve input
      path arguments
    default: sequence of paths or single path or None
      If `paths` is empty, use this instead

    Returns
    -------
    tuple(list(paths), path)
      Normalized paths and path to a potential dataset against which paths
      were resolved.
    """
    dataset_path = dataset.path if isinstance(dataset, Dataset) else dataset
    if not paths and default:
        paths = default
    paths = assure_list(paths)
    # resolve path(s):
    resolved_paths = [resolve_path(p, dataset) for p in paths]
    if dataset:
        # guarantee absolute paths
        resolved_paths = [opj(dataset_path, p) for p in resolved_paths]
    lgr.debug('Resolved input path arguments: %s', resolved_paths)
    return resolved_paths, dataset_path
def test_recurse_existing(src, path): origin_ds = _make_dataset_hierarchy(src) # make sure recursion_limit works as expected across a range of depths for depth in range(len(origin_ds)): datasets = assure_list( install(path, source=src, recursive=True, recursion_limit=depth)) # we expect one dataset per level eq_(len(datasets), depth + 1) rmtree(path) # now install all but the last two levels, no data root, sub1, sub2 = install(path, source=src, recursive=True, recursion_limit=2) ok_(sub2.repo.file_has_content('file_in_annex.txt') is False) sub3 = Dataset(opj(sub2.path, 'sub3')) ok_(not sub3.is_installed()) # now get all content in all existing datasets, no new datasets installed # in the process files = root.get(curdir, recursive=True, recursion_limit='existing') eq_(len(files), 1) ok_(sub2.repo.file_has_content('file_in_annex.txt') is True) ok_(not sub3.is_installed()) # now pull down all remaining datasets, no data sub3, sub4 = root.get(curdir, recursive=True, get_data=False) ok_(sub4.is_installed()) ok_(sub3.repo.file_has_content('file_in_annex.txt') is False) # aaannd all data files = root.get(curdir, recursive=True) eq_(len(files), 1) ok_(sub3.repo.file_has_content('file_in_annex.txt') is True)
def close(self, allow_fail=True, ctrl_path=None):
    """Closes all connections, known to this instance.

    Parameters
    ----------
    allow_fail: bool, optional
      If True, swallow exceptions which might be thrown during
      connection.close, and just log them at DEBUG level
    ctrl_path: str or list of str, optional
      If specified, only the path(s) provided would be considered
    """
    if self._connections:
        from datalad.utils import assure_list
        ctrl_paths = assure_list(ctrl_path)
        to_close = [
            c for c in self._connections
            # don't close if connection wasn't opened by SSHManager
            if self._connections[c].ctrl_path not in self._prev_connections
            and self._connections[c].ctrl_path.exists()
            and (not ctrl_paths
                 or self._connections[c].ctrl_path in ctrl_paths)
        ]
        if to_close:
            lgr.debug("Closing %d SSH connections..." % len(to_close))
        for cnct in to_close:
            f = self._connections[cnct].close
            # BUG FIX: the condition was inverted -- per the docstring,
            # exceptions must be swallowed (and logged) when `allow_fail`
            # is True, and propagated when it is False
            if not allow_fail:
                f()
            else:
                try:
                    f()
                except Exception as exc:
                    lgr.debug("Failed to close a connection: "
                              "%s", exc_str(exc))
        self._connections = dict()
def results_from_paths(paths, action=None, type=None, logger=None, refds=None,
                       status=None, message=None):
    """Helper to yield analog result dicts for each path in a sequence.

    Parameters
    ----------
    paths : path or sequence of paths
      Path(s) to generate result records for.
    action, type, logger, refds, status
      Passed through to `get_status_dict` for every record.
    message : str, optional
      A result message. May contain `%s` which will be replaced by the
      respective `path`.

    Returns
    -------
    generator
    """
    for p in assure_list(paths):
        yield get_status_dict(
            action, path=p, type=type, logger=logger, refds=refds,
            status=status,
            # BUG FIX: guard against `message` being None (the default) --
            # `'%s' in None` would raise TypeError
            message=(message, p) if message and '%s' in message else message)
def close(self, allow_fail=True, ctrl_path=None):
    """Closes all connections, known to this instance.

    Parameters
    ----------
    allow_fail: bool, optional
      If True, swallow exceptions which might be thrown during
      connection.close, and just log them at DEBUG level
    ctrl_path: str or list of str, optional
      If specified, only the path(s) provided would be considered
    """
    if self._connections:
        from datalad.utils import assure_list
        ctrl_paths = assure_list(ctrl_path)
        to_close = [c for c in self._connections
                    # don't close if connection wasn't opened by SSHManager
                    if self._connections[c].ctrl_path
                    not in self._prev_connections and
                    exists(self._connections[c].ctrl_path) and
                    (not ctrl_paths or
                     self._connections[c].ctrl_path in ctrl_paths)]
        if to_close:
            lgr.debug("Closing %d SSH connections..." % len(to_close))
        for cnct in to_close:
            f = self._connections[cnct].close
            # BUG FIX: the condition was inverted -- per the docstring,
            # exceptions must be swallowed (and logged) when `allow_fail`
            # is True, and propagated when it is False
            if not allow_fail:
                f()
            else:
                try:
                    f()
                except Exception as exc:
                    lgr.debug("Failed to close a connection: "
                              "%s", exc_str(exc))
        self._connections = dict()
def is_result_matching_pathsource_argument(res, **kwargs):
    """Check whether result `res` matches the original path/source arguments.

    Returns True when the result corresponds to one of the requested
    paths/URLs (or the given `source`), False otherwise.
    """
    # we either have any non-zero number of "paths" (that could be anything),
    # or we have one path and one source
    # we don't do any error checking here, done by the command itself
    source = kwargs.get('source', None)
    if source is not None:
        # if there was a source, it needs to be recorded in the result
        # otherwise this is not what we are looking for
        return source == res.get('source_url', None)
    # the only thing left is a potentially heterogeneous list of paths/URLs
    paths = assure_list(kwargs.get('path', []))
    # three cases left:
    # 1. input arg was an absolute path -> must match 'path' property
    # 2. input arg was relative to a dataset -> must match refds/relpath
    # 3. something nifti with a relative input path that uses PWD as the
    #    reference
    respath = res.get('path', None)
    if respath in paths:
        # absolute match, pretty sure we want this
        return True
    elif kwargs.get('dataset', None) and YieldRelativePaths()(res) in paths:
        # command was called with a reference dataset, and a relative
        # path of a result matches in input argument -- not 100% exhaustive
        # test, but could be good enough
        return True
    elif any(abspath(p) == respath for p in paths):
        # one absolutified input path matches the result path
        # I'd say: go for it!
        return True
    elif any(p == res.get('source_url', None) for p in paths):
        # this was installed from a URL that was given, we'll take that too
        return True
    else:
        # BUG FIX: the original ended in a bare `False` expression with no
        # `return`, so the function fell through and returned None; make
        # the negative outcome an explicit boolean
        return False
def get_query(self, query):
    """Turn the given query word(s) into a parsed whoosh query.

    Shell callers may hand over any number of individual words; these
    are recombined into a single query string before parsing.
    """
    # make sure a parser instance is available
    self._mk_parser()
    # normalize to a list and rejoin into one string
    words = assure_list(query)
    joined = ' '.join(words)
    # hand the string to whoosh, which returns a formal query object
    return self.parser.parse(joined)
def __call__(module=None, verbose=False, nocapture=False, pdb=False,
             stop=False):
    # Run the test suite for datalad and/or any installed extensions;
    # with no `module` given, all registered 'datalad.tests' entry points
    # are included.
    if not module:
        from pkg_resources import iter_entry_points
        module = ['datalad']
        module.extend(ep.module_name
                      for ep in iter_entry_points('datalad.tests'))
    module = assure_list(module)
    lgr.info('Starting test run for module(s): %s', module)
    for mod in module:
        datalad.test(module=mod, verbose=verbose, nocapture=nocapture,
                     pdb=pdb, stop=stop)
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        # plain Git repo without annex support: nothing can be dropped
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return
    opts = ['--force'] if not check else []
    respath_by_status = {}
    for res in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            res, ds, type='file', **kwargs)
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
        yield res
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
def custom_result_renderer(res, **kwargs):
    # Command-line rendering of a `meta_extract` result: extractor state
    # reports, JSON-LD dumps, or a path/keys/tags summary line.
    if res['status'] != 'ok' or \
            not res.get('action', None) == 'meta_extract':
        # logging complained about this already
        return
    if 'state' in res and 'extractor' in res:
        # extractor report, special treatment
        ui.message('{name}({state})'.format(
            name=ac.color_word(res['extractor'], ac.BOLD),
            state=','.join(
                '{}{}{}{}'.format(
                    # boolean states get a + or - prefix
                    '+' if v is True else '-' if v is False else '',
                    k,
                    '=' if not isinstance(v, bool) else '',
                    v if not isinstance(v, bool) else '')
                for k, v in iteritems(res['state'])
                # this is an extractor property, and mostly serves
                # internal purposes
                if k not in ('unique_exclude', )),
        ))
        return
    if kwargs.get('format', None) == 'jsonld':
        # special case of a JSON-LD report request
        # all reports are consolidated into a single
        # graph, dumps just that (no pretty printing, can
        # be done outside)
        ui.message(
            jsondumps(
                res['metadata'],
                # support utf-8 output
                ensure_ascii=False,
                # this cannot happen, spare the checks
                check_circular=False,
                # this will cause the output to not necessarily be
                # JSON compliant, but at least contain all info that went
                # in, and be usable for javascript consumers
                allow_nan=True,
            ))
        return
    # list the path, available metadata keys, and tags
    path = op.relpath(res['path'], res['refds']) if res.get(
        'refds', None) else res['path']
    meta = res.get('metadata', {})
    ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
        path=ac.color_word(path, ac.BOLD),
        type=' ({})'.format(ac.color_word(res['type'], ac.MAGENTA))
        if 'type' in res else '',
        spacer=' ' if len([m for m in meta if m != 'tag']) else '',
        meta=','.join(k for k in sorted(meta.keys())
                      if k not in ('tag', '@context', '@id'))
        if meta else ' -' if 'metadata' in res
        else ' {}'.format(','.join(
            e for e in res['extractors']
            if e not in ('datalad_core', 'metalad_core', 'metalad_annex')))
        if 'extractors' in res else '',
        tags='' if 'tag' not in meta else ' [{}]'.format(','.join(
            assure_list(meta['tag'])))))
def _get_dsmeta_srcfiles(ds):
    """Determine which files to query for custom dataset metadata.

    Returns a tuple ``(srcfiles, cfg_srcfiles)``: the effective list of
    source files, and the (possibly empty) explicitly configured list.
    """
    # consult the dataset configuration for explicit source files
    cfg_srcfiles = assure_list(
        ds.config.obtain('datalad.metadata.custom-dataset-source', []))
    # without configuration, fall back to the default location -- but only
    # when that file actually exists (path is deliberately POSIX)
    default_file = ds.pathobj / '.metadata' / 'dataset.json'
    if not cfg_srcfiles and op.lexists(text_type(default_file)):
        return ['.metadata/dataset.json'], cfg_srcfiles
    return cfg_srcfiles, cfg_srcfiles
def result_renderer_cmdline(res, args):
    """Render sibling-creation results for command-line consumption."""
    from datalad.ui import ui
    res = assure_list(res)
    if args.dryrun:
        ui.message('DRYRUN -- Anticipated results:')
    if res:
        # one line per configured sibling
        for d, url, existed in res:
            note = " (existing repository)" if existed else ''
            ui.message("'{}'{} configured as sibling '{}' for {}".format(
                url, note, args.name, d))
    else:
        ui.message("Nothing done")
def pipeline(url, project, project_access='public', subjects=None):
    # Build a crawler pipeline fetching all files of an XNAT `project`
    # from the server at `url` and annexing them.
    # TODO: Ben: Clarify parameters. In particular `project_access` is
    # unclear to me
    subjects = assure_list(subjects)
    xnat = XNATServer(url)

    def get_project_info(data):
        # fetch project-level metadata from the server
        out = xnat('data/projects/%s' % project,
                   return_plain=True
                   )
        # for NITRC I need to get more!
        # "http://nitrc_es.projects.nitrc.org/datalad/%s" % dataset
        items = out['items']
        assert len(items) == 1
        dataset_meta = items[0]['data_fields']
        # TODO: save into a file
        yield data

    def get_files(data):
        # yield one record (url/path/name) per file of the project
        for f in xnat.get_all_files_for_project(project, subjects=subjects):
            # TODO: tune up filename
            # TODO: get url
            prefix = '/data/experiments/'
            assert f['uri'].startswith('%s' % prefix)
            # TODO: use label for subject/experiment
            # TODO: might want to allow for
            # XNAT2BIDS whenever that one is available:
            # http://reproducibility.stanford.edu/accepted-projects-for-the-2nd-crn-coding-sprint/
            exp_label = xnat.experiment_labels[f['experiment_id']]
            yield updated(data,
                          {'url': url + f['uri'],
                           'path': f['uri'][len(prefix):],
                           'name': '%s-%s' % (exp_label, f['name'])
                           })

    annex = Annexificator(
        create=False,  # must be already initialized etc
        # leave in Git only obvious descriptors and code snippets -- the rest
        # goes to annex so may be eventually we could take advantage of git
        # tags for changing layout
        statusdb='json',
        special_remotes=['datalad'] if project_access != 'public' else None
    )

    return [
        get_project_info,
        [
            get_files,
            annex
        ],
        annex.finalize(cleanup=True, aggregate=True),
    ]
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        # plain Git repo without annex support: nothing can be dropped
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return
    opts = ['--force'] if not check else []
    respath_by_status = {}
    for res in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            res, ds, type='file', **kwargs)
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
        yield res
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
def _check_deps(repo, deps):
    """Check if all `deps` remotes are known to the `repo`

    Raises
    ------
    ValueError
      if any of the deps is an unknown remote
    """
    known_remotes = repo.get_remotes()
    # collect everything requested that the repo does not know about
    missing = {dep for dep in assure_list(deps)
               if dep not in known_remotes}
    if missing:
        raise ValueError(
            'unknown sibling(s) specified as publication dependency: %s'
            % missing)
def superdataset_pipeline(url, limit=None, drop_empty=True):
    """Build a crawler pipeline that creates one dataset per XNAT project.

    Parameters
    ----------
    url : str
      XNAT server to query for projects.
    limit : TODO, optional
      Types of access to limit to, see XNAT.get_datasets
    drop_empty: bool, optional
      If set, do not create datasets which are empty (no files). Note -
      it requires obtaining details for every project, which could be
      a heavy operation

    Returns
    -------
    list
      Crawler pipeline specification.
    """
    annex = Annexificator(no_annex=True, allow_dirty=False)
    lgr.info("Creating a pipeline with url=%s limit=%s drop_empty=%s",
             url, limit, drop_empty)
    limit = assure_list(limit)
    drop_empty = assure_bool(drop_empty)

    def get_projects(data):
        # yield one record per project reported by the server
        xnat = XNATServer(url)
        for p in xnat.get_projects(
                asdict=False,
                limit=limit or PROJECT_ACCESS_TYPES,
                drop_empty=drop_empty
        ):
            yield updated(data, p)

    return [
        get_projects,
        assign({'project': '%(id)s',
                'dataset_name': '%(id)s',
                'url': url
                },
               interpolate=True),
        # TODO: should we respect x quarantine_status
        annex.initiate_dataset(
            template="xnat",
            data_fields=['project', 'url', 'project_access'],
            # TODO: may be project_access
            # let's all specs and modifications reside in master
            # branch='incoming',  # there will be archives etc
            existing='skip'
            # further any additional options
        )
    ]
def result_renderer_cmdline(res, args):
    """Render sibling-creation results for command-line consumption."""
    from datalad.ui import ui
    res = assure_list(res)
    if args.dryrun:
        ui.message('DRYRUN -- Anticipated results:')
    if res:
        # report each configured sibling on its own line
        for d, url, existed in res:
            suffix = " (existing repository)" if existed else ''
            ui.message(
                "'{}'{} configured as sibling '{}' for {}".format(
                    url, suffix, args.name, d))
    else:
        ui.message("Nothing done")
def __init__(self, dataset=None):
    """Retrieves the configured set of rules

    Rules are defined by classes ... + __datalad_hirni_rules
    datalad.hirni.dicom2spec.rules  ... multiple

    Parameters
    ----------
    dataset: Dataset
      Dataset to read possibly customized rules from
    """
    from datalad.utils import assure_list
    from datalad import cfg as dl_cfg
    from datalad_hirni.support.default_rules import DefaultRules
    # prefer dataset-level configuration, fall back to global config
    cfg = dataset.config if dataset else dl_cfg
    self._rule_set = []
    # get a list of paths to build the rule set from
    # Note: assure_list is supposed to return empty list if there's nothing
    self._file_list = \
        assure_list(cfg.get("datalad.hirni.dicom2spec.rules"))
    lgr.debug("loaded list of rule files: %s", self._file_list)
    for file in self._file_list:
        if not op.exists(file) or not op.isfile(file):
            # skip (but report) configured entries that are not files
            lgr.warning("Ignored invalid path for dicom2spec rules "
                        "definition: %s", file)
            continue
        from datalad.utils import import_module_from_file
        from datalad.dochelpers import exc_str
        try:
            mod = import_module_from_file(file)
        except Exception as e:
            # any exception means full stop
            raise ValueError("Rules definition file at {} is broken: {}"
                             "".format(file, exc_str(e)))
        # check file's __datalad_hirni_rules for the actual class:
        if not hasattr(mod, "__datalad_hirni_rules"):
            raise ValueError("Rules definition file {} missed attribute "
                             "'__datalad_hirni_rules'.".format(file))
        self._rule_set.append(getattr(mod, "__datalad_hirni_rules"))
    if not self._rule_set:
        # nothing configured (or nothing usable): fall back to defaults
        self._rule_set = [DefaultRules]
def check_integration1(login, keyring,
                       path,
                       organization=None,
                       kwargs={},
                       oauthtokens=None):
    # Test helper: exercise `create_sibling_github` end to end -- fresh
    # creation, name conflicts, and the 'reconfigure' mode.
    # note: the mutable default `kwargs={}` is safe here, it is copied
    # before any mutation
    kwargs = kwargs.copy()
    if organization:
        kwargs['github_organization'] = organization

    ds = Dataset(path).create()
    if oauthtokens:
        for oauthtoken in assure_list(oauthtokens):
            ds.config.add('hub.oauthtoken', oauthtoken, where='local')

    # so we do not pick up local repo configuration/token
    repo_name = 'test_integration1'
    with chpwd(path):
        # ATM all the github goodness does not care about "this dataset"
        # so force "process wide" cfg to pick up our defined above oauthtoken
        cfg.reload(force=True)
        # everything works just nice, no conflicts etc
        res = ds.create_sibling_github(repo_name, **kwargs)

        if organization:
            url_fmt = \
                'https://{login}@github.com/{organization}/{repo_name}.git'
        else:
            url_fmt = 'https://github.com/{login}/{repo_name}.git'
        eq_(res, [(ds, url_fmt.format(**locals()), False)])

        # but if we rerun - should kaboom since already has this sibling:
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, **kwargs)
        assert_in("already has a configured sibling", str(cme.exception))

        # but we can give it a new name, but it should kaboom since the
        # remote one exists already
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, name="github2", **kwargs)
        assert_in("already exists on", str(cme.exception))
        # we should not leave the broken sibling behind
        assert_not_in('github2', ds.repo.get_remotes())

        # If we ask to reconfigure - should proceed normally
        ds.create_sibling_github(repo_name, existing='reconfigure', **kwargs)
    cfg.reload(force=True)
def check_integration1(login, keyring,
                       path,
                       organization=None,
                       kwargs={},
                       oauthtokens=None):
    # Duplicate test helper: exercise `create_sibling_github` -- fresh
    # creation, name conflicts, and the 'reconfigure' mode.
    # note: the mutable default `kwargs={}` is safe here, it is copied
    # before any mutation
    kwargs = kwargs.copy()
    if organization:
        kwargs['github_organization'] = organization

    ds = Dataset(path).create()
    if oauthtokens:
        for oauthtoken in assure_list(oauthtokens):
            ds.config.add('hub.oauthtoken', oauthtoken, where='local')

    # so we do not pick up local repo configuration/token
    repo_name = 'test_integration1'
    with chpwd(path):
        # ATM all the github goodness does not care about "this dataset"
        # so force "process wide" cfg to pick up our defined above oauthtoken
        cfg.reload(force=True)
        # everything works just nice, no conflicts etc
        res = ds.create_sibling_github(repo_name, **kwargs)

        if organization:
            url_fmt = \
                'https://{login}@github.com/{organization}/{repo_name}.git'
        else:
            url_fmt = 'https://github.com/{login}/{repo_name}.git'
        eq_(res, [(ds, url_fmt.format(**locals()), False)])

        # but if we rerun - should kaboom since already has this sibling:
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, **kwargs)
        assert_in("already has a configured sibling", str(cme.exception))

        # but we can give it a new name, but it should kaboom since the
        # remote one exists already
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, name="github2", **kwargs)
        assert_in("already exists on", str(cme.exception))
        # we should not leave the broken sibling behind
        assert_not_in('github2', ds.repo.get_remotes())

        # If we ask to reconfigure - should proceed normally
        ds.create_sibling_github(repo_name, existing='reconfigure', **kwargs)
    cfg.reload(force=True)
def custom_result_renderer(res, **kwargs):
    """Render a successful 'metadata' result as a one-line console message."""
    # anything that is not a successful metadata report was already logged
    if res['status'] != 'ok' or res.get('action', None) != 'metadata':
        return
    # show the path relative to the reference dataset when one is known
    refds = res.get('refds', None)
    shown_path = relpath(res['path'], res['refds']) if refds else res['path']
    meta = res.get('metadata', {})
    # keys other than 'tag' determine whether a spacer is needed
    nontag_keys = [m for m in meta if m != 'tag']
    # the key listing: actual keys, ' -' for explicitly empty metadata,
    # or ' aggregated' when no metadata was reported at all
    if meta:
        key_listing = ','.join(
            k for k in sorted(meta.keys())
            if k not in ('tag', '@context', '@id'))
    elif 'metadata' in res:
        key_listing = ' -'
    else:
        key_listing = ' aggregated'
    if 'type' in res:
        type_part = ' ({})'.format(ac.color_word(res['type'], ac.MAGENTA))
    else:
        type_part = ''
    if 'tag' in meta:
        tag_part = ' [{}]'.format(','.join(assure_list(meta['tag'])))
    else:
        tag_part = ''
    ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
        path=ac.color_word(shown_path, ac.BOLD),
        type=type_part,
        spacer=' ' if nontag_keys else '',
        meta=key_listing,
        tags=tag_part))
def custom_result_renderer(res, **kwargs):
    """Print a one-line console summary for a successful 'metadata' result."""
    # failures and unrelated actions were already reported via logging
    if res['status'] != 'ok' or res.get('action', None) != 'metadata':
        return
    # prefer a path relative to the reference dataset, if available
    if res.get('refds', None):
        display_path = op.relpath(res['path'], res['refds'])
    else:
        display_path = res['path']
    meta = res.get('metadata', {})
    has_nontag = any(m != 'tag' for m in meta)
    # list actual metadata keys, or ' -' for an empty metadata record,
    # or ' aggregated' when no metadata key is present in the result
    if meta:
        keys_shown = ','.join(
            k for k in sorted(meta.keys())
            if k not in ('tag', '@context', '@id'))
    else:
        keys_shown = ' -' if 'metadata' in res else ' aggregated'
    type_label = ''
    if 'type' in res:
        type_label = ' ({})'.format(ac.color_word(res['type'], ac.MAGENTA))
    tag_label = ''
    if 'tag' in meta:
        tag_label = ' [{}]'.format(','.join(assure_list(meta['tag'])))
    ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
        path=ac.color_word(display_path, ac.BOLD),
        type=type_label,
        spacer=' ' if has_nontag else '',
        meta=keys_shown,
        tags=tag_label))
def sort_paths_into_subdatasets(superds_path, target_subs, spec):
    """Push paths in `spec` down to the subdatasets that contain them.

    `spec` maps a dataset path to a list of content paths.  For every target
    subdataset in `target_subs`, the trace of datasets from `superds_path`
    down to the target is walked, and any path belonging under a deeper
    subdataset is moved into that subdataset's entry, while the subdataset
    itself is recorded as a component to process in its superdataset.

    `spec` is modified in place; nothing is returned.
    """
    # XXX forge a chain: whenever some path needs to be pushed down
    # put the receiving dataset as a components to process into the
    # respective superdataset -- this will enable further processing
    # of all datasets in a completely independent fashion
    # (except for order of processing)

    # get all existing subdataset as candidate nodes of the graph
    # that needs to be built and checked
    subds_graph = Dataset(superds_path).get_subdatasets(
        absolute=True, recursive=True, edges=True, fulfilled=True)
    if not subds_graph:
        # no subdatasets, nothing to sort
        return
    for t in target_subs:
        trace = get_trace(
            subds_graph,
            superds_path,
            t)
        if not trace:
            # not connected, or identical
            continue
        tosort = [superds_path] + trace + [t]
        # loop over all but the last one, simplifies logic below
        for i, d in enumerate(tosort[:-1]):
            paths = spec.get(d, [])
            keep_paths = []
            next_ds = tosort[i + 1]
            next_dspaths = spec.get(next_ds, [])
            comp = _with_sep(next_ds)
            for p in assure_list(paths):
                if p.startswith(comp):
                    # the path belongs under the next-deeper subdataset:
                    # hand it down ...
                    next_dspaths.append(p)
                    # remember that we pushed the path into this dataset
                    # (the subdataset itself becomes a component of `d`)
                    keep_paths.append(next_ds)
                else:
                    keep_paths.append(p)
            spec[next_ds] = next_dspaths
            spec[d] = keep_paths
    # tidy up -- deduplicate
    # (a subdataset may have been appended once per pushed-down path)
    for c in spec:
        spec[c] = list(set(spec[c]))
def results_from_paths(paths, action=None, type=None, logger=None, refds=None,
                       status=None, message=None):
    """
    Helper to yield analog result dicts for each path in a sequence.

    Parameters
    ----------
    paths : str or sequence of str
      Path(s) to generate result records for.  A single path is treated as a
      one-element sequence.
    action : str, optional
      Result 'action' label.
    type : str, optional
      Result 'type' label (shadows the builtin by established convention of
      the result-record interface).
    logger : logging.Logger, optional
      Passed through to `get_status_dict`.
    refds : str, optional
      Reference dataset path.
    status : str, optional
      Result status, e.g. 'ok' or 'error'.
    message : str, optional
      A result message.  May contain `%s` which will be replaced by the
      respective `path`.

    Returns
    -------
    generator
    """
    for p in assure_list(paths):
        yield get_status_dict(
            action, path=p, type=type, logger=logger, refds=refds,
            status=status,
            # guard against the default `message=None`: `'%s' in None`
            # would raise TypeError
            message=(message, p) if message and '%s' in message else message)
def put(self, source, destination, recursive=False, preserve_attrs=False):
    """Copies source file/folder to destination on the remote.

    Note: this method performs escaping of filenames to an extent that
    moderately weird ones should work (spaces, quotes, pipes, other
    characters with special shell meaning), but more complicated cases
    might require appropriate external preprocessing of filenames.

    Parameters
    ----------
    source : str or list
      file/folder path(s) to copy from on local
    destination : str
      file/folder path to copy to on remote
    recursive : bool
      flag to enable recursive copying of given sources
    preserve_attrs : bool
      preserve modification times, access times, and modes from the
      original file

    Returns
    -------
    str
      stdout, stderr of the copy operation.
    """
    # make sure we have an open connection, will test if action is needed
    # by itself
    self.open()
    # base scp invocation with the requested flags
    cmd = self._get_scp_command_spec(recursive, preserve_attrs)
    # one or more local source paths
    cmd.extend(assure_list(source))
    # remote target in scp's host:path syntax, with filename escaping
    remote_target = '%s:%s' % (
        self.sshri.hostname,
        _quote_filename_for_scp(destination),
    )
    cmd.append(remote_target)
    return self.runner.run(cmd)
def is_result_matching_pathsource_argument(res, **kwargs):
    """Decide whether a result record matches the path/source call arguments.

    Callers either pass any number of "paths" (which could be anything), or
    exactly one path plus one source.  No error checking happens here; the
    command itself is responsible for that.
    """
    source = kwargs.get('source', None)
    if source is not None:
        # a Dataset instance given as 'source' is matched by its path
        if isinstance(source, Dataset):
            source = source.path
        # with a source present, it must be recorded in the result,
        # otherwise this is not what we are looking for
        return source == res.get('source_url', None)

    # what remains is a potentially heterogeneous list of paths/URLs
    paths = assure_list(kwargs.get('path', []))
    respath = res.get('path', None)

    # case 1: an absolute input path matches the result path directly
    if respath in paths:
        return True
    # case 2: input was relative to a reference dataset -- compare the
    # result's dataset-relative path (not 100% exhaustive, but good enough)
    if kwargs.get('dataset', None) and YieldRelativePaths()(res) in paths:
        return True
    # case 3: a relative input path that uses PWD as the reference --
    # absolutify and compare
    if any(robust_abspath(p) == respath for p in paths):
        return True
    # finally: the result may have been installed from a URL that was
    # given as a "path" -- accept that too
    if any(p == res.get('source_url', None) for p in paths):
        return True
    return False
def add(self, var, value, where='dataset', reload=True):
    """Add a configuration variable and value

    Parameters
    ----------
    var : str
      Variable name including any section like `git config` expects them, e.g.
      'core.editor'
    value : str
      Variable value
    %s"""
    # NOTE: the trailing `%s` in the docstring above is a functional
    # placeholder filled in by a decorator -- keep it intact
    if where == 'override':
        from datalad.utils import assure_list
        # fold the new value into any previously set override for this
        # variable: a single value stays scalar, multiple become a list
        val = assure_list(self.overrides.pop(var, None))
        val.append(value)
        self.overrides[var] = val[0] if len(val) == 1 else val
        if reload:
            self.reload(force=True)
        # overrides never hit `git config` -- done here
        return

    # all other targets go through `git config --add`
    self._run(['--add', var, value], where=where, reload=reload,
              log_stderr=True)
def get_query(self, query):
    """Parse query term(s) into compiled search expressions.

    Each term may be a plain pattern, or a ``field:pattern`` /
    ``'field':pattern`` specification.  Field-based terms yield a dict of
    compiled expressions keyed by match-group name; plain terms yield a
    single compiled expression.  An empty field spec (``:pattern``) is
    collapsed back into a plain query, which forcibly disables field-based
    search, e.g. when searching for a value.
    """
    terms = assure_list(query)
    plain_spec = re.compile(r"(?P<field>\S*?):(?P<query>.*)")
    quoted_spec = re.compile(r"'(?P<field>[^']+?)':(?P<query>.*)")
    parsed = []
    for term in terms:
        m = plain_spec.match(term) or quoted_spec.match(term)
        if m is None:
            # no field spec -- a simple pattern over everything
            parsed.append(re.compile(self._xfm_query(term)))
            continue
        compiled = {
            k: re.compile(self._xfm_query(v))
            for k, v in m.groupdict().items()
        }
        if compiled['field'].pattern == '':
            # "empty" field spec -> simple query
            parsed.append(compiled['query'])
        else:
            parsed.append(compiled)
    return parsed
def __call__(dataset, filename='README.md', existing='skip'):
    """Generate (or extend) a README file for a dataset from its metadata.

    Yields result records; the file is written and then saved via
    ``dataset.save``.
    """
    from os.path import lexists
    from os.path import join as opj
    from io import open
    import logging
    lgr = logging.getLogger('datalad.plugin.add_readme')

    from datalad.distribution.dataset import require_dataset
    from datalad.utils import assure_list

    dataset = require_dataset(dataset, check_installed=True,
                              purpose='add README')

    filename = opj(dataset.path, filename)
    res_kwargs = dict(action='add_readme', path=filename)

    if lexists(filename) and existing == 'skip':
        yield dict(
            res_kwargs,
            status='notneeded',
            message='file already exists, and not appending content')
        return

    # unlock, file could be annexed
    if lexists(filename):
        dataset.unlock(filename)

    # get any metadata on the dataset itself
    dsinfo = dataset.metadata(
        '.', reporton='datasets', return_type='item-or-list',
        on_failure='ignore')
    meta = {}
    if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok':
        # NOTE(review): lgr.warn is a deprecated alias of lgr.warning
        lgr.warn("Could not obtain dataset metadata, proceeding without")
        dsinfo = {}
    else:
        # flatten possibly existing multiple metadata sources
        for src in dsinfo['metadata']:
            if src.startswith('@'):
                # not a source
                continue
            meta.update(dsinfo['metadata'][src])

    # assemble the optional sections of the README from available metadata
    metainfo = ''
    for label, content in (
            ('', meta.get('description', meta.get('shortdescription', ''))),
            ('Author{}'.format(
                's' if isinstance(meta.get('author', None), list) else ''),
             u'\n'.join([
                 u'- {}'.format(a)
                 for a in assure_list(meta.get('author', []))])),
            ('Homepage', meta.get('homepage', '')),
            ('Reference', meta.get('citation', '')),
            ('License', meta.get('license', '')),
            ('Keywords', u', '.join([
                u'`{}`'.format(k)
                for k in assure_list(meta.get('tag', []))])),
            ('Funding', meta.get('fundedby', '')),
            ):
        if label and content:
            metainfo += u'\n\n### {}\n\n{}'.format(label, content)
        elif content:
            # unlabeled content (the description) goes in without a heading
            metainfo += u'\n\n{}'.format(content)

    # pick a title from the first available candidate key
    for key in 'title', 'name', 'shortdescription':
        if 'title' in meta:
            break
        if key in meta:
            meta['title'] = meta[key]

    default_content=u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(
        title='Dataset "{}"'.format(meta['title'])
        if 'title' in meta else 'About this dataset',
        metainfo=metainfo,
        id=u' (id: {})'.format(dataset.id) if dataset.id else '',
    )

    with open(filename, 'a' if existing == 'append' else 'w',
              encoding='utf-8') as fp:
        fp.write(default_content)

    yield dict(
        status='ok',
        path=filename,
        type='file',
        action='add_readme')

    for r in dataset.save(
            filename,
            message='[DATALAD] added README',
            result_filter=None,
            result_xfm=None):
        yield r
def __call__(name=None, url=None,
             dataset=None,
             pushurl=None, recursive=False, fetch=False, force=False,
             as_common_datasrc=None, publish_depends=None,
             publish_by_default=None):
    """Register a sibling remote for a dataset (and optionally its subdatasets).

    Returns the list of repo keys for which the sibling was (re)configured.
    """
    # TODO: Detect malformed URL and fail?
    # XXX possibly fail if fetch is False and as_common_datasrc
    # not yet sure if that is an error
    if name is None or (url is None and pushurl is None):
        raise ValueError("""insufficient information to add a sibling (needs at least a dataset, a name and an URL).""")
    if url is None:
        # with only a pushurl given, it doubles as the fetch url
        url = pushurl

    ds = require_dataset(dataset, check_installed=True,
                         purpose='sibling addition')
    assert(ds.repo is not None)

    ds_basename = basename(ds.path)
    # ordered mapping of a per-repo key to its repo object and URLs
    repos = OrderedDict()
    repos[ds_basename] = {'repo': ds.repo}

    if recursive:
        for subds_name in ds.get_subdatasets(recursive=True):
            subds_path = opj(ds.path, subds_name)
            subds = Dataset(subds_path)
            lgr.debug("Adding sub-dataset %s for adding a sibling",
                      subds_path)
            if not subds.is_installed():
                lgr.info("Skipping adding sibling for %s since it "
                         "is not installed", subds)
                continue
            repos[ds_basename + '/' + subds_name] = {
                #repos[subds_name] = {
                'repo': GitRepo(subds_path, create=False)
            }

    # Note: This is copied from create_sibling
    # as it is the same logic as for its target_dir.
    # TODO: centralize and generalize template symbol handling
    # TODO: Check pushurl for template symbols too. Probably raise if only
    # one of them uses such symbols

    replicate_local_structure = "%NAME" not in url

    # compute per-repo fetch/push URLs, either via %NAME templating or by
    # appending the relative sub-path to the base URL
    for repo_name in repos:
        repo = repos[repo_name]
        if not replicate_local_structure:
            repo['url'] = url.replace("%NAME",
                                      repo_name.replace("/", "-"))
            if pushurl:
                repo['pushurl'] = pushurl.replace(
                    "%NAME", repo_name.replace("/", "-"))
        else:
            repo['url'] = url
            if pushurl:
                repo['pushurl'] = pushurl

            if repo_name != ds_basename:
                repo['url'] = _urljoin(
                    repo['url'], repo_name[len(ds_basename) + 1:])
                if pushurl:
                    repo['pushurl'] = _urljoin(
                        repo['pushurl'], repo_name[len(ds_basename) + 1:])

    # define config var name for potential publication dependencies
    depvar = 'remote.{}.datalad-publish-depends'.format(name)

    # collect existing remotes:
    already_existing = list()
    conflicting = list()
    for repo_name in repos:
        repoinfo = repos[repo_name]
        repo = repoinfo['repo']
        if name in repo.get_remotes():
            already_existing.append(repo_name)
            # NOTE(review): the trailing "" below looks like a leftover of an
            # edit -- harmless (empty-string concatenation), but worth cleanup
            lgr.debug("Remote '{0}' already exists "
                      "in '{1}'.""".format(name, repo_name))
            existing_url = repo.get_remote_url(name)
            existing_pushurl = \
                repo.get_remote_url(name, push=True)
            # a remote of the same name counts as conflicting when any of
            # its URLs or the publication dependencies would change
            if (not existing_url or
                    repoinfo['url'].rstrip('/') != existing_url.rstrip('/')) \
                    or (pushurl and existing_pushurl and
                        repoinfo['pushurl'].rstrip('/') !=
                        existing_pushurl.rstrip('/')) \
                    or (pushurl and not existing_pushurl) \
                    or (publish_depends and
                        set(ds.config.get(depvar, [])) != set(publish_depends)):
                conflicting.append(repo_name)

    if not force and conflicting:
        raise RuntimeError("Sibling '{0}' already exists with conflicting"
                           " URL for {1} dataset(s). {2}".format(
                               name, len(conflicting), conflicting))

    successfully_added = list()
    for repo_name in repos:
        repoinfo = repos[repo_name]
        repo = repoinfo['repo']
        if repo_name in already_existing:
            if repo_name not in conflicting:
                lgr.debug("Skipping {0}. Nothing to do.".format(repo_name))
                continue
            # rewrite url
            repo.set_remote_url(name, repoinfo['url'])
        else:
            # add the remote
            repo.add_remote(name, repoinfo['url'])
        if pushurl:
            repo.set_remote_url(name, repoinfo['pushurl'], push=True)
        if fetch:
            # fetch the remote so we are up to date
            lgr.debug("Fetching sibling %s of %s", name, repo_name)
            repo.fetch(name)

        if publish_depends:
            if depvar in ds.config:
                # config vars are incremental, so make sure we start from
                # scratch
                ds.config.unset(depvar, where='local', reload=False)
            for d in assure_list(publish_depends):
                lgr.info(
                    'Configure additional publication dependency on "%s"',
                    d)
                ds.config.add(depvar, d, where='local', reload=False)
            ds.config.reload()

        if publish_by_default:
            for refspec in assure_list(publish_by_default):
                lgr.info(
                    'Configure additional default publication refspec "%s"',
                    refspec)
                ds.config.add("remote.{}.push".format(name), refspec, 'local')
            ds.config.reload()

        assert isinstance(repo, GitRepo)  # just against silly code
        if isinstance(repo, AnnexRepo):
            # we need to check if added sibling an annex, and try to enable it
            # another part of the fix for #463 and #432
            try:
                if not ds.config.obtain(
                        'remote.{}.annex-ignore'.format(name),
                        default=False,
                        valtype=EnsureBool(),
                        store=False):
                    repo.enable_remote(name)
            except CommandError as exc:
                lgr.info("Failed to enable annex remote %s, "
                         "could be a pure git" % name)
                lgr.debug("Exception was: %s" % exc_str(exc))
            if as_common_datasrc:
                ri = RI(repoinfo['url'])
                if isinstance(ri, URL) and ri.scheme in ('http', 'https'):
                    # XXX what if there is already a special remote
                    # of this name? Above check for remotes ignores special
                    # remotes. we need to `git annex dead REMOTE` on
                    # reconfigure before we can init a new one
                    # XXX except it is not enough

                    # make special remote of type=git (see #335)
                    repo._run_annex_command(
                        'initremote',
                        annex_options=[
                            as_common_datasrc,
                            'type=git', 'location={}'.format(repoinfo['url']),
                            'autoenable=true'])
                else:
                    lgr.warning(
                        'Not configuring "%s" as a common data source, '
                        'URL protocol is not http or https',
                        name)
        successfully_added.append(repo_name)

    return successfully_added
def __call__(
        path=None,
        source=None,
        dataset=None,
        get_data=False,
        description=None,
        recursive=False,
        recursion_limit=None,
        if_dirty='save-before',
        save=True,
        reckless=False,
        git_opts=None,
        git_clone_opts=None,
        annex_opts=None,
        annex_init_opts=None,
        jobs=None):
    """Install a dataset (or its components) from `source` and/or `path`.

    Dispatches to recursive install/get machinery; returns installed
    item(s) via ``Install._handle_and_return_installed_items``.
    """
    # normalize path argument to be equal when called from cmdline and
    # python and nothing was passed into `path`
    path = assure_list(path)

    if not source and not path:
        raise InsufficientArgumentsError(
            "Please provide at least a source or a path")

    ## Common kwargs to pass to underlying git/install calls.
    #  They might need adjustments (e.g. for recursion_limit, but
    #  otherwise would be applicable throughout
    #
    # There should have been more of common options!
    # since underneath get could do similar installs, but now they
    # have duplicated implementations which differ (e.g. get does not
    # annex init installed annexes)
    common_kwargs = dict(
        get_data=get_data,
        recursive=recursive,
        recursion_limit=recursion_limit,
        git_opts=git_opts,
        annex_opts=annex_opts,
        reckless=reckless,
        jobs=jobs,
    )

    installed_items = []
    failed_items = []

    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    ds = None
    if dataset is not None:
        ds = require_dataset(dataset, check_installed=True,
                             purpose='installation')
        handle_dirty_dataset(ds, if_dirty)

    # switch into scenario without --source:
    if source is None:
        # we need to collect URLs and paths
        to_install = []
        to_get = []
        for urlpath in path:
            ri = RI(urlpath)
            (to_get if isinstance(ri, PathRI) else to_install).append(urlpath)

        common_kwargs['dataset'] = dataset

        # first install, and then get
        for s in to_install:
            lgr.debug("Install passes into install source=%s", s)
            try:
                # recursive self-invocation with a single source each
                result = Install.__call__(
                    source=s,
                    description=description,
                    if_dirty=if_dirty,
                    save=save,
                    git_clone_opts=git_clone_opts,
                    annex_init_opts=annex_init_opts,
                    **common_kwargs
                )
                installed_items += assure_list(result)
            except Exception as exc:
                lgr.warning("Installation of %s has failed: %s",
                            s, exc_str(exc))
                failed_items.append(s)

        if to_get:
            lgr.debug("Install passes into get %d items", len(to_get))
            # all commented out hint on inability to pass those options
            # into underlying install-related calls.
            # Also need to pass from get:
            #  annex_get_opts
            try:
                installed_datasets = Get.__call__(
                    to_get,
                    # description=description,
                    # if_dirty=if_dirty,
                    # save=save,
                    # git_clone_opts=git_clone_opts,
                    # annex_init_opts=annex_init_opts
                    _return_datasets=True,
                    **common_kwargs
                )
            except IncompleteResultsError as exc:
                exc_str_ = ': ' + exc_str(exc) if exc.results else ''
                lgr.warning("Some items failed to install: %s", exc_str_)
                installed_datasets = exc.results
                failed_items.extend(exc.failed)

            # compose content_by_ds into result
            for dspath in installed_datasets:
                ds_ = Dataset(dspath)
                if ds_.is_installed():
                    installed_items.append(ds_)
                else:
                    lgr.warning("%s was not installed", ds_)

        return Install._handle_and_return_installed_items(
            ds, installed_items, failed_items, save)

    if source and path and len(path) > 1:
        raise ValueError(
            "install needs a single PATH when source is provided.  "
            "Was given mutliple PATHs: %s" % str(path))

    # parameter constraints:
    if not source:
        raise InsufficientArgumentsError(
            "a `source` is required for installation")

    # code below deals with a single path only
    path = path[0] if path else None

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        # NOTE(review): `%s` with `.format(path)` -- the placeholder is
        # never substituted; should be `{}` or %-formatting
        raise ValueError(
            "installation `source` and destination `path` are identical. "
            "If you are trying to add a subdataset simply use `save` %s".format(
                path))

    # resolve the target location (if local) against the provided dataset
    # or CWD:
    if path is not None:
        # Should work out just fine for regular paths, so no additional
        # conditioning is necessary
        try:
            path_ri = RI(path)
        except Exception as e:
            raise ValueError(
                "invalid path argument {}: ({})".format(path, exc_str(e)))
        try:
            # Wouldn't work for SSHRI ATM, see TODO within SSHRI
            # yoh: path should be a local path, and mapping note within
            #      SSHRI about mapping localhost:path to path is kinda
            #      a peculiar use-case IMHO
            path = resolve_path(path_ri.localpath, dataset)
            # any `path` argument that point to something local now
            # resolved and is no longer a URL
        except ValueError:
            # URL doesn't point to a local something
            # so we have an actual URL in `path`. Since this is valid as a
            # single positional argument, `source` has to be None at this
            # point.
            if is_datalad_compat_ri(path) and source is None:
                # we have an actual URL -> this should be the source
                lgr.debug(
                    "Single argument given to install, that doesn't seem to "
                    "be a local path. "
                    "Assuming the argument identifies a source location.")
                source = path
                path = None
            else:
                # `path` is neither a valid source nor a local path.
                # TODO: The only thing left is a known subdataset with a
                # name, that is not a path; Once we correctly distinguish
                # between path and name of a submodule, we need to consider
                # this.
                # For now: Just raise
                raise ValueError("Invalid path argument {0}".format(path))
    # `path` resolved, if there was any.

    # Possibly do conversion from source into a git-friendly url
    # luckily GitRepo will undo any fancy file:/// url to make use of Git's
    # optimization for local clones....
    source = _get_git_url_from_source(source)
    lgr.debug("Resolved source: {0}".format(source))
    # TODO: we probably need to resolve source, if it is a local path;
    # expandpath, normpath, ... Where exactly is the point to do it?

    # derive target from source:
    if path is None:
        # we got nothing but a source. do something similar to git clone
        # and derive the path from the source and continue
        lgr.debug(
            "Neither dataset nor target installation path provided. "
            "Deriving destination path from given source %s",
            source)
        path = _get_installationpath_from_url(source)
        # since this is a relative `path`, resolve it:
        path = resolve_path(path, dataset)

    # there is no other way -- my intoxicated brain tells me
    assert(path is not None)

    lgr.debug("Resolved installation target: {0}".format(path))
    destination_dataset = Dataset(path)

    if destination_dataset.is_installed():
        # this should not be, check if this is an error, or a reinstall
        # from the same source
        # this is where we would have installed this from
        candidate_sources = _get_flexible_source_candidates(
            source, destination_dataset.path)
        # this is where it was installed from
        track_name, track_url = _get_tracking_source(destination_dataset)
        # NOTE(review): `or get_local_file_url(track_url)` is not compared
        # against `candidate_sources` -- it is truthy for any non-empty
        # track_url, so this branch likely triggers more often than intended
        if track_url in candidate_sources or get_local_file_url(track_url):
            # TODO: this one breaks "promise" assumptions of the repeated
            # invocations of install.
            # yoh thinks that we actually should be the ones to run update
            # (without merge) after basic
            # check that it is clean and up-to-date with its super dataset
            # and if so, not return here but continue with errands (recursive
            # installation and get_data) so we could provide the same
            # result if we rerun the same install twice.
            lgr.info(
                "%s was already installed from %s. Use `update` to obtain "
                "latest updates, or `get` or `install` with a path, not URL, "
                "to (re)fetch data and / or subdatasets",
                destination_dataset, track_url)
            return destination_dataset
        else:
            # NOTE(review): extra args to ValueError are not interpolated
            # into the message (%s stays literal); should format the string
            raise ValueError("There is already a dataset installed at the "
                             "destination: %s", destination_dataset)

    ###########
    # we should know everything necessary by now
    # actual installation starts
    ###########

    # FLOW GUIDE:
    # four cases:
    # 1. install into a dataset
    #   1.1. we install a known subdataset
    #        => git submodule update --init
    #   1.2. we install an existing repo as a subdataset inplace
    #        => git submodule add + magic
    #   1.3. we (recursively) try to install implicit subdatasets between
    #        ds and path
    #   1.4. we install a new subdataset from an explicit source
    #        => git submodule add
    # 2. we "just" install from an explicit source
    #    => git clone

    if ds is not None:
        # FLOW GUIDE: 1.

        # express the destination path relative to the root of
        # the dataset
        relativepath = relpath(path, start=ds.path)
        if relativepath.startswith(pardir):
            raise ValueError("installation path outside dataset "
                             "({0})".format(path))
        lgr.debug("Resolved installation target relative to dataset "
                  "{0}: {1}".format(ds, relativepath))

        # FLOW_GUIDE 1.4.
        # NOTE(review): '{0}' is used twice -- the second one should be
        # '{1}' so that `relativepath` is actually shown
        lgr.info("Installing subdataset from '{0}' at: {0}".format(
            source, relativepath))
        destination_dataset = _install_subds_from_flexible_source(
            ds,
            relativepath,
            source,
            reckless)
    else:
        # FLOW GUIDE: 2.
        lgr.info("Installing dataset at {0} from {1}".format(path, source))

        # Currently assuming there is nothing at the target to deal with
        # and rely on failures raising from the git call ...

        # We possibly need to consider /.git URL
        candidate_sources = _get_flexible_source_candidates(source)
        _clone_from_any_source(candidate_sources, destination_dataset.path)

    # FLOW GUIDE: All four cases done.
    if not destination_dataset.is_installed():
        # XXX  shouldn't we just fail!? (unless some explicit --skip-failing?)
        lgr.error("Installation failed.")
        return None

    _handle_possible_annex_dataset(destination_dataset, reckless)

    lgr.debug("Installation of %s done.", destination_dataset)

    if not destination_dataset.is_installed():
        # log error and don't report as installed item, but don't raise,
        # since we might be in a process of recursive installation where
        # a lot of other datasets can still be installed successfully.
        lgr.error("Installation of {0} failed.".format(destination_dataset))
    else:
        installed_items.append(destination_dataset)

    # we need to decrease the recursion limit, relative to
    # subdatasets now
    subds_recursion_limit = max(0, recursion_limit - 1) \
        if isinstance(recursion_limit, int) \
        else recursion_limit

    # Now, recursive calls:
    if recursive:
        if description:
            # yoh: why?  especially if we somehow allow for templating them
            # with e.g. '%s' to catch the subdataset path
            lgr.warning("Description can't be assigned recursively.")

        subs = destination_dataset.get_subdatasets(
            # yes, it does make sense to combine no recursion with
            # recursion_limit: when the latter is 0 we get no subdatasets
            # reported, otherwise we always get the 1st-level subs
            recursive=False,
            recursion_limit=recursion_limit,
            absolute=False)

        if subs:
            lgr.debug("Obtaining subdatasets of %s: %s",
                      destination_dataset,
                      subs)

            kwargs = common_kwargs.copy()
            kwargs['recursion_limit'] = subds_recursion_limit
            rec_installed = Get.__call__(
                subs,  # all at once
                dataset=destination_dataset,
                # TODO expose this
                # yoh: exactly!
                #annex_get_opts=annex_get_opts,
                **kwargs
            )
            # TODO do we want to filter this so `install` only returns
            # the datasets?
            if isinstance(rec_installed, list):
                installed_items.extend(rec_installed)
            else:
                installed_items.append(rec_installed)

    if get_data:
        lgr.debug("Getting data of {0}".format(destination_dataset))
        kwargs = common_kwargs.copy()
        kwargs['recursive'] = False
        destination_dataset.get(curdir, **kwargs)

    return Install._handle_and_return_installed_items(
        ds, installed_items, failed_items, save)
def __call__(sshurl, name=None, target_dir=None,
             target_url=None, target_pushurl=None,
             dataset=None,
             recursive=False,
             recursion_limit=None,
             existing='error',
             shared=None,
             group=None,
             ui=False,
             as_common_datasrc=None,
             publish_by_default=None,
             publish_depends=None,
             annex_wanted=None, annex_group=None, annex_groupwanted=None,
             inherit=False,
             since=None):
    """Create a sibling of a dataset (and optionally its subdatasets) on an
    SSH-reachable machine, yielding one result record per dataset.
    """
    #
    # nothing without a base dataset
    #
    ds = require_dataset(dataset, check_installed=True,
                         purpose='creating a sibling')
    refds_path = ds.path

    #
    # all checks that are possible before we start parsing the dataset
    #

    # possibly use sshurl to get the name in case if not specified
    if not sshurl:
        if not inherit:
            raise InsufficientArgumentsError(
                "needs at least an SSH URL, if no inherit option"
            )
        if name is None:
            raise ValueError(
                "Neither SSH URL, nor the name of sibling to inherit from "
                "was specified"
            )
        # It might well be that we already have this remote setup
        try:
            sshurl = CreateSibling._get_remote_url(ds, name)
        except Exception as exc:
            lgr.debug('%s does not know about url for %s: %s',
                      ds, name, exc_str(exc))
    elif inherit:
        raise ValueError(
            "For now, for clarity not allowing specifying a custom sshurl "
            "while inheriting settings"
        )
        # may be could be safely dropped -- still WiP

    if not sshurl:
        # TODO: may be more back up before _prep?
        # derive the URL from the superdataset's remote of the same name
        super_ds = ds.get_superdataset()
        if not super_ds:
            raise ValueError(
                "Could not determine super dataset for %s to inherit URL"
                % ds
            )
        super_url = CreateSibling._get_remote_url(super_ds, name)
        # for now assuming hierarchical setup
        # (TODO: to be able to destinguish between the two, probably
        # needs storing datalad.*.target_dir to have %RELNAME in there)
        sshurl = slash_join(super_url, relpath(ds.path, super_ds.path))

    # check the login URL
    sshri = RI(sshurl)
    if not is_ssh(sshri):
        raise ValueError(
            "Unsupported SSH URL: '{0}', "
            "use ssh://host/path or host:path syntax".format(sshurl))

    if not name:
        # use the hostname as default remote name
        name = sshri.hostname
        lgr.debug(
            "No sibling name given, use URL hostname '%s' as sibling name",
            name)

    if since == '':
        # consider creating siblings only since the point of
        # the last update
        # XXX here we assume one to one mapping of names from local branches
        # to the remote
        active_branch = ds.repo.get_active_branch()
        since = '%s/%s' % (name, active_branch)

    #
    # parse the base dataset to find all subdatasets that need processing
    #
    to_process = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            # only a single path!
            path=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='create_sibling',
            # both next should not happen anyways
            unavailable_path_status='impossible',
            nondataset_path_status='error',
            modified=since,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent':
            # this can happen when there is `since`, but we have no
            # use for anything but datasets here
            continue
        checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
            if ap.get('state', None) != 'absent' \
            else []
        if publish_depends:
            # make sure dependencies are valid
            # TODO: inherit -- we might want to automagically create
            # those dependents as well???
            unknown_deps = set(assure_list(publish_depends)).difference(
                checkds_remotes)
            if unknown_deps:
                ap['status'] = 'error'
                ap['message'] = (
                    'unknown sibling(s) specified as publication dependency: %s',
                    unknown_deps)
                yield ap
                continue
        if name in checkds_remotes and existing in ('error', 'skip'):
            ap['status'] = 'error' if existing == 'error' else 'notneeded'
            ap['message'] = (
                "sibling '%s' already configured (specify alternative name, or force "
                "reconfiguration via --existing",
                name)
            yield ap
            continue
        to_process.append(ap)

    if not to_process:
        # we ruled out all possibilities
        # TODO wait for gh-1218 and make better return values
        lgr.info("No datasets qualify for sibling creation. "
                 "Consider different settings for --existing "
                 "or --since if this is unexpected")
        return

    if target_dir is None:
        if sshri.path:
            target_dir = sshri.path
        else:
            target_dir = '.'

    # TODO: centralize and generalize template symbol handling
    replicate_local_structure = "%RELNAME" not in target_dir

    # request ssh connection:
    lgr.info("Connecting ...")
    assert(sshurl is not None)  # delayed anal verification
    ssh = ssh_manager.get_connection(sshurl)
    if not ssh.get_annex_version():
        raise MissingExternalDependency(
            'git-annex',
            msg='on the remote system')

    #
    # all checks done and we have a connection, now do something
    #

    # loop over all datasets, ordered from top to bottom to make test
    # below valid (existing directories would cause the machinery to halt)
    # But we need to run post-update hook in depth-first fashion, so
    # would only collect first and then run (see gh #790)
    yielded = set()
    remote_repos_to_run_hook_for = []
    for currentds_ap in \
            sorted(to_process, key=lambda x: x['path'].count('/')):
        current_ds = Dataset(currentds_ap['path'])
        path = _create_dataset_sibling(
            name,
            current_ds,
            ds.path,
            ssh,
            replicate_local_structure,
            sshri,
            target_dir,
            target_url,
            target_pushurl,
            existing,
            shared,
            group,
            publish_depends,
            publish_by_default,
            ui,
            as_common_datasrc,
            annex_wanted,
            annex_group,
            annex_groupwanted,
            inherit
        )
        if not path:
            # nothing new was created
            # TODO is 'notneeded' appropriate in this case?
            currentds_ap['status'] = 'notneeded'
            # TODO explain status in 'message'
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        remote_repos_to_run_hook_for.append((path, currentds_ap))

        # publish web-interface to root dataset on publication server
        if current_ds.path == ds.path and ui:
            lgr.info("Uploading web interface to %s" % path)
            try:
                CreateSibling.upload_web_interface(path, ssh, shared, ui)
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to push web interface to the remote datalad repository (%s)",
                    exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue

    # in reverse order would be depth first
    lgr.info("Running post-update hooks in all created siblings")
    # TODO: add progressbar
    for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
        # Trigger the hook
        lgr.debug("Running hook for %s (if exists and executable)", path)
        try:
            ssh("cd {} "
                "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                "".format(sh_quote(_path_(path, ".git"))))
        except CommandError as e:
            currentds_ap['status'] = 'error'
            currentds_ap['message'] = (
                "failed to run post-update hook under remote path %s (%s)",
                path, exc_str(e))
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        if not currentds_ap['path'] in yielded:
            # if we were silent until now everything is just splendid
            currentds_ap['status'] = 'ok'
            yield currentds_ap
def __call__(dataset, filename='README.md', existing='skip'):
    """Add a README file to a dataset, populated from its metadata.

    Parameters
    ----------
    dataset : Dataset or path
      Dataset to add the README to; must be installed.
    filename : str
      Name (relative to the dataset root) of the README file to write.
    existing : {'skip', 'replace', 'append'}
      What to do if the file already exists: 'skip' yields a 'notneeded'
      result, 'append' appends to the file, anything else overwrites it.

    Yields
    ------
    dict
      DataLad result records for the README creation and the save.
    """
    from os.path import lexists
    from os.path import join as opj
    from io import open
    import logging
    lgr = logging.getLogger('datalad.plugin.add_readme')

    from datalad.distribution.dataset import require_dataset
    from datalad.utils import assure_list

    dataset = require_dataset(dataset, check_installed=True,
                              purpose='add README')

    filename = opj(dataset.path, filename)
    res_kwargs = dict(action='add_readme', path=filename)

    if lexists(filename) and existing == 'skip':
        yield dict(
            res_kwargs,
            status='notneeded',
            message='file already exists, and not appending content')
        return

    # unlock, file could be annexed
    if lexists(filename):
        dataset.unlock(filename)

    # get any metadata on the dataset itself
    dsinfo = dataset.metadata(
        '.', reporton='datasets', return_type='item-or-list',
        on_failure='ignore')
    meta = {}
    if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok':
        # FIX: use non-deprecated Logger.warning() (warn() is deprecated)
        lgr.warning("Could not obtain dataset metadata, proceeding without")
        dsinfo = {}
    else:
        # flatten possibly existing multiple metadata sources
        for src in dsinfo['metadata']:
            if src.startswith('@'):
                # not a source
                continue
            meta.update(dsinfo['metadata'][src])

    # assemble the optional sections of the README from whatever
    # metadata fields are present
    metainfo = ''
    for label, content in (
            ('', meta.get('description', meta.get('shortdescription', ''))),
            ('Author{}'.format(
                's' if isinstance(meta.get('author', None), list) else ''),
             u'\n'.join([
                 u'- {}'.format(a)
                 for a in assure_list(meta.get('author', []))])),
            ('Homepage', meta.get('homepage', '')),
            ('Reference', meta.get('citation', '')),
            ('License', meta.get('license', '')),
            ('Keywords', u', '.join([
                u'`{}`'.format(k)
                for k in assure_list(meta.get('tag', []))])),
            ('Funding', meta.get('fundedby', '')),
            ):
        if label and content:
            metainfo += u'\n\n### {}\n\n{}'.format(label, content)
        elif content:
            metainfo += u'\n\n{}'.format(content)

    # pick a title from the first matching metadata key
    for key in 'title', 'name', 'shortdescription':
        if 'title' in meta:
            break
        if key in meta:
            meta['title'] = meta[key]

    default_content = u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(
        title='Dataset "{}"'.format(meta['title'])
        if 'title' in meta else 'About this dataset',
        metainfo=metainfo,
        id=u' (id: {})'.format(dataset.id) if dataset.id else '',
    )

    with open(filename, 'a' if existing == 'append' else 'w',
              encoding='utf-8') as fp:
        fp.write(default_content)
    yield dict(
        status='ok',
        path=filename,
        type='file',
        action='add_readme')

    # save the (new) file into the dataset
    for r in dataset.rev_save(
            filename,
            message='[DATALAD] added README',
            result_filter=None,
            result_xfm=None):
        yield r
def _extract_metadata(agginto_ds, aggfrom_ds, db, to_save, objid, metasources,
                      refcommit, subds_relpaths, agg_base_path):
    """Run metadata extraction on one dataset and dump the results to disk.

    Extracts dataset-level and content-level metadata from ``aggfrom_ds``,
    writes one object file per metadata type into ``agg_base_path`` of the
    respective target dataset (taken from ``metasources``), updates the
    aggregation record for ``aggfrom_ds`` in ``db`` (mutated in place), and
    appends the written files to ``to_save``.

    Returns
    -------
    bool
      True if any extractor reported an error.
    """
    # NOTE(review): `agginto_ds` is not referenced in this body; the write
    # target comes from metasources[...]['targetds'] — confirm it is needed
    lgr.debug('Performing metadata extraction from %s', aggfrom_ds)
    # we will replace any conflicting info on this dataset with fresh stuff
    agginfo = db.get(aggfrom_ds.path, {})
    # paths to extract from
    relevant_paths = sorted(_get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    # get extractors to engage from source dataset
    # the two built-in extractors always run, plus any configured ones
    nativetypes = ['datalad_core', 'annex'] + assure_list(get_metadata_type(aggfrom_ds))
    # store esssential extraction config in dataset record
    agginfo['extractors'] = nativetypes
    agginfo['datalad_version'] = datalad.__version__

    # perform the actual extraction
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        nativetypes,
        # None indicates to honor a datasets per-extractor configuration and to be
        # on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)

    # 'cn' is a generator on purpose: content metadata can be large and is
    # only consumed once by the dumper below
    meta = {
        'ds': dsmeta,
        'cn': (dict(contentmeta[k], path=k) for k in sorted(contentmeta))
    }

    # inject the info which commmit we are describing into the core metadata
    # this is done here in order to avoid feeding it all the way down
    coremeta = dsmeta.get('datalad_core', {})
    version = aggfrom_ds.repo.describe(commitish=refcommit)
    if version:
        coremeta['version'] = version
    coremeta['refcommit'] = refcommit
    dsmeta['datalad_core'] = coremeta

    # for both types of metadata
    for label, props in metasources.items():
        dest = props['targetds']
        if not meta[label]:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label, props['dumper'])
        # place metadata object into the source dataset
        objpath = op.join(dest.path, agg_base_path, objrelpath)

        # write obj files
        if op.exists(objpath):
            # annexed content must be unlocked before overwriting
            dest.unlock(objpath)
        elif op.lexists(objpath):
            # if it gets here, we have a symlink that is pointing nowhere
            # kill it, to be replaced with the newly aggregated content
            dest.repo.remove(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        props['dumper'](meta[label], objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))

        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(props['type'])] = objpath

    # overwrite existing info with stuff from just finished extraction
    db[aggfrom_ds.path] = agginfo

    return errored
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
      Names of metadata extractors to engage.
    global_meta : bool or None
      Force dataset-level metadata on/off; None honors per-extractor config.
    content_meta : bool or None
      Force content-level metadata on/off; None honors per-extractor config.
    paths : list or None
      Restrict content metadata extraction to these paths.

    Returns
    -------
    (dict, dict, bool)
      Dataset metadata (by extractor), content metadata (by path), and an
      error flag.

    Raises
    ------
    ValueError
      If a requested extractor is not installed or fails to load.
    """
    errored = False
    dsmeta = dict()
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none -- nothing to do at all
        return dsmeta, contentmeta, errored

    context = {
        '@vocab': 'http://docs.datalad.org/schema_v{}.json'.format(
            vocabulary_version)}

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        # FIX: materialize the zip() -- it is iterated twice below (once to
        # filter `paths`, once to build the warning message); on Python 3 a
        # bare zip() iterator would be exhausted after the first pass and the
        # warning would silently list no files
        content_info = list(zip(paths,
                                ds.repo.file_has_content(paths),
                                ds.repo.is_under_annex(paths)))
        # only keep paths that are not annexed, or whose content is present
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file as no present
            # FIX: Logger.warning() (warn() is deprecated)
            lgr.warning(
                '{} files have no content present, '
                'some extractors will not operate on {}'.format(
                    nocontent,
                    'them' if nocontent > 10
                    else [p for p, c, a in content_info if not c and a])
            )

    # pull out potential metadata field blacklist config settings
    blacklist = [re.compile(bl) for bl in assure_list(ds.config.obtain(
        'datalad.metadata.aggregate-ignore-fields',
        default=[]))]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from pkg_resources import iter_entry_points  # delayed heavy import
    extractors = {ep.name: ep
                  for ep in iter_entry_points('datalad.metadata.extractors')}

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s', ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(
            lgr.info,
            'metadataextractors',
            'Engage %s metadata extractor', mtype_key,
            update=1,
            increment=True)
        if mtype_key not in extractors:
            # we said that we want to fail, rather then just moan about less metadata
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                'Enabled metadata extractor %s is not available in this installation',
                mtype_key)
        try:
            extractor_cls = extractors[mtype_key].load()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            # FIX: removed an unreachable `continue` that followed this raise
            raise ValueError(
                "Failed to load metadata extractor for '%s', "
                "broken dataset configuration (%s)?: %s",
                mtype, ds, exc_str(e))

        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta if global_meta is not None
                else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta if content_meta is not None
                else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s', mtype_key, ds,
                )
                raise
            errored = True
            # if we dont get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(
                    dsmeta_t,
                    maxsize=max_fieldsize,
                    blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude", set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        # log_progress but if they are all generators, we could provide generic
        # handling of the progress here.  Note also that log message is actually
        # seems to be ignored and not used, only the label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no cound is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store info that there was no metadata(e.g. to get a list of
            # files that have no metadata)
            # if there is an issue that a extractor needlessly produces empty records, the
            # extractor should be fixed and not a general switch. For example the datalad_core
            # issues empty records to document the presence of a file
            #elif not meta:
            #    continue

            # apply filters
            meta = _filter_metadata_fields(
                meta,
                maxsize=max_fieldsize,
                blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain(
                    'datalad.metadata.generate-unique-{}'.format(
                        mtype_key.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()):
                # go through content metadata and inject report of unique keys
                # and values into `dsmeta`
                for k, v in iteritems(meta):
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: list of samples data
                        # were recorded from. If the dataset has such under
                        # a 'sample' key, we should prefer that, over an
                        # aggregated list of a hopefully-kinda-ok structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the purpose
                        # of discovering whole datasets
                        # we keep the key (so we know that some file is providing this key),
                        # but ignore any value it came with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})

            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the callenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we known if some unique value
            # was a list, os opposed to a list of unique values

            def _ensure_serializable(val):
                if isinstance(val, ReadOnlyDict):
                    return {k: _ensure_serializable(v)
                            for k, v in iteritems(val)}
                if isinstance(val, (tuple, list)):
                    return [_ensure_serializable(v) for v in val]
                else:
                    return val

            ucp[mtype_key] = {
                k: [_ensure_serializable(i)
                    for i in sorted(
                        v,
                        key=_unique_value_key)] if v is not None else None
                for k, v in iteritems(unique_cm)
                # v == None (disable unique, but there was a value at some point)
                # otherwise we only want actual values, and also no single-item-lists
                # of a non-value
                # those contribute no information, but bloat the operation
                # (inflated number of keys, inflated storage, inflated search index, ...)
                if v is None or (v and not v == {''})}
            dsmeta['datalad_unique_content_properties'] = ucp

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s', ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        update_mode='target',
        incremental=False,
        force_extraction=False,
        save=True):
    """Aggregate metadata of one or more (sub)datasets into a base dataset.

    Yields DataLad result records: one per annotated path with an early
    status, one per dataset whose aggregate info was updated, and the
    results of the final save (if anything needed saving).

    Raises
    ------
    ValueError
      If ``update_mode`` is neither 'all' nor 'target'.
    """
    refds_path = Interface.get_refds_path(dataset)

    # it really doesn't work without a dataset
    ds = require_dataset(
        dataset, check_installed=True, purpose='metadata aggregation')
    path = assure_list(path)
    if not path:
        # then current/reference dataset is "aggregated"
        # We should not add ds.path always since then --recursive would
        # also recurse current even if paths are given
        path.append(ds.path)

    agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(ds)
    agginfo_db = load_ds_aggregate_db(ds, abspath=True)

    to_save = []
    to_aggregate = set()
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='aggregate_metadata',
            # uninstalled subdatasets could be queried via aggregated metadata
            # -> no 'error'
            unavailable_path_status='',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        ap_type = ap.get('type', None)
        ap_state = ap.get('state', None)
        assert('parentds' in ap or ap_type == 'dataset')
        if ap_type == 'dataset' and ap_state != 'absent':
            # a present dataset, we can take directly from it
            aggsrc = ap['path']
            lgr.info('Aggregate metadata for dataset %s', aggsrc)
        else:
            # everything else needs to come from the parent
            aggsrc = ap['parentds']
            if ap_state == 'absent':
                lgr.info(
                    'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                    ap['path'],
                    aggsrc)
            else:
                lgr.info(
                    'Aggregate metadata for %s from dataset at %s',
                    ap['path'],
                    aggsrc)

        to_aggregate.add(aggsrc)

        if ap_state == 'absent':
            # key thought: recursive is done by path annotation, hence
            # once we hit an absent dataset, we are 100% certain that
            # there is nothing to recurse into on the file system
            # hence we only have to look into the aggregated metadata
            # of the last available dataset in the dataset tree edge
            #
            # if there is nothing at this path, we need to look into the
            # parentds and check if we know anything about this path
            # if we do, we need to grab all the info and objects
            # if not, we need to error
            res = _get_dsinfo_from_aggmetadata(
                aggsrc, ap['path'], recursive, agginfo_db)
            if not isinstance(res, list):
                yield get_status_dict(
                    status='impossible',
                    message=res,
                    action='aggregate_metadata',
                    path=ap['path'],
                    logger=lgr)
                continue
            # cue for aggregation
            to_aggregate.update(res)
        else:
            # actually aggregate metadata for this dataset, immediately place
            # generated objects into the aggregated or reference dataset,
            # and put info into DB to get the distributed to all datasets
            # that need to be updated
            errored = _dump_extracted_metadata(
                ds,
                Dataset(aggsrc),
                agginfo_db,
                to_save,
                force_extraction,
                agg_base_path)
            if errored:
                yield get_status_dict(
                    status='error',
                    message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                    action='aggregate_metadata',
                    path=aggsrc,
                    logger=lgr)

    # at this point we have dumped all aggregated metadata into object files
    # somewhere, we know what needs saving, but having saved anything, and
    # we know about the states of all aggregated dataset in the DB

    # what remains to do is to update all dataset, so they have there own copy
    # of aggregated metadata and update their respective aggregate.json with
    # info on what states we just aggregated from

    # first, let's figure out what dataset need updating at all
    # get adjencency info of the dataset tree spanning the base to all leaf dataset
    # associated with the path arguments
    if update_mode == 'all':
        ds_adj = {}
        discover_dataset_trace_to_targets(
            ds.path, to_aggregate, [], ds_adj,
            # we know that to_aggregate only lists datasets, existing and
            # absent ones -- we want to aggregate all of them, either from
            # just extracted metadata, or from previously aggregated metadata
            # of the closest superdataset
            includeds=to_aggregate)
        # TODO we need to work in the info about dataset that we only got from
        # aggregated metadata, that had no trace on the file system in here!!
        subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
    elif update_mode == 'target':
        subtrees = {ds.path: list(agginfo_db.keys())}
    else:
        raise ValueError(
            "unknown `update_mode` '%s' for metadata aggregation", update_mode)

    # go over datasets in bottom-up fashion
    for parentds_path in sorted(subtrees, reverse=True):
        lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)

        _update_ds_agginfo(
            ds.path,
            parentds_path,
            subtrees[parentds_path],
            incremental,
            agginfo_db,
            to_save)
        # update complete
        res = get_status_dict(
            status='ok',
            action='aggregate_metadata',
            path=parentds_path,
            type='dataset',
            logger=lgr)
        res.update(agginfo_db.get(parentds_path, {}))
        yield res

    #
    # save potential modifications to dataset global metadata
    #
    if not to_save:
        return
    lgr.info('Attempting to save %i files/datasets', len(to_save))
    for res in Save.__call__(
            path=to_save,
            dataset=refds_path,
            message='[DATALAD] Dataset aggregate metadata update',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
def only_matching_paths(res, **kwargs):
    """Result filter: True if the result's 'path' is among ``kwargs['path']``."""
    # TODO handle relative paths by using a contained 'refds' value
    requested = assure_list(kwargs.get('path', []))
    return res.get('path', None) in requested
def _get_procedure_implementation(name='*', ds=None):
    """Yield matching procedures together with their configuration.

    Locations are searched in a fixed precedence order: user-level and
    system-level configuration first, then the dataset itself (and its
    subdatasets, recursively), then installed datalad extensions, and
    finally datalad's own procedures. This way local definitions take
    precedence over ones that come from outside (via a datalad extension
    or a dataset's .datalad/config), so adding or updating a (sub)dataset
    cannot silently override what is defined in ~/.gitconfig or the local
    repository's .git/config. More specific definitions likewise take
    precedence over more general ones.

    Yields
    ------
    tuple
      (path, name, format string, help message) per matching procedure
    """
    if not isinstance(ds, Dataset):
        ds = Dataset(ds) if ds else None

    # 1. user- and system-level locations come first
    for location in (cfg.obtain('datalad.locations.user-procedures'),
                     cfg.obtain('datalad.locations.system-procedures')):
        for procdir in assure_list(location):
            for match, label in _get_file_match(procdir, name):
                yield (match, label) + _get_proc_config(label)

    # 2. the dataset itself
    if ds is not None and ds.is_installed():
        # configuration may name more than one procedure directory
        for procdir in assure_list(
                ds.config.obtain('datalad.locations.dataset-procedures')):
            # TODO `get` dirs if necessary
            for match, label in _get_file_match(
                    op.join(ds.path, procdir), name):
                yield (match, label) + _get_proc_config(label, ds=ds)
        # 2.1. subdatasets, recursively
        for subds in ds.subdatasets(return_type='generator',
                                    result_xfm='datasets'):
            for hit in _get_procedure_implementation(name=name, ds=subds):
                yield hit

    # 3. installed datalad extensions
    # delay heavy import until here
    from pkg_resources import iter_entry_points
    from pkg_resources import resource_isdir
    from pkg_resources import resource_filename
    for entry_point in iter_entry_points('datalad.extensions'):
        # use of '/' here is OK wrt to platform compatibility
        if resource_isdir(entry_point.module_name, 'resources/procedures'):
            extension_dir = resource_filename(
                entry_point.module_name, 'resources/procedures')
            for match, label in _get_file_match(extension_dir, name):
                yield (match, label) + _get_proc_config(label)

    # 4. datalad's own procedures come last
    for match, label in _get_file_match(
            resource_filename('datalad', 'resources/procedures'), name):
        yield (match, label) + _get_proc_config(label)
def _get_submodules(dspath, fulfilled, recursive, recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
    """Yield result records for the submodules of a repository.

    Optionally edits ``.gitmodules`` (via ``set_property``/``delete_property``)
    before reporting, and recurses into present submodules when ``recursive``.
    ``bottomup`` controls whether a submodule is yielded before or after its
    children.
    """
    if not GitRepo.is_valid_repo(dspath):
        return
    modinfo = _parse_gitmodules(dspath)
    # write access parser
    parser = None
    # TODO bring back in more global scope from below once segfaults are
    # figured out
    #if set_property or delete_property:
    #    gitmodule_path = opj(dspath, ".gitmodules")
    #    parser = GitConfigParser(
    #        gitmodule_path, read_only=False, merge_includes=False)
    #    parser.read()
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(dspath):
        if contains and not path_startswith(contains, sm['path']):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        # enrich the plain submodule record with .gitmodules properties
        sm.update(modinfo.get(sm['path'], {}))
        if set_property or delete_property:
            # NOTE(review): a fresh parser is created per submodule (see TODO
            # above re segfaults); each iteration re-reads .gitmodules
            gitmodule_path = opj(dspath, ".gitmodules")
            parser = GitConfigParser(
                gitmodule_path, read_only=False, merge_includes=False)
            parser.read()
            # do modifications now before we read the info out for reporting
            # use 'submodule "NAME"' section ID style as this seems to be the default
            submodule_section = 'submodule "{}"'.format(sm['gitmodule_name'])
            # first deletions
            for dprop in assure_list(delete_property):
                parser.remove_option(submodule_section, dprop)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(
                            sm,
                            refds_relpath=relpath(sm['path'], refds_path),
                            refds_relname=relpath(
                                sm['path'], refds_path).replace(os.sep, '-')))
                parser.set_value(
                    submodule_section,
                    prop,
                    val)
                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val
            Dataset(dspath).add(
                '.gitmodules', to_git=True,
                message='[DATALAD] modified subdataset properties')
            # let go of resources, locks, ...
            parser.release()

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict(
            'subdataset',
            status='ok',
            type='dataset',
            logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        # top-down: report the submodule before its children
        if not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres

        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing') or
                 (isinstance(recursion_limit, int) and
                  recursion_limit > 1)):
            for r in _get_submodules(
                    sm['path'],
                    fulfilled, recursive,
                    (recursion_limit - 1)
                    if isinstance(recursion_limit, int)
                    else recursion_limit,
                    contains,
                    bottomup,
                    set_property,
                    delete_property,
                    refds_path):
                yield r
        # bottom-up: report the submodule after its children
        if bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres
    if parser is not None:
        # release parser lock manually, auto-cleanup is not reliable in PY3
        parser.release()
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        action=None,
        unavailable_path_status='',
        unavailable_path_msg=None,
        nondataset_path_status='error',
        force_parentds_discovery=True,
        force_subds_discovery=True,
        force_no_revision_change_discovery=True,
        force_untracked_discovery=True,
        modified=None):
    """Annotate the given paths with dataset-related information.

    For every requested path a result record is yielded that carries (where
    discoverable) 'type', 'state', 'parentds', and 'registered_subds'
    properties. With ``recursive`` subdatasets are reported too; with
    ``modified`` only paths modified relative to that revision are reported.

    Raises
    ------
    ValueError
      If subdataset discovery is requested without parent dataset
      discovery, or if ``modified`` is given without a usable base dataset.
    """
    # upfront check for the fastest possible response
    if not path and dataset is None:
        # nothing given, try "here", but do not use `require_dataset`, as
        # it will determine the root dataset of `curdir` and further down
        # lead to path annotation of upstairs directories
        dataset = curdir

    if force_subds_discovery and not force_parentds_discovery:
        raise ValueError(
            'subdataset discovery requires parent dataset discovery')

    # CONCEPT: yield with no status to indicate further processing

    # everything in one big loop to be able too yield as fast a possible
    # without any precomputing for all paths
    refds_path = Interface.get_refds_path(dataset)
    if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)):
        raise ValueError(
            "modification detection only works with a base dataset (non-given or found)")

    # prep common result props
    res_kwargs = dict(
        action=action if action else 'annotate_path',
        refds=refds_path,
        logger=lgr)

    # handle the case of recursion into a single dataset without any
    # extra fancy processing first -- full recursion can be done
    # faster than manual recursion, hence we gain quite some speed
    # from these few lines of extra code
    if not modified and not path and refds_path:
        if not GitRepo.is_valid_repo(refds_path):
            yield get_status_dict(
                # doesn't matter if the path is in another dataset
                # it was given as reference dataset
                status=nondataset_path_status,
                message='given reference dataset is not a dataset',
                path=refds_path,
                **res_kwargs)
            return

        refds = Dataset(refds_path)
        path = []
        # yield the dataset itself
        r = get_status_dict(ds=refds, status='', **res_kwargs)
        yield r

        if recursive:
            # if we have nothing given, but need recursion, we need to feed
            # the dataset path itself
            for r in yield_recursive(
                    refds,
                    refds_path,
                    action,
                    recursion_limit):
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                yield r
        return

    # goal: structure in a way that makes most information on any path
    # available in a single pass, at the cheapest possible cost
    reported_paths = {}
    requested_paths = assure_list(path)

    if modified is not None:
        # modification detection would silently kill all nondataset paths
        # but we have to complain about them, hence doing it here
        if requested_paths and refds_path:
            for r in requested_paths:
                p = r['path'] if isinstance(r, dict) else r
                p = resolve_path(p, ds=refds_path)
                if path_startswith(p, refds_path):
                    # all good
                    continue
                # not the refds
                path_props = r if isinstance(r, dict) else {}
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with reference dataset'
                reported_paths[r] = res
                yield res

        # preserve non-existing paths to be silently killed by modification
        # detection and append them to requested_paths again after detection.
        # TODO: This might be melted in with treatment of non dataset paths
        # above. Re-appending those paths seems to be better than yielding
        # directly to avoid code duplication, since both cases later on are
        # dealt with again.
        preserved_paths = []
        if requested_paths:
            [preserved_paths.append(r)
             for r in requested_paths
             if not lexists(r['path'] if isinstance(r, dict) else r)]

        # replace the requested paths by those paths that were actually
        # modified underneath or at a requested location
        requested_paths = get_modified_subpaths(
            # either the request, or the base dataset, if there was no request
            requested_paths if requested_paths else [refds_path],
            refds=Dataset(refds_path),
            revision=modified,
            report_no_revision_change=force_no_revision_change_discovery,
            report_untracked='all' if force_untracked_discovery else 'no',
            recursion_limit=recursion_limit)

        from itertools import chain
        # re-append the preserved paths:
        requested_paths = chain(requested_paths, iter(preserved_paths))

    # do not loop over unique(), this could be a list of dicts
    # we avoid duplicates manually below via `reported_paths`
    for path in requested_paths:
        if not isinstance(path, dict):
            path = rawpath2ap(path, refds_path)
        # this is now an annotated path!
        path_props = path
        path = path['path']
        # we need to mark our territory, who knows where this has been
        path_props.update(res_kwargs)

        if path in reported_paths:
            # we already recorded this path in the output
            # this can happen, whenever `path` is a subdataset, that was
            # discovered via recursive processing of another path before
            continue
        # the path exists in some shape or form
        # TODO if we have path_props already we could skip this test
        if isdir(path):
            # keep any existing type info, previously a more expensive run
            # could have discovered an uninstalled 'dataset', and we don't
            # want it to be relabeled to a directory
            path_props['type'] = \
                path_props.get(
                    'type',
                    'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
            # this could contain all types of additional content
            containing_dir = path if not islink(path) else normpath(opj(path, pardir))
        else:
            if lexists(path):
                path_props['type'] = 'file'
            else:
                path_props['state'] = 'absent'
            # for everything else we are interested in the container
            containing_dir = dirname(path)
            if not containing_dir:
                containing_dir = curdir

        dspath = parent = get_dataset_root(containing_dir)
        if dspath:
            if path_props.get('type', None) == 'dataset':
                # for a dataset the root is not the parent, for anything else
                # it is
                parent = path_props.get('parentds', None)
                oneupdir = normpath(opj(containing_dir, pardir))
                if parent is None and (force_parentds_discovery or (
                        refds_path and _with_sep(oneupdir).startswith(
                            _with_sep(refds_path)))):
                    # either forced, or only if we have a reference dataset, and
                    # only if we stay within this refds when searching for the
                    # parent
                    parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                    # NOTE the `and refds_path` is critical, as it will determine
                    # whether a top-level dataset that was discovered gets the
                    # parent property or not, it won't get it without a common
                    # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
            else:
                # set parent, but prefer existing property
                path_props['parentds'] = path_props.get('parentds', dspath)

        # test for `dspath` not `parent`, we only need to know whether there is
        # ANY dataset, not which one is the true parent, logic below relies on
        # the fact that we end here, if there is no dataset at all
        if not dspath:
            # not in any dataset
            res = get_status_dict(
                **dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = 'path not associated with any dataset'
            reported_paths[path] = res
            yield res
            continue

        # check that we only got SUBdatasets
        if refds_path and not path_startswith(dspath, refds_path):
            res = get_status_dict(**dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = \
                ('path not part of the reference dataset at %s', refds_path)
            reported_paths[path] = res
            yield res
            continue

        if path_props.get('type', None) == 'file':
            # nothing else we can learn about this
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res
            continue

        containing_ds = None
        path_type = path_props.get('type', None)
        if parent and force_subds_discovery and (
                (path_type == 'dataset' and 'registered_subds' not in path_props) or
                path_type == 'directory' or
                not lexists(path)):
            # if the path doesn't exist, or is labeled a directory, or a dataset even
            # a dataset (without this info) -> record whether this is a known subdataset
            # to its parent
            containing_ds = Dataset(parent)
            subdss = containing_ds.subdatasets(
                fulfilled=None, recursive=False,
                result_xfm=None, result_filter=None, return_type='list')
            if path in [s['path'] for s in subdss]:
                if path_type == 'directory' or not lexists(path):
                    # first record that it isn't here, if just a dir or not here at all
                    path_props['state'] = 'absent'
                # this must be a directory, and it is not installed
                path_props['type'] = 'dataset'
                path_props['registered_subds'] = True

        if not lexists(path) or \
                (path_props.get('type', None) == 'dataset' and
                 path_props.get('state', None) == 'absent'):
            # not there (yet)
            message = unavailable_path_msg if unavailable_path_msg else None
            if message and '%s' in message:
                message = (message, path)
            path_props['message'] = message
            res = get_status_dict(**dict(res_kwargs, **path_props))
            # assign given status, but only if the props don't indicate a status
            # already
            res['status'] = path_props.get(
                'status', unavailable_path_status)
            reported_paths[path] = res
            yield res
            continue

        # we know everything we can, report
        res = get_status_dict(**dict(res_kwargs, **path_props))
        if 'status' not in res:
            res['status'] = ''
        reported_paths[path] = res
        yield res

        rec_paths = []
        if recursive:
            # here we need to consider the special case that `path` is
            # a dataset itself, if a recursion_limit is given (e.g.
            # `remove` will do that by default), we need to recurse
            # from the dataset itself, and not its parent to get things
            # right -- this will also avoid needless discovery of
            # unrelated subdatasets
            if path_props.get('type', None) == 'dataset':
                containing_ds = Dataset(path)
            else:
                # regular parent, we might have a dataset already
                containing_ds = Dataset(parent) if containing_ds is None else containing_ds
            for r in yield_recursive(containing_ds, path, action, recursion_limit):
                # capture reported paths
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                reported_paths[r['path']] = r
                if modified is not None:
                    # we cannot yield right away, maybe it wasn't modified
                    rec_paths.append(r)
                else:
                    yield r
        if modified is not None and rec_paths:
            # replace the recursively discovered paths by those paths that
            # were actually modified underneath or at a requested location
            for r in get_modified_subpaths(
                    rec_paths,
                    refds=Dataset(refds_path),
                    revision=modified,
                    report_no_revision_change=force_no_revision_change_discovery,
                    report_untracked='all' if force_untracked_discovery else 'no',
                    recursion_limit=recursion_limit):
                res = get_status_dict(**dict(r, **res_kwargs))
                reported_paths[res['path']] = res
                yield res
    return
def __call__(path=None, message=None, dataset=None,
             version_tag=None,
             recursive=False, recursion_limit=None, updated=False,
             message_file=None,
             to_git=None,
             ):
    """Save changes in a dataset (hierarchy).

    Generator yielding result dictionaries (one per saved path/dataset).
    Paths are discovered and annotated via ``Status()``, grouped per
    containing dataset, and each dataset is saved bottom-up so that
    subdataset state changes are captured by their superdatasets.

    Raises
    ------
    ValueError
      If both `message` and `message_file` are given.
    """
    if message and message_file:
        raise ValueError(
            "Both a message and message file were specified for save()")

    path = assure_list(path)

    if message_file:
        # the file content wholesale becomes the commit message
        with open(message_file) as mfh:
            message = mfh.read()

    # we want 'normal' to achieve the most compact argument list
    # for git calls
    # untracked_mode = 'no' if updated else 'normal'
    # TODO however, Repo.add() would refuse to add any dotfiles
    # in a directory that is itself untracked, hence the only
    # choice is to go with potentially crazy long lists
    # until https://github.com/datalad/datalad/issues/1454
    # has a resolution
    untracked_mode = 'no' if updated else 'all'

    # there are three basic scenarios:
    # 1. save modifications to any already tracked content
    # 2. save any content (including removal of deleted content)
    #    to bring things to a clean state
    # 3. like (2), but only operate on a given subset of content
    #    identified by paths
    # - all three have to work in conjunction with --recursive
    # - the difference between (1) and (2) should be no more
    #   that a switch from --untracked=no to --untracked=all
    #   in Repo.save()

    # we do not support
    # - simultaneous operations on multiple datasets from disjoint
    #   dataset hierarchies, hence a single reference dataset must be
    #   identifiable from the either
    #   - curdir or
    #   - the `dataset` argument.
    #   This avoids complex annotation loops and hierarchy tracking.
    # - any modification upwards from the root dataset

    ds = require_dataset(dataset, check_installed=True, purpose='saving')

    # use status() to do all discovery and annotation of paths
    # maps parent dataset path -> {path: status-properties}
    paths_by_ds = {}
    for s in Status()(
            # ATTN: it is vital to pass the `dataset` argument as it,
            # and not a dataset instance in order to maintain the path
            # semantics between here and the status() call
            dataset=dataset,
            path=path,
            untracked=untracked_mode,
            recursive=recursive,
            recursion_limit=recursion_limit,
            result_renderer='disabled'):
        # fish out status dict for this parent dataset
        ds_status = paths_by_ds.get(s['parentds'], {})
        # reassemble path status info as repo.status() would have made it
        ds_status[ut.Path(s['path'])] = \
            {k: v for k, v in iteritems(s)
             if k not in (
                 'path', 'parentds', 'refds', 'status', 'action',
                 'logger')}
        paths_by_ds[s['parentds']] = ds_status

    lgr.debug('Determined %i datasets for saving from input arguments',
              len(paths_by_ds))
    # figure out what datasets to process, start with the ones containing
    # the paths that were given as arguments
    discovered_datasets = list(paths_by_ds.keys())
    if dataset:
        # if a reference dataset was given we want to save all the way up
        # to it, so let's throw it into the mix
        discovered_datasets.append(ds.path)
    # sort the datasets into (potentially) disjoint hierarchies,
    # or a single one, if a reference dataset was given
    dataset_hierarchies = get_tree_roots(discovered_datasets)
    for rootds, children in iteritems(dataset_hierarchies):
        edges = {}
        discover_dataset_trace_to_targets(
            rootds, children, [], edges, includeds=children)
        for superds, subdss in iteritems(edges):
            superds_status = paths_by_ds.get(superds, {})
            for subds in subdss:
                # TODO actually start from an entry that may already
                # exist in the status record
                superds_status[ut.Path(subds)] = dict(
                    # shot from the hip, some status config
                    # to trigger this specific super/sub
                    # relation to be saved
                    state='untracked', type='dataset')
            paths_by_ds[superds] = superds_status

    # TODO parallelize, whenever we have multiple subdataset of a single
    # dataset they can all be processed simultaneously
    # sort list of dataset to handle, starting with the ones deep down
    # (reverse-sorted path strings put subdatasets before their supers)
    for pdspath in sorted(paths_by_ds, reverse=True):
        pds = Dataset(pdspath)
        # pop status for this dataset, we are not coming back to it
        pds_status = {
            # for handing over to the low-level code, we recode any
            # path relative to the real repo location, this avoid
            # cumbersome symlink handling without context in the
            # lower levels
            pds.repo.pathobj / p.relative_to(pdspath): props
            for p, props in iteritems(paths_by_ds.pop(pdspath))}
        # remember pre-save state to decide 'ok' vs 'notneeded' below
        start_commit = pds.repo.get_hexsha()
        if not all(p['state'] == 'clean' for p in pds_status.values()):
            for res in pds.repo.save_(
                    message=message,
                    # make sure to have the `path` arg be None, as we want
                    # to prevent and bypass any additional repo.status()
                    # calls
                    paths=None,
                    # prevent whining of GitRepo
                    # NOTE(review): this inspects the *reference* dataset's
                    # repo (`ds.repo`), not the repo being saved (`pds.repo`)
                    # -- confirm this is intended for mixed git/annex
                    # hierarchies
                    git=True if not hasattr(ds.repo, 'annexstatus')
                    else to_git,
                    # we are supplying the full status already, do not
                    # detect anything else
                    untracked='no',
                    _status=pds_status):
                # TODO remove stringification when datalad-core can handle
                # path objects, or when PY3.6 is the lowest supported
                # version
                for k in ('path', 'refds'):
                    if k in res:
                        res[k] = text_type(
                            # recode path back to dataset path anchor
                            pds.pathobj / res[k].relative_to(
                                pds.repo.pathobj)
                        )
                yield res
        # report on the dataset itself
        dsres = dict(
            action='save',
            type='dataset',
            path=pds.path,
            refds=ds.path,
            status='ok'
            if start_commit != pds.repo.get_hexsha()
            else 'notneeded',
            logger=lgr,
        )
        if not version_tag:
            yield dsres
            continue
        try:
            pds.repo.tag(version_tag)
            dsres.update(
                status='ok',
                version_tag=version_tag)
            yield dsres
        except CommandError as e:
            if dsres['status'] == 'ok':
                # first we yield the result for the actual save
                yield dsres.copy()
            # and now complain that tagging didn't work
            dsres.update(
                status='error',
                message=('cannot tag this version: %s',
                         e.stderr.strip()))
            yield dsres
def rev_resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any path is returned as an absolute path. If, and only if, a dataset
    object instance is given as `ds`, relative paths are interpreted as
    relative to the given dataset. In all other cases, relative paths are
    treated as relative to the current working directory.

    Note however, that this function is not able to resolve arbitrarily
    obfuscated path specifications. All operations are purely lexical, and no
    actual path resolution against the filesystem content is performed.
    Consequently, common relative path arguments like '../something'
    (relative to PWD) can be handled properly, but things like
    'down/../under' cannot, as resolving this path properly depends on the
    actual target of any (potential) symlink leading up to '..'.

    Parameters
    ----------
    path : str or PathLike or list
      Platform-specific path specification. Multiple path specifications
      can be given as a list.
    ds : Dataset or None
      Dataset instance to resolve relative paths against.

    Returns
    -------
    `pathlib.Path` object or list(Path)
      When a list was given as input a list is returned, a Path instance
      otherwise.
    """
    got_ds_instance = isinstance(ds, Dataset)
    if ds is not None and not got_ds_instance:
        # anything that is not already an instance is requalified into one
        # (no installation check -- pure path resolution)
        ds = require_dataset(
            ds, check_installed=False, purpose='path resolution')

    resolved = []
    for spec in assure_list(path):
        # only a *given dataset instance* makes relative input
        # dataset-relative; in every other case the CWD is the reference
        # and the absolutification below takes care of it
        if got_ds_instance and not Path(spec).is_absolute():
            spec = ds.pathobj / spec

        cur = ut.Path(spec)

        # ensure an absolute result without resolving anything on the
        # filesystem:
        if not cur.is_absolute():
            # in general it is almost impossible to use resolve() when
            # we can have symlinks in the root path of a dataset
            # (that we don't want to resolve here), symlinks to annex'ed
            # files (that we never want to resolve), and other within-repo
            # symlinks that we (sometimes) want to resolve (i.e. symlinked
            # paths for addressing content vs adding content)
            # CONCEPT: do the minimal thing to catch most real-world inputs
            # ASSUMPTION: the only sane relative path input that needs
            # handling and can be handled are upward references like
            # '../../some/that', whereas stuff like 'down/../someotherdown'
            # are intellectual exercises
            # ALGORITHM: count leading '..' components (also dropping any
            # leading '.'), then chop that many components off the PWD
            # NOT using ut.Path.cwd(), because it has symlinks resolved!!
            pwd_parts = ut.Path(getpwd()).parts
            remainder = cur.parts
            n_up = 0
            for comp in cur.parts:
                if comp == op.pardir:
                    n_up += 1
                    remainder = remainder[1:]
                elif comp == op.curdir:
                    # discard, but without consuming a parent for it
                    remainder = remainder[1:]
                else:
                    break
            base = pwd_parts[:-n_up] if n_up else pwd_parts
            cur = ut.Path(op.join(*(base + remainder)))
        # deliberately NOT "normpath()"-ing the result; see the pathlib
        # docs for why this is the only sane choice in the face of the
        # possibility of symlinks in the path
        resolved.append(cur)

    # a single (non-list) input yields a single Path
    if isinstance(path, (string_types, PurePath)):
        return resolved[0]
    return resolved
def __call__(
        path=None,
        source=None,
        dataset=None,
        get_data=False,
        description=None,
        recursive=False,
        recursion_limit=None,
        save=True,
        reckless=False,
        # git_opts=None,
        # git_clone_opts=None,
        # annex_opts=None,
        # annex_init_opts=None,
        jobs="auto"):
    """Install one or more datasets (generator of result dicts).

    Without `source`, each `path` entry is dispatched either to a
    recursive ``Install`` call (URL-like arguments) or to ``Get``
    (path-like arguments). With `source`, a single target `path` is
    cloned via ``Clone`` and optionally recursed into / data-fetched
    via the installed dataset's bound ``get``.
    """
    # normalize path argument to be equal when called from cmdline and
    # python and nothing was passed into `path`
    path = assure_list(path)

    if not source and not path:
        raise InsufficientArgumentsError(
            "Please provide at least a source or a path")

    #  Common kwargs to pass to underlying git/install calls.
    #  They might need adjustments (e.g. for recursion_limit, but
    #  otherwise would be applicable throughout
    #
    # There should have been more of common options!
    # since underneath get could do similar installs
    common_kwargs = dict(
        get_data=get_data,
        recursive=recursive,
        recursion_limit=recursion_limit,
        # git_opts=git_opts,
        # annex_opts=annex_opts,
        reckless=reckless,
        jobs=jobs,
    )

    #installed_items = []
    #failed_items = []

    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    ds = None
    if dataset is not None:
        ds = require_dataset(dataset, check_installed=True,
                             purpose='installation')
        common_kwargs['dataset'] = dataset

    # switch into the two scenarios without --source:
    # 1. list of URLs
    # 2. list of (sub)dataset content
    if source is None:
        # we need to collect URLs and paths
        to_install = []
        to_get = []
        # TODO: this approach is problematic, it disrupts the order of input args.
        # consequently results will be returned in an unexpected order when a
        # mixture of source URL and paths is given. Reordering is only possible when
        # everything in here is fully processed before any results can be yielded.
        # moreover, I think the semantics of the status quo implementation are a
        # bit complicated: in a mixture list a source URL will lead to a new dataset
        # at a generated default location, but a path will lead to a subdataset
        # at that exact location
        for urlpath in path:
            ri = RI(urlpath)
            (to_get if isinstance(ri, PathRI) else to_install).append(urlpath)

        # 1. multiple source URLs
        for s in to_install:
            lgr.debug("Install passes into install source=%s", s)
            for r in Install.__call__(
                    source=s,
                    description=description,
                    save=save,
                    # git_clone_opts=git_clone_opts,
                    # annex_init_opts=annex_init_opts,
                    # we need to disable error handling in order to have it done at
                    # the very top, otherwise we are not able to order a global
                    # "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    result_filter=None,
                    **common_kwargs):
                # no post-processing of the installed content on disk
                # should be necessary here, all done by code further
                # down that deals with an install from an actual `source`
                # any necessary fixes should go there too!
                # TODO generator: possibly adjust refds
                yield r

        # 2. one or more dataset content paths
        if to_get:
            lgr.debug("Install passes into get %d items", len(to_get))
            # all commented out hint on inability to pass those options
            # into underlying install-related calls.
            # Also need to pass from get:
            #  annex_get_opts
            for r in Get.__call__(
                    to_get,
                    # TODO should pass-through description, not sure why disabled
                    # description=description,
                    # save=save,
                    # git_clone_opts=git_clone_opts,
                    # annex_init_opts=annex_init_opts,
                    # we need to disable error handling in order to have it done at
                    # the very top, otherwise we are not able to order a global
                    # "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    result_filter=None,
                    **common_kwargs):
                # no post-processing of get'ed content on disk should be
                # necessary here, this is the responsibility of `get`
                # (incl. adjusting parent's gitmodules when submodules end
                # up in an "updated" state (done in get helpers)
                # any required fixes should go there!
                yield r

        # we are done here
        # the rest is about install from a `source`
        return

    # an actual `source` was given
    if source and path and len(path) > 1:
        # exception is ok here, if this fails it is either direct user error
        # or we f****d up one of our internal calls
        raise ValueError(
            "install needs a single PATH when source is provided.  "
            "Was given mutliple PATHs: %s" % str(path))

    # parameter constraints:
    if not source:
        # exception is ok here, if this fails it is either direct user error
        # or we f****d up one of our internal calls
        raise InsufficientArgumentsError(
            "a `source` is required for installation")

    # code below deals with a single path only
    path = path[0] if path else None

    # pre-compute for results below
    refds_path = Interface.get_refds_path(ds)

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        yield get_status_dict(
            'install', path=path, status='impossible', logger=lgr,
            source_url=source, refds=refds_path,
            message="installation `source` and destination `path` are identical. "
            "If you are trying to add a subdataset simply use the `add` command")
        return

    # resolve the target location (if local) against the provided dataset
    # or CWD:
    if path is not None:
        # MIH everything in here is highly similar to what common
        # interface helpers do (or should/could do), but at the same
        # is very much tailored to just apply to `install` -- I guess
        # it has to stay special

        # Should work out just fine for regular paths, so no additional
        # conditioning is necessary
        try:
            path_ri = RI(path)
        except Exception as e:
            raise ValueError(
                "invalid path argument {}: ({})".format(path, exc_str(e)))
        try:
            # Wouldn't work for SSHRI ATM, see TODO within SSHRI
            # yoh: path should be a local path, and mapping note within
            #      SSHRI about mapping localhost:path to path is kinda
            #      a peculiar use-case IMHO
            path = resolve_path(path_ri.localpath, dataset)
            # any `path` argument that point to something local now
            # resolved and is no longer a URL
        except ValueError:
            # `path` is neither a valid source nor a local path.
            # TODO: The only thing left is a known subdataset with a
            # name, that is not a path; Once we correctly distinguish
            # between path and name of a submodule, we need to consider
            # this.
            # For now: Just raise
            raise ValueError("Invalid path argument {0}".format(path))
    # `path` resolved, if there was any.

    # clone dataset, will also take care of adding to superdataset, if one
    # is given
    res = Clone.__call__(
        source, path, dataset=ds, description=description,
        reckless=reckless,
        # we need to disable error handling in order to have it done at
        # the very top, otherwise we are not able to order a global
        # "ignore-and-keep-going"
        result_xfm=None,
        return_type='generator',
        result_filter=None,
        on_failure='ignore')
    # helper
    as_ds = YieldDatasets()
    destination_dataset = None
    for r in res:
        if r['action'] == 'install' and r['type'] == 'dataset':
            # make sure logic below is valid, only one dataset result is
            # coming back
            assert(destination_dataset is None)
            destination_dataset = as_ds(r)
        yield r
    assert(destination_dataset)

    # Now, recursive calls:
    if recursive or get_data:
        # dataset argument must not be passed inside since we use bound .get
        # It is ok to do "inplace" as long as we still return right
        # after the loop ends
        common_kwargs.pop('dataset', '')
        for r in destination_dataset.get(
                curdir,
                description=description,
                # TODO expose this
                # yoh: exactly!
                #annex_get_opts=annex_get_opts,
                # we need to disable error handling in order to have it done at
                # the very top, otherwise we are not able to order a global
                # "ignore-and-keep-going"
                on_failure='ignore',
                return_type='generator',
                result_xfm=None,
                **common_kwargs):
            yield r
    # at this point no further post-processing should be necessary,
    # `clone` and `get` must have done that (incl. parent handling)
    # if not, bugs should be fixed in those commands
    return
def _configure_remote(
        ds, name, known_remotes, url, pushurl, fetch, description,
        as_common_datasrc, publish_depends, publish_by_default,
        annex_wanted, annex_required, annex_group, annex_groupwanted,
        inherit, get_annex_info,
        **res_kwargs):
    """Configure a sibling (git remote) of a dataset; yields result dicts.

    Mutates git/annex configuration of `ds` (remote URLs, fetch refspec,
    publication dependencies, default push refspecs, annex preferred
    content) and finally yields the queried state of the configured
    sibling.

    NOTE(review): this module appears to define `_configure_remote` twice;
    a later definition would shadow this one -- confirm which is intended.
    """
    result_props = dict(
        action='configure-sibling',
        path=ds.path,
        type='sibling',
        name=name,
        **res_kwargs)
    if name is None:
        result_props['status'] = 'error'
        result_props['message'] = 'need sibling `name` for configuration'
        yield result_props
        return

    if name != 'here':
        # do all configure steps that are not meaningful for the 'here' sibling
        # AKA the local repo
        if name not in known_remotes:
            # this remote is fresh: make it known
            # just minimalistic name and URL, the rest is coming from `configure`
            ds.repo.add_remote(name, url)
            known_remotes.append(name)
        elif url:
            # not new, override URl if given
            ds.repo.set_remote_url(name, url)

        # make sure we have a configured fetch expression at this point
        fetchvar = 'remote.{}.fetch'.format(name)
        if fetchvar not in ds.repo.config:
            # place default fetch refspec in config
            # same as `git remote add` would have added
            ds.repo.config.add(
                fetchvar,
                '+refs/heads/*:refs/remotes/{}/*'.format(name),
                where='local')
        if pushurl:
            ds.repo.set_remote_url(name, pushurl, push=True)

        if publish_depends:
            # Check if all `deps` remotes are known to the `repo`
            unknown_deps = set(
                assure_list(publish_depends)).difference(known_remotes)
            if unknown_deps:
                result_props['status'] = 'error'
                result_props['message'] = (
                    'unknown sibling(s) specified as publication dependency: %s',
                    unknown_deps)
                yield result_props
                return

        # define config var name for potential publication dependencies
        depvar = 'remote.{}.datalad-publish-depends'.format(name)
        # and default pushes
        dfltvar = "remote.{}.push".format(name)

        if fetch:
            # fetch the remote so we are up to date
            for r in Update.__call__(
                    dataset=res_kwargs['refds'],
                    path=[dict(path=ds.path, type='dataset')],
                    sibling=name,
                    merge=False,
                    recursive=False,
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None):
                # fixup refds
                r.update(res_kwargs)
                yield r

        if inherit:
            # Adjust variables which we should inherit
            delayed_super = _DelayedSuper(ds.repo)
            publish_depends = _inherit_config_var(
                delayed_super, depvar, publish_depends)
            publish_by_default = _inherit_config_var(
                delayed_super, dfltvar, publish_by_default)
            # Copy relevant annex settings for the sibling
            # makes sense only if current AND super are annexes, so it is
            # kinda a boomer, since then forbids having a super a pure git
            if isinstance(ds.repo, AnnexRepo) and \
                    isinstance(delayed_super.repo, AnnexRepo):
                if annex_wanted is None:
                    annex_wanted = _inherit_annex_var(
                        delayed_super, name, 'wanted')
                if annex_required is None:
                    annex_required = _inherit_annex_var(
                        delayed_super, name, 'required')
                if annex_group is None:
                    # I think it might be worth inheritting group regardless what
                    # value is
                    #if annex_wanted in {'groupwanted', 'standard'}:
                    annex_group = _inherit_annex_var(
                        delayed_super, name, 'group')
                if annex_wanted == 'groupwanted' and annex_groupwanted is None:
                    # we better have a value for the expression for that group
                    annex_groupwanted = _inherit_annex_var(
                        delayed_super, name, 'groupwanted')

        if publish_depends:
            if depvar in ds.config:
                # config vars are incremental, so make sure we start from
                # scratch
                ds.config.unset(depvar, where='local', reload=False)
            for d in assure_list(publish_depends):
                lgr.info('Configure additional publication dependency on "%s"',
                         d)
                ds.config.add(depvar, d, where='local', reload=False)
            ds.config.reload()

        if publish_by_default:
            if dfltvar in ds.config:
                ds.config.unset(dfltvar, where='local', reload=False)
            for refspec in assure_list(publish_by_default):
                lgr.info(
                    'Configure additional default publication refspec "%s"',
                    refspec)
                ds.config.add(dfltvar, refspec, 'local')
            ds.config.reload()

        assert isinstance(ds.repo, GitRepo)  # just against silly code
        if isinstance(ds.repo, AnnexRepo):
            # we need to check if added sibling an annex, and try to enable it
            # another part of the fix for #463 and #432
            try:
                if not ds.config.obtain(
                        'remote.{}.annex-ignore'.format(name),
                        default=False,
                        valtype=EnsureBool(),
                        store=False):
                    ds.repo.enable_remote(name)
            except CommandError as exc:
                # TODO yield
                # this is unlikely to ever happen, now done for AnnexRepo instances
                # only
                lgr.info("Failed to enable annex remote %s, "
                         "could be a pure git" % name)
                lgr.debug("Exception was: %s" % exc_str(exc))
            if as_common_datasrc:
                ri = RI(url)
                if isinstance(ri, URL) and ri.scheme in ('http', 'https'):
                    # XXX what if there is already a special remote
                    # of this name? Above check for remotes ignores special
                    # remotes. we need to `git annex dead REMOTE` on reconfigure
                    # before we can init a new one
                    # XXX except it is not enough

                    # make special remote of type=git (see #335)
                    ds.repo._run_annex_command(
                        'initremote',
                        annex_options=[
                            as_common_datasrc,
                            'type=git',
                            'location={}'.format(url),
                            'autoenable=true'])
                else:
                    yield dict(
                        status='impossible',
                        name=name,
                        message='cannot configure as a common data source, '
                                'URL protocol is not http or https',
                        **result_props)
    #
    # place configure steps that also work for 'here' below
    #
    if isinstance(ds.repo, AnnexRepo):
        for prop, var in (('wanted', annex_wanted),
                          ('required', annex_required),
                          ('group', annex_group)):
            if var is not None:
                ds.repo.set_preferred_content(
                    prop, var, '.' if name == 'here' else name)
        if annex_groupwanted:
            ds.repo.set_groupwanted(annex_group, annex_groupwanted)

    if description:
        if not isinstance(ds.repo, AnnexRepo):
            result_props['status'] = 'impossible'
            result_props[
                'message'] = 'cannot set description of a plain Git repository'
            yield result_props
            return
        ds.repo._run_annex_command(
            'describe', annex_options=[name, description])

    # report all we know at once
    info = list(
        _query_remotes(ds, name, known_remotes,
                       get_annex_info=get_annex_info))[0]
    info.update(dict(status='ok', **result_props))
    yield info
def _configure_remote(
        ds, name, known_remotes, url, pushurl, fetch, description,
        as_common_datasrc, publish_depends, publish_by_default,
        annex_wanted, annex_required, annex_group, annex_groupwanted,
        inherit, get_annex_info,
        **res_kwargs):
    """Configure a sibling (git remote) of a dataset; yields result dicts.

    Mutates git/annex configuration of `ds` (remote URLs, fetch refspec,
    publication dependencies, default push refspecs, annex preferred
    content) and finally yields the queried state of the configured
    sibling. Unlike the earlier definition of this name in the module,
    this one also tolerates ``DownloadError`` while enabling the remote.

    NOTE(review): this module appears to define `_configure_remote` twice;
    this later definition shadows the earlier one -- confirm the duplicate
    is intentional.
    """
    result_props = dict(
        action='configure-sibling',
        path=ds.path,
        type='sibling',
        name=name,
        **res_kwargs)
    if name is None:
        result_props['status'] = 'error'
        result_props['message'] = 'need sibling `name` for configuration'
        yield result_props
        return

    if name != 'here':
        # do all configure steps that are not meaningful for the 'here' sibling
        # AKA the local repo
        if name not in known_remotes:
            # this remote is fresh: make it known
            # just minimalistic name and URL, the rest is coming from `configure`
            ds.repo.add_remote(name, url)
            known_remotes.append(name)
        elif url:
            # not new, override URl if given
            ds.repo.set_remote_url(name, url)

        # make sure we have a configured fetch expression at this point
        fetchvar = 'remote.{}.fetch'.format(name)
        if fetchvar not in ds.repo.config:
            # place default fetch refspec in config
            # same as `git remote add` would have added
            ds.repo.config.add(
                fetchvar,
                '+refs/heads/*:refs/remotes/{}/*'.format(name),
                where='local')
        if pushurl:
            ds.repo.set_remote_url(name, pushurl, push=True)

        if publish_depends:
            # Check if all `deps` remotes are known to the `repo`
            unknown_deps = set(assure_list(publish_depends)).difference(
                known_remotes)
            if unknown_deps:
                result_props['status'] = 'error'
                result_props['message'] = (
                    'unknown sibling(s) specified as publication dependency: %s',
                    unknown_deps)
                yield result_props
                return

        # define config var name for potential publication dependencies
        depvar = 'remote.{}.datalad-publish-depends'.format(name)
        # and default pushes
        dfltvar = "remote.{}.push".format(name)

        if fetch:
            # fetch the remote so we are up to date
            for r in Update.__call__(
                    dataset=res_kwargs['refds'],
                    path=[dict(path=ds.path, type='dataset')],
                    sibling=name,
                    merge=False,
                    recursive=False,
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None):
                # fixup refds
                r.update(res_kwargs)
                yield r

        if inherit:
            # Adjust variables which we should inherit
            delayed_super = _DelayedSuper(ds.repo)
            publish_depends = _inherit_config_var(
                delayed_super, depvar, publish_depends)
            publish_by_default = _inherit_config_var(
                delayed_super, dfltvar, publish_by_default)
            # Copy relevant annex settings for the sibling
            # makes sense only if current AND super are annexes, so it is
            # kinda a boomer, since then forbids having a super a pure git
            if isinstance(ds.repo, AnnexRepo) and \
                    isinstance(delayed_super.repo, AnnexRepo):
                if annex_wanted is None:
                    annex_wanted = _inherit_annex_var(
                        delayed_super, name, 'wanted')
                if annex_required is None:
                    annex_required = _inherit_annex_var(
                        delayed_super, name, 'required')
                if annex_group is None:
                    # I think it might be worth inheritting group regardless what
                    # value is
                    #if annex_wanted in {'groupwanted', 'standard'}:
                    annex_group = _inherit_annex_var(
                        delayed_super, name, 'group'
                    )
                if annex_wanted == 'groupwanted' and annex_groupwanted is None:
                    # we better have a value for the expression for that group
                    annex_groupwanted = _inherit_annex_var(
                        delayed_super, name, 'groupwanted'
                    )

        if publish_depends:
            if depvar in ds.config:
                # config vars are incremental, so make sure we start from
                # scratch
                ds.config.unset(depvar, where='local', reload=False)
            for d in assure_list(publish_depends):
                lgr.info(
                    'Configure additional publication dependency on "%s"',
                    d)
                ds.config.add(depvar, d, where='local', reload=False)
            ds.config.reload()

        if publish_by_default:
            if dfltvar in ds.config:
                ds.config.unset(dfltvar, where='local', reload=False)
            for refspec in assure_list(publish_by_default):
                lgr.info(
                    'Configure additional default publication refspec "%s"',
                    refspec)
                ds.config.add(dfltvar, refspec, 'local')
            ds.config.reload()

        assert isinstance(ds.repo, GitRepo)  # just against silly code
        if isinstance(ds.repo, AnnexRepo):
            # we need to check if added sibling an annex, and try to enable it
            # another part of the fix for #463 and #432
            try:
                exc = None
                if not ds.config.obtain(
                        'remote.{}.annex-ignore'.format(name),
                        default=False,
                        valtype=EnsureBool(),
                        store=False):
                    ds.repo.enable_remote(name)
            except (CommandError, DownloadError) as exc:
                # TODO yield
                # this is unlikely to ever happen, now done for AnnexRepo
                # instances only
                # Note: CommandError happens with git-annex
                # 6.20180416+gitg86b18966f-1~ndall+1 (prior 6.20180510, from
                # which starts to fail with AccessFailedError) if URL is bogus,
                # so enableremote fails. E.g. as "tested" in test_siblings
                lgr.info(
                    "Failed to enable annex remote %s, could be a pure git "
                    "or not accessible", name)
                lgr.debug("Exception was: %s" % exc_str(exc))
            if as_common_datasrc:
                ri = RI(url)
                if isinstance(ri, URL) and ri.scheme in ('http', 'https'):
                    # XXX what if there is already a special remote
                    # of this name? Above check for remotes ignores special
                    # remotes. we need to `git annex dead REMOTE` on reconfigure
                    # before we can init a new one
                    # XXX except it is not enough

                    # make special remote of type=git (see #335)
                    ds.repo._run_annex_command(
                        'initremote',
                        annex_options=[
                            as_common_datasrc,
                            'type=git',
                            'location={}'.format(url),
                            'autoenable=true'])
                else:
                    yield dict(
                        status='impossible',
                        name=name,
                        message='cannot configure as a common data source, '
                                'URL protocol is not http or https',
                        **result_props)
    #
    # place configure steps that also work for 'here' below
    #
    if isinstance(ds.repo, AnnexRepo):
        for prop, var in (('wanted', annex_wanted),
                          ('required', annex_required),
                          ('group', annex_group)):
            if var is not None:
                ds.repo.set_preferred_content(
                    prop, var, '.' if name == 'here' else name)
        if annex_groupwanted:
            ds.repo.set_groupwanted(annex_group, annex_groupwanted)

    if description:
        if not isinstance(ds.repo, AnnexRepo):
            result_props['status'] = 'impossible'
            result_props['message'] = 'cannot set description of a plain Git repository'
            yield result_props
            return
        ds.repo._run_annex_command(
            'describe', annex_options=[name, description])

    # report all we know at once
    info = list(_query_remotes(ds, name, known_remotes,
                               get_annex_info=get_annex_info))[0]
    info.update(dict(status='ok', **result_props))
    yield info
def __call__(
        source,
        path=None,
        dataset=None,
        description=None,
        reckless=False,
        alt_sources=None):
        # TODO next ones should be there, but cannot go anywhere
        # git_opts=None,
        # git_clone_opts=None,
        # annex_opts=None,
        # annex_init_opts=None
    """Clone a dataset from `source` into `path`; yields result dicts.

    Tries a list of candidate URLs derived from `source` (plus
    `alt_sources`) in order, wiping failed clone attempts, and registers
    the result as a subdataset when a parent `dataset` is given.
    """
    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    dataset = require_dataset(
        dataset, check_installed=True, purpose='cloning') \
        if dataset is not None else dataset
    refds_path = dataset.path if dataset else None

    if isinstance(source, Dataset):
        # accept a Dataset instance as source, use its location
        source = source.path

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        raise ValueError(
            "clone `source` and destination `path` are identical [{}]. "
            "If you are trying to add a subdataset simply use `add`".format(
                path))

    if path is not None:
        path = resolve_path(path, dataset)

    # Possibly do conversion from source into a git-friendly url
    # luckily GitRepo will undo any fancy file:/// url to make use of Git's
    # optimization for local clones....
    source_url = source
    source_ = _get_git_url_from_source(source)
    lgr.debug("Resolved clone source from '%s' to '%s'",
              source, source_)
    source = source_

    # derive target from source:
    if path is None:
        # we got nothing but a source. do something similar to git clone
        # and derive the path from the source and continue
        path = _get_installationpath_from_url(source)
        # since this is a relative `path`, resolve it:
        path = resolve_path(path, dataset)
        lgr.debug("Determined clone target path from source")
    lgr.debug("Resolved clone target path to: '%s'", path)

    # there is no other way -- my intoxicated brain tells me
    assert(path is not None)

    destination_dataset = Dataset(path)
    dest_path = path

    status_kwargs = dict(
        action='install',
        ds=destination_dataset,
        logger=lgr,
        refds=refds_path,
        source_url=source_url)

    # important test! based on this `rmtree` will happen below after failed clone
    if exists(dest_path) and listdir(dest_path):
        if destination_dataset.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            guessed_sources = _get_flexible_source_candidates(
                source, dest_path)
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destination_dataset)
            if track_url in guessed_sources or \
                    get_local_file_url(track_url) in guessed_sources:
                # target already is a clone of (an equivalent of) the source
                yield get_status_dict(
                    status='notneeded',
                    message=("dataset %s was already cloned from '%s'",
                             destination_dataset,
                             source),
                    **status_kwargs)
                return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to clone into target path',
            **status_kwargs)
        return

    if dataset is not None and relpath(path, start=dataset.path).startswith(pardir):
        # target lies outside of the given parent dataset
        yield get_status_dict(
            status='error',
            message=("clone target path '%s' not in specified target dataset '%s'",
                     path, dataset),
            **status_kwargs)
        return

    # generate candidate URLs from source argument to overcome a few corner cases
    # and hopefully be more robust than git clone
    candidate_sources = []
    # combine all given sources (incl. alternatives), maintain order
    for s in [source] + assure_list(alt_sources):
        candidate_sources.extend(_get_flexible_source_candidates(s))
    candidates_str = \
        " [%d other candidates]" % (len(candidate_sources) - 1) \
        if len(candidate_sources) > 1 \
        else ''
    lgr.info("Cloning %s%s into '%s'",
             source, candidates_str, dest_path)
    dest_path_existed = exists(dest_path)
    error_msgs = OrderedDict()  # accumulate all error messages formatted per each url
    for isource_, source_ in enumerate(candidate_sources):
        try:
            lgr.debug("Attempting to clone %s (%d out of %d candidates) to '%s'",
                      source_, isource_ + 1, len(candidate_sources),
                      dest_path)
            GitRepo.clone(path=dest_path, url=source_, create=True)
            break  # do not bother with other sources if succeeded
        except GitCommandError as e:
            error_msgs[source_] = exc_str_ = exc_str(e)
            lgr.debug("Failed to clone from URL: %s (%s)",
                      source_, exc_str_)
            if exists(dest_path):
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                rmtree(dest_path, children_only=dest_path_existed)
            # Whenever progress reporting is enabled, as it is now,
            # we end up without e.stderr since it is "processed" out by
            # GitPython/our progress handler.
            e_stderr = e.stderr
            from datalad.support.gitrepo import GitPythonProgressBar
            if not e_stderr and GitPythonProgressBar._last_error_lines:
                e_stderr = os.linesep.join(GitPythonProgressBar._last_error_lines)
            # NOTE(review): assumes e.stderr is never None here (else
            # .lower() would raise) -- confirm GitCommandError contract
            if 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1) if re_match else "stderr: " + e_stderr,
                    **status_kwargs)
                return

    if not destination_dataset.is_installed():
        if len(error_msgs):
            error_msg = "Failed to clone from any candidate source URL. " \
                        "Encountered errors per each url were: %s"
            error_args = (error_msgs, )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            #      be a case when this might happen and original error would
            #      not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'succesful' source was: %s"
            error_args = (destination_dataset.path, source_)
        yield get_status_dict(
            status='error',
            message=(error_msg, error_args),
            **status_kwargs)
        return

    if dataset is not None:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.save(
                dest_path,
                return_type='generator',
                result_filter=None,
                result_xfm=None,
                on_failure='ignore'):
            yield r

    _handle_possible_annex_dataset(
        destination_dataset,
        reckless,
        description=description)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **status_kwargs)
def __call__(
        path=None,
        dataset=None,
        annex=None,
        untracked='normal',
        recursive=False,
        recursion_limit=None,
        eval_subdataset_state='full'):
    """Yield status result records for dataset content.

    Resolves any given `path` arguments, assigns each to the (sub)dataset
    that contains it, and then queries each affected dataset exactly once
    via ``_yield_status()``. Paths that do not lie underneath the reference
    dataset produce 'error' result records instead of raising.

    Parameters (as visible from the code)
    -------------------------------------
    path : path-like or list, optional
      Constrain reporting to these paths; `None` means the whole dataset.
    dataset : optional
      Reference dataset specification, handed to ``require_dataset()``.
    annex, untracked, recursive, recursion_limit, eval_subdataset_state
      Passed through to ``_yield_status()``; `recursive`/`recursion_limit`
      are folded into a single integer/None recursion setting below.

    Yields
    ------
    dict
      Result records with at least ``action='status'`` and ``refds`` set.
    """
    # To the next white knight that comes in to re-implement `status` as a
    # special case of `diff`. There is one fundamental difference between
    # the two commands: `status` can always use the worktree as evident on
    # disk as a constraint (e.g. to figure out which subdataset a path is in)
    # `diff` cannot do that (everything needs to be handled based on a
    # "virtual" representation of a dataset hierarchy).
    # MIH concludes that while `status` can be implemented as a special case
    # of `diff` doing so would complicate and slow down both `diff` and
    # `status`. So while the apparent almost code-duplication between the
    # two commands feels wrong, the benefit is speed. Any future RF should
    # come with evidence that speed does not suffer, and complexity stays
    # on a manageable level
    ds = require_dataset(
        dataset, check_installed=True, purpose='status reporting')
    # maps dataset root (Path) -> list of query paths in it (or None for
    # a full-dataset query); ordered so popitem(last=False) is FIFO
    paths_by_ds = OrderedDict()
    if path:
        # sort any path argument into the respective subdatasets
        for p in sorted(assure_list(path)):
            # it is important to capture the exact form of the
            # given path argument, before any normalization happens
            # for further decision logic below
            orig_path = text_type(p)
            p = rev_resolve_path(p, dataset)
            root = rev_get_dataset_root(text_type(p))
            if root is None:
                # no root, not possibly underneath the refds
                yield dict(
                    action='status',
                    path=p,
                    refds=ds.path,
                    status='error',
                    message='path not underneath this dataset',
                    logger=lgr)
                continue
            else:
                if dataset and root == text_type(p) and \
                        not (orig_path.endswith(op.sep) or orig_path == "."):
                    # the given path is pointing to a dataset
                    # distinguish rsync-link syntax to identify
                    # the dataset as whole (e.g. 'ds') vs its
                    # content (e.g. 'ds/')
                    super_root = rev_get_dataset_root(op.dirname(root))
                    if super_root:
                        # the dataset identified by the path argument
                        # is contained in a superdataset, and no
                        # trailing path separator was found in the
                        # argument -> user wants to address the dataset
                        # as a whole (in the superdataset)
                        root = super_root
            root = ut.Path(root)
            ps = paths_by_ds.get(root, [])
            ps.append(p)
            paths_by_ds[root] = ps
    else:
        # no constraints: full query of the reference dataset
        paths_by_ds[ds.pathobj] = None

    queried = set()
    content_info_cache = {}
    while paths_by_ds:
        qdspath, qpaths = paths_by_ds.popitem(last=False)
        if qpaths and qdspath in qpaths:
            # this is supposed to be a full query, save some
            # cycles sifting through the actual path arguments
            qpaths = []
        # try to recode the dataset path wrt to the reference
        # dataset
        # the path that it might have been located by could
        # have been a resolved path or another funky thing
        qds_inrefds = path_under_rev_dataset(ds, qdspath)
        if qds_inrefds is None:
            # nothing we support handling any further
            # there is only a single refds
            yield dict(
                path=text_type(qdspath),
                refds=ds.path,
                action='status',
                status='error',
                message=(
                    "dataset containing given paths is not underneath "
                    "the reference dataset %s: %s", ds, qpaths),
                logger=lgr,
            )
            continue
        elif qds_inrefds != qdspath:
            # the path this dataset was located by is not how it would
            # be referenced underneath the refds (possibly resolved
            # realpath) -> recode all paths to be underneath the refds
            qpaths = [qds_inrefds / p.relative_to(qdspath) for p in qpaths]
            qdspath = qds_inrefds
        if qdspath in queried:
            # do not report on a single dataset twice
            continue
        qds = Dataset(text_type(qdspath))
        for r in _yield_status(
                qds,
                qpaths,
                annex,
                untracked,
                # recursion_limit wins if given; otherwise -1 (unlimited)
                # when recursive, 0 (no recursion) when not
                recursion_limit
                if recursion_limit is not None else -1 if recursive else 0,
                queried,
                eval_subdataset_state,
                content_info_cache):
            yield dict(
                r,
                refds=ds.path,
                action='status',
                status='ok',
            )
def __call__(path=None,
             message=None,
             dataset=None,
             version_tag=None,
             recursive=False,
             recursion_limit=None,
             updated=False,
             message_file=None,
             to_git=None,
             ):
    """Save dataset content; yields result records (generator).

    Uses ``Status()`` to discover and annotate all to-be-saved paths,
    groups them by their parent dataset, augments the set with the
    superdataset->subdataset edges needed to propagate the save up to
    the reference dataset, and then saves each affected dataset
    deepest-first, optionally tagging each saved dataset.

    Raises
    ------
    ValueError
      If both `message` and `message_file` are given.
    """
    if message and message_file:
        raise ValueError(
            "Both a message and message file were specified for save()")

    path = assure_list(path)

    if message_file:
        # the file content replaces any (absent) direct message
        with open(message_file) as mfh:
            message = mfh.read()

    # we want 'normal' to achieve the most compact argument list
    # for git calls
    # untracked_mode = 'no' if updated else 'normal'
    # TODO however, Repo.add() would refuse to add any dotfiles
    # in a directory that is itself untracked, hence the only
    # choice is to go with potentially crazy long lists
    # until https://github.com/datalad/datalad/issues/1454
    # has a resolution
    untracked_mode = 'no' if updated else 'all'

    # there are three basic scenarios:
    # 1. save modifications to any already tracked content
    # 2. save any content (including removal of deleted content)
    #    to bring things to a clean state
    # 3. like (2), but only operate on a given subset of content
    #    identified by paths
    # - all three have to work in conjunction with --recursive
    # - the difference between (1) and (2) should be no more
    #   that a switch from --untracked=no to --untracked=all
    #   in Repo.save()

    # we do not support
    # - simultaneous operations on multiple datasets from disjoint
    #   dataset hierarchies, hence a single reference dataset must be
    #   identifiable from the either
    #   - curdir or
    #   - the `dataset` argument.
    #   This avoids complex annotation loops and hierarchy tracking.
    # - any modification upwards from the root dataset
    ds = require_dataset(dataset, check_installed=True, purpose='saving')

    # use status() to do all discovery and annotation of paths
    paths_by_ds = {}
    for s in Status()(
            # ATTN: it is vital to pass the `dataset` argument as it,
            # and not a dataset instance in order to maintain the path
            # semantics between here and the status() call
            dataset=dataset,
            path=path,
            untracked=untracked_mode,
            recursive=recursive,
            recursion_limit=recursion_limit,
            result_renderer='disabled'):
        # fish out status dict for this parent dataset
        ds_status = paths_by_ds.get(s['parentds'], {})
        # reassemble path status info as repo.status() would have made it
        ds_status[ut.Path(s['path'])] = \
            {k: v for k, v in iteritems(s)
             if k not in (
                 'path', 'parentds', 'refds', 'status', 'action',
                 'logger')}
        paths_by_ds[s['parentds']] = ds_status

    lgr.debug('Determined %i datasets for saving from input arguments',
              len(paths_by_ds))
    # figure out what datasets to process, start with the ones containing
    # the paths that were given as arguments
    discovered_datasets = list(paths_by_ds.keys())
    if dataset:
        # if a reference dataset was given we want to save all the way up
        # to it, so let's throw it into the mix
        discovered_datasets.append(ds.path)
    # sort the datasets into (potentially) disjoint hierarchies,
    # or a single one, if a reference dataset was given
    dataset_hierarchies = get_tree_roots(discovered_datasets)
    for rootds, children in iteritems(dataset_hierarchies):
        edges = {}
        discover_dataset_trace_to_targets(
            rootds, children, [], edges, includeds=children)
        for superds, subdss in iteritems(edges):
            superds_status = paths_by_ds.get(superds, {})
            for subds in subdss:
                # TODO actually start from an entry that may already
                # exist in the status record
                superds_status[ut.Path(subds)] = dict(
                    # shot from the hip, some status config
                    # to trigger this specific super/sub
                    # relation to be saved
                    state='untracked', type='dataset')
            paths_by_ds[superds] = superds_status

    # TODO parallelize, whenever we have multiple subdatasets of a single
    # dataset they can all be processed simultaneously
    # sort list of dataset to handle, starting with the ones deep down
    for pdspath in sorted(paths_by_ds, reverse=True):
        pds = Dataset(pdspath)
        # pop status for this dataset, we are not coming back to it
        pds_status = {
            # for handing over to the low-level code, we recode any
            # path relative to the real repo location, this avoid
            # cumbersome symlink handling without context in the
            # lower levels
            pds.repo.pathobj / p.relative_to(pdspath): props
            for p, props in iteritems(paths_by_ds.pop(pdspath))
        }
        start_commit = pds.repo.get_hexsha()
        if not all(p['state'] == 'clean' for p in pds_status.values()):
            for res in pds.repo.save_(
                    message=message,
                    # make sure to have the `path` arg be None, as we want
                    # to prevent and bypass any additional repo.status()
                    # calls
                    paths=None,
                    # prevent whining of GitRepo
                    # NOTE(review): this consults the *reference* dataset's
                    # repo (ds.repo), not pds.repo, to decide whether to
                    # force plain git -- looks suspicious for mixed
                    # annex/git hierarchies; confirm intent
                    git=True if not hasattr(ds.repo, 'annexstatus')
                    else to_git,
                    # we are supplying the full status already, do not
                    # detect anything else
                    untracked='no',
                    _status=pds_status):
                # TODO remove stringification when datalad-core can handle
                # path objects, or when PY3.6 is the lowest supported
                # version
                for k in ('path', 'refds'):
                    if k in res:
                        res[k] = str(
                            # recode path back to dataset path anchor
                            pds.pathobj / res[k].relative_to(
                                pds.repo.pathobj))
                yield res
        # report on the dataset itself
        dsres = dict(
            action='save',
            type='dataset',
            path=pds.path,
            refds=ds.path,
            # 'notneeded' when no new commit was produced
            status='ok'
            if start_commit != pds.repo.get_hexsha()
            else 'notneeded',
            logger=lgr,
        )
        if not version_tag:
            yield dsres
            continue
        try:
            pds.repo.tag(version_tag)
            dsres.update(status='ok', version_tag=version_tag)
            yield dsres
        except CommandError as e:
            if dsres['status'] == 'ok':
                # first we yield the result for the actual save
                yield dsres.copy()
            # and now complain that tagging didn't work
            dsres.update(
                status='error',
                message=('cannot tag this version: %s', e.stderr.strip()))
            yield dsres
def __call__(match,
             dataset=None,
             search=None,
             report=None,
             report_matched=False,
             format='custom',
             regex=False):
    """Search dataset metadata; yields (location, report_dict) tuples.

    Loads (and caches, keyed on the repo's HEAD hexsha) the dataset's
    flattened metadata graph, then matches every metadata property value
    against the given `match` expressions (substring, or regex when
    `regex=True`), optionally restricted to property names listed in
    `search`. Without a usable dataset, may interactively offer to search
    or install the central superdataset at ``LOCAL_CENTRAL_PATH``.

    Fixes over the previous revision:
    - cached-metadata pickle file handles are now closed via ``with``
      (previously leaked via ``pickle.load(open(...))`` /
      ``pickle.dump(..., open(...))``)
    - grammar of the no-match warning message
    - renamed ambiguous loop variable ``l``
    """
    lgr.debug("Initiating search for match=%r and dataset %r",
              match, dataset)
    try:
        ds = require_dataset(dataset, check_installed=True,
                             purpose='dataset search')
        if ds.id is None:
            raise NoDatasetArgumentFound(
                "This does not seem to be a dataset (no DataLad dataset ID "
                "found). 'datalad create --force %s' can initialize "
                "this repository as a DataLad dataset" % ds.path)
    except NoDatasetArgumentFound:
        exc_info = sys.exc_info()
        if dataset is None:
            if not ui.is_interactive:
                raise NoDatasetArgumentFound(
                    "No DataLad dataset found. Specify a dataset to be "
                    "searched, or run interactively to get assistance "
                    "installing a queriable superdataset."
                )
            # none was provided so we could ask user whether he possibly
            # wants to install our beautiful mega-duper-super-dataset?
            # TODO: following logic could possibly benefit other actions.
            if os.path.exists(LOCAL_CENTRAL_PATH):
                central_ds = Dataset(LOCAL_CENTRAL_PATH)
                if central_ds.is_installed():
                    if ui.yesno(
                        title="No DataLad dataset found at current location",
                        text="Would you like to search the DataLad "
                             "superdataset at %r?" % LOCAL_CENTRAL_PATH):
                        pass
                    else:
                        reraise(*exc_info)
                else:
                    raise NoDatasetArgumentFound(
                        "No DataLad dataset found at current location. "
                        "The DataLad superdataset location %r exists, "
                        "but does not contain an dataset."
                        % LOCAL_CENTRAL_PATH)
            elif ui.yesno(
                    title="No DataLad dataset found at current location",
                    text="Would you like to install the DataLad "
                         "superdataset at %r?" % LOCAL_CENTRAL_PATH):
                from datalad.api import install
                central_ds = install(LOCAL_CENTRAL_PATH, source='///')
                ui.message(
                    "From now on you can refer to this dataset using the "
                    "label '///'"
                )
            else:
                reraise(*exc_info)

            lgr.info(
                "Performing search using DataLad superdataset %r",
                central_ds.path
            )
            # delegate the whole query to the central superdataset
            for res in central_ds.search(
                    match,
                    search=search,
                    report=report,
                    report_matched=report_matched,
                    format=format,
                    regex=regex):
                yield res
            return
        else:
            raise

    cache_dir = opj(opj(ds.path, get_git_dir(ds.path)), 'datalad', 'cache')
    mcache_fname = opj(cache_dir, 'metadata.p%d' % pickle.HIGHEST_PROTOCOL)

    meta = None
    if os.path.exists(mcache_fname):
        lgr.debug("use cached metadata of '{}' from {}".format(
            ds, mcache_fname))
        with open(mcache_fname, 'rb') as mfh:
            meta, checksum = pickle.load(mfh)
        # TODO add more sophisticated tests to decide when the cache
        # is no longer valid
        if checksum != ds.repo.get_hexsha():
            # errrr, try again below
            meta = None

    # don't put in 'else', as yet to be written tests above might fail
    # and require regenerating meta data
    if meta is None:
        lgr.info("Loading and caching local meta-data... "
                 "might take a few seconds")
        if not exists(cache_dir):
            os.makedirs(cache_dir)
        meta = get_metadata(
            ds, guess_type=False, ignore_subdatasets=False,
            ignore_cache=False)
        # merge all info on datasets into a single dict per dataset
        meta = flatten_metadata_graph(meta)
        # extract graph, if any
        meta = meta.get('@graph', meta)
        # build simple queriable representation
        if not isinstance(meta, list):
            meta = [meta]
        # sort entries by location (if present)
        sort_keys = ('location', 'description', 'id')
        meta = sorted(
            meta, key=lambda m: tuple(m.get(x, "") for x in sort_keys))
        # use pickle to store the optimized graph in the cache,
        # together with the checksum it was built from
        with open(mcache_fname, 'wb') as mfh:
            pickle.dump((meta, ds.repo.get_hexsha()), mfh)
        lgr.debug("cached meta data graph of '{}' in {}".format(
            ds, mcache_fname))

    if report in ('', ['']):
        report = []
    elif report and not isinstance(report, list):
        report = [report]

    match = assure_list(match)
    search = assure_list(search)
    # convert all to lower case for case insensitive matching
    search = {x.lower() for x in search}

    def get_in_matcher(m):
        """Function generator to provide closure for a specific value of m"""
        mlower = m.lower()

        def matcher(s):
            return mlower in s.lower()
        return matcher

    matchers = [
        re.compile(match_).search
        if regex
        else get_in_matcher(match_)
        for match_ in match
    ]

    # location should be reported relative to current location
    # We will assume that noone chpwd while we are yielding
    ds_path_prefix = get_path_prefix(ds.path)

    # So we could provide a useful message whenever there were not a single
    # dataset with specified `--search` properties
    observed_properties = set()

    # for every meta data set
    for mds in meta:
        hit = False
        hits = [False] * len(matchers)
        matched_fields = set()
        if not mds.get('type', mds.get('schema:type', None)) == 'Dataset':
            # we are presently only dealing with datasets
            continue
        # TODO consider the possibility of nested and context/graph dicts
        # but so far we were trying to build simple lists of dicts, as much
        # as possible
        if not isinstance(mds, dict):
            raise NotImplementedError(
                "nested meta data is not yet supported")

        # manual loop for now
        for k, v in iteritems(mds):
            if search:
                k_lower = k.lower()
                if k_lower not in search:
                    if observed_properties is not None:
                        # record for providing a hint later
                        observed_properties.add(k_lower)
                    continue
                # so we have a hit, no need to track
                observed_properties = None
            if isinstance(v, dict) or isinstance(v, list):
                v = text_type(v)
            for imatcher, matcher in enumerate(matchers):
                if matcher(v):
                    hits[imatcher] = True
                    matched_fields.add(k)
            if all(hits):
                hit = True
                # no need to do it longer than necessary
                if not report_matched:
                    break

        if hit:
            location = mds.get('location', '.')
            report_ = matched_fields.union(report if report else {}) \
                if report_matched else report
            if report_ == ['*']:
                report_dict = mds
            elif report_:
                report_dict = {k: mds[k] for k in report_ if k in mds}
                if report_ and not report_dict:
                    lgr.debug(
                        'meta data match for %s, but no to-be-reported '
                        'properties (%s) found. Present properties: %s',
                        location, ", ".join(report_), ", ".join(sorted(mds))
                    )
            else:
                # it was empty but not None -- asked to
                # not report any specific field
                report_dict = {}
            if isinstance(location, (list, tuple)):
                # could be that the same dataset installed into multiple
                # locations. For now report them separately
                for loc in location:
                    yield opj(ds_path_prefix, loc), report_dict
            else:
                yield opj(ds_path_prefix, location), report_dict

    if search and observed_properties is not None:
        import difflib
        suggestions = {
            s: difflib.get_close_matches(s, observed_properties)
            for s in search
        }
        suggestions_str = "\n ".join(
            "%s for %s" % (", ".join(choices), s)
            for s, choices in iteritems(suggestions) if choices
        )
        lgr.warning(
            "Found no properties which matched any of the ones you "
            "specified (%s). Maybe you meant one among: %s.\n"
            "Suggestions:\n"
            " %s",
            ", ".join(search),
            ", ".join(observed_properties),
            suggestions_str if suggestions_str.strip() else "none"
        )
def _get_submodules(ds, paths, fulfilled, recursive, recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
    """Recursively yield result records for the submodules of `ds`.

    Walks the submodules reported by ``_parse_git_submodules()``, merges in
    the matching ``.gitmodules`` info, optionally deletes/sets submodule
    properties in ``.gitmodules`` (committing the change via ``save()``),
    and yields a 'subdataset' status record per reportable submodule --
    before recursing when `bottomup` is false, after recursing when true.

    Parameters (as visible from the code)
    -------------------------------------
    paths : list of Path or None
      Only submodules equal to, or containing, one of these are reported;
      `None` reports all (all are still considered for recursion).
    fulfilled : bool or None
      If not None, restrict reporting to (un)installed submodules.
    contains : iterable of Path or None
      Only consider submodules that contain one of these paths.
    set_property / delete_property
      Pairs/names of .gitmodules submodule properties to set/unset.
    refds_path
      Anchor used to expand '<...>' template values in `set_property`.
    """
    dspath = ds.path
    if not GitRepo.is_valid_repo(dspath):
        # nothing to report from a non-repo
        return
    modinfo = _parse_gitmodules(ds)
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(ds, paths):
        if contains and not any(
                sm['path'] == c or sm['path'] in c.parents
                for c in contains):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        # do we just need this to recurse into subdatasets, or is this a
        # real result?
        to_report = paths is None \
            or any(p == sm['path'] or p in sm['path'].parents
                   for p in paths)
        sm.update(modinfo.get(sm['path'], {}))
        if to_report and (set_property or delete_property):
            # first deletions
            for dprop in assure_list(delete_property):
                try:
                    out, err = ds.repo._git_custom_command(
                        '',
                        ['git', 'config', '--file', '.gitmodules',
                         '--unset-all',
                         'submodule.{}.{}'.format(
                             sm['gitmodule_name'], dprop),
                         ]
                    )
                except CommandError:
                    yield get_status_dict(
                        'subdataset',
                        status='impossible',
                        message=(
                            "Deleting subdataset property '%s' failed for "
                            "subdataset '%s', possibly did "
                            "not exist",
                            dprop, sm['gitmodule_name']),
                        logger=lgr,
                        **sm)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(
                            sm,
                            refds_relpath=sm['path'].relative_to(refds_path),
                            refds_relname=text_type(
                                sm['path'].relative_to(refds_path)
                            ).replace(os.sep, '-')))
                try:
                    out, err = ds.repo._git_custom_command(
                        '',
                        ['git', 'config', '--file', '.gitmodules',
                         '--replace-all',
                         'submodule.{}.{}'.format(
                             sm['gitmodule_name'], prop),
                         text_type(val),
                         ]
                    )
                except CommandError as e:  # pragma: no cover
                    # this conditional may not be possible to reach, as
                    # variable name validity is checked before and Git
                    # replaces the file completely, resolving any permission
                    # issues, if the file could be read (already done above)
                    yield get_status_dict(
                        'subdataset',
                        status='error',
                        message=(
                            "Failed to set property '%s': %s",
                            prop, exc_str(e)),
                        type='dataset',
                        logger=lgr,
                        **sm)
                    # it is up to parent code to decide whether we would
                    # continue after this
                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val
            # commit the .gitmodules modification
            Dataset(dspath).save(
                '.gitmodules', to_git=True,
                message='[DATALAD] modified subdataset properties')

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict(
            'subdataset',
            status='ok',
            type='dataset',
            logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        # top-down: yield the record before recursing
        if to_report and (not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled)):
            yield subdsres

        # expand list with child submodules. keep all paths relative to
        # parent and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing')
                 or (isinstance(recursion_limit, int)
                     and recursion_limit > 1)):
            for r in _get_submodules(
                    Dataset(sm['path']),
                    paths,
                    fulfilled, recursive,
                    (recursion_limit - 1)
                    if isinstance(recursion_limit, int)
                    else recursion_limit,
                    contains,
                    bottomup,
                    set_property,
                    delete_property,
                    refds_path):
                yield r
        # bottom-up: yield the record after recursing
        if to_report and (bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled)):
            yield subdsres
def _publish_dataset(ds, remote, refspec, paths, annex_copy_options,
                     force=False, jobs=None, transfer_data='auto',
                     **kwargs):
    """Publish a single dataset to `remote`; yields result records.

    Handles publication dependencies (remotes listed in
    ``remote.<name>.datalad-publish-depends``) first for both data
    transfer and git push, then transfers annexed data (unless
    `transfer_data` is 'none'), and finally pushes the active branch
    (plus 'git-annex' for annex repos) unless nothing changed.

    Fix over the previous revision: the pre-fetch loop checked
    ``remote.<remote>.annex-uuid`` for every iteration instead of the
    remote actually being iterated (`r`), so dependency remotes were
    never (or always) refreshed depending on the target remote's state.
    """
    # TODO: this setup is now quite ugly. The only way `refspec` can come
    # in, is when there is a tracking branch, and we get its state via
    # `refspec`

    # define config var name for potential publication dependencies
    depvar = 'remote.{}.datalad-publish-depends'.format(remote)
    # list of remotes that are publication dependencies for the
    # target remote
    publish_depends = assure_list(ds.config.get(depvar, []))

    # remote might be set to be ignored by annex, or we might not even
    # know yet its uuid
    # make sure we are up-to-date on this topic on all affected remotes,
    # before we start making decisions
    for r in publish_depends + [remote]:
        # BF: test the remote being iterated (r), not the target remote
        if not ds.config.get('.'.join(('remote', r, 'annex-uuid')), None):
            lgr.debug("Obtain remote annex info from '%s'", r)
            ds.repo.fetch(remote=r)
            # in order to be able to use git's config to determine what to
            # push, we need to annex merge first. Otherwise a git push might
            # be rejected if involving all matching branches for example.
            # NOTE we should not use a precomputed 'is_annex' test here, as
            # each fetch could give evidence that there is an annex
            # somewhere and replace the repo class...
            if isinstance(ds.repo, AnnexRepo):
                ds.repo.merge_annex(r)
    ds.config.reload()

    # anything that follows will not change the repo type anymore, cache
    is_annex_repo = isinstance(ds.repo, AnnexRepo)

    # Plan:
    # 1. Check if there is anything to push, and if so
    #    2. process push dependencies
    #    3. fetch and merge annex branch
    #    4. push non-annex branch(es)
    # 5. copy data to the remote if paths are provided or it wants
    #    something generally

    # upstream refspec needed for update (merge) and subsequent push,
    # in case there is no.
    # no tracking refspec yet?

    # TODO: i think this whole modification detection could be done by path
    # annotation at the very beginning -- keeping it for now to not get too
    # dizzy in the forehead....

    # if forced -- we push regardless if there are differences or not
    diff = True if force else has_diff(ds, refspec, remote, paths)

    # We might have got new information in git-annex branch although no
    # other changes
    if not diff and is_annex_repo:
        try:
            git_annex_commit = next(ds.repo.get_branch_commits('git-annex'))
        except StopIteration:
            git_annex_commit = None
        #diff = _get_remote_diff(ds, [], git_annex_commit, remote, 'git-annex')
        diff = _get_remote_diff(ds, git_annex_commit, remote, 'git-annex')
        if diff:
            lgr.info("Will publish updated git-annex")

    #
    # publish data (annex copy --to)
    #
    # # remote might be set to be ignored by annex, or we might not even know yet its uuid
    # annex_ignore = ds.config.getbool('remote.{}.annex-ignore'.format(remote), None)
    # annex_uuid = ds.config.get('remote.{}.annex-uuid'.format(remote), None)
    # if not annex_ignore:
    #     if annex_uuid is None:
    #         # most probably not yet 'known' and might require some annex
    copied_data = False
    # skip right away if data transfer is not desired
    if transfer_data != 'none' and isinstance(ds.repo, AnnexRepo):
        # publishing of `remote` might depend on publishing other
        # remote(s) first, so they need to receive the data first:
        for d in publish_depends:
            lgr.info("Transferring data to configured publication "
                     "dependency: '%s'" % d)
            # properly initialized remote annex -> publish data
            for r in _publish_data(
                    ds, d, paths, annex_copy_options, force, transfer_data,
                    **kwargs):
                # note if we published any data, notify to sync annex
                # branch below
                if r['status'] == 'ok' and r['action'] == 'publish' and \
                        r.get('type', None) == 'file':
                    copied_data = True
                yield r
        # and for the main target
        for r in _publish_data(
                ds, remote, paths, annex_copy_options, force, transfer_data,
                **kwargs):
            # note if we published any data, notify to sync annex branch
            # below
            if r['status'] == 'ok' and r['action'] == 'publish' and \
                    r.get('type', None) == 'file':
                copied_data = True
            yield r

    #
    # publish dataset (git push)
    #
    if not diff and not copied_data:
        lgr.debug("No changes detected with respect to state of '%s'",
                  remote)
        yield get_status_dict(ds=ds, status='notneeded', **kwargs)
    else:
        # publishing of `remote` might depend on publishing other
        # remote(s) first:
        for d in publish_depends:
            lgr.info("Publishing to configured dependency: '%s'" % d)
            # call this again to take care of the dependency first,
            # but keep the paths the same, as the goal is to publish those
            # to the primary remote, and not anything else to a dependency
            for r in _publish_dataset(
                    ds,
                    d,
                    # should get the same as the base dataset
                    refspec,
                    paths,
                    annex_copy_options,
                    force=force,
                    jobs=jobs,
                    transfer_data=transfer_data,
                    **kwargs):
                yield r

        if is_annex_repo and \
                ds.repo.is_special_annex_remote(remote):
            # There is nothing else to "publish"
            lgr.debug(
                "{0} is a special annex remote, no git push is "
                "needed".format(remote)
            )
            return

        lgr.info("Publishing {0} to {1}".format(ds, remote))
        # in order to be able to use git's config to determine what to push,
        # we need to annex merge first. Otherwise a git push might be
        # rejected if involving all matching branches for example
        # even if we already fetched above we need to do it again
        if is_annex_repo:
            lgr.debug("Obtain remote annex info from '%s'", remote)
            ds.repo.fetch(remote=remote)
            ds.repo.merge_annex(remote)

        # Note: git's push.default is 'matching', which doesn't work for
        # first time publication (a branch, that doesn't exist on remote
        # yet). But if we want to respect remote.*.push entries, etc. we
        # need to not pass a specific refspec (like active branch) to
        # `git push` by default.
        # hence we amend any existing config on the fly
        # TODO: what else to push by default?
        # consider also: --follow-tags, --tags, --atomic
        # make sure we push
        things2push = []
        current_branch = ds.repo.get_active_branch()
        if current_branch:  # possibly make this conditional on a switch
            # TODO: this should become its own helper
            if is_annex_repo:
                # annex could manage this branch
                if current_branch.startswith('annex/direct') \
                        and ds.config.getbool(
                            'annex', 'direct', default=False):
                    # this is a "fake" annex direct mode branch
                    # we want to publish the underlying branch
                    current_branch = current_branch[12:]
                match_adjusted = re.match(
                    r'adjusted/(.*)\([a-z]*\)',
                    current_branch)
                if match_adjusted:
                    # adjusted/master(...)
                    # TODO: this code is not tested
                    # see https://codecov.io/gh/datalad/datalad/src/17e67045a088ae0372b38aa4d8d46ecf7c821cb7/datalad/distribution/publish.py#L156
                    # and thus probably broken -- test me!
                    current_branch = match_adjusted.group(1)
            things2push.append(current_branch)
        if is_annex_repo:
            things2push.append('git-annex')
        # check that all our magic found valid branches
        things2push = [t for t in things2push
                       if t in ds.repo.get_branches()]
        # check that we don't ask to push things that are already
        # configured -> would cause error
        # TODO need to find a way to properly do this, when wildcards are
        # used in the push configuration variable
        things2push = [
            t for t in things2push
            if t not in ds.config.get(
                'remote.{}.push'.format(remote), [])]
        # now we know what to push where
        status, msg = _push(ds, remote, things2push, force)
        yield get_status_dict(ds=ds, status=status, message=msg, **kwargs)