class AnnotatePaths(Interface):
    """Analyze and act upon input paths

    Given paths (or more generally location requests) are inspected and
    annotated with a number of properties. A list of recognized properties is
    provided below.

    || PYTHON >>Input `paths` for this command can either be un-annotated
    (raw) path strings, or already (partially) annotated paths. In the latter
    case, further annotation is limited to yet-unknown properties, and is
    potentially faster than initial annotation.<< PYTHON ||

    *Recognized path properties*

    {proplist}

    In the case of enabled modification detection the results may contain
    additional properties regarding the nature of the modification. See the
    documentation of the `diff` command for details.
    """
    _docs_ = dict(
        proplist='\n\n    '.join(
            '"{}"\n{}'.format(
                k,
                textwrap.fill(known_props[k],
                              initial_indent='    ',
                              subsequent_indent='    '))
            for k in sorted(known_props)))

    _params_ = dict(
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path to be annotated""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""an optional reference/base dataset for the paths""",
            constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        action=Parameter(
            args=("--action",),
            metavar="LABEL",
            doc="""an "action" property value to include in the path
            annotation""",
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_status=Parameter(
            args=("--unavailable-path-status",),
            metavar="LABEL",
            doc="""a "status" property value to include in the annotation for
            paths that are underneath a dataset, but do not exist on the
            filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_msg=Parameter(
            args=("--unavailable-path-msg",),
            metavar="message",
            doc="""a "message" property value to include in the annotation for
            paths that are underneath a dataset, but do not exist on the
            filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        nondataset_path_status=Parameter(
            args=("--nondataset-path-status",),
            metavar="LABEL",
            doc="""a "status" property value to include in the annotation for
            paths that are not underneath any dataset""",
            constraints=EnsureStr() | EnsureNone()),
        force_parentds_discovery=Parameter(
            args=("--no-parentds-discovery",),
            dest='force_parentds_discovery',
            action='store_false',
            doc="""Flag to disable reports of parent dataset information for
            any path, in particular dataset root paths. Disabling saves on
            command run time, if this information is not needed."""),
        force_subds_discovery=Parameter(
            args=("--no-subds-discovery",),
            action='store_false',
            dest='force_subds_discovery',
            doc="""Flag to disable reporting type='dataset' for subdatasets,
            even when they are not installed, or their mount point directory
            doesn't exist. Disabling saves on command run time, if this
            information is not needed."""),
        force_untracked_discovery=Parameter(
            args=("--no-untracked-discovery",),
            action='store_false',
            dest='force_untracked_discovery',
            doc="""Flag to disable discovery of untracked changes. Disabling
            saves on command run time, if this information is not needed."""),
        force_no_revision_change_discovery=Parameter(
            args=("--revision-change-discovery",),
            action='store_false',
            dest='force_no_revision_change_discovery',
            doc="""Flag to disable discovery of changes which were not yet
            committed. Disabling saves on command run time, if this
            information is not needed."""),
        modified=Parameter(
            args=("--modified",),
            nargs='?',
            const=True,
            constraints=EnsureStr() | EnsureBool() | EnsureNone(),
            doc="""comparison reference specification for modification
            detection. This can be (mostly) anything that `git diff`
            understands (commit, treeish, tag, etc). See the documentation of
            `datalad diff --revision` for details. Unmodified paths will not
            be annotated. If a requested path was not modified but some
            content underneath it was, then the request is replaced by the
            modified paths and those are annotated instead. This option can be
            used [PY: with `True` as PY][CMD: without CMD] an argument to test
            against changes that have been made, but have not yet been staged
            for a commit."""))

    @staticmethod
    @datasetmethod(name='annotate_paths')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None or
                                     not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset "
                "(none given or found)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds, refds_path, action, recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(**dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be melted in with treatment of non dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = []
            if requested_paths:
                [preserved_paths.append(r)
                 for r in requested_paths
                 if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen, whenever `path` is a subdataset, that was
                # discovered via recursive processing of another path before
                continue

            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything
                    # else it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference
                        # dataset, and only if we stay within this refds when
                        # searching for the parent
                        parent = get_dataset_root(
                            normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will
                        # determine whether a top-level dataset that was
                        # discovered gets the parent property or not, it won't
                        # get it without a common base dataset, and that is
                        # how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true
                        # subdataset of the parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether
            # there is ANY dataset, not which one is the true parent, logic
            # below relies on the fact that we end here, if there is no
            # dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and
                     'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                from datalad.distribution.subdatasets import Subdatasets
                # if the path doesn't exist, or is labeled a directory, or is
                # a dataset without this info -> record whether this is a
                # known subdataset to its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or
                        # not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a
                # status already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) \
                        if containing_ds is None else containing_ds
                for r in yield_recursive(
                        containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r

            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
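
# Illustrative usage sketch (not part of the original module): consuming the
# annotated-path records that `annotate_paths` yields. The dataset location
# '/tmp/some/ds' and the input paths are hypothetical; the call assumes a
# DataLad installation where this command is exposed via `datalad.api`.
def _example_annotate_paths():
    from datalad.api import annotate_paths
    for ap in annotate_paths(
            path=['file1.txt', 'code'],
            dataset='/tmp/some/ds',
            recursive=False,
            return_type='generator',
            on_failure='ignore'):
        # each result is a dict carrying properties such as 'path', 'type',
        # 'parentds', 'state', and (if processing is finished) 'status'
        print(ap.get('path'), ap.get('type'), ap.get('status', ''))
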
class Get(Interface):
    """Get any dataset content (files/directories/subdatasets).

    This command only operates on dataset content. To obtain a new independent
    dataset from some source use the `install` command.

    By default this command operates recursively within a dataset, but not
    across potential subdatasets, i.e. if a directory is provided, all files
    in the directory are obtained. Recursion into subdatasets is supported
    too. If enabled, relevant subdatasets are detected and installed in order
    to fulfill a request.

    Known data locations for each requested file are evaluated and data are
    obtained from some available location (according to git-annex
    configuration and possibly assigned remote priorities), unless a specific
    source is specified.

    .. note::
      Power-user info: This command uses :command:`git annex get` to fulfill
      file handles.
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar="PATH",
            doc="""specify the dataset to perform the add operation on, in
            which case `path` arguments are interpreted as being relative to
            this dataset. If no dataset is given, an attempt is made to
            identify a dataset for each input `path`""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path/name of the requested dataset component. The component
            must already be known to a dataset. To add new components to a
            dataset use the `add` command""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        source=Parameter(
            args=("-s", "--source",),
            metavar="LABEL",
            doc="""label of the data source to be used to fulfill requests.
            This can be the name of a dataset :term:`sibling` or another known
            source""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=Parameter(
            args=("--recursion-limit",),
            metavar="LEVELS",
            constraints=EnsureInt() | EnsureChoice('existing') | EnsureNone(),
            doc="""limit recursion into subdataset to the given number of
            levels. Alternatively, 'existing' will limit recursion to
            subdatasets that already existed on the filesystem at the start of
            processing, and prevent new subdatasets from being obtained
            recursively."""),
        get_data=Parameter(
            args=("-n", "--no-data",),
            dest='get_data',
            action='store_false',
            doc="""whether to obtain data for all file handles. If disabled,
            `get` operations are limited to dataset handles.[CMD:  This option
            prevents data for file handles from being obtained CMD]"""),
        reckless=reckless_opt,
        git_opts=git_opts,
        annex_opts=annex_opts,
        annex_get_opts=annex_get_opts,
        jobs=jobs_opt,
        verbose=verbose)

    # Note: Maybe use 'git annex find --not --in here' to have a list of all
    # files to actually get and give kind of a progress in terms of number of
    # files processed ...

    @staticmethod
    @datasetmethod(name='get')
    def __call__(
            path=None,
            source=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            get_data=True,
            reckless=False,
            git_opts=None,
            annex_opts=None,
            annex_get_opts=None,
            jobs=None,
            verbose=False,
            # internal -- instead of returning 'get'ed items, return final
            # content_by_ds, unavailable_paths. To be used by the call from
            # Install.__call__ and done so to avoid creating another reusable
            # function which would need to duplicate all this heavy list of
            # kwargs
            _return_datasets=False
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try to locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enabled (potentially
        #    obtain even more subdatasets)
        # 4. Shoot info of which handles to get in each subdataset to
        #    git-annex, once at the very end

        dataset_path = dataset.path if isinstance(dataset, Dataset) else dataset
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = dataset_path

        # use lookup cache -- we need that info further down
        dir_lookup = {}

        content_by_ds, unavailable_paths = Interface._prep(
            path=path,
            dataset=dataset,
            recursive=recursive,
            recursion_limit=recursion_limit,
            dir_lookup=dir_lookup)

        # explore the unknown
        for path in sorted(unavailable_paths):
            # how close can we get?
            dspath = get_dataset_root(path)
            if dspath is None:
                # nothing we can do for this path
                continue
            ds = Dataset(dspath)
            # must always yield a dataset -- we sorted out the ones outside
            # any dataset at the very top
            assert ds.is_installed()
            # now actually obtain whatever is necessary to get to this path
            containing_ds = install_necessary_subdatasets(ds, path, reckless)
            if containing_ds.path != ds.path:
                lgr.debug("Installed %s to fulfill request for content for "
                          "path %s", containing_ds, path)
                # mark resulting dataset as auto-installed
            if containing_ds.path == path:
                # we had to get the entire dataset, not something within
                # mark that it just appeared
                content_by_ds[path] = [curdir]
            else:
                # we need to get content within
                content_by_ds[path] = [path]

        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for subdspath in sorted(content_by_ds.keys()):
                for content_path in content_by_ds[subdspath]:
                    if not isdir(content_path):
                        # a non-directory cannot have content underneath
                        continue
                    subds = Dataset(subdspath)
                    lgr.info(
                        "Obtaining %s %s recursively",
                        subds,
                        ("underneath %s" % content_path
                         if subds.path != content_path
                         else ""))
                    cbysubds = _recursive_install_subds_underneath(
                        subds,
                        # `content_path` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        # protect against magic marker misinterpretation
                        # only relevant for _get, hence replace here
                        start=content_path if content_path != curdir else None)
                    # gets file content for all freshly installed subdatasets
                    content_by_ds.update(cbysubds)

        ## we have now done everything we could to obtain whatever subdataset
        ## to get something on the file system for previously unavailable paths
        ## check and sort one last time
        content_by_ds, unavailable_paths, nondataset_paths = \
            get_paths_by_dataset(
                unavailable_paths,
                recursive=recursive,
                recursion_limit=recursion_limit,
                out=content_by_ds,
                dir_lookup=dir_lookup)
        if nondataset_paths:
            # XXX likely can never get here
            lgr.warning(
                "ignored paths that do not belong to any dataset: %s",
                nondataset_paths)
        if unavailable_paths:
            lgr.warning('ignored non-existing paths: %s', unavailable_paths)

        # hand over to git-annex
        results = list(chain.from_iterable(
            _get(content_by_ds, refpath=dataset_path, source=source,
                 jobs=jobs, get_data=get_data)))
        # ??? should we in _return_datasets case just return both
        # content_by_ds and unavailable_paths, maybe so we provide output
        # consistent across runs, and then issue outside a similar
        # IncompleteResultsError?
        if unavailable_paths:  # and likely other error flags
            if _return_datasets:
                results = sorted(
                    set(content_by_ds).difference(unavailable_paths))
            raise IncompleteResultsError(results, failed=unavailable_paths)
        else:
            return sorted(content_by_ds) if _return_datasets else results

    @staticmethod
    def result_renderer_cmdline(res, args):
        from datalad.ui import ui
        from os import linesep
        if res is None:
            res = []
        if not isinstance(res, list):
            res = [res]
        if not len(res):
            ui.message("Got nothing new")
            return

        # provide summary
        nsuccess = sum(
            item.get('success', False) if isinstance(item, dict) else True
            for item in res)
        nfailure = len(res) - nsuccess
        msg = "Tried to get %d %s." % (
            len(res), single_or_plural("file", "files", len(res)))
        if nsuccess:
            msg += " Got %d. " % nsuccess
        if nfailure:
            msg += " Failed to get %d." % (nfailure,)
        ui.message(msg)

        # if just a few or less than initially explicitly requested
        if len(res) < 10 or args.verbose:
            msg = linesep.join([
                "{path} ... {suc}".format(
                    suc="ok." if isinstance(item, Dataset) or
                        item.get('success', False)
                        else "failed. (%s)" % item.get('note', 'unknown reason'),
                    path=item.get('file') if isinstance(item, dict) else item.path)
                for item in res])
            ui.message(msg)
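
# Illustrative usage sketch (not part of the original module): obtain file
# content for a subdirectory, letting `get` install any subdatasets needed to
# fulfill the request. The dataset location '/tmp/some/ds' is hypothetical.
def _example_get():
    from datalad.api import Dataset
    ds = Dataset('/tmp/some/ds')
    # recurse into subdatasets under 'data/' and fetch content with two
    # parallel git-annex jobs; pass get_data=False to only install handles
    results = ds.get('data', recursive=True, jobs=2)
    for item in results:
        print(item)
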
class Unlock(Interface):
    """Unlock file(s) of a dataset

    Unlock files of a dataset in order to be able to edit the actual content
    """

    _params_ = dict(
        path=Parameter(
            args=("path",),
            doc="""file(s) to unlock""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to unlock files in. If no dataset is
            given, an attempt is made to identify the dataset based on the
            current working directory. If the latter fails, an attempt is made
            to identify the dataset based on `path` """,
            constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='unlock')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None):

        if path is None and dataset is None:
            raise InsufficientArgumentsError(
                "insufficient arguments for unlocking: needs at least "
                "a dataset or a path to unlock.")

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='unlock', logger=lgr, refds=refds_path)

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='unlock',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist",
                nondataset_path_status='impossible',
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', 'dataset') == 'dataset':
                # this is a dataset
                ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert(not completed)

        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            content = content_by_ds[ds_path]

            # no annex, no unlock:
            if not isinstance(ds.repo, AnnexRepo):
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not annex'ed, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            # only files in annex with their content present:
            files = [ap['path'] for ap in content]
            to_unlock = []
            for ap, under_annex, has_content in \
                    zip(content,
                        ds.repo.is_under_annex(files),
                        ds.repo.file_has_content(files)):

                # TODO: what about directories? Make sure there is no
                # situation like no file beneath with content, or everything
                # in git, that leads to a CommandError.
                # For now pass to annex:
                from os.path import isdir
                if isdir(ap['path']):
                    to_unlock.append(ap)
                    continue

                # Note, that `file_has_content` is (planned to report) True on
                # files in git. Therefore order matters: First check for annex!
                if under_annex:
                    if has_content:
                        to_unlock.append(ap)
                    # no content, no unlock:
                    else:
                        ap['status'] = 'impossible'
                        ap['message'] = "no content present, can't unlock"
                        ap.update(res_kwargs)
                        yield ap
                # file in git, no unlock:
                else:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not controlled by annex, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap

            # don't call annex-unlock with no path, if this is the case
            # because nothing survived the filtering above
            if content and not to_unlock:
                continue

            for r in ds.repo.unlock([ap['path'] for ap in to_unlock]):
                yield get_status_dict(
                    path=opj(ds.path, r),
                    status='ok',
                    type='file',
                    **res_kwargs)
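
# Illustrative usage sketch (not part of the original module): unlock annexed
# files so they can be edited in place. The dataset location and file name
# are hypothetical.
def _example_unlock():
    from datalad.api import Dataset
    ds = Dataset('/tmp/some/ds')
    for res in ds.unlock(path='results.csv', return_type='generator'):
        # 'ok' for unlocked files, 'notneeded' for files tracked by git,
        # 'impossible' if the annexed content is not locally present
        print(res['path'], res['status'])
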
class CrawlInit(Interface):
    """Initialize crawling configuration

    Allows one to specify a template and function to generate a crawling
    pipeline

    Examples:

    $ datalad crawl-init \
        --template openfmri \
        --template-func superdataset_pipeline

    $ datalad crawl-init \
        --template fcptable \
        dataset=Baltimore tarballs=True
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True

    _params_ = dict(
        template=Parameter(
            args=("-t", "--template"),
            action="store",
            constraints=EnsureStr() | EnsureNone(),
            doc="""the name of the template"""),
        template_func=Parameter(
            args=("-f", "--template-func"),
            action="store",
            doc="""the name of the function"""),
        args=Parameter(
            args=("args",),
            nargs="*",
            constraints=EnsureStr() | EnsureNone(),
            doc="""keyword arguments to pass into the template function
            generating actual pipeline, organized in [PY: a dict PY][CMD:
            key=value pairs CMD]"""),
        save=Parameter(
            args=("--save",),
            action="store_true",
            doc="""flag to save file into git repo"""),
    )

    @staticmethod
    def __call__(args=None, template=None, template_func=None, save=False):
        if args:
            if isinstance(args, str):
                args = [args]
            if isinstance(args, list):
                args = OrderedDict(map(str, it.split('=', 1)) for it in args)
            elif isinstance(args, dict):
                pass
            else:
                raise ValueError(
                    "args entered must be given in a list or dict, "
                    "were given as %s" % type(args))
        elif not template:
            raise TypeError("crawl-init needs a template")
        else:
            args = {}

        pipeline_func = load_pipeline_from_template(
            template, template_func, kwargs=args, return_only=True)

        try:
            pipeline = pipeline_func(**args)
        except Exception as exc:
            raise RuntimeError(
                "Running the pipeline function resulted in %s. "
                "FYI this pipeline only takes the following args: %s"
                % (exc_str(exc), get_func_kwargs_doc(pipeline_func)))

        if not pipeline:
            raise ValueError("returned pipeline is empty")

        if not isinstance(pipeline, list):
            raise ValueError(
                "pipeline should be represented as a list. Got: %r" % pipeline)

        configfile = initiate_pipeline_config(template, template_func, args)

        if save:
            from datalad.api import save
            ds = Dataset(curdir)
            ds.repo.add(configfile, git=True)
            ds.save("committing crawl config file", path=configfile)
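
# Illustrative usage sketch (not part of the original module): the Python
# analog of the second CLI example from the docstring, assuming the
# datalad-crawler extension is installed so `crawl_init` is available via
# `datalad.api`. Run from within the dataset to be configured.
def _example_crawl_init():
    from datalad.api import crawl_init
    crawl_init(
        args={'dataset': 'Baltimore', 'tarballs': 'True'},
        template='fcptable',
        save=True)
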
class AggregateMetaData(Interface):
    """Aggregate metadata of one or more datasets for later query.

    Metadata aggregation refers to a procedure that extracts metadata present
    in a dataset into a portable representation that is stored in a single
    standardized format. Moreover, metadata aggregation can also extract
    metadata in this format from one dataset and store it in another
    (super)dataset. Based on such collections of aggregated metadata it is
    possible to discover particular datasets and specific parts of their
    content, without having to obtain the target datasets first (see the
    DataLad 'search' command).

    To enable aggregation of metadata that are contained in files of a
    dataset, one has to enable one or more metadata extractors for a dataset.
    DataLad supports a number of common metadata standards, such as the
    Exchangeable Image File Format (EXIF), Adobe's Extensible Metadata
    Platform (XMP), and various audio file metadata systems like ID3. DataLad
    extension packages can provide metadata extractors for additional metadata
    sources. For example, the neuroimaging extension provides extractors for
    scientific (meta)data standards like BIDS, DICOM, and NIfTI1. Some
    metadata extractors depend on particular 3rd-party software. The list of
    metadata extractors available to a particular DataLad installation is
    reported by the 'wtf' command ('datalad wtf').

    Enabling a metadata extractor for a dataset is done by adding its name to
    the 'datalad.metadata.nativetype' configuration variable -- typically in
    the dataset's configuration file (.datalad/config), e.g.::

      [datalad "metadata"]
        nativetype = exif
        nativetype = xmp

    If an enabled metadata extractor is not available in a particular DataLad
    installation, metadata extraction will not succeed in order to avoid
    inconsistent aggregation results.

    Enabling multiple extractors is supported. In this case, metadata are
    extracted by each extractor individually, and stored alongside each other.
    Metadata aggregation will also extract DataLad's own metadata (extractors
    'datalad_core', and 'annex').

    Metadata aggregation can be performed recursively, in order to aggregate
    all metadata across all subdatasets, for example, to be able to search
    across any content in any dataset of a collection. Aggregation can also be
    performed for subdatasets that are not available locally. In this case,
    pre-aggregated metadata from the closest available superdataset will be
    considered instead.

    Depending on the versatility of the present metadata and the number of
    datasets or files, aggregated metadata can grow prohibitively large. A
    number of configuration switches are provided to mitigate such issues.

    datalad.metadata.aggregate-content-<extractor-name>
      If set to false, content metadata aggregation will not be performed for
      the named metadata extractor (a potential underscore '_' in the
      extractor name must be replaced by a dash '-'). This can substantially
      reduce the runtime for metadata extraction, and also reduce the size of
      the generated metadata aggregate. Note, however, that some extractors
      may not produce any metadata when this is disabled, because their
      metadata might come from individual file headers only.
      'datalad.metadata.store-aggregate-content' might be a more appropriate
      setting in such cases.

    datalad.metadata.aggregate-ignore-fields
      Any metadata key matching any regular expression in this configuration
      setting is removed prior to generating the dataset-level metadata
      summary (keys and their unique values across all dataset content), and
      from the dataset metadata itself. This switch can also be used to filter
      out sensitive information prior to aggregation.

    datalad.metadata.generate-unique-<extractor-name>
      If set to false, DataLad will not auto-generate a summary of unique
      content metadata values for a particular extractor as part of the
      dataset-global metadata (a potential underscore '_' in the extractor
      name must be replaced by a dash '-'). This can be useful if such a
      summary is bloated due to minor uninformative (e.g. numerical)
      differences, or when a particular extractor already provides a carefully
      designed content metadata summary.

    datalad.metadata.maxfieldsize
      Any metadata value that exceeds the size threshold given by this
      configuration setting (in bytes/characters) is removed.

    datalad.metadata.store-aggregate-content
      If set, extracted content metadata are still used to generate a
      dataset-level summary of present metadata (all keys and their unique
      values across all files in a dataset are determined and stored as part
      of the dataset-level metadata aggregate, see
      datalad.metadata.generate-unique-<extractor-name>), but metadata on
      individual files are not stored. This switch can be used to avoid
      prohibitively large metadata files. Discovery of datasets containing
      content matching particular metadata properties will still be possible,
      but such datasets would have to be obtained first in order to discover
      which particular files in them match these properties.
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""topmost dataset metadata will be aggregated into. All
            datasets between this dataset and any given path will receive
            updated aggregated metadata from all given paths.""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path to datasets that shall be aggregated. When a given
            path is pointing into a dataset, the metadata of the containing
            dataset will be aggregated. If no paths are given, current dataset
            metadata is aggregated.""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        update_mode=Parameter(
            args=('--update-mode',),
            constraints=EnsureChoice('all', 'target'),
            doc="""which datasets to update with newly aggregated metadata:
            all datasets from any leaf dataset to the top-level target dataset
            including all intermediate datasets (all), or just the top-level
            target dataset (target)."""),
        incremental=Parameter(
            args=('--incremental',),
            action='store_true',
            doc="""If set, all information on metadata records of subdatasets
            that have not been (re-)aggregated in this run will be kept
            unchanged. This is useful when (re-)aggregating only a subset of a
            dataset hierarchy, for example, because not all subdatasets are
            locally available."""),
        force_extraction=Parameter(
            args=('--force-extraction',),
            action='store_true',
            doc="""If set, all enabled extractors will be engaged regardless
            of whether change detection indicates that metadata has already
            been extracted for a given dataset state."""),
        save=nosave_opt,
    )

    @staticmethod
    @datasetmethod(name='aggregate_metadata')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            update_mode='target',
            incremental=False,
            force_extraction=False,
            save=True):
        refds_path = Interface.get_refds_path(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(
            dataset, check_installed=True, purpose='metadata aggregation')
        path = assure_list(path)
        if not path:
            # then current/reference dataset is "aggregated"
            # We should not add ds.path always since then --recursive would
            # also recurse current even if paths are given
            path.append(ds.path)

        agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(
            ds,
            # do not warn here, next call triggers the same warning
            warn_absent=False)
        agginfo_db = load_ds_aggregate_db(ds, abspath=True)

        to_save = []
        to_aggregate = set()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='aggregate_metadata',
                # uninstalled subdatasets could be queried via aggregated
                # metadata -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            ap_type = ap.get('type', None)
            ap_state = ap.get('state', None)
            assert('parentds' in ap or ap_type == 'dataset')
            if ap_type == 'dataset' and ap_state != 'absent':
                # a present dataset, we can take directly from it
                aggsrc = ap['path']
                lgr.info('Aggregate metadata for dataset %s', aggsrc)
            else:
                # everything else needs to come from the parent
                aggsrc = ap['parentds']
                if ap_state == 'absent':
                    lgr.info(
                        'Attempt to use pre-aggregate metadata for absent %s '
                        'from dataset at %s',
                        ap['path'], aggsrc)
                else:
                    lgr.info(
                        'Aggregate metadata for %s from dataset at %s',
                        ap['path'], aggsrc)

            to_aggregate.add(aggsrc)

            if ap_state == 'absent':
                # key thought: recursive is done by path annotation, hence
                # once we hit an absent dataset, we are 100% certain that
                # there is nothing to recurse into on the file system
                # hence we only have to look into the aggregated metadata
                # of the last available dataset in the dataset tree edge
                #
                # if there is nothing at this path, we need to look into the
                # parentds and check if we know anything about this path
                # if we do, we need to grab all the info and objects
                # if not, we need to error
                res = _get_dsinfo_from_aggmetadata(
                    aggsrc, ap['path'], recursive, agginfo_db)
                if not isinstance(res, list):
                    yield get_status_dict(
                        status='impossible',
                        message=res,
                        action='aggregate_metadata',
                        path=ap['path'],
                        logger=lgr)
                    continue
                # cue for aggregation
                to_aggregate.update(res)
            else:
                # actually aggregate metadata for this dataset, immediately
                # place generated objects into the aggregated or reference
                # dataset, and put info into DB to get it distributed to all
                # datasets that need to be updated
                errored = _dump_extracted_metadata(
                    ds,
                    Dataset(aggsrc),
                    agginfo_db,
                    to_save,
                    force_extraction,
                    agg_base_path)
                if errored:
                    yield get_status_dict(
                        status='error',
                        message='Metadata extraction failed (see previous '
                                'error message, set '
                                'datalad.runtime.raiseonerror=yes to fail '
                                'immediately)',
                        action='aggregate_metadata',
                        path=aggsrc,
                        logger=lgr)

        # at this point we have dumped all aggregated metadata into object
        # files somewhere, we know what needs saving, but haven't saved
        # anything yet, and we know about the states of all aggregated
        # datasets in the DB
        # what remains to do is to update all datasets, so they have their own
        # copy of aggregated metadata and update their respective
        # aggregate.json with info on what states we just aggregated from

        # first, let's figure out what datasets need updating at all
        # get adjacency info of the dataset tree spanning the base to all
        # leaf datasets associated with the path arguments
        if update_mode == 'all':
            ds_adj = {}
            discover_dataset_trace_to_targets(
                ds.path, to_aggregate, [], ds_adj,
                # we know that to_aggregate only lists datasets, existing and
                # absent ones -- we want to aggregate all of them, either from
                # just extracted metadata, or from previously aggregated
                # metadata of the closest superdataset
                includeds=to_aggregate)
            # TODO we need to work in the info about datasets that we only got
            # from aggregated metadata, that had no trace on the file system,
            # in here!!
            subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
        elif update_mode == 'target':
            subtrees = {ds.path: list(agginfo_db.keys())}
        else:
            raise ValueError(
                "unknown `update_mode` '%s' for metadata aggregation"
                % update_mode)

        # go over datasets in bottom-up fashion
        for parentds_path in sorted(subtrees, reverse=True):
            lgr.info('Update aggregate metadata in dataset at: %s',
                     parentds_path)
            _update_ds_agginfo(
                ds.path,
                parentds_path,
                subtrees[parentds_path],
                incremental,
                agginfo_db,
                to_save)
            # update complete
            res = get_status_dict(
                status='ok',
                action='aggregate_metadata',
                path=parentds_path,
                type='dataset',
                logger=lgr)
            res.update(agginfo_db.get(parentds_path, {}))
            yield res

        #
        # save potential modifications to dataset global metadata
        #
        if not to_save:
            return
        lgr.info('Attempting to save %i files/datasets', len(to_save))
        for res in Save.__call__(
                path=to_save,
                dataset=refds_path,
                message='[DATALAD] Dataset aggregate metadata update',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
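
# Illustrative usage sketch (not part of the original module): recursively
# aggregate metadata across a dataset hierarchy and update only the top-level
# dataset. The superdataset location is hypothetical.
def _example_aggregate_metadata():
    from datalad.api import Dataset
    ds = Dataset('/tmp/some/superds')
    for res in ds.aggregate_metadata(
            recursive=True,
            update_mode='target',
            return_type='generator'):
        print(res['action'], res['path'], res['status'])
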
class Uninstall(Interface):
    """Uninstall subdatasets

    This command can be used to uninstall any number of installed
    subdatasets. If a to-be-uninstalled subdataset contains presently
    installed subdatasets itself, their recursive removal has to be enabled
    explicitly to avoid the command exiting with an error. This command will
    error if individual files or non-dataset directories are given as input
    (use the drop or remove command depending on the desired goal), nor will
    it uninstall top-level datasets (i.e. datasets that are not a subdataset
    in another dataset; use the remove command for this purpose).

    By default, the availability of at least one remote copy for each
    currently available file in any dataset is verified. As these checks could
    lead to slow operation (network latencies, etc), they can be disabled.

    Any number of paths to process can be given as input. Recursion into
    subdatasets needs to be explicitly enabled, while recursion into
    subdirectories within a dataset is always done automatically. An optional
    recursion limit is applied relative to each given input path.

    Examples:

      Uninstall a subdataset (undo installation)::

        ~/some/dataset$ datalad uninstall somesubdataset1
    """
    _action = 'uninstall'

    _params_ = dict(
        dataset=dataset_argument,
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="path/name of the component to be uninstalled",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        check=check_argument,
        if_dirty=if_dirty_opt,
    )

    @staticmethod
    @datasetmethod(name=_action)
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            check=True,
            if_dirty='save-before'):

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='uninstall', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `uninstall`: requires at least "
                "a path or dataset")

        to_uninstall = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                action='uninstall',
                # justification for status:
                # content need not be uninstalled where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # upfront sanity and compliance checks
            # check that we have no top-level datasets and no files to process
            if ap.get('type') == 'dataset' and \
                    not ap.get('state', None) == 'absent' and \
                    path_is_under([ap['path']]):  # wants a sequence!
                ap.update(
                    status='error',
                    message="refusing to uninstall current or parent "
                            "directory")
                yield ap
                continue
            if not ap.get('type', None) == 'dataset':
                ap.update(
                    status='impossible',
                    message="can only uninstall datasets (consider the "
                            "`drop` command)")
                yield ap
                continue
            # we only have dataset from here
            if not ap.get('parentds', None):
                # this could be a side-effect of the specific call semantics.
                # As stated in #1714, we are not really interested in whether
                # a superdataset was obvious in the call, but only whether
                # there is a superdataset at all. So let's look for one, and
                # only barf when there really isn't
                parentds = Dataset(ap['path']).get_superdataset(
                    datalad_only=False,
                    topmost=False,
                    # unless it is properly registered we have no way of
                    # reinstalling it
                    registered_only=True)
                if parentds is None:
                    ap.update(
                        status='error',
                        message="will not uninstall top-level dataset "
                                "(consider `remove` command)")
                    yield ap
                    continue
                ap['parentds'] = parentds.path
            if not ap['path'] == refds_path:
                ap['process_content'] = True
            to_uninstall.append(ap)

        # iterate over all datasets, starting at the bottom
        # to deinit contained submodules first
        for ap in sorted(to_uninstall, key=lambda x: x['path'], reverse=True):
            if ap.get('state', None) == 'absent':
                # already gone
                continue
            ds = Dataset(ap['path'])
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # we confirmed the super dataset presence above
            for r in _uninstall_dataset(ds, check=check, has_super=True,
                                        **res_kwargs):
                yield r
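
# Illustrative usage sketch (not part of the original module): uninstall a
# subdataset (including its installed subdatasets) while keeping its
# registration in the parent; check=False skips the remote-copy verification.
# Paths are hypothetical.
def _example_uninstall():
    from datalad.api import Dataset
    ds = Dataset('/tmp/some/dataset')
    for res in ds.uninstall(
            path='somesubdataset1',
            recursive=True,
            check=False,
            return_type='generator',
            on_failure='ignore'):
        print(res['path'], res['status'], res.get('message', ''))
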
class Addurls(Interface):
    """Create and update a dataset from a list of URLs.

    *Format specification*

    Several arguments take format strings. These are similar to normal Python
    format strings where the names from `URL-FILE` (column names for a CSV or
    properties for JSON) are available as placeholders. If `URL-FILE` is a CSV
    file, a positional index can also be used (i.e., "{0}" for the first
    column). Note that a placeholder cannot contain a ':' or '!'.

    In addition, the `FILENAME-FORMAT` argument has a few special
    placeholders.

      - _repindex

        The constructed file names must be unique across all rows. To avoid
        collisions, the special placeholder "_repindex" can be added to the
        formatter. Its value will start at 0 and increment every time a file
        name repeats.

      - _url_hostname, _urlN, _url_basename*

        Various parts of the formatted URL are available. Take
        "http://datalad.org/asciicast/seamless_nested_repos.sh" as an example.

        "datalad.org" is stored as "_url_hostname". Components of the URL's
        path can be referenced as "_urlN". "_url0" and "_url1" would map to
        "asciicast" and "seamless_nested_repos.sh", respectively. The final
        part of the path is also available as "_url_basename".

        This name is broken down further. "_url_basename_root" and
        "_url_basename_ext" provide access to the root name and extension.
        These values are similar to the result of os.path.splitext, but, in
        the case of multiple periods, the extension is identified using the
        same length heuristic that git-annex uses. As a result, the extension
        of "file.tar.gz" would be ".tar.gz", not ".gz". In addition, the
        fields "_url_basename_root_py" and "_url_basename_ext_py" provide
        access to the result of os.path.splitext.

      - _url_filename*

        These are similar to _url_basename* fields, but they are obtained
        with a server request. This is useful if the file name is set in the
        Content-Disposition header.

    *Examples*

    Consider a file "avatars.csv" that contains::

        who,ext,link
        neurodebian,png,https://avatars3.githubusercontent.com/u/260793
        datalad,png,https://avatars1.githubusercontent.com/u/8927200

    To download each link into a file name composed of the 'who' and 'ext'
    fields, we could run::

      $ datalad addurls -d avatar_ds --fast avatars.csv '{link}' '{who}.{ext}'

    The `-d avatar_ds` is used to create a new dataset in "$PWD/avatar_ds".

    If we were already in a dataset and wanted to create a new subdataset in
    an "avatars" subdirectory, we could use "//" in the `FILENAME-FORMAT`
    argument::

      $ datalad addurls --fast avatars.csv '{link}' 'avatars//{who}.{ext}'

    .. note::

       For users familiar with 'git annex addurl': A large part of this
       plugin's functionality can be viewed as transforming data from
       `URL-FILE` into a "url filename" format that is fed to 'git annex
       addurl --batch --with-files'.
    """
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureChoice, EnsureNone, EnsureStr
    from datalad.support.param import Parameter

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""Add the URLs to this dataset (or possibly subdatasets of
            this dataset). An empty or non-existent directory is passed to
            create a new dataset. New subdatasets can be specified with
            `FILENAME-FORMAT`.""",
            constraints=EnsureDataset() | EnsureNone()),
        urlfile=Parameter(
            args=("urlfile",),
            metavar="URL-FILE",
            doc="""A file that contains URLs or information that can be used
            to construct URLs. Depending on the value of --input-type, this
            should be a CSV file (with a header as the first row) or a JSON
            file (structured as a list of objects with string values)."""),
        urlformat=Parameter(
            args=("urlformat",),
            metavar="URL-FORMAT",
            doc="""A format string that specifies the URL for each entry. See
            the 'Format Specification' section above."""),
        filenameformat=Parameter(
            args=("filenameformat",),
            metavar="FILENAME-FORMAT",
            doc="""Like `URL-FORMAT`, but this format string specifies the
            file to which the URL's content will be downloaded. The file name
            may contain directories. The separator "//" can be used to
            indicate that the left-side directory should be created as a new
            subdataset. See the 'Format Specification' section above."""),
        input_type=Parameter(
            args=("-t", "--input-type"),
            metavar="TYPE",
            doc="""Whether `URL-FILE` should be considered a CSV file or a
            JSON file. The default value, "ext", means to consider `URL-FILE`
            as a JSON file if it ends with ".json". Otherwise, treat it as a
            CSV file.""",
            constraints=EnsureChoice("ext", "csv", "json")),
        exclude_autometa=Parameter(
            args=("-x", "--exclude_autometa"),
            metavar="REGEXP",
            doc="""By default, metadata field=value pairs are constructed with
            each column in `URL-FILE`, excluding any single column that is
            specified via `URL-FORMAT`. This argument can be used to exclude
            columns that match a regular expression. If set to '*' or an empty
            string, automatic metadata extraction is disabled completely. This
            argument does not affect metadata set explicitly with --meta."""),
        meta=Parameter(
            args=("-m", "--meta",),
            metavar="FORMAT",
            action="append",
            doc="""A format string that specifies metadata. It should be
            structured as "<field>=<value>". As an example, "location={3}"
            would mean that the value for the "location" metadata field
            should be set to the value of the fourth column. This option can
            be given multiple times."""),
        message=Parameter(
            args=("--message",),
            metavar="MESSAGE",
            doc="""Use this message when committing the URL additions.""",
            constraints=EnsureNone() | EnsureStr()),
        dry_run=Parameter(
            args=("-n", "--dry-run"),
            action="store_true",
            doc="""Report which URLs would be downloaded to which files and
            then exit."""),
        fast=Parameter(
            args=("--fast",),
            action="store_true",
            doc="""If True, add the URLs, but don't download their content.
            Underneath, this passes the --fast flag to `git annex
            addurl`."""),
        ifexists=Parameter(
            args=("--ifexists",),
            metavar="ACTION",
            doc="""What to do if a constructed file name already exists. The
            default behavior is to proceed with the `git annex addurl`, which
            will fail if the file size has changed. If set to 'overwrite',
            remove the old file before adding the new one. If set to 'skip',
            do not add the new file.""",
            constraints=EnsureNone() | EnsureChoice("overwrite", "skip")),
        missing_value=Parameter(
            args=("--missing-value",),
            metavar="VALUE",
            doc="""When an empty string is encountered, use this value
            instead.""",
            constraints=EnsureNone() | EnsureStr()),
        save=nosave_opt,
        version_urls=Parameter(
            args=("--version-urls",),
            action="store_true",
            doc="""Try to add a version ID to the URL. This currently only has
            an effect on URLs for AWS S3 buckets."""),
    )

    @staticmethod
    @datasetmethod(name='addurls')
    @eval_results
    def __call__(dataset, urlfile, urlformat, filenameformat,
                 input_type="ext", exclude_autometa=None, meta=None,
                 message=None, dry_run=False, fast=False, ifexists=None,
                 missing_value=None, save=True, version_urls=False):
        # Temporarily work around gh-2269.
        url_file = urlfile
        url_format, filename_format = urlformat, filenameformat

        from requests.exceptions import RequestException

        from datalad.distribution.add import Add
        from datalad.distribution.create import Create
        from datalad.distribution.dataset import Dataset, require_dataset
        from datalad.interface.results import get_status_dict
        from datalad.support.annexrepo import AnnexRepo

        lgr = logging.getLogger("datalad.plugin.addurls")

        dataset = require_dataset(dataset, check_installed=False)
        if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message="not an annex repo")
            return

        if input_type == "ext":
            extension = os.path.splitext(url_file)[1]
            input_type = "json" if extension == ".json" else "csv"

        with open(url_file) as fd:
            try:
                rows, subpaths = extract(fd, input_type,
                                         url_format, filename_format,
                                         exclude_autometa, meta,
                                         dry_run,
                                         missing_value)
            except (ValueError, RequestException) as exc:
                yield get_status_dict(action="addurls",
                                      ds=dataset,
                                      status="error",
                                      message=exc_str(exc))
                return

        if len(rows) != len(set(row["filename"] for row in rows)):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=("There are file name collisions; "
                                           "consider using {_repindex}"))
            return

        if dry_run:
            for subpath in subpaths:
                lgr.info("Would create a subdataset at %s", subpath)
            for row in rows:
                lgr.info("Would download %s to %s",
                         row["url"],
                         os.path.join(dataset.path, row["filename"]))
                lgr.info("Metadata: %s",
                         sorted(u"{}={}".format(k, v)
                                for k, v in row["meta_args"].items()))
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="ok",
                                  message="dry-run finished")
            return

        if not dataset.repo:
            # Populate a new dataset with the URLs.
            for r in dataset.create(result_xfm=None,
                                    return_type='generator',
                                    save=save):
                yield r

        annex_options = ["--fast"] if fast else []

        for spath in subpaths:
            if os.path.exists(os.path.join(dataset.path, spath)):
                lgr.warning(
                    "Not creating subdataset at existing path: %s",
                    spath)
            else:
                for r in dataset.create(spath, result_xfm=None,
                                        return_type='generator', save=save):
                    yield r

        for row in rows:
            # Add additional information that we'll need for various
            # operations.
            filename_abs = os.path.join(dataset.path, row["filename"])
            if row["subpath"]:
                ds_current = Dataset(os.path.join(dataset.path,
                                                  row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = dataset
                ds_filename = row["filename"]
            row.update({"filename_abs": filename_abs,
                        "ds": ds_current,
                        "ds_filename": ds_filename})

        if version_urls:
            num_urls = len(rows)
            log_progress(lgr.info, "addurls_versionurls",
                         "Versioning %d URLs", num_urls,
                         label="Versioning URLs",
                         total=num_urls, unit=" URLs")
            for row in rows:
                url = row["url"]
                try:
                    row["url"] = get_versioned_url(url)
                except (ValueError, NotImplementedError) as exc:
                    # We don't expect this to happen because get_versioned_url
                    # should return the original URL if it isn't an S3 bucket.
                    # It only raises exceptions if it doesn't know how to
                    # handle the scheme for what looks like an S3 bucket.
                    lgr.warning("error getting version of %s: %s",
                                row["url"], exc_str(exc))
                log_progress(lgr.info, "addurls_versionurls",
                             "Versioned result for %s: %s", url, row["url"],
                             update=1, increment=True)
            log_progress(lgr.info, "addurls_versionurls",
                         "Finished versioning URLs")

        files_to_add = set()
        for r in add_urls(rows, ifexists=ifexists, options=annex_options):
            if r["status"] == "ok":
                files_to_add.add(r["path"])
            yield r

        msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

        if files_to_add:
            for r in dataset.add(files_to_add, save=False):
                yield r

            meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
            for r in add_meta(meta_rows):
                yield r

            # Save here rather than the add call above to trigger a metadata
            # commit on the git-annex branch.
            if save:
                for r in dataset.save(message=msg, recursive=True):
                    yield r
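
# Illustrative usage sketch (not part of the original module): the Python
# analog of the "avatars.csv" example from the docstring. The dataset and
# input file are hypothetical.
def _example_addurls():
    from datalad.api import addurls
    for res in addurls(
            dataset='avatar_ds',
            urlfile='avatars.csv',
            urlformat='{link}',
            filenameformat='{who}.{ext}',
            fast=True,
            return_type='generator'):
        print(res['action'], res.get('path', ''), res['status'])
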
class Subdatasets(Interface): r"""Report subdatasets and their properties. The following properties are reported (if possible) for each matching subdataset record. "name" Name of the subdataset in the parent (often identical with the relative path in the parent dataset) "path" Absolute path to the subdataset "parentds" Absolute path to the parent dataset "gitshasum" SHA1 of the subdataset commit recorded in the parent dataset "state" Condition of the subdataset: 'absent', 'present' "gitmodule_url" URL of the subdataset recorded in the parent "gitmodule_name" Name of the subdataset recorded in the parent "gitmodule_<label>" Any additional configuration property on record. Performance note: Property modification, requesting `bottomup` reporting order, or a particular numerical `recursion_limit` implies an internal switch to an alternative query implementation for recursive query that is more flexible, but also notably slower (performs one call to Git per dataset versus a single call for all combined). The following properties for subdatasets are recognized by DataLad (without the 'gitmodule\_' prefix that is used in the query results): "datalad-recursiveinstall" If set to 'skip', the respective subdataset is skipped when DataLad is recursively installing its superdataset. However, the subdataset remains installable when explicitly requested, and no other features are impaired. "datalad-url" If a subdataset was originally established by cloning, 'datalad-url' records the URL that was used to do so. This might be different from 'url' if the URL contains datalad specific pieces like any URL of the form "ria+<some protocol>...". """ _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to query. If no dataset is given, an attempt is made to identify the dataset based on the input and/or the current working directory""", constraints=EnsureDataset() | EnsureNone()), path=Parameter( args=("path", ), metavar='PATH', doc="""path/name to query for subdatasets. Defaults to the current directory[PY: , or the entire dataset if called as a dataset method PY].""", nargs='*', constraints=EnsureStr() | EnsureNone()), fulfilled=Parameter( args=("--fulfilled", ), doc="""if given, must be a boolean flag indicating whether to report either only locally present or absent datasets. By default subdatasets are reported regardless of their status""", constraints=EnsureBool() | EnsureNone()), recursive=recursion_flag, recursion_limit=recursion_limit, contains=Parameter( args=('--contains', ), metavar='PATH', action='append', doc="""limit report to the subdatasets containing the given path. If a root path of a subdataset is given the last reported dataset will be the subdataset itself.[CMD: This option can be given multiple times CMD][PY: Can be a list with multiple paths PY], in which case datasets will be reported that contain any of the given paths.""", constraints=EnsureStr() | EnsureNone()), bottomup=Parameter( args=("--bottomup", ), action="store_true", doc="""whether to report subdatasets in bottom-up order along each branch in the dataset tree, and not top-down."""), set_property=Parameter( args=('--set-property', ), metavar=('NAME', 'VALUE'), nargs=2, action='append', doc="""Name and value of one or more subdataset properties to be set in the parent dataset's .gitmodules file. The property name is case-insensitive, must start with a letter, and consist only of alphanumeric characters. The value can be a Python format() template string wrapped in '<>' (e.g. 
'<{gitmodule_name}>'). Supported keywords are any item reported in the result properties of this command, plus 'refds_relpath' and 'refds_relname': the relative path of a subdataset with respect to the base dataset of the command call, and, in the latter case, the same string with all directory separators replaced by dashes.[CMD: This option can be given multiple times. CMD]""", constraints=EnsureStr() | EnsureNone()), delete_property=Parameter( args=('--delete-property', ), metavar='NAME', action='append', doc="""Name of one or more subdataset properties to be removed from the parent dataset's .gitmodules file.[CMD: This option can be given multiple times. CMD]""", constraints=EnsureStr() | EnsureNone())) @staticmethod @datasetmethod(name='subdatasets') @eval_results def __call__(path=None, dataset=None, fulfilled=None, recursive=False, recursion_limit=None, contains=None, bottomup=False, set_property=None, delete_property=None): ds = require_dataset(dataset, check_installed=True, purpose='report on subdataset(s)') paths = resolve_path(ensure_list(path), dataset, ds) if path else None # no constraints given -> query subdatasets under curdir if not paths and dataset is None: cwd = Path(getpwd()) paths = None if cwd == ds.pathobj else [cwd] lgr.debug('Query subdatasets of %s', dataset) if paths is not None: lgr.debug('Query subdatasets underneath paths: %s', paths) refds_path = ds.path # return as quickly as possible if isinstance(recursion_limit, int) and (recursion_limit <= 0): return if set_property: for k, v in set_property: if valid_key.match(k) is None: raise ValueError( "key '%s' is invalid (alphanumeric plus '-' only, must " "start with a letter)" % k) if contains: contains = resolve_path(ensure_list(contains), dataset, ds) # expand all test cases for the contains test in the loop below # leads to ~20% speedup per loop iteration of a non-match expanded_contains = [[c] + list(c.parents) for c in contains] else: expanded_contains = [] contains_hits = set() for r in _get_submodules(ds, paths, fulfilled, recursive, recursion_limit, expanded_contains, bottomup, set_property, delete_property, refds_path): # a boat-load of ancient code consumes this and is ignorant of # Path objects r['path'] = str(r['path']) # without the refds_path cannot be rendered/converted relative # in the eval_results decorator r['refds'] = refds_path if 'contains' in r: contains_hits.update(r['contains']) r['contains'] = [str(c) for c in r['contains']] yield r if contains: for c in set(contains).difference(contains_hits): yield get_status_dict( 'subdataset', path=str(c), status='impossible', message='path not contained in any matching subdataset', # we do not want to log such an event, because it is a # legit query to check for matching subdatasets simply # for the purpose of further decision making # user communication in front-end scenarios will happen # via result rendering #logger=lgr )
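# Minimal usage sketch for the Python API (assumes an installed DataLad and
# a dataset with registered subdatasets in the current directory):
import datalad.api as dl

for res in dl.subdatasets(dataset='.', recursive=True,
                          return_type='generator',
                          result_renderer='disabled'):
    # each result is a plain dict carrying the properties documented above
    print(res['state'], res['path'])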
class Dump(Interface): """Query a dataset's aggregated metadata for dataset and file metadata Two types of metadata are supported: 1. metadata describing a dataset as a whole (dataset-global metadata), and 2. metadata for files in a dataset (content metadata). The DATASET_FILE_PATH_PATTERN argument specifies dataset and file patterns that are matched against the dataset and file information in the metadata. There are two formats, UUID-based and dataset-tree-based. The formats are: TREE: ["tree:"] [DATASET_PATH] ["@" VERSION-DIGITS] [":" [LOCAL_PATH]] UUID: "uuid:" UUID-DIGITS ["@" VERSION-DIGITS] [":" [LOCAL_PATH]] (the tree-format is the default format and does not require a prefix). """ # Use a custom renderer to emit a self-contained metadata record. The # emitted record can be fed into meta-add for example. result_renderer = 'tailored' _examples_ = [ dict( text='Dump the metadata of the file "dataset_description.json" in ' 'the dataset "simon". (The queried dataset git-repository is ' 'determined based on the current working directory)', code_cmd="datalad meta-dump simon:dataset_description.json"), dict(text="Sometimes it is helpful to get metadata records formatted " "in a more accessible form, here as pretty-printed JSON", code_cmd="datalad -f json_pp meta-dump " "simon:dataset_description.json"), dict(text="Same query as above, but specify that all datasets should " "be queried for the given path", code_cmd="datalad meta-dump -d . :somedir/subdir/thisfile.dat"), dict(text="Dump any metadata record of any dataset known to the " "queried dataset", code_cmd="datalad meta-dump -r"), dict(text="Show metadata for all datasets", code_cmd="datalad -f json_pp meta-dump -r"), dict(text="Show metadata for all files ending in `.json` in the root " "directories of all datasets", code_cmd="datalad -f json_pp meta-dump *:*.json -r"), dict(text="Show metadata for all files ending in `.json` in all " "datasets by not specifying a dataset at all. This will " "start dumping at the top-level dataset.", code_cmd="datalad -f json_pp meta-dump :*.json -r") ] _params_ = dict( backend=Parameter(args=("--backend", ), metavar="BACKEND", doc="""metadata storage backend to be used.""", constraints=EnsureChoice("git")), metadata_store=Parameter( args=("-m", "--metadata-store"), metavar="METADATA_STORE", doc="""Directory in which the metadata model instance is stored (often this is the same directory as the dataset directory). If no directory name is provided, the current working directory is used."""), path=Parameter(args=("path", ), metavar="DATASET_FILE_PATH_PATTERN", doc="path to query metadata for", constraints=EnsureStr() | EnsureNone(), nargs='?'), recursive=Parameter( args=( "-r", "--recursive", ), action="store_true", doc="""if set, recursively report on any matching metadata based on given paths or reference dataset. Note, setting this option does not cause any recursion into potential subdatasets on the filesystem. It merely determines what metadata is being reported from the given/discovered reference dataset.""")) @staticmethod @datasetmethod(name='meta_dump') @eval_results def __call__(backend="git", metadata_store=None, path="", recursive=False): metadata_store = metadata_store or "."
tree_version_list, uuid_set = get_top_level_metadata_objects( default_mapper_family, metadata_store) # We require both entry points to exist for valid metadata if tree_version_list is None or uuid_set is None: message = (f"No {backend}-mapped datalad metadata " f"model found in: {metadata_store}") lgr.warning(message) yield dict(action="meta_dump", status='impossible', backend=backend, metadata_store=metadata_store, message=message) return parser = MetadataURLParser(path) metadata_url = parser.parse() if isinstance(metadata_url, TreeMetadataURL): yield from dump_from_dataset_tree(backend, metadata_store, tree_version_list, metadata_url, recursive) elif isinstance(metadata_url, UUIDMetadataURL): yield from dump_from_uuid_set(backend, metadata_store, uuid_set, metadata_url, recursive) return @staticmethod def custom_result_renderer(res, **kwargs): if res["status"] != "ok" or res.get("action", "") != 'meta_dump': # logging complained about this already return render_dataset_level_metadata(res["metadata"].get( "dataset_level_metadata", dict())) render_file_level_metadata(res["metadata"].get("file_level_metadata", dict()))
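# Illustration of the documented TREE/UUID pattern grammar (this is not the
# actual MetadataURLParser, just a sketch of how such a spec decomposes):
def split_metadata_pattern(spec):
    scheme = "tree"
    if spec.startswith("uuid:"):
        scheme, spec = "uuid", spec[len("uuid:"):]
    elif spec.startswith("tree:"):
        spec = spec[len("tree:"):]
    dataset_part, _, local_path = spec.partition(":")
    dataset_part, _, version = dataset_part.partition("@")
    return scheme, dataset_part or None, version or None, local_path or None

print(split_metadata_pattern("simon:dataset_description.json"))
# -> ('tree', 'simon', None, 'dataset_description.json')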
""" __docformat__ = 'restructuredtext' from datalad.interface.results import known_result_xfms from datalad.support.param import Parameter from datalad.support.constraints import EnsureInt, EnsureNone, EnsureStr from datalad.support.constraints import EnsureChoice from datalad.support.constraints import EnsureCallable location_description = Parameter( args=( "-D", "--description", ), constraints=EnsureStr() | EnsureNone(), doc="""short description to use for a dataset location. Its primary purpose is to help humans to identify a dataset copy (e.g., "mike's dataset on lab server"). Note that when a dataset is published, this information becomes available on the remote side.""") recursion_flag = Parameter(args=( "-r", "--recursive", ), action="store_true", doc="""if set, recurse into potential subdataset""") recursion_limit = Parameter( args=( "-R",
class Clean(Interface): """Clean up after DataLad (possible temporary files etc.) Removes extracted temporary archives, etc. Examples: $ datalad clean """ _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to perform the clean operation on. If no dataset is given, an attempt is made to identify the dataset in current working directory""", constraints=EnsureDataset() | EnsureNone()), # TODO: --info -- which performs dry run just summarizing what is to be cleaned up # TODO: Python only??? what=Parameter( args=("--what", ), dest='what', choices=('cached-archives', 'annex-tmp', 'annex-transfer', 'search-index'), nargs="*", doc="""What to clean. If none specified -- all known targets are cleaned"""), recursive=recursion_flag, recursion_limit=recursion_limit, ) @staticmethod @datasetmethod(name='clean') @eval_results def __call__(dataset=None, what=None, recursive=False, recursion_limit=None): ds = require_dataset(dataset, purpose='clean-up') res_kwargs = dict(action='clean', logger=lgr, refds=ds.path) for wds in itertools.chain( [ds], ds.subdatasets(fulfilled=True, recursive=recursive, recursion_limit=recursion_limit, return_type='generator', result_renderer='disabled', result_xfm='datasets') if recursive else []): d = wds.path gitdir = GitRepo.get_git_dir(d) DIRS_PLURAL = ("directory", "directories") FILES_PLURAL = ("file", "files") for dirpath, flag, msg, sing_pl in [ (ARCHIVES_TEMP_DIR, "cached-archives", "temporary archive", DIRS_PLURAL), (ANNEX_TEMP_DIR, "annex-tmp", "temporary annex", FILES_PLURAL), (ANNEX_TRANSFER_DIR, "annex-transfer", "annex temporary transfer", DIRS_PLURAL), (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index', "metadata search index", FILES_PLURAL), ]: topdir = opj(d, dirpath) lgr.debug("Considering to clean %s:%s", d, dirpath) if not ((what is None) or (flag in what)): yield get_status_dict(path=topdir, status='notneeded', type='directory', **res_kwargs) continue paths = glob(opj(topdir, '*')) if not paths: yield get_status_dict(path=topdir, status='notneeded', type='directory', **res_kwargs) continue pl = len(paths) > 1 message = ("Removed %d %s %s: %s", len(paths), msg, sing_pl[int(pl)], ", ".join( sorted([x[len(topdir) + 1:] for x in paths]))) rmtree(topdir) yield get_status_dict(path=topdir, status='ok', type='dir', message=message, **res_kwargs)
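# Usage sketch (assumes datalad.api is importable): restrict cleaning to
# cached extracted archives across the installed dataset hierarchy.
import datalad.api as dl

for res in dl.clean(dataset='.', what=['cached-archives'],
                    recursive=True, return_type='generator'):
    print(res['status'], res['path'])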
class Create(Interface): """Create a new dataset from scratch. This command initializes a new :term:`dataset` at a given location, or the current directory. The new dataset can optionally be registered in an existing :term:`superdataset` (the new dataset's path needs to be located within the superdataset for that, and the superdataset needs to be given explicitly). It is recommended to provide a brief description to label the dataset's nature *and* location, e.g. "Michael's music on black laptop". This helps humans to identify data locations in distributed scenarios. By default an identifier comprised of user and machine name, plus path will be generated. Plain Git repositories can be created via the [PY: `no_annex` PY][CMD: --no-annex CMD] flag. However, the result will not be a full dataset, and, consequently, not all features are supported (e.g. a description). || REFLOW >> To create a local version of a remote dataset use the :func:`~datalad.api.install` command instead. << REFLOW || .. note:: Power-user info: This command uses :command:`git init`, and :command:`git annex init` to prepare the new dataset. Registering to a superdataset is performed via a :command:`git submodule add` operation in the discovered superdataset. """ _params_ = dict( path=Parameter( args=("path", ), metavar='PATH', doc="""path where the dataset shall be created, directories will be created as necessary. If no location is provided, a dataset will be created in the current working directory. Either way the command will error if the target directory is not empty. Use `force` to create a dataset in a non-empty directory.""", nargs='?', # put dataset 2nd to avoid useless conversion constraints=EnsureStr() | EnsureDataset() | EnsureNone()), dataset=Parameter( args=("-d", "--dataset"), metavar='PATH', doc="""specify the dataset to perform the create operation on. If a dataset is give, a new subdataset will be created in it.""", constraints=EnsureDataset() | EnsureNone()), force=Parameter( args=( "-f", "--force", ), doc="""enforce creation of a dataset in a non-empty directory""", action='store_true'), description=dataset_description, no_annex=Parameter( args=("--no-annex", ), doc="""if set, a plain Git repository will be created without any annex""", action='store_true'), save=nosave_opt, if_dirty=if_dirty_opt, annex_version=Parameter( args=("--annex-version", ), doc="""select a particular annex repository version. The list of supported versions depends on the available git-annex version. This should be left untouched, unless you know what you are doing""", constraints=EnsureDType(int) | EnsureNone()), annex_backend=Parameter( args=("--annex-backend", ), constraints=EnsureStr() | EnsureNone(), # not listing choices here on purpose to avoid future bugs doc="""set default hashing backend used by the new dataset. For a list of supported backends see the git-annex documentation. The default is optimized for maximum compatibility of datasets across platforms (especially those with limited path lengths)""", nargs=1), native_metadata_type=Parameter( args=('--native-metadata-type', ), metavar='LABEL', action='append', constraints=EnsureStr() | EnsureNone(), doc="""Metadata type label. Must match the name of the respective parser implementation in Datalad (e.g. 
"bids").[CMD: This option can be given multiple times CMD]"""), shared_access=shared_access_opt, git_opts=git_opts, annex_opts=annex_opts, annex_init_opts=annex_init_opts, ) @staticmethod @datasetmethod(name='create') def __call__(path=None, force=False, description=None, dataset=None, no_annex=False, save=True, annex_version=None, annex_backend='MD5E', native_metadata_type=None, if_dirty='save-before', shared_access=None, git_opts=None, annex_opts=None, annex_init_opts=None): # two major cases # 1. we got a `dataset` -> we either want to create it (path is None), # or another dataset in it (path is not None) # 2. we got no dataset -> we want to create a fresh dataset at the # desired location, either at `path` or PWD # sanity check first if git_opts: lgr.warning( "`git_opts` argument is presently ignored, please complain!") if no_annex: if description: raise ValueError("Incompatible arguments: cannot specify " "description for annex repo and declaring " "no annex repo.") if annex_opts: raise ValueError("Incompatible arguments: cannot specify " "options for annex and declaring no " "annex repo.") if annex_init_opts: raise ValueError("Incompatible arguments: cannot specify " "options for annex init and declaring no " "annex repo.") if not isinstance(force, bool): raise ValueError( "force should be bool, got %r. Did you mean to provide a 'path'?" % force) # straight from input arg, no messing around before this if path is None: if dataset is None: # nothing given explicity, assume create fresh right here path = getpwd() else: # no path, but dataset -> create that dataset path = dataset.path else: # resolve the path against a potential dataset path = resolve_path(path, ds=dataset) # we know that we need to create a dataset at `path` assert (path is not None) if git_opts is None: git_opts = {} if shared_access: # configure `git --shared` value git_opts['shared'] = shared_access # check for sane subdataset path real_targetpath = with_pathsep(realpath(path)) # realpath OK if dataset is not None: # make sure we get to an expected state if dataset.is_installed(): handle_dirty_dataset(dataset, if_dirty) if not real_targetpath.startswith( # realpath OK with_pathsep(realpath(dataset.path))): # realpath OK raise ValueError("path {} outside {}".format(path, dataset)) # important to use the given Dataset object to avoid spurious ID # changes with not-yet-materialized Datasets tbds = dataset if dataset is not None and dataset.path == path else Dataset( path) # don't create in non-empty directory without `force`: if isdir(tbds.path) and listdir(tbds.path) != [] and not force: raise ValueError("Cannot create dataset in directory %s " "(not empty). Use option 'force' in order to " "ignore this and enforce creation." 
% tbds.path) if no_annex: lgr.info("Creating a new git repo at %s", tbds.path) GitRepo(tbds.path, url=None, create=True, git_opts=git_opts) else: # always come with annex when created from scratch lgr.info("Creating a new annex repo at %s", tbds.path) AnnexRepo(tbds.path, url=None, create=True, backend=annex_backend, version=annex_version, description=description, git_opts=git_opts, annex_opts=annex_opts, annex_init_opts=annex_init_opts) if native_metadata_type is not None: if not isinstance(native_metadata_type, list): native_metadata_type = [native_metadata_type] for nt in native_metadata_type: tbds.config.add('datalad.metadata.nativetype', nt) # record an ID for this repo for the afterlife # to be able to track siblings and children id_var = 'datalad.dataset.id' if id_var in tbds.config: # make sure we reset this variable completely, in case of a re-create tbds.config.unset(id_var, where='dataset') tbds.config.add(id_var, tbds.id if tbds.id is not None else uuid.uuid1().urn.split(':')[-1], where='dataset') # save everthing tbds.repo.add('.datalad', git=True) if save: Save.__call__(message='[DATALAD] new dataset', dataset=tbds, auto_add_changes=False, recursive=False) if dataset is not None and dataset.path != tbds.path: # we created a dataset in another dataset # -> make submodule from datalad.distribution.utils import _install_subds_inplace subdsrelpath = relpath(realpath(tbds.path), realpath(dataset.path)) # realpath OK _install_subds_inplace(ds=dataset, path=tbds.path, relativepath=subdsrelpath) # this will have staged the changes in the superdataset already if save: Save.__call__(message='[DATALAD] added subdataset', dataset=dataset, auto_add_changes=False, recursive=False) return tbds @staticmethod def result_renderer_cmdline(res, args): from datalad.ui import ui if res is None: ui.message("Nothing was created") elif isinstance(res, Dataset): ui.message("Created dataset at %s." % res.path)
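# Usage sketch (hypothetical paths): create a fresh dataset, then another
# dataset registered as a subdataset within it. Note that this variant of
# the command returns the created Dataset object directly.
import datalad.api as dl

superds = dl.create(path='/tmp/demo_ds',
                    description="demo data on this laptop")
subds = dl.create(path='/tmp/demo_ds/inputs', dataset=superds)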
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Common interface options """ __docformat__ = 'restructuredtext' from datalad.support.param import Parameter from datalad.support.constraints import EnsureInt, EnsureNone, EnsureStr dataset_description = Parameter( args=( "-D", "--description", ), constraints=EnsureStr() | EnsureNone(), doc="""short description of this dataset instance that humans can use to identify the repository/location, e.g. "Precious data on my laptop" """) recursion_flag = Parameter(args=( "-r", "--recursive", ), action="store_true", doc="""if set, recurse into potential subdatasets""") recursion_limit = Parameter( args=("--recursion-limit", ), metavar="LEVELS", constraints=EnsureInt() | EnsureNone(), doc="""limit recursion into subdatasets to the given number of levels""")
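# Sketch of how these shared Parameter instances are meant to be reused: a
# command module imports them and drops them into its own _params_ dict
# unchanged (the import path assumes the usual common_opts module layout).
from datalad.interface.common_opts import recursion_flag, recursion_limit
from datalad.support.param import Parameter
from datalad.support.constraints import EnsureNone, EnsureStr

_params_ = dict(
    path=Parameter(args=("path",), nargs="*",
                   doc="""paths to operate on""",
                   constraints=EnsureStr() | EnsureNone()),
    recursive=recursion_flag,
    recursion_limit=recursion_limit,
)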
class Publish(Interface): """Publish a dataset to a known :term:`sibling`. This makes the last saved state of a dataset available to a sibling or special remote data store of a dataset. Any target sibling must already exist and be known to the dataset. Optionally, it is possible to limit publication to change sets relative to a particular point in the version history of a dataset (e.g. a release tag). By default, the state of the local dataset is evaluated against the last known state of the target sibling. Actual publication is only attempted if there was a change compared to the reference state, in order to speed up processing of large collections of datasets. Evaluation with respect to a particular "historic" state is only supported in conjunction with a specified reference dataset. Change sets are also evaluated recursively, i.e. only those subdatasets are published where a change was recorded that is reflected in to current state of the top-level reference dataset. See "since" option for more information. Only publication of saved changes is supported. Any unsaved changes in a dataset (hierarchy) have to be saved before publication. .. note:: Power-user info: This command uses :command:`git push`, and :command:`git annex copy` to publish a dataset. Publication targets are either configured remote Git repositories, or git-annex special remotes (if they support data upload). .. note:: The `push` command (new in 0.13.0) provides an alternative interface. Critical differences are that `push` transfers annexed data by default and does not handle sibling creation (i.e. it does not have a `--missing` option). """ # XXX prevent common args from being added to the docstring _no_eval_results = True # TODO: Figure out, how to tell about tracking branch/upstream # (and the respective remote) # - it is used, when no destination is given # - it is configured to be the given destination, if there was no # upstream set up before, so you can use just "datalad publish" next # time. _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), metavar='DATASET', doc="""specify the (top-level) dataset to be published. If no dataset is given, the datasets are determined based on the input arguments""", constraints=EnsureDataset() | EnsureNone()), to=Parameter( args=("--to", ), metavar='LABEL', doc="""name of the target sibling. If no name is given an attempt is made to identify the target based on the dataset's configuration (i.e. a configured tracking branch, or a single sibling that is configured for publication)""", # TODO: See TODO at top of class! constraints=EnsureStr() | EnsureNone()), since=Parameter( args=("--since", ), constraints=EnsureStr() | EnsureNone(), doc= """specifies commit-ish (tag, shasum, etc.) from which to look for changes to decide whether pushing is necessary. If '^' is given, the last state of the current branch at the sibling is taken as a starting point. An empty string ('') for the same effect is still supported)."""), # since: commit => .gitmodules diff to head => submodules to publish missing=missing_sibling_opt, path=Parameter( args=("path", ), metavar='PATH', # TODO this description is no longer correct doc="path(s), that may point to file handle(s) to publish including " "their actual content or to subdataset(s) to be published. If a " "file handle is published with its data, this implicitly means " "to also publish the (sub)dataset it belongs to. '.' 
as a path " "is treated in a special way in the sense, that it is passed " "to subdatasets in case `recursive` is also given.", constraints=EnsureStr() | EnsureNone(), nargs='*'), force=Parameter( args=( "-f", "--force", ), doc="""enforce doing publish activities (git push etc) regardless of the analysis if they seemed needed""", action='store_true'), # TODO add option to decide what branch/repo to push transfer_data=Parameter(args=("--transfer-data", ), doc="""ADDME""", constraints=EnsureChoice( 'auto', 'none', 'all')), recursive=recursion_flag, recursion_limit=recursion_limit, git_opts=git_opts, annex_opts=annex_opts, annex_copy_opts=annex_copy_opts, jobs=jobs_opt, ) @staticmethod @datasetmethod(name='publish') @eval_results def __call__(path=None, dataset=None, to=None, since=None, missing='fail', force=False, transfer_data='auto', recursive=False, recursion_limit=None, git_opts=None, annex_opts=None, annex_copy_opts=None, jobs=None): # if ever we get a mode, for "with-data" we would need this #if dataset and not path: # # act on the whole dataset if nothing else was specified # path = dataset.path if isinstance(dataset, Dataset) else dataset if not (isinstance(dataset, Dataset) or (dataset is None and path)): # try to find a dataset in PWD dataset = require_dataset(dataset, check_installed=True, purpose='publishing') if (since and since != '^') and not dataset: raise InsufficientArgumentsError( 'Modification detection (--since) without a base dataset ' 'is not supported') if dataset and since in ('', '^'): # only update since last update so we figure out what was the last update active_branch = dataset.repo.get_active_branch() if to: # XXX here we assume one to one mapping of names from local branches # to the remote since = '%s/%s' % (to, active_branch) # test if such branch already exists, if since not in dataset.repo.get_remote_branches(): lgr.debug( "No remote branch %s yet, so since will not be used", since) since = None else: # take tracking remote for the active branch tracked_remote, tracked_refspec = dataset.repo.get_tracking_branch( ) if tracked_remote: if tracked_refspec.startswith('refs/heads/'): tracked_refspec = tracked_refspec[len('refs/heads/'):] #to = tracked_remote since = '%s/%s' % (tracked_remote, tracked_refspec) else: lgr.info( "No tracked remote for %s. since option is of no effect", active_branch) since = None # here is the plan # 1. figure out remote to publish to # 2. figure out which content needs to be published to this remote # 3. look for any pre-publication dependencies of that remote # (i.e. remotes that need to be published to before) # 4. 
publish the content needed to go to the primary remote to # the dependencies first, and to the primary afterwards ds_remote_info = {} refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(refds=refds_path, logger=lgr, action='publish') to_process = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='publish', unavailable_path_status='impossible', nondataset_path_status='error', modified="%s..HEAD" % since if since else since, return_type='generator', on_failure='ignore', force_no_revision_change_discovery= False, # we cannot publish what was not committed force_untracked_discovery=False # we cannot publish untracked ): if ap.get('status', None): # this is done yield ap continue remote_info_result = None if ap.get('type', ap.get('type_src', 'dataset')) != 'dataset': # for everything that is not a dataset get the remote info # for the parent parentds = ap.get('parentds', None) if parentds and parentds not in ds_remote_info: remote_info_result = _get_remote_info( parentds, ds_remote_info, to, missing) else: # this is a dataset if ap.get('state', None) == 'absent': continue # get the remote info for itself remote_info_result = _get_remote_info(ap['path'], ds_remote_info, to, missing) ap['process_content'] = True if remote_info_result is not None: ap['status'] = remote_info_result[0] ap['message'] = remote_info_result[1] yield ap continue to_process.append(ap) content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert (not completed) lgr.debug("Evaluating %i dataset publication candidate(s)", len(content_by_ds)) # TODO: fancier sorting, so we still follow somewhat the hierarchy # in sorted order, e.g. # d1/sub1/sub1 # d1/sub1 # d1 # d2/sub1 # d2 content_by_ds = OrderedDict( (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True)) lgr.debug("Attempt to publish %i datasets", len(content_by_ds)) for ds_path in content_by_ds: remote_info = ds_remote_info.get(ds_path, None) if remote_info is None: # maybe this dataset wasn't annotated above, try to get info # MIH: I think this entire if-branch is practically impossible # to reach. It is certainly untested, but I think this is due # to mutually exclusive conditions during remote_info detection remote_info_result = _get_remote_info(ds_path, ds_remote_info, to, missing) if remote_info_result is not None: yield get_status_dict(type='dataset', path=ds_path, status=remote_info_result[0], message=remote_info_result[1], **res_kwargs) continue # continue with freshly obtained info remote_info = ds_remote_info[ds_path] # condition above must catch all other cases assert remote_info # and publish ds = Dataset(ds_path) for r in _publish_dataset( ds, remote=remote_info['remote'], refspec=remote_info.get('refspec', None), # only send paths that were explicitly requested paths= [ p for p in content_by_ds[ds_path] # do not feed (sub)dataset paths into the beast # makes no sense to try to annex copy them # for the base dataset itself let `transfer_data` # decide if p.get('type', None) != 'dataset' ], annex_copy_options=annex_copy_opts, force=force, jobs=jobs, transfer_data=transfer_data, **res_kwargs): yield r
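# Usage sketch: publish the current dataset (and changed subdatasets) to a
# configured sibling 'origin', limiting work to changes since the last
# known state of that sibling ('^').
import datalad.api as dl

for res in dl.publish(dataset='.', to='origin', since='^',
                      recursive=True, return_type='generator'):
    print(res['status'], res['path'])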
class Uninstall(Interface): """Uninstall subdatasets This command can be used to uninstall any number of installed subdatasets. This command will error if individual files or non-dataset directories are given as input (use the drop or remove command depending on the desired goal), nor will it uninstall top-level datasets (i.e. datasets that are not a subdataset in another dataset; use the remove command for this purpose). By default, the availability of at least one remote copy for each currently available file in any dataset is verified. As these checks could lead to slow operation (network latencies, etc), they can be disabled. Any number of paths to process can be given as input. Recursion into subdatasets needs to be explicitly enabled, while recursion into subdirectories within a dataset is done automatically. An optional recursion limit is applied relative to each given input path. """ _action = 'uninstall' _params_ = dict( dataset=dataset_argument, path=Parameter(args=("path", ), metavar="PATH", doc="path/name of the component to be uninstalled", nargs="*", constraints=EnsureStr() | EnsureNone()), recursive=recursion_flag, check=check_argument, if_dirty=if_dirty_opt, ) _examples_ = [ dict(text="Uninstall a subdataset (undo installation)", code_py="uninstall(path='path/to/subds')", code_cmd="datalad uninstall <path/to/subds>"), dict(text="Uninstall a subdataset and all potential subdatasets", code_py="uninstall(path='path/to/subds', recursive=True)", code_cmd="datalad uninstall -r <path/to/subds>"), dict( text="Skip checks that ensure a minimal number of (remote) sources", code_py="uninstall(path='path/to/subds', check=False)", code_cmd="datalad uninstall <path/to/subds> --nocheck"), ] @staticmethod @datasetmethod(name=_action) @eval_results def __call__(path=None, dataset=None, recursive=False, check=True, if_dirty='save-before'): refds = require_dataset(dataset, check_installed=True, purpose='uninstalling') res_kwargs = dict(action='uninstall', logger=lgr, refds=refds.path) if not path: # if no path is given, ie. refds is supposed to be uninstalled # check if refds is a subdataset itself, if not die # we only need to test that for the refds, everything else # will be guaranteed to be a subdataset parentds = refds.get_superdataset( datalad_only=False, topmost=False, # unless it is properly registered we have no way of # reinstalling it registered_only=True) if parentds is None: yield dict( res_kwargs, path=refds.path, type='dataset', status='error', message="will not uninstall top-level dataset " "(consider `remove` command)", ) return saw_subds = False for ds in itertools.chain( Subdatasets.__call__( # it is critical to pass the dataset arg as-is # to not invalidate the path argument semantics # in subdatasets() dataset=dataset, path=path, fulfilled=True, # makes no sense to ignore subdatasets further down recursive=True, # important to start at the bottom for proper deinit bottomup=True, # doesn't make sense for uninstall #recursion_limit=recursion_limit, return_type='generator', result_renderer='disabled', result_xfm='datasets') if path or recursive else [], [refds] if not path else []): if ds != refds: saw_subds = True # TODO generator # this should yield what it did handle_dirty_dataset(ds, mode=if_dirty) # we confirmed the super dataset presence above for r in _uninstall_dataset(ds, check=check, has_super=True, **res_kwargs): yield r # there is nothing to save at the end if path and not saw_subds: lgr.warning( 'path constraints did not match an installed subdataset: %s', path)
from datalad.interface.results import annexjson2result from datalad.interface.results import success_status_map from datalad.interface.results import results_from_annex_noinfo from datalad.interface.utils import handle_dirty_dataset from datalad.interface.utils import eval_results from datalad.interface.base import build_doc lgr = logging.getLogger('datalad.distribution.drop') dataset_argument = Parameter( args=("-d", "--dataset"), metavar="DATASET", doc="""specify the dataset to perform the operation on. If no dataset is given, an attempt is made to identify a dataset based on the `path` given""", constraints=EnsureDataset() | EnsureNone()) check_argument = Parameter( args=("--nocheck", ), doc="""whether to perform checks to assure the configured minimum number of (remote) sources for data.[CMD: Give this option to skip checks CMD]""", action="store_false", dest='check') def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs): """Helper to drop content in datasets. Parameters ----------
class NoAnnex(Interface): """Configure a dataset to never put some content into the dataset's annex This can be useful in mixed datasets that also contain textual data, such as source code, which can be efficiently and more conveniently managed directly in Git. Patterns generally look like this:: code/* which would match all files in the code directory. In order to match all files under ``code/``, including all its subdirectories, use a pattern such as:: code/** Note that the plugin works incrementally, hence any existing configuration (e.g. from a previous plugin run) is amended, not replaced. Parameters ---------- ref_dir : str, optional makedirs : bool, optional """ from datalad.support.param import Parameter from datalad.distribution.dataset import datasetmethod from datalad.interface.utils import eval_results from datalad.distribution.dataset import EnsureDataset from datalad.support.constraints import EnsureNone _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to configure. If no dataset is given, an attempt is made to identify the dataset based on the current working directory.""", constraints=EnsureDataset() | EnsureNone()), pattern=Parameter( args=("--pattern", ), nargs='+', doc="""list of path patterns. Any content whose path is matching any pattern will not be annexed when added to a dataset, but instead will be tracked directly in Git. Path patterns have to be relative to the directory given by the `ref_dir` option. By default, patterns should be relative to the root of the dataset.""" ), ref_dir=Parameter( args=("--ref-dir", ), doc="""Relative path (within the dataset) to the directory that is to be configured. All patterns are interpreted relative to this path, and configuration is written to a ``.gitattributes`` file in this directory."""), makedirs=Parameter( args=("--makedirs", ), action='store_true', doc="""If set, any missing directories will be created in order to be able to place a file into ``--ref-dir``."""), ) @staticmethod @datasetmethod(name='no_annex') @eval_results def __call__(dataset, pattern, ref_dir='.', makedirs=False): # could be extended to accept actual largefile expressions from os.path import join as opj from os.path import isabs from os.path import exists from os import makedirs as makedirsfx from datalad.distribution.dataset import require_dataset from datalad.support.annexrepo import AnnexRepo from datalad.utils import assure_list pattern = assure_list(pattern) ds = require_dataset(dataset, check_installed=True, purpose='no_annex configuration') res_kwargs = dict( path=ds.path, type='dataset', action='no_annex', ) # all the ways we refused to cooperate if not isinstance(ds.repo, AnnexRepo): yield dict(res_kwargs, status='notneeded', message='dataset has no annex') return if any(isabs(p) for p in pattern): yield dict( res_kwargs, status='error', message= ('path patterns for `no_annex` configuration must be relative paths: %s', pattern)) return if isabs(ref_dir): yield dict( res_kwargs, status='error', message= ('`ref_dir` for `no_annex` configuration must be a relative path: %s', ref_dir)) return gitattr_dir = opj(ds.path, ref_dir) if not exists(gitattr_dir): if makedirs: makedirsfx(gitattr_dir) else: yield dict( res_kwargs, status='error', message= 'target directory for `no_annex` does not exist (consider makedirs=True)' ) return gitattr_file = opj(gitattr_dir, '.gitattributes') ds.repo.set_gitattributes([(p, { 'annex.largefiles': 'nothing' }) for p in pattern], attrfile=gitattr_file) yield dict(res_kwargs,
status='ok') for r in ds.rev_save( gitattr_file, to_git=True, message="[DATALAD] exclude paths from annex'ing", result_filter=None, result_xfm=None): yield r
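# Usage sketch plus its effect: the call below would append one line per
# pattern to the dataset's .gitattributes, keeping matching files in Git.
import datalad.api as dl

dl.no_annex(dataset='.', pattern=['code/**', '*.rst'])

# expected .gitattributes content afterwards:
#   code/** annex.largefiles=nothing
#   *.rst annex.largefiles=nothing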
class Drop(Interface): """Drop file content from datasets This command takes any number of paths of files and/or directories. If a common (super)dataset is given explicitly, the given paths are interpreted relative to this dataset. Recursion into subdatasets needs to be explicitly enabled, while recursion into subdirectories within a dataset is done automatically. An optional recursion limit is applied relative to each given input path. By default, the availability of at least one remote copy is verified before file content is dropped. As these checks could lead to slow operation (network latencies, etc), they can be disabled. Examples: Drop all file content in a dataset:: ~/some/dataset$ datalad drop Drop all file content in a dataset and all its subdatasets:: ~/some/dataset$ datalad drop --recursive """ _action = 'drop' _params_ = dict( dataset=dataset_argument, path=Parameter(args=("path", ), metavar="PATH", doc="path/name of the component to be dropped", nargs="*", constraints=EnsureStr() | EnsureNone()), recursive=recursion_flag, recursion_limit=recursion_limit, check=check_argument, if_dirty=if_dirty_opt, ) @staticmethod @datasetmethod(name=_action) @eval_results def __call__(path=None, dataset=None, recursive=False, recursion_limit=None, check=True, if_dirty='save-before'): if not dataset and not path: raise InsufficientArgumentsError( "insufficient information for `drop`: requires at least a path or dataset" ) refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(action='drop', logger=lgr, refds=refds_path) if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path to_drop = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='drop', # justification for status: # content need not be dropped where there is none unavailable_path_status='notneeded', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) == 'dataset' and \ GitRepo.is_valid_repo(ap['path']) and \ not ap['path'] == refds_path: ap['process_content'] = True if ap.get('registered_subds', False) and ap.get('state', None) == 'absent': # nothing to drop in an absent subdataset, don't be annoying # and skip silently continue to_drop.append(ap) content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_drop, refds_path=refds_path) assert (not completed) # iterate over all datasets, order doesn't matter for ds_path in content_by_ds: ds = Dataset(ds_path) # TODO generator # this should yield what it did handle_dirty_dataset(ds, mode=if_dirty) # ignore submodule entries content = [ ap['path'] for ap in content_by_ds[ds_path] if ap.get('type', None) != 'dataset' or ap['path'] == ds.path ] if not content: continue for r in _drop_files(ds, content, check=check, **res_kwargs): yield r
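# Usage sketch: drop the content of one annexed file without verifying
# remote availability first (check=False corresponds to --nocheck).
import datalad.api as dl

for res in dl.drop(path='big/file.dat', dataset='.', check=False,
                   return_type='generator'):
    print(res['status'], res['path'])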
class Spec2Bids(Interface): """Convert to BIDS based on study specification """ _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""bids dataset""", constraints=EnsureDataset() | EnsureNone()), specfile=Parameter( args=("specfile",), metavar="SPEC_FILE", doc="""path(s) to the specification file(s) to use for conversion. If a directory at the first level beneath the dataset's root is given instead of a file, it's assumed to be an acqusition directory that contains a specification file. By default this is a file named 'studyspec.json' in the acquisition directory. This default name can be configured via the 'datalad.hirni.studyspec.filename' config variable. """, nargs="*", constraints=EnsureStr()), anonymize=Parameter( args=("--anonymize",), action="store_true", doc="""whether or not to anonymize for conversion. By now this means to use 'anon_subject' instead of 'subject' from spec and to use datalad-run with a sidecar file, to not leak potentially identifying information into its record.""",), only_type=Parameter( args=("--only-type",), metavar="TYPE", doc="specify snippet type to convert. If given only this type of " "specification snippets is considered for conversion", constraints=EnsureStr() | EnsureNone(),) ) @staticmethod @datasetmethod(name='hirni_spec2bids') @eval_results def __call__(specfile, dataset=None, anonymize=False, only_type=None): dataset = require_dataset(dataset, check_installed=True, purpose="spec2bids") specfile = assure_list(specfile) specfile = [resolve_path(p, dataset) for p in specfile] for spec_path in specfile: # Note/TODO: ran_procedure per spec file still isn't ideal. Could # be different spec files for same acquisition. It's actually about # the exact same call. How to best get around substitutions? # Also: per snippet isn't correct either. # substitutions is real issue. Example "copy {location} ." # # => datalad.interface.run.format_command / normalize_command ? # TODO: Also can we skip prepare_inputs within run? At least specify # more specifically. Note: Can be globbed! ran_procedure = dict() if not lexists(spec_path): yield get_status_dict( action='spec2bids', path=spec_path, status='impossible', message="{} not found".format(spec_path) ) if op.isdir(spec_path): if op.realpath(op.join(spec_path, op.pardir)) == \ op.realpath(dataset.path): spec_path = op.join( spec_path, dataset.config.get( "datalad.hirni.studyspec.filename", "studyspec.json") ) # TODO: check existence of that file! else: yield get_status_dict( action='spec2bids', path=spec_path, status='impossible', message="{} is neither a specification file nor an " "acquisition directory".format(spec_path) ) # relative path to spec to be recorded: rel_spec_path = relpath(spec_path, dataset.path) \ if isabs(spec_path) else spec_path # check each dict (snippet) in the specification for what to do # wrt conversion: for spec_snippet in load_stream(spec_path): if only_type and not spec_snippet['type'].startswith(only_type): # ignore snippets not matching `only_type` # Note/TODO: the .startswith part is meant for # matching "dicomseries:all" to given "dicomseries" but not # vice versa. This prob. 
needs refinement (and doc) continue if 'procedures' not in spec_snippet: # no conversion procedures defined at all: yield get_status_dict( action='spec2bids', path=spec_path, snippet=spec_snippet, status='notneeded', ) continue procedure_list = spec_snippet['procedures'] if not procedure_list: # no conversion procedures defined at all: yield get_status_dict( action='spec2bids', path=spec_path, snippet=spec_snippet, status='notneeded', ) continue # accept a single dict as a one item list: if isinstance(procedure_list, dict): procedure_list = [procedure_list] # build a dict available for placeholders in format strings: # Note: This is flattening the structure since we don't need # value/approved for the substitutions. In addition 'subject' # and 'anon_subject' are not passed on, but a new key # 'bids_subject' instead the value of which depends on the # --anonymize switch. # Additionally 'location' is recomputed to be relative to # dataset.path, since this is where the procedures are running # from within. replacements = dict() for k, v in spec_snippet.items(): if k == 'subject': if not anonymize: replacements['bids-subject'] = v['value'] elif k == 'anon-subject': if anonymize: replacements['bids-subject'] = v['value'] elif k == 'location': replacements[k] = op.join(op.dirname(rel_spec_path), v) elif k == 'procedures': # 'procedures' is a list of dicts (not suitable for # substitutions) and it makes little sense to be # referenced by converter format strings anyway: continue else: replacements[k] = v['value'] if isinstance(v, dict) else v # build dict to patch os.environ with for passing # replacements on to procedures: env_subs = dict() for k, v in replacements.items(): env_subs['DATALAD_RUN_SUBSTITUTIONS_{}' ''.format(k.upper().replace('-', '__'))] = str(v) env_subs['DATALAD_RUN_SUBSTITUTIONS_SPECPATH'] = rel_spec_path env_subs['DATALAD_RUN_SUBSTITUTIONS_ANONYMIZE'] = str(anonymize) # TODO: The above two blocks to build replacements dict and # env_subs should be joined eventually. for proc in procedure_list: if has_specval(proc, 'procedure-name'): proc_name = get_specval(proc, 'procedure-name') else: # invalid procedure spec lgr.warning("conversion procedure missing key " "'procedure-name' in %s: %s", spec_path, proc) # TODO: continue or yield impossible/error so it can be # dealt with via on_failure? continue if has_specval(proc, 'on-anonymize') \ and anything2bool( get_specval(proc, 'on-anonymize') ) and not anonymize: # don't run that procedure, if we weren't called with # --anonymize while procedure is specified to be run on # that switch only continue proc_call = get_specval(proc, 'procedure-call') \ if has_specval(proc, 'procedure-call') \ else None if ran_procedure.get(hash((proc_name, proc_call)), None): # if we ran the exact same call already, # don't call it again # TODO: notneeded? continue # if spec comes with call format string, it takes precedence # over what is generally configured for the procedure # TODO: Not sure yet whether this is how we should deal with it if proc_call: env_subs['DATALAD_PROCEDURES_{}_CALL__FORMAT' ''.format(proc_name.upper().replace('-', '__')) ] = proc_call run_results = list() # Note, that we can't use dataset.config.overrides to # pass run-substitution config to procedures, since we # leave python context and thereby loose the dataset # instance. Use patched os.environ instead. Note also, # that this requires names of substitutions to not # contain underscores, since they would be translated to # '.' 
by ConfigManager when reading them from within the # procedure's datalad-run calls. from mock import patch # TODO: Reconsider that patching. Shouldn't it be an update? with patch.dict('os.environ', env_subs): # apparently reload is necessary to consider config # overrides via env: dataset.config.reload() for r in dataset.run_procedure( spec=proc_name, return_type='generator' ): # # if there was an issue yield original result, # # otherwise swallow: # if r['status'] not in ['ok', 'notneeded']: yield r run_results.append(r) if not all(r['status'] in ['ok', 'notneeded'] for r in run_results): yield {'action': proc_name, 'path': spec_path, 'snippet': spec_snippet, 'status': 'error', 'message': "acquisition conversion failed. " "See previous message(s)."} else: yield {'action': proc_name, 'path': spec_path, 'snippet': spec_snippet, 'status': 'ok', 'message': "acquisition converted."} # mark as a procedure we ran on this acquisition: # TODO: rethink. Doesn't work that way. Disabled for now # ran_procedure[hash((proc_name, proc_call))] = True # elif proc_name != 'hirni-dicom-converter': # # specific converter procedure call # # from mock import patch # with patch.dict('os.environ', env_subs): # # apparently reload is necessary to consider config # # overrides via env: # dataset.config.reload() # # for r in dataset.run_procedure( # spec=[proc_name, rel_spec_path, anonymize], # return_type='generator' # ): # # # if there was an issue with containers-run, # # yield original result, otherwise swallow: # if r['status'] not in ['ok', 'notneeded']: # yield r # # run_results.append(r) # # if not all(r['status'] in ['ok', 'notneeded'] # for r in run_results): # yield {'action': proc_name, # 'path': spec_path, # 'snippet': spec_snippet, # 'status': 'error', # 'message': "Conversion failed. " # "See previous message(s)."} # # else: # yield {'action': proc_name, # 'path': spec_path, # 'snippet': spec_snippet, # 'status': 'ok', # 'message': "specification converted."} # elif ran_heudiconv and proc_name == 'hirni-dicom-converter': # # in this case we acted upon this snippet already and # # do not have to produce a result # pass # # else: # # this shouldn't happen! # raise RuntimeError yield {'action': 'spec2bids', 'path': spec_path, 'status': 'ok'}
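# Self-contained sketch of the substitution-to-environment mapping used
# above: keys are upper-cased, dashes become double underscores, and values
# travel to procedures via DATALAD_RUN_SUBSTITUTIONS_* variables.
def substitutions_to_env(replacements):
    return {
        'DATALAD_RUN_SUBSTITUTIONS_{}'.format(k.upper().replace('-', '__')): str(v)
        for k, v in replacements.items()
    }

print(substitutions_to_env({'bids-subject': '01', 'location': 'acq1/dicoms'}))
# {'DATALAD_RUN_SUBSTITUTIONS_BIDS__SUBJECT': '01',
#  'DATALAD_RUN_SUBSTITUTIONS_LOCATION': 'acq1/dicoms'}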
class ContainersRemove(Interface): # first docstring line is used as a short description in the cmdline help # the rest is put in the verbose help and manpage """Remove a known container from a dataset """ # parameters of the command, must be exhaustive _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to query. If no dataset is given, an attempt is made to identify the dataset based on the current working directory""", constraints=EnsureDataset() | EnsureNone()), name=Parameter( args=("name",), doc="""name of the container to remove""", metavar="NAME", constraints=EnsureStr(), ), remove_image=Parameter( args=("-i", "--remove-image",), doc="""if set, remove container image as well""", action="store_true", ), ) @staticmethod @datasetmethod(name='containers_remove') @eval_results def __call__(name, dataset=None, remove_image=False): ds = require_dataset(dataset, check_installed=True, purpose='remove a container') res = get_status_dict( ds=ds, action='containers_remove', logger=lgr) section = 'datalad.containers.{}'.format(name) imagecfg = '{}.image'.format(section) to_save = [] if remove_image and imagecfg in ds.config: imagepath = ds.config.get(imagecfg) if op.lexists(op.join(ds.path, imagepath)): for r in ds.remove( path=imagepath, # XXX shortcoming: this is the only way to say: # don't drop check=False, # config setting might be outdated and image no longer # there -> no reason to fail, just report on_failure='ignore', save=False): yield r to_save.append(imagepath) if section in ds.config.sections(): ds.config.remove_section( section, where='dataset', reload=True) res['status'] = 'ok' to_save.append(op.join('.datalad', 'config')) else: res['status'] = 'notneeded' if to_save: for r in ds.save( path=to_save, message='[DATALAD] Remove container {}'.format(name)): yield r yield res
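# Usage sketch (assumes the datalad-container extension is installed, which
# provides this command): unregister a container and drop its image.
import datalad.api as dl

for res in dl.containers_remove(name='ubuntu', dataset='.',
                                remove_image=True,
                                return_type='generator'):
    print(res['status'], res.get('message', ''))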
class Push(Interface): """Push a dataset to a known :term:`sibling`. This makes the last saved state of a dataset available to a sibling or special remote data store of a dataset. Any target sibling must already exist and be known to the dataset. Optionally, it is possible to limit a push to change sets relative to a particular point in the version history of a dataset (e.g. a release tag). By default, the state of the local dataset is evaluated against the last known state of the target sibling. An actual push is only attempted if there was a change compared to the reference state, in order to speed up processing of large collections of datasets. Evaluation with respect to a particular "historic" state is only supported in conjunction with a specified reference dataset. Change sets are also evaluated recursively, i.e. only those subdatasets are pushed where a change was recorded that is reflected in the current state of the top-level reference dataset. See "since" option for more information. Only a push of saved changes is supported. .. note:: Power-user info: This command uses :command:`git push`, and :command:`git annex copy` to push a dataset. Publication targets are either configured remote Git repositories, or git-annex special remotes (if they support data upload). """ # TODO add examples _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to push""", constraints=EnsureDataset() | EnsureNone()), to=Parameter( args=("--to", ), metavar='SIBLING', doc="""name of the target sibling. If no name is given an attempt is made to identify the target based on the dataset's configuration (i.e. a configured tracking branch, or a single sibling that is configured for push)""", constraints=EnsureStr() | EnsureNone()), since=Parameter( args=("--since", ), constraints=EnsureStr() | EnsureNone(), doc= """specifies commit-ish (tag, shasum, etc.) from which to look for changes to decide whether pushing is necessary. If an empty string is given, the last state of the current branch at the sibling is taken as a starting point."""), path=Parameter(args=("path", ), metavar='PATH', doc="""path to constrain a push to. If given, only data or changes for those paths are considered for a push.""", nargs='*', constraints=EnsureStr() | EnsureNone()), force=Parameter( # multi-mode option https://github.com/datalad/datalad/issues/3414 args=( "-f", "--force", ), doc="""force particular operations, overruling automatic decision making: use --force with git-push ('gitpush'); do not use --fast with git-annex copy ('datatransfer'); do not attempt to copy annex'ed file content ('no-datatransfer'); combine force modes 'gitpush' and 'datatransfer' ('all').""", constraints=EnsureChoice('all', 'gitpush', 'no-datatransfer', 'datatransfer', None)), recursive=recursion_flag, recursion_limit=recursion_limit, jobs=jobs_opt, ) # Desired features: # - let Git do its thing (push multiple configured refs without the need # to specify anything on the command line) # - complication: we need publication dependencies (i.e. publish what # would be published by Git to a different remote first, hence we # cannot simply watch Git do it, and later act on it.)
# - https://github.com/datalad/datalad/issues/1284 # - https://github.com/datalad/datalad/issues/4006 # - make differences between remotes and various types of special remotes # opaque # - https://github.com/datalad/datalad/issues/3127 # - informative and comprehensive (error) reporting # - https://github.com/datalad/datalad/issues/2000 # - https://github.com/datalad/datalad/issues/1682 # - https://github.com/datalad/datalad/issues/2029 # - https://github.com/datalad/datalad/issues/2855 # - https://github.com/datalad/datalad/issues/3412 # - https://github.com/datalad/datalad/issues/3424 # - ensure robust behavior in multi-lateral push scenarios (updating # a dataset that was updated by a 3rd-party after the last known # fetched change) # - https://github.com/datalad/datalad/issues/2636 # - should NOT mimic `publish` in that it mixes `create-sibling` and # `push` into a single operation. This would fold the complexity # of all possible ways a local dataset hierarchy could possibly be # connected to remote ends into this command. It would be a lost battle # from the start. # - not tackle: https://github.com/datalad/datalad/issues/2186 # - maintain standard setup, and not reflect procedural aspects # onto the resulting outcomes # - https://github.com/datalad/datalad/issues/2001 # - do a straight push, nothing like 'sync'. If a remote has something that # needs merging first, fail and let users update. Any diff we are missing # locally can impact decision making via --since and friends. @staticmethod @datasetmethod(name='push') @eval_results def __call__(path=None, dataset=None, to=None, since=None, force=None, recursive=False, recursion_limit=None, jobs=None): # we resolve here, because we need to perform inspection on what was given # as an input argument further down paths = [resolve_path(p, dataset) for p in assure_list(path)] ds = require_dataset(dataset, check_installed=True, purpose='pushing') ds_repo = ds.repo res_kwargs = dict( action='publish', refds=ds.path, logger=lgr, ) get_remote_kwargs = {'exclude_special_remotes': False} \ if isinstance(ds_repo, AnnexRepo) else {} if to and to not in ds_repo.get_remotes(**get_remote_kwargs): # get again for proper error: sr = ds_repo.get_remotes(**get_remote_kwargs) # yield an error result instead of raising a ValueError, # to enable the use case of pushing to a target that # a superdataset doesn't know, but some subdatasets do # (in combination with '--on-failure ignore') yield dict(res_kwargs, status='error', message="Unknown push target '{}'.
{}".format( to, 'Known targets: {}.'.format(', '.join( repr(s) for s in sr)) if sr else 'No targets configured in dataset.')) return if since: # will blow with ValueError if unusable ds_repo.get_hexsha(since) if not since and since is not None: # special case: --since='' # figure out state of remote branch and set `since` since = _get_corresponding_remote_state(ds_repo, to) if not since: lgr.info("No tracked remote for active branch, " "detection of last pushed state not in effect.") # obtain a generator for information on the datasets to process # idea is to turn the `paths` argument into per-dataset # content listings that can be acted upon ds_spec = _datasets_since_( # important to pass unchanged dataset arg dataset, since, paths, recursive, recursion_limit) # instead of a loop, this could all be done in parallel matched_anything = False for dspath, dsrecords in ds_spec: matched_anything = True lgr.debug('Attempt push of Dataset at %s', dspath) pbars = {} yield from _push(dspath, dsrecords, to, force, jobs, res_kwargs.copy(), pbars, got_path_arg=True if path else False) # take down progress bars for this dataset for i, ds in pbars.items(): log_progress(lgr.info, i, 'Finished push of %s', ds) if not matched_anything: yield dict( res_kwargs, status='notneeded', message= 'Given constraints did not match any changes to publish', type='dataset', path=ds.path, ) @staticmethod def custom_result_summary_renderer(results): # pragma: more cover # report on any hints at the end # get all unique hints hints = set([r.get('hints', None) for r in results]) hints = [hint for hint in hints if hint is not None] if hints: from datalad.ui import ui from datalad.support import ansi_colors intro = ansi_colors.color_word( "Potential hints to solve encountered errors: ", ansi_colors.YELLOW) ui.message(intro) [ ui.message("{}: {}".format( ansi_colors.color_word(id + 1, ansi_colors.YELLOW), hint)) for id, hint in enumerate(hints) ]
class ExportArchive(Interface): """Export the content of a dataset as a TAR/ZIP archive. """ from datalad.support.param import Parameter from datalad.distribution.dataset import datasetmethod from datalad.interface.utils import eval_results from datalad.distribution.dataset import EnsureDataset from datalad.support.constraints import ( EnsureChoice, EnsureNone, EnsureStr, ) _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to export. If no dataset is given, an attempt is made to identify the dataset based on the current working directory.""", constraints=EnsureDataset() | EnsureNone()), filename=Parameter( args=("filename", ), metavar="PATH", nargs='?', doc="""File name of the generated TAR archive. If no file name is given the archive will be generated in the current directory and will be named: datalad_<dataset_uuid>.(tar.*|zip). To generate that file in a different directory, provide an existing directory as the file name.""", constraints=EnsureStr() | EnsureNone()), archivetype=Parameter(args=("-t", "--archivetype"), doc="""Type of archive to generate.""", constraints=EnsureChoice("tar", "zip")), compression=Parameter( args=("-c", "--compression"), doc="""Compression method to use. 'bz2' is not supported for ZIP archives. No compression is used when an empty string is given.""", constraints=EnsureChoice("gz", "bz2", "")), missing_content=Parameter( args=("--missing-content", ), doc="""By default, any discovered file with missing content will result in an error and the export is aborted. Setting this to 'continue' will issue warnings instead of failing on error. The value 'ignore' will only inform about problems at the 'debug' log level. The latter two can be helpful when generating a TAR archive from a dataset where some file content is not available locally.""", constraints=EnsureChoice("error", "continue", "ignore")), ) @staticmethod @datasetmethod(name='export_archive') @eval_results def __call__(dataset, filename=None, archivetype='tar', compression='gz', missing_content='error'): import os import tarfile import zipfile from unittest.mock import patch from os.path import join as opj, dirname, normpath, isabs import os.path as op from datalad.distribution.dataset import require_dataset from datalad.utils import file_basename from datalad.support.annexrepo import AnnexRepo from datalad.dochelpers import exc_str import logging lgr = logging.getLogger('datalad.local.export_archive') dataset = require_dataset(dataset, check_installed=True, purpose='export archive') repo = dataset.repo committed_date = repo.get_commit_date() # could be used later on to filter files by some criterion def _filter_tarinfo(ti): # Reset the date to match the one of the last commit, not from the # filesystem since git doesn't track those at all # TODO: use the date of the last commit when any particular # file was changed -- would be the most kosher yoh thinks to the # degree of our abilities ti.mtime = committed_date return ti tar_args = dict(recursive=False, filter=_filter_tarinfo) file_extension = '.{}{}'.format( archivetype, '{}{}'.format('.' 
if compression else '', compression) if archivetype == 'tar' else '') default_filename = "datalad_{.id}".format(dataset) if filename is None: filename = default_filename # in current directory elif op.exists(filename) and op.isdir(filename): filename = op.join(filename, default_filename) # under given directory if not filename.endswith(file_extension): filename += file_extension root = dataset.path # use dir inside matching the output filename # TODO: could be an option to the export plugin allowing empty value # for no leading dir leading_dir = file_basename(filename) # workaround for inability to pass down the time stamp with patch('time.time', return_value=committed_date), \ tarfile.open(filename, "w:{}".format(compression)) \ if archivetype == 'tar' \ else zipfile.ZipFile( filename, 'w', zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \ as archive: add_method = archive.add if archivetype == 'tar' else archive.write repo_files = sorted(repo.get_indexed_files()) if isinstance(repo, AnnexRepo): annexed = repo.is_under_annex(repo_files, allow_quick=True, batch=True) # remember: returns False for files in Git! has_content = repo.file_has_content(repo_files, allow_quick=True, batch=True) else: annexed = [False] * len(repo_files) has_content = [True] * len(repo_files) for i, rpath in enumerate(repo_files): fpath = opj(root, rpath) if annexed[i]: if not has_content[i]: if missing_content in ('ignore', 'continue'): (lgr.warning if missing_content == 'continue' else lgr.debug)( 'File %s has no content available, skipped', fpath) continue else: raise IOError('File %s has no content available' % fpath) # resolve to possible link target if op.islink(fpath): link_target = os.readlink(fpath) if not isabs(link_target): link_target = normpath( opj(dirname(fpath), link_target)) fpath = link_target # name in the archive aname = normpath(opj(leading_dir, rpath)) add_method(fpath, arcname=aname, **(tar_args if archivetype == 'tar' else {})) if not isabs(filename): filename = opj(os.getcwd(), filename) yield dict(status='ok', path=filename, type='file', action='export_archive', logger=lgr)
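# Illustrative usage sketch (editorial addition): export the dataset in the
# current directory as an uncompressed ZIP archive under a hypothetical
# target path, tolerating files without locally available content.
def _example_export_archive():
    from datalad.api import Dataset
    ds = Dataset('.')
    # compression='' maps to ZIP_STORED in the implementation above
    ds.export_archive('/tmp/my_export', archivetype='zip', compression='',
                      missing_content='continue')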
class Install(Interface): """Install a dataset from a (remote) source. This command creates a local :term:`sibling` of an existing dataset from a (remote) location identified via a URL or path. Optional recursion into potential subdatasets, and download of all referenced data is supported. The new dataset can be optionally registered in an existing :term:`superdataset` by identifying it via the `dataset` argument (the new dataset's path needs to be located within the superdataset for that). It is recommended to provide a brief description to label the dataset's nature *and* location, e.g. "Michael's music on black laptop". This helps humans to identify data locations in distributed scenarios. By default an identifier comprised of user and machine name, plus path will be generated. When only partial dataset content shall be obtained, it is recommended to use this command without the `get-data` flag, followed by a :func:`~datalad.api.get` operation to obtain the desired data. .. note:: Power-user info: This command uses :command:`git clone`, and :command:`git annex init` to prepare the dataset. Registering to a superdataset is performed via a :command:`git submodule add` operation in the discovered superdataset. """ # very frequently this command will yield exactly one installed dataset # spare people the pain of going through a list by default return_type = 'item-or-list' # as discussed in #1409 and #1470, we want to return dataset instances # matching what is actually available after command completion (and # None for any failed dataset installation) # TODO actually need success(containing)dataset-or-none result_xfm = 'successdatasets-or-none' # we also want to limit the returned result to explicit input arguments # (paths/source) and not report any implicit action, like intermediate # datasets result_filter = is_result_matching_pathsource_argument _examples_ = [ dict(text="Install a dataset from Github into the current directory", code_py="install(" "source='https://github.com/datalad-datasets/longnow" "-podcasts.git')", code_cmd="datalad install " "https://github.com/datalad-datasets/longnow-podcasts.git"), dict(text="Install a dataset as a subdataset into the current dataset", code_py="""\ install(dataset='.', source='https://github.com/datalad-datasets/longnow-podcasts.git')""", code_cmd="""\ datalad install -d . \\ --source='https://github.com/datalad-datasets/longnow-podcasts.git'""" ), dict(text="Install a dataset, and get all content right away", code_py="""\ install(source='https://github.com/datalad-datasets/longnow-podcasts.git', get_data=True)""", code_cmd="""\ datalad install --get-data \\ -s https://github.com/datalad-datasets/longnow-podcasts.git"""), dict(text="Install a dataset with all its subdatasets", code_py="""\ install(source='https://github.com/datalad-datasets/longnow-podcasts.git', recursive=True)""", code_cmd="""\ datalad install -r \\ https://github.com/datalad-datasets/longnow-podcasts.git"""), ] _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), # TODO: this probably changes to install into the dataset (add_to_super) # and to install the thing 'just there' without operating 'on' a dataset. # Adapt doc. # MIH: `shouldn't this be the job of `add`? doc="""specify the dataset to perform the install operation on. 
If no dataset is given, an attempt is made to identify the dataset in a parent directory of the current working directory and/or the `path` given""", constraints=EnsureDataset() | EnsureNone()), path=Parameter( args=("path", ), metavar='PATH', nargs="*", # doc: TODO doc="""path/name of the installation target. If no `path` is provided a destination path will be derived from a source URL similar to :command:`git clone`"""), source=Parameter(args=("-s", "--source"), metavar='SOURCE', doc="URL or local path of the installation source", constraints=EnsureStr() | EnsureNone()), get_data=Parameter(args=( "-g", "--get-data", ), doc="""if given, obtain all data content too""", action="store_true"), description=location_description, recursive=recursion_flag, recursion_limit=recursion_limit, save=nosave_opt, reckless=reckless_opt, jobs=jobs_opt, ) @staticmethod @datasetmethod(name='install') @eval_results def __call__(path=None, source=None, dataset=None, get_data=False, description=None, recursive=False, recursion_limit=None, save=True, reckless=None, jobs="auto"): # normalize path argument to be equal when called from cmdline and # python and nothing was passed into `path` path = ensure_list(path) if not source and not path: raise InsufficientArgumentsError( "Please provide at least a source or a path") # Common kwargs to pass to underlying git/install calls. # They might need adjustments (e.g. for recursion_limit), but # otherwise would be applicable throughout # # There should have been more common options! # since underneath get could do similar installs common_kwargs = dict( get_data=get_data, recursive=recursive, recursion_limit=recursion_limit, # git_opts=git_opts, # annex_opts=annex_opts, reckless=reckless, jobs=jobs, ) # did we explicitly get a dataset to install into? # if we got a dataset, path will be resolved against it. # Otherwise path will be resolved first. ds = None if dataset is not None: ds = require_dataset(dataset, check_installed=True, purpose='installation') common_kwargs['dataset'] = dataset # pre-compute for results below refds_path = Interface.get_refds_path(ds) # switch into the two scenarios without --source: # 1. list of URLs # 2. list of (sub)dataset content if source is None: # we need to collect URLs and paths to_install = [] to_get = [] # TODO: this approach is problematic, it disrupts the order of input args. # consequently results will be returned in an unexpected order when a # mixture of source URL and paths is given. Reordering is only possible when # everything in here is fully processed before any results can be yielded. # moreover, I think the semantics of the status quo implementation are a # bit complicated: in a mixture list a source URL will lead to a new dataset # at a generated default location, but a path will lead to a subdataset # at that exact location for urlpath in path: ri = RI(urlpath) (to_get if isinstance(ri, PathRI) else to_install).append(urlpath) # 1. 
multiple source URLs for s in to_install: lgr.debug("Install passes into install source=%s", s) for r in Install.__call__( source=s, description=description, save=save, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, result_filter=None, **common_kwargs): # no post-processing of the installed content on disk # should be necessary here, all done by code further # down that deals with an install from an actual `source` # any necessary fixes should go there too! r['refds'] = refds_path yield r # 2. one or more dataset content paths if to_get: lgr.debug("Install passes into get %d items", len(to_get)) # the commented-out options hint at the inability to pass them # into underlying install-related calls. # Also need to pass from get: # annex_get_opts for r in Get.__call__( to_get, # TODO should pass-through description, not sure why disabled # description=description, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, result_filter=None, **common_kwargs): # no post-processing of get'ed content on disk should be # necessary here, this is the responsibility of `get` # (incl. adjusting parent's gitmodules when submodules end # up in an "updated" state (done in get helpers) # any required fixes should go there! r['refds'] = refds_path yield r # we are done here # the rest is about install from a `source` return # an actual `source` was given if source and path and len(path) > 1: # exception is ok here, if this fails it is either direct user error # or we f****d up one of our internal calls raise ValueError( "install needs a single PATH when source is provided. " "Was given multiple PATHs: %s" % str(path)) # parameter constraints: if not source: # exception is ok here, if this fails it is either direct user error # or we f****d up one of our internal calls raise InsufficientArgumentsError( "a `source` is required for installation") # code below deals with a single path only path = path[0] if path else None if source == path: # even if they turn out to be identical after resolving symlinks # and more sophisticated witchcraft, it would still happily say # "it appears to be already installed", so we just catch an # obviously pointless input combination yield get_status_dict( 'install', path=path, status='impossible', logger=lgr, source_url=source, refds=refds_path, message= "installation `source` and destination `path` are identical. 
" "If you are trying to add a subdataset simply use the `save` command" ) return # resolve the target location (if local) against the provided dataset # or CWD: if path is not None: # MIH everything in here is highly similar to what common # interface helpers do (or should/could do), but at the same # is very much tailored to just apply to `install` -- I guess # it has to stay special # Should work out just fine for regular paths, so no additional # conditioning is necessary try: path_ri = RI(path) except Exception as e: raise ValueError("invalid path argument {}: ({})".format( path, exc_str(e))) try: # Wouldn't work for SSHRI ATM, see TODO within SSHRI # yoh: path should be a local path, and mapping note within # SSHRI about mapping localhost:path to path is kinda # a peculiar use-case IMHO # TODO Stringification can be removed once PY35 is no longer # supported path = str(resolve_path(path_ri.localpath, dataset)) # any `path` argument that point to something local now # resolved and is no longer a URL except ValueError: # `path` is neither a valid source nor a local path. # TODO: The only thing left is a known subdataset with a # name, that is not a path; Once we correctly distinguish # between path and name of a submodule, we need to consider # this. # For now: Just raise raise ValueError("Invalid path argument {0}".format(path)) # `path` resolved, if there was any. # clone dataset, will also take care of adding to superdataset, if one # is given res = Clone.__call__( source, path, dataset=ds, description=description, reckless=reckless, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" result_xfm=None, return_type='generator', result_filter=None, on_failure='ignore') # helper as_ds = YieldDatasets() destination_dataset = None for r in res: if r['action'] == 'install' and r['type'] == 'dataset': # make sure logic below is valid, only one dataset result is # coming back assert (destination_dataset is None) destination_dataset = as_ds(r) r['refds'] = refds_path yield r assert (destination_dataset) # Now, recursive calls: if recursive or get_data: # dataset argument must not be passed inside since we use bound .get # It is ok to do "inplace" as long as we still return right # after the loop ends common_kwargs.pop('dataset', '') for r in destination_dataset.get( curdir, description=description, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, **common_kwargs): r['refds'] = refds_path yield r # at this point no futher post-processing should be necessary, # `clone` and `get` must have done that (incl. parent handling) # if not, bugs should be fixed in those commands return
class CheckDates(Interface): """Find repository dates that are more recent than a reference date. The main purpose of this tool is to find "leaked" real dates in repositories that are configured to use fake dates. It checks dates from three sources: (1) commit timestamps (author and committer dates), (2) timestamps within files of the "git-annex" branch, and (3) the timestamps of annotated tags. """ from datalad.interface.utils import eval_results import datalad.support.ansi_colors as ac from datalad.support.constraints import EnsureChoice, EnsureNone, EnsureStr from datalad.support.param import Parameter result_renderer = "tailored" @staticmethod def custom_result_renderer(res, **kwargs): """Like 'json_pp', but skip non-error results without flagged objects. """ # FIXME: I think the proper way to do this is to use 'result_filter', # but I couldn't seem to get eval_results to detect the filter when I # used # # result_renderer = "json_pp" # result_filter = lambda x: ... # # Also, I want to keep the "message" key for errors. from datalad.ui import ui to_render = {} if res["status"] == "error": to_render = dict(res.items()) elif "report" in res and res["report"]["objects"]: to_render = { k: v for k, v in res.items() if k not in ["status", "message", "logger"] } if to_render: ui.message(json.dumps(to_render, sort_keys=True, indent=2)) _params_ = dict( paths=Parameter( args=("paths", ), metavar="PATH", nargs="*", doc="""Root directory in which to search for Git repositories. The current working directory will be used by default.""", constraints=EnsureStr() | EnsureNone()), reference_date=Parameter( args=("-D", "--reference-date"), metavar="DATE", doc="""Compare dates to this date. If dateutil is installed, this value can be any format that its parser recognizes. Otherwise, it should be a unix timestamp that starts with a "@". The default value corresponds to 01 Jan, 2018 00:00:00 -0000.""", constraints=EnsureStr()), revs=Parameter( args=("--rev", ), dest="revs", action="append", metavar="REVISION", doc="""Search timestamps from commits that are reachable from [PY: these revisions PY][CMD: REVISION CMD]. Any revision specification supported by :command:`git log`, including flags like --all and --tags, can be used.[CMD: This option can be given multiple times. CMD]"""), annex=Parameter( args=("--annex", ), doc="""Mode for "git-annex" branch search. If 'all', all blobs within the branch are searched. 'tree' limits the search to blobs that are referenced by the tree at the tip of the branch. 
'none' disables search of "git-annex" blobs.""", constraints=EnsureChoice("all", "tree", "none")), no_tags=Parameter(args=("--no-tags", ), action="store_true", doc="""Don't check the dates of annotated tags."""), older=Parameter( args=("--older", ), action="store_true", doc="""Find dates which are older than the reference date rather than newer."""), ) @staticmethod @eval_results def __call__(paths, reference_date="@1514764800", revs=None, annex="all", no_tags=False, older=False): from datalad.support.repodates import check_dates which = "older" if older else "newer" try: ref_ts = _parse_date(reference_date) except ValueError as exc: lgr.error("Could not parse '%s' as a date", reference_date) yield get_status_dict("check_dates", status="error", message=exc_str(exc)) return lgr.info("Searching for dates %s than %s", which, time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(ref_ts))) for repo in _git_repos(paths or ["."]): fullpath = os.path.abspath(repo) lgr.debug("Checking %s", fullpath) try: report = check_dates(repo, ref_ts, which=which, revs=revs or ["--all"], annex={ "all": True, "none": False, "tree": "tree" }[annex], tags=not no_tags) except InvalidGitRepositoryError as exc: lgr.warning("Skipping invalid Git repo: %s", repo) continue yield get_status_dict( "check_dates", status="ok", path=fullpath, message=("Found {} dates" if report["objects"] else "No {} dates found").format(which), report=report)
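# Illustrative usage sketch (editorial addition, assuming the command is
# exposed as `datalad.api.check_dates`): scan repositories under the current
# directory for dates newer than the default reference timestamp, limiting
# the git-annex branch search to the tip tree.
def _example_check_dates():
    from datalad.api import check_dates
    for res in check_dates(['.'], reference_date='@1514764800', annex='tree',
                           return_type='generator'):
        # only report repositories where flagged objects were found
        if res.get('report', {}).get('objects'):
            print(res['path'])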
class Diff(Interface): """Report changes of dataset components. Reports can be generated for changes between recorded revisions, or between a revision and the state of a dataset's work tree. Unlike 'git diff', this command also reports untracked content when comparing a revision to the state of the work tree. Such content is marked with the property `state='untracked'` in the command results. The following types of changes are distinguished and reported via the `state` result property: - added - copied - deleted - modified - renamed - typechange - unmerged - untracked Whenever applicable, source and/or destination revisions are reported to indicate when exactly within the requested revision range a particular component changed its status. Optionally, the reported changes can be limited to a subset of paths within a dataset. """ # make the custom renderer the default one, as the global default renderer # does not yield meaningful output for this command result_renderer = 'tailored' _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to query. If no dataset is given, an attempt is made to identify the dataset based on the input and/or the current working directory""", constraints=EnsureDataset() | EnsureNone()), path=Parameter(args=("path", ), metavar="PATH", doc="""path to be evaluated""", nargs="*", constraints=EnsureStr() | EnsureNone()), revision=Parameter( args=('--revision', ), metavar='REVISION EXPRESSION', nargs='?', doc="""comparison reference specification. Three modes are supported: 1) <revision> changes you have in your working tree relative to the named revision (this can also be a branch name, tag, commit or any label Git can understand). 2) <revision>..<revision> changes between two arbitrary revisions. 3) <revision>...<revision> changes on the branch containing and up to the second <revision>, starting at a common ancestor of both revisions."""), staged=Parameter( args=("--staged", ), action="store_true", doc="""get the changes already staged for a commit relative to an optionally given revision (by default the most recent one)""" ), ignore_subdatasets=Parameter( args=('--ignore-subdatasets', ), constraints=EnsureChoice('none', 'untracked', 'dirty', 'all'), doc="""speed up execution by (partially) not evaluating the state of subdatasets in a parent dataset. With "none" a subdataset is considered modified when it either contains untracked or modified content or its last saved state differs from that recorded in the parent dataset. When "untracked" is used subdatasets are not considered modified when they only contain untracked content (but they are still scanned for modified content). Using "dirty" ignores all changes to the work tree of subdatasets, only changes to the revisions stored in the parent dataset are shown. Using "all" hides all changes to subdatasets. Note, even with "all" recursive execution will still report other changes in any existing subdataset, only the subdataset record in a parent dataset is not evaluated."""), report_untracked=Parameter( args=('--report-untracked', ), constraints=EnsureChoice('no', 'normal', 'all'), doc="""If and how untracked content is reported when comparing a revision to the state of the work tree. 
'no': no untracked files are reported; 'normal': untracked files and entire untracked directories are reported as such; 'all': report individual files even in fully untracked directories."""), recursive=recursion_flag, recursion_limit=recursion_limit) @staticmethod @datasetmethod(name='diff') @eval_results def __call__(path=None, dataset=None, revision=None, staged=False, ignore_subdatasets='none', report_untracked='normal', recursive=False, recursion_limit=None): if not dataset and not path: # act on the whole dataset if nothing else was specified dataset = curdir refds_path = Interface.get_refds_path(dataset) to_process = [] # track what commit ranges we want to diff per dataset ds_diffies = {} for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='diff', # unavailable is OK, because we might query for a deleted file unavailable_path_status='', nondataset_path_status='impossible', # must not use `modified`, infinite loop otherwise modified=None, return_type='generator', on_failure='ignore'): if ap.get('status', None): # we know what to report already yield ap continue if ap.get('type', None) == 'dataset': ap['process_content'] = True if ap.get('raw_input', False) or ap['path'] == refds_path: # prepopulate the revision specs for all input paths ds_diffies[ap['path'] if ap.get('type', None) == 'dataset' else ap['parentds']] = revision to_process.append(ap) # sort into datasets content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert (not completed) for ds_path in sorted(content_by_ds.keys()): if ds_path not in ds_diffies: # we don't know how to diff # this was neither an input path, nor did we see it # when diffing its parent continue content_paths = content_by_ds[ds_path] revision = ds_diffies[ds_path] for r in _parse_git_diff(ds_path, diff_thingie=ds_diffies[ds_path], paths=content_paths, ignore_submodules=ignore_subdatasets, staged=staged): r.update(dict(action='diff', logger=lgr)) if refds_path: r['refds'] = refds_path if 'status' not in r: r['status'] = 'ok' if r.get('type', None) == 'dataset': # this is a subdataset report # we need to use the reported commit range to properly adjust the # query once we hit that subdataset from_rev = r.get('revision_src', '') to_rev = r.get('revision', '') subrev = '{}..{}'.format( from_rev if from_rev else PRE_INIT_COMMIT_SHA, to_rev if to_rev else '', ) if from_rev and from_rev == to_rev: # this is a special case, where subdataset reported changes without # a change in state/commit -- this is code for uncommitted changes # in the subdataset (including staged ones). In such a case, we # must not provide a diff range, but only the source commit we want # to diff against # XXX if this is changed, likely the same logic in annotate_paths needs # changing too! subrev = from_rev ds_diffies[r['path']] = subrev yield r if (revision and '..' 
in revision) or report_untracked == 'no': # don't look for untracked content, we got a revision range continue for r in _get_untracked_content(ds_path, report_untracked, paths=content_paths): r.update(dict(action='diff', logger=lgr)) if refds_path: r['refds'] = refds_path if 'status' not in r: r['status'] = 'ok' yield r @staticmethod def custom_result_renderer(res, **kwargs): from datalad.ui import ui if not res['status'] == 'ok': # logging reported already return path = relpath(res['path'], start=res['refds']) \ if res.get('refds', None) else res['path'] type_ = res.get('type', res.get('type_src', '')) max_len = len('untracked(directory)') state_msg = '{}{}'.format(res['state'], '({})'.format(type_) if type_ else '') ui.message('{fill}{state_msg}: {path}'.format( fill=' ' * max(0, max_len - len(state_msg)), state_msg=state_msg, path=path))
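# Illustrative usage sketch (editorial addition): report what changed in the
# work tree relative to the parent commit, including untracked content. The
# dataset path is hypothetical.
def _example_diff(ds_path='/path/to/ds'):
    from datalad.api import Dataset
    for res in Dataset(ds_path).diff(revision='HEAD~1',
                                     report_untracked='normal',
                                     return_type='generator'):
        # 'state' is one of: added, copied, deleted, modified, renamed,
        # typechange, unmerged, untracked (see the class docstring)
        print(res.get('state'), res['path'])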
class Rerun(Interface): """Re-execute previous `datalad run` commands. This will unlock any dataset content that is on record to have been modified by the command in the specified revision. It will then re-execute the command in the recorded path (if it was inside the dataset). Afterwards, all modifications will be saved. Examples: Re-execute the command from the previous commit:: % datalad rerun Re-execute any commands in the last five commits:: % datalad rerun --since=HEAD~5 Do the same as above, but re-execute the commands on top of HEAD~5 in a detached state:: % datalad rerun --onto= --since=HEAD~5 Re-execute all previous commands and compare the old and new results:: % # on master branch % datalad rerun --branch=verify --since= % # now on verify branch % datalad diff --revision=master.. % git log --oneline --left-right --cherry-pick master... """ _params_ = dict( revision=Parameter( args=("revision", ), metavar="REVISION", nargs="?", doc="""rerun command(s) in `revision`. By default, the command from this commit will be executed, but [CMD: --since CMD][PY: `since` PY] can be used to construct a revision range.""", default="HEAD", constraints=EnsureStr()), since=Parameter( args=("--since", ), doc="""If `since` is a commit-ish, the commands from all commits that are reachable from `revision` but not `since` will be re-executed (in other words, the commands in :command:`git log SINCE..REVISION`). If SINCE is an empty string, it is set to the parent of the first commit that contains a recorded command (i.e., all commands in :command:`git log REVISION` will be re-executed).""", constraints=EnsureStr() | EnsureNone()), branch=Parameter( metavar="NAME", args=( "-b", "--branch", ), doc= "create and checkout this branch before rerunning the commands.", constraints=EnsureStr() | EnsureNone()), onto=Parameter( metavar="base", args=("--onto", ), doc="""start point for rerunning the commands. If not specified, commands are executed at HEAD. This option can be used to specify an alternative start point, which will be checked out with the branch name specified by [CMD: --branch CMD][PY: `branch` PY] or in a detached state otherwise. As a special case, an empty value for this option means to use the commit specified by [CMD: --since CMD][PY: `since` PY].""", constraints=EnsureStr() | EnsureNone()), message=Parameter( args=( "-m", "--message", ), metavar="MESSAGE", doc="""use MESSAGE for the reran commit rather than the recorded commit message. In the case of a multi-commit rerun, all the reran commits will have this message.""", constraints=EnsureStr() | EnsureNone()), script=Parameter( args=("--script", ), metavar="FILE", doc="""extract the commands into [CMD: FILE CMD][PY: this file PY] rather than rerunning. Use - to write to stdout instead.""", constraints=EnsureStr() | EnsureNone()), dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset from which to rerun a recorded command. If no dataset is given, an attempt is made to identify the dataset based on the current working directory. If a dataset is given, the command will be executed in the root directory of this dataset.""", constraints=EnsureDataset() | EnsureNone()), # TODO # --list-commands # go through the history and report any recorded command. 
this info # could be used to unlock the associated output files for a rerun ) @staticmethod @datasetmethod(name='rerun') @eval_results def __call__(revision="HEAD", since=None, dataset=None, branch=None, message=None, onto=None, script=None): ds = require_dataset(dataset, check_installed=True, purpose='rerunning a command') lgr.debug('rerunning command output underneath %s', ds) if script is None and ds.repo.dirty: yield get_status_dict('run', ds=ds, status='impossible', message=('unsaved modifications present, ' 'cannot detect changes by command')) return err_info = get_status_dict('run', ds=ds) if not ds.repo.get_hexsha(): yield dict(err_info, status='impossible', message='cannot rerun command, nothing recorded') return if branch and branch in ds.repo.get_branches(): yield get_status_dict( "run", ds=ds, status="error", message="branch '{}' already exists".format(branch)) return if not commit_exists(ds, revision + "^"): # Only a single commit is reachable from `revision`. In # this case, --since has no effect on the range construction. revrange = revision elif since is None: revrange = "{rev}^..{rev}".format(rev=revision) elif since.strip() == "": revrange = revision else: revrange = "{}..{}".format(since, revision) if ds.repo.repo.git.rev_list("--merges", revrange, "--"): yield get_status_dict( "run", ds=ds, status="error", message="cannot rerun history with merge commits") return revs = [{ "hexsha": hexsha, "message": ds.repo.repo.git.show(hexsha, "--format=%B", "--no-patch") } for hexsha in ds.repo.repo.git.rev_list( "--reverse", revrange, "--").split()] for rev in revs: try: msg, info = get_run_info(rev["message"]) except ValueError as exc: yield dict(err_info, status='error', message="Error on {}'s message: {}".format( rev["hexsha"], exc_str(exc))) return if info is not None: rev["run_info"] = info rev["run_message"] = msg if since is not None and since.strip() == "": # For --since='', drop any leading commits that don't have # a run command. revs = list(dropwhile(lambda r: "run_info" not in r, revs)) if script: ofh = sys.stdout if script.strip() == "-" else open(script, "w") header = """\ #!/bin/sh # # This file was generated by running (the equivalent of) # # datalad rerun --script={script}{since} {revision} # # in {ds}{path}\n""" ofh.write( header.format( script=script, since="" if since is None else " --since=" + since, revision=ds.repo.repo.git.rev_parse(revision), ds='dataset {} at '.format(ds.id) if ds.id else '', path=ds.path)) for rev in revs: if "run_info" not in rev: continue cmd = rev["run_info"]["cmd"] msg = rev["run_message"] if msg == _format_cmd_shorty(cmd): msg = '' ofh.write("\n" + "".join("# " + ln for ln in msg.splitlines(True)) + "\n") commit_descr = ds.repo.describe(rev['hexsha']) ofh.write('# (record: {})\n'.format( commit_descr if commit_descr else rev['hexsha'])) if isinstance(cmd, list): cmd = " ".join(cmd) ofh.write(cmd + "\n") if ofh is not sys.stdout: ofh.close() else: if onto is not None and onto.strip() == "": # Special case: --onto='' is the value of --since. # Because we're currently aborting if the revision list # contains merges, we know that, regardless of if and how # --since is specified, the effective value for --since is # the parent of the first revision. onto = revs[0]["hexsha"] + "^" if not commit_exists(ds, onto): # This is unlikely to happen in the wild because it # means that the first commit is a datalad run commit. # Just abort rather than trying to checkout on orphan # branch or something like that. 
yield get_status_dict( "run", ds=ds, status="error", message="Commit for --onto does not exist.") return if branch or onto: start_point = onto or "HEAD" if branch: checkout_options = ["-b", branch] else: checkout_options = ["--detach"] ds.repo.checkout(start_point, options=checkout_options) for rev in revs: hexsha = rev["hexsha"] if "run_info" not in rev: pick = False try: ds.repo.repo.git.merge_base("--is-ancestor", hexsha, "HEAD") except GitCommandError: # Revision is NOT an ancestor of HEAD. pick = True shortrev = ds.repo.repo.git.rev_parse("--short", hexsha) err_msg = "no command for {} found; {}".format( shortrev, "cherry picking" if pick else "skipping") yield dict(err_info, status='ok', message=err_msg) if pick: ds.repo.repo.git.cherry_pick(hexsha) continue run_info = rev["run_info"] # Keep a "rerun" trail. if "chain" in run_info: run_info["chain"].append(hexsha) else: run_info["chain"] = [hexsha] # now we have to find out what was modified during the last run, # and enable re-modification. Ideally, we would bring back the # entire state of the tree with #1424, but for now we limit # ourselves to file addition/not-in-place-modification for r in ds.unlock(new_or_modified(ds, hexsha), return_type='generator', result_xfm=None): yield r for r in run_command(run_info['cmd'], ds, message or rev["run_message"], rerun_info=run_info): yield r
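# Illustrative usage sketch (editorial addition): instead of re-executing,
# extract every recorded command reachable from HEAD into a shell script on
# stdout, as handled by the `script` branch above. The dataset path is
# hypothetical.
def _example_rerun_script(ds_path='/path/to/ds'):
    from datalad.api import Dataset
    ds = Dataset(ds_path)
    # since='' starts at the first commit that carries a recorded command;
    # script='-' writes the replay script to stdout instead of re-executing
    ds.rerun(since='', script='-')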
class Siblings(Interface): """Manage sibling configuration This command offers five different actions: 'query', 'add', 'remove', 'configure', 'enable'. 'query' is the default action and can be used to obtain information about (all) known siblings. 'add' and 'configure' are highly similar actions, the only difference being that adding a sibling with a name that is already registered will fail, whereas re-configuring a (different) sibling under a known name will not be considered an error. 'enable' can be used to complete access configuration for non-Git siblings (aka git-annex special remotes). Lastly, the 'remove' action allows for the removal (or de-configuration) of a registered sibling. For each sibling (added, configured, or queried) all known sibling properties are reported. This includes: "name" Name of the sibling "path" Absolute path of the dataset "url" For regular siblings at minimum a "fetch" URL, possibly also a "pushurl" Additionally, any further configuration will also be reported using a key that matches that in the Git configuration. By default, sibling information is rendered as one line per sibling following this scheme:: <dataset_path>: <sibling_name>(<+|->) [<access_specification>] where the `+` and `-` labels indicate the presence or absence of a remote data annex at a particular remote, and `access_specification` contains either a URL and/or a type label for the sibling. """ # make the custom renderer the default, path reporting isn't the top # priority here result_renderer = 'tailored' _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to configure. If no dataset is given, an attempt is made to identify the dataset based on the input and/or the current working directory""", constraints=EnsureDataset() | EnsureNone()), name=Parameter( args=( '-s', '--name', ), metavar='NAME', doc="""name of the sibling. For sibling removal this option is mandatory, otherwise the hostname part of a given URL is used as a default. This option can be used to limit 'query' to a specific sibling.""", constraints=EnsureStr() | EnsureNone()), action=Parameter( args=('action', ), nargs='?', metavar='ACTION', doc="""command action selection (see general documentation)""", constraints=EnsureChoice('query', 'add', 'remove', 'configure', 'enable') | EnsureNone()), url=Parameter(args=('--url', ), doc="""the URL of or path to the dataset sibling named by `name`. 
For recursive operation it is required that a template string for building subdataset sibling URLs is given.\n List of currently available placeholders:\n %%NAME\tthe name of the dataset, where slashes are replaced by dashes.""", constraints=EnsureStr() | EnsureNone(), nargs="?"), pushurl=Parameter( args=('--pushurl', ), doc="""in case the `url` cannot be used to publish to the dataset sibling, this option specifies a URL to be used instead.\nIf no `url` is given, `pushurl` serves as `url` as well.""", constraints=EnsureStr() | EnsureNone()), description=location_description, ## info options # --template/cfgfrom gh-1462 (maybe also for a one-time inherit) # --wanted gh-925 (also see below for add_sibling approach) fetch=Parameter(args=("--fetch", ), action="store_true", doc="""fetch the sibling after configuration"""), as_common_datasrc=as_common_datasrc, publish_depends=publish_depends, publish_by_default=publish_by_default, annex_wanted=annex_wanted_opt, annex_required=annex_required_opt, annex_group=annex_group_opt, annex_groupwanted=annex_groupwanted_opt, inherit=inherit_opt, get_annex_info=Parameter( args=("--no-annex-info", ), dest='get_annex_info', action="store_false", doc= """Whether to query all information about the annex configurations of siblings. Can be disabled if speed is a concern"""), recursive=recursion_flag, recursion_limit=recursion_limit) @staticmethod @datasetmethod(name='siblings') @eval_results def __call__( action='query', dataset=None, name=None, url=None, pushurl=None, description=None, # TODO consider true, for now like add_sibling fetch=False, as_common_datasrc=None, publish_depends=None, publish_by_default=None, annex_wanted=None, annex_required=None, annex_group=None, annex_groupwanted=None, inherit=False, get_annex_info=True, recursive=False, recursion_limit=None): # TODO: Detect malformed URL and fail? # XXX possibly fail if fetch is False and as_common_datasrc if annex_groupwanted and not annex_group: raise InsufficientArgumentsError( "To set groupwanted, you need to provide annex_group option") # TODO catch invalid action specified action_worker_map = { 'query': _query_remotes, 'add': _add_remote, 'configure': _configure_remote, 'remove': _remove_remote, 'enable': _enable_remote, } # all workers strictly operate on a single dataset # anything that deals with hierarchies and/or dataset # relationships in general should be dealt with in here # at the top-level and vice versa worker = action_worker_map[action] dataset = require_dataset(dataset, check_installed=False, purpose='sibling configuration') refds_path = dataset.path res_kwargs = dict(refds=refds_path, logger=lgr) ds_name = basename(dataset.path) # do not form a single list of datasets (with recursion results) to # give the fastest possible response, at the price of a longer overall # function call ds = dataset for r in worker( # always copy signature to below to avoid bugs! ds, name, ds.repo.get_remotes(), # for the top-level dataset there are no layout questions _mangle_urls(url, ds_name), _mangle_urls(pushurl, ds_name), fetch, description, as_common_datasrc, publish_depends, publish_by_default, annex_wanted, annex_required, annex_group, annex_groupwanted, inherit, get_annex_info, **res_kwargs): yield r if not recursive: return # do we have instructions to register siblings with some alternative # layout? 
replicate_local_structure = url and "%NAME" not in url for subds in dataset.subdatasets(fulfilled=True, recursive=recursive, recursion_limit=recursion_limit, result_xfm='datasets'): subds_name = relpath(subds.path, start=dataset.path) if replicate_local_structure: subds_url = slash_join(url, subds_name) subds_pushurl = slash_join(pushurl, subds_name) else: subds_url = \ _mangle_urls(url, '/'.join([ds_name, subds_name])) subds_pushurl = \ _mangle_urls(pushurl, '/'.join([ds_name, subds_name])) for r in worker( # always copy signature from above to avoid bugs subds, name, subds.repo.get_remotes(), subds_url, subds_pushurl, fetch, description, as_common_datasrc, publish_depends, publish_by_default, annex_wanted, annex_required, annex_group, annex_groupwanted, inherit, get_annex_info, **res_kwargs): yield r @staticmethod def custom_result_renderer(res, **kwargs): from datalad.ui import ui if res['status'] != 'ok' or not res.get('action', '').endswith('-sibling'): # logging complained about this already return path = relpath(res['path'], res['refds']) if res.get( 'refds', None) else res['path'] got_url = 'url' in res spec = '{}{}{}{}'.format(res.get('url', ''), ' (' if got_url else '', res.get('annex-externaltype', 'git'), ')' if got_url else '') ui.message('{path}: {name}({with_annex}) [{spec}]'.format( **dict( res, path=path, # TODO report '+' for special remotes with_annex='+' if 'annex-uuid' in res \ else ('-' if res.get('annex-ignore', None) else '?'), spec=spec)))
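# Illustrative usage sketch (editorial addition): recursively configure a
# sibling 'backup', with subdataset URLs derived via the %NAME placeholder
# handled above. Sibling name, host, and paths are hypothetical.
def _example_siblings(ds_path='/path/to/ds'):
    from datalad.api import Dataset
    ds = Dataset(ds_path)
    for res in ds.siblings(action='configure', name='backup',
                           url='ssh://backup.example.com/store/%NAME',
                           recursive=True, return_type='generator'):
        print(res['status'], res.get('name'))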
class Save(Interface): """Save the current state of a dataset Saving the state of a dataset records changes that have been made to it. This change record is annotated with a user-provided description. Optionally, an additional tag, such as a version, can be assigned to the saved state. Such a tag enables straightforward retrieval of past versions at a later point in time. Examples: Save any content underneath the current directory, without altering any potential subdataset (use --recursive for that):: % datalad save . Save any modification of known dataset content, but leave untracked files (e.g. temporary files) untouched:: % datalad save -d <path_to_dataset> Tag the most recent saved state of a dataset:: % datalad save -d <path_to_dataset> --version-tag bestyet """ _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to save""", constraints=EnsureDataset() | EnsureNone()), path=Parameter( args=("path", ), metavar='PATH', doc="""path/name of the dataset component to save. If given, only changes made to those components are recorded in the new state.""", nargs='*', constraints=EnsureStr() | EnsureNone()), message=save_message_opt, message_file=message_file_opt, # switch not functional from cmdline: default True, action=store_true # TODO remove from API? all_updated=False is not used anywhere in the codebase all_updated=Parameter( args=("-u", "--all-updated"), doc="""if no explicit paths are given, save changes of all known components in a dataset""", action="store_true"), version_tag=Parameter(args=("--version-tag", ), metavar='ID', doc="""an additional marker for that state.""", constraints=EnsureStr() | EnsureNone()), super_datasets=super_datasets_flag, recursive=recursion_flag, recursion_limit=recursion_limit, ) @staticmethod @datasetmethod(name='save') @eval_results def __call__(message=None, path=None, dataset=None, all_updated=True, version_tag=None, recursive=False, recursion_limit=None, super_datasets=False, message_file=None): if not dataset and not path: # we got nothing at all -> save what is staged in the repo in "this" directory? # make sure we don't treat this as a user-provided '.' 
argument path = [{'path': abspath(curdir), 'raw_input': False}] refds_path = Interface.get_refds_path(dataset) if message and message_file: raise ValueError("Both a message and message file were specified") if message_file: with open(message_file, "rb") as mfh: message = assure_unicode(mfh.read()) to_process = [] got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='save', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', modified='HEAD' if not path and recursive else None, return_type='generator', on_failure='ignore'): if ap.get('state', None) == 'untracked' and not ap.get('raw_input', False): # this path was found untracked, but not explicitly given to save # we will silently ignore this continue got_nothing = False # next check should not be done during annotation, as it is possibly expensive # and not generally useful if ap.get('status', None) == 'impossible' and \ ap.get('state', None) == 'absent' and \ ap.get('parentds', None): # this is not here anymore, but it might actually have been a deleted # component if relpath(ap['path'], start=ap['parentds']) \ in Dataset(ap['parentds']).repo.get_deleted_files(): # ok, this is a staged deletion that we want to save ap['status'] = '' del ap['message'] if ap.get('status', None): # this is done yield ap continue # for things like: `ds.save()` # or recursively discovered datasets if ap['path'] == refds_path or \ (ap.get('type', None) == 'dataset' and not ap.get('raw_input', False) and not ap.get('state', None) == 'absent'): ap['process_content'] = True ap['process_updated_only'] = all_updated to_process.append(ap) lgr.log(2, "save, to_process=%r", to_process) if got_nothing and recursive and refds_path: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict('save', status='notneeded', path=refds_path, type='dataset', logger=lgr) return if not to_process: # nothing left to do, potentially all errored before return if super_datasets: # search for the topmost superdatasets of any path dss = [ Dataset(ap.get('parentds', ap['path'])) for ap in to_process ] superdss = [ds.get_superdataset(topmost=True) for ds in dss] superdss = get_tree_roots( unique(ds.path for ds in dss + superdss if ds)) if dataset: # need to adjust the reference to the new superds # if we had one ref before, we should still have exactly one assert len(superdss) <= 1 dataset = list(superdss.keys())[0] refds_path = dataset elif refds_path: # there is a single superdataset superdss = { refds_path: unique( [ap['parentds'] for ap in to_process if 'parentds' in ap]) } else: # sort all datasets under their potential superdatasets # start from the top to get all subdatasets down the line # and collate them into as few superdatasets as possible # this is quick, just string operations superdss = get_tree_roots( unique( [ap['parentds'] for ap in to_process if 'parentds' in ap])) # for each "superdataset" check the tree of subdatasets and make sure # we gather all datasets between the super and any subdataset # so we can save them all bottom-up in order to be able to properly # save the superdataset # if this is called from e.g. `add` this is actually not necessary, # but in the general case we cannot avoid it # TODO maybe introduce a switch? 
discovered = {} for superds_path in superdss: target_subs = superdss[superds_path] discover_dataset_trace_to_targets( # from here superds_path, # to all target_subs, [], discovered) # create a new minimally annotated path for each discovered dataset discovered_added = set() for parentds in discovered: for subds in discovered[parentds]: to_process.append( dict(path=subds, parentds=parentds, type='dataset')) discovered_added.add(subds) # make sure we have an entry for each dataset, including those # that are just parents for parentds in discovered: if parentds not in discovered_added: to_process.append( dict( path=parentds, type='dataset', # make sure we save content of superds later on process_content=True, # but not do nasty things, like adding untracked content # just because we discovered this dataset process_updated_only=True)) # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, deduplication happens here too annotated_paths = AnnotatePaths.__call__( path=to_process, dataset=dataset, # never recursion, done already recursive=False, action='save', unavailable_path_status='', nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') # now sort into datasets so we can process them one by one content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert (not completed) # iterate over all datasets, starting at the bottom for dspath in sorted(content_by_ds.keys(), reverse=True): ds = Dataset(dspath) res = get_status_dict('save', ds=ds, logger=lgr) if not ds.is_installed(): # TODO This is likely impossible now res['status'] = 'impossible' res['message'] = ('dataset %s is not installed', ds) yield res continue saved_state = save_dataset(ds, content_by_ds[dspath], message=message) res['status'] = 'ok' if saved_state else 'notneeded' # MIH: let's tag even if there was nothing to commit. I'd forget this # option too often... if version_tag: try: # TODO: check whether comment below is still true after # removing the log swallowing: # again cannot help but force-silence low-level code, because # it screams like a madman instead of allowing top-level # code an orderly error report ds.repo.tag(version_tag) # even if we haven't saved anything res['status'] = 'ok' yield res except CommandError as e: if saved_state: # first we yield the result for the actual save yield res # and now complain that tagging didn't work yield get_status_dict( 'save', ds=ds, logger=lgr, status='error', message=('cannot tag this version: %s', e.stderr.strip())) else: yield res @staticmethod def custom_result_renderer(res, **kwargs): from datalad.ui import ui if not res or res.get('type', None) != 'dataset' or 'path' not in res: return ds = Dataset(res['path']) commit = ds.repo.get_hexsha() ui.message('Saved state: {0} for {1}'.format(commit, ds))
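# Illustrative usage sketch (editorial addition): save all modifications
# across the dataset hierarchy and tag the resulting state, using the
# `version_tag` handling above. Path, message, and tag are hypothetical.
def _example_save(ds_path='/path/to/ds'):
    from datalad.api import Dataset
    ds = Dataset(ds_path)
    for res in ds.save(message='update analysis outputs', version_tag='v0.2',
                       recursive=True, return_type='generator'):
        print(res['status'], res['path'])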
class Get(Interface): """Get any dataset content (files/directories/subdatasets). This command only operates on dataset content. To obtain a new independent dataset from some source use the `clone` command. By default this command operates recursively within a dataset, but not across potential subdatasets, i.e. if a directory is provided, all files in the directory are obtained. Recursion into subdatasets is supported too. If enabled, relevant subdatasets are detected and installed in order to fulfill a request. Known data locations for each requested file are evaluated and data are obtained from some available location (according to git-annex configuration and possibly assigned remote priorities), unless a specific source is specified. *Getting subdatasets* Just as DataLad supports getting file content from more than one location, the same is supported for subdatasets, including a ranking of individual sources for prioritization. The following location candidates are considered. For each candidate a cost is given in parenthesis, higher values indicate higher cost, and thus lower priority: - URL of any configured superdataset remote that is known to have the desired submodule commit, with the submodule path appended to it. There can be more than one candidate (cost 500). - In case `.gitmodules` contains a relative path instead of a URL, the URL of any configured superdataset remote that is known to have the desired submodule commit, with this relative path appended to it. There can be more than one candidate (cost 500). - A URL or absolute path recorded in `.gitmodules` (cost 600). - In case `.gitmodules` contains a relative path as a URL, the absolute path of the superdataset, appended with this relative path (cost 900). Additional candidate URLs can be generated based on templates specified as configuration variables with the pattern `datalad.get.subdataset-source-candidate-<name>` where `name` is an arbitrary identifier. If `name` starts with three digits (e.g. '400myserver') these will be interpreted as a cost, and the respective candidate will be sorted into the generated candidate list according to this cost. If no cost is given, a default of 700 is used. A template string assigned to such a variable can utilize the Python format mini language and may reference a number of properties that are inferred from the parent dataset's knowledge about the target subdataset. Properties include any submodule property specified in the respective `.gitmodules` record. For convenience, an existing `datalad-id` record is made available under the shortened name `id`. Additionally, the URL of any configured remote that contains the respective submodule commit is available as `remote-<name>` properties, where `name` is the configured remote name. Lastly, all candidates are sorted according to their cost (lower values first), and duplicate URLs are stripped, while preserving the first item in the candidate list. .. note:: Power-user info: This command uses :command:`git annex get` to fulfill file handles. """ _examples_ = [ dict(text="Get a single file", code_py="get('path/to/file')", code_cmd="datalad get <path/to/file>"), dict(text="Get contents of a directory", code_py="get('path/to/dir/')", code_cmd="datalad get <path/to/dir/>"), dict( text="Get all contents of the current dataset and its subdatasets", code_py="get(dataset='.', recursive=True)", code_cmd="datalad get . 
-r"), dict( text="Get (clone) a registered subdataset, but don't retrieve data", code_py="get('path/to/subds', get_data=False)", code_cmd="datalad get -n <path/to/subds>"), ] _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), metavar="PATH", doc="""specify the dataset to perform the add operation on, in which case `path` arguments are interpreted as being relative to this dataset. If no dataset is given, an attempt is made to identify a dataset for each input `path`""", constraints=EnsureDataset() | EnsureNone()), path=Parameter( args=("path", ), metavar="PATH", doc="""path/name of the requested dataset component. The component must already be known to a dataset. To add new components to a dataset use the `add` command""", nargs="*", constraints=EnsureStr() | EnsureNone()), source=Parameter( args=( "-s", "--source", ), metavar="LABEL", doc="""label of the data source to be used to fulfill requests. This can be the name of a dataset :term:`sibling` or another known source""", constraints=EnsureStr() | EnsureNone()), recursive=recursion_flag, recursion_limit=Parameter( args=( "-R", "--recursion-limit", ), metavar="LEVELS", constraints=EnsureInt() | EnsureChoice('existing') | EnsureNone(), doc="""limit recursion into subdataset to the given number of levels. Alternatively, 'existing' will limit recursion to subdatasets that already existed on the filesystem at the start of processing, and prevent new subdatasets from being obtained recursively."""), get_data=Parameter( args=( "-n", "--no-data", ), dest='get_data', action='store_false', doc= """whether to obtain data for all file handles. If disabled, `get` operations are limited to dataset handles.[CMD: This option prevents data for file handles from being obtained CMD]"""), description=location_description, reckless=reckless_opt, jobs=jobs_opt) @staticmethod @datasetmethod(name='get') @eval_results def __call__( path=None, *, source=None, dataset=None, recursive=False, recursion_limit=None, get_data=True, description=None, reckless=None, jobs='auto', ): if not (dataset or path): raise InsufficientArgumentsError( "Neither dataset nor target path(s) provided") # we have to have a single dataset to operate on refds = require_dataset(dataset, check_installed=True, purpose='get content of %s' % shortened_repr(path)) # some functions downstream expect a str refds_path = refds.path if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path content_by_ds = {} # use subdatasets() to discover any relevant content that is not # already present in the root dataset (refds) for sdsres in Subdatasets.__call__( contains=path, # maintain path argument semantics and pass in dataset arg # as is dataset=dataset, # always come from the top to get sensible generator behavior bottomup=False, # when paths are given, they will constrain the recursion # automatically, and we need to enable recursion so we can # location path in subdatasets several levels down recursive=True if path else recursive, recursion_limit=None if path else recursion_limit, return_type='generator', on_failure='ignore', result_renderer='disabled'): if sdsres.get('type', None) != 'dataset': # if it is not about a 'dataset' it is likely content in # the root dataset if sdsres.get('status', None) == 'impossible' and \ sdsres.get('message', None) == \ 'path not contained in any matching subdataset': target_path = Path(sdsres['path']) if refds.pathobj != target_path and \ refds.pathobj not in target_path.parents: yield dict( action='get', 
        # use subdatasets() to discover any relevant content that is not
        # already present in the root dataset (refds)
        for sdsres in Subdatasets.__call__(
                contains=path,
                # maintain path argument semantics and pass in dataset arg
                # as is
                dataset=dataset,
                # always come from the top to get sensible generator behavior
                bottomup=False,
                # when paths are given, they will constrain the recursion
                # automatically, and we need to enable recursion so we can
                # locate paths in subdatasets several levels down
                recursive=True if path else recursive,
                recursion_limit=None if path else recursion_limit,
                return_type='generator',
                on_failure='ignore',
                result_renderer='disabled'):
            if sdsres.get('type', None) != 'dataset':
                # if it is not about a 'dataset' it is likely content in
                # the root dataset
                if sdsres.get('status', None) == 'impossible' and \
                        sdsres.get('message', None) == \
                        'path not contained in any matching subdataset':
                    target_path = Path(sdsres['path'])
                    if refds.pathobj != target_path and \
                            refds.pathobj not in target_path.parents:
                        yield dict(
                            action='get',
                            path=str(target_path),
                            status='error',
                            message=('path not associated with dataset %s',
                                     refds),
                        )
                        continue
                    # check if we need to obtain anything underneath this path
                    # the subdatasets() call above will only look _until_ it
                    # hits the target path
                    for res in _install_targetpath(
                            refds,
                            Path(sdsres['path']),
                            recursive,
                            recursion_limit,
                            reckless,
                            refds_path,
                            description,
                            jobs=jobs,
                    ):
                        # fish out the datasets that 'contains' a target path
                        # and store them for later
                        if res.get('status', None) in ('ok', 'notneeded') and \
                                'contains' in res:
                            dsrec = content_by_ds.get(res['path'], set())
                            dsrec.update(res['contains'])
                            content_by_ds[res['path']] = dsrec
                        if res.get('status', None) != 'notneeded':
                            # all those messages on not having installed
                            # anything are a bit pointless
                            # "notneeded" for annex get comes below
                            yield res
                else:
                    # dunno what this is, send upstairs
                    yield sdsres
                # must continue for both conditional branches above
                # the rest is about stuff in real subdatasets
                continue
            # instance of the closest existing dataset for this result
            ds = Dataset(sdsres['parentds']
                         if sdsres.get('state', None) == 'absent'
                         else sdsres['path'])
            assert 'contains' in sdsres
            # explore the unknown
            for target_path in sdsres.get('contains', []):
                # essentially the same as done above for paths in the root
                # dataset, but here we are starting from the closest
                # discovered subdataset
                for res in _install_targetpath(
                        ds,
                        Path(target_path),
                        recursive,
                        recursion_limit,
                        reckless,
                        refds_path,
                        description,
                        jobs=jobs,
                ):
                    known_ds = res['path'] in content_by_ds
                    if res.get('status', None) in ('ok', 'notneeded') and \
                            'contains' in res:
                        dsrec = content_by_ds.get(res['path'], set())
                        dsrec.update(res['contains'])
                        content_by_ds[res['path']] = dsrec
                    # prevent double-reporting of datasets that have been
                    # installed by explorative installation to get to target
                    # paths, prior in this loop
                    if res.get('status', None) != 'notneeded' or not known_ds:
                        yield res

        if not get_data:
            # done already
            return

        # and now annex-get, this could all be done in parallel now
        for ds, content in content_by_ds.items():
            for res in _get_targetpaths(
                    Dataset(ds), content, refds.path, source, jobs):
                if 'path' not in res or res['path'] not in content_by_ds:
                    # we had reports on datasets and subdatasets already
                    # before the annex stage
                    yield res
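# A minimal usage sketch for the candidate-template mechanism documented in
# the Get docstring above. It registers an extra subdataset source candidate
# via plain `git config` (the '400' prefix in the name is the cost, sorting it
# before the built-in candidates at cost 500+; '{id}' expands to the
# subdataset's datalad-id), then fetches content through the Python API. The
# dataset path, candidate URL, and target paths are assumptions for
# illustration only.
def _example_get_usage(superds_path='/path/to/superdataset'):
    import subprocess
    import datalad.api as dl
    subprocess.run(
        ['git', 'config', '--file', '.datalad/config',
         'datalad.get.subdataset-source-candidate-400myserver',
         'https://example.com/datasets/{id}'],
        cwd=superds_path, check=True)
    # obtain a registered subdataset without retrieving its file content ...
    dl.get('path/to/subds', dataset=superds_path, get_data=False)
    # ... and recursively fetch everything underneath a directory
    dl.get('path/to/dir', dataset=superds_path, recursive=True)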
class Crawl(Interface):
    """Crawl online resource to create or update a dataset.

    Examples:

      $ datalad crawl  # within a dataset having .datalad/crawl/crawl.cfg
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True
    _params_ = dict(
        # Dry run is untested and largely probably not working in this
        # implementation, so let's not expose it for now at all
        # dry_run=Parameter(
        #     args=("-n", "--dry-run"),
        #     action="store_true",
        #     doc="""flag if file manipulations to be invoked (e.g., adding to
        #     git/annex). If not, commands are only printed to the stdout"""),
        is_pipeline=Parameter(
            args=("--is-pipeline",),
            action="store_true",
            doc="""flag if provided file is a Python script which defines
            pipeline()"""),
        is_template=Parameter(
            args=("-t", "--is-template"),
            action="store_true",
            doc="""flag if provided value is the name of the template to
            use"""),
        recursive=Parameter(
            args=("-r", "--recursive"),
            action="store_true",
            doc="""flag to crawl subdatasets as well (for now serially)"""),
        chdir=Parameter(
            args=("-C", "--chdir"),
            constraints=EnsureStr() | EnsureNone(),
            doc="""directory to chdir to for crawling"""),
        path=Parameter(
            args=('path',),
            metavar='file',
            nargs='?',
            constraints=EnsureStr() | EnsureNone(),
            doc="""configuration (or pipeline if --is-pipeline) file defining
            crawling, or a directory of a dataset on which to perform crawling
            using its standard crawling specification"""),
    )

    @staticmethod
    def __call__(path=None, is_pipeline=False, is_template=False,
                 recursive=False, chdir=None):  # dry_run=False,
        dry_run = False

        from datalad.crawler.pipeline import (
            load_pipeline_from_config,
            load_pipeline_from_module,
            get_repo_pipeline_config_path,
            get_repo_pipeline_script_path,
        )
        from datalad.crawler.pipeline import run_pipeline
        from datalad.utils import chpwd  # import late so we could mock during tests

        with chpwd(chdir):
            assert not (is_pipeline and is_template), \
                "it is either a pipeline or a template name, can't be both"

            if is_template:
                # generate a config and overload path with its filename
                path = initiate_pipeline_config(
                    template=path,  # kwargs=TODO,
                    commit=True)

            # TODO: centralize via _params_ handling
            if dry_run:
                dryrun_optlabel = 'datalad.crawl.dryrun'
                if dryrun_optlabel in cfg:
                    cfg.unset(dryrun_optlabel, where='local', reload=False)
                cfg.add(dryrun_optlabel, "True", where='local')

            if path is None:
                # get config from the current repository/dataset
                if is_pipeline:
                    raise ValueError(
                        "You must specify the file if --is-pipeline")
                # Let's see if there is a config or pipeline in this repo
                path = get_repo_pipeline_config_path()
                if not path or not exists(path):
                    # Check if there may be the pipeline provided
                    path = get_repo_pipeline_script_path()
                    if path and exists(path):
                        is_pipeline = True

            stats = ActivityStats()

            if not path:
                raise RuntimeError(
                    "Cannot locate crawler config or pipeline file")

            if is_pipeline:
                lgr.info("Loading pipeline definition from %s", path)
                pipeline = load_pipeline_from_module(path)
            else:
                lgr.info("Loading pipeline specification from %s", path)
                pipeline = load_pipeline_from_config(path)

            lgr.info("Running pipeline %s", str(pipeline))
            # TODO: capture the state of all branches so in case of crash
            # we could gracefully reset back
            try:
                output = run_pipeline(pipeline, stats=stats)
            except Exception as exc:
                # TODO: config.crawl.failure = full-reset | last-good-master
                # probably ask via ui which action should be performed unless
                # explicitly specified
                raise
            stats.datasets_crawled += 1

            # TODO: Move gc/clean over here!
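            # NOTE: the recursive branch below crawls each subdataset
            # serially, swallowing its log output into a per-dataset logfile;
            # a failed subdataset crawl is counted in the totals and recorded
            # as a None output, but does not abort the overall run.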
            stats_total = stats.get_total()

            if recursive:
                # get all subdatasets, and crawl them too!
                ## ? assert path_orig is None, \
                ##     "Otherwise not sure what to do with path=%r in subdatasets" % path
                import os
                from ..distribution.dataset import Dataset
                from ..api import crawl
                from ..utils import swallow_logs
                from ..dochelpers import exc_str
                # Note: we could collect all datasets to be crawled here, or
                # pass recursive=True into the subdatasets' crawl. We collect
                # them all here, so we might later also introduce automatic
                # commits when the super-dataset got successfully updated.
                subdatasets = Dataset(os.curdir).subdatasets(
                    recursive=recursive, result_xfm='relpaths')

                lgr.info("Crawling %d subdatasets", len(subdatasets))
                output = [output]
                # TODO: parallelize
                # TODO: assumes that all sub-datasets are 'crawlable', and if
                # not, just adds them to the crawl_failed count. But maybe we
                # should make it more explicit that some sub-datasets might
                # not need to be crawled, so they get skipped explicitly?
                for ds_ in subdatasets:
                    ds_logfile = utils.get_logfilename(ds_, 'crawl')
                    try:
                        # TODO: might be cool to be able to report a 'heart
                        # beat' from the swallow into a pbar or smth
                        with swallow_logs(file_=ds_logfile) as cml:
                            output_, stats_ = crawl(chdir=ds_)
                            stats_total += stats_
                            output.append(output_)
                        lgr.info("Crawled %s: %s (log: %s)",
                                 ds_, stats_.as_str(mode='line'), ds_logfile)
                    except Exception as exc:
                        stats_total.datasets_crawl_failed += 1
                        stats_total.datasets_crawled += 1
                        output += [None]
                        lgr.warning(
                            "Crawling of %s has failed (more in %s): %s.",
                            # Log output: %s",
                            ds_, ds_logfile, exc_str(exc))  # , cml.out)

            lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

            return output, stats_total
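# A minimal sketch of driving the crawler from Python, assuming the command is
# exposed as `datalad.api.crawl` (this module itself imports it from
# `..api`) and that the target directory is a dataset carrying a standard
# crawling specification (.datalad/crawl/crawl.cfg). The dataset path below is
# hypothetical.
if __name__ == '__main__':
    from datalad.utils import chpwd
    from datalad.api import crawl

    with chpwd('/path/to/crawled/dataset'):  # hypothetical location
        # crawl this dataset and, serially, all of its subdatasets
        output, stats_total = crawl(recursive=True)
        print(stats_total.as_str(mode='line'))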