class Clone(Interface): """Obtain a dataset (copy) from a URL or local directory The purpose of this command is to obtain a new clone (copy) of a dataset and place it into a not-yet-existing or empty directory. As such `clone` provides a strict subset of the functionality offered by `install`. Only a single dataset can be obtained, and immediate recursive installation of subdatasets is not supported. However, once a (super)dataset is installed via `clone`, any content, including subdatasets, can be obtained by a subsequent `get` command. Primary differences over a direct `git clone` call are 1) the automatic initialization of a dataset annex (pure Git repositories are equally supported); 2) automatic registration of the newly obtained dataset as a subdataset (submodule), if a parent dataset is specified; 3) support for additional resource identifiers (DataLad resource identifiers as used on datasets.datalad.org, and RIA store URLs as used for store.datalad.org - optionally in specific versions as identified by a branch or a tag; see examples); and 4) automatic configurable generation of alternative access URLs for common cases (such as appending '.git' to the URL in case accessing the base URL fails). || PYTHON >>By default, the command returns a single Dataset instance for an installed dataset, regardless of whether it was newly installed ('ok' result), or found already installed from the specified source ('notneeded' result).<< PYTHON || .. seealso:: :ref:`handbook:3-001` More information on Remote Indexed Archive (RIA) stores """ # by default ignore everything but install results # i.e. no "add to super dataset" result_filter = EnsureKeyChoice('action', ('install',)) # very frequently this command will yield exactly one installed dataset # spare people the pain of going through a list by default return_type = 'item-or-list' # as discussed in #1409 and #1470, we want to return dataset instances # matching what is actually available after command completion (and # None for any failed dataset installation) result_xfm = 'successdatasets-or-none' _examples_ = [ dict(text="Install a dataset from GitHub into the current directory", code_py="clone(" "source='https://github.com/datalad-datasets/longnow" "-podcasts.git')", code_cmd="datalad clone " "https://github.com/datalad-datasets/longnow-podcasts.git"), dict(text="Install a dataset into a specific directory", code_py="""\ clone(source='https://github.com/datalad-datasets/longnow-podcasts.git', path='myfavpodcasts')""", code_cmd="""\ datalad clone https://github.com/datalad-datasets/longnow-podcasts.git \\ myfavpodcasts"""), dict(text="Install a dataset as a subdataset into the current dataset", code_py="""\ clone(dataset='.', source='https://github.com/datalad-datasets/longnow-podcasts.git')""", code_cmd="datalad clone -d . "
"https://github.com/datalad-datasets/longnow-podcasts.git"), dict(text="Install the main superdataset from datasets.datalad.org", code_py="clone(source='///')", code_cmd="datalad clone ///"), dict(text="Install a dataset identified by a literal alias from store.datalad.org", code_py="clone(source='ria+http://store.datalad.org#~hcp-openaccess')", code_cmd="datalad clone ria+http://store.datalad.org#~hcp-openaccess"), dict( text="Install a dataset in a specific version as identified by a " "branch or tag name from store.datalad.org", code_py="clone(source='ria+http://store.datalad.org#76b6ca66-36b1-11ea-a2e6-f0d5bf7b5561@myidentifier')", code_cmd="datalad clone ria+http://store.datalad.org#76b6ca66-36b1-11ea-a2e6-f0d5bf7b5561@myidentifier"), dict( text="Install a dataset with group-write access permissions", code_py=\ "clone(source='http://example.com/dataset', reckless='shared-group')", code_cmd=\ "datalad clone http://example.com/dataset --reckless shared-group"), ] _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""(parent) dataset to clone into. If given, the newly cloned dataset is registered as a subdataset of the parent. Also, if given, relative paths are interpreted as being relative to the parent dataset, and not relative to the working directory.""", constraints=EnsureDataset() | EnsureNone()), source=Parameter( args=("source",), metavar='SOURCE', doc="""URL, DataLad resource identifier, local path or instance of dataset to be cloned""", constraints=EnsureStr() | EnsureNone()), path=Parameter( args=("path",), metavar='PATH', nargs="?", doc="""path to clone into. If no `path` is provided, a destination path will be derived from a source URL similar to :command:`git clone`"""), description=location_description, reckless=reckless_opt, ) @staticmethod @datasetmethod(name='clone') @eval_results def __call__( source, path=None, dataset=None, description=None, reckless=None): # did we explicitly get a dataset to install into? # if we got a dataset, path will be resolved against it. # Otherwise path will be resolved first. ds = require_dataset( dataset, check_installed=True, purpose='cloning') \ if dataset is not None else dataset refds_path = ds.path if ds else None # legacy compatibility if reckless is True: # so that we can forget about how things used to be reckless = 'auto' if isinstance(source, Dataset): source = source.path if source == path: # even if they turn out to be identical after resolving symlinks # and more sophisticated witchcraft, it would still happily say # "it appears to be already installed", so we just catch an # obviously pointless input combination raise ValueError( "clone `source` and destination `path` are identical [{}]. " "If you are trying to add a subdataset simply use `save`".format( path)) if path is not None: path = resolve_path(path, dataset) # derive target from source: if path is None: # we got nothing but a source.
do something similar to git clone # and derive the path from the source and continue # since this is a relative `path`, resolve it: # we are not going to reuse the decoded URL, as this is done for # all source candidates in clone_dataset(), we just use it to determine # a destination path here in order to perform a bunch of additional # checks that shall not pollute the helper function source_ = decode_source_spec( source, cfg=None if ds is None else ds.config) path = resolve_path(source_['default_destpath'], dataset) lgr.debug("Determined clone target path from source") lgr.debug("Resolved clone target path to: '%s'", path) # there is no other way -- my intoxicated brain tells me assert(path is not None) result_props = dict( action='install', logger=lgr, refds=refds_path, source_url=source) try: # this will implicitly cause pathlib to run a bunch of checks # whether the present path makes any sense on the platform # we are running on -- we don't care if the path actually # exists at this point, but we want to abort early if the path # spec is determined to be useless path.exists() except OSError as e: yield get_status_dict( status='error', path=path, message=('cannot handle target path: %s', exc_str(e)), **result_props) return destination_dataset = Dataset(path) result_props['ds'] = destination_dataset if ds is not None and ds.pathobj not in path.parents: yield get_status_dict( status='error', message=("clone target path '%s' not in specified target dataset '%s'", path, ds), **result_props) return # perform the actual cloning operation yield from clone_dataset( [source], destination_dataset, reckless, description, result_props, cfg=None if ds is None else ds.config, ) # TODO: handle any 'version' property and its verification using a dedicated # public helper if ds is not None: # we created a dataset in another dataset # -> make submodule for r in ds.save( path, return_type='generator', result_filter=None, result_xfm=None, on_failure='ignore'): yield r
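# Usage sketch (a minimal, hypothetical example: the URL mirrors the examples
# above, the local paths are made up, and `datalad` must be importable).
# Demonstrates both standalone cloning and cloning into a parent dataset.
def _demo_clone_usage():  # pragma: no cover -- illustrative only, never called
    import datalad.api as dl
    # standalone clone; returns a single Dataset instance because of
    # return_type='item-or-list' and result_xfm='successdatasets-or-none'
    ds = dl.clone(
        source='https://github.com/datalad-datasets/longnow-podcasts.git',
        path='podcasts')
    # clone as a subdataset: with `dataset` given, relative paths are
    # interpreted relative to the parent, and a submodule record is saved
    parent = dl.Dataset('.')
    sub = dl.clone(
        dataset=parent,
        source='https://github.com/datalad-datasets/longnow-podcasts.git',
        path='inputs/podcasts')
    return ds, sub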
class NoAnnex(Interface): """Configure a dataset to never put some content into the dataset's annex This can be useful in mixed datasets that also contain textual data, such as source code, which can be efficiently and more conveniently managed directly in Git. Patterns generally look like this:: code/* which would match all files in the code directory. In order to match all files under ``code/``, including all its subdirectories, use such a pattern:: code/** Note that the plugin works incrementally, hence any existing configuration (e.g. from a previous plugin run) is amended, not replaced. """ from datalad.support.param import Parameter from datalad.distribution.dataset import datasetmethod from datalad.interface.utils import eval_results from datalad.distribution.dataset import EnsureDataset from datalad.support.constraints import EnsureNone _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to configure. If no dataset is given, an attempt is made to identify the dataset based on the current working directory.""", constraints=EnsureDataset() | EnsureNone()), pattern=Parameter( args=("--pattern", ), nargs='+', doc="""list of path patterns. Any content whose path matches any pattern will not be annexed when added to a dataset, but instead will be tracked directly in Git. Path patterns have to be relative to the directory given by the `ref_dir` option. By default, patterns should be relative to the root of the dataset.""" ), ref_dir=Parameter( args=("--ref-dir", ), doc="""Relative path (within the dataset) to the directory that is to be configured. All patterns are interpreted relative to this path, and configuration is written to a ``.gitattributes`` file in this directory."""), makedirs=Parameter( args=("--makedirs", ), action='store_true', doc="""If set, any missing directories will be created in order to be able to place a file into ``--ref-dir``."""), ) @staticmethod @datasetmethod(name='no_annex') @eval_results def __call__(dataset, pattern, ref_dir='.', makedirs=False): # could be extended to accept actual largefile expressions from os.path import join as opj from os.path import isabs from os.path import exists from os import makedirs as makedirsfx from datalad.distribution.dataset import require_dataset from datalad.support.annexrepo import AnnexRepo from datalad.utils import assure_list pattern = assure_list(pattern) ds = require_dataset(dataset, check_installed=True, purpose='no_annex configuration') res_kwargs = dict( path=ds.path, type='dataset', action='no_annex', ) # all the ways we refused to cooperate if not isinstance(ds.repo, AnnexRepo): yield dict(res_kwargs, status='notneeded', message='dataset has no annex') return if any(isabs(p) for p in pattern): yield dict( res_kwargs, status='error', message= ('path pattern for `no_annex` configuration must be relative paths: %s', pattern)) return if isabs(ref_dir): yield dict( res_kwargs, status='error', message= ('`ref_dir` for `no_annex` configuration must be a relative path: %s', ref_dir)) return gitattr_dir = opj(ds.path, ref_dir) if not exists(gitattr_dir): if makedirs: makedirsfx(gitattr_dir) else: yield dict( res_kwargs, status='error', message= 'target directory for `no_annex` does not exist (consider makedirs=True)' ) return gitattr_file = opj(gitattr_dir, '.gitattributes') ds.repo.set_gitattributes([(p, { 'annex.largefiles': 'nothing' }) for p in pattern], attrfile=gitattr_file) yield dict(res_kwargs, status='ok') for r in ds.save(gitattr_file, to_git=True, message="[DATALAD] exclude paths from annex'ing", result_filter=None, result_xfm=None): yield r
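# Usage sketch (hypothetical dataset location and patterns): keep source code
# and other small text files tracked directly in Git rather than in the annex.
def _demo_no_annex_usage():  # pragma: no cover -- illustrative only
    import datalad.api as dl
    ds = dl.Dataset('/tmp/myds')  # hypothetical existing dataset
    # everything under code/, including subdirectories, goes straight to Git;
    # the resulting rules are written to .gitattributes and saved
    for res in ds.no_annex(pattern=['code/**', '*.json']):
        print(res['status'], res.get('message', ''))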
class ImportDicoms(Interface): """Import a DICOM archive into a study raw dataset. This creates a subdataset, containing the extracted DICOM files, under ACQUISITION ID/dicoms. Metadata is extracted from the DICOM headers and a study specification will automatically be prefilled, based on the metadata in DICOM headers. The specification is written to ACQUISITION ID/studyspec.json by default. To this end, after the creation of the subdataset and the extraction of DICOM metadata, hirni-dicom2spec is called internally. Therefore, whatever you configure regarding dicom2spec applies here as well. Please refer to hirni-dicom2spec's documentation on how to configure the deduction from DICOM metadata to a study specification.""" _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), metavar='PATH', doc="""specify the dataset to import the DICOM archive into. If no dataset is given, an attempt is made to identify the dataset based on the current working directory and/or the given PATH""", constraints=EnsureDataset() | EnsureNone()), path=Parameter( args=("path", ), metavar='PATH', doc="""path or URL of the DICOM archive to be imported.""", constraints=EnsureStr()), acqid=Parameter( args=("acqid", ), metavar="ACQUISITION ID", doc= """acquisition identifier for the imported DICOM files. This is used as the name of the directory that is supposed to contain all data related to that acquisition. If not specified, an attempt will be made to derive ACQUISITION_ID from DICOM metadata. You can specify how to deduce that identifier from the DICOM header fields by configuring `datalad.hirni.import.acquisition-format` with a python format string referencing DICOM header field names as variables. For example, the current default value for that configuration is "{PatientID}".""", nargs="?", constraints=EnsureStr() | EnsureNone()), subject=Parameter( args=("--subject", ), metavar="SUBJECT", doc="""subject identifier. If not specified, an attempt will be made to derive SUBJECT from DICOM headers. See hirni-dicom2spec for details.""", constraints=EnsureStr() | EnsureNone()), anon_subject=Parameter( args=("--anon-subject", ), metavar="ANON_SUBJECT", doc="""an anonymized subject identifier. This is needed for anonymized conversion via spec2bids --anonymize and will be stored in the specification snippet for the imported DICOMs. Hence it can be added later and isn't mandatory for the import.""", constraints=EnsureStr() | EnsureNone()), properties=Parameter( args=("--properties", ), metavar="PATH or JSON string", doc="""a JSON string or a path to a JSON file, to provide overrides/additions to the specification snippets to be created for this acquisition. """, constraints=EnsureStr() | EnsureNone()), ) @staticmethod @datasetmethod(name='hirni_import_dcm') @eval_results def __call__(path, acqid=None, dataset=None, subject=None, anon_subject=None, properties=None): ds = require_dataset(dataset, check_installed=True, purpose="import DICOM session") if acqid: # acquisition was specified => we know where to create subds acq_dir = op.join(ds.path, acqid) if not op.exists(acq_dir): makedirs(acq_dir) # TODO: if exists: needs to be empty?
dicom_ds = _create_subds_from_tarball(path, acq_dir) else: # we don't know the acquisition id yet => create in tmp acq_dir = op.join(ds.path, '.git', 'datalad', 'hirni_import') assert not op.exists(acq_dir) # TODO: don't assert; check and adapt instead try: dicom_ds = _create_subds_from_tarball(path, acq_dir) dicom_ds = _guess_acquisition_and_move(dicom_ds, ds) except OSError as e: # TODO: Was FileExistsError. Find more accurate PY2/3 solution # than just OSError yield dict(status='impossible', path=e.filename, type='file', action='import DICOM tarball', logger=lgr, message=exc_str(e)) rmtree(acq_dir) return # we can't do anything finally: if op.exists(acq_dir): lgr.debug("Killing temp dataset at %s ...", acq_dir) rmtree(acq_dir) acqid = op.basename(op.dirname(dicom_ds.path)) ds.save(dicom_ds.path, message="[HIRNI] Add acquisition {}".format(acqid)) # Note: use path with trailing slash to indicate we want metadata about the content of this subds, # not the subds itself. ds.meta_aggregate(with_pathsep(dicom_ds.path), into='top') ds.hirni_dicom2spec(path=dicom_ds.path, spec=op.normpath( op.join(dicom_ds.path, op.pardir, "studyspec.json")), subject=subject, anon_subject=anon_subject, acquisition=acqid, properties=properties) # TODO: This should probably be optional # We have the tarball and can drop extracted stuff: dicom_ds.drop([ f for f in listdir(dicom_ds.path) if f != ".datalad" and f != ".git" ]) # finally clean up git objects: dicom_ds.repo.cmd_call_wrapper.run(['git', 'gc']) # TODO: yield error results etc. yield dict(status='ok', path=dicom_ds.path, type='dataset', action='import DICOM tarball', logger=lgr)
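# Usage sketch (hypothetical archive path and acquisition id; requires the
# datalad-hirni extension to be installed so `hirni_import_dcm` is available).
def _demo_import_dicoms():  # pragma: no cover -- illustrative only
    import datalad.api as dl
    study = dl.Dataset('/tmp/study')  # hypothetical study raw dataset
    # creates <acqid>/dicoms as a subdataset, extracts DICOM metadata, and
    # prefills <acqid>/studyspec.json via hirni-dicom2spec
    return study.hirni_import_dcm(
        path='/tmp/acq001_dicoms.tar.gz',  # hypothetical DICOM archive
        acqid='acq001',
        anon_subject='001')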
class WebApp(Interface): """Serve a registered webapp on top of a dataset """ _params_ = dict( app=Parameter(args=('app', ), nargs='?', metavar='APPNAME', doc="""Name of a registered webapp to start"""), dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to serve as the anchor of the webapp. An attempt is made to identify the dataset based on the current working directory. If a dataset is given, the command will be executed in the root directory of this dataset.""", constraints=EnsureDataset() | EnsureNone()), read_only=Parameter( args=("--read-only", ), constraints=EnsureBool(), doc="""do not perform operations other than read-only access to the dataset. It is up to the individual resources to interpret this flag and act accordingly."""), mode=Parameter( args=("--mode", ), constraints=EnsureChoice('normal', 'daemon', 'dry-run', 'debug'), doc="""Execution mode: regular foreground process (normal); background process (daemon); no server is started, but all configuration is performed (dry-run); like normal, but in debug mode (debug)"""), static_root=Parameter( args=("--static-root", ), doc="""path to static (HTML) files that should be served in root of the webapp. Defaults to the current directory."""), get_apps=Parameter(args=('--get-apps', ), action='store_true', doc="""if set, yields all registered webapps."""), ) @staticmethod @datasetmethod(name='webapp') @eval_results def __call__(app=None, dataset=None, read_only=False, mode='normal', static_root=None, get_apps=False): if get_apps: for ep in iter_entry_points('datalad.webapp.apps'): yield dict(action='webapp', status='ok' if resource_isdir( ep.module_name, ep.load()) else 'error', path=ep.name, logger=lgr, message=("provided by '%s'", ep.module_name)) return from datalad.distribution.dataset import require_dataset dataset = require_dataset(dataset, check_installed=True, purpose='serving') if static_root is None and app: for ep in iter_entry_points('datalad.webapp.apps'): if ep.name == app: app_path = resource_filename(ep.module_name, ep.load()) if not resource_isdir(ep.module_name, ep.load()): yield dict( action='webapp', status='error', path=dataset.path, message= ("app entrypoint '%s' does not point to a directory (%s)", app, app_path)) return static_root = app_path break if static_root is None: yield dict(action='webapp', status='error', path=dataset.path, message=("no registered webapp with name '%s'", app)) return elif static_root is None: static_root = op.curdir from flask import Flask app = Flask( __name__, root_path=dataset.path, static_url_path='', static_folder=op.abspath(static_root), ) app.secret_key = os.urandom(64) # expose via arg app.config['api_key'] = 'dummy' webapp_props['config'] = app.config from flask_restful import Api api = Api(app, prefix="/api/v1") # TODO add default route to static index.html, if one exists # TODO use opt-in model for endpoints to limit exposure of # functionality to what is really needed for ep in iter_entry_points('datalad.webapp.resources'): lgr.warn("Available webapp resource '%s'", ep.name) cls = ep.load() urls = ['/{}'.format(ep.name)] if hasattr(cls, '_urlarg_spec'): urls.append('/{}/{}'.format(ep.name, cls._urlarg_spec)) api.add_resource(cls, *urls, resource_class_kwargs=dict(dataset=dataset, )) if op.exists(op.join(static_root, 'index.html')): from flask import send_from_directory @app.route('/') def serve_index(): return send_from_directory(static_root, 'index.html') if mode == 'dry-run': yield dict( action='webapp', status='ok', app=app, path=dataset.path, ) return print("""
************************************************* ************************************************* THIS IS NOT A PRODUCTION-READY TOOL - only use in a trusted environment - do not expose service on public network interfaces ************************************************* ************************************************* """) # TODO expose flags, or use FLASK config vars app.run(debug=mode == 'debug')
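# Usage sketch (assumes the extension providing the `webapp` command is
# installed): mode='dry-run' performs all configuration and yields the fully
# wired Flask app without starting a server, which is handy for inspection.
def _demo_webapp_dry_run():  # pragma: no cover -- illustrative only
    import datalad.api as dl
    for res in dl.webapp(dataset='.', mode='dry-run', read_only=True):
        flask_app = res.get('app')
        if flask_app is not None:
            # list the registered routes, e.g. the /api/v1/* resources
            print(sorted(r.rule for r in flask_app.url_map.iter_rules()))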
class Get(Interface): """Get any dataset content (files/directories/subdatasets). This command only operates on dataset content. To obtain a new independent dataset from some source, use the `clone` command. By default this command operates recursively within a dataset, but not across potential subdatasets, i.e. if a directory is provided, all files in the directory are obtained. Recursion into subdatasets is supported too. If enabled, relevant subdatasets are detected and installed in order to fulfill a request. Known data locations for each requested file are evaluated and data are obtained from some available location (according to git-annex configuration and possibly assigned remote priorities), unless a specific source is specified. *Getting subdatasets* Just as DataLad supports getting file content from more than one location, the same is supported for subdatasets, including a ranking of individual sources for prioritization. The following location candidates are considered. For each candidate a cost is given in parentheses; higher values indicate higher cost, and thus lower priority: - URL of any configured superdataset remote that is known to have the desired submodule commit, with the submodule path appended to it. There can be more than one candidate (cost 500). - In case `.gitmodules` contains a relative path instead of a URL, the URL of any configured superdataset remote that is known to have the desired submodule commit, with this relative path appended to it. There can be more than one candidate (cost 500). - A URL or absolute path recorded in `.gitmodules` (cost 600). - In case `.gitmodules` contains a relative path as a URL, the absolute path of the superdataset, appended with this relative path (cost 900). Additional candidate URLs can be generated based on templates specified as configuration variables with the pattern `datalad.get.subdataset-source-candidate-<name>` where `name` is an arbitrary identifier. If `name` starts with three digits (e.g. '400myserver') these will be interpreted as a cost, and the respective candidate will be sorted into the generated candidate list according to this cost. If no cost is given, a default of 700 is used. A template string assigned to such a variable can utilize the Python format mini language and may reference a number of properties that are inferred from the parent dataset's knowledge about the target subdataset. Properties include any submodule property specified in the respective `.gitmodules` record. For convenience, an existing `datalad-id` record is made available under the shortened name `id`. Additionally, the URL of any configured remote that contains the respective submodule commit is available as `remote-<name>` properties, where `name` is the configured remote name. Lastly, all candidates are sorted according to their cost (lower values first), and duplicate URLs are stripped, while preserving the first item in the candidate list. .. note:: Power-user info: This command uses :command:`git annex get` to fulfill file handles. """ _examples_ = [ dict(text="Get a single file", code_py="get('path/to/file')", code_cmd="datalad get <path/to/file>"), dict(text="Get contents of a directory", code_py="get('path/to/dir/')", code_cmd="datalad get <path/to/dir/>"), dict( text="Get all contents of the current dataset and its subdatasets", code_py="get(dataset='.', recursive=True)", code_cmd="datalad get .
-r"), dict( text="Get (clone) a registered subdataset, but don't retrieve data", code_py="get('path/to/subds', get_data=False)", code_cmd="datalad get -n <path/to/subds>"), ] _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), metavar="PATH", doc="""specify the dataset to perform the get operation on, in which case `path` arguments are interpreted as being relative to this dataset. If no dataset is given, an attempt is made to identify a dataset for each input `path`""", constraints=EnsureDataset() | EnsureNone()), path=Parameter( args=("path", ), metavar="PATH", doc="""path/name of the requested dataset component. The component must already be known to a dataset. To add new components to a dataset use the `add` command""", nargs="*", constraints=EnsureStr() | EnsureNone()), source=Parameter( args=( "-s", "--source", ), metavar="LABEL", doc="""label of the data source to be used to fulfill requests. This can be the name of a dataset :term:`sibling` or another known source""", constraints=EnsureStr() | EnsureNone()), recursive=recursion_flag, recursion_limit=Parameter( args=( "-R", "--recursion-limit", ), metavar="LEVELS", constraints=EnsureInt() | EnsureChoice('existing') | EnsureNone(), doc="""limit recursion into subdatasets to the given number of levels. Alternatively, 'existing' will limit recursion to subdatasets that already existed on the filesystem at the start of processing, and prevent new subdatasets from being obtained recursively."""), get_data=Parameter( args=( "-n", "--no-data", ), dest='get_data', action='store_false', doc= """whether to obtain data for all file handles. If disabled, `get` operations are limited to dataset handles.[CMD: This option prevents data for file handles from being obtained CMD]"""), description=location_description, reckless=reckless_opt, jobs=jobs_opt) @staticmethod @datasetmethod(name='get') @eval_results def __call__( path=None, source=None, dataset=None, recursive=False, recursion_limit=None, get_data=True, description=None, reckless=None, jobs='auto', ): refds_path = Interface.get_refds_path(dataset) if not (dataset or path): raise InsufficientArgumentsError( "Neither dataset nor target path(s) provided") if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path # we have to have a single dataset to operate on refds = require_dataset(dataset, check_installed=True, purpose='get content') content_by_ds = {} # use subdatasets() to discover any relevant content that is not # already present in the root dataset (refds) for sdsres in Subdatasets.__call__( contains=path, # maintain path argument semantics and pass in dataset arg # as is dataset=dataset, # always come from the top to get sensible generator behavior bottomup=False, # when paths are given, they will constrain the recursion # automatically, and we need to enable recursion so we can # locate paths in subdatasets several levels down recursive=True if path else recursive, recursion_limit=None if path else recursion_limit, return_type='generator', on_failure='ignore'): if sdsres.get('type', None) != 'dataset': # if it is not about a 'dataset' it is likely content in # the root dataset if sdsres.get('status', None) == 'impossible' and \ sdsres.get('message', None) == \ 'path not contained in any matching subdataset': target_path = Path(sdsres['path']) if refds.pathobj != target_path and \ refds.pathobj not in target_path.parents: yield dict( action='get', path=str(target_path), status='error', message=('path not associated with dataset %s',
refds), ) continue # check if we need to obtain anything underneath this path # the subdataset() call above will only look _until_ it # hits the targetpath for res in _install_targetpath( refds, Path(sdsres['path']), recursive, recursion_limit, reckless, refds_path, description, jobs=jobs, ): # fish out the datasets that 'contains' a targetpath # and store them for later if res.get('status', None) in ('ok', 'notneeded') and \ 'contains' in res: dsrec = content_by_ds.get(res['path'], set()) dsrec.update(res['contains']) content_by_ds[res['path']] = dsrec if res.get('status', None) != 'notneeded': # all those messages on not having installed anything # are a bit pointless # "notneeded" for annex get comes below yield res else: # dunno what this is, send upstairs yield sdsres # must continue for both conditional branches above # the rest is about stuff in real subdatasets continue # instance of the closest existing dataset for this result ds = Dataset(sdsres['parentds'] if sdsres.get('state', None) == 'absent' else sdsres['path']) assert 'contains' in sdsres # explore the unknown for target_path in sdsres.get('contains', []): # essentially the same as done above for paths in the root # dataset, but here we are starting from the closest # discovered subdataset for res in _install_targetpath( ds, Path(target_path), recursive, recursion_limit, reckless, refds_path, description, jobs=jobs, ): known_ds = res['path'] in content_by_ds if res.get('status', None) in ('ok', 'notneeded') and \ 'contains' in res: dsrec = content_by_ds.get(res['path'], set()) dsrec.update(res['contains']) content_by_ds[res['path']] = dsrec # prevent double-reporting of datasets that have been # installed by explorative installation to get to target # paths, prior in this loop if res.get('status', None) != 'notneeded' or not known_ds: yield res if not get_data: # done already return # and now annex-get, this could all be done in parallel now for ds, content in content_by_ds.items(): for res in _get_targetpaths(Dataset(ds), content, refds.path, source, jobs): if res['path'] not in content_by_ds: # we had reports on datasets and subdatasets already # before the annex stage yield res
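# Configuration sketch (hypothetical server name and template): register an
# additional clone-candidate template for subdatasets with an explicit cost
# of 400, so it is tried before the standard candidates (costs 500/600/900).
# '{id}' expands to the subdataset's datalad-id from its .gitmodules record.
def _demo_get_candidate_config():  # pragma: no cover -- illustrative only
    import datalad.api as dl
    ds = dl.Dataset('.')
    ds.config.set(
        'datalad.get.subdataset-source-candidate-400myserver',
        'https://myserver.example.com/store/{id}',
        where='local')
    # a subsequent get will consider the extra candidate when installing
    # the subdataset; get_data=False skips file content retrieval
    ds.get('path/to/subds', get_data=False)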
class Diff(Interface): """Report changes of dataset components. Reports can be generated for changes between recorded revisions, or between a revision and the state of a dataset's work tree. Unlike 'git diff', this command also reports untracked content when comparing a revision to the state of the work tree. Such content is marked with the property `state='untracked'` in the command results. The following types of changes are distinguished and reported via the `state` result property: - added - copied - deleted - modified - renamed - typechange - unmerged - untracked Whenever applicable, source and/or destination revisions are reported to indicate when exactly within the requested revision range a particular component changed its status. Optionally, the reported changes can be limited to a subset of paths within a dataset. """ # make the custom renderer the default one, as the global default renderer # does not yield meaningful output for this command result_renderer = 'tailored' _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to query. If no dataset is given, an attempt is made to identify the dataset based on the input and/or the current working directory""", constraints=EnsureDataset() | EnsureNone()), path=Parameter(args=("path", ), metavar="PATH", doc="""path to be evaluated""", nargs="*", constraints=EnsureStr() | EnsureNone()), revision=Parameter( args=('--revision', ), metavar='REVISION EXPRESSION', nargs='?', doc="""comparison reference specification. Three modes are supported: 1) <revision> changes you have in your working tree relative to the named revision (this can also be a branch name, tag, commit or any label Git can understand). 2) <revision>..<revision> changes between two arbitrary revisions. 3) <revision>...<revision> changes on the branch containing and up to the second <revision>, starting at a common ancestor of both revisions."""), staged=Parameter( args=("--staged", ), action="store_true", doc="""get the changes already staged for a commit relative to an optionally given revision (by default the most recent one)""" ), ignore_subdatasets=Parameter( args=('--ignore-subdatasets', ), constraints=EnsureChoice('none', 'untracked', 'dirty', 'all'), doc="""speed up execution by (partially) not evaluating the state of subdatasets in a parent dataset. With "none" a subdataset is considered modified when it either contains untracked or modified content or its last saved state differs from that recorded in the parent dataset. When "untracked" is used subdatasets are not considered modified when they only contain untracked content (but they are still scanned for modified content). Using "dirty" ignores all changes to the work tree of subdatasets, only changes to the revisions stored in the parent dataset are shown. Using "all" hides all changes to subdatasets. Note, even with "all" recursive execution will still report other changes in any existing subdataset, only the subdataset record in a parent dataset is not evaluated."""), report_untracked=Parameter( args=('--report-untracked', ), constraints=EnsureChoice('no', 'normal', 'all'), doc="""If and how untracked content is reported when comparing a revision to the state of the work tree. 
'no': no untracked files are reported; 'normal': untracked files and entire untracked directories are reported as such; 'all': report individual files even in fully untracked directories."""), recursive=recursion_flag, recursion_limit=recursion_limit) @staticmethod @datasetmethod(name='diff') @eval_results def __call__(path=None, dataset=None, revision=None, staged=False, ignore_subdatasets='none', report_untracked='normal', recursive=False, recursion_limit=None): if not dataset and not path: # act on the whole dataset if nothing else was specified dataset = curdir refds_path = Interface.get_refds_path(dataset) to_process = [] # track what commit ranges we want to diff per dataset ds_diffies = {} for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='diff', # unavailable is OK, because we might query for a deleted file unavailable_path_status='', nondataset_path_status='impossible', # must not use `modified`, infinite loop otherwise modified=None, return_type='generator', on_failure='ignore'): if ap.get('status', None): # we know what to report already yield ap continue if ap.get('type', None) == 'dataset': ap['process_content'] = True if ap.get('raw_input', False) or ap['path'] == refds_path: # prepopulate the revision specs for all input paths ds_diffies[ap['path'] if ap.get('type', None) == 'dataset' else ap['parentds']] = revision to_process.append(ap) # sort into datasets content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert (not completed) for ds_path in sorted(content_by_ds.keys()): if ds_path not in ds_diffies: # we don't know how to diff # this was neither an input path, nor did we see it # when diffing its parent continue content_paths = content_by_ds[ds_path] revision = ds_diffies[ds_path] for r in _parse_git_diff(ds_path, diff_thingie=ds_diffies[ds_path], paths=content_paths, ignore_submodules=ignore_subdatasets, staged=staged): r.update(dict(action='diff', logger=lgr)) if refds_path: r['refds'] = refds_path if 'status' not in r: r['status'] = 'ok' if r.get('type', None) == 'dataset': # this is a subdataset report # we need to use the reported commit range to properly adjust the # query once we hit that subdataset from_rev = r.get('revision_src', '') to_rev = r.get('revision', '') subrev = '{}..{}'.format( from_rev if from_rev else PRE_INIT_COMMIT_SHA, to_rev if to_rev else '', ) if from_rev and from_rev == to_rev: # this is a special case, where a subdataset reported changes without # a change in state/commit -- this is code for uncommitted changes # in the subdataset (including staged ones). In such a case, we # must not provide a diff range, but only the source commit we want # to diff against # XXX if this is changed, likely the same logic in annotate_paths needs # changing too! subrev = from_rev ds_diffies[r['path']] = subrev yield r if (revision and '..'
in revision) or report_untracked == 'no': # don't look for untracked content, we got a revision range continue for r in _get_untracked_content(ds_path, report_untracked, paths=content_paths): r.update(dict(action='diff', logger=lgr)) if refds_path: r['refds'] = refds_path if 'status' not in r: r['status'] = 'ok' yield r @staticmethod def custom_result_renderer(res, **kwargs): from datalad.ui import ui if not res['status'] == 'ok': # logging reported already return path = relpath(res['path'], start=res['refds']) \ if res.get('refds', None) else res['path'] type_ = res.get('type', res.get('type_src', '')) max_len = len('untracked(directory)') state_msg = '{}{}'.format(res['state'], '({})'.format(type_ if type_ else '')) ui.message('{fill}{state_msg}: {path}'.format( fill=' ' * max(0, max_len - len(state_msg)), state_msg=state_msg, path=path))
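# Usage sketch (hypothetical revision labels): compare the work tree against
# a past state, and two recorded states against each other. Results are
# plain dicts carrying 'state' (added/modified/untracked/...) and 'path'.
def _demo_diff_usage():  # pragma: no cover -- illustrative only
    import datalad.api as dl
    ds = dl.Dataset('.')
    # work tree vs. a named revision, including untracked content
    for res in ds.diff(revision='HEAD~2', report_untracked='normal'):
        print(res['state'], res['path'])
    # between two arbitrary revisions; for revision ranges, untracked
    # reporting is skipped by the command itself
    for res in ds.diff(revision='v1.0..HEAD', ignore_subdatasets='untracked'):
        print(res['state'], res['path'])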
class ExportToFigshare(Interface): """Export the content of a dataset as a ZIP archive to figshare Very quick and dirty approach. Ideally figshare should be supported as a proper git annex special remote. Unfortunately, figshare does not support having directories, and can store only a flat list of files. That makes any sensible publishing of complete datasets impossible. The only workaround is to publish the dataset as a zip-ball, where the entire content is wrapped into a .zip archive for which figshare would provide a navigator. """ from datalad.support.param import Parameter from datalad.distribution.dataset import datasetmethod from datalad.interface.utils import eval_results from datalad.distribution.dataset import EnsureDataset from datalad.support.constraints import EnsureNone, EnsureInt, EnsureStr _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to export. If no dataset is given, an attempt is made to identify the dataset based on the current working directory.""", constraints=EnsureDataset() | EnsureNone()), filename=Parameter( args=("filename", ), metavar="PATH", nargs='?', doc="""File name of the generated ZIP archive. If no file name is given the archive will be generated in the current directory and will be named: datalad_<dataset_uuid>.zip.""", constraints=EnsureStr() | EnsureNone()), no_annex=Parameter( args=("--no-annex", ), action="store_true", doc="""By default the generated .zip file would be added to annex, and all files would get registered in git-annex to be available from such an archive. Also, upon upload, we will register that archive in annex as a possible source for its content. Setting this flag disables this behavior."""), missing_content=Parameter( args=("--missing-content", ), metavar="error|continue|ignore", doc="""By default, any discovered file with missing content will result in an error and the plugin is aborted. Setting this to 'continue' will issue warnings instead of failing on error. The value 'ignore' will only inform about the problem at the 'debug' log level.
The latter two can be helpful when generating a ZIP archive from a dataset where some file content is not available locally.""", constraints=EnsureStr()), # article_id=Parameter( # args=("--project-id",), # metavar="ID", # doc="""If given, article (if article_id is not provided) will be # created in that project.""", # constraints=EnsureInt() | EnsureNone()), article_id=Parameter(args=("--article-id", ), metavar="ID", doc="""Which article to publish to.""", constraints=EnsureInt() | EnsureNone()), ) @staticmethod @datasetmethod(name='export_to_figshare') @eval_results def __call__( dataset, filename=None, missing_content='error', no_annex=False, # TODO: support working with projects and articles within them # project_id=None, article_id=None): import os import logging lgr = logging.getLogger('datalad.plugin.export_to_figshare') from datalad.ui import ui from datalad.api import add_archive_content from datalad.api import export_archive from datalad.distribution.dataset import require_dataset from datalad.support.annexrepo import AnnexRepo dataset = require_dataset(dataset, check_installed=True, purpose='export to figshare') if not isinstance(dataset.repo, AnnexRepo): raise ValueError( "%s is not an annex repo, so annexification cannot be done" % dataset) if dataset.repo.is_dirty(): raise RuntimeError( "Paranoid authors of DataLad refuse to proceed in a dirty repository" ) lgr.info( "Exporting current tree as an archive since figshare does not support directories" ) archive_out = next( export_archive(dataset, filename=filename, archivetype='zip', missing_content=missing_content, return_type="generator")) assert archive_out['status'] == 'ok' fname = archive_out['path'] lgr.info("Uploading %s to figshare", fname) figshare = FigshareRESTLaison() if not article_id: # TODO: ask if it should be an article within a project if ui.is_interactive: # or should we just upload to a new article? if ui.yesno( "Would you like to create a new article to upload to? " "If not - we will list existing articles", title="Article"): article = figshare.create_article( title=os.path.basename(dataset.path)) lgr.info( "Created a new (private) article %(id)s at %(url_private_html)s. "
"Please visit it, enter additional metadata and make it public", article) article_id = article['id'] else: article_id = int( ui.question( "Which of the articles should we upload to?", choices=map(str, figshare.get_article_ids()))) if not article_id: raise ValueError("We need an article to upload to.") file_info = figshare.upload_file( fname, files_url='account/articles/%s/files' % article_id) if no_annex: lgr.info("Removing generated archive") os.unlink(fname) else: # I will leave all the complaining etc to the dataset add if path # is outside etc lgr.info("'Registering' %s within annex", fname) repo = dataset.repo repo.add(fname, git=False) key = repo.get_file_key(fname) lgr.info("Adding URL %(download_url)s for it", file_info) repo._annex_custom_command([], [ "git", "annex", "registerurl", '-c', 'annex.alwayscommit=false', key, file_info['download_url'] ]) lgr.info("Registering links back for the content of the archive") add_archive_content( fname, annex=dataset.repo, delete_after=True, # just remove extracted into a temp dir allow_dirty=True, # since we have a tarball commit= False # we do not want to commit anything we have done here ) lgr.info("Removing generated and now registered in annex archive") repo.drop(key, key=True, options=['--force']) repo.remove(fname, force=True) # remove the tarball # if annex in {'delete'}: # dataset.repo.remove(fname) # else: # # kinda makes little sense I guess. # # Made more sense if export_archive could export an arbitrary treeish # # so we could create a branch where to dump and export to figshare # # (kinda closer to my idea) # dataset.save(fname, message="Added the entire dataset into a zip file") # TODO: add to downloader knowledge about figshare token so it could download-url # those zipballs before they go public yield dict( status='ok', # TODO: add article url (which needs to be queried if only ID is known) message="Published archive {}".format(file_info['download_url']), file_info=file_info, path=dataset, action='export_to_figshare', logger=lgr)
class CreateSiblingRia(Interface): """Creates a sibling to a dataset in a RIA store Communication with a dataset in a RIA store is implemented via two siblings. A regular Git remote (repository sibling) and a git-annex special remote for data transfer (storage sibling) -- with the former having a publication dependency on the latter. By default, the name of the storage sibling is derived from the repository sibling's name by appending "-storage". The store's base path is expected to not exist, be an empty directory, or a valid RIA store. RIA store layout ~~~~~~~~~~~~~~~~ A RIA store is a directory tree with a dedicated subdirectory for each dataset in the store. The subdirectory name is constructed from the DataLad dataset ID, e.g. '124/68afe-59ec-11ea-93d7-f0d5bf7b5561', where the first three characters of the ID are used for an intermediate subdirectory in order to mitigate file system limitations for stores containing a large number of datasets. Each dataset subdirectory contains a standard bare Git repository for the dataset. In addition, a subdirectory 'annex' holds a standard Git-annex object store. However, instead of using the 'dirhashlower' naming scheme for the object directories, like Git-annex would do, a 'dirhashmixed' layout is used -- the same as for non-bare Git repositories or regular DataLad datasets. Optionally, there can be a further subdirectory 'archives' with (compressed) 7z archives of annex objects. The storage remote is able to pull annex objects from these archives, if it cannot find them in the regular annex object store. This feature can be useful for storing large collections of rarely changing data on systems that limit the number of files that can be stored. Each dataset directory also contains a 'ria-layout-version' file that identifies the data organization (as, for example, described above). Lastly, there is a global 'ria-layout-version' file at the store's base path that identifies where dataset subdirectories themselves are located. At present, this file must contain a single line stating the version (currently "1"). This line MUST end with a newline character. It is possible to define an alias for an individual dataset in a store by placing a symlink to the dataset location into an 'alias/' directory in the root of the store. This enables dataset access via URLs of the format: 'ria+<protocol>://<storelocation>#~<aliasname>'. Error logging ~~~~~~~~~~~~~ To enable error logging at the remote end, append a pipe symbol and an "l" to the version number in ria-layout-version (like so '1|l\\n'). Error logging will create files in an "error_log" directory whenever the git-annex special remote (storage sibling) raises an exception, storing the Python traceback of it. The logfiles are named according to the scheme '<dataset id>.<annex uuid of the remote>.log' showing "who" ran into this issue with which dataset. Because logging can potentially leak personal data (like local file paths for example), it can be disabled client-side by setting the configuration variable "annex.ora-remote.<storage-sibling-name>.ignore-remote-config". """ # TODO: description? _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to process. If no dataset is given, an attempt is made to identify the dataset based on the current working directory""", constraints=EnsureDataset() | EnsureNone()), url=Parameter( args=("url", ), metavar="ria+<ssh|file>://<host>[/path]", doc="""URL identifying the target RIA store and access protocol.
""", constraints=EnsureStr() | EnsureNone()), name=Parameter(args=( '-s', '--name', ), metavar='NAME', doc="""Name of the sibling. With `recursive`, the same name will be used to label all the subdatasets' siblings.""", constraints=EnsureStr() | EnsureNone(), required=True), storage_name=Parameter( args=("--storage-name", ), metavar="NAME", doc="""Name of the storage sibling (git-annex special remote). Must not be identical to the sibling name. If not specified, defaults to the sibling name plus '-storage' suffix.""", constraints=EnsureStr() | EnsureNone()), post_update_hook=Parameter( args=("--post-update-hook", ), doc="""Enable git's default post-update-hook for the created sibling.""", action="store_true"), shared=Parameter( args=("--shared", ), metavar='{false|true|umask|group|all|world|everybody|0xxx}', doc="""If given, configures the permissions in the RIA store for multi-user access. Possible values for this option are identical to those of `git init --shared` and are described in its documentation.""", constraints=EnsureStr() | EnsureBool() | EnsureNone()), group=Parameter( args=("--group", ), metavar="GROUP", doc="""Filesystem group for the repository. Specifying the group is crucial when [CMD: --shared=group CMD][PY: shared="group" PY]""", constraints=EnsureStr() | EnsureNone()), storage_sibling=Parameter( args=("--no-storage-sibling", ), dest='storage_sibling', doc="""Flag to disable establishing a storage sibling.""", action="store_false"), existing=Parameter( args=("--existing", ), constraints=EnsureChoice('skip', 'error', 'reconfigure') | EnsureNone(), metavar='MODE', doc="""Action to perform, if a (storage) sibling is already configured under the given name and/or a target already exists. In this case, a dataset can be skipped ('skip'), an existing target repository be forcefully re-initialized, and the sibling (re-)configured ('reconfigure'), or the command be instructed to fail ('error').""", ), recursive=recursion_flag, recursion_limit=recursion_limit, trust_level=Parameter( args=("--trust-level", ), metavar="TRUST-LEVEL", constraints=EnsureChoice('trust', 'semitrust', 'untrust') | EnsureNone(), doc="""specify a trust level for the storage sibling.
If not specified, the default git-annex trust level is used.""", ), ) @staticmethod @datasetmethod(name='create_sibling_ria') @eval_results def __call__(url, name, dataset=None, storage_name=None, post_update_hook=False, shared=None, group=None, storage_sibling=True, existing='error', trust_level=None, recursive=False, recursion_limit=None): ds = require_dataset(dataset, check_installed=True, purpose='create sibling RIA') res_kwargs = dict( ds=ds, action="create-sibling-ria", logger=lgr, ) if ds.repo.get_hexsha() is None or ds.id is None: raise RuntimeError("Repository at {} is not a DataLad dataset, " "run 'datalad create [--force]' first.".format( ds.path)) if not storage_sibling and storage_name: lgr.warning( "Storage sibling setup disabled, but a storage sibling name " "was provided") if storage_sibling and not storage_name: storage_name = "{}-storage".format(name) if storage_sibling and name == storage_name: # leads to unresolvable, circular dependency with publish-depends raise ValueError("sibling names must not be equal") if not isinstance(url, str): raise TypeError("url is not a string, but %s" % type(url)) # Query existing siblings upfront in order to fail early on # existing=='error', since a misconfiguration (particularly of special # remotes) that only fails in a subdataset later on can be quite # painful. # TODO: messages - this is "create-sibling". Don't confuse existence of # local remotes with existence of the actual remote sibling # in wording if existing == 'error': # in recursive mode this check could take a substantial amount of # time: employ a progress bar (or rather a counter, because we don't # know the total in advance) pbar_id = 'check-siblings-{}'.format(id(ds)) log_progress( lgr.info, pbar_id, 'Start checking pre-existing sibling configuration %s', ds, label='Query siblings', unit=' Siblings', ) # even if we have to fail, let's report all conflicting siblings # in subdatasets failed = False for r in ds.siblings(result_renderer=None, recursive=recursive, recursion_limit=recursion_limit): log_progress(lgr.info, pbar_id, 'Discovered sibling %s in dataset at %s', r['name'], r['path'], update=1, increment=True) if not r['type'] == 'sibling' or r['status'] != 'ok': # this is an internal status query that has no consequence # for the outside world. Be silent unless something useful # can be said #yield r continue if r['name'] == name: res = get_status_dict( status='error', message="a sibling '{}' is already configured in " "dataset {}".format(name, r['path']), **res_kwargs, ) failed = True yield res continue if storage_name and r['name'] == storage_name: res = get_status_dict( status='error', message="a sibling '{}' is already configured in " "dataset {}".format(storage_name, r['path']), **res_kwargs, ) failed = True yield res continue log_progress( lgr.info, pbar_id, 'Finished checking pre-existing sibling configuration %s', ds, ) if failed: return # TODO: - URL parsing + store creation needs to be RF'ed based on # command abstractions # - more generally consider store creation a dedicated command or # option # Note: URL parsing is done twice ATM (for top-level ds). This can't be # reduced to single instance, since rewriting url based on config could # be different for subdatasets.
# parse target URL try: ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config) except ValueError as e: yield get_status_dict(status='error', message=str(e), **res_kwargs) return create_store( SSHRemoteIO(ssh_host) if ssh_host else LocalIO(), Path(base_path), '1') yield from _create_sibling_ria(ds, url, name, storage_sibling, storage_name, existing, shared, group, post_update_hook, trust_level, res_kwargs) if recursive: # Note: subdatasets can be treated independently, so go full # recursion when querying for them and _no_recursion with the # actual call. Theoretically this can be parallelized. for subds in ds.subdatasets(fulfilled=True, recursive=True, recursion_limit=recursion_limit, result_xfm='datasets'): yield from _create_sibling_ria(subds, url, name, storage_sibling, storage_name, existing, shared, group, post_update_hook, trust_level, res_kwargs)
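# Usage sketch (hypothetical store URL; sibling names chosen for the example):
# create the sibling pair in a RIA store -- a Git remote 'ria' plus the
# 'ria-storage' special remote -- for a dataset and its subdatasets.
def _demo_create_sibling_ria():  # pragma: no cover -- illustrative only
    import datalad.api as dl
    ds = dl.Dataset('.')
    ds.create_sibling_ria(
        'ria+ssh://storage.example.com/data/store',  # hypothetical RIA store
        name='ria',            # storage sibling defaults to 'ria-storage'
        existing='skip',       # tolerate already-configured siblings
        recursive=True)
    # publishing then goes through the repository sibling, which has a
    # publication dependency on the storage sibling
    ds.publish(to='ria', transfer_data='all')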
class Metadata(Interface): """Metadata reporting for files and entire datasets Two types of metadata are supported: 1. metadata describing a dataset as a whole (dataset-global metadata), and 2. metadata for files in a dataset (content metadata). Both types can be accessed with this command. Examples: Report the metadata of a single file, as aggregated into the closest locally available dataset, containing the query path:: % datalad metadata somedir/subdir/thisfile.dat Sometimes it is helpful to get metadata records formatted in a more accessible form, here as pretty-printed JSON:: % datalad -f json_pp metadata somedir/subdir/thisfile.dat Same query as above, but specify which dataset to query (must be containing the query path):: % datalad metadata -d . somedir/subdir/thisfile.dat Report any metadata record of any dataset known to the queried dataset:: % datalad metadata --recursive --reporton datasets Get a JSON-formatted report of aggregated metadata in a dataset, incl. information on enabled metadata extractors, dataset versions, dataset IDs, and dataset paths:: % datalad -f json metadata --get-aggregates """ # make the custom renderer the default, path reporting isn't the top # priority here result_renderer = 'tailored' _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""dataset to query. If given, metadata will be reported as stored in this dataset. Otherwise, the closest available dataset containing a query path will be consulted.""", constraints=EnsureDataset() | EnsureNone()), path=Parameter(args=("path", ), metavar="PATH", doc="path(s) to query metadata for", nargs="*", constraints=EnsureStr() | EnsureNone()), get_aggregates=Parameter( args=('--get-aggregates', ), action='store_true', doc="""if set, yields all (sub)datasets for which aggregate metadata are available in the dataset. No other action is performed, even if other arguments are given. 
The reported results contain a dataset's ID, the commit hash at which metadata aggregation was performed, and the location of the object file(s) containing the aggregated metadata."""), reporton=reporton_opt, recursive=recursion_flag) # MIH: not sure if a recursion limit makes sense here # ("outdated from 5 levels down?") #recursion_limit=recursion_limit) @staticmethod @datasetmethod(name='metadata') @eval_results def __call__(path=None, dataset=None, get_aggregates=False, reporton='all', recursive=False): # prep results refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(action='metadata', logger=lgr) if refds_path: res_kwargs['refds'] = refds_path if get_aggregates: # yield all datasets for which we have aggregated metadata as results # these are actual dataset results, so we can turn them into dataset # instances using generic top-level code if desired ds = require_dataset(refds_path, check_installed=True, purpose='aggregate metadata query') agginfos = load_ds_aggregate_db( ds, version=str(aggregate_layout_version), abspath=True) if not agginfos: # if there has ever been an aggregation run, this file would # exist, hence there has not been one, and we need to tell this # to people yield get_status_dict( ds=ds, status='impossible', action='metadata', logger=lgr, message= 'metadata aggregation has never been performed in this dataset' ) return parentds = [] for dspath in sorted(agginfos): info = agginfos[dspath] if parentds and not path_is_subpath(dspath, parentds[-1]): parentds.pop() info.update( path=dspath, type='dataset', status='ok', ) if dspath == ds.path: info['layout_version'] = aggregate_layout_version if parentds: info['parentds'] = parentds[-1] yield dict(info, **res_kwargs) parentds.append(dspath) return if not dataset and not path: # makes no sense to have no dataset, go with "here" # error generation happens during annotation path = op.curdir content_by_ds = OrderedDict() for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, # MIH: we are querying the aggregated metadata anyways, and that # mechanism has its own, faster way to go down the hierarchy #recursive=recursive, #recursion_limit=recursion_limit, action='metadata', # uninstalled subdatasets could be queried via aggregated metadata # -> no 'error' unavailable_path_status='', nondataset_path_status='error', # we need to know when to look into aggregated data force_subds_discovery=True, force_parentds_discovery=True, return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo( ap['path']): ap['process_content'] = True to_query = None if ap.get('state', None) == 'absent' or \ ap.get('type', 'dataset') != 'dataset': # this is a lonely absent dataset/file or content in a present dataset # -> query through parent # there must be a parent, otherwise this would be a non-dataset path # and would have errored during annotation to_query = ap['parentds'] else: to_query = ap['path'] if to_query: pcontent = content_by_ds.get(to_query, []) pcontent.append(ap) content_by_ds[to_query] = pcontent for ds_path in content_by_ds: ds = Dataset(ds_path) query_agg = [ ap for ap in content_by_ds[ds_path] # this is an available subdataset, will be processed in another # iteration if ap.get('state', None) == 'absent' or not (ap.get( 'type', None) == 'dataset' and ap['path'] != ds_path) ] if not query_agg: continue # report from aggregated metadata for r in query_aggregated_metadata( reporton, # by default query the
reference dataset, only if there is none # try our luck in the dataset that contains the queried path # this is consistent with e.g. `get_aggregates` reporting the # situation in the reference dataset only Dataset(refds_path) if refds_path else ds, query_agg, # recursion above could only recurse into datasets # on the filesystem, but there might be any number of # uninstalled datasets underneath the last installed one # for which we might have metadata recursive=recursive, **res_kwargs): yield r return @staticmethod def custom_result_renderer(res, **kwargs): if res['status'] != 'ok' or not res.get('action', None) == 'metadata': # logging complained about this already return # list the path, available metadata keys, and tags path = op.relpath(res['path'], res['refds']) if res.get( 'refds', None) else res['path'] meta = res.get('metadata', {}) ui.message('{path}{type}:{spacer}{meta}{tags}'.format( path=ac.color_word(path, ac.BOLD), type=' ({})'.format(ac.color_word(res['type'], ac.MAGENTA)) if 'type' in res else '', spacer=' ' if len([m for m in meta if m != 'tag']) else '', meta=','.join(k for k in sorted(meta.keys()) if k not in ('tag', '@context', '@id')) if meta else ' -' if 'metadata' in res else ' aggregated', tags='' if 'tag' not in meta else ' [{}]'.format(','.join( ensure_list(meta['tag'])))))
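# A minimal usage sketch for `metadata` (illustrative only; assumes an
# installed dataset with previously aggregated metadata at the hypothetical
# path '/tmp/ds'):
#
#     from datalad.api import Dataset
#     ds = Dataset('/tmp/ds')
#     # report aggregated metadata for one file
#     for res in ds.metadata('somedir/subdir/thisfile.dat',
#                            return_type='generator'):
#         print(res['path'], sorted(res.get('metadata', {})))
#     # list all (sub)datasets for which aggregates are available
#     for res in ds.metadata(get_aggregates=True, return_type='generator'):
#         print(res['path'])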
class Extract(Interface):
    """Run a metadata extractor on a dataset or file.

    This command distinguishes between dataset-level extraction and
    file-level extraction.

    If no "path" argument is given, the command assumes that a given
    extractor is a dataset-level extractor and executes it on the dataset
    that is given by the current working directory or by the "-d" argument.

    If a path is given, the command assumes that the path identifies a
    file and that the given extractor is a file-level extractor, which
    will then be executed on the specified file. If the file-level
    extractor requests the content of a file that is not present, the
    command might "get" the file content to make it locally available.
    Path must not refer to a sub-dataset. Path must not be a directory.

    .. note::

        If you want to insert sub-dataset-metadata into the super-dataset's
        metadata, you currently have to do the following: first, extract
        dataset metadata of the sub-dataset using a dataset-level extractor;
        second, add the extracted metadata with sub-dataset information
        (i.e. dataset_path, root_dataset_id, root_dataset_version) to the
        metadata of the super-dataset.

    The extractor configuration can be parameterized with key-value pairs
    given as additional arguments. Each key-value pair consists of two
    arguments, first the key, followed by the value. If no path is given,
    and you want to provide key-value pairs, you have to give the path
    "++" to prevent the first key from being interpreted as a path.

    The results are written into the repository of the source dataset
    or into the repository of the dataset given by the "-i" parameter.
    If the same extractor is executed on the same element (dataset or
    file) with the same configuration, any existing results will be
    overwritten.

    The command can also take legacy datalad-metalad extractors and
    will execute them in either "content" or "dataset" mode, depending
    on the presence of the "path" parameter.
    """

    result_renderer = "tailored"

    _examples_ = [
        dict(text='Use the metalad_core_file-extractor to extract metadata '
                  'from the file "subdir/data_file_1.txt". The dataset is '
                  'given by the current working directory',
             code_cmd="datalad meta-extract metalad_core_file "
                      "subdir/data_file_1.txt"),
        dict(text='Use the metalad_core_file-extractor to extract metadata '
                  'from the file "subdir/data_file_1.txt" in the dataset '
                  '/home/datasets/ds0001',
             code_cmd="datalad meta-extract -d /home/datasets/ds0001 "
                      "metalad_core_file subdir/data_file_1.txt"),
        dict(text='Use the metalad_core_dataset-extractor to extract '
                  'dataset-level metadata from the dataset given by the '
                  'current working directory',
             code_cmd="datalad meta-extract metalad_core_dataset"),
        dict(text='Use the metalad_core_dataset-extractor to extract '
                  'dataset-level metadata from the dataset in '
                  '/home/datasets/ds0001',
             code_cmd="datalad meta-extract -d /home/datasets/ds0001 "
                      "metalad_core_dataset")
    ]

    _params_ = dict(
        extractorname=Parameter(
            args=("extractorname", ),
            metavar="EXTRACTOR_NAME",
            doc="Name of a metadata extractor to be executed."),
        path=Parameter(
            args=("path", ),
            metavar="FILE",
            nargs="?",
            doc="""Path of a file or dataset to extract metadata from. If
            this argument is provided, we assume a file-level extractor is
            requested; if the path is not given, or if it identifies the
            root of a dataset, i.e. "", we assume a dataset-level metadata
            extractor is specified.""",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""Dataset to extract metadata from. If no dataset is given,
            the dataset is determined by the current working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        extractorargs=Parameter(
            args=("extractorargs", ),
            metavar="EXTRACTOR_ARGUMENTS",
            doc="""Extractor arguments""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()))

    @staticmethod
    @datasetmethod(name="meta_extract")
    @eval_results
    def __call__(extractorname: str,
                 path: Optional[str] = None,
                 dataset: Optional[Union[Dataset, str]] = None,
                 extractorargs: Optional[List[str]] = None):

        # Get basic arguments
        extractor_name = extractorname
        extractor_args = extractorargs
        path = None if path == "++" else path

        source_dataset = require_dataset(
            dataset or curdir,
            purpose="extract metadata",
            check_installed=path is not None)

        if not source_dataset.repo:
            raise ValueError(f"No dataset found in {dataset or curdir}.")

        source_dataset_version = source_dataset.repo.get_hexsha()

        extractor_class = get_extractor_class(extractor_name)
        dataset_tree_path, file_tree_path = get_path_info(
            source_dataset, Path(path) if path else None, None)

        extraction_parameters = ExtractionParameter(
            source_dataset=source_dataset,
            source_dataset_id=UUID(source_dataset.id),
            source_dataset_version=source_dataset_version,
            extractor_class=extractor_class,
            extractor_name=extractor_name,
            extractor_arguments=args_to_dict(extractor_args),
            file_tree_path=file_tree_path,
            agent_name=source_dataset.config.get("user.name"),
            agent_email=source_dataset.config.get("user.email"))

        # If a path is given, we assume file-level metadata extraction is
        # requested, and the extractor class should be a subclass of
        # FileMetadataExtractor (or a legacy extractor).
        # If no path is given, we assume that a dataset-level extraction is
        # requested and the extractor class is a subclass of
        # DatasetMetadataExtractor (or a legacy extractor class).
        # NB: the "++" placeholder was already normalized to None above.
        if path:
            # Check whether the path points to a sub_dataset.
            ensure_path_validity(source_dataset, file_tree_path)
            yield from do_file_extraction(extraction_parameters)
        else:
            yield from do_dataset_extraction(extraction_parameters)
        return

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        if res["status"] != "ok" or res.get("action", "") != "meta_extract":
            # logging complained about this already
            return

        metadata_record = res["metadata_record"]
        path = ({"path": str(metadata_record["path"])}
                if "path" in metadata_record
                else {})
        dataset_path = ({"dataset_path": str(metadata_record["dataset_path"])}
                        if "dataset_path" in metadata_record
                        else {})
        ui.message(
            json.dumps({
                **metadata_record,
                **path,
                **dataset_path,
                "dataset_id": str(metadata_record["dataset_id"])
            }))
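# A usage sketch for `meta_extract` (illustrative only; extractor and file
# names are hypothetical, mirroring the _examples_ above):
#
#     from datalad.api import Dataset
#     ds = Dataset('/home/datasets/ds0001')
#     # file-level extraction
#     for res in ds.meta_extract('metalad_core_file',
#                                'subdir/data_file_1.txt',
#                                return_type='generator'):
#         print(res['metadata_record'])
#     # dataset-level extraction, parameterized with one key-value pair
#     # (only the CLI needs the "++" path placeholder for this)
#     ds.meta_extract('metalad_core_dataset',
#                     extractorargs=['key1', 'value1'])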
class AnnotatePaths(Interface): """Analyze and act upon input paths Given paths (or more generally location requests) are inspected and annotated with a number of properties. A list of recognized properties is provided below. || PYTHON >>Input `paths` for this command can either be un-annotated (raw) path strings, or already (partially) annotated paths. In the latter case, further annotation is limited to yet-unknown properties, and is potentially faster than initial annotation.<< PYTHON || *Recognized path properties* {proplist} In the case of enabled modification detection the results may contain additional properties regarding the nature of the modification. See the documentation of the `diff` command for details. """ _docs_ = dict(proplist='\n\n '.join('"{}"\n{}'.format( k, textwrap.fill(known_props[k], initial_indent=' ', subsequent_indent=' ')) for k in sorted(known_props))) _params_ = dict( path=Parameter(args=("path", ), metavar="PATH", doc="""path to be annotated""", nargs="*", constraints=EnsureStr() | EnsureNone()), dataset=Parameter( args=("-d", "--dataset"), doc="""an optional reference/base dataset for the paths""", constraints=EnsureDataset() | EnsureNone()), recursive=recursion_flag, recursion_limit=recursion_limit, action=Parameter(args=("--action", ), metavar="LABEL", doc="""an "action" property value to include in the path annotation""", constraints=EnsureStr() | EnsureNone()), unavailable_path_status=Parameter( args=("--unavailable-path-status", ), metavar="LABEL", doc="""a "status" property value to include in the annotation for paths that are underneath a dataset, but do not exist on the filesystem""", constraints=EnsureStr() | EnsureNone()), unavailable_path_msg=Parameter( args=("--unavailable-path-msg", ), metavar="message", doc="""a "message" property value to include in the annotation for paths that are underneath a dataset, but do not exist on the filesystem""", constraints=EnsureStr() | EnsureNone()), nondataset_path_status=Parameter( args=("--nondataset-path-status", ), metavar="LABEL", doc="""a "status" property value to include in the annotation for paths that are not underneath any dataset""", constraints=EnsureStr() | EnsureNone()), force_parentds_discovery=Parameter( args=("--no-parentds-discovery", ), dest='force_parentds_discovery', action='store_false', doc="""Flag to disable reports of parent dataset information for any path, in particular dataset root paths. Disabling saves on command run time, if this information is not needed."""), force_subds_discovery=Parameter( args=("--no-subds-discovery", ), action='store_false', dest='force_subds_discovery', doc="""Flag to disable reporting type='dataset' for subdatasets, even when they are not installed, or their mount point directory doesn't exist. Disabling saves on command run time, if this information is not needed."""), force_untracked_discovery=Parameter( args=("--no-untracked-discovery", ), action='store_false', dest='force_untracked_discovery', doc="""Flag to disable discovery of untracked changes. Disabling saves on command run time, if this information is not needed."""), force_no_revision_change_discovery=Parameter( args=("--revision-change-discovery", ), action='store_false', dest='force_no_revision_change_discovery', doc= """Flag to disable discovery of changes which were not yet committed. 
Disabling saves on command run time, if this information is not
            needed."""),
        modified=Parameter(
            args=("--modified", ),
            nargs='?',
            const=True,
            constraints=EnsureStr() | EnsureBool() | EnsureNone(),
            doc="""comparison reference specification for modification
            detection. This can be (mostly) anything that `git diff`
            understands (commit, treeish, tag, etc). See the documentation
            of `datalad diff --revision` for details. Unmodified paths will
            not be annotated. If a requested path was not modified but some
            content underneath it was, then the request is replaced by the
            modified paths and those are annotated instead. This option can
            be used [PY: with `True` as PY][CMD: without CMD] an argument to
            test against changes that have been made, but have not yet been
            staged for a commit."""))

    @staticmethod
    @datasetmethod(name='annotate_paths')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 action=None,
                 unavailable_path_status='',
                 unavailable_path_msg=None,
                 nondataset_path_status='error',
                 force_parentds_discovery=True,
                 force_subds_discovery=True,
                 force_no_revision_change_discovery=True,
                 force_untracked_discovery=True,
                 modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None or
                                     not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset "
                "(none given or found)")

        # prep common result props
        res_kwargs = dict(action=action if action else 'annotate_path',
                          refds=refds_path,
                          logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to
                # feed the dataset path itself
                for r in yield_recursive(refds, refds_path, action,
                                         recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = ensure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset
            # paths, but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = _resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(**dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = \
                        'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by
            # modification detection and append them to requested_paths
            # again after detection.
            # TODO: This might be merged with the treatment of non-dataset
            # paths above. Re-appending those paths seems to be better than
            # yielding directly to avoid code duplication, since both cases
            # are dealt with again later on.
            preserved_paths = []
            if requested_paths:
                for r in requested_paths:
                    if not lexists(r['path'] if isinstance(r, dict) else r):
                        preserved_paths.append(r)

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no
                # request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # "cache" of known subdatasets per parent, used to avoid re-querying
        # subdatasets for each path. The assumption here is that the list of
        # subdatasets for a given parent should not change during the
        # execution of this loop, which (hypothetically) could be incorrect
        # while annotating paths for some commands.
        # TODO: verify this assumption and possibly add an argument to turn
        # caching off if/when needed, or provide some other way to
        # invalidate it
        subdss_cache = {}

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
path_props = path path = path['path'] # we need to mark our territory, who knows where this has been path_props.update(res_kwargs) if path in reported_paths: # we already recorded this path in the output # this can happen, whenever `path` is a subdataset, that was # discovered via recursive processing of another path before continue # the path exists in some shape or form # TODO if we have path_props already we could skip this test if isdir(path): # keep any existing type info, previously a more expensive run # could have discovered an uninstalled 'dataset', and we don't # want it to be relabeled to a directory path_props['type'] = \ path_props.get( 'type', 'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory') # this could contain all types of additional content containing_dir = path if not islink(path) else normpath( opj(path, pardir)) else: if lexists(path): path_props['type'] = 'file' else: path_props['state'] = 'absent' # for everything else we are interested in the container containing_dir = dirname(path) if not containing_dir: containing_dir = curdir dspath = parent = get_dataset_root(containing_dir) if dspath: if path_props.get('type', None) == 'dataset': # for a dataset the root is not the parent, for anything else # it is parent = path_props.get('parentds', None) oneupdir = normpath(opj(containing_dir, pardir)) if parent is None and (force_parentds_discovery or (refds_path and _with_sep(oneupdir).startswith( _with_sep(refds_path)))): # either forced, or only if we have a reference dataset, and # only if we stay within this refds when searching for the # parent parent = get_dataset_root( normpath(opj(containing_dir, pardir))) # NOTE the `and refds_path` is critical, as it will determine # whether a top-level dataset that was discovered gets the # parent property or not, it won't get it without a common # base dataset, and that is how we always rolled if parent and refds_path: path_props['parentds'] = parent # don't check whether this is actually a true subdataset of the # parent, done further down else: # set parent, but prefer existing property path_props['parentds'] = path_props.get('parentds', dspath) # test for `dspath` not `parent`, we only need to know whether there is # ANY dataset, not which one is the true parent, logic below relies on # the fact that we end here, if there is no dataset at all if not dspath: # not in any dataset res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with any dataset' reported_paths[path] = res yield res continue # check that we only got SUBdatasets if refds_path and not path_startswith(dspath, refds_path): res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = \ ('path not part of the reference dataset at %s', refds_path) reported_paths[path] = res yield res continue if path_props.get('type', None) == 'file': # nothing else we can learn about this res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res continue containing_ds = None path_type = path_props.get('type', None) if parent and force_subds_discovery and ( (path_type == 'dataset' and 'registered_subds' not in path_props) or path_type == 'directory' or not lexists(path)): # if the path doesn't exist, or is labeled a directory, or a dataset even # a dataset (without this info) -> record whether this is a known subdataset # to its parent 
containing_ds = Dataset(parent) # Possibly "cache" the list of known subdss for parents we # have encountered so far if parent in subdss_cache: subdss = subdss_cache[parent] else: subdss = containing_ds.subdatasets(fulfilled=None, recursive=False, result_xfm=None, result_filter=None, return_type='list') subdss_cache[parent] = subdss if path in [s['path'] for s in subdss]: if path_type == 'directory' or not lexists(path): # first record that it isn't here, if just a dir or not here at all path_props['state'] = 'absent' # this must be a directory, and it is not installed path_props['type'] = 'dataset' path_props['registered_subds'] = True if not lexists(path) or \ (path_props.get('type', None) == 'dataset' and path_props.get('state', None) == 'absent'): # not there (yet) message = unavailable_path_msg if unavailable_path_msg else None if message and '%s' in message: message = (message, path) path_props['message'] = message res = get_status_dict(**dict(res_kwargs, **path_props)) # assign given status, but only if the props don't indicate a status # already res['status'] = path_props.get('status', unavailable_path_status) reported_paths[path] = res yield res continue # we know everything we can, report res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res rec_paths = [] if recursive: # here we need to consider the special case that `path` is # a dataset itself, if a recursion_limit is given (e.g. # `remove` will do that by default), we need to recurse # from the dataset itself, and not its parent to get things # right -- this will also avoid needless discovery of # unrelated subdatasets if path_props.get('type', None) == 'dataset': containing_ds = Dataset(path) else: # regular parent, we might have a dataset already containing_ds = Dataset( parent) if containing_ds is None else containing_ds for r in yield_recursive(containing_ds, path, action, recursion_limit): # capture reported paths r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] reported_paths[r['path']] = r if modified is not None: # we cannot yield right away, maybe it wasn't modified rec_paths.append(r) else: yield r if modified is not None and rec_paths: # replace the recursively discovered paths by those paths that # were actually modified underneath or at a requested location for r in get_modified_subpaths( rec_paths, refds=Dataset(refds_path), revision=modified, report_no_revision_change= force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit): res = get_status_dict(**dict(r, **res_kwargs)) reported_paths[res['path']] = res yield res return
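# A sketch of how callers typically consume this command (it mirrors the
# invocations in `Metadata` above and `Clean` below; the action label and
# paths are placeholders):
#
#     for ap in AnnotatePaths.__call__(
#             dataset='.',
#             path=['somedir'],
#             action='myaction',
#             unavailable_path_status='impossible',
#             nondataset_path_status='error',
#             return_type='generator',
#             on_failure='ignore'):
#         if ap.get('status', None):
#             # a finished result, pass it on
#             continue
#         # otherwise `ap` is an annotated path with properties such as
#         # 'path', 'type', 'parentds', and 'state'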
class ContainersList(Interface):
    # first docstring line is used as a short description in the cmdline
    # help, the rest is put in the verbose help and manpage
    """List containers known to a dataset
    """
    result_renderer = 'tailored'

    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        contains=Parameter(
            args=('--contains',),
            metavar='PATH',
            action='append',
            doc="""when operating recursively, restrict the reported
            containers to those from subdatasets that contain the given path
            (i.e. the subdatasets that are reported by
            :command:`datalad subdatasets --contains=PATH`). Top-level
            containers are always reported."""),
        recursive=recursion_flag,
    )

    @staticmethod
    @datasetmethod(name='containers_list')
    @eval_results
    def __call__(dataset=None, recursive=False, contains=None):
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='list containers')
        refds = ds.path

        if recursive:
            for sub in ds.subdatasets(
                    contains=contains,
                    on_failure='ignore',
                    return_type='generator',
                    result_renderer='disabled'):
                subds = Dataset(sub['path'])
                if subds.is_installed():
                    for c in subds.containers_list(recursive=recursive,
                                                   return_type='generator',
                                                   on_failure='ignore',
                                                   result_filter=None,
                                                   result_renderer=None,
                                                   result_xfm=None):
                        c['name'] = sub['gitmodule_name'] + '/' + c['name']
                        c['refds'] = refds
                        yield c

        # all info is in the dataset config!
        var_prefix = 'datalad.containers.'
        containers = {}
        for var, value in ds.config.items():
            if not var.startswith(var_prefix):
                # not an interesting variable
                continue
            var_comps = var[len(var_prefix):].split('.')
            cname = var_comps[0]
            ccfgname = '.'.join(var_comps[1:])
            if not ccfgname:
                continue

            cinfo = containers.get(cname, {})
            cinfo[ccfgname] = value

            containers[cname] = cinfo

        for k, v in containers.items():
            if 'image' not in v:
                # there is no container location configured
                continue
            res = get_status_dict(
                status='ok',
                action='containers',
                name=k,
                type='file',
                path=op.join(ds.path, v.pop('image')),
                refds=refds,
                parentds=ds.path,
                # TODO
                #state='absent' if ... else 'present'
                **v)
            yield res

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        if res["action"] != "containers":
            default_result_renderer(res)
        else:
            ui.message(
                "{name} -> {path}"
                .format(name=ac.color_word(res["name"], ac.MAGENTA),
                        path=op.relpath(res["path"], res["refds"])))
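# A usage sketch (assumes the datalad-container extension is installed and
# at least one container was registered, e.g. via `datalad containers-add`):
#
#     from datalad.api import Dataset
#     ds = Dataset('.')
#     for c in ds.containers_list(recursive=True, return_type='generator'):
#         print(c['name'], '->', c['path'])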
class Clean(Interface): """Clean up after DataLad (possible temporary files etc.) Removes extracted temporary archives, etc. Examples: $ datalad clean """ _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to perform the clean operation on. If no dataset is given, an attempt is made to identify the dataset in current working directory""", constraints=EnsureDataset() | EnsureNone()), # TODO: --info -- which performs dry run just summarizing what is to be cleaned up # TODO: Python only??? what=Parameter( args=("--what", ), dest='what', choices=('cached-archives', 'annex-tmp', 'search-index'), nargs="*", doc="""What to clean. If none specified -- all known targets are cleaned"""), recursive=recursion_flag, recursion_limit=recursion_limit, ) @staticmethod @datasetmethod(name='clean') @eval_results def __call__(dataset=None, what=None, recursive=False, recursion_limit=None): ds = require_dataset(dataset, purpose='clean-up') res_kwargs = dict(action='clean', logger=lgr, refds=ds.path) for ap in AnnotatePaths.__call__(dataset=ds.path, recursive=recursive, recursion_limit=recursion_limit, action='clean', unavailable_path_status='impossible', nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): if ap.get('status', None): yield ap continue if ap.get('type', None) != 'dataset': ap.update(status='impossible', message='only datasets can be cleaned') yield ap continue d = ap['path'] gitdir = get_git_dir(d) for dirpath, flag, msg, sing_pl in [ (ARCHIVES_TEMP_DIR, "cached-archives", "temporary archive", ("directory", "directories")), (ANNEX_TEMP_DIR, "annex-tmp", "temporary annex", ("file", "files")), (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index', "metadata search index", ("file", "files")), ]: topdir = opj(d, dirpath) lgr.debug("Considering to clean %s:%s", d, dirpath) if not ((what is None) or (flag in what)): yield get_status_dict(path=topdir, status='notneeded', type='directory', **res_kwargs) continue paths = glob(opj(topdir, '*')) if not paths: yield get_status_dict(path=topdir, status='notneeded', type='directory', **res_kwargs) continue pl = len(paths) > 1 message = ("Removed %d %s %s: %s", len(paths), msg, sing_pl[int(pl)], ", ".join( sorted([x[len(topdir) + 1:] for x in paths]))) rmtree(topdir) yield get_status_dict(path=topdir, status='ok', type='dir', message=message, **res_kwargs)
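# A usage sketch (here restricted to cached extracted archives; without
# `what`, all known targets are cleaned):
#
#     from datalad.api import Dataset
#     Dataset('.').clean(what=['cached-archives'])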
class Search(Interface): """Search dataset metadata DataLad can search metadata extracted from a dataset and/or aggregated into a superdataset (see the `aggregate-metadata` command). This makes it possible to discover datasets, or individual files in a dataset even when they are not available locally. Ultimately DataLad metadata are a graph of linked data structures. However, this command does not (yet) support queries that can exploit all information stored in the metadata. At the moment the following search modes are implemented that represent different trade-offs between the expressiveness of a query and the computational and storage resources required to execute a query. - egrep (default) - egrepcs [case-sensitive egrep] - textblob - autofield An alternative default mode can be configured by tuning the configuration variable 'datalad.search.default-mode':: [datalad "search"] default-mode = egrepcs Each search mode has its own default configuration for what kind of documents to query. The respective default can be changed via configuration variables:: [datalad "search"] index-<mode_name>-documenttype = (all|datasets|files) *Mode: egrep/egrepcs* These search modes are largely ignorant of the metadata structure, and simply perform matching of a search pattern against a flat string-representation of metadata. This is advantageous when the query is simple and the metadata structure is irrelevant, or precisely known. Moreover, it does not require a search index, hence results can be reported without an initial latency for building a search index when the underlying metadata has changed (e.g. due to a dataset update). By default, these search modes only consider datasets and do not investigate records for individual files for speed reasons. Search results are reported in the order in which they were discovered. Queries can make use of Python regular expression syntax (https://docs.python.org/3/library/re.html). In `egrep` mode, matching is case-insensitive when the query does not contain upper case characters, but is case-sensitive when it does. In `egrepcs` mode, matching is always case-sensitive. Expressions will match anywhere in a metadata string, not only at the start. When multiple queries are given, all queries have to match for a search hit (AND behavior). It is possible to search individual metadata key/value items by prefixing the query with a metadata key name, separated by a colon (':'). The key name can also be a regular expression to match multiple keys. A query match happens when any value of an item with a matching key name matches the query (OR behavior). See examples for more information. Examples: Query for (what happens to be) an author:: % datalad search haxby Queries are case-INsensitive when the query contains no upper case characters, and can be regular expressions. Use `egrepcs` mode when it is desired to perform a case-sensitive lowercase match:: % datalad search --mode egrepcs halchenko.*haxby This search mode performs NO analysis of the metadata content. Therefore queries can easily fail to match. For example, the above query implicitly assumes that authors are listed in alphabetical order. If that is the case (which may or may not be true), the following query would yield NO hits:: % datalad search Haxby.*Halchenko The ``textblob`` search mode represents an alternative that is more robust in such cases. For more complex queries multiple query expressions can be provided that all have to match to be considered a hit (AND behavior). 
This query discovers all files (non-default behavior) that match 'bids.type=T1w' AND 'nifti1.qform_code=scanner':: % datalad -c datalad.search.index-egrep-documenttype=all search bids.type:T1w nifti1.qform_code:scanner Key name selectors can also be expressions, which can be used to select multiple keys or construct "fuzzy" queries. In such cases a query matches when any item with a matching key matches the query (OR behavior). However, multiple queries are always evaluated using an AND conjunction. The following query extends the example above to match any files that have either 'nifti1.qform_code=scanner' or 'nifti1.sform_code=scanner':: % datalad -c datalad.search.index-egrep-documenttype=all search bids.type:T1w nifti1.(q|s)form_code:scanner *Mode: textblob* This search mode is very similar to the ``egrep`` mode, but with a few key differences. A search index is built from the string-representation of metadata records. By default, only datasets are included in this index, hence the indexing is usually completed within a few seconds, even for hundreds of datasets. This mode uses its own query language (not regular expressions) that is similar to other search engines. It supports logical conjunctions and fuzzy search terms. More information on this is available from the Whoosh project (search engine implementation): - Description of the Whoosh query language: http://whoosh.readthedocs.io/en/latest/querylang.html) - Description of a number of query language customizations that are enabled in DataLad, such as, fuzzy term matching: http://whoosh.readthedocs.io/en/latest/parsing.html#common-customizations Importantly, search hits are scored and reported in order of descending relevance, hence limiting the number of search results is more meaningful than in the 'egrep' mode and can also reduce the query duration. Examples: Search for (what happens to be) two authors, regardless of the order in which those names appear in the metadata:: % datalad search --mode textblob halchenko haxby Fuzzy search when you only have an approximate idea what you are looking for or how it is spelled:: % datalad search --mode textblob haxbi~ Very fuzzy search, when you are basically only confident about the first two characters and how it sounds approximately (or more precisely: allow for three edits and require matching of the first two characters):: % datalad search --mode textblob haksbi~3/2 Combine fuzzy search with logical constructs:: % datalad search --mode textblob 'haxbi~ AND (hanke OR halchenko)' *Mode: autofield* This mode is similar to the 'textblob' mode, but builds a vastly more detailed search index that represents individual metadata variables as individual fields. By default, this search index includes records for datasets and individual fields, hence it can grow very quickly into a huge structure that can easily take an hour or more to build and require more than a GB of storage. However, limiting it to documents on datasets (see above) retains the enhanced expressiveness of queries while dramatically reducing the resource demands. 
Examples:

      List names of search index fields (auto-discovered from the set of
      indexed datasets)::

        % datalad search --mode autofield --show-keys name

      Fuzzy search for datasets with an author that is specified in a
      particular metadata field::

        % datalad search --mode autofield bids.author:haxbi~ type:dataset

      Search for individual files that carry a particular description
      prefix in their 'nifti1' metadata::

        % datalad search --mode autofield nifti1.description:FSL* type:file

    *Reporting*

    Search hits are returned as standard DataLad results. On the command
    line the '--output-format' (or '-f') option can be used to tweak
    results for further processing.

    Examples:

      Format search hits as a JSON stream (one hit per line)::

        % datalad -f json search haxby

      Custom formatting: which terms matched the query for particular
      results. Useful for investigating fuzzy search results::

        % datalad -f '{path}: {query_matched}' search --mode autofield bids.author:haxbi~
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to perform the query operation on. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory and/or the `path`
            given""",
            constraints=EnsureDataset() | EnsureNone()),
        query=Parameter(
            args=("query", ),
            metavar='QUERY',
            nargs="*",
            doc="""query string, supported syntax and features depends on
            the selected search mode (see documentation)"""),
        force_reindex=Parameter(
            args=("--reindex", ),
            dest='force_reindex',
            action='store_true',
            doc="""force rebuilding the search index, even if no change in
            the dataset's state has been detected, for example, when the
            index documenttype configuration has changed."""),
        max_nresults=Parameter(
            args=("--max-nresults", ),
            doc="""maximum number of search results to report. Setting this
            to 0 will report all search matches. Depending on the mode, this
            can make the search substantially slower. If not specified, a
            mode-specific default setting will be used.""",
            constraints=EnsureInt() | EnsureNone()),
        mode=Parameter(
            args=("--mode", ),
            choices=('egrep', 'textblob', 'autofield'),
            doc="""Mode of search index structure and content. See section
            SEARCH MODES for details.
            """),
        full_record=Parameter(
            args=("--full-record", '-f'),
            action='store_true',
            doc="""If set, return the full metadata record for each search
            hit. Depending on the search mode this might require additional
            queries. By default, only data that is available to the
            respective search modes is returned. This always includes
            essential information, such as the path and the type."""),
        show_keys=Parameter(
            args=('--show-keys', ),
            choices=('name', 'short', 'full'),
            default=None,
            doc="""if given, a list of known search keys is shown. If
            'name' - only the name is printed one per line. If 'short' or
            'full', statistics (in how many datasets, and how many unique
            values) are printed. 'short' truncates the listing of unique
            values. No other action is performed (except for reindexing),
            even if other arguments are given. Each key is accompanied by a
            term definition in parentheses (TODO). In most cases a
            definition is given in the form of a URL. If an ontology
            definition for a term is known, this URL can resolve to a
            webpage that provides a comprehensive definition of the term.
However, for speed reasons term resolution is solely done on information contained in a local dataset's metadata, and definition URLs might be outdated or point to no longer existing resources.""" ), show_query=Parameter( args=('--show-query', ), action='store_true', doc="""if given, the formal query that was generated from the given query string is shown, but not actually executed. This is mostly useful for debugging purposes."""), ) @staticmethod @datasetmethod(name='search') @eval_results def __call__(query=None, dataset=None, force_reindex=False, max_nresults=None, mode=None, full_record=False, show_keys=None, show_query=False): try: ds = require_dataset(dataset, check_installed=True, purpose='dataset search') if ds.id is None: raise NoDatasetArgumentFound( "This does not seem to be a dataset (no DataLad dataset ID " "found). 'datalad create --force %s' can initialize " "this repository as a DataLad dataset" % ds.path) except NoDatasetArgumentFound: for r in _search_from_virgin_install(dataset, query): yield r return if mode is None: # let's get inspired by what the dataset/user think is # default mode = ds.config.obtain('datalad.search.default-mode') if mode == 'egrep': searcher = _EGrepSearch elif mode == 'egrepcs': searcher = _EGrepCSSearch elif mode == 'textblob': searcher = _BlobSearch elif mode == 'autofield': searcher = _AutofieldSearch else: raise ValueError('unknown search mode "{}"'.format(mode)) searcher = searcher(ds, force_reindex=force_reindex) if show_keys: searcher.show_keys(show_keys) return if not query: return if show_query: print(repr(searcher.get_query(query))) return for r in searcher(query, max_nresults=max_nresults, full_record=full_record): yield r
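# A usage sketch (query terms are placeholders; in the index-backed modes
# hits arrive in order of descending relevance):
#
#     from datalad.api import Dataset
#     superds = Dataset('~/supds')  # hypothetical local superdataset
#     for hit in superds.search('haxby', mode='egrep',
#                               return_type='generator'):
#         print(hit['path'], hit.get('query_matched', {}))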
class Save(Interface):
    """Save the current state of a dataset

    Saving the state of a dataset records changes that have been made to it.
    This change record is annotated with a user-provided description.
    Optionally, an additional tag, such as a version, can be assigned to the
    saved state. Such tag enables straightforward retrieval of past versions
    at a later point in time.

    .. note::
      Before Git v2.22, any Git repository without an initial commit located
      inside a Dataset is ignored, and content underneath it will be saved
      to the respective superdataset. DataLad datasets always have an
      initial commit, hence are not affected by this behavior.
    """
    # note above documents that our behavior is like that of `git add`, but
    # does not explicitly mention the connection to keep it simple.

    _examples_ = [
        dict(text="""Save any content underneath the current directory,
             without altering any potential subdataset""",
             code_py="save(path='.')",
             code_cmd="datalad save ."),
        dict(text="""Save specific content in the dataset""",
             code_py="save(path='myfile.txt')",
             code_cmd="datalad save myfile.txt"),
        dict(text="""Attach a commit message to save""",
             code_py="save(path='myfile.txt', message='add file')",
             code_cmd="datalad save -m 'add file' myfile.txt"),
        dict(text="""Save any content underneath the current directory, and
             recurse into any potential subdatasets""",
             code_py="save(path='.', recursive=True)",
             code_cmd="datalad save . -r"),
        dict(text="Save any modification of known dataset content in the "
                  "current directory, but leave untracked files (e.g. "
                  "temporary files) untouched",
             code_py="""save(path='.', updated=True)""",
             code_cmd="""datalad save -u ."""),
        dict(text="Tag the most recent saved state of a dataset",
             code_py="save(version_tag='bestyet')",
             code_cmd="datalad save --version-tag 'bestyet'"),
        dict(text="Save a specific change but integrate into last commit "
                  "keeping the already recorded commit message",
             code_py="save(path='myfile.txt', amend=True)",
             code_cmd="datalad save myfile.txt --amend")
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to save""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar='PATH',
            doc="""path/name of the dataset component to save. If given,
            only changes made to those components are recorded in the new
            state.""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        message=save_message_opt,
        message_file=Parameter(
            args=("-F", "--message-file"),
            doc="""take the commit message from this file. This flag is
            mutually exclusive with -m.""",
            constraints=EnsureStr() | EnsureNone()),
        version_tag=Parameter(
            args=("-t", "--version-tag",),
            metavar='ID',
            doc="""an additional marker for that state. Every dataset that
            is touched will receive the tag.""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        updated=Parameter(
            args=('-u', '--updated',),
            action='store_true',
            doc="""if given, only saves previously tracked paths."""),
        to_git=Parameter(
            args=("--to-git",),
            action='store_true',
            doc="""flag whether to add data directly to Git, instead of
            tracking data identity only. Use with caution, there is no
            guarantee that a file put directly into Git like this will
            not be annexed in a subsequent save operation.
            If not specified, it will be up to git-annex to decide how
            a file is tracked, based on a dataset's configuration
            to track particular paths, file types, or file sizes with
            either Git or git-annex.
            (see https://git-annex.branchable.com/tips/largefiles).
"""), jobs=jobs_opt, amend=Parameter( args=('--amend',), action='store_true', doc="""if set, changes are not recorded in a new, separate commit, but are integrated with the changeset of the previous commit, and both together are recorded by replacing that previous commit. This is mutually exclusive with recursive operation. """), ) @staticmethod @datasetmethod(name='save') @eval_results def __call__(path=None, *, message=None, dataset=None, version_tag=None, recursive=False, recursion_limit=None, updated=False, message_file=None, to_git=None, jobs=None, amend=False, ): if message and message_file: raise ValueError( "Both a message and message file were specified for save()") if amend and recursive: raise ValueError("Cannot amend a commit recursively.") path = ensure_list(path) if message_file: with open(message_file) as mfh: message = mfh.read() # we want 'normal' to achieve the most compact argument list # for git calls # untracked_mode = 'no' if updated else 'normal' # TODO however, Repo.add() would refuse to add any dotfiles # in a directory that is itself untracked, hence the only # choice is to go with potentially crazy long lists # until https://github.com/datalad/datalad/issues/1454 # has a resolution untracked_mode = 'no' if updated else 'all' # there are three basic scenarios: # 1. save modifications to any already tracked content # 2. save any content (including removal of deleted content) # to bring things to a clean state # 3. like (2), but only operate on a given subset of content # identified by paths # - all three have to work in conjunction with --recursive # - the difference between (1) and (2) should be no more # that a switch from --untracked=no to --untracked=all # in Repo.save() # we do not support # - simultaneous operations on multiple datasets from disjoint # dataset hierarchies, hence a single reference dataset must be # identifiable from the either # - curdir or # - the `dataset` argument. # This avoids complex annotation loops and hierarchy tracking. # - any modification upwards from the root dataset ds = require_dataset(dataset, check_installed=True, purpose='save') # use status() to do all discovery and annotation of paths paths_by_ds = {} for s in Status()( # ATTN: it is vital to pass the `dataset` argument as it, # and not a dataset instance in order to maintain the path # semantics between here and the status() call dataset=dataset, path=path, untracked=untracked_mode, recursive=recursive, recursion_limit=recursion_limit, on_failure='ignore', # for save without recursion only commit matters eval_subdataset_state='full' if recursive else 'commit', return_type='generator', # this could be, but for now only 'error' results are handled # below #on_failure='ignore', result_renderer='disabled'): if s['status'] == 'error': # Downstream code can't do anything with these. Let the caller # decide their fate. 
yield s continue # fish out status dict for this parent dataset ds_status = paths_by_ds.get(s['parentds'], {}) # reassemble path status info as repo.status() would have made it ds_status[ut.Path(s['path'])] = \ {k: v for k, v in s.items() if k not in ( 'path', 'parentds', 'refds', 'status', 'action', 'logger')} paths_by_ds[s['parentds']] = ds_status lgr.debug('Determined %i datasets for saving from input arguments', len(paths_by_ds)) # figure out what datasets to process, start with the ones containing # the paths that were given as arguments discovered_datasets = list(paths_by_ds.keys()) if dataset: # if a reference dataset was given we want to save all the way up # to it, so let's throw it into the mix discovered_datasets.append(ds.path) # sort the datasets into (potentially) disjoint hierarchies, # or a single one, if a reference dataset was given dataset_hierarchies = get_tree_roots(discovered_datasets) for rootds, children in dataset_hierarchies.items(): edges = {} discover_dataset_trace_to_targets( rootds, children, [], edges, includeds=children) for superds, subdss in edges.items(): superds_status = paths_by_ds.get(superds, {}) for subds in subdss: subds_path = ut.Path(subds) sub_status = superds_status.get(subds_path, {}) if not (sub_status.get("state") == "clean" and sub_status.get("type") == "dataset"): # TODO actually start from an entry that may already # exist in the status record superds_status[subds_path] = dict( # shot from the hip, some status config # to trigger this specific super/sub # relation to be saved state='untracked', type='dataset') paths_by_ds[superds] = superds_status def save_ds(args, version_tag=None): pdspath, paths = args pds = Dataset(pdspath) pds_repo = pds.repo # pop status for this dataset, we are not coming back to it pds_status = { # for handing over to the low-level code, we recode any # path relative to the real repo location, this avoid # cumbersome symlink handling without context in the # lower levels pds_repo.pathobj / p.relative_to(pdspath): props for p, props in paths.items()} start_commit = pds_repo.get_hexsha() if not all(p['state'] == 'clean' for p in pds_status.values()) or \ (amend and message): for res in pds_repo.save_( message=message, # make sure to have the `path` arg be None, as we want # to prevent and bypass any additional repo.status() # calls paths=None, # prevent whining of GitRepo git=True if not hasattr(ds.repo, 'uuid') else to_git, # we are supplying the full status already, do not # detect anything else untracked='no', _status=pds_status, amend=amend): # TODO remove stringification when datalad-core can handle # path objects, or when PY3.6 is the lowest supported # version for k in ('path', 'refds'): if k in res: res[k] = str( # recode path back to dataset path anchor pds.pathobj / res[k].relative_to( pds_repo.pathobj) ) yield res # report on the dataset itself dsres = dict( action='save', type='dataset', path=pds.path, refds=ds.path, status='ok' if start_commit != pds_repo.get_hexsha() else 'notneeded', logger=lgr, ) if not version_tag: yield dsres return try: # method requires str version_tag = str(version_tag) pds_repo.tag(version_tag) dsres.update( status='ok', version_tag=version_tag) yield dsres except CommandError as e: if dsres['status'] == 'ok': # first we yield the result for the actual save # TODO: we will get duplicate dataset/save record obscuring # progress reporting. 
                    # yoh thought to decouple "tag" from "save" messages but
                    # was worried that the original authors would disagree
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(
                    status='error',
                    message=('cannot tag this version: %s',
                             e.stderr.strip()))
                yield dsres

        if not paths_by_ds:
            # Special case: empty repo. There's either an empty commit only
            # or none at all. An empty one we can amend; otherwise there's
            # nothing to do.
            if amend and ds.repo.get_hexsha():
                yield from save_ds((ds.pathobj, dict()),
                                   version_tag=version_tag)
            else:
                yield dict(action='save',
                           type='dataset',
                           path=ds.path,
                           refds=ds.path,
                           status='notneeded',
                           logger=lgr)
            return

        # TODO: in principle logging could be improved to go not by a
        # dataset but by path(s) within subdatasets. That should provide a
        # bit better ETA and more "dynamic" feedback than a jumpy datasets
        # count. See addurls where it is implemented that way by providing
        # agg and another log_filter
        yield from ProducerConsumerProgressLog(
            sorted(paths_by_ds.items(), key=lambda v: v[0], reverse=True),
            partial(save_ds, version_tag=version_tag),
            safe_to_consume=no_subds_in_futures,
            producer_future_key=lambda ds_items: ds_items[0],
            jobs=jobs,
            log_filter=_log_filter_save_dataset,
            unit="datasets",
            lgr=lgr,
        )
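# A usage sketch for `save` (corresponds to the _examples_ above; file and
# tag names are placeholders):
#
#     from datalad.api import Dataset
#     ds = Dataset('.')
#     ds.save(path='myfile.txt', message='add file')
#     # recursive save across subdatasets, tagging the resulting state
#     ds.save(recursive=True, version_tag='bestyet')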
class Push(Interface):
    """Push a dataset to a known :term:`sibling`.

    This makes a saved state of a dataset available to a sibling or special
    remote data store of a dataset. Any target sibling must already exist
    and be known to the dataset.

    || REFLOW >>
    By default, all files tracked in the last saved state (of the current
    branch) will be copied to the target location. Optionally, it is
    possible to limit a push to changes relative to a particular point in
    the version history of a dataset (e.g. a release tag) using the
    [CMD: --since CMD][PY: since PY] option in conjunction with the
    specification of a reference dataset. In recursive mode subdatasets
    will also be evaluated, and only those subdatasets are pushed where a
    change was recorded that is reflected in the current state of the
    top-level reference dataset.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git push`, and
      :command:`git annex copy` to push a dataset. Publication targets are
      either configured remote Git repositories, or git-annex special
      remotes (if they support data upload).
    """

    # TODO add examples

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to push""",
            constraints=EnsureDataset() | EnsureNone()),
        to=Parameter(
            args=("--to", ),
            metavar='SIBLING',
            doc="""name of the target sibling. If no name is given an
            attempt is made to identify the target based on the dataset's
            configuration (i.e. a configured tracking branch, or a single
            sibling that is configured for push)""",
            constraints=EnsureStr() | EnsureNone()),
        since=Parameter(
            args=("--since", ),
            constraints=EnsureStr() | EnsureNone(),
            doc="""specifies commit-ish (tag, shasum, etc.) from which to
            look for changes to decide whether pushing is necessary.
            If '^' is given, the last state of the current branch at the
            sibling is taken as a starting point."""),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path to constrain a push to. If given, only data or
            changes for those paths are considered for a push.""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        data=Parameter(
            args=("--data", ),
            doc="""what to do with (annex'ed) data. 'anything' would cause
            transfer of all annexed content, 'nothing' would avoid a call
            to `git annex copy` altogether. 'auto' would use 'git annex
            copy' with '--auto' thus transferring only data which would
            satisfy "wanted" or "numcopies" settings for the remote (thus
            "nothing" otherwise). 'auto-if-wanted' would enable '--auto'
            mode only if there is a "wanted" setting for the remote, and
            transfer 'anything' otherwise.
            """,
            constraints=EnsureChoice('anything', 'nothing', 'auto',
                                     'auto-if-wanted')),
        force=Parameter(
            # multi-mode option https://github.com/datalad/datalad/issues/3414
            args=("-f", "--force",),
            doc="""force particular operations, possibly overruling safety
            protections or optimizations: use --force with git-push
            ('gitpush'); do not use --fast with git-annex copy
            ('checkdatapresent'); combine all force modes ('all').""",
            constraints=EnsureChoice('all', 'gitpush', 'checkdatapresent',
                                     None)),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        jobs=jobs_opt,
    )

    # Desired features:
    # - let Git do its thing (push multiple configured refs without the
    #   need to specify anything on the command line)
    #   - complication: we need publication dependencies (i.e. publish what
    #     would be published by Git to a different remote first, hence we
    #     cannot simply watch Git do it, and later act on it.)
    #     - https://github.com/datalad/datalad/issues/1284
    #     - https://github.com/datalad/datalad/issues/4006
    # - make differences between remotes and various types of special
    #   remotes opaque
    #   - https://github.com/datalad/datalad/issues/3127
    # - informative and comprehensive (error) reporting
    #   - https://github.com/datalad/datalad/issues/2000
    #   - https://github.com/datalad/datalad/issues/1682
    #   - https://github.com/datalad/datalad/issues/2029
    #   - https://github.com/datalad/datalad/issues/2855
    #   - https://github.com/datalad/datalad/issues/3412
    #   - https://github.com/datalad/datalad/issues/3424
    # - ensure robust behavior in multi-lateral push scenarios (updating
    #   a dataset that was updated by a 3rd-party after the last known
    #   fetched change)
    #   - https://github.com/datalad/datalad/issues/2636
    # - should NOT mimic `publish` in that it mixes `create-sibling` and
    #   `push` into a single operation. This would fold the complexity
    #   of all possible ways a local dataset hierarchy could possibly be
    #   connected to remote ends into this command. It would be a lost
    #   battle from the start.
    #   - not tackled: https://github.com/datalad/datalad/issues/2186
    # - maintain standard setup, and do not reflect procedural aspects
    #   onto the resulting outcomes
    #   - https://github.com/datalad/datalad/issues/2001
    # - do a straight push, nothing like 'sync'. If a remote has something
    #   that needs merging first, fail and let users update. Any diff we
    #   are missing locally can impact decision making via --since and
    #   friends.

    @staticmethod
    @datasetmethod(name='push')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 to=None,
                 since=None,
                 data='auto-if-wanted',
                 force=None,
                 recursive=False,
                 recursion_limit=None,
                 jobs=None):
        # push uses '^' to annotate the previously pushed committish, and
        # None for default behavior. '' was/is (to be deprecated) used in
        # `publish`. Alert user about the mistake
        if since == '':
            raise ValueError(
                "'since' should point to a commit-ish, or use '^'.")

        # we resolve here, because we need to perform inspection on what
        # was given as an input argument further down
        paths = [resolve_path(p, dataset) for p in assure_list(path)]

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='pushing')
        ds_repo = ds.repo

        res_kwargs = dict(
            action='publish',
            refds=ds.path,
            logger=lgr,
        )

        get_remote_kwargs = {'exclude_special_remotes': False} \
            if isinstance(ds_repo, AnnexRepo) else {}
        if to and to not in ds_repo.get_remotes(**get_remote_kwargs):
            # get again for a proper error:
            sr = ds_repo.get_remotes(**get_remote_kwargs)
            # yield an error result instead of raising a ValueError,
            # to enable the use case of pushing to a target that
            # a superdataset doesn't know, but some subdatasets do
            # (in combination with '--on-failure ignore')
            yield dict(
                res_kwargs,
                status='error',
                path=ds.path,
                message="Unknown push target '{}'. 
{}".format( to, 'Known targets: {}.'.format(', '.join( repr(s) for s in sr)) if sr else 'No targets configured in dataset.')) return if since == '^': # figure out state of remote branch and set `since` since = _get_corresponding_remote_state(ds_repo, to) if not since: lgr.info("No tracked remote for active branch, " "detection of last pushed state not in effect.") elif since: # will blow with ValueError if unusable ds_repo.get_hexsha(since) # obtain a generator for information on the datasets to process # idea is to turn the `paths` argument into per-dataset # content listings that can be acted upon ds_spec = _datasets_since_( # important to pass unchanged dataset arg dataset, since, paths, recursive, recursion_limit) # instead of a loop, this could all be done in parallel matched_anything = False for dspath, dsrecords in ds_spec: matched_anything = True lgr.debug('Attempt push of Dataset at %s', dspath) pbars = {} yield from _push(dspath, dsrecords, to, data, force, jobs, res_kwargs.copy(), pbars, got_path_arg=True if path else False) # take down progress bars for this dataset for i, ds in pbars.items(): log_progress(lgr.info, i, 'Finished push of %s', ds) if not matched_anything: yield dict( res_kwargs, status='notneeded', message= 'Given constraints did not match any changes to publish', type='dataset', path=ds.path, ) @staticmethod def custom_result_summary_renderer(results): # pragma: more cover # report on any hints at the end # get all unique hints hints = set([r.get('hints', None) for r in results]) hints = [hint for hint in hints if hint is not None] if hints: from datalad.ui import ui from datalad.support import ansi_colors intro = ansi_colors.color_word( "Potential hints to solve encountered errors: ", ansi_colors.YELLOW) ui.message(intro) [ ui.message("{}: {}".format( ansi_colors.color_word(id + 1, ansi_colors.YELLOW), hint)) for id, hint in enumerate(hints) ]
class Run(Interface): """Run an arbitrary shell command and record its impact on a dataset. It is recommended to craft the command such that it can run in the root directory of the dataset that the command will be recorded in. However, as long as the command is executed somewhere underneath the dataset root, the exact location will be recorded relative to the dataset root. If the executed command did not alter the dataset in any way, no record of the command execution is made. If the given command errors, a `CommandError` exception with the same exit code will be raised, and no modifications will be saved. *Command format* || REFLOW >> A few placeholders are supported in the command via Python format specification. "{pwd}" will be replaced with the full path of the current working directory. "{dspath}" will be replaced with the full path of the dataset that run is invoked on. "{tmpdir}" will be replaced with the full path of a temporary directory. "{inputs}" and "{outputs}" represent the values specified by [CMD: --input and --output CMD][PY: `inputs` and `outputs` PY]. If multiple values are specified, the values will be joined by a space. The order of the values will match the order from the command line, with any globs expanded in alphabetical order (like bash). Individual values can be accessed with an integer index (e.g., "{inputs[0]}"). << REFLOW || || REFLOW >> Note that the representation of the inputs or outputs in the formatted command string depends on whether the command is given as a list of arguments or as a string[CMD: (quotes surrounding the command) CMD]. The concatenated list of inputs or outputs will be surrounded by quotes when the command is given as a list but not when it is given as a string. This means that the string form is required if you need to pass each input as a separate argument to a preceding script (i.e., write the command as "./script {inputs}", quotes included). The string form should also be used if the input or output paths contain spaces or other characters that need to be escaped. << REFLOW || To escape a brace character, double it (i.e., "{{" or "}}"). Custom placeholders can be added as configuration variables under "datalad.run.substitutions". As an example: Add a placeholder "name" with the value "joe":: % git config --file=.datalad/config datalad.run.substitutions.name joe % datalad add -m "Configure name placeholder" .datalad/config Access the new placeholder in a command:: % datalad run "echo my name is {name} >me" """ _examples_ = [ dict( text="Run an executable script and record the impact on a dataset", code_py="run(message='run my script', cmd='code/script.sh')", code_cmd="datalad run -m 'run my script' 'code/script.sh'"), dict(text="Run a command and specify a directory as a dependency " "for the run. "
"The contents of the dependency will be retrieved " "prior to running the script", code_cmd="datalad run -m 'run my script' -i 'data/*' " "'code/script.sh'", code_py="""\ run(cmd='code/script.sh', message='run my script', inputs=['data/*'])"""), dict(text="Run an executable script and specify output files of the " "script to be unlocked prior to running the script", code_py="""\ run(cmd='code/script.sh', message='run my script', inputs=['data/*'], outputs=['output_dir'])""", code_cmd="""\ datalad run -m 'run my script' -i 'data/*' \\ -o 'output_dir/*' 'code/script.sh'"""), dict(text="Specify multiple inputs and outputs", code_py="""\ run(cmd='code/script.sh', message='run my script', inputs=['data/*', 'datafile.txt'], outputs=['output_dir', 'outfile.txt'])""", code_cmd="""\ datalad run -m 'run my script' -i 'data/*' \\ -i 'datafile.txt' -o 'output_dir/*' -o \\ 'outfile.txt' 'code/script.sh'""") ] _params_ = dict( cmd=Parameter( args=("cmd", ), nargs=REMAINDER, metavar='COMMAND', doc="""command for execution. A leading '--' can be used to disambiguate this command from the preceding options to DataLad."""), dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to record the command results in. An attempt is made to identify the dataset based on the current working directory. If a dataset is given, the command will be executed in the root directory of this dataset.""", constraints=EnsureDataset() | EnsureNone()), inputs=Parameter( args=("-i", "--input"), dest="inputs", metavar=("PATH"), action='append', doc="""A dependency for the run. Before running the command, the content of this file will be retrieved. A value of "." means "run :command:`datalad get .`". The value can also be a glob. [CMD: This option can be given more than once. CMD]"""), outputs=Parameter( args=("-o", "--output"), dest="outputs", metavar=("PATH"), action='append', doc="""Prepare this file to be an output file of the command. A value of "." means "run :command:`datalad unlock .`" (and will fail if some content isn't present). For any other value, if the content of this file is present, unlock the file. Otherwise, remove it. The value can also be a glob. [CMD: This option can be given more than once. CMD]"""), expand=Parameter( args=("--expand", ), doc="""Expand globs when storing inputs and/or outputs in the commit message.""", constraints=EnsureChoice(None, "inputs", "outputs", "both")), explicit=Parameter( args=("--explicit", ), action="store_true", doc="""Consider the specification of inputs and outputs to be explicit. Don't warn if the repository is dirty, and only save modifications to the listed outputs."""), message=save_message_opt, sidecar=Parameter(args=('--sidecar', ), metavar="{yes|no}", doc="""By default, the configuration variable 'datalad.run.record-sidecar' determines whether a record with information on a command's execution is placed into a separate record file instead of the commit message (default: off). This option can be used to override the configured behavior on a case-by-case basis.
Sidecar files are placed into the dataset's '.datalad/runinfo' directory (customizable via the 'datalad.run.record-directory' configuration variable).""", constraints=EnsureNone() | EnsureBool()), ) @staticmethod @datasetmethod(name='run') @eval_results def __call__(cmd=None, dataset=None, inputs=None, outputs=None, expand=None, explicit=False, message=None, sidecar=None): for r in run_command(cmd, dataset=dataset, inputs=inputs, outputs=outputs, expand=expand, explicit=explicit, message=message, sidecar=sidecar): yield r
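# A minimal sketch of the placeholder expansion described in the Run docstring
# above. Plain str.format() only approximates the real behavior, which lives in
# run_command() and additionally performs globbing, quoting, and recording:
#
#   cmd = "./script {inputs} --out {outputs[0]}"
#   cmd.format(inputs=" ".join(["data/a.csv", "data/b.csv"]),
#              outputs=["results"])
#   # -> "./script data/a.csv data/b.csv --out results"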
class CreateSibling(Interface): """Create a dataset sibling on a UNIX-like SSH-accessible machine Given a local dataset and SSH login information, this command creates a remote dataset repository and configures it as a dataset sibling to be used as a publication target (see `publish` command). Various properties of the remote sibling can be configured (e.g. name, location on the server, read and write access URLs, and access permissions). Optionally, a basic web-viewer for DataLad datasets can be installed at the remote location. This command supports recursive processing of dataset hierarchies, creating a remote sibling for each dataset in the hierarchy. By default, remote siblings are created in a hierarchical structure that reflects the organization on the local file system. However, a simple templating mechanism is provided to produce a flat list of datasets (see --target-dir). """ # XXX prevent common args from being added to the docstring _no_eval_results = True _params_ = dict( # TODO: Figure out, whether (and when) to use `sshurl` as push url dataset=Parameter( args=( "--dataset", "-d", ), doc="""specify the dataset to create the publication target for. If no dataset is given, an attempt is made to identify the dataset based on the current working directory""", constraints=EnsureDataset() | EnsureNone()), sshurl=Parameter( args=("sshurl", ), metavar='SSHURL', nargs='?', doc="""Login information for the target server. This can be given as a URL (ssh://host/path) or SSH-style (user@host:path). Unless overridden, this also serves as the future dataset's access URL and path on the server.""", constraints=EnsureStr()), name=Parameter( args=( '-s', '--name', ), metavar='NAME', doc="""sibling name to create for this publication target. If `recursive` is set, the same name will be used to label all the subdatasets' siblings. When creating a target dataset fails, no sibling is added""", constraints=EnsureStr() | EnsureNone(), nargs="?"), target_dir=Parameter( args=('--target-dir', ), metavar='PATH', doc="""path to the directory *on the server* where the dataset shall be created. By default the SSH access URL is used to identify this directory. If a relative path is provided here, it is interpreted as being relative to the user's home directory on the server.\n Additional features are relevant for recursive processing of datasets with subdatasets. By default, the local dataset structure is replicated on the server. However, it is possible to provide a template for generating different target directory names for all (sub)datasets. Templates can contain certain placeholders that are substituted for each (sub)dataset. For example: "/mydirectory/dataset%%RELNAME".\nSupported placeholders:\n %%RELNAME - the name of the dataset, with any slashes replaced by dashes\n""", constraints=EnsureStr() | EnsureNone()), target_url=Parameter( args=('--target-url', ), metavar='URL', doc=""""public" access URL of the to-be-created target dataset(s) (default: `sshurl`). Accessibility of this URL determines the access permissions of potential consumers of the dataset. As with `target_dir`, templates (same set of placeholders) are supported. Also, if specified, it is provided as the annex description\n""", constraints=EnsureStr() | EnsureNone()), target_pushurl=Parameter( args=('--target-pushurl', ), metavar='URL', doc="""In case the `target_url` cannot be used to publish to the dataset, this option specifies an alternative URL for this purpose.
As with `target_url`, templates (same set of placeholders) are supported.\n""", constraints=EnsureStr() | EnsureNone()), recursive=recursion_flag, recursion_limit=recursion_limit, existing=Parameter( args=("--existing", ), constraints=EnsureChoice('skip', 'replace', 'error', 'reconfigure'), metavar='MODE', doc= """action to perform, if a sibling is already configured under the given name and/or a target directory already exists. In this case, a dataset can be skipped ('skip'), an existing target directory be forcefully re-initialized, and the sibling (re-)configured ('replace', implies 'reconfigure'), the sibling configuration be updated only ('reconfigure'), or the command can error ('error').""", ), inherit=inherit_opt, shared=Parameter( args=("--shared", ), metavar='false|true|umask|group|all|world|everybody|0xxx', doc="""if given, configures the access permissions on the server for multi-users (this could include access by a webserver!). Possible values for this option are identical to those of `git init --shared` and are described in its documentation.""", constraints=EnsureStr() | EnsureBool() | EnsureNone()), ui=Parameter(args=("--ui", ), metavar='false|true|html_filename', doc="""publish a web interface for the dataset with an optional user-specified name for the HTML at the publication target. Defaults to `index.html` at the dataset root""", constraints=EnsureBool() | EnsureStr()), as_common_datasrc=as_common_datasrc, publish_depends=publish_depends, publish_by_default=publish_by_default, annex_wanted=annex_wanted_opt, annex_group=annex_group_opt, annex_groupwanted=annex_groupwanted_opt, since=Parameter( args=("--since", ), constraints=EnsureStr() | EnsureNone(), doc= """limit processing to datasets that have been changed since a given state (by tag, branch, commit, etc). This can be used to create siblings for recently added subdatasets."""), ) @staticmethod @datasetmethod(name='create_sibling') @eval_results def __call__(sshurl, name=None, target_dir=None, target_url=None, target_pushurl=None, dataset=None, recursive=False, recursion_limit=None, existing='error', shared=None, ui=False, as_common_datasrc=None, publish_by_default=None, publish_depends=None, annex_wanted=None, annex_group=None, annex_groupwanted=None, inherit=False, since=None): # there is no point in doing anything further not_supported_on_windows( "Support for SSH connections is not yet implemented on Windows") # # nothing without a base dataset # ds = require_dataset(dataset, check_installed=True, purpose='creating a sibling') refds_path = ds.path # # all checks that are possible before we start parsing the dataset # # possibly use sshurl to get the name in case it is not specified if not sshurl: if not inherit: raise InsufficientArgumentsError( "needs at least an SSH URL if the 'inherit' option is not given") if name is None: raise ValueError( "Neither an SSH URL nor the name of a sibling to inherit from " "was specified") # It might well be that we already have this remote setup try: sshurl = CreateSibling._get_remote_url(ds, name) except Exception as exc: lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc)) elif inherit: raise ValueError( "For now, for clarity, we do not allow specifying a custom sshurl " "while inheriting settings") # maybe could be safely dropped -- still WiP if not sshurl: # TODO: maybe back up more before _prep?
super_ds = ds.get_superdataset() if not super_ds: raise ValueError( "Could not determine super dataset for %s to inherit URL" % ds) super_url = CreateSibling._get_remote_url(super_ds, name) # for now assuming hierarchical setup # (TODO: to be able to distinguish between the two, probably # needs storing datalad.*.target_dir to have %RELNAME in there) sshurl = slash_join(super_url, relpath(ds.path, super_ds.path)) # check the login URL sshri = RI(sshurl) if not is_ssh(sshri): raise ValueError( "Unsupported SSH URL: '{0}', " "use ssh://host/path or host:path syntax".format(sshurl)) if not name: # use the hostname as default remote name name = sshri.hostname lgr.debug( "No sibling name given, using URL hostname '%s' as sibling name", name) if since == '': # default behavior - only updated since last update # so we figure out what was the last update # XXX here we assume one-to-one mapping of names from local branches # to the remote active_branch = ds.repo.get_active_branch() since = '%s/%s' % (name, active_branch) # # parse the base dataset to find all subdatasets that need processing # to_process = [] for ap in AnnotatePaths.__call__( dataset=refds_path, # only a single path! path=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='create_sibling', # neither of the next two should happen anyway unavailable_path_status='impossible', nondataset_path_status='error', modified=since, return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent': # this can happen when there is `since`, but we have no # use for anything but datasets here continue checkds_remotes = Dataset(ap['path']).repo.get_remotes() \ if ap.get('state', None) != 'absent' \ else [] if publish_depends: # make sure dependencies are valid # TODO: inherit -- we might want to automagically create # those dependents as well??? unknown_deps = set( assure_list(publish_depends)).difference(checkds_remotes) if unknown_deps: ap['status'] = 'error' ap['message'] = ( 'unknown sibling(s) specified as publication dependency: %s', unknown_deps) yield ap continue if name in checkds_remotes and existing in ('error', 'skip'): ap['status'] = 'error' if existing == 'error' else 'notneeded' ap['message'] = ( "sibling '%s' already configured (specify alternative name, or force " "reconfiguration via --existing)", name) yield ap continue to_process.append(ap) if not to_process: # we ruled out all possibilities # TODO wait for gh-1218 and make better return values lgr.info("No datasets qualify for sibling creation. " "Consider different settings for --existing " "or --since if this is unexpected") return if target_dir is None: if sshri.path: target_dir = sshri.path else: target_dir = '.'
# TODO: centralize and generalize template symbol handling replicate_local_structure = "%RELNAME" not in target_dir # request ssh connection: lgr.info("Connecting ...") assert (sshurl is not None) # delayed sanity verification ssh = ssh_manager.get_connection(sshurl) if not ssh.get_annex_version(): raise MissingExternalDependency('git-annex', msg='on the remote system') # # all checks done and we have a connection, now do something # # loop over all datasets, ordered from top to bottom to make the test # below valid (existing directories would cause the machinery to halt) # But we need to run the post-update hook in depth-first fashion, so # we only collect first and then run (see gh #790) yielded = set() remote_repos_to_run_hook_for = [] for currentds_ap in \ sorted(to_process, key=lambda x: x['path'].count('/')): current_ds = Dataset(currentds_ap['path']) path = _create_dataset_sibling( name, current_ds, ds.path, ssh, replicate_local_structure, sshri, target_dir, target_url, target_pushurl, existing, shared, publish_depends, publish_by_default, as_common_datasrc, annex_wanted, annex_group, annex_groupwanted, inherit) if not path: # nothing new was created # TODO is 'notneeded' appropriate in this case? currentds_ap['status'] = 'notneeded' # TODO explain status in 'message' yield currentds_ap yielded.add(currentds_ap['path']) continue remote_repos_to_run_hook_for.append((path, currentds_ap)) # publish web-interface to root dataset on publication server if current_ds.path == ds.path and ui: lgr.info("Uploading web interface to %s", path) try: CreateSibling.upload_web_interface(path, ssh, shared, ui) except CommandError as e: currentds_ap['status'] = 'error' currentds_ap['message'] = ( "failed to push web interface to the remote datalad repository (%s)", exc_str(e)) yield currentds_ap yielded.add(currentds_ap['path']) continue # in reverse order would be depth-first lgr.info("Running post-update hooks in all created siblings") # TODO: add progressbar for path, currentds_ap in remote_repos_to_run_hook_for[::-1]: # Trigger the hook lgr.debug("Running hook for %s", path) try: ssh("cd {} && hooks/post-update".format( sh_quote(_path_(path, ".git")))) except CommandError as e: currentds_ap['status'] = 'error' currentds_ap['message'] = ( "failed to run post-update hook under remote path %s (%s)", path, exc_str(e)) yield currentds_ap yielded.add(currentds_ap['path']) continue if currentds_ap['path'] not in yielded: # if we were silent until now everything is just splendid currentds_ap['status'] = 'ok' yield currentds_ap @staticmethod def _get_ds_remote_shared_setting(ds, name, ssh): """Figure out setting of sharedrepository for dataset's `name` remote""" shared = None try: current_super_url = CreateSibling._get_remote_url(ds, name) current_super_ri = RI(current_super_url) out, err = ssh('git -C {} config --get core.sharedrepository'.format( # TODO -- we might need to expanduser taking .user into account # but then it must be done also on remote side sh_quote(current_super_ri.path))) shared = out.strip() if err: lgr.warning("Got stderr while calling ssh: %s", err) except CommandError as e: lgr.debug( "Could not figure out remote shared setting of %s for %s due " "to %s", ds, name, exc_str(e)) # could well be ok if e.g. not shared # TODO: more detailed analysis maybe?
return shared @staticmethod def _get_remote_url(ds, name): """A little helper to get url from pushurl or from url if not defined""" # take pushurl if present, if not -- just a url url = ds.config.get('remote.%s.pushurl' % name) or \ ds.config.get('remote.%s.url' % name) if not url: raise ValueError("%s had neither pushurl nor url defined for %s" % (ds, name)) return url @staticmethod def init_remote_repo(path, ssh, shared, dataset, description=None): cmd = "git -C {} init{}".format( sh_quote(path), " --shared='{}'".format(sh_quote(shared)) if shared else '') try: ssh(cmd) except CommandError as e: lgr.error("Initialization of remote git repository failed at %s." "\nError: %s\nSkipping ...", path, exc_str(e)) return False if isinstance(dataset.repo, AnnexRepo): # init remote git annex repo (part fix of #463) try: ssh("git -C {} annex init {}".format( sh_quote(path), sh_quote(description) if description else '')) except CommandError as e: lgr.error( "Initialization of remote git annex repository failed at %s." "\nError: %s\nSkipping ...", path, exc_str(e)) return False return True @staticmethod def create_postupdate_hook(path, ssh, dataset): # location of post-update hook file, logs folder on remote target hooks_remote_dir = opj(path, '.git', 'hooks') # make sure hooks directory exists (see #1251) ssh('mkdir -p {}'.format(sh_quote(hooks_remote_dir))) hook_remote_target = opj(hooks_remote_dir, 'post-update') # create json command for current dataset log_filename = 'datalad-publish-hook-$(date +%s).log' % TIMESTAMP_FMT hook_content = r'''#!/bin/bash git update-server-info # # DataLad # # (Re)generate meta-data for DataLad Web UI and possibly init new submodules dsdir="$(dirname $0)/../.." logfile="$dsdir/{WEB_META_LOG}/{log_filename}" if [ ! -e "$dsdir/.git" ]; then echo Assumption of being under .git has failed >&2 exit 1 fi mkdir -p "$dsdir/{WEB_META_LOG}" # assure logs directory exists ( which datalad > /dev/null \ && ( cd "$dsdir"; GIT_DIR="$PWD/.git" datalad ls -a --json file .; ) \ || echo "E: no datalad found - skipping generation of indexes for web frontend"; \ ) &> "$logfile" # Some submodules might have been added and thus we better init them ( cd "$dsdir"; git submodule update --init || : ; ) >> "$logfile" 2>&1 '''.format(WEB_META_LOG=WEB_META_LOG, **locals()) with make_tempfile(content=hook_content) as tempf: # create post_update hook script # upload hook to dataset ssh.copy(tempf, hook_remote_target) # and make it executable ssh('chmod +x {}'.format(sh_quote(hook_remote_target))) @staticmethod def upload_web_interface(path, ssh, shared, ui): # path to web interface resources on local webui_local = opj(dirname(datalad.__file__), 'resources', 'website') # local html to dataset html_local = opj(webui_local, "index.html") # name and location of web-interface html on target html_targetname = ui if isinstance(ui, str) else "index.html" html_target = opj(path, html_targetname) # upload ui html to target ssh.copy(html_local, html_target) # upload assets to the dataset webresources_local = opj(webui_local, 'assets') webresources_remote = opj(path, WEB_HTML_DIR) ssh('mkdir -p {}'.format(sh_quote(webresources_remote))) ssh.copy(webresources_local, webresources_remote, recursive=True) # minimize and upload js assets for js_file in glob(opj(webresources_local, 'js', '*.js')): with open(js_file) as asset: try: from jsmin import jsmin # jsmin = lambda x: x # no minimization minified = jsmin(asset.read()) # minify asset except ImportError: lgr.warning( "Will not minify web interface "
"javascript, no jsmin available") minified = asset.read() # no minify available with make_tempfile(content=minified) as tempf: # write minified to tempfile js_name = js_file.split('/')[-1] ssh.copy(tempf, opj(webresources_remote, 'assets', 'js', js_name)) # and upload js # explicitly adjust read permissions of the dataset's web+metadata dir to match the 'shared' setting mode = None if shared in (True, 'true', 'all', 'world', 'everybody'): mode = 'a+rX' elif shared == 'group': mode = 'g+rX' elif str(shared).startswith('0'): mode = shared if mode: ssh('chmod {} -R {} {}'.format( mode, sh_quote(dirname(webresources_remote)), sh_quote(opj(path, 'index.html'))))
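# Sketch of the --target-dir templating documented above. %RELNAME is the only
# documented placeholder; this hypothetical helper merely illustrates the
# substitution and is not the actual implementation:
#
#   def _expand_target_dir(template, ds_relpath):
#       # dataset name with slashes replaced by dashes, as documented
#       return template.replace('%RELNAME', ds_relpath.replace('/', '-'))
#
#   _expand_target_dir('/mydirectory/dataset-%RELNAME', 'sub/ds1')
#   # -> '/mydirectory/dataset-sub-ds1'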
class Rerun(Interface): """Re-execute previous `datalad run` commands. This will unlock any dataset content that is on record as having been modified by the command in the specified revision. It will then re-execute the command in the recorded path (if it was inside the dataset). Afterwards, all modifications will be saved. *Report mode* || REFLOW >> When called with [CMD: --report CMD][PY: report=True PY], this command reports information about what would be re-executed as a series of records. There will be a record for each revision in the specified revision range. Each of these will have one of the following "rerun_action" values: << REFLOW || - run: the revision has a recorded command that would be re-executed - skip: the revision does not have a recorded command and would be skipped - pick: the revision does not have a recorded command and would be cherry-picked The decision to skip rather than cherry-pick a revision is based on whether the revision would be reachable from HEAD at the time of execution. In addition, when a starting point other than HEAD is specified, there is a rerun_action value "checkout", in which case the record includes information about the revision that would be checked out before rerunning any commands. Examples: Re-execute the command from the previous commit:: % datalad rerun Re-execute any commands in the last five commits:: % datalad rerun --since=HEAD~5 Do the same as above, but re-execute the commands on top of HEAD~5 in a detached state:: % datalad rerun --onto= --since=HEAD~5 Re-execute all previous commands and compare the old and new results:: % # on master branch % datalad rerun --branch=verify --since= % # now on verify branch % datalad diff --revision=master.. % git log --oneline --left-right --cherry-pick master... .. note:: Currently the "onto" feature only sets the working tree of the current dataset to a previous state. The working trees of any subdatasets remain unchanged. """ _params_ = dict( revision=Parameter( args=("revision",), metavar="REVISION", nargs="?", doc="""rerun command(s) in `revision`. By default, the command from this commit will be executed, but [CMD: --since CMD][PY: `since` PY] can be used to construct a revision range.""", default="HEAD", constraints=EnsureStr()), since=Parameter( args=("--since",), doc="""If `since` is a commit-ish, the commands from all commits that are reachable from `revision` but not `since` will be re-executed (in other words, the commands in :command:`git log SINCE..REVISION`). If SINCE is an empty string, it is set to the parent of the first commit that contains a recorded command (i.e., all commands in :command:`git log REVISION` will be re-executed).""", constraints=EnsureStr() | EnsureNone()), branch=Parameter( metavar="NAME", args=("-b", "--branch",), doc="create and checkout this branch before rerunning the commands.", constraints=EnsureStr() | EnsureNone()), onto=Parameter( metavar="base", args=("--onto",), doc="""start point for rerunning the commands. If not specified, commands are executed at HEAD. This option can be used to specify an alternative start point, which will be checked out with the branch name specified by [CMD: --branch CMD][PY: `branch` PY] or in a detached state otherwise.
As a special case, an empty value for this option means to use the commit specified by [CMD: --since CMD][PY: `since` PY].""", constraints=EnsureStr() | EnsureNone()), message=Parameter( args=("-m", "--message",), metavar="MESSAGE", doc="""use MESSAGE for the reran commit rather than the recorded commit message. In the case of a multi-commit rerun, all the reran commits will have this message.""", constraints=EnsureStr() | EnsureNone()), script=Parameter( args=("--script",), metavar="FILE", doc="""extract the commands into [CMD: FILE CMD][PY: this file PY] rather than rerunning. Use - to write to stdout instead. [CMD: This option implies --report. CMD]""", constraints=EnsureStr() | EnsureNone()), dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset from which to rerun a recorded command. If no dataset is given, an attempt is made to identify the dataset based on the current working directory. If a dataset is given, the command will be executed in the root directory of this dataset.""", constraints=EnsureDataset() | EnsureNone()), report=Parameter( args=("--report",), action="store_true", doc="""Don't actually re-execute anything, just display what would be done. [CMD: Note: If you give this option, you most likely want to set --output-format to 'json' or 'json_pp'. CMD]"""), ) @staticmethod @datasetmethod(name='rerun') @eval_results def __call__( revision="HEAD", since=None, dataset=None, branch=None, message=None, onto=None, script=None, report=False): ds = require_dataset( dataset, check_installed=True, purpose='rerunning a command') lgr.debug('rerunning command output underneath %s', ds) if script is None and not report and ds.repo.dirty: yield get_status_dict( 'run', ds=ds, status='impossible', message=( 'clean dataset required to detect changes from command; ' 'use `datalad status` to inspect unsaved changes')) return if not ds.repo.get_hexsha(): yield get_status_dict( 'run', ds=ds, status='impossible', message='cannot rerun command, nothing recorded') return if branch and branch in ds.repo.get_branches(): yield get_status_dict( "run", ds=ds, status="error", message="branch '{}' already exists".format(branch)) return if not ds.repo.commit_exists(revision + "^"): # Only a single commit is reachable from `revision`. In # this case, --since has no effect on the range construction. revrange = revision elif since is None: revrange = "{rev}^..{rev}".format(rev=revision) elif since.strip() == "": revrange = revision else: revrange = "{}..{}".format(since, revision) if ds.repo.repo.git.rev_list("--merges", revrange, "--"): yield get_status_dict( "run", ds=ds, status="error", message="cannot rerun history with merge commits") return results = _rerun_as_results(ds, revrange, since, branch, onto, message) if script: handler = _get_script_handler(script, since, revision) elif report: handler = _report else: handler = _rerun for res in handler(ds, results): yield res
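# The revision-range construction in Rerun.__call__ condenses to the following
# illustrative helper (documentation only, mirroring the branching above):
#
#   def _revrange(revision, since, parent_exists):
#       if not parent_exists:      # only a single commit is reachable
#           return revision
#       if since is None:          # just the one recorded commit
#           return "{rev}^..{rev}".format(rev=revision)
#       if since.strip() == "":    # everything reachable from `revision`
#           return revision
#       return "{}..{}".format(since, revision)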
class Init(Interface): """Initialize an existing dataset to track an XNAT project """ _examples_ = [ dict(text='Initialize a dataset in the current directory', code_cmd='datalad xnat-init http://central.xnat.org:8080', code_py='xnat_init("http://central.xnat.org:8080")'), ] _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), metavar='DATASET', doc="""specify the dataset to perform the initialization on""", constraints=EnsureDataset() | EnsureNone()), url=Parameter( args=("url", ), doc="""XNAT instance URL""", ), project=Parameter( args=( "-p", "--project", ), doc="""name of an XNAT project to track""", ), path=Parameter( args=( "-O", "--path", ), doc="""Specify the directory structure for the downloaded files, and if/where a subdataset should be created. To include the subject, session, or scan values, use the following format: {subject}/{session}/{scan}/ To insert a subdataset at a specific directory level use '//': {subject}/{session}//{scan}/""", ), force=Parameter(args=( "-f", "--force", ), doc="""force (re-)initialization""", action='store_true'), ) @staticmethod @datasetmethod(name='xnat_init') @eval_results def __call__(url, path="{subject}/{session}/{scan}/", project=None, force=False, dataset=None): from pyxnat import Interface as XNATInterface ds = require_dataset(dataset, check_installed=True, purpose='initialization') config = ds.config path = with_pathsep(path) # prep for yield res = dict( action='xnat_init', path=ds.path, type='dataset', logger=lgr, refds=ds.path, ) # obtain user credentials, use simplified/stripped URL as identifier # given we don't have more knowledge than the user, do not # give a `url` to provide hints on how to obtain credentials parsed_url = urlparse(url) no_proto_url = '{}{}'.format(parsed_url.netloc, parsed_url.path).replace(' ', '') cred = UserPassword(name=no_proto_url, url=None)() xn = XNATInterface(server=url, **cred) # now we make a simple request to obtain the server version # we don't care much, but if the URL or the credentials are wrong # we will not get to see one try: xnat_version = xn.version() lgr.debug("XNAT server version is %s", xnat_version) except Exception as e: yield dict( res, status='error', message=('Failed to access the XNAT server. Full error:\n%s', e), ) return if project is None: from datalad.ui import ui projects = xn.select.projects().get() ui.message('No project name specified. The following projects are ' 'available on {} for user {}:'.format( url, cred['user'])) for p in sorted(projects): # list and prep for C&P # TODO multi-column formatting? ui.message(" {}".format(quote_cmdlinearg(p))) return # query the specified project to make sure it exists and is accessible proj = xn.select.project(project) try: nsubj = len(proj.subjects().get()) except Exception as e: yield dict( res, status='error', message=( 'Failed to obtain information on project %s from XNAT. 
' 'Full error:\n%s', project, e), ) return lgr.info('XNAT reports %i subjects currently on-record for project %s', nsubj, project) # check if dataset already initialized auth_dir = ds.pathobj / '.datalad' / 'providers' if auth_dir.exists() and not force: yield dict( res, status='error', message='Dataset found already initialized, ' 'use `force` to reinitialize', ) return # put essential configuration into the dataset config.set('datalad.xnat.default.url', url, where='dataset', reload=False) config.set('datalad.xnat.default.project', project, where='dataset') config.set('datalad.xnat.default.path', path, where='dataset') ds.save( path=ds.pathobj / '.datalad' / 'config', to_git=True, message="Configure default XNAT url and project", ) # Configure XNAT access authentication ds.run_procedure(spec='cfg_xnat_dataset') yield dict( res, status='ok', ) return
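# How the `path` template above is meant to be read: directory levels before
# a '//' marker belong to the superdataset, levels after it go into a
# subdataset. The helper below is hypothetical and only demonstrates the
# convention; the real parsing happens elsewhere in the extension:
#
#   def _split_on_subdataset_marker(path_template):
#       # "{subject}/{session}//{scan}/" -> ("{subject}/{session}", "{scan}/")
#       super_part, _, sub_part = path_template.partition('//')
#       return super_part, sub_part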
class CreateSiblingOSF(Interface): """Create a dataset representation at OSF. This will create a node on OSF and initialize an osf special remote to point to it. There are two modes this can operate in: 'annex' and 'export'. The former uses the OSF node as a key-value store, that can be used by git-annex to copy data to and retrieve data from (potentially by any clone of the original dataset). The latter allows using 'git annex export' to publish a snapshot of a particular version of the dataset. Such an OSF node will - in contrast to the 'annex' mode - be human-readable. For authentication with OSF, you can define environment variables: Either 'OSF_TOKEN', or both 'OSF_USERNAME' and 'OSF_PASSWORD'. If neither of these is defined, the tool will fall back to the datalad credential manager and inquire for credentials interactively. """ result_renderer = 'tailored' _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""Dataset to create a sibling for. If no further constraining path is given, metadata is extracted from all files of the dataset.""", constraints=EnsureDataset() | EnsureNone() ), title=Parameter( args=("--title",), doc="""title of the to-be-created OSF node that is displayed on the OSF website. Defaults to the basename of the root directory of the local dataset.""", constraints=EnsureStr() | EnsureNone(), ), name=Parameter( args=("-s", "--name",), doc="""Name of the to-be-initialized osf-special-remote""", constraints=EnsureStr() ), storage_name=Parameter( args=("--storage-name",), metavar="NAME", doc="""Name of the storage sibling (git-annex special remote). Must not be identical to the sibling name. If not specified, defaults to the sibling name plus '-storage' suffix.""", constraints=EnsureStr() | EnsureNone()), existing=Parameter( args=("--existing",), constraints=EnsureChoice( 'skip', 'error') | EnsureNone(), metavar='MODE', doc="""Action to perform, if a (storage) sibling is already configured under the given name and/or a target already exists. In this case, a dataset can be skipped ('skip'), or the command be instructed to fail ('error').""", ), trust_level=Parameter( args=("--trust-level",), metavar="TRUST-LEVEL", constraints=EnsureChoice( 'trust', 'semitrust', 'untrust') | EnsureNone(), doc="""specify a trust level for the storage sibling. If not specified, the default git-annex trust level is used.""",), mode=Parameter( args=("--mode",), doc="""mode of operation: with 'annex', the OSF node serves as a git-annex key-value store for a regular Git sibling; 'export' additionally allows 'git annex export' of a dataset snapshot; 'exportonly' creates only the export-capable storage sibling; 'gitonly' creates only the Git sibling without annex storage""", constraints=EnsureChoice( "annex", "export", "exportonly", "gitonly") ), tags=Parameter( args=('--tag',), dest='tags', metavar='TAG', doc="""specify one or more tags for the to-be-created OSF node. A tag 'DataLad dataset' and the dataset ID (if there is any) will be automatically added as additional tags. [CMD: This option can be given more than once CMD].""", action='append', ), public=Parameter( args=("--public",), doc="""make OSF node public""", action='store_true', ), category=Parameter( args=("--category",), doc="""specify the OSF node category to be used for the node. The categorization determines what icon is displayed with the node on the OSF, and helps with search organization""", # all presently supported categories constraints=EnsureChoice( "analysis", "communication", "data", "hypothesis", "instrumentation", "methods and measures", "procedure", "project", "software", "other") ), description=Parameter( args=("--description",), metavar="TEXT", doc="""Description of the OSF node that will be displayed on the associated project page.
By default a description will be generated based on the mode the sibling is put into.""", constraints=EnsureStr() | EnsureNone()), ) @staticmethod @datasetmethod(name='create_sibling_osf') @eval_results def __call__(title=None, name="osf", storage_name=None, dataset=None, mode="annex", existing='error', trust_level=None, tags=None, public=False, category='data', description=None, ): ds = require_dataset(dataset, purpose="create OSF remote", check_installed=True) res_kwargs = dict( ds=ds, action="create-sibling-osf", logger=lgr, ) # we need an annex if not isinstance(ds.repo, AnnexRepo): yield get_status_dict( type="dataset", status="impossible", message="dataset has no annex", **res_kwargs) return # NOTES: # - we prob. should check osf-special-remote availability upfront to # fail early # - add --recursive option # - recursive won't work easily. Need to think that through. # - would need a naming scheme for subdatasets # - flat on OSF or a tree? # - how do we detect something is there already, so we can skip # rather than duplicate (with a new name)? # osf-type-special-remote sufficient to decide it's not needed? # - adapt to conclusions in issue #30 # -> create those subcomponents # - results need to report URL for created projects suitable for datalad # output formatting! # -> result_renderer # -> needs to be returned by create_node if not storage_name: storage_name = "{}-storage".format(name) sibling_conflicts = sibling_exists( ds, [name, storage_name], # TODO pass through recursive=False, recursion_limit=None, # fail fast, if error is desired exhaustive=existing == 'error', ) if existing == 'error' and sibling_conflicts: # we only asked for one conflict = sibling_conflicts[0] yield get_status_dict( status='error', message=( "a sibling '%s' is already configured in dataset %s", conflict[1], conflict[0]), **res_kwargs, ) return if title is None: # use dataset root basename title = ds.pathobj.name tags = ensure_list(tags) if 'DataLad dataset' not in tags: tags.append('DataLad dataset') if ds.id and ds.id not in tags: tags.append(ds.id) if not description: description = \ "This component was built from a DataLad dataset using the " \ "datalad-osf extension " \ "(https://github.com/datalad/datalad-osf)." if mode != 'exportonly': description += \ " With this extension installed, this component can be " \ "git or datalad cloned from an 'osf://ID' URL, where " \ "'ID' is the OSF node ID that is shown in the OSF HTTP " \ "URL, e.g. 
https://osf.io/q8xnk/ can be cloned from " \ "osf://q8xnk" cred = get_credentials(allow_interactive=True) osf = OSF(**cred) node_id, node_url = create_node( osf_session=osf.session, title=title, category=category, tags=tags if tags else None, public=EnsureBool()(public), description=description, ) if mode != 'gitonly': init_opts = ["encryption=none", "type=external", "externaltype=osf", "autoenable=true", "node={}".format(node_id)] if mode in ("export", "exportonly"): init_opts += ["exporttree=yes"] ds.repo.init_remote(storage_name, options=init_opts) if trust_level: ds.repo.call_git(['annex', trust_level, storage_name]) yield get_status_dict( type="dataset", url=node_url, id=node_id, name=storage_name, status="ok", **res_kwargs ) if mode == 'exportonly': return ds.config.set( 'remote.{}.annex-ignore'.format(name), 'true', where='local') yield from ds.siblings( # use configure, not add, to not trip over the config that # we just made action='configure', name=name, url='osf://{}'.format(node_id), fetch=False, publish_depends=storage_name if mode != 'gitonly' else None, recursive=False, result_renderer=None, ) @staticmethod def custom_result_renderer(res, **kwargs): from datalad.ui import ui if res['action'] == "create-sibling-osf": msg = res.get('message', None) ui.message("{action}({status}): {url}{msg}".format( action=ac.color_word(res['action'], ac.BOLD), status=ac.color_status(res['status']), url=res.get('url', ''), msg=' [{}]'.format(msg[0] % msg[1:] if isinstance(msg, tuple) else res['message']) if msg else '') ) elif res['action'] == "add-sibling-osf": ui.message("{action}({status})".format( action=ac.color_word(res['action'], ac.BOLD), status=ac.color_status(res['status'])) ) else: from datalad.interface.utils import default_result_renderer default_result_renderer(res)
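# Illustrative use of the command defined above (dataset path, title, and
# sibling names are made up):
#
#   from datalad.api import Dataset
#
#   ds = Dataset('/tmp/my-dataset')
#   # creates Git sibling 'osf' plus storage sibling 'osf-storage'
#   ds.create_sibling_osf(title='My dataset', name='osf', mode='annex',
#                         public=False, category='data')
#   # afterwards, `datalad push --to osf` publishes Git history and annexed data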
class Configuration(Interface): """Get and set dataset, dataset-clone-local, or global configuration .. note:: This command was introduced with DataLad 0.14 and might still see API and behavior changes in future releases. This command works similarly to git-config, but some features are not supported (e.g., modifying system configuration), while other features are not available in git-config (e.g., multi-configuration queries). Query and modification of three distinct configuration scopes is supported: - 'dataset': the persistent configuration in .datalad/config of a dataset - 'local': a dataset clone's Git repository configuration in .git/config - 'global': non-dataset-specific configuration (usually in $HOME/.gitconfig) Modifications of the persistent 'dataset' configuration will not be saved by this command, but have to be committed with a subsequent `save` call. Rules of precedence regarding different configuration scopes are the same as in Git, with two exceptions: 1) environment variables can be used to override any datalad configuration, and have precedence over any other configuration scope (see below). 2) the 'dataset' scope is considered in addition to the standard git configuration scopes. Its content has lower precedence than Git configuration scopes, but it is committed to a dataset, hence can be used to ship (default and branch-specific) configuration with a dataset. Any DATALAD_* environment variable is also mapped to a configuration item. Their values take precedence over any other specification. In variable names '_' encodes a '.' in the configuration name, and '__' encodes a '-', such that 'DATALAD_SOME__VAR' is mapped to 'datalad.some-var'. Recursive operation is supported for querying and modifying configuration across a hierarchy of datasets. """ _examples_ = [ dict( text="Dump the effective configuration, including an annotation for common items", code_py="configuration()", code_cmd="datalad configuration"), dict(text="Query two configuration items", code_py="configuration('get', ['user.name', 'user.email'])", code_cmd="datalad configuration get user.name user.email"), dict( text="Recursively set configuration in all (sub)dataset repositories", code_py="configuration('set', [('my.config.name', 'value')], recursive=True)", code_cmd="datalad configuration -r set my.config=value"), dict( text="Modify the persistent dataset configuration (changes are not committed)", code_py="configuration('set', [('my.config.name', 'value')], scope='dataset')", code_cmd="datalad configuration --scope dataset set my.config=value" ), ] result_renderer = 'tailored' _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to query or to configure""", constraints=EnsureDataset() | EnsureNone()), action=Parameter(args=("action", ), nargs='?', doc="""which action to perform""", constraints=EnsureChoice(*config_actions)), scope=Parameter(args=("--scope", ), doc="""scope for getting or setting configuration. If no scope is declared for a query, all configuration sources (including overrides via environment variables) are considered according to the normal rules of precedence. For action 'get' only 'dataset' and 'local' (which includes 'global' here) are supported.
For action 'dump', a scope selection is ignored and all scopes are considered.""", constraints=EnsureChoice('global', 'local', 'dataset', None)), spec=Parameter( args=("spec", ), doc="""configuration name (for actions 'get' and 'unset'), or name/value pair (for action 'set')""", nargs='*', metavar='name[=value]'), recursive=recursion_flag, recursion_limit=recursion_limit, ) @staticmethod @datasetmethod(name='configuration') @eval_results def __call__(action='dump', spec=None, scope=None, dataset=None, recursive=False, recursion_limit=None): # check conditions # - global and recursion make no sense if action == 'dump': if spec: raise ValueError( 'Configuration selection is not supported for dumping') if scope: raise ValueError( 'Scope selection is not supported for dumping') # normalize variable specifications specs = [] for s in ensure_list(spec): if isinstance(s, tuple): specs.append((str(s[0]), str(s[1]))) elif '=' not in s: specs.append((str(s), )) else: specs.append(tuple(s.split('=', 1))) if action == 'set': missing_values = [s[0] for s in specs if len(s) < 2] if missing_values: raise ValueError( 'Values must be provided for all configuration ' 'settings. Missing: {}'.format(missing_values)) invalid_names = [s[0] for s in specs if '.' not in s[0]] if invalid_names: raise ValueError( 'Name must contain a section (i.e. "section.name"). ' 'Invalid: {}'.format(invalid_names)) ds = None if scope != 'global' or recursive: ds = require_dataset(dataset, check_installed=True, purpose='configuration') res_kwargs = dict( action='configuration', logger=lgr, ) if ds: res_kwargs['refds'] = ds.path yield from configuration(action, scope, specs, res_kwargs, ds) if not recursive: return for subds in ds.subdatasets(fulfilled=True, recursive=True, recursion_limit=recursion_limit, on_failure='ignore', result_renderer='disabled'): yield from configuration(action, scope, specs, res_kwargs, Dataset(subds['path'])) @staticmethod def custom_result_renderer(res, **kwargs): if (res['status'] != 'ok' or res['action'] not in ('get_configuration', 'dump_configuration')): if 'message' not in res and 'name' in res: suffix = '={}'.format(res['value']) if 'value' in res else '' res['message'] = '{}{}'.format(res['name'], suffix) default_result_renderer(res) return # TODO source from datalad.ui import ui name = res['name'] if res['action'] == 'dump_configuration': for key in ('purpose', 'description'): s = res.get(key) if s: ui.message('\n'.join( wrap( s, initial_indent='# ', subsequent_indent='# ', ))) if kwargs.get('recursive', False): have_subds = res['path'] != res['refds'] # we need to mark up from which dataset results are reported prefix = '<ds>{}{}:'.format( '/' if have_subds else '', Path(res['path']).relative_to(res['refds']).as_posix() if have_subds else '', ) else: prefix = '' if kwargs.get('action', None) == 'dump': ui.message('{}{}={}'.format( prefix, ac.color_word(name, ac.BOLD), res['value'] if res['value'] is not None else '', )) else: ui.message('{}{}'.format( prefix, res['value'] if res['value'] is not None else '', ))
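# The DATALAD_* environment variable mapping described in the docstring can be
# sketched as follows (hypothetical helper, shown for clarity only; note that
# '__' must be translated before '_'):
#
#   def _env2cfgname(varname):
#       # 'DATALAD_SOME__VAR' -> 'datalad.some-var'
#       body = varname[len('DATALAD_'):].lower()
#       return 'datalad.' + body.replace('__', '-').replace('_', '.')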
class WTF(Interface): """Generate a report about the DataLad installation and configuration IMPORTANT: Sharing this report with untrusted parties (e.g. on the web) should be done with care, as it may include identifying information, and/or credentials or access tokens. """ result_renderer = 'tailored' from datalad.support.param import Parameter from datalad.distribution.dataset import datasetmethod from datalad.interface.utils import eval_results from datalad.distribution.dataset import EnsureDataset from datalad.support.constraints import EnsureNone, EnsureChoice _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to report on. If no dataset is given, an attempt is made to identify the dataset based on the current working directory.""", constraints=EnsureDataset() | EnsureNone()), sensitive=Parameter( args=( "-s", "--sensitive", ), constraints=EnsureChoice(None, 'some', 'all'), doc="""if set to 'some' or 'all', it will display sections such as config and metadata which could potentially contain sensitive information (credentials, names, etc.). If 'some', the fields which are known to be sensitive will still be masked out"""), sections=Parameter( args=("-S", "--section"), action='append', dest='sections', metavar="SECTION", constraints=EnsureChoice(*sorted(SECTION_CALLABLES)) | EnsureNone(), doc="""section to include. If not set, all sections. [CMD: This option can be given multiple times. CMD]"""), decor=Parameter( args=("-D", "--decor"), constraints=EnsureChoice('html_details') | EnsureNone(), doc="""decoration around the rendering to facilitate embedding into issues etc, e.g. use 'html_details' for posting a collapsible entry to GitHub issues."""), clipboard=Parameter( args=( "-c", "--clipboard", ), action="store_true", doc="""if set, do not print but copy to clipboard (requires pyperclip module)"""), ) @staticmethod @datasetmethod(name='wtf') @eval_results def __call__(dataset=None, sensitive=None, sections=None, decor=None, clipboard=None): from datalad.distribution.dataset import require_dataset from datalad.support.exceptions import NoDatasetFound from datalad.interface.results import get_status_dict ds = None try: ds = require_dataset(dataset, check_installed=False, purpose='reporting') except NoDatasetFound: # failure is already logged pass if ds and not ds.is_installed(): # warn that the dataset is bogus yield dict( action='wtf', path=ds.path, status='impossible', message=('No dataset found at %s. Reporting on the dataset is ' 'not attempted.', ds.path), logger=lgr) # we don't deal with absent datasets ds = None if sensitive: if ds is None: from datalad import cfg else: cfg = ds.config else: cfg = None from datalad.ui import ui from datalad.support.external_versions import external_versions infos = OrderedDict() res = get_status_dict( action='wtf', path=ds.path if ds else assure_unicode(op.abspath(op.curdir)), type='dataset' if ds else 'directory', status='ok', logger=lgr, decor=decor, infos=infos, ) # Define section callables which require variables.
# copy, so there is no side effect on the module-level original section_callables = SECTION_CALLABLES.copy() section_callables['location'] = partial(_describe_location, res) section_callables['configuration'] = \ partial(_describe_configuration, cfg, sensitive) if ds: section_callables['dataset'] = \ partial(_describe_dataset, ds, sensitive) else: section_callables.pop('dataset') assert all(section_callables.values()) # check if none was missed if sections is None: sections = sorted(list(section_callables)) for s in sections: infos[s] = section_callables[s]() if clipboard: external_versions.check( 'pyperclip', msg="It is needed to be able to use clipboard") import pyperclip report = _render_report(res) pyperclip.copy(report) ui.message("WTF information of length %s copied to clipboard" % len(report)) yield res return @staticmethod def custom_result_renderer(res, **kwargs): from datalad.ui import ui out = _render_report(res) ui.message(out)
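# Typical invocations of the command above (illustrative only):
#
#   from datalad.api import wtf
#
#   # full report on the current dataset/directory
#   wtf()
#   # restrict to selected sections and wrap for pasting into a GitHub issue
#   wtf(sections=['configuration'], decor='html_details')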
class AddArchiveContent(Interface): """Add content of an archive under git annex control. Given an already annex'ed archive, extract and add its files to the dataset, and reference the original archive as a custom special remote. """ _examples_ = [ dict(text="""Add files from the archive 'big_tarball.tar.gz', but keep big_tarball.tar.gz in the index""", code_py="add_archive_content(path='big_tarball.tar.gz')", code_cmd="datalad add-archive-content big_tarball.tar.gz"), dict(text="""Add files from the archive 'big_tarball.tar.gz', and remove big_tarball.tar.gz from the index""", code_py="add_archive_content(path='big_tarball.tar.gz', delete=True)", code_cmd="datalad add-archive-content big_tarball.tar.gz --delete"), dict(text="""Add files from the archive 's3.zip' but remove the leading directory""", code_py="add_archive_content(path='s3.zip', strip_leading_dirs=True)", code_cmd="datalad add-archive-content s3.zip --strip-leading-dirs"), ] # XXX prevent common args from being added to the docstring _no_eval_results = True _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to save""", constraints=EnsureDataset() | EnsureNone()), delete=Parameter( args=("-D", "--delete"), action="store_true", doc="""delete original archive from the filesystem/Git in current tree. %s""" % _KEY_OPT_NOTE), add_archive_leading_dir=Parameter( args=("--add-archive-leading-dir",), action="store_true", doc="""place extracted content under a directory which would correspond to the archive name with all suffixes stripped. E.g. the content of `archive.tar.gz` will be extracted under `archive/`"""), strip_leading_dirs=Parameter( args=("--strip-leading-dirs",), action="store_true", doc="""remove one or more leading directories from the archive layout on extraction"""), leading_dirs_depth=Parameter( args=("--leading-dirs-depth",), action="store", type=int, doc="""maximum depth of leading directories to strip. If not specified (None), no limit"""), leading_dirs_consider=Parameter( args=("--leading-dirs-consider",), action="append", doc="""regular expression(s) for directories to consider to strip away""", constraints=EnsureStr() | EnsureNone(), ), use_current_dir=Parameter( args=("--use-current-dir",), action="store_true", doc="""extract the archive under the current directory, not the directory where the archive is located. This parameter is applied automatically if [PY: `key=True` PY][CMD: --key CMD] was used."""), # TODO: add option to extract under archive's original directory. Currently would extract in curdir existing=Parameter( args=("--existing",), choices=('fail', 'overwrite', 'archive-suffix', 'numeric-suffix'), default="fail", doc="""what operation to perform if a file from an archive tries to overwrite an existing file with the same name. 'fail' (default) leads to an error result, 'overwrite' silently replaces the existing file, 'archive-suffix' instructs to add a suffix (prefixed with a '-') matching the name of the archive from which the file gets extracted, and if that one is present as well, 'numeric-suffix' comes into effect in addition: an incremental numeric suffix (prefixed with a '.') is added until no name collision is detected anymore""" ), exclude=Parameter( args=("-e", "--exclude"), action='append', doc="""regular expressions for filenames which to exclude from being added to annex. Applied after --rename if that one is specified.
For exact matching, use anchoring""", constraints=EnsureStr() | EnsureNone() ), rename=Parameter( args=("-r", "--rename"), action='append', doc="""regular expressions to rename files before adding them to Git. The first character defines how to split the provided string into two parts: a Python regular expression (with groups), and a replacement string""", constraints=EnsureStr(min_len=2) | EnsureNone() ), annex_options=Parameter( args=("-o", "--annex-options"), doc="""additional options to pass to git-annex """, constraints=EnsureStr() | EnsureNone() ), annex=Parameter( doc="""DEPRECATED. Use the 'dataset' parameter instead.""" ), # TODO: Python only! stats=Parameter( doc="""ActivityStats instance for global tracking""", ), key=Parameter( args=("--key",), action="store_true", doc="""signal that the provided archive is not actually a filename on its own but an annex key. The archive will be extracted in the current directory."""), copy=Parameter( args=("--copy",), action="store_true", doc="""copy the content of the archive instead of moving"""), allow_dirty=allow_dirty, commit=Parameter( args=("--no-commit",), action="store_false", dest="commit", doc="""don't commit upon completion"""), drop_after=Parameter( args=("--drop-after",), action="store_true", doc="""drop extracted files after adding to annex""", ), delete_after=Parameter( args=("--delete-after",), action="store_true", doc="""extract under a temporary directory, git-annex add, and delete afterwards. To be used to "index" files within annex without actually creating corresponding files under git. Note that `annex dropunused` would later remove that load"""), # TODO: interaction with archives cache whenever we make it persistent across runs archive=Parameter( args=("archive",), doc="archive file or a key (if %s specified)" % _KEY_OPT, constraints=EnsureStr()), ) @staticmethod @datasetmethod(name='add_archive_content') @eval_results def __call__( archive, *, dataset=None, annex=None, add_archive_leading_dir=False, strip_leading_dirs=False, leading_dirs_depth=None, leading_dirs_consider=None, use_current_dir=False, delete=False, key=False, exclude=None, rename=None, existing='fail', annex_options=None, copy=False, commit=True, allow_dirty=False, stats=None, drop_after=False, delete_after=False): if exclude: exclude = ensure_tuple_or_list(exclude) if rename: rename = ensure_tuple_or_list(rename) ds = require_dataset(dataset, check_installed=True, purpose='add-archive-content') # set up common params for result records res_kwargs = { 'action': 'add-archive-content', 'logger': lgr, } if not isinstance(ds.repo, AnnexRepo): yield get_status_dict( ds=ds, status='impossible', message="Can't operate in a pure Git repository", **res_kwargs ) return if annex: warnings.warn( "datalad add_archive_content's `annex` parameter is " "deprecated and will be removed in a future release. " "Use the 'dataset' parameter instead.", DeprecationWarning) annex = ds.repo # get the archive path relative to the ds root archive_path = resolve_path(archive, ds=dataset) # let Status decide whether we can act on the given file for s in ds.status( path=archive_path, on_failure='ignore', result_renderer='disabled'): if s['status'] == 'error': if 'path not underneath the reference dataset %s' in s['message']: yield get_status_dict( ds=ds, status='impossible', message='Cannot add an archive outside of the dataset', **res_kwargs) return # status errored & we haven't anticipated the cause.
Bubble up yield s return elif s['state'] == 'untracked': # we can't act on an untracked file message = ( "Can not add an untracked archive. " "Run 'datalad save {}'".format(archive) ) yield get_status_dict( ds=ds, status='impossible', message=message, **res_kwargs) return if not allow_dirty and annex.dirty: # error out here if the dataset contains untracked changes yield get_status_dict( ds=ds, status='impossible', message=( 'clean dataset required. ' 'Use `datalad status` to inspect unsaved changes'), **res_kwargs ) return # ensure the archive exists, status doesn't error on a non-existing file if not key and not lexists(archive_path): yield get_status_dict( ds=ds, status='impossible', message=( 'No such file: {}'.format(archive_path), ), **res_kwargs ) return if not key: check_path = archive_path.relative_to(ds.pathobj) # TODO: support adding archives content from outside the annex/repo origin = 'archive' # can become get_file_annexinfo once #6104 is merged key = annex.get_file_annexinfo(check_path)['key'] if not key: raise RuntimeError( f"Archive must be an annexed file in {ds}") archive_dir = Path(archive_path).parent else: origin = 'key' key = archive # We must not have anything to do with the location under .git/annex archive_dir = None # instead, we will go from the current directory use_current_dir = True archive_basename = file_basename(archive) if not key: # if we didn't manage to get a key, the file must be in Git raise NotImplementedError( "Provided file %s does not seem to be under annex control. " "We don't support adding everything straight to Git" % archive ) # figure out our location pwd = getpwd() # are we in a subdirectory of the repository? pwd_in_root = annex.path == archive_dir # then we should add content under that subdirectory, # get the path relative to the repo top if use_current_dir: # extract the archive under the current directory, not the directory # where the archive is located extract_rpath = Path(pwd).relative_to(ds.path) \ if not pwd_in_root \ else None else: extract_rpath = archive_dir.relative_to(ds.path) # relpath might return '.' as the relative path to curdir, which then normalize_paths # would take as instructions to really go from cwd, so we need to sanitize if extract_rpath == curdir: extract_rpath = None try: key_rpath = annex.get_contentlocation(key) except: # the only probable reason for this to fail is that there is no # content present raise RuntimeError( "Content of %s seems to be N/A. Fetch it first" % key ) # now we simply need to go through every file in that archive and lgr.info( "Adding content of the archive %s into annex %s", archive, annex ) from datalad.customremotes.archives import ArchiveAnnexCustomRemote # TODO: shouldn't we be able just to pass existing AnnexRepo instance? 
# TODO: we will use persistent cache so we could just (ab)use possibly extracted archive # OK, let's ignore that the following class is actually a special # remote implementation, and use it only to work with its cache annexarchive = ArchiveAnnexCustomRemote(annex=None, path=annex.path, persistent_cache=True) # We will move extracted content so it must not exist prior running annexarchive.cache.allow_existing = True earchive = annexarchive.cache[key_rpath] # make sure there is an enabled datalad-archives special remote ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE, autoenable=True) precommitted = False old_always_commit = annex.always_commit # batch mode is disabled when faking dates, we want to always commit annex.always_commit = annex.fake_dates_enabled if annex_options: if isinstance(annex_options, str): annex_options = split_cmdline(annex_options) delete_after_rpath = None prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad", dir=annex.path)) \ if delete_after \ else None # dedicated stats which would be added to passed in (if any) outside_stats = stats stats = ActivityStats() try: # keep track of extracted files for progress bar logging file_counter = 0 # iterative over all files in the archive extracted_files = list(earchive.get_extracted_files()) # start a progress bar for extraction pbar_id = f'add-archive-{archive_path}' log_progress( lgr.info, pbar_id, 'Extracting archive', label="Extracting archive", unit=' Files', total = len(extracted_files), noninteractive_level = logging.INFO) for extracted_file in extracted_files: file_counter += 1 files_left = len(extracted_files) - file_counter log_progress( lgr.info, pbar_id, "Files to extract %i ", files_left, update=1, increment=True, noninteractive_level=logging.DEBUG) stats.files += 1 extracted_path = Path(earchive.path) / Path(extracted_file) if extracted_path.is_symlink(): link_path = str(extracted_path.resolve()) if not exists(link_path): # TODO: config addarchive.symlink-broken='skip' lgr.warning( "Path %s points to non-existing file %s" % (extracted_path, link_path) ) stats.skipped += 1 continue # TODO: check if points outside of archive - warn & skip url = annexarchive.get_file_url( archive_key=key, file=extracted_file, size=os.stat(extracted_path).st_size) # preliminary target name which might get modified by renames target_file_orig = target_file = Path(extracted_file) # stream archives would not have had the original filename # information in them, so would be extracted under a name # derived from their annex key. # Provide ad-hoc handling for such cases if (len(extracted_files) == 1 and Path(archive).suffix in ('.xz', '.gz', '.lzma') and Path(key_rpath).name.startswith(Path( extracted_file).name)): # take archive's name without extension for filename & place # where it was originally extracted target_file = \ Path(extracted_file).parent / Path(archive).stem if strip_leading_dirs: leading_dir = earchive.get_leading_directory( depth=leading_dirs_depth, exclude=exclude, consider=leading_dirs_consider) leading_dir_len = \ len(leading_dir) + len(opsep) if leading_dir else 0 target_file = str(target_file)[leading_dir_len:] if add_archive_leading_dir: # place extracted content under a directory corresponding to # the archive name with suffix stripped. 
target_file = Path(archive_basename) / target_file if rename: target_file = apply_replacement_rules(rename, str(target_file)) # continue to next iteration if extracted_file in excluded if exclude: try: # since we need to skip outside loop from inside loop for regexp in exclude: if re.search(regexp, extracted_file): lgr.debug( "Skipping {extracted_file} since contains " "{regexp} pattern".format(**locals())) stats.skipped += 1 raise StopIteration except StopIteration: continue if delete_after: # place target file in a temporary directory target_file = Path(prefix_dir) / Path(target_file) # but also allow for it in the orig target_file_orig = Path(prefix_dir) / Path(target_file_orig) target_file_path_orig = annex.pathobj / target_file_orig # If we were invoked in a subdirectory, patch together the # correct path target_file_path = extract_rpath / target_file \ if extract_rpath else target_file target_file_path = annex.pathobj / target_file_path # when the file already exists... if lexists(target_file_path): handle_existing = True if md5sum(str(target_file_path)) == \ md5sum(str(extracted_path)): if not annex.is_under_annex(str(extracted_path)): # if under annex -- must be having the same content, # we should just add possibly a new extra URL # but if under git -- we cannot/should not do # anything about it ATM if existing != 'overwrite': continue else: handle_existing = False if not handle_existing: pass # nothing... just to avoid additional indentation elif existing == 'fail': message = \ "{} exists, but would be overwritten by new file " \ "{}. Consider adjusting --existing".format\ (target_file_path, extracted_file) yield get_status_dict( ds=ds, status='error', message=message, **res_kwargs) return elif existing == 'overwrite': stats.overwritten += 1 # to make sure it doesn't conflict -- might have been a # tree rmtree(target_file_path) else: # an elaborate dance to piece together new archive names target_file_path_orig_ = target_file_path # To keep extension intact -- operate on the base of the # filename p, fn = os.path.split(target_file_path) ends_with_dot = fn.endswith('.') fn_base, fn_ext = file_basename(fn, return_ext=True) if existing == 'archive-suffix': fn_base += '-%s' % archive_basename elif existing == 'numeric-suffix': pass # archive-suffix will have the same logic else: # we shouldn't get here, argparse should catch a # non-existing value for --existing right away raise ValueError(existing) # keep incrementing index in the suffix until file # doesn't collide suf, i = '', 0 while True: connector = \ ('.' if (fn_ext or ends_with_dot) else '') file = fn_base + suf + connector + fn_ext target_file_path_new = \ Path(p) / Path(file) if not lexists(target_file_path_new): # we found a file name that is not yet taken break lgr.debug("Iteration %i of file name finding. 
" "File %s already exists", i, target_file_path_new) i += 1 suf = '.%d' % i target_file_path = target_file_path_new lgr.debug("Original file %s will be saved into %s" % (target_file_path_orig_, target_file_path)) # TODO: should we reserve smth like # stats.clobbed += 1 if target_file_path != target_file_path_orig: stats.renamed += 1 if copy: raise NotImplementedError( "Not yet copying from 'persistent' cache" ) lgr.debug("Adding %s to annex pointing to %s and with options " "%r", target_file_path, url, annex_options) out_json = annex.add_url_to_file( target_file_path, url, options=annex_options, batch=True) if 'key' in out_json and out_json['key'] is not None: # annex.is_under_annex(target_file, batch=True): # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated # we need to maintain a list of those to be dropped files if drop_after: # drop extracted files after adding to annex annex.drop_key(out_json['key'], batch=True) stats.dropped += 1 stats.add_annex += 1 else: lgr.debug("File {} was added to git, not adding url".format( target_file_path)) stats.add_git += 1 if delete_after: # we count the removal here, but don't yet perform it # to not interfer with batched processes - any pure Git # action invokes precommit which closes batched processes. stats.removed += 1 # Done with target_file -- just to have clear end of the loop del target_file if delete and archive and origin != 'key': lgr.debug("Removing the original archive {}".format(archive)) # force=True since some times might still be staged and fail annex.remove(str(archive_path), force=True) lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line')) if outside_stats: outside_stats += stats if delete_after: # force since not committed. r=True for -r (passed into git call # to recurse) delete_after_rpath = opj(extract_rpath, prefix_dir) \ if extract_rpath else prefix_dir delete_after_rpath = resolve_path(delete_after_rpath, ds=dataset) lgr.debug( "Removing extracted and annexed files under %s", delete_after_rpath ) annex.remove(str(delete_after_rpath), r=True, force=True) if commit: archive_rpath = archive_path.relative_to(ds.path) commit_stats = outside_stats if outside_stats else stats # so batched ones close and files become annex symlinks etc annex.precommit() precommitted = True if any(r.get('state', None) != 'clean' for p, r in annex.status(untracked='no').items()): annex.commit( "Added content extracted from %s %s\n\n%s" % (origin, archive_rpath, commit_stats.as_str(mode='full')), _datalad_msg=True ) commit_stats.reset() else: # don't commit upon completion pass finally: # take down the progress bar log_progress( lgr.info, pbar_id, 'Finished extraction', noninteractive_level=logging.INFO) # since we batched addurl, we should close those batched processes # if haven't done yet. 
explicitly checked to avoid any possible # "double-action" if not precommitted: annex.precommit() if delete_after_rpath: delete_after_path = opj(annex.path, delete_after_rpath) delete_after_rpath = resolve_path(delete_after_rpath, ds=dataset) if exists(delete_after_path): # should not be there # but for paranoid yoh lgr.warning( "Removing temporary directory under which extracted " "files were annexed and should have been removed: %s", delete_after_path) rmtree(delete_after_path) annex.always_commit = old_always_commit # remove what is left and/or everything upon failure earchive.clean(force=True) # remove tempfile directories (not cleaned up automatically): if prefix_dir is not None and lexists(prefix_dir): os.rmdir(prefix_dir) yield get_status_dict( ds=ds, status='ok', **res_kwargs) return annex
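
# --- Illustrative usage sketch (not part of the implementation) ---
# A minimal Python API example for the command above; a sketch under the
# assumption that '/tmp/myds' is an existing dataset containing an already
# saved and annexed 'big_tarball.tar.gz' (both names are hypothetical).
def _example_add_archive_content():
    import datalad.api as dl
    ds = dl.Dataset('/tmp/myds')  # hypothetical dataset location
    # extract next to the archive, strip one wrapping directory, and drop
    # the extracted copies again once their archive URLs are registered
    for res in ds.add_archive_content(
            'big_tarball.tar.gz',
            strip_leading_dirs=True,
            drop_after=True,
            return_type='generator'):
        print(res['status'], res.get('message', ''))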
class ContainersRemove(Interface):
    # the first docstring line is used as a short description in the cmdline
    # help; the rest is put into the verbose help and manpage
    """Remove a known container from a dataset
    """

    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        name=Parameter(
            args=("name",),
            doc="""name of the container to remove""",
            metavar="NAME",
            constraints=EnsureStr(),
        ),
        remove_image=Parameter(
            args=("-i", "--remove-image",),
            doc="""if set, remove the container image as well""",
            action="store_true",
        ),
    )

    @staticmethod
    @datasetmethod(name='containers_remove')
    @eval_results
    def __call__(name, dataset=None, remove_image=False):
        ds = require_dataset(dataset, check_installed=True,
                             purpose='remove a container')

        res = get_status_dict(
            ds=ds,
            action='containers_remove',
            logger=lgr)

        section = 'datalad.containers.{}'.format(name)
        imagecfg = '{}.image'.format(section)

        to_save = []
        if remove_image and imagecfg in ds.config:
            imagepath = ds.config.get(imagecfg)
            if op.lexists(op.join(ds.path, imagepath)):
                for r in ds.remove(
                        path=imagepath,
                        # XXX shortcoming: this is the only way to say:
                        # don't drop
                        check=False,
                        # the config setting might be outdated and the image
                        # no longer there -> no reason to fail, just report
                        on_failure='ignore',
                        save=False):
                    yield r
                to_save.append(imagepath)

        if section in ds.config.sections():
            ds.config.remove_section(
                section,
                where='dataset',
                reload=True)
            res['status'] = 'ok'
            to_save.append(op.join('.datalad', 'config'))
        else:
            res['status'] = 'notneeded'

        if to_save:
            for r in ds.save(
                    path=to_save,
                    message='[DATALAD] Remove container {}'.format(name)):
                yield r

        yield res
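
# --- Illustrative usage sketch (not part of the implementation) ---
# Removing a registered container and its image via the Python API; a sketch
# assuming the datalad-container extension is installed, and that the dataset
# path and container name below (both hypothetical) exist.
def _example_containers_remove():
    import datalad.api as dl
    ds = dl.Dataset('/tmp/myds')  # hypothetical dataset
    # equivalent to: datalad containers-remove -i mycontainer
    ds.containers_remove('mycontainer', remove_image=True)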
class Dicom2Spec(Interface):
    """Derives a specification snippet from DICOM metadata and stores it in a
    JSON file.

    The derivation is based on a rule system. You can implement your own
    rules as a python class. See the documentation page on customization for
    details. If you have such rules in dedicated files, their use and
    priority are configured via the datalad.hirni.dicom2spec.rules config
    variable. It takes a path to a python file containing such a rule
    definition. This configuration can be specified multiple times and at
    different levels (system-wide, user, dataset, local repository). If there
    are indeed several occurrences of that configuration, the respective
    rules will be applied in order. Hence "later" appearances will overwrite
    "earlier" ones. This way you can, for example, have institution-wide
    rules and still apply additional rules tailored to your needs or a
    particular study.
    """

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify a dataset containing the DICOM metadata to be
            used. If no dataset is given, an attempt is made to identify the
            dataset based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            nargs="+",
            doc="""path to DICOM files""",
            constraints=EnsureStr() | EnsureNone()),
        spec=Parameter(
            args=("-s", "--spec",),
            metavar="SPEC",
            doc="""file to store the specification in""",
            constraints=EnsureStr() | EnsureNone()),
        subject=Parameter(
            args=("--subject",),
            metavar="SUBJECT",
            doc="""subject identifier. If not specified, an attempt will be
            made to derive SUBJECT from DICOM headers""",
            constraints=EnsureStr() | EnsureNone()),
        anon_subject=Parameter(
            args=("--anon-subject",),
            metavar="ANON_SUBJECT",
            doc="""TODO""",
            constraints=EnsureStr() | EnsureNone()),
        acquisition=Parameter(
            args=("--acquisition",),
            metavar="ACQUISITION",
            doc="""acquisition identifier. If not specified, an attempt
            will be made to derive an identifier from DICOM headers""",
            constraints=EnsureStr() | EnsureNone()),
        properties=Parameter(
            args=("--properties",),
            metavar="PATH or JSON string",
            doc="""""",
            constraints=EnsureStr() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='hirni_dicom2spec')
    @eval_results
    def __call__(path=None, spec=None, dataset=None, subject=None,
                 anon_subject=None, acquisition=None, properties=None):

        # TODO: acquisition can probably be removed (or made an alternative
        # to derive spec and/or dicom location from).
        # Change, so path needs to point directly to the dicom ds?
        # Or just use acq and remove path?

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose="spec from dicoms")

        from datalad.utils import assure_list
        if path is not None:
            path = assure_list(path)
            path = [resolve_path(p, dataset) for p in path]
            path = [str(p) for p in path]
        else:
            raise InsufficientArgumentsError(
                "insufficient arguments for dicom2spec: a path is required")

        # TODO: We should be able to deal with several paths at once;
        #       ATM we aren't (see also commit + message of actual spec)
        assert len(path) == 1

        if not spec:
            raise InsufficientArgumentsError(
                "insufficient arguments for dicom2spec: a spec file is "
                "required")
            # TODO: That's prob. wrong. We can derive a default spec from
            # the acquisition
        else:
            spec = str(resolve_path(spec, dataset))

        spec_series_list = \
            [r for r in json_py.load_stream(spec)] if op.exists(spec) \
            else list()

        # get dataset level metadata:
        found_some = False
        for meta in dataset.meta_dump(
                path,
                recursive=False,  # always False?
                reporton='datasets',
                return_type='generator',
                result_renderer='disabled'):
            if meta.get('status', None) not in ['ok', 'notneeded']:
                yield meta
                continue

            if 'dicom' not in meta['metadata']:
                # TODO: Really "notneeded" or simply not a result at all?
                yield dict(
                    status='notneeded',
                    message=("found no DICOM metadata for %s", meta['path']),
                    path=meta['path'],
                    type='dataset',
                    action='dicom2spec',
                    logger=lgr)
                continue

            if 'Series' not in meta['metadata']['dicom'] or \
                    not meta['metadata']['dicom']['Series']:
                yield dict(
                    status='impossible',
                    message=("no image series detected in DICOM metadata of"
                             " %s", meta['path']),
                    path=meta['path'],
                    type='dataset',
                    action='dicom2spec',
                    logger=lgr)
                continue

            found_some = True

            overrides = dict()
            if properties:
                # load from file or json string
                props = json_py.load(properties) \
                    if op.exists(properties) else json_py.loads(properties)
                # turn into editable, pre-approved records
                props = {k: dict(value=v, approved=True)
                         for k, v in props.items()}
                overrides.update(props)

            spec_series_list = add_to_spec(
                meta,
                spec_series_list,
                op.dirname(spec),
                subject=subject,
                anon_subject=anon_subject,
                # session=session,
                # TODO: parameter "session" was what we now call acquisition.
                # This is NOT a good default for bids_session!
                # Particularly wrt anonymization
                overrides=overrides,
                dataset=dataset)

        if not found_some:
            yield dict(
                status='impossible',
                message="found no DICOM metadata",
                path=path,
                # TODO: arguably this should be 'file' or 'dataset',
                # depending on the path
                type='file',
                action='dicom2spec',
                logger=lgr)
            return

        # TODO: RF needed. This rule should go elsewhere:
        # ignore duplicates (prob. reruns of aborted runs)
        # -> convert highest id only
        # Note: This sorting is a q&d hack!
        # TODO: Sorting needs to become more sophisticated + include the
        #       notion of :all
        spec_series_list = sorted(
            spec_series_list,
            key=lambda x: get_specval(x, 'id') if 'id' in x.keys() else 0)
        for i in range(len(spec_series_list)):
            # Note: Removed the following line from the condition below,
            # since it appears to be pointless. The value for 'converter'
            # used to be 'heudiconv' or 'ignore' for a 'dicomseries', so
            # it's not clear ATM what case this could possibly have caught:
            # heuristic.has_specval(spec_series_list[i], "converter") and \
            if spec_series_list[i]["type"] == "dicomseries" and \
                    has_specval(spec_series_list[i], "bids-run") and \
                    get_specval(spec_series_list[i], "bids-run") in \
                    [get_specval(s, "bids-run")
                     for s in spec_series_list[i + 1:]
                     if get_specval(
                         s, "description") == get_specval(
                             spec_series_list[i], "description") and
                     get_specval(s, "id") > get_specval(
                         spec_series_list[i], "id")]:
                lgr.debug("Ignore SeriesNumber %s for conversion", i)
                spec_series_list[i]["tags"].append(
                    'hirni-dicom-converter-ignore')

        lgr.debug("Storing specification (%s)", spec)
        # store as a stream (one record per file) to be able to
        # easily concat files without having to parse them, or
        # process them line by line without having to fully parse them
        from datalad_hirni.support.spec_helpers import sort_spec
        # Note: Sorting paradigm needs to change. See above.
        # spec_series_list = sorted(spec_series_list,
        #                           key=lambda x: sort_spec(x))
        json_py.dump2stream(spec_series_list, spec)

        # make sure the spec is tracked in git:
        spec_attrs = dataset.repo.get_gitattributes(spec)
        spec_relpath = op.relpath(spec, dataset.path)
        if spec_relpath not in spec_attrs.keys() or \
                'annex.largefiles' not in spec_attrs[spec_relpath].keys() or \
                spec_attrs[spec_relpath]['annex.largefiles'] != 'nothing':
            dataset.repo.set_gitattributes(
                [(spec, {'annex.largefiles': 'nothing'})],
                '.gitattributes')

        for r in Save.__call__(dataset=dataset,
                               path=[spec, '.gitattributes'],
                               to_git=True,
                               message="[HIRNI] Added study specification "
                                       "snippet for %s"
                                       % op.relpath(path[0], dataset.path),
                               return_type='generator',
                               result_renderer='disabled'):
            if r.get('status', None) not in ['ok', 'notneeded']:
                yield r
            elif r['path'] in [spec, op.join(dataset.path,
                                             '.gitattributes')] \
                    and r['type'] == 'file':
                r['action'] = 'dicom2spec'
                r['logger'] = lgr
                yield r
            elif r['type'] == 'dataset':
                # 'ok' or 'notneeded' for a dataset is okay, since we commit
                # the spec. But it's not a result to yield
                continue
            else:
                # anything else shouldn't happen
                yield dict(
                    status='error',
                    message=("unexpected result from save: %s", r),
                    # TODO: This actually isn't clear -- get it from `r`
                    path=spec,
                    type='file',
                    action='dicom2spec',
                    logger=lgr)
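
# --- Illustrative usage sketch (not part of the implementation) ---
# Deriving a spec snippet for one acquisition via the Python API; a sketch
# assuming the datalad-hirni extension is installed, and that the study
# dataset, DICOM path, and identifiers below are hypothetical.
def _example_dicom2spec():
    import datalad.api as dl
    ds = dl.Dataset('/tmp/mystudy')  # hypothetical study dataset
    ds.hirni_dicom2spec(
        path='acq1/dicoms',
        spec='acq1/studyspec.json',
        anon_subject='001')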
class Unlock(Interface):
    """Unlock file(s) of a dataset

    Unlock files of a dataset in order to be able to edit the actual content.
    """

    _params_ = dict(
        path=Parameter(
            args=("path",),
            doc="""file(s) to unlock""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to unlock files in. If no dataset is
            given, an attempt is made to identify the dataset based on the
            current working directory. If the latter fails, an attempt is
            made to identify the dataset based on `path` """,
            constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='unlock')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None):
        if path is None and dataset is None:
            raise InsufficientArgumentsError(
                "insufficient arguments for unlocking: needs at least "
                "a dataset or a path to unlock.")

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='unlock', logger=lgr, refds=refds_path)

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='unlock',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist",
                nondataset_path_status='impossible',
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', 'dataset') == 'dataset':
                # this is a dataset
                ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert not completed

        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            content = content_by_ds[ds_path]

            # no annex, no unlock:
            if not isinstance(ds.repo, AnnexRepo):
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not annex'ed, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            # direct mode, no unlock:
            elif ds.repo.is_direct_mode():
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "direct mode, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            # only files in annex with their content present:
            files = [ap['path'] for ap in content]
            to_unlock = []
            for ap, under_annex, has_content in \
                    zip(content,
                        ds.repo.is_under_annex(files),
                        ds.repo.file_has_content(files)):

                # TODO: what about directories? Make sure there is no
                # situation like no file beneath with content, or everything
                # in git, which would lead to a CommandError.
                # For now pass to annex:
                from os.path import isdir
                if isdir(ap['path']):
                    to_unlock.append(ap)
                    continue

                # Note that `file_has_content` is (planned to report) True
                # on files in git. Therefore order matters: first check for
                # annex!
                if under_annex:
                    if has_content:
                        to_unlock.append(ap)
                    # no content, no unlock:
                    else:
                        ap['status'] = 'impossible'
                        ap['message'] = "no content present, can't unlock"
                        ap.update(res_kwargs)
                        yield ap
                # file in git, no unlock:
                else:
                    ap['status'] = 'notneeded'
                    ap['message'] = \
                        "not controlled by annex, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap

            # don't call annex-unlock with no path, if this is the case
            # because nothing survived the filtering above
            if content and not to_unlock:
                continue

            for r in ds.repo.unlock([ap['path'] for ap in to_unlock]):
                yield get_status_dict(
                    path=r, status='ok', type='file', **res_kwargs)
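
# --- Illustrative usage sketch (not part of the implementation) ---
# Unlocking an annexed file for in-place editing via the Python API; the
# dataset and file paths below are hypothetical, and the file's content must
# be locally present.
def _example_unlock():
    import datalad.api as dl
    ds = dl.Dataset('/tmp/myds')  # hypothetical dataset
    # equivalent to: datalad unlock data/table.csv
    ds.unlock(path='data/table.csv')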
class Init(Interface):
    """Initialize an existing dataset to track a UKBiobank participant

    A batch file for the 'ukbfetch' tool will be generated and placed into
    the dataset. By selecting the relevant data records, raw and/or
    preprocessed data will be tracked.

    After initialization the dataset will contain at least three branches:

    - 'incoming': to track the pristine ZIP files downloaded from UKB
    - 'incoming-native': to track individual files (some extracted from ZIP
      files)
    - 'incoming-bids': to track individual files in a layout where file
      names conform to BIDS conventions
    - main branch: based off of incoming-native or incoming-bids (if
      enabled) with potential manual modifications applied
    """

    _examples_ = [
        dict(
            text='Initialize a dataset in the current directory',
            code_cmd='datalad ukb-init 5874415 20227_2_0 20249_2_0',
            code_py='ukb_init(participant="5874415", '
                    'records=["20227_2_0", "20249_2_0"])'),
        dict(
            text='Initialize a dataset in the current directory in BIDS '
                 'layout',
            code_cmd='datalad ukb-init --bids 5874415 20227_2_0',
            code_py='ukb_init(participant="5874415", '
                    'records=["20227_2_0"], bids=True)'),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the initialization on""",
            constraints=EnsureDataset() | EnsureNone()),
        participant=Parameter(
            args=('participant',),
            metavar='PARTICIPANT-ID',
            nargs=1,
            doc="""UKBiobank participant ID to use for this dataset (note:
            these encoded IDs are unique to each application/project)""",
            constraints=EnsureStr()),
        records=Parameter(
            args=('records',),
            metavar='DATARECORD-ID',
            nargs='+',
            doc="""one or more data record identifiers""",
            constraints=EnsureStr()),
        force=Parameter(
            args=("-f", "--force",),
            doc="""force (re-)initialization""",
            action='store_true'),
        bids=Parameter(
            args=('--bids',),
            action='store_true',
            doc="""additionally maintain an incoming-bids branch with a
            BIDS-like organization."""),
    )

    @staticmethod
    @datasetmethod(name='ukb_init')
    @eval_results
    def __call__(participant, records, force=False, bids=False,
                 dataset=None):
        ds = require_dataset(dataset, check_installed=True,
                             purpose='initialization')

        participant = ensure_list(participant)[0]
        records = ensure_list(records)

        repo = ds.repo
        main_branch = repo.get_active_branch()

        # test for awareness of incoming branches
        incoming_branches = [
            b for b in ('incoming', 'incoming-native', 'incoming-bids')
            if b in repo.get_branches() or any(
                u.endswith('/{}'.format(b))
                for u in repo.get_remote_branches())
        ]

        # prep for yield
        res = dict(
            action='ukb_init',
            path=ds.path,
            type='dataset',
            logger=lgr,
            refds=ds.path,
        )

        if 'incoming' in incoming_branches and not force:
            yield dict(
                res,
                status='error',
                message='Dataset found already initialized, '
                        'use `force` to reinitialize',
            )
            return

        if 'incoming' not in incoming_branches:
            # establish the "incoming" branch that will hold pristine UKB
            # downloads
            repo.call_git(['checkout', '--orphan', 'incoming'])
        else:
            repo.call_git(['checkout', 'incoming'])

        # place a batch file with the download config for ukbfetch in it
        batchfile = repo.pathobj / '.ukbbatch'
        batchfile.write_text('{}\n'.format('\n'.join(
            '{} {}'.format(participant, rec) for rec in records)))
        # inherit the standard attributes to ensure uniform behavior
        # across branches
        (repo.pathobj / '.gitattributes').write_text(
            repo.call_git(
                ['cat-file', '-p',
                 '{}:.gitattributes'.format(main_branch)]))
        # save to the incoming branch, provide paths to avoid adding
        # untracked content
        ds.save(
            path=['.ukbbatch', '.gitattributes'],
            to_git=True,
            message="Configure UKB data fetch",
            result_renderer=None,
        )
        # establish the rest of the branch structure: "incoming-native"
        # for extracted archive content
        _add_incoming_branch('incoming-native', incoming_branches, repo,
                             batchfile)
        if bids:
            _add_incoming_branch('incoming-bids', incoming_branches, repo,
                                 batchfile)
        # force-merge unrelated histories into the main branch;
        # we are using an orphan branch such that we know that
        # `git ls-tree incoming`
        # will only report download-related content, nothing extracted or
        # manually modified
        repo.call_git(['checkout', main_branch])
        repo.call_git([
            'merge', '-m', 'Merge incoming', '--allow-unrelated-histories',
            'incoming-bids' if bids else 'incoming-native',
        ])

        yield dict(
            res,
            status='ok',
            participant=participant,
            records=records,
        )
        return
class Dump(Interface):
    """Query a dataset's aggregated metadata for dataset and file metadata

    Two types of metadata are supported:

    1. metadata describing a dataset as a whole (dataset-global metadata),
       and

    2. metadata for files in a dataset (content metadata).

    Both types can be queried with this command, and a specific type is
    requested via the `--reporton` argument.

    Examples:

      Dump the metadata of a single file; the queried dataset is determined
      based on the current working directory::

        % datalad meta-dump somedir/subdir/thisfile.dat

      Sometimes it is helpful to get metadata records formatted in a more
      accessible form, here as pretty-printed JSON::

        % datalad -f json_pp meta-dump somedir/subdir/thisfile.dat

      Same query as above, but specify which dataset to query (must contain
      the query path)::

        % datalad meta-dump -d . somedir/subdir/thisfile.dat

      Dump any metadata record of any dataset known to the queried dataset::

        % datalad meta-dump --recursive --reporton datasets

      Get a JSON-formatted report of metadata aggregates in a dataset, incl.
      information on enabled metadata extractors, dataset versions, dataset
      IDs, and dataset paths::

        % datalad -f json meta-dump --reporton aggregates
    """
    # make the custom renderer the default, path reporting isn't the top
    # priority here
    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""dataset to query. If not given, a dataset will be
            determined based on the current working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="path(s) to query metadata for",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        reporton=Parameter(
            args=('--reporton',),
            constraints=EnsureChoice('all', 'jsonld', 'datasets', 'files',
                                     'aggregates'),
            doc="""what type of metadata to report on: dataset-global
            metadata only ('datasets'), metadata on dataset content/files
            only ('files'), or both ('all', default). 'jsonld' is an
            alternative mode to report all available metadata with JSON-LD
            markup. A single metadata result with the entire metadata graph
            matching the query will be reported; all non-JSON-LD-type
            metadata will be ignored. There is an auxiliary category
            'aggregates' that reports which metadata aggregates are present
            in the queried dataset."""),
        recursive=Parameter(
            args=("-r", "--recursive",),
            action="store_true",
            doc="""if set, recursively report on any matching metadata based
            on given paths or the reference dataset. Note, setting this
            option does not cause any recursion into potential subdatasets
            on the filesystem. It merely determines what metadata is being
            reported from the given/discovered reference dataset."""),
    )

    @staticmethod
    @datasetmethod(name='meta_dump')
    @eval_results
    def __call__(path=None, dataset=None, reporton='all', recursive=False):
        # prep results
        res_kwargs = dict(action='meta_dump', logger=lgr)
        ds = require_dataset(dataset=dataset,
                             check_installed=True,
                             purpose='aggregate metadata query')
        if dataset:
            res_kwargs['refds'] = ds.path

        agginfos = get_ds_aggregate_db(
            ds.pathobj,
            version=str(aggregate_layout_version),
            # we are handling errors below
            warn_absent=False,
        )
        if not agginfos:
            # this file would exist if there had ever been an aggregation
            # run; hence there has not been one, and we need to tell this
            # to people
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message='metadata aggregation has never been performed in '
                        'this dataset',
                **res_kwargs)
            return

        if not path:
            # implement https://github.com/datalad/datalad/issues/3282
            path = ds.pathobj if isinstance(dataset, Dataset) \
                else os.getcwd()

        # check for paths that are not underneath this dataset
        resolved_paths = set()
        for p in assure_list(path):
            p = resolve_path(p, dataset)
            if p != ds.pathobj and ds.pathobj not in p.parents:
                raise ValueError(
                    'given path {} is not underneath dataset {}'.format(
                        p, ds))
            resolved_paths.add(p)

        # sort paths into their containing dataset aggregate records
        paths_by_ds = {}
        while resolved_paths:
            resolved_path = resolved_paths.pop()
            # find the first dataset that matches
            for aggdspath in sorted(agginfos, reverse=True):
                if recursive and resolved_path in aggdspath.parents:
                    ps = paths_by_ds.get(aggdspath, set())
                    ps.add(aggdspath)
                    paths_by_ds[aggdspath] = ps
                elif aggdspath == resolved_path \
                        or aggdspath in resolved_path.parents:
                    ps = paths_by_ds.get(aggdspath, set())
                    ps.add(resolved_path)
                    paths_by_ds[aggdspath] = ps
                    # stop when the containing dataset is found
                    break

        # which files do we need to have locally to perform the query
        info_keys = \
            ('dataset_info', 'content_info') \
            if reporton in ('all', 'jsonld') else \
            ('dataset_info',) if reporton == 'datasets' else \
            ('content_info',) if reporton == 'files' else \
            []
        objfiles = [
            text_type(agginfos[d][t])
            for d in paths_by_ds
            for t in info_keys
            if t in agginfos[d]
        ]
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        if objfiles:
            for r in ds.get(path=objfiles,
                            result_renderer='disabled',
                            return_type='generator'):
                # report only if not a success, as this is an internal
                # operation that a user would not (need to) expect
                if success_status_map.get(
                        r['status'], False) != 'success':  # pragma: no cover
                    yield r

        contexts = {}
        nodes_by_context = {}
        parentds = []
        # loop over all records to get complete parentds relationships
        for aggdspath in sorted(agginfos):
            while parentds and parentds[-1] not in aggdspath.parents:
                parentds.pop()
            if aggdspath not in paths_by_ds:
                # nothing to say about this
                parentds.append(aggdspath)
                continue
            agg_record = agginfos[aggdspath]
            if reporton == 'aggregates':
                # we do not need to loop over the actual query paths, as
                # the aggregates of the containing dataset will contain
                # the desired info, if any exists.
                # convert pathobj before emitting until we become more clever
                info = {
                    k: text_type(v) if isinstance(v, ut.PurePath) else v
                    for k, v in iteritems(agg_record)
                }
                info.update(
                    path=text_type(aggdspath),
                    type='dataset',
                )
                if aggdspath == ds.pathobj:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = text_type(parentds[-1])
                yield dict(info, status='ok', **res_kwargs)
                parentds.append(aggdspath)
                continue

            # pull out the actual metadata records
            for res in _yield_metadata_records(
                    aggdspath,
                    agg_record,
                    paths_by_ds[aggdspath],
                    reporton,
                    parentds=parentds[-1] if parentds else None):
                if reporton != 'jsonld':
                    yield dict(res, **res_kwargs)
                    continue
                collect_jsonld_metadata(
                    aggdspath, res, nodes_by_context, contexts)

            parentds.append(aggdspath)
        if reporton == 'jsonld':
            yield dict(
                status='ok',
                type='dataset',
                path=ds.path,
                metadata=format_jsonld_metadata(nodes_by_context),
                refcommit=agginfos[ds.pathobj]['refcommit'],
                **res_kwargs)

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        if res['status'] != 'ok' or \
                not res.get('action', None) == 'meta_dump':
            # logging complained about this already
            return
        if kwargs.get('reporton', None) == 'jsonld':
            # special case of a JSON-LD report request:
            # all reports are consolidated into a single graph,
            # dump just that (no pretty printing, can be done outside)
            ui.message(jsondumps(
                res['metadata'],
                # support utf-8 output
                ensure_ascii=False,
                # this cannot happen, spare the checks
                check_circular=False,
                # this will cause the output to not necessarily be
                # JSON compliant, but at least contain all the info that
                # went in, and be usable for javascript consumers
                allow_nan=True,
            ))
            return
        # list the path, available metadata keys, and tags
        path = op.relpath(res['path'], res['refds']) \
            if res.get('refds', None) else res['path']
        meta = res.get('metadata', {})
        ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
            path=ac.color_word(path, ac.BOLD),
            type=' ({})'.format(ac.color_word(res['type'], ac.MAGENTA))
            if 'type' in res else '',
            spacer=' ' if len([m for m in meta if m != 'tag']) else '',
            meta=','.join(k for k in sorted(meta.keys())
                          if k not in ('tag', '@context', '@id'))
            if meta else ' -' if 'metadata' in res
            else ' {}'.format(','.join(
                e for e in res['extractors']
                if e not in ('datalad_core', 'metalad_core',
                             'metalad_annex')))
            if 'extractors' in res else '',
            tags='' if 'tag' not in meta else ' [{}]'.format(','.join(
                assure_list(meta['tag'])))))
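
# --- Illustrative usage sketch (not part of the implementation) ---
# Querying aggregated dataset-level metadata via the Python API; a sketch
# assuming metadata has been aggregated before, and that the dataset path
# below is hypothetical.
def _example_meta_dump():
    import datalad.api as dl
    ds = dl.Dataset('/tmp/myds')  # hypothetical dataset
    for res in ds.meta_dump(
            reporton='datasets',
            recursive=True,
            return_type='generator'):
        print(res['path'], sorted(res.get('metadata', {})))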
class Subdatasets(Interface):
    r"""Report subdatasets and their properties.

    The following properties are reported (if possible) for each matching
    subdataset record.

    "name"
        Name of the subdataset in the parent (often identical with the
        relative path in the parent dataset)

    "path"
        Absolute path to the subdataset

    "parentds"
        Absolute path to the parent dataset

    "gitshasum"
        SHA1 of the subdataset commit recorded in the parent dataset

    "state"
        Condition of the subdataset: 'clean', 'modified', 'absent',
        'conflict' as reported by `git submodule`

    "gitmodule_url"
        URL of the subdataset recorded in the parent

    "gitmodule_name"
        Name of the subdataset recorded in the parent

    "gitmodule_<label>"
        Any additional configuration property on record.

    Performance note: Property modification, requesting `bottomup` reporting
    order, or a particular numerical `recursion_limit` implies an internal
    switch to an alternative query implementation for recursive queries that
    is more flexible, but also notably slower (it performs one call to Git
    per dataset versus a single call for all combined).

    The following properties for subdatasets are recognized by DataLad
    (without the 'gitmodule\_' prefix that is used in the query results):

    "datalad-recursiveinstall"
        If set to 'skip', the respective subdataset is skipped when DataLad
        is recursively installing its superdataset. However, the subdataset
        remains installable when explicitly requested, and no other features
        are impaired.
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query. If no dataset is given, an
            attempt is made to identify the dataset based on the input
            and/or the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar='PATH',
            doc="""path/name to query for subdatasets. Defaults to the
            current directory[PY: , or the entire dataset if called as
            a dataset method PY].""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        fulfilled=Parameter(
            args=("--fulfilled",),
            doc="""if given, must be a boolean flag indicating whether to
            report only locally present or only absent datasets. By default
            subdatasets are reported regardless of their status""",
            constraints=EnsureBool() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        contains=Parameter(
            args=('--contains',),
            metavar='PATH',
            action='append',
            doc="""limit the report to the subdatasets containing the given
            path. If a root path of a subdataset is given, the last reported
            dataset will be the subdataset itself.[CMD: This option can be
            given multiple times CMD][PY: Can be a list with multiple paths
            PY], in which case datasets will be reported that contain any of
            the given paths.""",
            constraints=EnsureStr() | EnsureNone()),
        bottomup=Parameter(
            args=("--bottomup",),
            action="store_true",
            doc="""whether to report subdatasets in bottom-up order along
            each branch in the dataset tree, and not top-down."""),
        set_property=Parameter(
            args=('--set-property',),
            metavar=('NAME', 'VALUE'),
            nargs=2,
            action='append',
            doc="""name and value of one or more subdataset properties to be
            set in the parent dataset's .gitmodules file. The property name
            is case-insensitive, must start with a letter, and consist only
            of alphanumeric characters. The value can be a Python format()
            template string wrapped in '<>' (e.g. '<{gitmodule_name}>').
            Supported keywords are any item reported in the result
            properties of this command, plus 'refds_relpath' and
            'refds_relname': the relative path of a subdataset with respect
            to the base dataset of the command call, and, in the latter
            case, the same string with all directory separators replaced by
            dashes.[CMD: This option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        delete_property=Parameter(
            args=('--delete-property',),
            metavar='NAME',
            action='append',
            doc="""name of one or more subdataset properties to be removed
            from the parent dataset's .gitmodules file.[CMD: This option can
            be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()))

    @staticmethod
    @datasetmethod(name='subdatasets')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        # no constraints given -> query subdatasets under curdir
        if not path and dataset is None:
            path = os.curdir
        paths = [resolve_path(p, dataset) for p in assure_list(path)] \
            if path else None

        ds = require_dataset(
            dataset, check_installed=True,
            purpose='subdataset reporting/modification')
        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, "
                        "must start with a letter)" % k)
        if contains:
            contains = [resolve_path(c, dataset)
                        for c in assure_list(contains)]
        contains_hits = set()
        for r in _get_submodules(
                ds, paths, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = str(r['path'])
            # without it the refds_path cannot be rendered/converted
            # relative in the eval_results decorator
            r['refds'] = refds_path
            if 'contains' in r:
                contains_hits.update(r['contains'])
                r['contains'] = [str(c) for c in r['contains']]
            yield r
        if contains:
            for c in set(contains).difference(contains_hits):
                yield get_status_dict(
                    'subdataset',
                    path=str(c),
                    status='impossible',
                    message='path not contained in any matching subdataset',
                    # we do not want to log such an event, because it is a
                    # legit query to check for matching subdatasets, simply
                    # for the purpose of further decision making;
                    # user communication in front-end scenarios will happen
                    # via result rendering
                    #logger=lgr
                )
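
# --- Illustrative usage sketch (not part of the implementation) ---
# Reporting and annotating subdatasets via the Python API; the superdataset
# path below is hypothetical.
def _example_subdatasets():
    import datalad.api as dl
    ds = dl.Dataset('/tmp/superds')  # hypothetical superdataset
    # report all locally present subdatasets, bottom-up
    for sub in ds.subdatasets(fulfilled=True, recursive=True, bottomup=True):
        print(sub['path'], sub['state'])
    # record a property in .gitmodules for every subdataset, so that
    # recursive installs of the superdataset skip them by default
    ds.subdatasets(set_property=[('datalad-recursiveinstall', 'skip')])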