class Init(Interface):
    """Initialize an existing dataset to track a UKBiobank participant

    A batch file for the 'ukbfetch' tool will be generated and placed into
    the dataset. By selecting the relevant data records, raw and/or
    preprocessed data will be tracked.

    After initialization the dataset will contain at least three branches:

    - incoming: to track the pristine ZIP files downloaded from UKB
    - incoming-native: to track individual files (some extracted from
      ZIP files)
    - incoming-bids: to track individual files in a layout where file names
      conform to BIDS-conventions
    - master: based off of incoming-native or incoming-bids (if enabled)
      with potential manual modifications applied
    """
    _examples_ = [
        dict(
            text='Initialize a dataset in the current directory',
            code_cmd='datalad ukb-init 5874415 20227_2_0 20249_2_0',
            code_py='ukb_init(participant="5874415", records=["20227_2_0", "20249_2_0"])'),
        dict(
            text='Initialize a dataset in the current directory in BIDS layout',
            code_cmd='datalad ukb-init --bids 5874415 20227_2_0',
            code_py='ukb_init(participant="5874415", records=["20227_2_0"], bids=True)'),
    ]
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the initialization on""",
            constraints=EnsureDataset() | EnsureNone()),
        participant=Parameter(
            args=('participant',),
            # fix: typo in the CLI-visible metavar (was 'PARTICPANT-ID')
            metavar='PARTICIPANT-ID',
            nargs=1,
            doc="""UKBiobank participant ID to use for this dataset
            (note: these encoded IDs are unique to each application/project)""",
            constraints=EnsureStr()),
        records=Parameter(
            args=('records',),
            metavar='DATARECORD-ID',
            nargs='+',
            doc='One or more data record identifiers',
            constraints=EnsureStr()),
        force=Parameter(
            args=("-f", "--force",),
            doc="""force (re-)initialization""",
            action='store_true'),
        bids=Parameter(
            args=('--bids',),
            action='store_true',
            doc="""additionally maintain an incoming-bids branch with
            a BIDS-like organization."""),
    )

    @staticmethod
    @datasetmethod(name='ukb_init')
    @eval_results
    def __call__(participant, records, force=False, bids=False, dataset=None):
        """Generator implementation; yields DataLad result records (dicts)."""
        ds = require_dataset(
            dataset, check_installed=True, purpose='initialization')

        # the CLI delivers a one-element list due to nargs=1; unwrap it
        participant = ensure_list(participant)[0]
        records = ensure_list(records)

        repo = ds.repo
        branches = repo.get_branches()
        # prep for yield
        res = dict(
            action='ukb_init',
            path=ds.path,
            type='dataset',
            logger=lgr,
            refds=ds.path,
        )
        if 'incoming' in branches and not force:
            yield dict(
                res,
                status='error',
                message='Dataset found already initialized, '
                        'use `force` to reinitialize',
            )
            return

        if 'incoming' not in branches:
            # establish "incoming" branch that will hold pristine UKB downloads
            repo.call_git(['checkout', '--orphan', 'incoming'])
        else:
            repo.call_git(['checkout', 'incoming'])

        # place batch file with download config for ukbfetch in it
        batchfile = repo.pathobj / '.ukbbatch'
        batchfile.write_text('{}\n'.format('\n'.join(
            '{} {}'.format(participant, rec)
            for rec in records)))
        # save to incoming branch, provide path to avoid adding untracked
        # content
        ds.save(
            path='.ukbbatch',
            to_git=True,
            message="Configure UKB data fetch",
            result_renderer=None,
        )
        # establish rest of the branch structure: "incoming-native"
        # for extracted archive content
        # (fix: comment previously referenced a non-existent
        # "incoming-processsed" branch)
        _add_incoming_branch('incoming-native', branches, repo, batchfile)
        if bids:
            _add_incoming_branch('incoming-bids', branches, repo, batchfile)

        # force merge unrelated histories into master
        # we are using an orphan branch such that we know that
        # `git ls-tree incoming`
        # will only report download-related content, nothing extracted or
        # manually modified
        repo.call_git(['checkout', 'master'])
        repo.call_git([
            'merge',
            '-m', 'Merge incoming',
            '--allow-unrelated-histories',
            'incoming-bids' if bids else 'incoming-native',
        ])

        yield dict(
            res,
            status='ok',
            participant=participant,
            records=records,
        )
        return
class Create(Interface):
    """Create a new dataset from scratch.

    This command initializes a new dataset at a given location, or the
    current directory. The new dataset can optionally be registered in an
    existing superdataset (the new dataset's path needs to be located
    within the superdataset for that, and the superdataset needs to be given
    explicitly via [PY: `dataset` PY][CMD: --dataset CMD]). It is recommended
    to provide a brief description to label the dataset's nature *and*
    location, e.g. "Michael's music on black laptop". This helps humans to
    identify data locations in distributed scenarios.  By default an
    identifier comprised of user and machine name, plus path will be
    generated.

    This command only creates a new dataset, it does not add existing content
    to it, even if the target directory already contains additional files or
    directories.

    Plain Git repositories can be created via the [PY: `no_annex` PY][CMD:
    --no-annex CMD] flag. However, the result will not be a full dataset, and,
    consequently, not all features are supported (e.g. a description).

    || REFLOW >>
    To create a local version of a remote dataset use the
    :func:`~datalad.api.install` command instead.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git init` and
      :command:`git annex init` to prepare the new dataset. Registering to a
      superdataset is performed via a :command:`git submodule add` operation
      in the discovered superdataset.
    """
    # in general this command will yield exactly one result
    return_type = 'item-or-list'
    # in general users expect to get an instance of the created dataset
    result_xfm = 'datasets'
    # result filter
    result_filter = \
        EnsureKeyChoice('action', ('create',)) & \
        EnsureKeyChoice('status', ('ok', 'notneeded'))

    _examples_ = [
        dict(text="Create a dataset 'mydataset' in the current directory",
             code_py="create(path='mydataset')",
             code_cmd="datalad create mydataset"),
        dict(text="Apply the text2git procedure upon creation of a dataset",
             code_py="create(path='mydataset', cfg_proc='text2git')",
             code_cmd="datalad create -c text2git mydataset"),
        dict(text="Create a subdataset in the root of an existing dataset",
             code_py="create(dataset='.', path='mysubdataset')",
             code_cmd="datalad create -d . mysubdataset"),
        dict(text="Create a dataset in an existing, non-empty directory",
             code_py="create(force=True)",
             code_cmd="datalad create --force"),
        dict(text="Create a plain Git repository",
             code_py="create(path='mydataset', no_annex=True)",
             code_cmd="datalad create --no-annex mydataset"),
    ]

    _params_ = dict(
        path=Parameter(
            args=("path",),
            nargs='?',
            metavar='PATH',
            doc="""path where the dataset shall be created, directories
            will be created as necessary. If no location is provided, a dataset
            will be created in the location specified by [PY: `dataset` PY][CMD:
            --dataset CMD] (if given) or the current working directory. Either
            way the command will error if the target directory is not empty.
            Use [PY: `force` PY][CMD: --force CMD] to create a dataset in a
            non-empty directory.""",
            # put dataset 2nd to avoid useless conversion
            constraints=EnsureStr() | EnsureDataset() | EnsureNone()),
        initopts=Parameter(
            args=("initopts",),
            metavar='INIT OPTIONS',
            nargs=REMAINDER,
            doc="""options to pass to :command:`git init`. [PY: Options can be
            given as a list of command line arguments or as a GitPython-style
            option dictionary PY][CMD: Any argument specified after the
            destination path of the repository will be passed to git-init
            as-is CMD]. Note that not all options will lead to viable results.
            For example '--bare' will not yield a repository where DataLad
            can adjust files in its working tree."""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the create operation on. If
            a dataset is given along with `path`, a new subdataset will be
            created in it at the `path` provided to the create command. If
            a dataset is given but `path` is unspecified, a new dataset will
            be created at the location specified by this option.""",
            constraints=EnsureDataset() | EnsureNone()),
        force=Parameter(
            args=("-f", "--force",),
            doc="""enforce creation of a dataset in a non-empty directory""",
            action='store_true'),
        description=location_description,
        no_annex=Parameter(
            # hide this from the cmdline parser, replaced by `annex`
            args=tuple(),
            doc="""this option is deprecated, use `annex` instead""",
            action='store_true'),
        annex=Parameter(
            args=("--no-annex",),
            dest='annex',
            doc="""if [CMD: set CMD][PY: disabled PY], a plain Git repository
            will be created without any annex""",
            action='store_false'),
        # TODO seems to only cause a config flag to be set, this could be done
        # in a procedure
        fake_dates=Parameter(
            args=('--fake-dates',),
            action='store_true',
            doc="""Configure the repository to use fake dates. The date for a
            new commit will be set to one second later than the latest commit
            in the repository. This can be used to anonymize dates."""),
        cfg_proc=Parameter(
            args=("-c", "--cfg-proc"),
            metavar="PROC",
            action='append',
            doc="""Run cfg_PROC procedure(s) (can be specified multiple times)
            on the created dataset. Use
            [PY: `run_procedure(discover=True)` PY][CMD: run_procedure
            --discover CMD] to get a list of available procedures, such as
            cfg_text2git.
            """))

    @staticmethod
    @datasetmethod(name='create')
    @eval_results
    def __call__(path=None,
                 initopts=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=_NoAnnexDefault,
                 annex=True,
                 fake_dates=False,
                 cfg_proc=None):
        # Generator implementation; yields DataLad result records (dicts).
        # TODO: introduced with 0.13, remove with 0.14
        if no_annex is not _NoAnnexDefault:
            # the two mirror options do not agree and the deprecated one is
            # not at default value
            warnings.warn(
                "datalad-create's `no_annex` option is deprecated "
                "and will be removed in a future release, "
                "use the reversed-sign `annex` option instead.",
                DeprecationWarning)
            # honor the old option for now
            annex = not no_annex

        # we only perform negative tests below
        no_annex = not annex

        if dataset:
            if isinstance(dataset, Dataset):
                ds = dataset
            else:
                ds = Dataset(dataset)
            refds_path = ds.path
        else:
            ds = refds_path = None

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD
        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        # bare repos have no working tree, which DataLad requires
        if (isinstance(initopts, (list, tuple)) and '--bare' in initopts) or (
                isinstance(initopts, dict) and 'bare' in initopts):
            raise ValueError(
                "Creation of bare repositories is not supported. Consider "
                "one of the create-sibling commands, or use "
                "Git to init a bare repository and push an existing dataset "
                "into it.")

        if path:
            path = resolve_path(path, dataset)

        path = path if path \
            else getpwd() if ds is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # assure cfg_proc is a list (relevant if used via Python API)
        cfg_proc = assure_list(cfg_proc)

        # prep for yield
        res = dict(action='create', path=str(path),
                   logger=lgr, type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != str(path):
            refds = require_dataset(
                refds_path, check_installed=True,
                purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, str(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = get_dataset_root(
            op.normpath(op.join(str(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if (not pstatus.get(check_path, {}).get("type") == "dataset"
                    and any(check_path == p or check_path in p.parents
                            for p in pstatus)):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents
                ]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        str(parentds_path),
                        [str(c) for c in conflict])
                })
                yield res
                return
            if not force:
                # another set of check to see whether the target path is
                # pointing into a known subdataset that is not around ATM
                subds_status = {
                    parentds_path / k.relative_to(prepo.path)
                    for k, v in pstatus.items()
                    if v.get('type', None) == 'dataset'
                }
                check_paths = [check_path]
                check_paths.extend(check_path.parents)
                if any(p in subds_status for p in check_paths):
                    conflict = [
                        p for p in check_paths if p in subds_status
                    ]
                    res.update({
                        'status': 'error',
                        'message': (
                            'collision with %s (dataset) in dataset %s',
                            str(conflict[0]), str(parentds_path))
                    })
                    yield res
                    return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = ds if isinstance(ds, Dataset) and \
            ds.path == path else Dataset(str(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'
            })
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # Note for the code below:
        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        #      Re-use tbrepo instance, do not use tbds.repo

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                git_opts=initopts,
                fake_dates=fake_dates)
            # place a .noannex file to indicate annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {
                'type': 'file',
                'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates)
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(
                cfg.obtain('datalad.repo.backend'),
                persistent=True, commit=False)
            add_to_git[tbrepo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'}
            # make sure that v6 annex repos never commit content under
            # .datalad
            attrs_cfg = (
                ('config', 'annex.largefiles', 'nothing'),
                ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
                ('metadata/objects/**', 'annex.largefiles', '({})'.format(
                    cfg.obtain(
                        'datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbrepo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(
                        op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbrepo.set_gitattributes(
                    set_attrs,
                    attrfile=op.join('.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbrepo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get(
                    'annex.largefiles', None) == 'nothing':
                tbrepo.set_gitattributes([
                    ('**/.git*', {'annex.largefiles': 'nothing'})])
                # must use the repo.pathobj as this will have resolved
                # symlinks
                add_to_git[tbrepo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'}

        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Note, must not happen earlier (before if) since "smart" it would not
        # be
        tbds_config = tbds.config
        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note, that Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds_config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds_config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds_config.add(
            id_var,
            tbds_id if tbds_id is not None else uuid_id,
            where='dataset',
            reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in tbds_config.overrides.items():
            tbds_config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> full reload
        tbds_config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'}

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbrepo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        for cfg_proc_ in cfg_proc:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in refds.save(
                    path=tbds.path,
            ):
                yield r

        res.update({'status': 'ok'})
        yield res

    @staticmethod
    def custom_result_renderer(res, **kwargs):  # pragma: more cover
        # Render a single human-readable line for a 'create' result.
        from datalad.ui import ui
        if res.get('action', None) == 'create' and \
                res.get('status', None) == 'ok' and \
                res.get('type', None) == 'dataset':
            ui.message("Created dataset at {}.".format(res['path']))
        else:
            ui.message("Nothing was created")
class Subdatasets(Interface):
    r"""Report subdatasets and their properties.

    The following properties are reported (if possible) for each matching
    subdataset record.

    "name"
        Name of the subdataset in the parent (often identical with the
        relative path in the parent dataset)

    "path"
        Absolute path to the subdataset

    "parentds"
        Absolute path to the parent dataset

    "revision"
        SHA1 of the subdataset commit recorded in the parent dataset

    "state"
        Condition of the subdataset: 'clean', 'modified', 'absent',
        'conflict' as reported by `git submodule`

    "revision_descr"
        Output of `git describe` for the subdataset

    "gitmodule_url"
        URL of the subdataset recorded in the parent

    "gitmodule_<label>"
        Any additional configuration property on record.

    Performance note: Property modification, requesting `bottomup` reporting
    order, or a particular numerical `recursion_limit` implies an internal
    switch to an alternative query implementation for recursive query that is
    more flexible, but also notably slower (performs one call to Git per
    dataset versus a single call for all combined).

    The following properties for subdatasets are recognized by DataLad
    (without the 'gitmodule\_' prefix that is used in the query results):

    "datalad-recursiveinstall"
        If set to 'skip', the respective subdataset is skipped when DataLad
        is recursively installing its superdataset. However, the subdataset
        remains installable when explicitly requested, and no other features
        are impaired.
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        fulfilled=Parameter(
            args=("--fulfilled",),
            doc="""if given, must be a boolean flag indicating whether to
            report either only locally present or absent datasets. By default
            subdatasets are reported regardless of their status""",
            constraints=EnsureBool() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        contains=Parameter(
            args=('--contains',),
            metavar='PATH',
            doc="""limit report to the subdatasets containing the
            given path. If a root path of a subdataset is given the last
            reported dataset will be the subdataset itself.""",
            constraints=EnsureStr() | EnsureNone()),
        bottomup=Parameter(
            args=("--bottomup",),
            action="store_true",
            doc="""whether to report subdatasets in bottom-up order along
            each branch in the dataset tree, and not top-down."""),
        set_property=Parameter(
            args=('--set-property',),
            metavar=('NAME', 'VALUE'),
            nargs=2,
            action='append',
            doc="""Name and value of one or more subdataset properties to
            be set in the parent dataset's .gitmodules file. The property name
            is case-insensitive, must start with a letter, and consist only
            of alphanumeric characters. The value can be a Python format()
            template string wrapped in '<>' (e.g. '<{gitmodule_name}>').
            Supported keywords are any item reported in the result properties
            of this command, plus 'refds_relpath' and 'refds_relname':
            the relative path of a subdataset with respect to the base dataset
            of the command call, and, in the latter case, the same string with
            all directory separators replaced by dashes.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        delete_property=Parameter(
            args=('--delete-property',),
            metavar='NAME',
            action='append',
            doc="""Name of one or more subdataset properties to be removed
            from the parent dataset's .gitmodules file.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()))

    @staticmethod
    @datasetmethod(name='subdatasets')
    @eval_results
    def __call__(
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        """Generator implementation; yields DataLad result records (dicts).

        Raises ValueError if a `set_property` key does not match `valid_key`.
        """
        dataset = require_dataset(
            dataset, check_installed=False,
            purpose='subdataset reporting/modification')
        refds_path = dataset.path

        # XXX this seems strange, but is tested to be the case -- I'd rather
        # set `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    # fix: actually interpolate the offending key into the
                    # message; previously the format string and the key were
                    # passed as two separate exception arguments, so the '%s'
                    # placeholder was never expanded
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, "
                        "must start with a letter)" % k)
        if contains:
            contains = resolve_path(contains, dataset)
        for r in _get_submodules(
                dataset.path, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
class CreateSiblingRia(Interface):
    """Creates a sibling to a dataset in a RIA store

    Communication with a dataset in a RIA store is implemented via two
    siblings. A regular Git remote (repository sibling) and a git-annex
    special remote for data transfer (storage sibling) -- with the former
    having a publication dependency on the latter. By default, the name of the
    storage sibling is derived from the repository sibling's name by appending
    "-storage".

    The store's base path is expected to not exist, be an empty directory,
    or a valid RIA store.

    RIA store layout
    ~~~~~~~~~~~~~~~~

    A RIA store is a directory tree with a dedicated subdirectory for each
    dataset in the store. The subdirectory name is constructed from the
    DataLad dataset ID, e.g. '124/68afe-59ec-11ea-93d7-f0d5bf7b5561', where
    the first three characters of the ID are used for an intermediate
    subdirectory in order to mitigate files system limitations for stores
    containing a large number of datasets.

    Each dataset subdirectory contains a standard bare Git repository for
    the dataset.

    In addition, a subdirectory 'annex' hold a standard Git-annex object
    store. However, instead of using the 'dirhashlower' naming scheme for
    the object directories, like Git-annex would do, a 'dirhashmixed'
    layout is used -- the same as for non-bare Git repositories or regular
    DataLad datasets.

    Optionally, there can be a further subdirectory 'archives' with
    (compressed) 7z archives of annex objects. The storage remote is able to
    pull annex objects from these archives, if it cannot find in the regular
    annex object store. This feature can be useful for storing large
    collections of rarely changing data on systems that limit the number of
    files that can be stored.

    Each dataset directory also contains a 'ria-layout-version' file that
    identifies the data organization (as, for example, described above).

    Lastly, there is a global 'ria-layout-version' file at the store's
    base path that identifies where dataset subdirectories themselves are
    located. At present, this file must contain a single line stating the
    version (currently "1"). This line MUST end with a newline character.

    It is possible to define an alias for an individual dataset in a store
    by placing a symlink to the dataset location into an 'alias/' directory
    in the root of the store. This enables dataset access via URLs of format:
    'ria+<protocol>://<storelocation>#~<aliasname>'.

    Error logging
    ~~~~~~~~~~~~~

    To enable error logging at the remote end, append a pipe symbol and an "l"
    to the version number in ria-layout-version (like so '1|l\\n').

    Error logging will create files in an "error_log" directory whenever the
    git-annex special remote (storage sibling) raises an exception, storing
    the Python traceback of it. The logfiles are named according to the scheme
    '<dataset id>.<annex uuid of the remote>.log' showing "who" ran into this
    issue with which dataset. Because logging can potentially leak personal
    data (like local file paths for example), it can be disabled client-side
    by setting the configuration variable
    "annex.ora-remote.<storage-sibling-name>.ignore-remote-config".
    """

    # TODO: description?
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to process.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        url=Parameter(
            args=("url",),
            metavar="ria+<ssh|file>://<host>[/path]",
            doc="""URL identifying the target RIA store and access protocol.
            """,
            constraints=EnsureStr() | EnsureNone()),
        name=Parameter(
            args=('-s', '--name',),
            metavar='NAME',
            doc="""Name of the sibling.
            With `recursive`, the same name will be used to label all
            the subdatasets' siblings.""",
            constraints=EnsureStr() | EnsureNone(),
            required=True),
        storage_name=Parameter(
            args=("--storage-name",),
            metavar="NAME",
            doc="""Name of the storage sibling (git-annex special remote).
            Must not be identical to the sibling name. If not specified,
            defaults to the sibling name plus '-storage' suffix. If only
            a storage sibling is created, this setting is ignored, and
            the primary sibling name is used.""",
            constraints=EnsureStr() | EnsureNone()),
        post_update_hook=Parameter(
            args=("--post-update-hook",),
            doc="""Enable git's default post-update-hook for the created
            sibling.""",
            action="store_true"),
        shared=Parameter(
            args=("--shared",),
            metavar='{false|true|umask|group|all|world|everybody|0xxx}',
            doc="""If given, configures the permissions in the
            RIA store for multi-users access.
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool() | EnsureNone()),
        group=Parameter(
            args=("--group",),
            metavar="GROUP",
            doc="""Filesystem group for the repository. Specifying the group is
            crucial when [CMD: --shared=group CMD][PY: shared="group" PY]""",
            constraints=EnsureStr() | EnsureNone()),
        storage_sibling=Parameter(
            args=("--storage-sibling",),
            dest='storage_sibling',
            metavar='MODE',
            constraints=EnsureChoice('only') | EnsureBool() | EnsureNone(),
            doc="""By default, an ORA storage sibling and a Git repository
            sibling are created ([CMD: on CMD][PY: True|'on' PY]).
            Alternatively, creation of the storage sibling can be disabled
            ([CMD: off CMD][PY: False|'off' PY]), or a storage sibling
            created only and no Git sibling
            ([CMD: only CMD][PY: 'only' PY]). In the latter mode, no Git
            installation is required on the target host."""),
        existing=Parameter(
            args=("--existing",),
            constraints=EnsureChoice('skip', 'error', 'reconfigure')
            | EnsureNone(),
            metavar='MODE',
            doc="""Action to perform, if a (storage) sibling is already
            configured under the given name and/or a target already exists.
            In this case, a dataset can be skipped ('skip'), an existing target
            repository be forcefully re-initialized, and the sibling
            (re-)configured ('reconfigure'), or the command be instructed to
            fail ('error').""",),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        trust_level=Parameter(
            args=("--trust-level",),
            metavar="TRUST-LEVEL",
            constraints=EnsureChoice('trust', 'semitrust', 'untrust')
            | EnsureNone(),
            doc="""specify a trust level for the storage sibling. If not
            specified, the default git-annex trust level is used. 'trust'
            should be used with care (see the git-annex-trust man page).""",),
        disable_storage__=Parameter(
            args=("--no-storage-sibling",),
            dest='disable_storage__',
            doc="""This option is deprecated. Use '--storage-sibling off'
            instead.""",
            action="store_false"),
    )

    @staticmethod
    @datasetmethod(name='create_sibling_ria')
    @eval_results
    def __call__(url,
                 name,
                 dataset=None,
                 storage_name=None,
                 post_update_hook=False,
                 shared=None,
                 group=None,
                 storage_sibling=True,
                 existing='error',
                 trust_level=None,
                 recursive=False,
                 recursion_limit=None,
                 disable_storage__=None,
                 ):
        # Generator implementation; yields DataLad result records (dicts).
        if disable_storage__ is not None:
            import warnings
            warnings.warn(
                "datalad-create-sibling-ria --no-storage-sibling "
                "is deprecated, use --storage-sibling off instead.",
                DeprecationWarning)
            # recode to new setup
            disable_storage__ = None
            storage_sibling = False

        if storage_sibling == 'only' and storage_name:
            # in 'only' mode the primary sibling name labels the storage
            # sibling, so a separate storage name has no effect
            lgr.warning(
                "Sibling name will be used for storage sibling in "
                "storage-sibling-only mode, but a storage sibling name "
                "was provided")

        ds = require_dataset(
            dataset, check_installed=True, purpose='create sibling RIA')
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        # parse target URL
        try:
            ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
        except ValueError as e:
            yield get_status_dict(
                status='error',
                message=str(e),
                **res_kwargs
            )
            return

        # a dataset without a commit or an ID cannot be published to a store
        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError(
                "Repository at {} is not a DataLad dataset, "
                "run 'datalad create [--force]' first.".format(ds.path))

        if not storage_sibling and storage_name:
            lgr.warning(
                "Storage sibling setup disabled, but a storage sibling name "
                "was provided")

        if storage_sibling and not storage_name:
            storage_name = "{}-storage".format(name)

        if storage_sibling and name == storage_name:
            # leads to unresolvable, circular dependency with publish-depends
            raise ValueError("sibling names must not be equal")

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since misconfiguration (particularly of special
        # remotes) only to fail in a subdataset later on with that config, can
        # be quite painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            # in recursive mode this check could take a substantial amount of
            # time: employ a progress bar (or rather a counter, because we
            # don't know the total in advance
            pbar_id = 'check-siblings-{}'.format(id(ds))
            log_progress(
                lgr.info, pbar_id,
                'Start checking pre-existing sibling configuration %s', ds,
                label='Query siblings',
                unit=' Siblings',
            )
            # even if we have to fail, let's report all conflicting siblings
            # in subdatasets
            failed = False
            for r in ds.siblings(result_renderer=None,
                                 recursive=recursive,
                                 recursion_limit=recursion_limit):
                log_progress(
                    lgr.info, pbar_id,
                    'Discovered sibling %s in dataset at %s',
                    r['name'], r['path'],
                    update=1,
                    increment=True)
                if not r['type'] == 'sibling' or r['status'] != 'ok':
                    # this is an internal status query that has not
                    # consequence for the outside world. Be silent unless
                    # something useful can be said
                    #yield r
                    continue
                if r['name'] == name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
                if storage_name and r['name'] == storage_name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(storage_name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
            log_progress(
                lgr.info, pbar_id,
                'Finished checking pre-existing sibling configuration %s', ds,
            )
            if failed:
                return

        # TODO: - URL parsing + store creation needs to be RF'ed based on
        #         command abstractions
        #       - more generally consider store creation a dedicated command
        #         or option
        # Note: URL parsing is done twice ATM (for top-level ds). This can't
        # be reduced to single instance, since rewriting url based on config
        # could be different for subdatasets.
        create_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                     Path(base_path),
                     '1')

        yield from _create_sibling_ria(
            ds,
            url,
            name,
            storage_sibling,
            storage_name,
            existing,
            shared,
            group,
            post_update_hook,
            trust_level,
            res_kwargs)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.

            for subds in ds.subdatasets(fulfilled=True,
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        result_xfm='datasets'):
                yield from _create_sibling_ria(
                    subds,
                    url,
                    name,
                    storage_sibling,
                    storage_name,
                    existing,
                    shared,
                    group,
                    post_update_hook,
                    trust_level,
                    res_kwargs)
class Remove(Interface):
    """Remove components from datasets

    This command can remove any components (subdatasets, and (directories
    with) files) from datasets. Removing a component implies any present
    content to be dropped, and any associated subdatasets to be
    uninstalled. Subsequently, the component is "unregistered" from the
    respective dataset. This means that the respective component is no
    longer present on the file system.

    By default, the availability of at least one remote copy is verified
    before file content is dropped. As these checks could lead to slow
    operation (network latencies, etc), they can be disabled.

    Any number of paths to process can be given as input. Recursion into
    subdatasets needs to be explicitly enabled, while recursion in
    subdirectories within a dataset is always done automatically. An
    optional recursion limit is applied relative to each given input path.

    Examples:

      Permanently remove a subdataset from a dataset and wipe out the
      subdataset association too::

        ~/some/dataset$ datalad remove somesubdataset1
    """
    # action label reused for the bound dataset method name below
    _action = 'remove'

    _params_ = dict(
        dataset=dataset_argument,
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="path/name of the component to be removed",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        check=check_argument,
        save=nosave_opt,
        message=save_message_opt,
        if_dirty=if_dirty_opt,
    )

    @staticmethod
    @datasetmethod(name=_action)
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 check=True,
                 save=True,
                 message=None,
                 if_dirty='save-before'):
        """Generator yielding result records for each processed component.

        Phases: (1) annotate all input paths, (2) group them by containing
        dataset, (3) bottom-up per dataset: uninstall/deregister dataset
        components and batch-remove plain file content, (4) a single final
        save() of all accumulated changes.
        """
        res_kwargs = dict(action='remove', logger=lgr)
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `remove`: requires at least a path or dataset"
            )
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs['refds'] = refds_path
        if refds_path and not path and not GitRepo.is_valid_repo(refds_path):
            # nothing here, nothing to remove
            yield get_status_dict(path=refds_path, status='notneeded',
                                  **res_kwargs)
            return
        if refds_path and not path:
            # act on the whole dataset if nothing else was specified
            # TODO i think that would happen automatically in annotation?
            path = refds_path

        to_process = []

        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                # we only ever want to discover immediate subdatasets, the rest
                # will happen in `uninstall`
                recursion_limit=1,
                action='remove',
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None) is None:
                # nothing exists at location, and there is no parent to
                # remove from
                ap['status'] = 'notneeded'
                ap['message'] = "path does not exist and is not in a dataset"
                yield ap
                continue
            if ap.get('raw_input', False) and ap.get('type', None) == 'dataset':
                # make sure dataset sorting yields a dedicated entry for this one
                ap['process_content'] = True
            to_process.append(ap)

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if path_is_under([ap['path'] for ap in to_process]):
            # behave like `rm` and refuse to remove where we are
            raise ValueError(
                "refusing to uninstall current or parent directory")

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        # iterate over all datasets, starting at the bottom
        # to make the removal of dataset content known upstairs
        to_save = []
        # track which submodules we have removed in the process, to avoid
        # failure in case we revisit them due to a subsequent path argument
        subm_removed = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            paths = content_by_ds[ds_path]
            to_reporemove = []
            # PLAN any dataset that was not raw_input, uninstall (passing recursive flag)
            # if dataset itself is in paths, skip any nondataset
            # sort reverse so we get subdatasets first
            for ap in sorted(paths, key=lambda x: x['path'], reverse=True):
                if ap.get('type', None) == 'dataset':
                    # entire dataset needs to go, uninstall if present, pass recursive!
                    uninstall_failed = False
                    if ap['path'] == refds_path or \
                            (refds_path is None and ap.get('raw_input', False)):
                        # top-level handling, cannot use regular uninstall call, as
                        # it will refuse to uninstall a top-level dataset
                        # and rightfully so, it is really a remove in that case
                        # bypass all the safety by using low-level helper
                        for r in _uninstall_dataset(ds, check=check,
                                                    has_super=False,
                                                    **res_kwargs):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            r['refds'] = refds_path
                            yield r
                    # recheck that it wasn't removed during a previous iteration
                    elif ap.get('state', None) != 'absent' and \
                            GitRepo.is_valid_repo(ap['path']):
                        # anything that is not the top-level -> regular uninstall
                        # this is for subdatasets of the to-be-removed dataset
                        # we want to simply uninstall them in a regular manner
                        for r in Uninstall.__call__(
                                # use annotate path as input, but pass a copy because
                                # we cannot rely on it being unaltered by reannotation
                                # TODO maybe adjust annotate_path to do that
                                [ap.copy()],
                                dataset=refds_path,
                                recursive=recursive,
                                check=check,
                                if_dirty=if_dirty,
                                result_xfm=None,
                                result_filter=None,
                                on_failure='ignore'):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            yield r
                    if not ap.get('raw_input', False):
                        # we only ever want to actually unregister subdatasets that
                        # were given explicitly
                        continue
                    if not uninstall_failed and \
                            not ap['path'] in subm_removed and \
                            refds_path and \
                            ap.get('parentds', None) and \
                            not (relpath(ap['path'], start=refds_path).startswith(pardir) or
                                 ap['path'] == refds_path) and \
                            ap.get('registered_subds', False):
                        # strip from superdataset, but only if a dataset was given explicitly
                        # as in "remove from this dataset", but not when just a path was given
                        # as in "remove from the filesystem"
                        # NOTE(review): subds_relpath appears unused below -- verify
                        # whether it can be dropped
                        subds_relpath = relpath(ap['path'], start=ap['parentds'])
                        # remove submodule reference
                        parentds = Dataset(ap['parentds'])
                        # play safe, will fail on dirty
                        parentds.repo.deinit_submodule(ap['path'])
                        # remove now empty submodule link
                        parentds.repo.remove(ap['path'])
                        # make a record that we removed this already, should it be
                        # revisited via another path argument, because do not reannotate
                        # the paths after every removal
                        subm_removed.append(ap['path'])
                        yield dict(ap, status='ok', **res_kwargs)
                        # need .gitmodules update in parent
                        to_save.append(
                            dict(path=opj(parentds.path, '.gitmodules'),
                                 parents=parentds.path,
                                 type='file'))
                        # and the removal itself needs to be committed
                        # inform `save` that it is OK that this path
                        # doesn't exist on the filesystem anymore
                        ap['unavailable_path_status'] = ''
                        ap['process_content'] = False
                        to_save.append(ap)
                    if not uninstall_failed and exists(ap['path']):
                        # could be an empty dir in case an already uninstalled subdataset
                        # got removed
                        os.rmdir(ap['path'])
                else:
                    # anything that is not a dataset can simply be passed on
                    to_reporemove.append(ap['path'])
            # avoid unnecessary git calls when there is nothing to do
            if to_reporemove:
                if check and hasattr(ds.repo, 'drop'):
                    for r in _drop_files(ds, to_reporemove, check=True):
                        yield r
                for r in ds.repo.remove(to_reporemove, r=True):
                    # these were removed, but we still need to save the removal
                    # NOTE(review): `ap` here is the leftover loop variable from
                    # the `for ap in sorted(paths, ...)` loop above, so the same
                    # (last) annotated path is mutated and appended once per
                    # removed file -- presumably each removed path `r` should get
                    # its own record; confirm against upstream history
                    ap['unavailable_path_status'] = ''
                    to_save.append(ap)
                    yield get_status_dict(status='ok', path=r, **res_kwargs)

        if not to_save:
            # nothing left to do, potentially all errored before
            return
        if not save:
            lgr.debug('Not calling `save` as instructed')
            return
        for res in Save.__call__(
                # TODO compose hand-selected annotated paths
                path=to_save,
                # we might have removed the reference dataset by now, recheck
                dataset=refds_path
                if (refds_path and GitRepo.is_valid_repo(refds_path))
                else None,
                message=message if message else '[DATALAD] removed content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
class Get(Interface):
    """Get any dataset content (files/directories/subdatasets).

    This command only operates on dataset content. To obtain a new independent
    dataset from some source use the `clone` command.

    By default this command operates recursively within a dataset, but not
    across potential subdatasets, i.e. if a directory is provided, all files in
    the directory are obtained. Recursion into subdatasets is supported too. If
    enabled, relevant subdatasets are detected and installed in order to
    fulfill a request.

    Known data locations for each requested file are evaluated and data are
    obtained from some available location (according to git-annex configuration
    and possibly assigned remote priorities), unless a specific source is
    specified.

    *Getting subdatasets*

    Just as DataLad supports getting file content from more than one location,
    the same is supported for subdatasets, including a ranking of individual
    sources for prioritization.

    The following location candidates are considered. For each candidate a
    cost is given in parenthesis, higher values indicate higher cost, and thus
    lower priority:

    - URL of any configured superdataset remote that is known to have the
      desired submodule commit, with the submodule path appended to it.
      There can be more than one candidate (cost 500).

    - In case `.gitmodules` contains a relative path instead of a URL, the
      URL of any configured superdataset remote that is known to have the
      desired submodule commit, with this relative path appended to it.
      There can be more than one candidate (cost 500).

    - A URL or absolute path recorded in `.gitmodules` (cost 600).

    - In case `.gitmodules` contains a relative path as a URL, the absolute
      path of the superdataset, appended with this relative path (cost 900).

    Additional candidate URLs can be generated based on templates specified as
    configuration variables with the pattern

      `datalad.get.subdataset-source-candidate-<name>`

    where `name` is an arbitrary identifier. If `name` starts with three
    digits (e.g. '400myserver') these will be interpreted as a cost, and the
    respective candidate will be sorted into the generated candidate list
    according to this cost. If no cost is given, a default of 700 is used.

    A template string assigned to such a variable can utilize the Python
    format mini language and may reference a number of properties that are
    inferred from the parent dataset's knowledge about the target subdataset.
    Properties include any submodule property specified in the respective
    `.gitmodules` record. For convenience, an existing `datalad-id` record is
    made available under the shortened name `id`.

    Additionally, the URL of any configured remote that contains the
    respective submodule commit is available as `remote-<name>` properties,
    where `name` is the configured remote name.

    Lastly, all candidates are sorted according to their cost (lower values
    first), and duplicate URLs are stripped, while preserving the first item
    in the candidate list.

    .. note::
      Power-user info: This command uses :command:`git annex get` to fulfill
      file handles.
    """
    _examples_ = [
        dict(text="Get a single file",
             code_py="get('path/to/file')",
             code_cmd="datalad get <path/to/file>"),
        dict(text="Get contents of a directory",
             code_py="get('path/to/dir/')",
             code_cmd="datalad get <path/to/dir/>"),
        dict(text="Get all contents of the current dataset and its subdatasets",
             code_py="get(dataset='.', recursive=True)",
             code_cmd="datalad get . -r"),
        dict(text="Get (clone) a registered subdataset, but don't retrieve data",
             code_py="get('path/to/subds', get_data=False)",
             code_cmd="datalad get -n <path/to/subds>"),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar="PATH",
            doc="""specify the dataset to perform the add operation on, in
            which case `path` arguments are interpreted as being relative
            to this dataset.  If no dataset is given, an attempt is made to
            identify a dataset for each input `path`""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path/name of the requested dataset component. The component
            must already be known to a dataset. To add new components to a
            dataset use the `add` command""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        source=Parameter(
            args=("-s", "--source",),
            metavar="LABEL",
            doc="""label of the data source to be used to fulfill requests.
            This can be the name of a dataset :term:`sibling` or another known
            source""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=Parameter(
            args=("-R", "--recursion-limit",),
            metavar="LEVELS",
            constraints=EnsureInt() | EnsureChoice('existing') | EnsureNone(),
            doc="""limit recursion into subdataset to the given number of levels.
            Alternatively, 'existing' will limit recursion to subdatasets that already
            existed on the filesystem at the start of processing, and prevent new
            subdatasets from being obtained recursively."""),
        get_data=Parameter(
            args=("-n", "--no-data",),
            dest='get_data',
            action='store_false',
            doc="""whether to obtain data for all file handles. If disabled, `get`
            operations are limited to dataset handles.[CMD:  This option prevents data
            for file handles from being obtained CMD]"""),
        description=location_description,
        reckless=reckless_opt,
        jobs=jobs_opt)

    @staticmethod
    @datasetmethod(name='get')
    @eval_results
    def __call__(
            path=None,
            source=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            get_data=True,
            description=None,
            reckless=None,
            jobs='auto',
    ):
        """Generator yielding result records.

        Two stages: first discover/install any datasets needed to reach the
        requested paths (collecting a path set per dataset in
        ``content_by_ds``), then run the actual annex-get per dataset.
        """
        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        # we have to have a single dataset to operate on
        refds = require_dataset(
            dataset, check_installed=True, purpose='get content')

        # maps dataset path -> set of contained target paths to fetch later
        content_by_ds = {}
        # use subdatasets() to discover any relevant content that is not
        # already present in the root dataset (refds)
        for sdsres in Subdatasets.__call__(
                contains=path,
                # maintain path argument semantics and pass in dataset arg
                # as is
                dataset=dataset,
                # always come from the top to get sensible generator behavior
                bottomup=False,
                # when paths are given, they will constrain the recursion
                # automatically, and we need to enable recursion so we can
                # location path in subdatasets several levels down
                recursive=True if path else recursive,
                recursion_limit=None if path else recursion_limit,
                return_type='generator',
                on_failure='ignore'):
            if sdsres.get('type', None) != 'dataset':
                # if it is not about a 'dataset' it is likely content in
                # the root dataset
                if sdsres.get('status', None) == 'impossible' and \
                        sdsres.get('message', None) == \
                        'path not contained in any matching subdataset':
                    target_path = Path(sdsres['path'])
                    if refds.pathobj != target_path and \
                            refds.pathobj not in target_path.parents:
                        yield dict(
                            action='get',
                            path=str(target_path),
                            status='error',
                            message=('path not associated with dataset %s',
                                     refds),
                        )
                        continue
                    # check if we need to obtain anything underneath this path
                    # the subdataset() call above will only look _until_ it
                    # hits the targetpath
                    for res in _install_targetpath(
                            refds,
                            Path(sdsres['path']),
                            recursive,
                            recursion_limit,
                            reckless,
                            refds_path,
                            description,
                            jobs=jobs,
                    ):
                        # fish out the datasets that 'contains' a targetpath
                        # and store them for later
                        if res.get('status', None) in ('ok', 'notneeded') and \
                                'contains' in res:
                            dsrec = content_by_ds.get(res['path'], set())
                            dsrec.update(res['contains'])
                            content_by_ds[res['path']] = dsrec
                        if res.get('status', None) != 'notneeded':
                            # all those messages on not having installed anything
                            # are a bit pointless
                            # "notneeded" for annex get comes below
                            yield res
                else:
                    # dunno what this is, send upstairs
                    yield sdsres
                # must continue for both conditional branches above
                # the rest is about stuff in real subdatasets
                continue
            # instance of the closest existing dataset for this result
            ds = Dataset(sdsres['parentds']
                         if sdsres.get('state', None) == 'absent'
                         else sdsres['path'])
            assert 'contains' in sdsres
            # explore the unknown
            for target_path in sdsres.get('contains', []):
                # essentially the same as done above for paths in the root
                # dataset, but here we are starting from the closest
                # discovered subdataset
                for res in _install_targetpath(
                        ds,
                        Path(target_path),
                        recursive,
                        recursion_limit,
                        reckless,
                        refds_path,
                        description,
                        jobs=jobs,
                ):
                    known_ds = res['path'] in content_by_ds
                    if res.get('status', None) in ('ok', 'notneeded') and \
                            'contains' in res:
                        dsrec = content_by_ds.get(res['path'], set())
                        dsrec.update(res['contains'])
                        content_by_ds[res['path']] = dsrec
                    # prevent double-reporting of datasets that have been
                    # installed by explorative installation to get to target
                    # paths, prior in this loop
                    if res.get('status', None) != 'notneeded' or not known_ds:
                        yield res

        if not get_data:
            # done already
            return

        # and now annex-get, this could all be done in parallel now
        for ds, content in content_by_ds.items():
            for res in _get_targetpaths(
                    Dataset(ds),
                    content,
                    refds.path,
                    source,
                    jobs):
                if res['path'] not in content_by_ds:
                    # we had reports on datasets and subdatasets already
                    # before the annex stage
                    yield res
class CreateSiblingOSF(Interface):
    """Create a dataset representation at OSF.

    This will create a node on OSF and initialize
    an osf special remote to point to it. There are two modes
    this can operate in: 'annex' and 'export'.
    The former uses the OSF node as a key-value store, that
    can be used by git-annex to copy data to and retrieve
    data from (potentially by any clone of the original dataset).
    The latter allows to use 'git annex export' to publish a
    snapshot of a particular version of the dataset. Such an OSF
    node will - in opposition to the 'annex' - be
    human-readable.

    For authentication with OSF, you can define environment variables: Either
    'OSF_TOKEN', or both 'OSF_USERNAME' and 'OSF_PASSWORD'. If neither of
    these is defined, the tool will fall back to the datalad credential
    manager and inquire for credentials interactively.
    """

    # use the custom_result_renderer() defined below
    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""Dataset to create a sibling for.""",
                          constraints=EnsureDataset() | EnsureNone()),
        title=Parameter(
            args=("--title", ),
            doc="""title of the to-be created OSF node that is displayed
            on the OSF website. Defaults to the basename of the root directory
            of the local dataset.""",
            constraints=EnsureStr() | EnsureNone(),
        ),
        name=Parameter(
            args=(
                "-s",
                "--name",
            ),
            doc="""Name of the to-be initialized osf-special-remote""",
            constraints=EnsureStr()),
        storage_name=Parameter(
            args=("--storage-name", ),
            metavar="NAME",
            doc="""Name of the storage sibling (git-annex special remote).
            Must not be identical to the sibling name. If not specified,
            defaults to the sibling name plus '-storage' suffix.""",
            constraints=EnsureStr() | EnsureNone()),
        existing=Parameter(
            args=("--existing", ),
            constraints=EnsureChoice('skip', 'error') | EnsureNone(),
            metavar='MODE',
            doc="""Action to perform, if a (storage) sibling is already
            configured under the given name and/or a target already exists.
            In this case, a dataset can be skipped ('skip'), or the command
            be instructed to fail ('error').""",
        ),
        trust_level=Parameter(
            args=("--trust-level", ),
            metavar="TRUST-LEVEL",
            constraints=EnsureChoice('trust', 'semitrust', 'untrust') |
            EnsureNone(),
            doc="""specify a trust level for the storage sibling. If not
            specified, the default git-annex trust level is used.""",
        ),
        mode=Parameter(args=("--mode", ),
                       doc=""" """,
                       constraints=EnsureChoice("annex", "export",
                                                "exportonly", "gitonly")),
        tags=Parameter(
            args=('--tag', ),
            dest='tags',
            metavar='TAG',
            doc="""specific one or more tags for the to-be-create OSF node.
            A tag 'DataLad dataset' and the dataset ID (if there is any)
            will be automatically added as additional tags.
            [CMD: This option can be given more than once CMD].""",
            action='append',
        ),
        public=Parameter(
            args=("--public", ),
            doc="""make OSF node public""",
            action='store_true',
        ),
        category=Parameter(
            args=("--category", ),
            doc="""specific the OSF node category to be used for the
            node. The categorization determines what icon is displayed
            with the node on the OSF, and helps with search organization""",
            # all presently supported categories
            constraints=EnsureChoice("analysis", "communication", "data",
                                     "hypothesis", "instrumentation",
                                     "methods and measures", "procedure",
                                     "project", "software", "other")),
        description=Parameter(
            args=("--description", ),
            metavar="TEXT",
            doc="""Description of the OSF node that will be displayed on
            the associated project page. By default a description will be
            generated based on the mode the sibling is put into.""",
            constraints=EnsureStr() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='create_sibling_osf')
    @eval_results
    def __call__(
            title=None,
            name="osf",
            storage_name=None,
            dataset=None,
            mode="annex",
            existing='error',
            trust_level=None,
            tags=None,
            public=False,
            category='data',
            description=None,
    ):
        """Generator yielding result records.

        Sequence of side effects: conflict check against configured
        siblings, OSF node creation, git-annex special-remote init (unless
        'gitonly'), and regular sibling configuration (unless 'exportonly').
        """
        ds = require_dataset(dataset,
                             purpose="create OSF remote",
                             check_installed=True)
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-osf",
            logger=lgr,
        )
        # we need an annex
        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(type="dataset",
                                  status="impossible",
                                  message="dataset has no annex",
                                  **res_kwargs)
            return

        # NOTES:
        # - we prob. should check osf-special-remote availability upfront to
        #   fail early
        # - add --recursive option
        #   - recursive won't work easily. Need to think that through.
        #   - would need a naming scheme for subdatasets
        #   - flat on OSF or a tree?
        #   - how do we detect something is there already, so we can skip
        #     rather than duplicate (with a new name)?
        #     osf-type-special-remote sufficient to decide it's not needed?
        # - adapt to conclusions in issue #30
        #   -> create those subcomponents
        # - results need to report URL for created projects suitable for
        #   datalad output formatting!
        #   -> result_renderer
        #   -> needs to ne returned by create_node

        if not storage_name:
            storage_name = "{}-storage".format(name)

        sibling_conflicts = sibling_exists(
            ds, [name, storage_name],
            # TODO pass through
            recursive=False, recursion_limit=None,
            # fail fast, if error is desired
            exhaustive=existing == 'error',
        )
        # NOTE(review): only existing='error' is acted upon here; there is no
        # visible branch that skips on conflict for existing='skip' -- confirm
        # whether 'skip' handling happens downstream or is missing
        if existing == 'error' and sibling_conflicts:
            # we only asked for one
            conflict = sibling_conflicts[0]
            yield get_status_dict(
                status='error',
                message=("a sibling '%s' is already configured in dataset %s",
                         conflict[1], conflict[0]),
                **res_kwargs,
            )
            return

        if title is None:
            # use dataset root basename
            title = ds.pathobj.name

        tags = ensure_list(tags)
        if 'DataLad dataset' not in tags:
            tags.append('DataLad dataset')
        if ds.id and ds.id not in tags:
            tags.append(ds.id)

        if not description:
            description = \
                "This component was built from a DataLad dataset using the " \
                "datalad-osf extension " \
                "(https://github.com/datalad/datalad-osf)."
            if mode != 'exportonly':
                description += \
                    " With this extension installed, this component can be " \
                    "git or datalad cloned from a 'osf://ID' URL, where " \
                    "'ID' is the OSF node ID that shown in the OSF HTTP " \
                    "URL, e.g. https://osf.io/q8xnk can be cloned from " \
                    "osf://q8xnk. "
        cred = get_credentials(allow_interactive=True)
        osf = OSF(**cred)
        node_id, node_url = create_node(
            osf_session=osf.session,
            title=title,
            category=category,
            tags=tags if tags else None,
            public=EnsureBool()(public),
            description=description,
        )
        if mode != 'gitonly':
            init_opts = ["encryption=none",
                         "type=external",
                         "externaltype=osf",
                         "autoenable=true",
                         "node={}".format(node_id)]

            if mode in ("export", "exportonly"):
                init_opts += ["exporttree=yes"]

            ds.repo.init_remote(storage_name, options=init_opts)
            if trust_level:
                ds.repo.call_git(['annex', trust_level, storage_name])

            yield get_status_dict(type="dataset",
                                  url=node_url,
                                  id=node_id,
                                  name=storage_name,
                                  status="ok",
                                  **res_kwargs)

        if mode == 'exportonly':
            return

        # append how to clone this specific dataset to the description
        description += "This particular project can be cloned using" \
                       " 'datalad clone osf://{}'".format(node_id)
        update_node(osf_session=osf.session, id_=node_id,
                    description=description)

        ds.config.set('remote.{}.annex-ignore'.format(name), 'true',
                      where='local')
        yield from ds.siblings(
            # use configure, not add, to not trip over the config that
            # we just made
            action='configure',
            name=name,
            url='osf://{}'.format(node_id),
            fetch=False,
            publish_depends=storage_name
            if mode != 'gitonly'
            else None,
            recursive=False,
            result_renderer=None,
        )

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        """Render create/add results with the node URL; defer everything
        else to the default renderer."""
        from datalad.ui import ui
        if res['action'] == "create-sibling-osf":
            msg = res.get('message', None)
            ui.message("{action}({status}): {url}{msg}".format(
                action=ac.color_word(res['action'], ac.BOLD),
                status=ac.color_status(res['status']),
                url=res.get('url', ''),
                msg=' [{}]'.format(msg[0] % msg[1:]
                                   if isinstance(msg, tuple)
                                   else res['message'])
                if msg else ''))
        elif res['action'] == "add-sibling-osf":
            ui.message("{action}({status})".format(
                action=ac.color_word(res['action'], ac.BOLD),
                status=ac.color_status(res['status'])))
        else:
            from datalad.interface.utils import default_result_renderer
            default_result_renderer(res)
class ContainersAdd(Interface):
    # first docstring line is used a short description in the cmdline help
    # the rest is put in the verbose help and manpage
    """Add a container to a dataset
    """

    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to add the container to. If no dataset
            is given, an attempt is made to identify the dataset based on the
            current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        name=Parameter(
            args=("name", ),
            doc="""The name to register the container under. This also
            determines the default location of the container image within the
            dataset.""",
            metavar="NAME",
            constraints=EnsureStr(),
        ),
        url=Parameter(
            args=("-u", "--url"),
            doc="""A URL (or local path) to get the container image from. If
            the URL scheme is one recognized by Singularity, 'shub://' or
            'docker://', the command format string will be auto-guessed when
            [CMD: --call-fmt CMD][PY: call_fmt PY] is not specified. For the
            scheme 'dhub://', the rest of the URL will be interpreted as the
            argument to 'docker pull', the image will be saved to the
            location specified by `name`, and the call format will be
            auto-guessed if not given.""",
            metavar="URL",
            constraints=EnsureStr() | EnsureNone(),
        ),
        # TODO: The "prepared command stuff should ultimately go somewhere else
        # (probably datalad-run). But first figure out, how exactly to address
        # container datasets
        call_fmt=Parameter(
            args=("--call-fmt", ),
            doc="""Command format string indicating how to execute a command
            in this container, e.g. "singularity exec {img} {cmd}". Where
            '{img}' is a placeholder for the path to the container image and
            '{cmd}' is replaced with the desired command. Additional
            placeholders: '{img_dspath}' is relative path to the dataset
            containing the image. """,
            metavar="FORMAT",
            constraints=EnsureStr() | EnsureNone(),
        ),
        image=Parameter(
            args=("-i", "--image"),
            doc="""Relative path of the container image within the dataset. If not
            given, a default location will be determined using the `name`
            argument.""",
            metavar="IMAGE",
            constraints=EnsureStr() | EnsureNone(),
        ),
        update=Parameter(
            args=("--update", ),
            action="store_true",
            doc="""Update the existing container for `name`. If no other
            options are specified, URL will be set to 'updateurl', if
            configured. If a container with `name` does not already exist,
            this option is ignored."""))

    @staticmethod
    @datasetmethod(name='containers_add')
    @eval_results
    def __call__(name, url=None, dataset=None, call_fmt=None, image=None,
                 update=False):
        """Generator yielding result records.

        Obtains the image (docker pull / singularity build / local copy /
        annex URL), records the container under
        ``datalad.containers.<name>.*`` in the dataset config, and saves
        image + config in a single save() at the end.
        """
        if not name:
            raise InsufficientArgumentsError("`name` argument is required")

        ds = require_dataset(dataset, check_installed=True,
                             purpose='add container')
        runner = Runner()

        # prevent madness in the config file
        if not re.match(r'^[0-9a-zA-Z-]+$', name):
            raise ValueError(
                "Container names can only contain alphanumeric characters "
                "and '-', got: '{}'".format(name))

        cfgbasevar = "datalad.containers.{}".format(name)
        if cfgbasevar + ".image" in ds.config:
            if not update:
                yield get_status_dict(
                    action="containers_add", ds=ds, logger=lgr,
                    status="impossible",
                    message=("Container named %r already exists. "
                             "Use --update to reconfigure.", name))
                return

            if not (url or image or call_fmt):
                # No updated values were provided. See if an update url is
                # configured (currently relevant only for Singularity Hub).
                url = ds.config.get(cfgbasevar + ".updateurl")
                if not url:
                    yield get_status_dict(
                        action="containers_add", ds=ds, logger=lgr,
                        status="impossible",
                        message="No values to update specified")
                    return

            # fall back on the stored configuration for anything not given
            call_fmt = call_fmt or ds.config.get(cfgbasevar + ".cmdexec")
            image = image or ds.config.get(cfgbasevar + ".image")

        if not image:
            loc_cfg_var = "datalad.containers.location"
            # TODO: We should provide an entry point (or sth similar) for
            # extensions to get config definitions into the ConfigManager. In
            # other words an easy way to extend definitions in datalad's
            # common_cfgs.py.
            container_loc = \
                ds.config.obtain(
                    loc_cfg_var,
                    where=definitions[loc_cfg_var]['destination'],
                    # if not False it would actually modify the
                    # dataset config file -- undesirable
                    store=False,
                    default=definitions[loc_cfg_var]['default'],
                    dialog_type=definitions[loc_cfg_var]['ui'][0],
                    valtype=definitions[loc_cfg_var]['type'],
                    **definitions[loc_cfg_var]['ui'][1]
                )
            image = op.join(ds.path, container_loc, name, 'image')
        else:
            image = op.join(ds.path, image)

        # template result record, mutated and yielded at the end (and on
        # error paths)
        result = get_status_dict(
            action="containers_add",
            path=image,
            type="file",
            logger=lgr,
        )

        if call_fmt is None:
            # maybe built in knowledge can help
            call_fmt = _guess_call_fmt(ds, name, url)

        # collect bits for a final and single save() call
        to_save = []
        imgurl = url
        was_updated = False
        if url:
            if update and op.lexists(image):
                was_updated = True
                # XXX: check=False is used to avoid dropping the image. It
                # should use drop=False if remove() gets such an option (see
                # DataLad's gh-2673).
                for r in ds.remove(image, save=False, check=False,
                                   return_type="generator"):
                    yield r

            imgurl = _resolve_img_url(url)
            lgr.debug('Attempt to obtain container image from: %s', imgurl)
            if url.startswith("dhub://"):
                from .adapters import docker

                docker_image = url[len("dhub://"):]

                lgr.debug("Running 'docker pull %s and saving image to %s",
                          docker_image, image)
                runner.run(["docker", "pull", docker_image])
                docker.save(docker_image, image)
            elif url.startswith("docker://"):
                image_dir, image_basename = op.split(image)
                if not image_basename:
                    raise ValueError("No basename in path {}".format(image))
                if image_dir and not op.exists(image_dir):
                    os.makedirs(image_dir)

                lgr.info("Building Singularity image for %s "
                         "(this may take some time)", url)
                runner.run(["singularity", "build", image_basename, url],
                           cwd=image_dir or None)
            elif op.exists(url):
                lgr.info("Copying local file %s to %s", url, image)
                image_dir = op.dirname(image)
                if image_dir and not op.exists(image_dir):
                    os.makedirs(image_dir)
                copyfile(url, image)
            else:
                try:
                    ds.repo.add_url_to_file(image, imgurl)
                except Exception as e:
                    # NOTE(review): on failure the shared `result` dict is
                    # yielded here as an error and, since the image won't
                    # exist, yielded again by the lexists() check below --
                    # confirm the duplicate report is intended
                    result["status"] = "error"
                    result["message"] = str(e)
                    yield result
            # TODO do we have to take care of making the image executable
            # if --call_fmt is not provided?
            to_save.append(image)
        # continue despite a remote access failure, the following config
        # setting will enable running the command again with just the name
        # given to ease a re-run
        if not op.lexists(image):
            result["status"] = "error"
            result["message"] = ('no image at %s', image)
            yield result
            return

        # store configs
        if imgurl != url:
            # store originally given URL, as it resolves to something
            # different and maybe can be used to update the container
            # at a later point in time
            ds.config.set("{}.updateurl".format(cfgbasevar), url)
        # force store the image, and prevent multiple entries
        ds.config.set(
            "{}.image".format(cfgbasevar),
            op.relpath(image, start=ds.path),
            force=True)
        if call_fmt:
            ds.config.set(
                "{}.cmdexec".format(cfgbasevar),
                call_fmt,
                force=True)
        # store changes
        to_save.append(op.join(".datalad", "config"))
        for r in ds.save(
                path=to_save,
                message="[DATALAD] {do} containerized environment '{name}'".format(
                    do="Update" if was_updated else "Configure",
                    name=name)):
            yield r
        result["status"] = "ok"
        yield result
class CheckDates(Interface):
    """Find repository dates that are more recent than a reference date.

    The main purpose of this tool is to find "leaked" real dates in
    repositories that are configured to use fake dates. It checks dates from
    three sources: (1) commit timestamps (author and committer dates), (2)
    timestamps within files of the "git-annex" branch, and (3) the timestamps
    of annotated tags.
    """
    # class-scope imports keep this interface self-contained when it is
    # loaded via datalad's command discovery machinery
    from datalad.interface.utils import eval_results
    import datalad.support.ansi_colors as ac
    from datalad.support.constraints import EnsureChoice, EnsureNone, EnsureStr
    from datalad.support.param import Parameter

    # rendering is delegated to custom_result_renderer() below
    result_renderer = "tailored"

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        """Like 'json_pp', but skip non-error results without flagged objects.
        """
        # FIXME: I think the proper way to do this is to use 'result_filter',
        # but I couldn't seem to get eval_results to detect the filter when I
        # used
        #
        # result_renderer = "json_pp"
        # result_filter = lambda x: ...
        #
        # Also, I want to keep the "message" key for errors.
        from datalad.ui import ui
        to_render = {}
        if res["status"] == "error":
            # errors are always rendered, including their message
            to_render = dict(res.items())
        elif "report" in res and res["report"]["objects"]:
            # only results that actually flagged objects are worth showing;
            # drop bookkeeping keys from the rendered output
            to_render = {k: v for k, v in res.items()
                         if k not in ["status", "message", "logger"]}
        if to_render:
            ui.message(json.dumps(to_render, sort_keys=True, indent=2))

    _params_ = dict(
        paths=Parameter(
            args=("paths", ),
            metavar="PATH",
            nargs="*",
            doc="""Root directory in which to search for Git repositories. The
            current working directory will be used by default.""",
            constraints=EnsureStr() | EnsureNone()),
        reference_date=Parameter(
            args=("-D", "--reference-date"),
            metavar="DATE",
            doc="""Compare dates to this date. If dateutil is installed, this
            value can be any format that its parser recognizes. Otherwise, it
            should be a unix timestamp that starts with a "@". The default
            value corresponds to 01 Jan, 2018 00:00:00 -0000.""",
            constraints=EnsureStr()),
        revs=Parameter(
            args=("--rev", ),
            dest="revs",
            action="append",
            metavar="REVISION",
            doc="""Search timestamps from commits that are reachable from [PY:
            these revisions PY][CMD: REVISION CMD]. Any revision specification
            supported by :command:`git log`, including flags like --all and
            --tags, can be used.[CMD: This option can be given multiple times.
            CMD]"""),
        annex=Parameter(
            args=("--annex", ),
            doc="""Mode for "git-annex" branch search. If 'all', all blobs
            within the branch are searched. 'tree' limits the search to blobs
            that are referenced by the tree at the tip of the branch. 'none'
            disables search of "git-annex" blobs.""",
            constraints=EnsureChoice("all", "tree", "none")),
        no_tags=Parameter(
            args=("--no-tags", ),
            action="store_true",
            doc="""Don't check the dates of annotated tags."""),
        older=Parameter(
            args=("--older", ),
            action="store_true",
            doc="""Find dates which are older than the reference date rather
            than newer."""),
    )

    @staticmethod
    @eval_results
    def __call__(paths,
                 reference_date="@1514764800",
                 revs=None,
                 annex="all",
                 no_tags=False,
                 older=False):
        # Yields one status dict per discovered repository; any repo whose
        # dates are flagged carries the details in the 'report' key.
        from datalad.support.repodates import check_dates

        which = "older" if older else "newer"

        try:
            ref_ts = _parse_date(reference_date)
        except ValueError as exc:
            lgr.error("Could not parse '%s' as a date", reference_date)
            yield get_status_dict("check_dates",
                                  status="error",
                                  message=exc_str(exc))
            return

        lgr.info("Searching for dates %s than %s",
                 which,
                 time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(ref_ts)))

        for repo in _git_repos(paths or ["."]):
            fullpath = os.path.abspath(repo)
            lgr.debug("Checking %s", fullpath)

            try:
                report = check_dates(repo,
                                     ref_ts,
                                     which=which,
                                     # default to searching all refs
                                     revs=revs or ["--all"],
                                     # map CLI choice onto the tri-state
                                     # expected by check_dates()
                                     annex={"all": True,
                                            "none": False,
                                            "tree": "tree"}[annex],
                                     tags=not no_tags)
            except InvalidGitRepositoryError as exc:
                # non-repos discovered under a given root are skipped,
                # not treated as errors
                lgr.warning("Skipping invalid Git repo: %s", repo)
                continue

            yield get_status_dict(
                "check_dates",
                status="ok",
                path=fullpath,
                message=("Found {} dates" if report["objects"]
                         else "No {} dates found").format(which),
                report=report)
class Save(Interface):
    """Save the current state of a dataset

    Saving the state of a dataset records changes that have been made to it.
    This change record is annotated with a user-provided description.
    Optionally, an additional tag, such as a version, can be assigned to the
    saved state. Such tag enables straightforward retrieval of past versions at
    a later point in time.

    .. note::
      Before Git v2.22, any Git repository without an initial commit located
      inside a Dataset is ignored, and content underneath it will be saved to
      the respective superdataset. DataLad datasets always have an initial
      commit, hence are not affected by this behavior.
    """
    # note above documents that out behavior is like that of `git add`, but
    # does not explicitly mention the connection to keep it simple.

    _examples_ = [
        dict(text="""Save any content underneath the current directory, without
             altering any potential subdataset""",
             code_py="save(path='.')",
             code_cmd="datalad save ."),
        dict(text="""Save specific content in the dataset""",
             code_py="save(path='myfile.txt')",
             code_cmd="datalad save myfile.txt"),
        dict(text="""Attach a commit message to save""",
             code_py="save(path='myfile.txt', message='add file')",
             code_cmd="datalad save -m 'add file' myfile.txt"),
        dict(text="""Save any content underneath the current directory, and
             recurse into any potential subdatasets""",
             code_py="save(path='.', recursive=True)",
             code_cmd="datalad save . -r"),
        dict(text="Save any modification of known dataset content in the "
                  "current directory, but leave untracked files (e.g. "
                  "temporary files) untouched",
             code_py="""save(path='.', updated=True)""",
             code_cmd="""datalad save -u ."""),
        dict(text="Tag the most recent saved state of a dataset",
             code_py="save(version_tag='bestyet')",
             code_cmd="datalad save --version-tag 'bestyet'"),
        dict(text="Save a specific change but integrate into last commit keeping "
                  "the already recorded commit message",
             code_py="save(path='myfile.txt', amend=True)",
             code_cmd="datalad save myfile.txt --amend")
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""specify the dataset to save""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path/name of the dataset component to save. If given, only
            changes made to those components are recorded in the new state.""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        message=save_message_opt,
        message_file=Parameter(
            args=("-F", "--message-file"),
            doc="""take the commit message from this file. This flag is
            mutually exclusive with -m.""",
            constraints=EnsureStr() | EnsureNone()),
        version_tag=Parameter(
            args=("-t", "--version-tag", ),
            metavar='ID',
            doc="""an additional marker for that state. Every dataset that
            is touched will receive the tag.""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        updated=Parameter(
            args=('-u', '--updated', ),
            action='store_true',
            doc="""if given, only saves previously tracked paths."""),
        to_git=Parameter(
            args=("--to-git", ),
            action='store_true',
            doc="""flag whether to add data directly to Git, instead of
            tracking data identity only. Use with caution, there is no
            guarantee that a file put directly into Git like this will not be
            annexed in a subsequent save operation. If not specified, it will
            be up to git-annex to decide how a file is tracked, based on a
            dataset's configuration to track particular paths, file types, or
            file sizes with either Git or git-annex.
            (see https://git-annex.branchable.com/tips/largefiles). """),
        jobs=jobs_opt,
        amend=Parameter(
            args=('--amend', ),
            action='store_true',
            doc="""if set, changes are not recorded in a new, separate
            commit, but are integrated with the changeset of the previous
            commit, and both together are recorded by replacing that
            previous commit. This is mutually exclusive with recursive
            operation.
            """),
    )

    @staticmethod
    @datasetmethod(name='save')
    @eval_results
    def __call__(
            path=None,
            message=None,
            dataset=None,
            version_tag=None,
            recursive=False,
            recursion_limit=None,
            updated=False,
            message_file=None,
            to_git=None,
            jobs=None,
            amend=False,
    ):
        # Generator: yields one result dict per saved path and one per
        # touched dataset (status 'ok' or 'notneeded').
        #
        # Raises ValueError for mutually exclusive argument combinations
        # (-m with -F, --amend with -r).
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        if amend and recursive:
            raise ValueError("Cannot amend a commit recursively.")

        path = ensure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   that a switch from --untracked=no to --untracked=all
        #   in Repo.save()

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from the either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='save')

        # use status() to do all discovery and annotation of paths
        # maps parent dataset path -> {Path: status-properties}
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as it,
                # and not a dataset instance in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                report_filetype=False,
                recursive=recursive,
                recursion_limit=recursion_limit,
                on_failure='ignore',
                # for save without recursion only commit matters
                eval_subdataset_state='full' if recursive else 'commit',
                result_renderer='disabled'):
            if s['status'] == 'error':
                # Downstream code can't do anything with these. Let the caller
                # decide their fate.
                yield s
                continue

            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in s.items()
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in dataset_hierarchies.items():
            edges = {}
            discover_dataset_trace_to_targets(
                rootds, children, [], edges, includeds=children)
            for superds, subdss in edges.items():
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    subds_path = ut.Path(subds)
                    sub_status = superds_status.get(subds_path, {})
                    if not (sub_status.get("state") == "clean" and
                            sub_status.get("type") == "dataset"):
                        # TODO actually start from an entry that may already
                        # exist in the status record
                        superds_status[subds_path] = dict(
                            # shot from the hip, some status config
                            # to trigger this specific super/sub
                            # relation to be saved
                            state='untracked',
                            type='dataset')
                paths_by_ds[superds] = superds_status

        def save_ds(args, version_tag=None):
            # Save a single dataset; `args` is a (dataset-path, path-status)
            # item from paths_by_ds. Yields per-path and per-dataset results.
            pdspath, paths = args

            pds = Dataset(pdspath)
            pds_repo = pds.repo
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location, this avoid
                # cumbersome symlink handling without context in the
                # lower levels
                pds_repo.pathobj / p.relative_to(pdspath): props
                for p, props in paths.items()}
            start_commit = pds_repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()) or \
                    (amend and message):
                for res in pds_repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        git=True if not hasattr(ds.repo, 'annexstatus')
                        else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status,
                        amend=amend):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = str(
                                # recode path back to dataset path anchor
                                pds.pathobj / res[k].relative_to(
                                    pds_repo.pathobj))
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                # 'notneeded' when the HEAD did not move at all
                status='ok'
                if start_commit != pds_repo.get_hexsha()
                else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                return
            try:
                # method requires str
                version_tag = str(version_tag)
                pds_repo.tag(version_tag)
                dsres.update(
                    status='ok',
                    version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    # TODO: we will get duplicate dataset/save record obscuring
                    # progress reporting. yoh thought to decouple "tag" from
                    # "save" messages but was worrying that original authors
                    # would disagree
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(
                    status='error',
                    message=('cannot tag this version: %s',
                             e.stderr.strip()))
                yield dsres

        if not paths_by_ds:
            # Special case: empty repo. There's either an empty commit only or
            # none at all. An empty one we can amend otherwise there's nothing
            # to do.
            if amend and ds.repo.get_hexsha():
                yield from save_ds(
                    (ds.pathobj, dict()), version_tag=version_tag)
            else:
                yield dict(action='save', type='dataset', path=ds.path,
                           refds=ds.path, status='notneeded',
                           logger=lgr)
            return

        # TODO: in principle logging could be improved to go not by a dataset
        # but by path(s) within subdatasets. That should provide a bit better
        # ETA and more "dynamic" feedback than jumpy datasets count.
        # See addurls where it is implemented that way by providing agg and
        # another log_filter
        yield from ProducerConsumerProgressLog(
            # deepest datasets first, so subdatasets are saved before their
            # superdatasets pick up the new subdataset state
            sorted(paths_by_ds.items(), key=lambda v: v[0], reverse=True),
            partial(save_ds, version_tag=version_tag),
            safe_to_consume=no_subds_in_futures,
            producer_future_key=lambda ds_items: ds_items[0],
            jobs=jobs,
            log_filter=_log_filter_save_dataset,
            unit="datasets",
            lgr=lgr,
        )
class ExportArchiveORA(Interface):
    """Export an archive of a local annex object store for the ORA remote.

    Keys in the local annex object store are reorganized in a temporary
    directory (using links to avoid storage duplication) to use the
    'hashdirlower' setup used by git-annex for bare repositories and the
    directory-type special remote. This alternative object store is then moved
    into a 7zip archive that is suitable for use in a ORA remote dataset
    store. Placing such an archive into::

      <dataset location>/archives/archive.7z

    Enables the ORA special remote to locate and retrieve all keys contained
    in the archive.
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to process.  If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        target=Parameter(
            args=("target",),
            metavar="TARGET",
            doc="""if an existing directory, an 'archive.7z' is placed into
            it, otherwise this is the path to the target archive""",
            constraints=EnsureStr() | EnsureNone()),
        remote=Parameter(
            args=("--for",),
            dest="remote",
            metavar='LABEL',
            doc="""name of the target sibling, wanted/preferred settings will
            be used to filter the files added to the archives""",
            constraints=EnsureStr() | EnsureNone()),
        annex_wanted=Parameter(
            args=("--annex-wanted",),
            metavar="FILTERS",
            doc="""git-annex-preferred-content expression for
            git-annex find to filter files. Should start with
            'or' or 'and' when used in combination with `--for`"""),
        froms=Parameter(
            args=("--from",),
            dest="froms",
            metavar="FROM",
            nargs="+",
            doc="""one or multiple tree-ish from which to select files"""),
        opts=Parameter(
            args=("opts",),
            nargs=REMAINDER,
            metavar="...",
            doc="""list of options for 7z to replace the default '-mx0' to
            generate an uncompressed archive"""),
        missing_content=Parameter(
            args=("--missing-content",),
            doc="""By default, any discovered file with missing content will
            result in an error and the export is aborted. Setting this to
            'continue' will issue warnings instead of failing on error. The
            value 'ignore' will only inform about problem at the 'debug' log
            level. The latter two can be helpful when generating a TAR archive
            from a dataset where some file content is not available
            locally.""",
            constraints=EnsureChoice("error", "continue", "ignore")),
    )

    @staticmethod
    @datasetmethod(name='export_archive_ora')
    @eval_results
    def __call__(
            target,
            opts=None,
            *,  # opts is positional but optional in CLI
            dataset=None,
            remote=None,
            annex_wanted=None,
            froms=None,
            missing_content='error',):
        # Yields a single result dict for the generated archive (or for the
        # reason nothing was generated). Raises IOError for missing key
        # content when missing_content='error'.

        # only non-bare repos have hashdirmixed, so require one
        ds = require_dataset(
            dataset, check_installed=True, purpose='export to ORA archive')
        ds_repo = ds.repo
        annex_objs = ds_repo.dot_git / 'annex' / 'objects'

        archive = resolve_path(target, dataset)
        if archive.is_dir():
            archive = archive / 'archive.7z'
        else:
            archive.parent.mkdir(exist_ok=True, parents=True)

        froms = ensure_list(froms)

        if not opts:
            # uncompressed by default
            opts = ['-mx0']

        res_kwargs = dict(
            action="export-archive-ora",
            logger=lgr,
        )

        if not annex_objs.is_dir():
            yield get_status_dict(
                ds=ds,
                status='notneeded',
                message='no annex keys present',
                **res_kwargs,
            )
            return

        exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ora_archive'
        if exportdir.exists():
            yield get_status_dict(
                ds=ds,
                status='error',
                message=(
                    'export directory already exists, please remove first: %s',
                    str(exportdir)),
                **res_kwargs,
            )
            return

        def expr_to_opts(expr):
            # translate a git-annex preferred-content expression into
            # 'git annex find' matching options (parens become -(/-),
            # terms become long options)
            opts = []
            expr = expr.replace('(', ' ( ').replace(')', ' ) ')
            for sub_expr in expr.split(' '):
                if len(sub_expr):
                    if sub_expr in '()':
                        opts.append(f"-{sub_expr}")
                    else:
                        opts.append(f"--{sub_expr}")
            return opts

        find_filters = []
        if remote:
            # honor the wanted/preferred-content settings of the target
            # sibling, grouped so a trailing annex_wanted filter composes
            find_filters = ['-('] + expr_to_opts(
                ds_repo.get_preferred_content('wanted', remote)) + ['-)']
        if annex_wanted:
            find_filters.extend(expr_to_opts(annex_wanted))

        # git-annex find results need to be uniqued with set, as git-annex
        # find will return duplicates if multiple symlinks point to the same
        # key.
        if froms:
            keypaths = set([
                annex_objs.joinpath(k)
                for treeish in froms
                for k in ds_repo.call_annex_items_([
                    'find', *find_filters,
                    f"--branch={treeish}",
                    "--format=${hashdirmixed}${key}/${key}\\n"])
            ])
        else:
            keypaths = set(
                annex_objs.joinpath(k)
                for k in ds_repo.call_annex_items_([
                    'find', *find_filters,
                    "--format=${hashdirmixed}${key}/${key}\\n"
                ]))

        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Start ORA archive export %s', ds,
            total=len(keypaths),
            label='ORA archive export',
            unit=' Keys',
        )

        if missing_content == 'continue':
            missing_file_lgr_func = lgr.warning
        elif missing_content == 'ignore':
            missing_file_lgr_func = lgr.debug
        # in 'error' mode no logger function is needed: missing content
        # raises before the function would be used

        link_fx = os.link
        for keypath in keypaths:
            key = keypath.name
            hashdir = op.join(keypath.parts[-4], keypath.parts[-3])
            log_progress(
                lgr.info,
                'oraarchiveexport',
                'Export key %s to %s', key, hashdir,
                update=1,
                increment=True)
            keydir = exportdir / hashdir / key
            keydir.mkdir(parents=True, exist_ok=True)
            try:
                link_fx(str(keypath), str(keydir / key))
            except FileNotFoundError as e:
                if missing_content == 'error':
                    # chain the original exception for debuggability
                    raise IOError(
                        'Key %s has no content available' % keypath) from e
                missing_file_lgr_func(
                    'Key %s has no content available', str(keypath))
            except OSError:
                lgr.warning(
                    'No hard links supported at %s, will copy files instead',
                    str(keypath))
                # no hard links supported
                # switch function after first error
                link_fx = shutil.copyfile
                link_fx(str(keypath), str(keydir / key))

        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Finished RIA archive export from %s', ds
        )
        try:
            # check=True makes a failing 7z raise CalledProcessError;
            # previously a non-zero exit was silently reported as 'ok'
            subprocess.run(
                ['7z', 'u', str(archive), '.'] + opts,
                cwd=str(exportdir),
                check=True,
            )
            yield get_status_dict(
                path=str(archive),
                type='file',
                status='ok',
                **res_kwargs)
        except Exception as e:
            ce = CapturedException(e)
            yield get_status_dict(
                path=str(archive),
                type='file',
                status='error',
                message=('7z failed: %s', ce),
                exception=ce,
                **res_kwargs)
            return
        finally:
            # always clean up the temporary export store
            rmtree(str(exportdir))
class ExportArchive(Interface):
    """Export the content of a dataset as a TAR/ZIP archive.
    """
    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import (
        EnsureChoice,
        EnsureNone,
        EnsureStr,
    )

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            # fixed a stray fourth quote that put a literal '"' at the
            # start of the rendered help text
            doc="""specify the dataset to export. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        filename=Parameter(
            args=("filename", ),
            metavar="PATH",
            nargs='?',
            doc="""File name of the generated TAR archive. If no file name is
            given the archive will be generated in the current directory and
            will be named: datalad_<dataset_uuid>.(tar.*|zip). To generate
            that file in a different directory, provide an existing directory
            as the file name.""",
            constraints=EnsureStr() | EnsureNone()),
        archivetype=Parameter(
            args=("-t", "--archivetype"),
            doc="""Type of archive to generate.""",
            constraints=EnsureChoice("tar", "zip")),
        compression=Parameter(
            args=("-c", "--compression"),
            doc="""Compression method to use. 'bz2' is not supported for ZIP
            archives. No compression is used when an empty string is
            given.""",
            constraints=EnsureChoice("gz", "bz2", "")),
        missing_content=Parameter(
            args=("--missing-content", ),
            doc="""By default, any discovered file with missing content will
            result in an error and the export is aborted. Setting this to
            'continue' will issue warnings instead of failing on error. The
            value 'ignore' will only inform about problem at the 'debug' log
            level. The latter two can be helpful when generating a TAR archive
            from a dataset where some file content is not available
            locally.""",
            constraints=EnsureChoice("error", "continue", "ignore")),
    )

    @staticmethod
    @datasetmethod(name='export_archive')
    @eval_results
    def __call__(dataset, filename=None, archivetype='tar', compression='gz',
                 missing_content='error'):
        # Yields a single 'ok' result with the absolute path of the generated
        # archive. Raises IOError for annexed files without local content when
        # missing_content='error'.
        import os
        import tarfile
        import zipfile
        from unittest.mock import patch
        from os.path import join as opj, dirname, normpath, isabs
        import os.path as op

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo

        import logging
        lgr = logging.getLogger('datalad.plugin.export_archive')

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit when any particular
            # file was changed -- would be the most kosher yoh thinks to the
            # degree of our abilities
            ti.mtime = committed_date
            return ti
        tar_args = dict(recursive=False, filter=_filter_tarinfo)

        # e.g. '.tar.gz', '.tar' (empty compression), or '.zip'
        file_extension = '.{}{}'.format(
            archivetype,
            '{}{}'.format(
                '.' if compression else '',
                compression) if archivetype == 'tar' else '')

        default_filename = "datalad_{.id}".format(dataset)
        if filename is None:
            filename = default_filename  # in current directory
        elif op.exists(filename) and op.isdir(filename):
            # BUG FIX: the original used `path.exists`/`path.isdir`/
            # `path.join` although no name `path` is bound in this function
            # (only `os.path as op` and selected helpers are imported),
            # which would raise NameError on this branch; use the local
            # imports consistently instead
            filename = opj(filename, default_filename)  # under given dir
        if not filename.endswith(file_extension):
            filename += file_extension

        root = dataset.path
        # use dir inside matching the output filename
        # TODO: could be an option to the export plugin allowing empty value
        # for no leading dir
        leading_dir = file_basename(filename)

        # workaround for inability to pass down the time stamp
        with patch('time.time', return_value=committed_date), \
                tarfile.open(filename, "w:{}".format(compression)) \
                if archivetype == 'tar' \
                else zipfile.ZipFile(
                    filename, 'w',
                    zipfile.ZIP_STORED if not compression
                    else zipfile.ZIP_DEFLATED) \
                as archive:
            add_method = archive.add if archivetype == 'tar' else archive.write
            repo_files = sorted(repo.get_indexed_files())
            if isinstance(repo, AnnexRepo):
                annexed = repo.is_under_annex(
                    repo_files, allow_quick=True, batch=True)
                # remember: returns False for files in Git!
                has_content = repo.file_has_content(
                    repo_files, allow_quick=True, batch=True)
            else:
                annexed = [False] * len(repo_files)
                has_content = [True] * len(repo_files)
            for i, rpath in enumerate(repo_files):
                fpath = opj(root, rpath)
                if annexed[i]:
                    if not has_content[i]:
                        if missing_content in ('ignore', 'continue'):
                            (lgr.warning
                             if missing_content == 'continue'
                             else lgr.debug)(
                                'File %s has no content available, skipped',
                                fpath)
                            continue
                        else:
                            raise IOError(
                                'File %s has no content available' % fpath)

                    # resolve to possible link target
                    if op.islink(fpath):
                        link_target = os.readlink(fpath)
                        if not isabs(link_target):
                            link_target = normpath(
                                opj(dirname(fpath), link_target))
                        fpath = link_target
                # name in the archive
                aname = normpath(opj(leading_dir, rpath))
                add_method(
                    fpath,
                    arcname=aname,
                    **(tar_args if archivetype == 'tar' else {}))

        if not isabs(filename):
            filename = opj(os.getcwd(), filename)
        yield dict(
            status='ok',
            path=filename,
            type='file',
            action='export_archive',
            logger=lgr)
class AggregateMetaData(Interface):
    """Aggregate metadata of one or more datasets for later query.

    Metadata aggregation refers to a procedure that extracts metadata present
    in a dataset into a portable representation that is stored a single
    standardized format. Moreover, metadata aggregation can also extract
    metadata in this format from one dataset and store it in another
    (super)dataset. Based on such collections of aggregated metadata it is
    possible to discover particular datasets and specific parts of their
    content, without having to obtain the target datasets first (see the
    DataLad 'search' command).

    To enable aggregation of metadata that are contained in files of a
    dataset, one has to enable one or more metadata extractor for a dataset.
    DataLad supports a number of common metadata standards, such as the
    Exchangeable Image File Format (EXIF), Adobe's Extensible Metadata
    Platform (XMP), and various audio file metadata systems like ID3. In
    addition, a number of scientific metadata standards are supported, like
    DICOM, BIDS, or datacite. Some metadata extractors depend on particular
    3rd-party software. The list of metadata extractors available to a
    particular DataLad installation is reported by the 'wtf' plugin
    ('datalad wtf').

    Enabling a metadata extractor for a dataset is done by adding its name to
    the 'datalad.metadata.nativetype' configuration variable -- typically in
    the dataset's configuration file (.datalad/config), e.g.::

      [datalad "metadata"]
        nativetype = exif
        nativetype = xmp

    Enabling multiple extractors is supported. In this case, metadata are
    extracted by each extractor individually, and stored alongside each other.
    Metadata aggregation will also extract DataLad's own metadata (extractor
    'datalad_core').

    Metadata aggregation can be performed recursively, in order to aggregate
    all metadata across all subdatasets, for example, to be able to search
    across any content in any dataset of a collection. Aggregation can also be
    performed for subdatasets that are not available locally. In this case,
    pre-aggregated metadata from the closest available superdataset will be
    considered instead.

    Depending on the versatility of the present metadata and the number of
    dataset or files, aggregated metadata can grow prohibitively large. A
    number of configuration switches are provided to mitigate such issues.

    datalad.metadata.aggregate-content-<extractor-name>
      If set to false, content metadata aggregation will not be performed for
      the named metadata extractor (a potential underscore '_' in the
      extractor name must be replaced by a dash '-'). This can substantially
      reduce the runtime for metadata extraction, and also reduce the size of
      the generated metadata aggregate. Note, however, that some extractors
      may not produce any metadata when this is disabled, because their
      metadata might come from individual file headers only.
      'datalad.metadata.store-aggregate-content' might be a more appropriate
      setting in such cases.

    datalad.metadata.aggregate-ignore-fields
      Any metadata key matching any regular expression in this configuration
      setting is removed prior to generating the dataset-level metadata
      summary (keys and their unique values across all dataset content), and
      from the dataset metadata itself. This switch can also be used to filter
      out sensitive information prior aggregation.

    datalad.metadata.generate-unique-<extractor-name>
      If set to false, DataLad will not auto-generate a summary of unique
      content metadata values for a particular extractor as part of the
      dataset-global metadata (a potential underscore '_' in the extractor
      name must be replaced by a dash '-'). This can be useful if such a
      summary is bloated due to minor uninformative (e.g. numerical)
      differences, or when a particular extractor already provides a carefully
      designed content metadata summary.

    datalad.metadata.maxfieldsize
      Any metadata value that exceeds the size threshold given by this
      configuration setting (in bytes/characters) is removed.

    datalad.metadata.store-aggregate-content
      If set, extracted content metadata are still used to generate a
      dataset-level summary of present metadata (all keys and their unique
      values across all files in a dataset are determined and stored as part
      of the dataset-level metadata aggregate, see
      datalad.metadata.generate-unique-<extractor-name>), but metadata on
      individual files are not stored. This switch can be used to avoid
      prohibitively large metadata files. Discovery of datasets containing
      content matching particular metadata properties will still be possible,
      but such datasets would have to be obtained first in order to discover
      which particular files in them match these properties.
    """
    _params_ = dict(
        # TODO add option to not update aggregated data/info in intermediate
        # datasets
        # TODO add option for full aggregation (not incremental), so when
        # something is not present nothing about it is preserved in the
        # aggregated metadata
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""topmost dataset metadata will be aggregated into. All
            dataset between this dataset and any given path will receive
            updated aggregated metadata from all given paths.""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path to datasets that shall be aggregated. When a given
            path is pointing into a dataset, the metadata of the containing
            dataset will be aggregated.""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        save=nosave_opt,
    )

    @staticmethod
    @datasetmethod(name='aggregate_metadata')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            save=True):
        # Generator: yields one result per annotated path/aggregated dataset,
        # plus results from the final save of modified metadata files.
        refds_path = Interface.get_refds_path(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(
            dataset, check_installed=True, purpose='metadata aggregation')
        path = assure_list(path)
        # always include the reference dataset
        path.append(ds.path)

        # accumulates aggregation state per dataset path
        agginfo_db = {}
        to_save = []
        to_aggregate = set()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='aggregate_metadata',
                # uninstalled subdatasets could be queried via aggregated
                # metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            ap_type = ap.get('type', None)
            ap_state = ap.get('state', None)
            assert('parentds' in ap or ap_type == 'dataset')
            if ap_type == 'dataset' and ap_state != 'absent':
                # a present dataset, we can take directly from it
                aggsrc = ap['path']
                lgr.info('Aggregate metadata for dataset %s', aggsrc)
            else:
                # everything else needs to come from the parent
                aggsrc = ap['parentds']
                if ap_state == 'absent':
                    lgr.info(
                        'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                        ap['path'],
                        aggsrc)
                else:
                    lgr.info(
                        'Aggregate metadata for %s from dataset at %s',
                        ap['path'],
                        aggsrc)

            to_aggregate.add(aggsrc)

            if ap_state == 'absent':
                # key thought: recursive is done by path annotation, hence
                # once we hit an absent dataset, we are 100% certain that
                # there is nothing to recurse into on the file system
                # hence we only have to look into the aggregated metadata
                # of the last available dataset in the dataset tree edge
                #
                # if there is nothing at this path, we need to look into the
                # parentds and check if we know anything about this path
                # if we do, we need to grab all the info and objects
                # if not, we need to error
                res = _get_dsinfo_from_aggmetadata(
                    aggsrc, ap['path'], recursive, agginfo_db)
                if not isinstance(res, list):
                    yield get_status_dict(
                        status='impossible',
                        message=res,
                        action='aggregate_metadata',
                        path=ap['path'],
                        logger=lgr)
                    continue
                # cue for aggregation
                to_aggregate.update(res)
            else:
                # actually aggregate metadata for this dataset, immediately
                # place generated objects into the aggregated or reference
                # dataset, and put info into DB to get the distributed to all
                # datasets that need to be updated
                errored = _extract_metadata(
                    ds,
                    Dataset(aggsrc),
                    agginfo_db,
                    to_save)
                if errored:
                    yield get_status_dict(
                        status='error',
                        message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                        action='aggregate_metadata',
                        path=aggsrc,
                        logger=lgr)

        # at this point we have dumped all aggregated metadata into object
        # files somewhere, we know what needs saving, but having saved
        # anything, and we know about the states of all aggregated dataset in
        # the DB
        # what remains to do is to update all dataset, so they have there own
        # copy of aggregated metadata and update their respective
        # aggregate.json with info on what states we just aggregated from

        # first, let's figure out what dataset need updating at all
        # get adjencency info of the dataset tree spanning the base to all
        # leaf dataset associated with the path arguments
        ds_adj = {}
        discover_dataset_trace_to_targets(
            ds.path, to_aggregate, [], ds_adj,
            # we know that to_aggregate only lists datasets, existing and
            # absent ones -- we want to aggregate all of them, either from
            # just extracted metadata, or from previously aggregated metadata
            # of the closest superdataset
            includeds=to_aggregate)
        # TODO we need to work in the info about dataset that we only got from
        # aggregated metadata, that had no trace on the file system in here!!
        subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)

        # go over datasets in bottom-up fashion
        for parentds_path in sorted(subtrees, reverse=True):
            lgr.info('Update aggregate metadata in dataset at: %s',
                     parentds_path)

            _update_ds_agginfo(
                ds.path,
                parentds_path,
                subtrees[parentds_path],
                agginfo_db,
                to_save)
            # update complete
            res = get_status_dict(
                status='ok',
                action='aggregate_metadata',
                path=parentds_path,
                type='dataset',
                logger=lgr)
            res.update(agginfo_db.get(parentds_path, {}))
            yield res

        #
        # save potential modifications to dataset global metadata
        #
        if not to_save:
            return
        lgr.info('Attempting to save %i files/datasets', len(to_save))
        for res in Save.__call__(
                path=to_save,
                dataset=refds_path,
                message='[DATALAD] dataset aggregate metadata update',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
class Create(Interface):
    """Create a new dataset from scratch.

    This command initializes a new :term:`dataset` at a given location, or the
    current directory. The new dataset can optionally be registered in an
    existing :term:`superdataset` (the new dataset's path needs to be located
    within the superdataset for that, and the superdataset needs to be given
    explicitly). It is recommended to provide a brief description to label
    the dataset's nature *and* location, e.g. "Michael's music on black
    laptop". This helps humans to identify data locations in distributed
    scenarios. By default an identifier comprised of user and machine name,
    plus path will be generated.

    This command only creates a new dataset, it does not add any content to
    it, even if the target directory already contains additional files or
    directories.

    Plain Git repositories can be created via the [PY: `no_annex` PY][CMD:
    --no-annex CMD] flag. However, the result will not be a full dataset, and,
    consequently, not all features are supported (e.g. a description).

    || REFLOW >>
    To create a local version of a remote dataset use the
    :func:`~datalad.api.install` command instead.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git init` and
      :command:`git annex init` to prepare the new dataset. Registering to a
      superdataset is performed via a :command:`git submodule add` operation
      in the discovered superdataset.
    """
    # in general this command will yield exactly one result
    return_type = 'item-or-list'
    # in general users expect to get an instance of the created dataset
    result_xfm = 'datasets'
    # result filter
    result_filter = EnsureKeyChoice('action', ('create',)) & \
        EnsureKeyChoice('status', ('ok', 'notneeded'))

    _params_ = dict(
        path=Parameter(
            args=("path",),
            metavar='PATH',
            doc="""path where the dataset shall be created, directories
            will be created as necessary. If no location is provided, a
            dataset will be created in the current working directory. Either
            way the command will error if the target directory is not empty.
            Use `force` to create a dataset in a non-empty directory.""",
            nargs='?',
            # put dataset 2nd to avoid useless conversion
            constraints=EnsureStr() | EnsureDataset() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='PATH',
            doc="""specify the dataset to perform the create operation on. If
            a dataset is given, a new subdataset will be created in it.""",
            constraints=EnsureDataset() | EnsureNone()),
        force=Parameter(
            args=("-f", "--force",),
            doc="""enforce creation of a dataset in a non-empty directory""",
            action='store_true'),
        description=location_description,
        # TODO could move into cfg_annex plugin
        no_annex=Parameter(
            args=("--no-annex",),
            doc="""if set, a plain Git repository will be created without any
            annex""",
            action='store_true'),
        text_no_annex=Parameter(
            args=("--text-no-annex",),
            doc="""if set, all text files in the future would be added to Git,
            not annex. Achieved by adding an entry to `.gitattributes` file.
            See http://git-annex.branchable.com/tips/largefiles/ and
            `no_annex` DataLad plugin to establish even more detailed control
            over which files are placed under annex control.""",
            action='store_true'),
        save=nosave_opt,
        # TODO could move into cfg_annex plugin
        annex_version=Parameter(
            args=("--annex-version",),
            doc="""select a particular annex repository version. The
            list of supported versions depends on the available git-annex
            version. This should be left untouched, unless you know what
            you are doing""",
            constraints=EnsureDType(int) | EnsureNone()),
        # TODO could move into cfg_annex plugin
        annex_backend=Parameter(
            args=("--annex-backend",),
            constraints=EnsureStr() | EnsureNone(),
            # not listing choices here on purpose to avoid future bugs
            doc="""set default hashing backend used by the new dataset.
            For a list of supported backends see the git-annex
            documentation. The default is optimized for maximum compatibility
            of datasets across platforms (especially those with limited
            path lengths)"""),
        # TODO could move into cfg_metadata plugin
        native_metadata_type=Parameter(
            args=('--native-metadata-type',),
            metavar='LABEL',
            action='append',
            constraints=EnsureStr() | EnsureNone(),
            doc="""Metadata type label. Must match the name of the respective
            parser implementation in DataLad (e.g. "xmp").[CMD:  This option
            can be given multiple times CMD]"""),
        # TODO could move into cfg_access/permissions plugin
        shared_access=shared_access_opt,
        git_opts=git_opts,
        annex_opts=annex_opts,
        annex_init_opts=annex_init_opts,
        fake_dates=Parameter(
            args=('--fake-dates',),
            action='store_true',
            doc="""Configure the repository to use fake dates. The date for a
            new commit will be set to one second later than the latest commit
            in the repository. This can be used to anonymize dates."""),
    )

    @staticmethod
    @datasetmethod(name='create')
    @eval_results
    def __call__(
            path=None,
            force=False,
            description=None,
            dataset=None,
            no_annex=False,
            save=True,
            annex_version=None,
            annex_backend='MD5E',
            native_metadata_type=None,
            shared_access=None,
            git_opts=None,
            annex_opts=None,
            annex_init_opts=None,
            text_no_annex=None,
            fake_dates=False):
        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD
        if path and dataset:
            # Given a path and a dataset (path) not pointing to installed
            # dataset
            if not dataset.is_installed():
                msg = "No installed dataset at %s found." % dataset.path
                dsroot = get_dataset_root(dataset.path)
                if dsroot:
                    msg += " If you meant to add to the %s dataset, use that path " \
                           "instead but remember that if dataset is provided, " \
                           "relative paths are relative to the top of the " \
                           "dataset." % dsroot
                raise ValueError(msg)

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            # a plain Git repo cannot carry an annex description or any
            # annex options -- reject contradictory argument combinations
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex and declaring no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex init and declaring no "
                                 "annex repo.")

        if not isinstance(force, bool):
            # guard against the common mistake of passing a path as the
            # second positional argument
            raise ValueError(
                "force should be bool, got %r. Did you mean to provide a 'path'?" % force)
        annotated_paths = AnnotatePaths.__call__(
            # nothing given explicitly, assume create fresh right here
            path=path if path else getpwd() if dataset is None else None,
            dataset=dataset,
            recursive=False,
            action='create',
            # we need to know whether we have to check for potential
            # subdataset collision
            force_parentds_discovery=True,
            # it is absolutely OK to have something that does not exist
            unavailable_path_status='',
            unavailable_path_msg=None,
            # if we have a dataset given that actually exists, we want to
            # fail if the requested path is not in it
            nondataset_path_status='error' \
            if isinstance(dataset, Dataset) and dataset.is_installed() else '',
            on_failure='ignore')
        path = None
        for r in annotated_paths:
            if r['status']:
                # this is dealt with already
                yield r
                continue
            if path is not None:
                raise ValueError(
                    "`create` can only handle single target path or dataset")
            path = r

        if len(annotated_paths) and path is None:
            # we got something, we complained already, done
            return

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # prep for yield
        path.update({'logger': lgr, 'type': 'dataset'})
        # just discard, we have a new story to tell
        path.pop('message', None)
        if 'parentds' in path:
            # creating inside an existing dataset -- refuse if the location
            # is already registered as a subdataset there
            subs = Subdatasets.__call__(
                dataset=path['parentds'],
                # any known
                fulfilled=None,
                recursive=False,
                contains=path['path'],
                result_xfm='relpaths')
            if len(subs):
                path.update({
                    'status': 'error',
                    'message': ('collision with known subdataset %s/ in dataset %s',
                                subs[0], path['parentds'])})
                yield path
                return

        # TODO here we need a further test that if force=True, we need to look if
        # there is a superdataset (regardless of whether we want to create a
        # subdataset or not), and if that superdataset tracks anything within
        # this directory -- if so, we need to stop right here and whine, because
        # the result of creating a repo here will produce an undesired mess

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and dataset.path == path['path'] \
            else Dataset(path['path'])

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            path.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`force` option to ignore'})
            yield path
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = []

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(
                tbds.path, url=None, create=True,
                git_opts=git_opts,
                fake_dates=fake_dates)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path, url=None, create=True,
                backend=annex_backend,
                version=annex_version,
                description=description,
                git_opts=git_opts,
                annex_opts=annex_opts,
                annex_init_opts=annex_init_opts,
                fake_dates=fake_dates)

            if text_no_annex:
                attrs = tbrepo.get_gitattributes('.')
                # some basic protection against useless duplication
                # on rerun with --force
                if not attrs.get('.', {}).get(
                        'annex.largefiles', None) == '(not(mimetype=text/*))':
                    tbrepo.set_gitattributes([
                        ('*', {'annex.largefiles': '(not(mimetype=text/*))'})])
                    add_to_git.append('.gitattributes')

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(
            id_var,
            tbds.id if tbds.id is not None else uuid_id,
            where='dataset')

        add_to_git.append('.datalad')

        # make sure that v6 annex repos never commit content under .datalad
        attrs_cfg = (
            ('config', 'annex.largefiles', 'nothing'),
            ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
            ('metadata/objects/**', 'annex.largefiles',
             '({})'.format(cfg.obtain(
                 'datalad.metadata.create-aggregate-annex-limit'))))
        attrs = tbds.repo.get_gitattributes(
            [op.join('.datalad', i[0]) for i in attrs_cfg])
        set_attrs = []
        for p, k, v in attrs_cfg:
            # only (re)write attributes that are not already set as desired
            if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v:
                set_attrs.append((p, {k: v}))
        if set_attrs:
            tbds.repo.set_gitattributes(
                set_attrs,
                attrfile=op.join('.datalad', '.gitattributes'))
            add_to_git.append('.datalad')

        # prevent git annex from ever annexing .git* stuff (gh-1597)
        attrs = tbds.repo.get_gitattributes('.git')
        if not attrs.get('.git', {}).get('annex.largefiles', None) == 'nothing':
            tbds.repo.set_gitattributes([
                ('**/.git*', {'annex.largefiles': 'nothing'})])
            add_to_git.append('.gitattributes')

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.add(
            add_to_git,
            to_git=True,
            save=save,
            message='[DATALAD] new dataset')

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(dataset, Dataset) and dataset.path != tbds.path \
                and tbds.repo.get_hexsha():
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(
                    tbds.path,
                    save=save,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r

        path.update({'status': 'ok'})
        yield path

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        # concise, human-oriented rendering of the single 'create' result
        from datalad.ui import ui
        if res.get('action', None) == 'create' and \
                res.get('status', None) == 'ok' and \
                res.get('type', None) == 'dataset':
            ui.message("Created dataset at {}.".format(res['path']))
        else:
            ui.message("Nothing was created")
class Clone(Interface):
    """Obtain a dataset copy from a URL or local source (path)

    The purpose of this command is to obtain a new clone (copy) of a dataset
    and place it into a not-yet-existing or empty directory. As such `clone`
    provides a strict subset of the functionality offered by `install`. Only
    a single dataset can be obtained, recursion is not supported. However,
    once installed, arbitrary dataset components can be obtained via a
    subsequent `get` command.

    Primary differences over a direct `git clone` call are 1) the automatic
    initialization of a dataset annex (pure Git repositories are equally
    supported); 2) automatic registration of the newly obtained dataset as a
    subdataset (submodule), if a parent dataset is specified; 3) support
    for datalad's resource identifiers and automatic generation of
    alternative access URL for common cases (such as appending '.git' to the
    URL in case accessing the base URL failed); and 4) ability to take
    additional alternative source locations as an argument.
    """
    # by default ignore everything but install results
    # i.e. no "add to super dataset"
    result_filter = EnsureKeyChoice('action', ('install',))

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""(parent) dataset to clone into. If given, the newly cloned
            dataset is registered as a subdataset of the parent. Also, if
            given, relative paths are interpreted as being relative to the
            parent dataset, and not relative to the working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        source=Parameter(
            args=("source",),
            metavar='SOURCE',
            doc="""URL, DataLad resource identifier, local path or instance
            of dataset to be cloned""",
            constraints=EnsureStr() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar='PATH',
            nargs="?",
            doc="""path to clone into. If no `path` is provided a
            destination path will be derived from a source URL
            similar to :command:`git clone`"""),
        description=location_description,
        reckless=reckless_opt,
        alt_sources=Parameter(
            args=('--alternative-sources',),
            dest='alt_sources',
            metavar='SOURCE',
            nargs='+',
            doc="""Alternative sources to be tried if a dataset cannot
            be obtained from the main `source`""",
            constraints=EnsureStr() | EnsureNone()),
        # TODO next ones should be there, but cannot go anywhere
        # git_opts=git_opts,
        # git_clone_opts=git_clone_opts,
        # annex_opts=annex_opts,
        # annex_init_opts=annex_init_opts,
    )

    @staticmethod
    @datasetmethod(name='clone')
    @eval_results
    def __call__(
            source,
            path=None,
            dataset=None,
            description=None,
            reckless=False,
            alt_sources=None):
        # TODO next ones should be there, but cannot go anywhere
        # git_opts=None,
        # git_clone_opts=None,
        # annex_opts=None,
        # annex_init_opts=None

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        dataset = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = dataset.path if dataset else None

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `add`".format(
                    path))

        if path is not None:
            path = resolve_path(path, dataset)

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source_url = source
        source_ = _get_git_url_from_source(source)
        lgr.debug("Resolved clone source from '%s' to '%s'",
                  source, source_)
        source = source_

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert (path is not None)

        destination_dataset = Dataset(path)
        dest_path = path

        status_kwargs = dict(
            action='install',
            ds=destination_dataset,
            logger=lgr,
            refds=refds_path,
            source_url=source_url)

        # important test! based on this `rmtree` will happen below after
        # failed clone
        if exists(dest_path) and listdir(dest_path):
            if destination_dataset.is_installed():
                # check if dest was cloned from the given source before
                # this is where we would have installed this from
                guessed_sources = _get_flexible_source_candidates(
                    source, dest_path)
                # this is where it was actually installed from
                track_name, track_url = _get_tracking_source(
                    destination_dataset)
                if track_url in guessed_sources or \
                        get_local_file_url(track_url) in guessed_sources:
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destination_dataset,
                                 source),
                        **status_kwargs)
                    return
            # anything else is an error
            yield get_status_dict(
                status='error',
                message='target path already exists and not empty, refuse to clone into target path',
                **status_kwargs)
            return

        if dataset is not None and relpath(
                path, start=dataset.path).startswith(pardir):
            yield get_status_dict(
                status='error',
                message=("clone target path '%s' not in specified target dataset '%s'",
                         path, dataset),
                **status_kwargs)
            return

        # generate candidate URLs from source argument to overcome a few
        # corner cases and hopefully be more robust than git clone
        candidate_sources = []
        # combine all given sources (incl. alternatives), maintain order
        for s in [source] + assure_list(alt_sources):
            candidate_sources.extend(_get_flexible_source_candidates(s))
        lgr.info("Cloning %s to '%s'",
                 source, dest_path)
        for isource_, source_ in enumerate(candidate_sources):
            try:
                lgr.debug(
                    "Attempting to clone %s (%d out of %d candidates) to '%s'",
                    source_, isource_ + 1, len(candidate_sources), dest_path)
                GitRepo.clone(path=dest_path, url=source_, create=True)
                break  # do not bother with other sources if succeeded
            except GitCommandError as e:
                lgr.debug("Failed to clone from URL: %s (%s)",
                          source_, exc_str(e))
                if exists(dest_path):
                    # leave no half-cloned state behind before trying the
                    # next candidate
                    lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                              dest_path)
                    rmtree(dest_path)
                if 'could not create work tree' in e.stderr.lower():
                    # this cannot be fixed by trying another URL
                    yield get_status_dict(
                        status='error',
                        message=re.match(
                            r".*fatal: (.*)\n",
                            e.stderr,
                            flags=re.MULTILINE | re.DOTALL).group(1),
                        **status_kwargs)
                    return

        if not destination_dataset.is_installed():
            yield get_status_dict(
                status='error',
                message=("Failed to clone data from any candidate source URL: %s",
                         candidate_sources),
                **status_kwargs)
            return

        if dataset is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(
                    dest_path,
                    save=True,
                    ds2super=True,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r

        _handle_possible_annex_dataset(
            destination_dataset,
            reckless,
            description=description)

        # yield successful clone of the base dataset now, as any possible
        # subdataset clone down below will not alter the Git-state of the
        # parent
        yield get_status_dict(status='ok', **status_kwargs)
class CopyFile(Interface):
    """Copy files and their availability metadata from one dataset to another.

    The difference to a system copy command is that here additional content
    availability information, such as registered URLs, is also copied to the
    target dataset. Moreover, potentially required git-annex special remote
    configurations are detected in a source dataset and are applied to a
    target dataset in an analogous fashion. It is possible to copy a file
    for which no content is available locally, by just copying the required
    metadata on content identity and availability.

    .. note::
      At the moment, only URLs for the special remotes 'web' (git-annex
      built-in) and 'datalad' are recognized and transferred.

    || REFLOW >>
    The interface is modeled after the POSIX 'cp' command, but with one
    additional way to specify what to copy where: [CMD: --specs-from
    CMD][PY: `specs_from` PY] allows the caller to flexibly input
    source-destination path pairs.
    << REFLOW ||

    || REFLOW >>
    This command can copy files out of and into a hierarchy of nested
    datasets. Unlike with other DataLad command, the [CMD: --recursive
    CMD][PY: `recursive` PY] switch does not enable recursion into
    subdatasets, but is analogous to the POSIX 'cp' command switch and
    enables subdirectory recursion, regardless of dataset boundaries. It is
    not necessary to enable recursion in order to save changes made to
    nested target subdatasets.
    << REFLOW ||
    """
    _params_ = dict(
        dataset=Parameter(
            # not really needed on the cmdline, but for PY to resolve
            # relative paths
            args=("-d", "--dataset"),
            doc="""root dataset to save after copy operations are completed.
            All destination paths must be within this dataset, or its
            subdatasets. If no dataset is given, dataset modifications will
            be left unsaved.""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar='PATH',
            doc="""paths to copy (and possibly a target path to copy to).""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        recursive=Parameter(
            args=("--recursive", "-r",),
            action='store_true',
            doc="""copy directories recursively"""),
        target_dir=Parameter(
            args=('--target-dir', '-t'),
            metavar='DIRECTORY',
            doc="""copy all source files into this DIRECTORY. This value is
            overridden by any explicit destination path provided via [CMD:
            --specs-from CMD][PY: 'specs_from' PY]. When not given, this
            defaults to the path of the dataset specified via [CMD: --dataset
            CMD][PY: 'dataset' PY].""",
            constraints=EnsureStr() | EnsureNone()),
        specs_from=Parameter(
            args=('--specs-from',),
            metavar='SOURCE',
            doc="""read list of source (and destination) path names from a
            given file, or stdin (with '-'). Each line defines either a
            source path, or a source/destination path pair (separated by a
            null byte character).[PY:  Alternatively, a list of 2-tuples with
            source/destination pairs can be given. PY]"""),
        message=save_message_opt,
    )

    _examples_ = [
        dict(
            text="Copy a file into a dataset 'myds' using a path and a target "
                 "directory specification, and save its addition to 'myds'",
            code_py="""\
copy_file('path/to/myfile', dataset='path/to/myds')""",
            code_cmd="""\
datalad copy-file path/to/myfile -d path/to/myds"""),
        dict(
            text="Copy a file to a dataset 'myds' and save it under a new "
                 "name by providing two paths",
            code_py="""\
copy_file(path=['path/to/myfile', 'path/to/myds/newname'],
          dataset='path/to/myds')""",
            code_cmd="""\
datalad copy-file path/to/myfile path/to/myds/new -d path/to/myds"""),
        dict(
            text="Copy a file into a dataset without saving it",
            code_py="copy_file('path/to/myfile', target_dir='path/to/myds/')",
            code_cmd="datalad copy-file path/to/myfile -t path/to/myds"),
        dict(
            text="Copy a directory and its subdirectories into a dataset "
                 "'myds' and save the addition in 'myds'",
            code_py="""\
copy_file('path/to/dir/', recursive=True, dataset='path/to/myds')""",
            code_cmd="""\
datalad copy-file path/to/dir -r -d path/to/myds"""),
        dict(
            text="Copy files using a path and optionally target specification "
                 "from a file",
            code_py="""\
copy_file(dataset='path/to/myds', specs_from='path/to/specfile')""",
            code_cmd="""\
datalad copy-file -d path/to/myds --specs-from specfile"""),
        dict(
            text="Read a specification from stdin and pipe the output of a "
                 "find command into the copy-file command",
            code_cmd="""\
find <expr> | datalad copy-file -d path/to/myds --specs-from -"""),
    ]

    @staticmethod
    @datasetmethod(name='copy_file')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            target_dir=None,
            specs_from=None,
            message=None):
        # Concept
        #
        # Loosely model after the POSIX cp command
        #
        # 1. Determine the target of the copy operation, and its associated
        #    dataset
        #
        # 2. for each source: determine source dataset, query for metadata,
        #    put into target dataset
        #
        # Instead of sifting and sorting through input args, process them one
        # by one sequentially. Utilize lookup caching to make things faster,
        # instead of making the procedure itself more complicated.

        if path and specs_from:
            raise ValueError(
                "Path argument(s) AND a specs-from specified, "
                "this is not supported.")

        ds = None
        if dataset:
            ds = require_dataset(
                dataset, check_installed=True, purpose='copying into')

        if target_dir:
            target_dir = resolve_path(target_dir, dataset)

        if path:
            # turn into list of absolute paths
            paths = [resolve_path(p, dataset) for p in ensure_list(path)]

            # we already checked that there are no specs_from
            if not target_dir:
                if len(paths) == 1:
                    if not ds:
                        raise ValueError("No target directory was given.")
                    # we can keep target_dir unset and need not manipulate
                    # paths, this is all done in a generic fashion below
                elif len(paths) == 2:
                    # single source+dest combo
                    if paths[-1].is_dir():
                        # check if we need to set target_dir, in case dest
                        # is a dir
                        target_dir = paths.pop(-1)
                    else:
                        specs_from = [paths]
                else:
                    # POSIX cp semantics: last path is the destination dir
                    target_dir = paths.pop(-1)

            if not specs_from:
                # in all other cases we have a plain source list
                specs_from = paths

        if not specs_from:
            raise ValueError("Neither `paths` nor `specs_from` given.")

        if target_dir:
            if ".git" in target_dir.parts:
                raise ValueError(
                    "Target directory should not contain a .git directory: {}".format(
                        target_dir))
        elif ds:
            # no specific target set, but we have to write into a dataset,
            # and one was given. It seems to make sense to use this dataset
            # as a target. It is already the reference for any path
            # resolution. Any explicitly given destination will take
            # precedence over a general target_dir setting nevertheless.
            target_dir = ds.pathobj

        res_kwargs = dict(
            action='copy_file',
            logger=lgr,
        )

        # lookup cache for dir to repo mappings, and as a DB for cleaning
        # things up
        repo_cache = {}
        # which paths to pass on to save
        to_save = []

        try:
            for src_path, dest_path in _yield_specs(specs_from):
                src_path = Path(src_path)
                dest_path = None \
                    if dest_path is None \
                    else resolve_path(dest_path, dataset)
                lgr.debug('Processing copy specification: %s -> %s',
                          src_path, dest_path)

                # Some checks, first impossibility "wins"
                msg_impossible = None
                if not recursive and src_path.is_dir():
                    msg_impossible = 'recursion not enabled, omitting directory'
                elif (dest_path and dest_path.name == '.git') \
                        or src_path.name == '.git':
                    msg_impossible = \
                        "refuse to place '.git' into destination dataset"
                elif not (dest_path or target_dir):
                    msg_impossible = 'need destination path or target directory'

                if msg_impossible:
                    yield dict(path=str(src_path),
                               status='impossible',
                               message=msg_impossible,
                               **res_kwargs)
                    continue

                for src_file, dest_file in _yield_src_dest_filepaths(
                        src_path, dest_path, target_dir=target_dir):
                    if ds and ds.pathobj not in dest_file.parents:
                        # take time to compose proper error
                        dpath = str(target_dir if target_dir else dest_path)
                        yield dict(
                            path=dpath,
                            status='error',
                            message=('reference dataset does not contain '
                                     'destination path: %s',
                                     dpath),
                            **res_kwargs)
                        # only recursion could yield further results, which
                        # would all have the same issue, so call it over
                        # right here
                        break
                    for res in _copy_file(
                            src_file, dest_file, cache=repo_cache):
                        yield dict(res, **res_kwargs)
                        if res.get('status', None) == 'ok':
                            to_save.append(res['destination'])
        finally:
            # cleanup time
            # TODO this could also be the place to stop lingering batch
            # processes
            _cleanup_cache(repo_cache)

        if not (ds and to_save):
            # nothing left to do
            return

        yield from ds.save(
            path=to_save,
            # we provide an explicit file list
            recursive=False,
            message=message,
        )
class Rerun(Interface):
    """Re-execute previous `datalad run` commands.

    This will unlock any dataset content that is on record to have been
    modified by the command in the specified revision. It will then
    re-execute the command in the recorded path (if it was inside the
    dataset). Afterwards, all modifications will be saved.

    *Report mode*

    || REFLOW >>
    When called with [CMD: --report CMD][PY: report=True PY], this command
    reports information about what would be re-executed as a series of
    records. There will be a record for each revision in the specified
    revision range. Each of these will have one of the following
    "rerun_action" values:
    << REFLOW ||

    - run: the revision has a recorded command that would be re-executed
    - skip: the revision does not have a recorded command and would be
      skipped
    - pick: the revision does not have a recorded command and would be
      cherry picked

    The decision to skip rather than cherry pick a revision is based on
    whether the revision would be reachable from HEAD at the time of
    execution.

    In addition, when a starting point other than HEAD is specified, there
    is a rerun_action value "checkout", in which case the record includes
    information about the revision that would be checked out before
    rerunning any commands.

    Examples:

      Re-execute the command from the previous commit::

        % datalad rerun

      Re-execute any commands in the last five commits::

        % datalad rerun --since=HEAD~5

      Do the same as above, but re-execute the commands on top of
      HEAD~5 in a detached state::

        % datalad rerun --onto= --since=HEAD~5

      Re-execute all previous commands and compare the old and new
      results::

        % # on master branch
        % datalad rerun --branch=verify --since=
        % # now on verify branch
        % datalad diff --revision=master..
        % git log --oneline --left-right --cherry-pick master...

    .. note:: Currently the "onto" feature only sets the working tree of the
              current dataset to a previous state. The working trees of any
              subdatasets remain unchanged.
    """
    _params_ = dict(
        revision=Parameter(
            args=("revision", ),
            metavar="REVISION",
            nargs="?",
            doc="""rerun command(s) in `revision`. By default, the command from
            this commit will be executed, but [CMD: --since CMD][PY: `since`
            PY] can be used to construct a revision range.""",
            default="HEAD",
            constraints=EnsureStr()),
        since=Parameter(
            args=("--since", ),
            doc="""If `since` is a commit-ish, the commands from all commits
            that are reachable from `revision` but not `since` will be
            re-executed (in other words, the commands in :command:`git log
            SINCE..REVISION`). If SINCE is an empty string, it is set to the
            parent of the first commit that contains a recorded command (i.e.,
            all commands in :command:`git log REVISION` will be
            re-executed).""",
            constraints=EnsureStr() | EnsureNone()),
        branch=Parameter(
            metavar="NAME",
            args=(
                "-b",
                "--branch",
            ),
            doc=
            "create and checkout this branch before rerunning the commands.",
            constraints=EnsureStr() | EnsureNone()),
        onto=Parameter(
            metavar="base",
            args=("--onto", ),
            doc="""start point for rerunning the commands. If not specified,
            commands are executed at HEAD. This option can be used to specify
            an alternative start point, which will be checked out with the
            branch name specified by [CMD: --branch CMD][PY: `branch` PY] or
            in a detached state otherwise. As a special case, an empty value
            for this option means to use the commit specified by [CMD:
            --since CMD][PY: `since` PY].""",
            constraints=EnsureStr() | EnsureNone()),
        message=Parameter(
            args=(
                "-m",
                "--message",
            ),
            metavar="MESSAGE",
            doc="""use MESSAGE for the reran commit rather than the
            recorded commit message. In the case of a multi-commit
            rerun, all the reran commits will have this message.""",
            constraints=EnsureStr() | EnsureNone()),
        script=Parameter(
            args=("--script", ),
            metavar="FILE",
            doc="""extract the commands into [CMD: FILE CMD][PY: this file PY]
            rather than rerunning. Use - to write to stdout instead. [CMD:
            This option implies --report. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset from which to rerun a recorded
            command. If no dataset is given, an attempt is made to
            identify the dataset based on the current working
            directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        report=Parameter(
            args=("--report", ),
            action="store_true",
            doc="""Don't actually re-execute anything, just display what would
            be done. [CMD: Note: If you give this option, you most likely want
            to set --output-format to 'json' or 'json_pp'. CMD]"""),
    )

    @staticmethod
    @datasetmethod(name='rerun')
    @eval_results
    def __call__(revision="HEAD",
                 since=None,
                 dataset=None,
                 branch=None,
                 message=None,
                 onto=None,
                 script=None,
                 report=False):
        # Result-generator: yields status dicts; decorated with
        # eval_results so callers get rendered/filtered results.
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='rerunning a command')
        lgr.debug('rerunning command output underneath %s', ds)

        # Refuse to rerun on a dirty tree: change detection after the
        # command would otherwise pick up unrelated modifications.
        # Script extraction and report mode don't execute anything,
        # so they are exempt.
        if script is None and not report and ds.repo.dirty:
            yield get_status_dict('run',
                                  ds=ds,
                                  status='impossible',
                                  message=('unsaved modifications present, '
                                           'cannot detect changes by command'))
            return

        # An empty repo (no commits) cannot have a recorded command.
        if not ds.repo.get_hexsha():
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message='cannot rerun command, nothing recorded')
            return

        # A pre-existing branch of the requested name is an error,
        # not something to silently reuse.
        if branch and branch in ds.repo.get_branches():
            yield get_status_dict(
                "run",
                ds=ds,
                status="error",
                message="branch '{}' already exists".format(branch))
            return

        # Construct the revision range to walk.
        if not ds.repo.commit_exists(revision + "^"):
            # Only a single commit is reachable from `revision`. In
            # this case, --since has no effect on the range construction.
            revrange = revision
        elif since is None:
            # default: just the single specified revision
            revrange = "{rev}^..{rev}".format(rev=revision)
        elif since.strip() == "":
            # empty --since: everything reachable from `revision`
            revrange = revision
        else:
            revrange = "{}..{}".format(since, revision)

        # Merge commits cannot be replayed linearly; bail out early.
        # NOTE(review): this goes through the underlying GitPython repo
        # object (`ds.repo.repo.git`), a legacy access path.
        if ds.repo.repo.git.rev_list("--merges", revrange, "--"):
            yield get_status_dict(
                "run",
                ds=ds,
                status="error",
                message="cannot rerun history with merge commits")
            return

        # Turn the revision range into a stream of per-revision records
        # (rerun_action: run/skip/pick/checkout) ...
        results = _rerun_as_results(ds, revrange, since, branch, onto, message)

        # ... and dispatch them to the handler matching the requested mode.
        if script:
            handler = _get_script_handler(script, since, revision)
        elif report:
            handler = _report
        else:
            handler = _rerun

        for res in handler(ds, results):
            yield res
class Addurls(Interface):
    """Create and update a dataset from a list of URLs.

    *Format specification*

    Several arguments take format strings.  These are similar to normal Python
    format strings where the names from `URL-FILE` (column names for a CSV or
    properties for JSON) are available as placeholders.  If `URL-FILE` is a
    CSV file, a positional index can also be used (i.e., "{0}" for the first
    column).  Note that a placeholder cannot contain a ':' or '!'.

    In addition, the `FILENAME-FORMAT` arguments has a few special
    placeholders.

      - _repindex

        The constructed file names must be unique across all fields rows.  To
        avoid collisions, the special placeholder "_repindex" can be added to
        the formatter.  Its value will start at 0 and increment every time a
        file name repeats.

      - _url_hostname, _urlN, _url_basename*

        Various parts of the formatted URL are available.  Take
        "http://datalad.org/asciicast/seamless_nested_repos.sh" as an example.

        "datalad.org" is stored as "_url_hostname".  Components of the URL's
        path can be referenced as "_urlN".  "_url0" and "_url1" would map to
        "asciicast" and "seamless_nested_repos.sh", respectively.  The final
        part of the path is also available as "_url_basename".

        This name is broken down further.  "_url_basename_root" and
        "_url_basename_ext" provide access to the root name and extension.
        These values are similar to the result of os.path.splitext, but, in
        the case of multiple periods, the extension is identified using the
        same length heuristic that git-annex uses.  As a result, the extension
        of "file.tar.gz" would be ".tar.gz", not ".gz".  In addition, the
        fields "_url_basename_root_py" and "_url_basename_ext_py" provide
        access to the result of os.path.splitext.

      - _url_filename*

        These are similar to _url_basename* fields, but they are obtained with
        a server request.  This is useful if the file name is set in the
        Content-Disposition header.

    *Examples*

    Consider a file "avatars.csv" that contains::

        who,ext,link
        neurodebian,png,https://avatars3.githubusercontent.com/u/260793
        datalad,png,https://avatars1.githubusercontent.com/u/8927200

    To download each link into a file name composed of the 'who' and 'ext'
    fields, we could run::

      $ datalad addurls -d avatar_ds --fast avatars.csv '{link}' '{who}.{ext}'

    The `-d avatar_ds` is used to create a new dataset in "$PWD/avatar_ds".

    If we were already in a dataset and wanted to create a new subdataset in
    an "avatars" subdirectory, we could use "//" in the `FILENAME-FORMAT`
    argument::

      $ datalad addurls --fast avatars.csv '{link}' 'avatars//{who}.{ext}'

    .. note::

       For users familiar with 'git annex addurl': A large part of this
       plugin's functionality can be viewed as transforming data from
       `URL-FILE` into a "url filename" format that fed to 'git annex addurl
       --batch --with-files'.
    """
    # NOTE(review): these imports live at class scope in the original;
    # kept as-is since moving them to module level is a behavior change.
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureChoice, EnsureNone, EnsureStr
    from datalad.support.param import Parameter

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""Add the URLs to this dataset (or possibly subdatasets of
            this dataset).  An empty or non-existent directory is passed to
            create a new dataset.  New subdatasets can be specified with
            `FILENAME-FORMAT`.""",
            constraints=EnsureDataset() | EnsureNone()),
        urlfile=Parameter(
            args=("urlfile", ),
            metavar="URL-FILE",
            doc="""A file that contains URLs or information that can be used to
            construct URLs.  Depending on the value of --input-type, this
            should be a CSV file (with a header as the first row) or a JSON
            file (structured as a list of objects with string values)."""),
        urlformat=Parameter(
            args=("urlformat", ),
            metavar="URL-FORMAT",
            doc="""A format string that specifies the URL for each entry.  See
            the 'Format Specification' section above."""),
        filenameformat=Parameter(
            args=("filenameformat", ),
            metavar="FILENAME-FORMAT",
            doc="""Like `URL-FORMAT`, but this format string specifies the file
            to which the URL's content will be downloaded.  The file name may
            contain directories.  The separator "//" can be used to indicate
            that the left-side directory should be created as a new
            subdataset.  See the 'Format Specification' section above."""),
        input_type=Parameter(
            args=("-t", "--input-type"),
            metavar="TYPE",
            doc="""Whether `URL-FILE` should be considered a CSV file or a JSON
            file.  The default value, "ext", means to consider `URL-FILE` as a
            JSON file if it ends with ".json".  Otherwise, treat it as a CSV
            file.""",
            constraints=EnsureChoice("ext", "csv", "json")),
        exclude_autometa=Parameter(
            args=("-x", "--exclude_autometa"),
            metavar="REGEXP",
            doc="""By default, metadata field=value pairs are constructed with
            each column in `URL-FILE`, excluding any single column that is
            specified via `URL-FORMAT`.  This argument can be used to exclude
            columns that match a regular expression.  If set to '*' or an
            empty string, automatic metadata extraction is disabled
            completely.  This argument does not affect metadata set explicitly
            with --meta."""),
        meta=Parameter(
            args=(
                "-m",
                "--meta",
            ),
            metavar="FORMAT",
            action="append",
            doc="""A format string that specifies metadata.  It should be
            structured as "<field>=<value>".  As an example, "location={3}"
            would mean that the value for the "location" metadata field should
            be set the value of the fourth column.  This option can be given
            multiple times."""),
        message=Parameter(
            args=("--message", ),
            metavar="MESSAGE",
            doc="""Use this message when committing the URL additions.""",
            constraints=EnsureNone() | EnsureStr()),
        dry_run=Parameter(
            args=("-n", "--dry-run"),
            action="store_true",
            doc="""Report which URLs would be downloaded to which files and
            then exit."""),
        fast=Parameter(
            args=("--fast", ),
            action="store_true",
            doc="""If True, add the URLs, but don't download their content.
            Underneath, this passes the --fast flag to `git annex
            addurl`."""),
        ifexists=Parameter(
            args=("--ifexists", ),
            metavar="ACTION",
            doc="""What to do if a constructed file name already exists.  The
            default behavior is to proceed with the `git annex addurl`, which
            will fail if the file size has changed.  If set to 'overwrite',
            remove the old file before adding the new one.  If set to 'skip',
            do not add the new file.""",
            constraints=EnsureNone() | EnsureChoice("overwrite", "skip")),
        missing_value=Parameter(
            args=("--missing-value", ),
            metavar="VALUE",
            doc="""When an empty string is encountered, use this value
            instead.""",
            constraints=EnsureNone() | EnsureStr()),
        save=nosave_opt,
        version_urls=Parameter(
            args=("--version-urls", ),
            action="store_true",
            doc="""Try to add a version ID to the URL.  This currently only has
            an effect on URLs for AWS S3 buckets."""),
    )

    @staticmethod
    @datasetmethod(name='addurls')
    @eval_results
    def __call__(dataset,
                 urlfile,
                 urlformat,
                 filenameformat,
                 input_type="ext",
                 exclude_autometa=None,
                 meta=None,
                 message=None,
                 dry_run=False,
                 fast=False,
                 ifexists=None,
                 missing_value=None,
                 save=True,
                 version_urls=False):
        # Result-generator: yields status dicts for each URL/file processed.
        # Temporarily work around gh-2269.
        url_file = urlfile
        url_format, filename_format = urlformat, filenameformat

        from requests.exceptions import RequestException

        from datalad.distribution.dataset import Dataset, require_dataset
        from datalad.interface.results import get_status_dict
        from datalad.support.annexrepo import AnnexRepo

        lgr = logging.getLogger("datalad.plugin.addurls")

        # check_installed=False: an empty/non-existent target is allowed and
        # will be populated as a fresh dataset below
        dataset = require_dataset(dataset, check_installed=False)
        if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
            # addurl requires git-annex; a plain git repo cannot take URLs
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message="not an annex repo")
            return

        if input_type == "ext":
            # infer CSV vs JSON from the file extension
            extension = os.path.splitext(url_file)[1]
            input_type = "json" if extension == ".json" else "csv"

        with open(url_file) as fd:
            try:
                rows, subpaths = extract(fd, input_type, url_format,
                                         filename_format, exclude_autometa,
                                         meta, dry_run, missing_value)
            except (ValueError, RequestException) as exc:
                yield get_status_dict(action="addurls",
                                      ds=dataset,
                                      status="error",
                                      message=exc_str(exc))
                return

        # every constructed file name must be unique, otherwise downloads
        # would clobber each other
        if len(rows) != len(set(row["filename"] for row in rows)):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=("There are file name collisions; "
                                           "consider using {_repindex}"))
            return

        if dry_run:
            # only report what would happen; no datasets or downloads
            for subpath in subpaths:
                lgr.info("Would create a subdataset at %s", subpath)
            for row in rows:
                lgr.info("Would download %s to %s", row["url"],
                         os.path.join(dataset.path, row["filename"]))
                lgr.info(
                    "Metadata: %s",
                    sorted(u"{}={}".format(k, v)
                           for k, v in row["meta_args"].items()))
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="ok",
                                  message="dry-run finished")
            return

        if not dataset.repo:
            # Populate a new dataset with the URLs.
            for r in dataset.rev_create(result_xfm=None,
                                        return_type='generator'):
                yield r

        annex_options = ["--fast"] if fast else []

        # create any subdatasets requested via the "//" separator
        for spath in subpaths:
            if os.path.exists(os.path.join(dataset.path, spath)):
                lgr.warning("Not creating subdataset at existing path: %s",
                            spath)
            else:
                for r in dataset.rev_create(spath,
                                            result_xfm=None,
                                            return_type='generator'):
                    yield r

        for row in rows:
            # Add additional information that we'll need for various
            # operations.
            filename_abs = os.path.join(dataset.path, row["filename"])
            if row["subpath"]:
                # the file belongs to a subdataset; record paths relative
                # to that subdataset
                ds_current = Dataset(
                    os.path.join(dataset.path, row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = dataset
                ds_filename = row["filename"]
            row.update({
                "filename_abs": filename_abs,
                "ds": ds_current,
                "ds_filename": ds_filename
            })

        if version_urls:
            # rewrite URLs in place with versioned variants (S3 only)
            num_urls = len(rows)
            log_progress(lgr.info,
                         "addurls_versionurls",
                         "Versioning %d URLs",
                         num_urls,
                         label="Versioning URLs",
                         total=num_urls,
                         unit=" URLs")
            for row in rows:
                url = row["url"]
                try:
                    row["url"] = get_versioned_url(url)
                except (ValueError, NotImplementedError) as exc:
                    # We don't expect this to happen because get_versioned_url
                    # should return the original URL if it isn't an S3 bucket.
                    # It only raises exceptions if it doesn't know how to
                    # handle the scheme for what looks like an S3 bucket.
                    lgr.warning("error getting version of %s: %s", row["url"],
                                exc_str(exc))
                log_progress(lgr.info,
                             "addurls_versionurls",
                             "Versioned result for %s: %s",
                             url,
                             row["url"],
                             update=1,
                             increment=True)
            log_progress(lgr.info, "addurls_versionurls",
                         "Finished versioning URLs")

        # hand the rows to `git annex addurl`; collect successful additions
        files_to_add = set()
        for r in add_urls(rows, ifexists=ifexists, options=annex_options):
            if r["status"] == "ok":
                files_to_add.add(r["path"])
            yield r

        msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

        if files_to_add:
            for r in dataset.add(files_to_add, save=False):
                yield r

            # attach per-row metadata only for files that were actually added
            meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
            for r in add_meta(meta_rows):
                yield r

            # Save here rather than the add call above to trigger a metadata
            # commit on the git-annex branch.
            if save:
                for r in dataset.save(message=msg, recursive=True):
                    yield r
class Metadata(Interface):
    """Metadata manipulation for files and whole datasets

    Two types of metadata are supported:

    1. metadata describing a dataset as a whole (dataset-global), and

    2. metadata for individual files in a dataset.

    Both types can be accessed and modified with this command.
    Note, however, that this only refers to Datalad's native metadata,
    and not to any other metadata that is possibly stored in files of a
    dataset.

    Datalad's native metadata capability is primarily targeting data
    description via arbitrary tags and other (brief) key-value attributes
    (with possibly multiple values for a single key).

    Metadata key names are limited to alphanumerics (and [_-.]). Moreover,
    all key names are converted to lower case.

    *Dataset (global) metadata*

    Metadata describing a dataset as a whole is stored in JSON format
    in the dataset at .datalad/metadata/dataset.json. The amount of
    metadata that can be stored is not limited by Datalad. However,
    it should be kept brief as this information is stored in the Git
    history of the dataset, and access or modification requires to
    read the entire file.

    Arbitrary metadata keys can be used. However, Datalad reserves the
    keys 'tag' and 'definition' for its own use. The can still be
    manipulated without any restrictions like any other metadata items,
    but doing so can impact Datalad's metadata-related functionality,
    handle with care.

    The 'tag' key is used to store a list of (unique) tags.

    The 'definition' key is used to store key-value mappings that define
    metadata keys used elsewhere in the metadata. Using the feature is
    optional (see --define-key). It can be useful in the context of
    data discovery needs, where metadata keys can be precisely defined
    by linking them to specific ontology terms.

    *File metadata*

    Metadata storage for individual files is provided by git-annex, and
    generally the same rules as for dataset-global metadata apply.
    However, there is just one reserved key name: 'tag'.

    Again, the amount of metadata is not limited, but metadata is stored
    in git-annex' internal data structures in the Git repository of a
    dataset. Large amounts of metadata can slow its performance.

    || CMDLINE >>
    *Output rendering*

    By default, a short summary of the metadata for each dataset
    (component) is rendered::

      <path> (<type>): -|<keys> [<tags>]

    where <path> is the path of the respective component, <type> a label
    for the type of dataset components metadata is presented for.
    Non-existant metadata is indicated by a dash, otherwise a
    comma-separated list of metadata keys (except for 'tag'), is followed
    by a list of tags, if there are any.

    << CMDLINE ||
    """
    # make the custom renderer the default, path reporting isn't the top
    # priority here
    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="path(s) to set/get metadata",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        add=Parameter(args=(
            '-a',
            '--add',
        ),
                      nargs='+',
                      action='append',
                      metavar=('KEY', 'VAL'),
                      doc="""metadata items to add. If only a key is given, a
            corresponding tag is added. If a key-value mapping (multiple
            values at once are supported) is given, the values are
            added to the metadata item of that key.""",
                      constraints=EnsureStr() | EnsureNone()),
        init=Parameter(
            args=(
                '-i',
                '--init',
            ),
            nargs='+',
            action='append',
            metavar=('KEY', 'VAL'),
            doc="""like --add, but tags are only added if no tag was present
            before. Likewise, values are only added to a metadata key, if that
            key did not exist before.""",
            constraints=EnsureStr() | EnsureNone()),
        remove=Parameter(
            args=('--remove', ),
            nargs='+',
            action='append',
            metavar=('KEY', 'VAL'),
            doc="""metadata values to remove. If only a key is given, a
            corresponding tag is removed. If a key-value mapping (multiple
            values at once are supported) is given, only those values are
            removed from the metadata item of that key. If no values are left
            after the removal, the entire item of that key is removed.""",
            constraints=EnsureStr() | EnsureNone()),
        reset=Parameter(
            args=('--reset', ),
            nargs='+',
            action='append',
            metavar=('KEY', 'VAL'),
            doc="""metadata items to remove. If only a key is given, a
            corresponding metadata key with all its values is removed.
            If a key-value mapping (multiple values at once are supported)
            is given, any existing values for this key are replaced by the
            given ones.""",
            constraints=EnsureStr() | EnsureNone()),
        define_key=Parameter(
            args=('--define-key', ),
            nargs=2,
            action='append',
            metavar=('KEY', 'DEFINITION'),
            doc="""convenience option to add an item in the dataset's
            global metadata ('definition' key). This can be used to
            define (custom) keys used in the datasets's metadata, for
            example by providing a URL to an ontology term for a given
            key label. This option does not need --dataset-global to
            be set to be in effect.""",
            constraints=EnsureStr() | EnsureNone()),
        dataset_global=Parameter(
            args=('-g', '--dataset-global'),
            action='store_true',
            doc="""Whether to perform metadata query or modification
            on the global dataset metadata, or on individual dataset
            components. For example, without this switch setting metadata
            using the root path of a dataset, will set the given metadata
            for all files in a dataset, whereas with this flag only the
            metadata record of the dataset itself will be altered."""),
        recursive=recursion_flag,
        recursion_limit=recursion_limit)

    @staticmethod
    @datasetmethod(name='metadata')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 add=None,
                 init=None,
                 remove=None,
                 reset=None,
                 define_key=None,
                 dataset_global=False,
                 recursive=False,
                 recursion_limit=None):
        # Result-generator: yields one status dict per affected dataset/file.
        # bring metadataset setter args in shape first: each argspec is
        # split into plain tags (bare keys) and key->values mappings
        untag, remove = _parse_argspec(remove)
        purge, reset = _parse_argspec(reset)
        tag_add, add = _parse_argspec(add)
        tag_init, init = _parse_argspec(init)
        define_key = dict(define_key) if define_key else None
        # merge all potential sources of tag specifications
        all_untag = remove.get('tag', []) + untag
        if all_untag:
            remove['tag'] = all_untag
        all_addtag = add.get('tag', []) + tag_add
        if all_addtag:
            add['tag'] = all_addtag
        all_inittag = init.get('tag', []) + tag_init
        if all_inittag:
            init['tag'] = all_inittag

        lgr.debug("Will 'init' metadata items: %s", init)
        lgr.debug("Will 'add' metadata items: %s", add)
        lgr.debug("Will 'remove' metadata items: %s", remove)
        lgr.debug("Will 'reset' metadata items: %s", reset)
        lgr.debug("Will 'purge' metadata items: %s", purge)

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr, refds=refds_path)

        # annotate all requested paths, sorting them into datasets
        to_process = []
        for ap in AnnotatePaths.__call__(dataset=refds_path,
                                         path=path,
                                         recursive=recursive,
                                         recursion_limit=recursion_limit,
                                         action='metadata',
                                         unavailable_path_status='error',
                                         nondataset_path_status='error',
                                         force_subds_discovery=False,
                                         return_type='generator',
                                         on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset':
                if ap.get('state', None) == 'absent':
                    # just discovered via recursion, but not relevant here
                    continue
                if GitRepo.is_valid_repo(ap['path']):
                    ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        # iterate over all datasets, order doesn't matter
        to_save = []
        for ds_path in content_by_ds:
            # ignore submodule entries
            content = [
                ap for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds_path
            ]
            if not content:
                # nothing other than subdatasets were given or discovered in
                # this dataset, ignore
                continue
            ds = Dataset(ds_path)
            if dataset_global or define_key:
                # manipulate the dataset-global JSON metadata record
                db_path = opj(ds.path, '.datalad', 'metadata', 'dataset.json')
                db = {}
                if exists(db_path):
                    db_fp = open(db_path)
                    # need to read manually, load() would puke on an empty file
                    db_content = db_fp.read()
                    # minimize time for collision
                    db_fp.close()
                    if db_content:
                        db = json.loads(db_content)
                # TODO make manipulation order identical to what git-annex does
                # init: set only keys that are not yet present
                for k, v in init.items() if init else []:
                    if k not in db:
                        db[k] = v
                # purge: drop entire keys
                for k in purge:
                    if k in db:
                        del db[k]
                # reset: unconditionally replace values
                for k, v in reset.items():
                    db[k] = v
                # add: extend value lists, keeping them unique and sorted
                for k, v in add.items():
                    db[k] = sorted(unique(db.get(k, []) + v))
                # remove: drop individual values; drop the key if emptied
                for k, v in remove.items():
                    existing_data = db.get(k, [])
                    if isinstance(existing_data, dict):
                        db[k] = {
                            dk: existing_data[dk]
                            for dk in set(existing_data).difference(v)
                        }
                    else:
                        db[k] = list(set(existing_data).difference(v))
                    # wipe out if empty
                    if not db[k]:
                        del db[k]
                added_def = False
                if define_key:
                    defs = db.get('definition', {})
                    for k, v in define_key.items():
                        if k in defs:
                            # a key may only ever have one definition;
                            # conflicting redefinition is an error
                            if not defs[k] == v:
                                yield get_status_dict(
                                    status='error',
                                    ds=ds,
                                    message=
                                    ("conflicting definition for key '%s': "
                                     "'%s' != '%s'", k, v, defs[k]),
                                    **res_kwargs)
                                continue
                        else:
                            defs[k] = v
                            added_def = True
                    db['definition'] = defs
                # store, if there is anything
                if db:
                    if not exists(dirname(db_path)):
                        makedirs(dirname(db_path))
                    db_fp = open(db_path, 'w')
                    # produce relatively compact, but also diff-friendly format
                    json.dump(db,
                              db_fp,
                              indent=0,
                              separators=(',', ':\n'),
                              sort_keys=True)
                    # minimize time for collision
                    db_fp.close()
                    # use add not save to also cover case of a fresh file
                    ds.add(db_path, save=False)
                    to_save.append(
                        dict(path=db_path, parentds=ds.path, type='file'))
                elif exists(db_path):
                    # no metadata left, kill file
                    ds.remove(db_path)
                    to_save.append(dict(path=ds.path, type='dataset'))
                if added_def or init or add or remove or reset or purge:
                    # if anything happended or could have happended
                    yield get_status_dict(status='ok',
                                          ds=ds,
                                          metadata=db,
                                          **res_kwargs)
            elif not isinstance(ds.repo, AnnexRepo):
                # report on all explicitly requested paths only
                # FIX: the comprehension previously filtered on the stale
                # outer loop variable `ap` (leftover from the AnnotatePaths
                # loop above) instead of the comprehension variable `c`,
                # so it reported either all paths or none rather than only
                # the explicitly requested ones
                for ap in [c for c in content if c.get('raw_input', False)]:
                    yield dict(
                        ap,
                        status='impossible',
                        message=(
                            'non-annex dataset %s has no file metadata support',
                            ds),
                        **res_kwargs)
                continue
            ds_paths = [p['path'] for p in content]
            if not dataset_global:
                if reset or purge or add or init or remove:
                    # file metadata manipulation
                    mod_paths = []
                    for mp in ds.repo.set_metadata(
                            ds_paths,
                            reset=reset,
                            add=add,
                            init=init,
                            remove=remove,
                            purge=purge,
                            # we always go recursive
                            # TODO is that a good thing? But how to otherwise distinuish
                            # this kind of recursive from the one across datasets in
                            # the API?
                            recursive=True):
                        if mp.get('success', False):
                            mod_paths.append(mp['file'])
                        else:
                            # NOTE(review): `mp[0]` looks like a leftover from
                            # a tuple-based result format; the success branch
                            # uses mp['file'] -- confirm against
                            # AnnexRepo.set_metadata's result shape
                            yield get_status_dict(
                                status='error',
                                message='setting metadata failed',
                                path=opj(ds.path, mp[0]),
                                type='file',
                                **res_kwargs)
                    # query the actually modified paths only
                    ds_paths = mod_paths
                # and lastly, query -- even if we set before -- there could
                # be side-effect from multiple set paths on an individual
                # path, hence we need to query to get the final result
                for file, meta in ds.repo.get_metadata(ds_paths):
                    r = get_status_dict(status='ok',
                                        path=opj(ds.path, file),
                                        type='file',
                                        metadata=meta,
                                        **res_kwargs)
                    yield r
        # save potential modifications to dataset global metadata
        if not to_save:
            return
        for res in Save.__call__(files=to_save,
                                 dataset=refds_path,
                                 message='[DATALAD] dataset metadata update',
                                 return_type='generator',
                                 result_xfm=None,
                                 result_filter=None,
                                 on_failure='ignore'):
            yield res

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        """Render one 'metadata' result as `<path> (<type>): keys [tags]`."""
        from datalad.ui import ui
        if res['status'] != 'ok' or not res.get('action', None) == 'metadata':
            # logging complained about this already
            return
        # list the path, available metadata keys, and tags
        path = relpath(res['path'], res['refds']) if res.get(
            'refds', None) else res['path']
        meta = res.get('metadata', {})
        ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
            path=path,
            type=' ({})'.format(res['type']) if 'type' in res else '',
            spacer=' ' if len([m for m in meta if m != 'tag']) else '',
            meta=','.join(k for k in sorted(meta.keys()) if not k == 'tag')
            if meta else ' -',
            tags='' if 'tag' not in meta else ' [{}]'.format(','.join(
                meta['tag']))))
class AnnotatePaths(Interface):
    """Analyze and act upon input paths

    Given paths (or more generally location requests) are inspected and
    annotated with a number of properties. A list of recognized properties is
    provided below.

    || PYTHON >>Input `paths` for this command can either be un-annotated
    (raw) path strings, or already (partially) annotated paths. In the latter
    case, further annotation is limited to yet-unknown properties, and is
    potentially faster than initial annotation.<< PYTHON ||

    *Recognized path properties*

    {proplist}

    In the case of enabled modification detection the results may contain
    additional properties regarding the nature of the modification. See the
    documentation of the `diff` command for details.
    """
    # Expand the class docstring placeholder with a formatted listing of all
    # known path properties (one entry per property in `known_props`).
    _docs_ = dict(
        proplist='\n\n '.join(
            '"{}"\n{}'.format(
                k,
                textwrap.fill(known_props[k],
                              initial_indent=' ',
                              subsequent_indent=' '))
            for k in sorted(known_props)))

    _params_ = dict(
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path to be annotated""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""an optional reference/base dataset for the paths""",
            constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        action=Parameter(
            args=("--action",),
            metavar="LABEL",
            doc="""an "action" property value to include in the
            path annotation""",
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_status=Parameter(
            args=("--unavailable-path-status",),
            metavar="LABEL",
            doc="""a "status" property value to include in the
            annotation for paths that are underneath a dataset, but
            do not exist on the filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_msg=Parameter(
            args=("--unavailable-path-msg",),
            metavar="message",
            doc="""a "message" property value to include in the
            annotation for paths that are underneath a dataset, but
            do not exist on the filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        nondataset_path_status=Parameter(
            args=("--nondataset-path-status",),
            metavar="LABEL",
            doc="""a "status" property value to include in the
            annotation for paths that are not underneath any dataset""",
            constraints=EnsureStr() | EnsureNone()),
        force_parentds_discovery=Parameter(
            args=("--no-parentds-discovery",),
            dest='force_parentds_discovery',
            action='store_false',
            doc="""Flag to disable reports of parent dataset information for
            any path, in particular dataset root paths. Disabling saves on
            command run time, if this information is not needed."""),
        force_subds_discovery=Parameter(
            args=("--no-subds-discovery",),
            action='store_false',
            dest='force_subds_discovery',
            doc="""Flag to disable reporting type='dataset' for subdatasets,
            even when they are not installed, or their mount point directory
            doesn't exist. Disabling saves on command run time, if this
            information is not needed."""),
        force_untracked_discovery=Parameter(
            args=("--no-untracked-discovery",),
            action='store_false',
            dest='force_untracked_discovery',
            doc="""Flag to disable discovery of untracked changes.
            Disabling saves on command run time, if this information is
            not needed."""),
        force_no_revision_change_discovery=Parameter(
            args=("--revision-change-discovery",),
            action='store_false',
            dest='force_no_revision_change_discovery',
            doc="""Flag to disable discovery of changes which were not yet
            committed. Disabling saves on command run time, if this
            information is not needed."""),
        modified=Parameter(
            args=("--modified",),
            nargs='?',
            const=True,
            constraints=EnsureStr() | EnsureBool() | EnsureNone(),
            doc="""comparison reference specification for modification
            detection. This can be (mostly) anything that `git diff`
            understands (commit, treeish, tag, etc). See the documentation
            of `datalad diff --revision` for details. Unmodified paths
            will not be annotated. If a requested path was not modified
            but some content underneath it was, then the request is
            replaced by the modified paths and those are annotated instead.
            This option can be used [PY: with `True` as PY][CMD: without CMD]
            an argument to test against changes that have been made, but
            have not yet been staged for a commit."""))

    @staticmethod
    @datasetmethod(name='annotate_paths')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        """Yield one annotated-path result dict per (discovered) path.

        Each yielded dict carries at least 'path' and the common result
        properties (action/refds/logger); additional properties such as
        'type', 'state', 'parentds', 'registered_subds' are added as they
        are discovered. NOTE(review): results are yielded incrementally
        while the input is still being processed -- callers must not
        assume a precomputed, deduplicated stream beyond what
        `reported_paths` guarantees below.
        """
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset (non-given or found)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds,
                        refds_path,
                        action,
                        recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(
                        **dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be melted in with treatment of non dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = []
            if requested_paths:
                # NOTE: list comprehension used purely for its side effect
                [preserved_paths.append(r)
                 for r in requested_paths
                 if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen, whenever `path` is a subdataset, that was
                # discovered via recursive processing of another path before
                continue

            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path if not islink(path) else normpath(opj(path, pardir))
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                    # NOTE the `and refds_path` is critical, as it will determine
                    # whether a top-level dataset that was discovered gets the
                    # parent property or not, it won't get it without a common
                    # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and 'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                # if the path doesn't exist, or is labeled a directory, or a dataset even
                # a dataset (without this info) -> record whether this is a known subdataset
                # to its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r

            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
class Save(Interface):
    """Save the current state of a dataset

    Saving the state of a dataset records changes that have been made to it.
    This change record is annotated with a user-provided description.
    Optionally, an additional tag, such as a version, can be assigned to the
    saved state. Such tag enables straightforward retrieval of past versions at
    a later point in time.

    Examples:

      Save any content underneath the current directory, without altering
      any potential subdataset (use --recursive for that)::

        % datalad save .

      Save any modification of known dataset content, but leave untracked
      files (e.g. temporary files) untouched::

        % datalad save -u -d <path_to_dataset>

      Tag the most recent saved state of a dataset::

        % datalad save -d <path_to_dataset> --version-tag bestyet

    .. note::
      For performance reasons, any Git repository without an initial commit
      located inside a Dataset is ignored, and content underneath it will be
      saved to the respective superdataset. DataLad datasets always have an
      initial commit, hence are not affected by this behavior.
    """
    # note above documents that out behavior is like that of `git add`, but
    # does not explicitly mention the connection to keep it simple.

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            # fixed: was `""""specify ...` which leaked a stray leading
            # double-quote into the rendered help text
            doc="""specify the dataset to save""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar='PATH',
            doc="""path/name of the dataset component to save. If given, only
            changes made to those components are recorded in the new state.""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        message=save_message_opt,
        message_file=Parameter(
            args=("-F", "--message-file"),
            doc="""take the commit message from this file. This flag is
            mutually exclusive with -m.""",
            constraints=EnsureStr() | EnsureNone()),
        version_tag=Parameter(
            args=("-t", "--version-tag",),
            metavar='ID',
            doc="""an additional marker for that state. Every dataset that
            is touched will receive the tag.""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        updated=Parameter(
            args=('-u', '--updated',),
            action='store_true',
            doc="""if given, only saves previously tracked paths."""),
        to_git=Parameter(
            args=("--to-git",),
            action='store_true',
            doc="""flag whether to add data directly to Git, instead of
            tracking data identity only. Usually this is not desired,
            as it inflates dataset sizes and impacts flexibility of data
            transport. If not specified - it will be up to git-annex to
            decide, possibly on .gitattributes options. Use this flag
            with a simultaneous selection of paths to save. In general,
            it is better to pre-configure a dataset to track particular paths,
            file types, or file sizes with either Git or git-annex.
            See https://git-annex.branchable.com/tips/largefiles/"""),
    )

    @staticmethod
    @datasetmethod(name='save')
    @eval_results
    def __call__(path=None, message=None, dataset=None,
                 version_tag=None,
                 recursive=False, recursion_limit=None, updated=False,
                 message_file=None,
                 to_git=None,
                 ):
        """Yield one result dict per saved path, plus one 'save' result per
        touched dataset (status 'ok' if a new commit was made, 'notneeded'
        otherwise).

        Raises
        ------
        ValueError
          If both `message` and `message_file` are given.
        """
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        path = assure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   that a switch from --untracked=no to --untracked=all
        #   in Repo.save()

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from the either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='saving')

        # use status() to do all discovery and annotation of paths
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as it,
                # and not a dataset instance in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                recursive=recursive,
                recursion_limit=recursion_limit,
                on_failure='ignore',
                result_renderer='disabled'):
            if s['status'] == 'error':
                # Downstream code can't do anything with these. Let the caller
                # decide their fate.
                yield s
                continue

            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in iteritems(s)
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in iteritems(dataset_hierarchies):
            edges = {}
            discover_dataset_trace_to_targets(
                rootds, children, [], edges, includeds=children)
            for superds, subdss in iteritems(edges):
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    # TODO actually start from an entry that may already
                    # exist in the status record
                    superds_status[ut.Path(subds)] = dict(
                        # shot from the hip, some status config
                        # to trigger this specific super/sub
                        # relation to be saved
                        state='untracked',
                        type='dataset')
                paths_by_ds[superds] = superds_status

        # TODO parallelize, whenever we have multiple subdataset of a single
        # dataset they can all be processed simultaneously
        # sort list of dataset to handle, starting with the ones deep down
        for pdspath in sorted(paths_by_ds, reverse=True):
            pds = Dataset(pdspath)
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location, this avoid
                # cumbersome symlink handling without context in the
                # lower levels
                pds.repo.pathobj / p.relative_to(pdspath): props
                for p, props in iteritems(paths_by_ds.pop(pdspath))
            }
            start_commit = pds.repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()):
                for res in pds.repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        # NOTE(review): this inspects `ds.repo` (the reference
                        # dataset), not `pds.repo` (the dataset being saved) --
                        # behavior preserved, but verify this is intended when
                        # super- and sub-dataset differ in annex support
                        git=True if not hasattr(ds.repo, 'annexstatus')
                        else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = text_type(
                                # recode path back to dataset path anchor
                                pds.pathobj / res[k].relative_to(
                                    pds.repo.pathobj)
                            )
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                status='ok'
                if start_commit != pds.repo.get_hexsha()
                else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                continue
            try:
                pds.repo.tag(version_tag)
                dsres.update(
                    status='ok',
                    version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(
                    status='error',
                    message=('cannot tag this version: %s', e.stderr.strip()))
                yield dsres
class Update(Interface):
    """Update a dataset from a sibling."""

    _params_ = dict(
        name=Parameter(
            args=("name",),
            doc="""name of the sibling to update from""",
            nargs="?",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            # fixed: was `""""specify ...` which leaked a stray leading
            # double-quote into the rendered help text
            doc="""specify the dataset to update. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        merge=Parameter(
            args=("--merge",),
            action="store_true",
            doc="Merge changes from sibling `name` or the remote branch, "
                "configured to be the tracking branch if no sibling was "
                "given.",),
        # TODO: How to document it without using the term 'tracking branch'?
        recursive=Parameter(
            args=("-r", "--recursive"),
            action="store_true",
            doc="""If set this updates all possibly existing subdatasets,
             too."""),
        fetch_all=Parameter(
            args=("--fetch-all",),
            action="store_true",
            doc="Fetch updates from all siblings.",),
        reobtain_data=Parameter(
            args=("--reobtain-data",),
            action="store_true",
            doc="TODO"),)

    @staticmethod
    @datasetmethod(name='update')
    def __call__(name=None, dataset=None,
                 merge=False, recursive=False, fetch_all=False,
                 reobtain_data=False):
        """Fetch (and optionally merge) updates from a dataset sibling.

        Resolves the dataset from `dataset` or the current working
        directory, then fetches from the named sibling (or all siblings
        with `fetch_all`) for the dataset and, with `recursive`, all its
        subdatasets; with `merge` a `git pull` (and `git annex merge` for
        annex repos) is performed afterwards.

        Raises
        ------
        NotImplementedError
          If `reobtain_data` is requested, or a merge is requested while
          multiple remotes exist and no sibling name was given.
        ValueError
          If no (installed) dataset can be determined.
        """
        # TODO: Is there an 'update filehandle' similar to install and publish?
        # What does it mean?

        if reobtain_data:
            # TODO: properly define, what to do
            raise NotImplementedError("TODO: Option '--reobtain-data' not "
                                      "implemented yet.")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the current working directory of the process:
        if ds is None:
            # try to find a dataset at or above PWD:
            dspath = GitRepo.get_toppath(getpwd())
            if dspath is None:
                raise ValueError("No dataset found at %s." % getpwd())
            ds = Dataset(dspath)
        assert ds is not None

        if not ds.is_installed():
            raise ValueError("No installed dataset found at "
                             "{0}.".format(ds.path))
        assert ds.repo is not None

        repos_to_update = [ds.repo]
        if recursive:
            repos_to_update += [GitRepo(opj(ds.path, sub_path))
                                for sub_path in
                                ds.get_dataset_handles(recursive=True)]

        for repo in repos_to_update:
            # get all remotes:
            remotes = repo.git_get_remotes()
            if name and name not in remotes:
                # lazy logger args instead of eager %-formatting
                lgr.warning("'%s' not known to dataset %s.\nSkipping",
                            name, repo.path)
                continue

            # Currently '--merge' works for single remote only:
            # TODO: - condition still incomplete
            #       - We can merge if a remote was given or there is a
            #         tracking branch
            #       - we also can fetch all remotes independently on whether or
            #         not we merge a certain remote
            if not name and len(remotes) > 1 and merge:
                lgr.debug("Found multiple remotes:\n%s", remotes)
                raise NotImplementedError("No merge strategy for multiple "
                                          "remotes implemented yet.")
            lgr.info("Updating handle '%s' ...", repo.path)

            # fetch remote(s):
            repo.git_fetch(name if name else '',
                           "--all" if fetch_all else '')

            # if it is an annex and there is a tracking branch, and we didn't
            # fetch the entire remote anyway, explicitly fetch git-annex
            # branch:
            # TODO: Is this logic correct? Shouldn't we fetch git-annex from
            # `name` if there is any (or if there is no tracking branch but we
            # have a `name`?
            if knows_annex(repo.path) and not fetch_all:
                # check for tracking branch's remote:
                try:
                    std_out, std_err = \
                        repo._git_custom_command(
                            '',
                            ["git", "config", "--get",
                             "branch.{active_branch}.remote".format(
                                 active_branch=repo.git_get_active_branch())])
                except CommandError as e:
                    # exit code 1 with empty stdout: config key simply absent
                    if e.code == 1 and e.stdout == "":
                        std_out = None
                    else:
                        raise
                if std_out:  # we have a "tracking remote"
                    repo.git_fetch("%s git-annex" % std_out.strip())

            # merge:
            if merge:
                lgr.info("Applying changes from tracking branch...")
                cmd_list = ["git", "pull"]
                if name:
                    cmd_list.append(name)
                    # branch needed, if not default remote
                    # => TODO: use default remote/tracking branch to compare
                    #          (see above, where git-annex is fetched)
                    # => TODO: allow for passing a branch
                    #          (or more general refspec?)
                    #          For now, just use the same name
                    cmd_list.append(repo.git_get_active_branch())

                out, err = repo._git_custom_command('', cmd_list)
                lgr.info(out)
                if knows_annex(repo.path):
                    # annex-apply:
                    lgr.info("Updating annex ...")
                    out, err = repo._git_custom_command(
                        '', ["git", "annex", "merge"])
                    lgr.info(out)
class Remove(Interface):
    """Remove components from datasets

    Removing "unlinks" a dataset component, such as a file or subdataset, from
    a dataset. Such a removal advances the state of a dataset, just like adding
    new content. A remove operation can be undone, by restoring a previous
    dataset state, but might require re-obtaining file content and subdatasets
    from remote locations.

    This command relies on the 'drop' command for safe operation. By default,
    only file content from datasets which will be uninstalled as part of
    a removal will be dropped. Otherwise file content is retained, such that
    restoring a previous version also immediately restores file content access,
    just as it is the case for files directly committed to Git. This default
    behavior can be changed to always drop content prior removal, for cases
    where a minimal storage footprint for local datasets installations is
    desirable.

    Removing a dataset component is always a recursive operation. Removing a
    directory, removes all content underneath the directory too. If
    subdatasets are located under a to-be-removed path, they will be
    uninstalled entirely, and all their content dropped. If any subdataset can
    not be uninstalled safely, the remove operation will fail and halt.

    .. versionchanged:: 0.16
       More in-depth and comprehensive safety-checks are now performed by
       default.
       The ``if_dirty||--if-dirty`` argument is ignored, will be removed in
       a future release, and can be removed for a safe-by-default behavior. For
       other cases consider the ``reckless||--reckless`` argument.
       The ``save||--save`` argument is ignored and will be removed in a future
       release, a dataset modification is now always saved. Consider save's
       ``amend||--amend`` argument for post-remove fix-ups.
       The ``recursive||--recursive`` argument is ignored, and will be removed
       in a future release. Removal operations are always recursive, and the
       parameter can be stripped from calls for a safe-by-default behavior.

    .. deprecated:: 0.16
       The ``check||--check`` argument will be removed in a future release.
       It needs to be replaced with ``reckless||--reckless``.
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar="DATASET",
            doc="""specify the dataset to perform remove from.
            If no dataset is given, the current working directory is used
            as operation context""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="path of a dataset or dataset component to be removed",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        drop=Parameter(
            args=("--drop",),
            doc="""which dataset components to drop prior removal. This
            parameter is passed on to the underlying drop operation as
            its 'what' argument.""",
            # we must not offer a 'nothing' which would bypass
            # the `drop()` call. The implementation completely
            # relies on `drop()` for all safety measures.
            # instead `drop(reckless=kill)` must be used to fast-kill
            # things
            constraints=EnsureChoice('datasets', 'all')),
        jobs=jobs_opt,
        message=save_message_opt,
        # XXX deprecate!
        save=Parameter(
            args=("--nosave",),
            dest='save',
            action="store_false",
            doc="""DEPRECATED and IGNORED; use `save --amend` instead"""),
        recursive=Parameter(
            args=("--recursive", '-r',),
            action='store_const',
            const=None,
            doc="""DEPRECATED and IGNORED: removal is always a recursive
            operation"""),
    )
    # inherit some from Drop
    # if_dirty and check as deprecated
    for p in ('reckless', 'if_dirty', 'check',):
        _params_[p] = Drop._params_[p]

    _examples_ = [
        dict(text="Permanently remove a subdataset (and all further "
                  "subdatasets contained in it) from a dataset",
             code_py="remove(dataset='path/to/dataset', path='path/to/subds')",
             code_cmd="datalad remove -d <path/to/dataset> <path/to/subds>"),
        dict(text="Permanently remove a superdataset (with all subdatasets) "
                  "from the filesystem",
             code_py="remove(dataset='path/to/dataset')",
             code_cmd="datalad remove -d <path/to/dataset>"),
        dict(text="DANGER-ZONE: Fast wipe-out a dataset and all its subdataset, "
                  "bypassing all safety checks",
             code_py="remove(dataset='path/to/dataset', reckless='kill')",
             code_cmd="datalad remove -d <path/to/dataset> --reckless kill"),
    ]

    @staticmethod
    @datasetmethod(name='remove')
    @eval_results
    def __call__(
            path=None,
            *,
            dataset=None,
            drop='datasets',
            reckless=None,
            message=None,
            jobs=None,
            # deprecated below
            recursive=None,
            check=None,
            save=None,
            if_dirty=None):
        """Yield result dicts from the delegated drop(), the removal of
        remaining filesystem/Git entries, and the final save().

        Deprecated parameters (`recursive`, `check`, `save`, `if_dirty`)
        only trigger DeprecationWarnings; `check=False` is translated to
        `reckless='availability'`.
        """
        # deprecate checks
        if if_dirty is not None:
            warnings.warn(
                "The `if_dirty` argument of `datalad remove` is ignored, "
                "it can be removed for a safe-by-default behavior. For "
                "other cases consider the `reckless` argument.",
                DeprecationWarning)

        if save is not None:
            warnings.warn(
                "The `save` argument of `datalad remove` is ignored. "
                "A dataset modification is always saved. Consider "
                "`save --amend` if post-remove fix-ups are needed.",
                DeprecationWarning)

        if recursive is not None:
            warnings.warn(
                "The `recursive` argument of `datalad remove` is ignored. "
                "Removal operations are always recursive, and the parameter "
                "can be stripped from calls for a safe-by-default behavior. ",
                DeprecationWarning)

        if check is not None:
            warnings.warn(
                "The `check` argument of `datalad remove` is deprecated, "
                "use the `reckless` argument instead.",
                DeprecationWarning)
            if check is False:
                if reckless is not None:
                    raise ValueError(
                        'Must not use deprecated `check` argument, and new '
                        '`reckless` argument together with `datalad remove`.')
                reckless = 'availability'

        refds = require_dataset(dataset, check_installed=True,
                                purpose='remove')
        # same path resolution that drop will do
        paths_by_ds, errors = get_paths_by_ds(
            refds, dataset, ensure_list(path),
            # super-mode will readily tell us which datasets to
            # save as the end
            subdsroot_mode='super')

        drop_success = True
        for res in Drop.__call__(
                dataset=dataset,
                path=path,
                what=drop,
                reckless=reckless,
                recursive=True,
                recursion_limit=None,
                jobs=jobs,
                result_xfm=None,
                return_type='generator',
                result_renderer='disabled',
                # delegate error handling here
                on_failure='ignore'):
            if res.get('status') not in ('ok', 'notneeded'):
                drop_success = False
            yield res

        if not drop_success:
            # there will be 'rm -rf' below, so play safe
            lgr.debug('Observed drop failure, will not attempt remove')
            return

        for dpath, paths in paths_by_ds.items():
            for delpath in ([dpath] if paths is None else paths):
                if lexists(str(delpath)):
                    # here we still have something around on the
                    # filesystem. There is no need to fiddle with
                    # Git, just wipe it out. A later save() will
                    # act on it properly
                    if delpath.is_dir():
                        lgr.debug('Remove directory: %s', delpath)
                        rmtree(delpath)
                    # cannot use .exists() must forsee dead symlinks
                    else:
                        lgr.debug('Remove file: %s', delpath)
                        delpath.unlink()
                    continue
                # if we get here, there is nothing on the file system
                # anymore at this path. Either because the parent
                # dataset vanished already, or because we dropped a
                # dataset, and it still needs to be unregistered
                # from its parent -> `git rm`
                if dpath.exists():
                    GitRepo(dpath).call_git(
                        # no need for recursion, we know that even the root
                        # path not longer exists
                        ['rm', '-q'],
                        files=[str(delpath.relative_to(dpath))])
                # this path was already being removed by drop
                # so it must belong to a dropped dataset
                # save won't report about this, let's do it
                yield dict(
                    action='remove',
                    status='ok',
                    path=str(delpath),
                    type='dataset',
                )

        if not refds.is_installed():
            # we already dropped the whole thing
            return

        for res in Save.__call__(
                dataset=dataset,
                path=path,
                # we might have removed the reference dataset by now, recheck
                message=message if message else '[DATALAD] removed content',
                return_type='generator',
                result_renderer='disabled',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            if res.get('action') == 'delete':
                # normalize to previous remove results
                res['action'] = 'remove'
            yield res
class Publish(Interface):
    """Publish a dataset to a known :term:`sibling`.

    This makes the last saved state of a dataset available to a sibling
    or special remote data store of a dataset. Any target sibling must
    already exist and be known to the dataset.

    Optionally, it is possible to limit publication to change sets relative
    to a particular point in the version history of a dataset (e.g. a release
    tag). By default, the state of the local dataset is evaluated against the
    last known state of the target sibling. Actual publication is only
    attempted if there was a change compared to the reference state, in order
    to speed up processing of large collections of datasets. Evaluation with
    respect to a particular "historic" state is only supported in conjunction
    with a specified reference dataset. Change sets are also evaluated
    recursively, i.e. only those subdatasets are published where a change was
    recorded that is reflected in the current state of the top-level reference
    dataset. See "since" option for more information.

    Only publication of saved changes is supported. Any unsaved changes in a
    dataset (hierarchy) have to be saved before publication.

    .. note::
      Power-user info: This command uses :command:`git push`, and
      :command:`git annex copy` to publish a dataset. Publication targets are
      either configured remote Git repositories, or git-annex special remotes
      (if they support data upload).

    .. note::
      This command is deprecated. It will be removed from DataLad eventually,
      but no earlier than the 0.15 release. The `push` command (new in 0.13.0)
      provides an alternative interface. Critical differences are that `push`
      transfers annexed data by default and does not handle sibling creation
      (i.e. it does not have a `--missing` option).
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True

    # TODO: Figure out, how to tell about tracking branch/upstream
    #      (and the respective remote)
    #      - it is used, when no destination is given
    #      - it is configured to be the given destination, if there was no
    #        upstream set up before, so you can use just "datalad publish" next
    #        time.

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the (top-level) dataset to be published. If no dataset
            is given, the datasets are determined based on the input arguments""",
            constraints=EnsureDataset() | EnsureNone()),
        to=Parameter(
            args=("--to",),
            metavar='LABEL',
            doc="""name of the target sibling. If no name is given an attempt is
            made to identify the target based on the dataset's configuration
            (i.e. a configured tracking branch, or a single sibling that is
            configured for publication)""",
            # TODO: See TODO at top of class!
            constraints=EnsureStr() | EnsureNone()),
        since=Parameter(
            args=("--since",),
            constraints=EnsureStr() | EnsureNone(),
            doc="""specifies commit-ish (tag, shasum, etc.) from which to look for
            changes to decide whether pushing is necessary.
            If '^' is given, the last state of the current branch at the sibling
            is taken as a starting point. An empty string ('') for the same effect is
            still supported)."""),
        # since: commit => .gitmodules diff to head => submodules to publish
        missing=missing_sibling_opt,
        path=Parameter(
            args=("path",),
            metavar='PATH',
            # TODO this description is no longer correct
            doc="path(s), that may point to file handle(s) to publish including "
                "their actual content or to subdataset(s) to be published. If a "
                "file handle is published with its data, this implicitly means "
                "to also publish the (sub)dataset it belongs to. '.' as a path "
                "is treated in a special way in the sense, that it is passed "
                "to subdatasets in case `recursive` is also given.",
            constraints=EnsureStr() | EnsureNone(),
            nargs='*'),
        force=Parameter(
            args=("-f", "--force",),
            doc="""enforce doing publish activities (git push etc) regardless of
            the analysis if they seemed needed""",
            action='store_true'),
        # TODO add option to decide what branch/repo to push
        transfer_data=Parameter(
            args=("--transfer-data",),
            doc="""ADDME""",
            constraints=EnsureChoice('auto', 'none', 'all')),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        git_opts=git_opts,
        annex_opts=annex_opts,
        annex_copy_opts=annex_copy_opts,
        jobs=jobs_opt,
    )

    @staticmethod
    @datasetmethod(name='publish')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 to=None,
                 since=None,
                 missing='fail',
                 force=False,
                 transfer_data='auto',
                 recursive=False,
                 recursion_limit=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_copy_opts=None,
                 jobs=None):
        """Generator implementing the deprecated ``datalad publish``.

        Yields DataLad result dicts (status/message/path/...). Emits a
        DeprecationWarning on every call; `datalad push` is the replacement.
        """
        import warnings
        warnings.warn("`publish` is deprecated. Use `datalad push` instead.",
                      DeprecationWarning)

        # if ever we get a mode, for "with-data" we would need this
        #if dataset and not path:
        #    # act on the whole dataset if nothing else was specified
        #    path = dataset.path if isinstance(dataset, Dataset) else dataset

        if not (isinstance(dataset, Dataset) or (dataset is None and path)):
            # try to find a dataset in PWD
            dataset = require_dataset(
                dataset, check_installed=True, purpose='publishing')

        if (since and since != '^') and not dataset:
            # a concrete `since` commit-ish is meaningless without a dataset
            # to evaluate it against
            raise InsufficientArgumentsError(
                'Modification detection (--since) without a base dataset '
                'is not supported')

        if dataset and since in ('', '^'):
            # only update since last update so we figure out what was the last update
            active_branch = dataset.repo.get_active_branch()
            if to:
                # XXX here we assume one to one mapping of names from local branches
                # to the remote
                since = '%s/%s' % (to, active_branch)
                # test if such branch already exists,
                if since not in dataset.repo.get_remote_branches():
                    lgr.debug(
                        "No remote branch %s yet, so since will not be used",
                        since)
                    since = None
            else:
                # take tracking remote for the active branch
                tracked_remote, tracked_refspec = \
                    dataset.repo.get_tracking_branch()
                if tracked_remote:
                    if tracked_refspec.startswith('refs/heads/'):
                        # strip prefix to get a plain branch name for the
                        # '<remote>/<branch>' comparison point
                        tracked_refspec = tracked_refspec[len('refs/heads/'):]
                    #to = tracked_remote
                    since = '%s/%s' % (tracked_remote, tracked_refspec)
                else:
                    lgr.info(
                        "No tracked remote for %s. since option is of no effect",
                        active_branch)
                    since = None

        # here is the plan
        # 1. figure out remote to publish to
        # 2. figure out which content needs to be published to this remote
        # 3. look for any pre-publication dependencies of that remote
        #    (i.e. remotes that need to be published to before)
        # 4. publish the content needed to go to the primary remote to
        #    the dependencies first, and to the primary afterwards
        #
        # ds_remote_info is filled in as a side effect of _get_remote_info()
        # calls below (keyed by dataset path)
        ds_remote_info = {}

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(refds=refds_path, logger=lgr, action='publish')

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='publish',
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified="%s..HEAD" % since if since else since,
                return_type='generator',
                on_failure='ignore',
                force_no_revision_change_discovery=False,  # we cannot publish what was not committed
                force_untracked_discovery=False  # we cannot publish untracked
        ):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            remote_info_result = None
            if ap.get('type', ap.get('type_src', 'dataset')) != 'dataset':
                # for everything that is not a dataset get the remote info
                # for the parent
                parentds = ap.get('parentds', None)
                if parentds and parentds not in ds_remote_info:
                    remote_info_result = _get_remote_info(
                        parentds, ds_remote_info, to, missing)
            else:
                # this is a dataset
                if ap.get('state', None) == 'absent':
                    continue
                # get the remote info for itself
                remote_info_result = _get_remote_info(
                    ap['path'], ds_remote_info, to, missing)
                ap['process_content'] = True
            if remote_info_result is not None:
                # a non-None result signals a problem (status, message);
                # report and move on
                ap['status'] = remote_info_result[0]
                ap['message'] = remote_info_result[1]
                yield ap
                continue
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert (not completed)

        lgr.debug(
            "Evaluating %i dataset publication candidate(s)",
            len(content_by_ds))
        # TODO: fancier sorting, so we still follow somewhat the hierarchy
        #       in sorted order, e.g.
        #  d1/sub1/sub1
        #  d1/sub1
        #  d1
        #  d2/sub1
        #  d2
        # NOTE(review): reverse lexicographic path order is used as a proxy
        # for bottom-up (subdatasets before their parents)
        content_by_ds = OrderedDict(
            (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True))

        lgr.debug("Attempt to publish %i datasets", len(content_by_ds))
        for ds_path in content_by_ds:
            remote_info = ds_remote_info.get(ds_path, None)
            if remote_info is None:
                # maybe this dataset wasn't annotated above, try to get info
                # MIH: I think this entire if-branch is practically impossible
                # to reach. It is certainly untested, but I think this is due
                # to mutually exclusive conditions during remote_info detection
                remote_info_result = _get_remote_info(
                    ds_path, ds_remote_info, to, missing)
                if remote_info_result is not None:
                    yield get_status_dict(
                        type='dataset',
                        path=ds_path,
                        status=remote_info_result[0],
                        message=remote_info_result[1],
                        **res_kwargs)
                    continue
                # continue with freshly obtained info
                remote_info = ds_remote_info[ds_path]
                # condition above must catch all other cases
                assert remote_info
            # and publish
            ds = Dataset(ds_path)
            for r in _publish_dataset(
                    ds,
                    remote=remote_info['remote'],
                    refspec=remote_info.get('refspec', None),
                    # only send paths that were explicitly requested
                    paths=[p for p in content_by_ds[ds_path]
                           # do not feed (sub)dataset paths into the beast
                           # makes no sense to try to annex copy them
                           # for the base dataset itself let `transfer_data`
                           # decide
                           if p.get('type', None) != 'dataset'],
                    annex_copy_options=annex_copy_opts,
                    force=force,
                    jobs=jobs,
                    transfer_data=transfer_data,
                    **res_kwargs):
                yield r
class CreateSibling(Interface):
    """Create a dataset sibling on a UNIX-like Shell (local or SSH)-accessible machine

    Given a local dataset, and a path or SSH login information this command
    creates a remote dataset repository and configures it as a dataset sibling
    to be used as a publication target (see `publish` command).

    Various properties of the remote sibling can be configured (e.g. its name,
    location on the server, read and write access URLs, and access
    permissions).

    Optionally, a basic web-viewer for DataLad datasets can be installed at
    the remote location.

    This command supports recursive processing of dataset hierarchies, creating
    a remote sibling for each dataset in the hierarchy. By default, remote
    siblings are created in hierarchical structure that reflects the
    organization on the local file system. However, a simple templating
    mechanism is provided to produce a flat list of datasets (see
    --target-dir).
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True

    _params_ = dict(
        # TODO: Figure out, whether (and when) to use `sshurl` as push url
        dataset=Parameter(
            args=("--dataset", "-d",),
            doc="""specify the dataset to create the publication target for. If
                no dataset is given, an attempt is made to identify the dataset
                based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        sshurl=Parameter(
            args=("sshurl",),
            metavar='SSHURL',
            nargs='?',
            doc="""Login information for the target server. This can be given
                as a URL (ssh://host/path), SSH-style (user@host:path) or just
                a local path.
                Unless overridden, this also serves the future dataset's access
                URL and path on the server.""",
            constraints=EnsureStr()),
        name=Parameter(
            args=('-s', '--name',),
            metavar='NAME',
            doc="""sibling name to create for this publication target.
                If `recursive` is set, the same name will be used to label all
                the subdatasets' siblings. When creating a target dataset fails,
                no sibling is added""",
            constraints=EnsureStr() | EnsureNone(),
            nargs="?"),
        target_dir=Parameter(
            args=('--target-dir',),
            metavar='PATH',
            doc="""path to the directory *on the server* where the dataset
                shall be created. By default this is set to the URL (or local path)
                specified via [PY: `sshurl` PY][CMD: SSHURL CMD].
                If a relative path is provided here, it is
                interpreted as being relative to the user's home directory on the
                server (or relative to [PY: `sshurl` PY][CMD: SSHURL CMD], when
                that is a local path).
                Additional features are relevant for recursive processing of
                datasets with subdatasets. By default, the local
                dataset structure is replicated on the server. However, it is
                possible to provide a template for generating different target
                directory names for all (sub)datasets. Templates can contain
                certain placeholder that are substituted for each (sub)dataset.
                For example: "/mydirectory/dataset%%RELNAME".\nSupported
                placeholders:\n
                %%RELNAME - the name of the datasets, with any slashes replaced by
                dashes\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_url=Parameter(
            args=('--target-url',),
            metavar='URL',
            doc=""""public" access URL of the to-be-created target dataset(s)
                (default: `sshurl`). Accessibility of this URL determines the
                access permissions of potential consumers of the dataset.
                As with `target_dir`, templates (same set of placeholders)
                are supported.  Also, if specified, it is provided as the annex
                description\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_pushurl=Parameter(
            args=('--target-pushurl',),
            metavar='URL',
            doc="""In case the `target_url` cannot be used to publish to the
                dataset, this option specifies an alternative URL for this
                purpose. As with `target_url`, templates (same set of
                placeholders) are supported.\n""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        existing=Parameter(
            args=("--existing",),
            constraints=EnsureChoice('skip', 'error', 'reconfigure', 'replace'),
            metavar='MODE',
            doc="""action to perform, if a sibling is already configured under the
            given name and/or a target (non-empty) directory already exists.
            In this case, a dataset can be skipped ('skip'), the sibling
            configuration be updated ('reconfigure'), or process interrupts with
            error ('error'). DANGER ZONE: If 'replace' is used, an existing target
            directory will be forcefully removed, re-initialized, and the
            sibling (re-)configured (thus implies 'reconfigure').
            `replace` could lead to data loss, so use with care.
            To minimize possibility of data loss, in interactive mode DataLad
            will ask for confirmation, but it would just issue a warning and
            proceed in non-interactive mode. """,),
        inherit=inherit_opt,
        shared=Parameter(
            args=("--shared",),
            metavar='{false|true|umask|group|all|world|everybody|0xxx}',
            doc="""if given, configures the access permissions on the server
            for multi-users (this could include access by a webserver!).
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool() | EnsureNone()),
        group=Parameter(
            args=("--group",),
            metavar="GROUP",
            doc="""Filesystem group for the repository. Specifying the group is
            particularly important when [CMD: --shared=group CMD][PY:
            shared="group" PY]""",
            constraints=EnsureStr() | EnsureNone()),
        ui=Parameter(
            args=("--ui",),
            metavar='{false|true|html_filename}',
            doc="""publish a web interface for the dataset with an
            optional user-specified name for the html at publication
            target. defaults to `index.html` at dataset root""",
            constraints=EnsureBool() | EnsureStr()),
        as_common_datasrc=as_common_datasrc,
        publish_depends=publish_depends,
        publish_by_default=publish_by_default,
        annex_wanted=annex_wanted_opt,
        annex_group=annex_group_opt,
        annex_groupwanted=annex_groupwanted_opt,
        since=Parameter(
            args=("--since",),
            constraints=EnsureStr() | EnsureNone(),
            doc="""limit processing to datasets that have been changed since a given
            state (by tag, branch, commit, etc). This can be used to create siblings
            for recently added subdatasets."""),
    )

    @staticmethod
    @datasetmethod(name='create_sibling')
    @eval_results
    def __call__(sshurl,
                 name=None,
                 target_dir=None,
                 target_url=None,
                 target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None,
                 annex_group=None,
                 annex_groupwanted=None,
                 inherit=False,
                 since=None):
        """Generator implementing `datalad create-sibling`.

        Validates arguments, determines the set of (sub)datasets to process,
        creates the sibling repository for each via `_create_dataset_sibling`,
        optionally uploads a web UI, and finally triggers remote post-update
        hooks depth-first. Yields DataLad result dicts.
        """
        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #

        # possibly use sshurl to get the name in case if not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL, if no inherit option")
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified")
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                # best-effort: fall through to superdataset-based inference
                lgr.debug('%s does not know about url for %s: %s', ds, name,
                          exc_str(exc))
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings")
            # may be could be safely dropped -- still WiP

        if not sshurl:
            # TODO: may be more back up before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL"
                    % ds)
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
            # (TODO: to be able to destinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(refds_path, super_ds.path))

        # check the login URL
        sibling_ri = RI(sshurl)
        ssh_sibling = is_ssh(sibling_ri)
        if not (ssh_sibling or isinstance(sibling_ri, PathRI)):
            raise ValueError(
                "Unsupported SSH URL or path: '{0}', "
                "use ssh://host/path, host:path or path syntax".format(sshurl))

        if not name:
            # derive a default sibling name from the URL host (or "local")
            name = sibling_ri.hostname if ssh_sibling else "local"
            lgr.debug(
                "No sibling name given. Using %s'%s' as sibling name",
                "URL hostname " if ssh_sibling else "",
                name)
        if since == '':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume one to one mapping of names from local branches
            # to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        #
        # parse the base dataset to find all subdatasets that need processing
        #
        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                # only a single path!
                path=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='create_sibling',
                # both next should not happen anyways
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified=since,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent':
                # this can happen when there is `since`, but we have no
                # use for anything but datasets here
                continue
            checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
                if ap.get('state', None) != 'absent' \
                else []
            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependents as well???
                unknown_deps = set(assure_list(publish_depends)).difference(
                    checkds_remotes)
                if unknown_deps:
                    ap['status'] = 'error'
                    ap['message'] = (
                        'unknown sibling(s) specified as publication dependency: %s',
                        unknown_deps)
                    yield ap
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                ap['status'] = 'error' if existing == 'error' else 'notneeded'
                ap['message'] = (
                    "sibling '%s' already configured (specify alternative name, or force "
                    "reconfiguration via --existing",
                    name)
                yield ap
                continue
            to_process.append(ap)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if ssh_sibling:
            # request ssh connection:
            lgr.info("Connecting ...")
            shell = ssh_manager.get_connection(sshurl)
        else:
            # local target: run commands through a local-runner adapter and
            # anchor relative paths against the reference dataset
            shell = _RunnerAdapter()
            sibling_ri.path = str(resolve_path(sibling_ri.path, dataset))
            if target_dir:
                target_dir = opj(sibling_ri.path, target_dir)

        if target_dir is None:
            if sibling_ri.path:
                target_dir = sibling_ri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        # a %RELNAME placeholder means sibling layout is template-driven,
        # not a mirror of the local hierarchy
        replicate_local_structure = "%RELNAME" not in target_dir

        if not shell.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg="It's required on the {} machine to create a sibling"
                    .format('remote' if ssh_sibling else 'local'))

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom to make test
        # below valid (existing directories would cause the machinery to halt)
        # But we need to run post-update hook in depth-first fashion, so
        # would only collect first and then run (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name,
                current_ds,
                refds_path,
                shell,
                replicate_local_structure,
                sibling_ri,
                target_dir,
                target_url,
                target_pushurl,
                existing,
                shared,
                group,
                publish_depends,
                publish_by_default,
                ui,
                as_common_datasrc,
                annex_wanted,
                annex_group,
                annex_groupwanted,
                inherit
            )
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == refds_path and ui:
                lgr.info("Uploading web interface to %s" % path)
                try:
                    CreateSibling.upload_web_interface(path, shell, shared, ui)
                except CommandError as e:
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        exc_str(e))
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # in reverse order would be depth first
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                shell("cd {} "
                      "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                      "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if not currentds_ap['path'] in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap

    @staticmethod
    def _run_on_ds_ssh_remote(ds, name, ssh, cmd):
        """Given a dataset, and name of the remote, run command via ssh

        Parameters
        ----------
        cmd: str
          Will be .format()'ed given the `path` to the dataset on remote

        Returns
        -------
        out

        Raises
        ------
        CommandError
        """
        remote_url = CreateSibling._get_remote_url(ds, name)
        remote_ri = RI(remote_url)
        out, err = ssh(cmd.format(path=sh_quote(remote_ri.path)))
        if err:
            lgr.warning("Got stderr while calling ssh: %s", err)
        return out

    @staticmethod
    def _get_ds_remote_shared_setting(ds, name, ssh):
        """Figure out setting of sharedrepository for dataset's `name` remote

        Returns the remote's `core.sharedrepository` git config value
        (stripped string), or None if it could not be determined.
        """
        shared = None
        try:
            # TODO -- we might need to expanduser taking .user into account
            # but then it must be done also on remote side
            out = CreateSibling._run_on_ds_ssh_remote(
                ds, name, ssh,
                'git -C {path} config --get core.sharedrepository'
            )
            shared = out.strip()
        except CommandError as e:
            lgr.debug(
                "Could not figure out remote shared setting of %s for %s due "
                "to %s", ds, name, exc_str(e)
            )
            # could well be ok if e.g. not shared
            # TODO: more detailed analysis may be?
        return shared

    @staticmethod
    def _has_active_postupdate(ds, name, ssh):
        """Figure out either has active post-update hook

        Returns
        -------
        bool or None
          None if something went wrong and we could not figure out
        """
        has_active_post_update = None
        try:
            # TODO -- we might need to expanduser taking .user into account
            # but then it must be done also on remote side
            out = CreateSibling._run_on_ds_ssh_remote(
                ds, name, ssh,
                'cd {path} && [ -x .git/hooks/post-update ] && echo yes || echo no'
            )
            out = out.strip()
            assert out in ('yes', 'no')
            has_active_post_update = out == "yes"
        except CommandError as e:
            lgr.debug(
                "Could not figure out either %s on remote %s has active "
                "post_update hook due to %s", ds, name, exc_str(e)
            )
        return has_active_post_update

    @staticmethod
    def _get_remote_url(ds, name):
        """A little helper to get url from pushurl or from url if not defined

        Raises ValueError if the remote has neither setting configured.
        """
        # take pushurl if present, if not -- just a url
        url = ds.config.get('remote.%s.pushurl' % name) or \
            ds.config.get('remote.%s.url' % name)
        if not url:
            raise ValueError(
                "%s had neither pushurl or url defined for %s" % (ds, name))
        return url

    @staticmethod
    def init_remote_repo(path, ssh, shared, dataset, description=None):
        """Initialize a (possibly shared) git/git-annex repo at remote `path`.

        Runs `git init` (with `--shared` if `shared` is given) via the
        provided `ssh` callable, and `git annex init` when the local
        `dataset` is annex-backed. Returns True on success, False if either
        initialization command failed (errors are logged, not raised).
        """
        cmd = "git -C {} init{}".format(
            sh_quote(path),
            " --shared='{}'".format(sh_quote(shared)) if shared else '')
        try:
            ssh(cmd)
        except CommandError as e:
            lgr.error("Initialization of remote git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, exc_str(e)))
            return False

        if isinstance(dataset.repo, AnnexRepo):
            # init remote git annex repo (part fix of #463)
            try:
                ssh(
                    "git -C {} annex init {}".format(
                        sh_quote(path),
                        sh_quote(description)
                        if description else '')
                )
            except CommandError as e:
                lgr.error("Initialization of remote git annex repository failed at %s."
                          "\nError: %s\nSkipping ..." % (path, exc_str(e)))
                return False
        return True

    @staticmethod
    def create_postupdate_hook(path, ssh, dataset):
        """Install an executable post-update hook in the remote repository.

        The hook runs `git update-server-info` and (best-effort) regenerates
        DataLad web-UI metadata, logging into a timestamped file under the
        dataset's web-metadata log directory.
        """
        # location of post-update hook file, logs folder on remote target
        hooks_remote_dir = opj(path, '.git', 'hooks')
        # make sure hooks directory exists (see #1251)
        ssh('mkdir -p {}'.format(sh_quote(hooks_remote_dir)))
        hook_remote_target = opj(hooks_remote_dir, 'post-update')

        # create json command for current dataset
        # NOTE: the '%s' below is a *Python* placeholder filled with
        # TIMESTAMP_FMT (a `date +` format string evaluated on the remote),
        # not a shell substitution
        log_filename = 'datalad-publish-hook-$(date +%s).log' % TIMESTAMP_FMT
        hook_content = r'''#!/bin/bash
git update-server-info
#
# DataLad
#
# (Re)generate meta-data for DataLad Web UI and possibly init new submodules
dsdir="$(dirname $0)/../.."
logfile="$dsdir/{WEB_META_LOG}/{log_filename}"

if [ ! -e "$dsdir/.git" ]; then
    echo Assumption of being under .git has failed >&2
    exit 1
fi

mkdir -p "$dsdir/{WEB_META_LOG}"  # assure logs directory exists

( which datalad > /dev/null \
  && ( cd "$dsdir"; GIT_DIR="$PWD/.git" datalad ls -a --json file .; ) \
  || echo "E: no datalad found - skipping generation of indexes for web frontend"; \
) &> "$logfile"
'''.format(WEB_META_LOG=WEB_META_LOG, **locals())

        with make_tempfile(content=hook_content) as tempf:
            # create post_update hook script
            # upload hook to dataset
            ssh.put(tempf, hook_remote_target)
        # and make it executable
        ssh('chmod +x {}'.format(sh_quote(hook_remote_target)))

    @staticmethod
    def upload_web_interface(path, ssh, shared, ui):
        """Upload the DataLad web UI (html + assets) to the remote dataset.

        `ui` may be True (use "index.html") or a string with a custom html
        file name. JS assets are minified with `jsmin` when available.
        Access permissions on the uploaded tree are widened according to
        `shared` (mirroring `git init --shared` semantics).
        """
        # path to web interface resources on local
        webui_local = opj(dirname(datalad.__file__), 'resources', 'website')
        # local html to dataset
        html_local = opj(webui_local, "index.html")

        # name and location of web-interface html on target
        html_targetname = {True: ui, False: "index.html"}[isinstance(ui, str)]
        html_target = opj(path, html_targetname)

        # upload ui html to target
        ssh.put(html_local, html_target)

        # upload assets to the dataset
        webresources_local = opj(webui_local, 'assets')
        webresources_remote = opj(path, WEB_HTML_DIR)
        ssh('mkdir -p {}'.format(sh_quote(webresources_remote)))
        ssh.put(webresources_local, webresources_remote, recursive=True)

        # minimize and upload js assets
        for js_file in glob(opj(webresources_local, 'js', '*.js')):
            with open(js_file) as asset:
                try:
                    from jsmin import jsmin
                    # jsmin = lambda x: x   # no minimization
                    minified = jsmin(asset.read())                      # minify asset
                except ImportError:
                    lgr.warning(
                        "Will not minify web interface javascript, no jsmin available")
                    minified = asset.read()                             # no minify available
                with make_tempfile(content=minified) as tempf:          # write minified to tempfile
                    js_name = js_file.split('/')[-1]
                    ssh.put(tempf, opj(webresources_remote, 'assets', 'js', js_name))  # and upload js

        # explicitly make web+metadata dir of dataset world-readable, if shared set to 'all'
        mode = None
        if shared in (True, 'true', 'all', 'world', 'everybody'):
            mode = 'a+rX'
        elif shared == 'group':
            mode = 'g+rX'
        elif str(shared).startswith('0'):
            # an octal mode string (e.g. '0775') is passed through verbatim
            mode = shared

        if mode:
            ssh('chmod {} -R {} {}'.format(
                mode,
                sh_quote(dirname(webresources_remote)),
                sh_quote(opj(path, 'index.html'))))
class ExportToFigshare(Interface):
    """Export the content of a dataset as a ZIP archive to figshare

    Very quick and dirty approach.  Ideally figshare should be supported as
    a proper git annex special remote.  Unfortunately, figshare does not
    support having directories, and can store only a flat list of files.  That
    makes it impossible for any sensible publishing of complete datasets.

    The only workaround is to publish dataset as a zip-ball, where the entire
    content is wrapped into a .zip archive for which figshare would provide a
    navigator.
    """

    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import (
        EnsureChoice,
        EnsureInt,
        EnsureNone,
        EnsureStr,
    )

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            # NOTE: fixed a stray fourth quote that used to leak a literal '"'
            # into the rendered help text (was: `""""specify ...`)
            doc="""specify the dataset to export. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        filename=Parameter(
            args=("filename",),
            metavar="PATH",
            nargs='?',
            doc="""File name of the generated ZIP archive. If no file name is
            given the archive will be generated in the top directory
            of the dataset and will be named: datalad_<dataset_uuid>.zip.""",
            constraints=EnsureStr() | EnsureNone()),
        no_annex=Parameter(
            args=("--no-annex",),
            action="store_true",
            doc="""By default the generated .zip file would be added to annex,
            and all files would get registered in git-annex to be available
            from such a tarball. Also upon upload we will register for that
            archive to be a possible source for it in annex. Setting this flag
            disables this behavior."""),
        missing_content=Parameter(
            args=("--missing-content",),
            doc="""By default, any discovered file with missing content will
            result in an error and the plugin is aborted. Setting this to
            'continue' will issue warnings instead of failing on error. The
            value 'ignore' will only inform about problem at the 'debug' log
            level. The latter two can be helpful when generating a TAR archive
            from a dataset where some file content is not available
            locally.""",
            constraints=EnsureChoice("error", "continue", "ignore")),
        # article_id=Parameter(
        #     args=("--project-id",),
        #     metavar="ID",
        #     doc="""If given, article (if article_id is not provided) will be
        #     created in that project.""",
        #     constraints=EnsureInt() | EnsureNone()),
        article_id=Parameter(
            args=("--article-id",),
            metavar="ID",
            doc="""Which article to publish to.""",
            constraints=EnsureInt() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='export_to_figshare')
    @eval_results
    def __call__(dataset,
                 filename=None,
                 missing_content='error',
                 no_annex=False,
                 # TODO: support working with projects and articles within them
                 # project_id=None,
                 article_id=None):
        """Export the dataset as a ZIP archive and upload it to figshare.

        Yields a single result record on success; raises ``ValueError`` for a
        non-annex dataset or a missing article id, and ``RuntimeError`` for a
        dirty repository.
        """
        import logging
        lgr = logging.getLogger('datalad.plugin.export_to_figshare')

        from datalad.ui import ui
        from datalad.api import add_archive_content
        from datalad.api import export_archive
        from datalad.distribution.dataset import require_dataset
        from datalad.support.annexrepo import AnnexRepo

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='export to figshare')

        if not isinstance(dataset.repo, AnnexRepo):
            # FIX: the original message read "could be done", inverting the
            # intended meaning of this error
            raise ValueError(
                "%s is not an annex repo, so annexification cannot be done"
                % dataset)

        if dataset.repo.dirty:
            raise RuntimeError(
                "Paranoid authors of DataLad refuse to proceed in a dirty repository"
            )
        if filename is None:
            filename = dataset.path
        lgr.info(
            "Exporting current tree as an archive under %s since figshare "
            "does not support directories", filename)
        # export_archive yields result records; we need exactly the first one
        archive_out = next(
            export_archive(dataset,
                           filename=filename,
                           archivetype='zip',
                           missing_content=missing_content,
                           return_type="generator"))
        assert archive_out['status'] == 'ok'
        fname = archive_out['path']

        lgr.info("Uploading %s to figshare", fname)
        figshare = FigshareRESTLaison()

        if not article_id:
            # TODO: ask if it should be an article within a project
            if ui.is_interactive:
                # or should we just upload to a new article?
                if ui.yesno(
                        "Would you like to create a new article to upload to? "
                        "If not - we will list existing articles",
                        title="Article"):
                    article = figshare.create_article(
                        title=_enter_title(ui, dataset))
                    lgr.info(
                        "Created a new (private) article %(id)s at %(url_private_html)s. "
                        "Please visit it, enter additional meta-data and make public",
                        article)
                    article_id = article['id']
                else:
                    article_id = int(ui.question(
                        "Which of the articles should we upload to.",
                        choices=list(map(str, figshare.get_article_ids()))))
            if not article_id:
                raise ValueError("We need an article to upload to.")

        file_info = figshare.upload_file(
            fname,
            files_url='account/articles/%s/files' % article_id)

        if no_annex:
            lgr.info("Removing generated tarball")
            unlink(fname)
        else:
            # I will leave all the complaining etc to the dataset add if path
            # is outside etc
            lgr.info("'Registering' %s within annex", fname)
            repo = dataset.repo
            repo.add(fname, git=False)
            key = repo.get_file_key(fname)
            lgr.info("Adding URL %(download_url)s for it", file_info)
            # batching is disabled so the URL is registered immediately
            repo.call_annex([
                "registerurl", '-c', 'annex.alwayscommit=false', key,
                file_info['download_url']])

            lgr.info("Registering links back for the content of the archive")
            add_archive_content(
                fname,
                annex=dataset.repo,
                delete_after=True,  # just remove extracted into a temp dir
                allow_dirty=True,  # since we have a tarball
                commit=False  # we do not want to commit anything we have done here
            )

            lgr.info("Removing generated and now registered in annex archive")
            repo.drop(key, key=True, options=['--force'])
            repo.remove(fname, force=True)  # remove the tarball

            # if annex in {'delete'}:
            #     dataset.repo.remove(fname)
            # else:
            #     # kinda makes little sense I guess.
            #     # Made more sense if export_archive could export an arbitrary treeish
            #     # so we could create a branch where to dump and export to figshare
            #     # (kinda closer to my idea)
            #     dataset.save(fname, message="Added the entire dataset into a zip file")

        # TODO: add to downloader knowledge about figshare token so it could download-url
        # those zipballs before they go public
        yield dict(
            status='ok',
            # TODO: add article url (which needs to be queried if only ID is known
            message="Published archive {}".format(
                file_info['download_url']),
            file_info=file_info,
            path=dataset,
            action='export_to_figshare',
            logger=lgr)
class DownloadURL(Interface):
    """Download content

    It allows for a uniform download interface to various supported URL
    schemes (see command help for details), re-using or asking for
    authentication details maintained by datalad.
    """

    _params_ = dict(
        urls=Parameter(
            doc="""URL(s) to be downloaded. Supported protocols: {}""".format(
                ", ".join(map(repr, sorted(Provider.DOWNLOADERS)))),
            constraints=EnsureStr(),  # TODO: EnsureURL
            metavar='url',
            args=('urls',),
            nargs='+'),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='PATH',
            doc="""specify the dataset to add files to. If no dataset is given,
            an attempt is made to identify the dataset based on the current
            working directory. Use [CMD: --nosave CMD][PY: save=False PY] to
            prevent adding files to the dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        overwrite=Parameter(
            args=("-o", "--overwrite"),
            action="store_true",
            doc="""flag to overwrite it if target file exists"""),
        path=Parameter(
            args=("-O", "--path"),
            doc="""target for download. If the path has a trailing separator,
            it is treated as a directory, and each specified URL is downloaded
            under that directory to a base name taken from the URL. Without a
            trailing separator, the value specifies the name of the downloaded
            file (file name extensions inferred from the URL may be added to
            it, if they are not yet present) and only a single URL should be
            given. In both cases, leading directories will be created if
            needed. This argument defaults to the current directory.""",
            constraints=EnsureStr() | EnsureNone()),
        archive=Parameter(
            args=("--archive",),
            action="store_true",
            doc="""pass the downloaded files to [CMD: :command:`datalad
            add-archive-content --delete` CMD][PY: add_archive_content(...,
            delete=True) PY]"""),
        save=nosave_opt,
        message=save_message_opt)

    _examples_ = [
        dict(text="Download files from an http and S3 URL",
             code_py="download_url(urls=['http://example.com/file.dat', 's3://bucket/file2.dat'])",
             code_cmd="datalad download-url http://example.com/file.dat s3://bucket/file2.dat"),
        dict(text="Download a file to a path and provide a commit message",
             code_py="download_url(urls='s3://bucket/file2.dat', message='added a file', path='myfile.dat')",
             code_cmd="""datalad download-url -m 'added a file' -O myfile.dat \\
             s3://bucket/file2.dat"""),
        dict(text="Append a trailing slash to the target path "
                  "to download into a specified directory",
             code_py="download_url(['http://example.com/file.dat'], path='data/')",
             code_cmd="datalad download-url --path=data/ http://example.com/file.dat"),
        dict(text="Leave off the trailing slash to download into a regular file",
             code_py="download_url(['http://example.com/file.dat'], path='data')",
             code_cmd="datalad download-url --path=data http://example.com/file.dat"),
    ]

    @staticmethod
    @datasetmethod(name="download_url")
    @eval_results
    def __call__(urls, *, dataset=None, path=None, overwrite=False,
                 archive=False, save=True, message=None):
        # Generator command: yields one result record per URL (status 'ok' or
        # 'error'), plus any records produced by save/add-archive-content.
        from ..downloaders.http import HTTPDownloader
        from ..downloaders.providers import Providers

        # a dataset is only required when results should be saved into one;
        # failure to locate a dataset is tolerated (download still proceeds)
        ds = None
        if save or dataset:
            try:
                ds = require_dataset(
                    dataset, check_installed=True, purpose='download urls')
            except NoDatasetFound:
                pass

        common_report = {"action": "download_url", "ds": ds}

        got_ds_instance = isinstance(dataset, Dataset)
        # a trailing separator (or no path at all) marks the target as a
        # directory under which each URL is downloaded to a URL-derived name
        dir_is_target = not path or str(path).endswith(op.sep)
        path = str(resolve_path(path or op.curdir, ds=dataset))
        if dir_is_target:
            # resolve_path() doesn't preserve trailing separators. Add one for
            # the download() call.
            path = path + op.sep
        urls = ensure_list_from_str(urls)

        if not dir_is_target:
            if len(urls) > 1:
                # a single (file) target cannot receive multiple downloads
                yield get_status_dict(
                    status="error",
                    message=(
                        "When specifying multiple urls, --path should point to "
                        "a directory target (with a trailing separator). Got %r",
                        path),
                    type="file",
                    path=path,
                    **common_report)
                return
            if archive:
                # make sure the file suffix indicated by a URL is preserved
                # so that any further archive processing doesn't have to
                # employ mime type inspection in order to determine the archive
                # type
                from datalad.support.network import URL
                suffixes = PurePosixPath(URL(urls[0]).path).suffixes
                if not Path(path).suffixes == suffixes:
                    path += ''.join(suffixes)
            # we know that we have a single URL
            # download() would be fine getting an existing directory and
            # downloading the URL underneath it, but let's enforce a trailing
            # slash here for consistency.
            if op.isdir(path):
                yield get_status_dict(
                    status="error",
                    message=(
                        "Non-directory path given (no trailing separator) "
                        "but a directory with that name (after adding archive "
                        "suffix) exists"),
                    type="file",
                    path=path,
                    **common_report)
                return

        # TODO setup fancy ui.progressbars doing this in parallel and reporting overall progress
        # in % of urls which were already downloaded
        providers = Providers.from_config_files()
        downloaded_paths = []
        path_urls = {}  # maps downloaded path -> its source URL
        need_datalad_remote = False
        for url in urls:
            # somewhat "ugly"
            downloader = providers.get_provider(url).get_downloader(url)
            try:
                downloaded_path = downloader.download(url, path=path, overwrite=overwrite)
            except Exception as e:
                ce = CapturedException(e)
                yield get_status_dict(
                    status="error",
                    message=str(ce),
                    type="file",
                    path=path,
                    exception=ce,
                    **common_report)
            else:
                # downloads that needed authentication (or a non-plain-HTTP
                # downloader) require the datalad special remote so that
                # git-annex can re-fetch them later
                if not need_datalad_remote \
                   and (downloader.authenticator or downloader.credential or
                        type(downloader) != HTTPDownloader):
                    need_datalad_remote = True
                downloaded_paths.append(downloaded_path)
                path_urls[downloaded_path] = url
                yield get_status_dict(
                    status="ok",
                    type="file",
                    path=downloaded_path,
                    **common_report)

        if downloaded_paths and save and ds is not None:
            msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

            for r in Save()(
                    downloaded_paths,
                    message=msg,
                    # ATTN: Pass the original dataset argument to
                    # preserve relative path handling semantics.
                    dataset=dataset,
                    return_type="generator",
                    result_renderer='disabled',
                    result_xfm=None,
                    result_filter=None,
                    on_failure="ignore"):
                yield r

            ds_repo = ds.repo
            if isinstance(ds_repo, AnnexRepo):
                if need_datalad_remote:
                    from datalad.customremotes.base import (
                        ensure_datalad_remote,
                    )
                    ensure_datalad_remote(
                        ds_repo, autoenable=True, encryption=None)

                if got_ds_instance:
                    # Paths in `downloaded_paths` are already relative to the
                    # dataset.
                    rpaths = dict(zip(downloaded_paths, downloaded_paths))
                else:
                    # Paths in `downloaded_paths` are already relative to the
                    # current working directory. Take these relative to the
                    # dataset for use with the AnnexRepo method calls.
                    rpaths = {}
                    for orig_path, resolved in zip(
                            downloaded_paths,
                            resolve_path(downloaded_paths, ds=dataset)):
                        rpath = path_under_rev_dataset(ds, resolved)
                        if rpath:
                            rpaths[str(rpath)] = orig_path
                        else:
                            lgr.warning("Path %s not under dataset %s",
                                        orig_path, ds)
                # only annexed files can carry registered URLs
                annex_paths = [
                    p for p, annexed in zip(
                        rpaths,
                        ds_repo.is_under_annex(list(rpaths.keys())))
                    if annexed]
                if annex_paths:
                    for path in annex_paths:
                        url = path_urls[rpaths[path]]
                        try:
                            # The file is already present. This is just to
                            # register the URL.
                            ds_repo.add_url_to_file(
                                path,
                                url,
                                # avoid batch mode for single files
                                # https://github.com/datalad/datalad/issues/2849
                                batch=len(annex_paths) > 1,
                                # bypass URL size check, we already have the file
                                options=['--relaxed'])
                        except CommandError as exc:
                            # URL registration is best-effort; the downloaded
                            # file itself is already saved
                            lgr.warning("Registering %s with %s failed: %s",
                                        path, url, CapturedException(exc))

                    if archive:
                        for path in annex_paths:
                            yield from ds.add_archive_content(
                                path,
                                delete=True,
                                on_failure='ignore',
                                return_type='generator',
                                result_renderer='disabled')
class Get(Interface):
    """Get any dataset content (files/directories/subdatasets).

    This command only operates on dataset content. To obtain a new independent
    dataset from some source use the `install` command.

    By default this command operates recursively within a dataset, but not
    across potential subdatasets, i.e. if a directory is provided, all files
    in the directory are obtained. Recursion into subdatasets is supported
    too. If enabled, relevant subdatasets are detected and installed in order
    to fulfill a request.

    Known data locations for each requested file are evaluated and data are
    obtained from some available location (according to git-annex
    configuration and possibly assigned remote priorities), unless a specific
    source is specified.

    .. note::
      Power-user info: This command uses :command:`git annex get` to fulfill
      file handles.
    """

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar="PATH",
            doc="""specify the dataset to perform the add operation on, in
            which case `path` arguments are interpreted as being relative
            to this dataset. If no dataset is given, an attempt is made to
            identify a dataset for each input `path`""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path/name of the requested dataset component. The component
            must already be known to a dataset. To add new components to a
            dataset use the `add` command""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        source=Parameter(
            args=("-s", "--source",),
            metavar="LABEL",
            doc="""label of the data source to be used to fulfill requests.
            This can be the name of a dataset :term:`sibling` or another known
            source""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=Parameter(
            args=("--recursion-limit",),
            metavar="LEVELS",
            constraints=EnsureInt() | EnsureChoice('existing') | EnsureNone(),
            doc="""limit recursion into subdataset to the given number of
            levels. Alternatively, 'existing' will limit recursion to
            subdatasets that already existed on the filesystem at the start of
            processing, and prevent new subdatasets from being obtained
            recursively."""),
        get_data=Parameter(
            args=("-n", "--no-data",),
            dest='get_data',
            action='store_false',
            doc="""whether to obtain data for all file handles. If disabled,
            `get` operations are limited to dataset handles.[CMD: This option
            prevents data for file handles from being obtained CMD]"""),
        description=location_description,
        reckless=reckless_opt,
        # git_opts=git_opts,
        # annex_opts=annex_opts,
        # annex_get_opts=annex_get_opts,
        jobs=jobs_opt,
        verbose=verbose)

    # Note: May be use 'git annex find --not --in here' to have a list of all
    # files to actually get and give kind of a progress in terms of number
    # files processed ...

    @staticmethod
    @datasetmethod(name='get')
    @eval_results
    def __call__(
            path=None,
            source=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            get_data=True,
            description=None,
            reckless=False,
            #git_opts=None,
            #annex_opts=None,
            #annex_get_opts=None,
            jobs=None,
            verbose=False,
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enables (potentially
        #    obtain even more subdatasets
        # 4. Shoot info of which handles to get in each subdataset to,
        #    git-annex, once at the very end

        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # remember which results we already reported, to avoid duplicates
        yielded_ds = []
        to_get = []  # annotated paths queued for the git-annex get phase
        unavailable_paths = []  # requested but not (yet) present on disk
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='get',
                # NOTE: Do not act upon unavailable paths yet! Done below after
                # testing which ones could be obtained
                unavailable_path_status='',
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('state', None) == 'absent' and ap.get('raw_input', False):
                # if this wasn't found, but directly requested, queue for further
                # exploration
                unavailable_paths.append(ap)
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                # do not report what hasn't arived yet
                # also do not report the base dataset that is already
                # present -- no surprise
                yield dict(ap, status='notneeded', logger=lgr,
                           message='already installed')
                yielded_ds.append(ap['path'])
                ap['process_content'] = get_data
            to_get.append(ap)

        # explore the unknown
        for ap in sorted(unavailable_paths, key=lambda x: x['path']):
            lgr.debug("Investigate yet unavailable path %s", ap)
            # how close can we get?
            dspath = ap.get('parentds', get_dataset_root(ap['path']))
            if dspath is None:
                # nothing we can do for this path
                continue
            lgr.debug("Found containing dataset %s for path %s",
                      dspath, ap['path'])
            ds = Dataset(dspath)
            # now actually obtain whatever is necessary to get to this path
            containing_ds = [dspath]
            for res in _install_necessary_subdatasets(
                    ds, ap['path'], reckless, refds_path,
                    description=description):
                # yield immediately so errors could be acted upon outside, before
                # we continue
                if not (res['type'] == 'dataset' and
                        res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record, recursive below might now want to report
                        # a 'notneeded'
                        yielded_ds.append(res['path'])
                    yield res
                # update to the current innermost dataset
                containing_ds.append(res['path'])

            if len(containing_ds) < 2:
                # no subdataset was installed, hence if the path was unavailable
                # before it still is, no need to bother git annex
                ap.update(status='impossible', message='path does not exist')
                yield ap
                continue
            # important to only do the next for the innermost subdataset
            # as the `recursive` logic below relies on that!
            # set the correct parent, for a dataset this would be the second-last
            # reported subdataset
            ap.update(parentds=containing_ds[-1])
            if containing_ds[-1] == ap['path']:
                # the path actually refers to the last installed dataset
                ap.update(parentds=containing_ds[-2],
                          process_content=get_data,
                          type='dataset')
            to_get.append(ap)

        # results of recursive installation of yet undiscovered datasets
        rec_get = []
        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for ap in sorted(to_get, key=lambda x: x['path']):
                if ap['type'] not in ('dataset', 'directory') or \
                        not ap.get('raw_input', False):
                    # a non-directory cannot have content underneath
                    # also we do NOT want to recurse into anything that was specifically
                    # requested, to avoid duplication
                    continue
                subds = Dataset(ap['path']
                                if ap['type'] == 'dataset'
                                else ap['parentds'])
                lgr.info(
                    "Installing %s%s recursively",
                    subds,
                    (" underneath %s" % ap['path']
                     if subds.path != ap['path']
                     else ""))
                for res in _recursive_install_subds_underneath(
                        subds,
                        # `ap['path']` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        start=ap['path'],
                        refds_path=refds_path,
                        description=description):
                    # yield immediately so errors could be acted upon
                    # outside, before we continue
                    if not (res['type'] == 'dataset' and
                            res['path'] in yielded_ds):
                        # unless we reported on this dataset before
                        if res['type'] == 'dataset':
                            # make a record
                            yielded_ds.append(res['path'])
                        yield res
                    if not (res['status'] == 'ok' and
                            res['type'] == 'dataset'):
                        # not a dataset that was just installed, we just reported it
                        # upstairs, and can ignore it from now on
                        continue
                    # paranoia, so popular these days...
                    assert GitRepo.is_valid_repo(res['path'])
                    # keep a copy of the install record for `get` later on
                    get_ap = {k: v for k, v in res.items()
                              if not k == 'status'}
                    get_ap['process_content'] = get_data
                    rec_get.append(get_ap)

        if not get_data:
            # done already
            return

        # merge the two AP lists
        to_get.extend(rec_get)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_get,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        # hand over to git-annex, get files content,
        # report files in git as 'notneeded' to get
        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            # grab content, ignore subdataset entries
            content = [ap['path'] for ap in content_by_ds[ds_path]
                       if ap.get('type', None) != 'dataset' or
                       ap['path'] == ds.path]
            if not content:
                # cut this short should there be nothing
                continue
            # needs to be an annex to get content
            if not isinstance(ds.repo, AnnexRepo):
                for r in results_from_paths(
                        content, status='notneeded',
                        message="no dataset annex, content already present",
                        action='get', logger=lgr,
                        refds=refds_path):
                    yield r
                continue
            respath_by_status = {}
            for res in ds.repo.get(
                    content,
                    options=['--from=%s' % source] if source else [],
                    jobs=jobs):
                res = annexjson2result(res, ds, type='file', logger=lgr,
                                       refds=refds_path)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                yield res
            # report on files that git-annex had nothing to say about
            # (content already present, or tracked in git)
            for r in results_from_annex_noinfo(
                    ds, content, respath_by_status,
                    dir_fail_msg='could not get some content in %s %s',
                    noinfo_dir_msg='nothing to get from %s',
                    noinfo_file_msg='already present',
                    action='get',
                    logger=lgr,
                    refds=refds_path):
                yield r

    @staticmethod
    def custom_result_summary_renderer(res):
        # Render a human-oriented summary of all `get` results to the UI.
        from datalad.ui import ui
        from os import linesep
        if not len(res):
            ui.message("Got nothing new")
            return

        nfiles = count_results(res, type='file')
        nsuccess_file = count_results(res, type='file', status='ok')
        nfailure = nfiles - nsuccess_file
        msg = "Tried to get %d %s that had no content yet." % (
            nfiles, single_or_plural("file", "files", nfiles))
        if nsuccess_file:
            msg += " Successfully obtained %d. " % nsuccess_file
        if nfailure:
            msg += " %d (failed)." % (nfailure,)
        ui.message(msg)
        # if just a few or less than initially explicitly requested
        if len(res) < 10:
            msg = linesep.join([
                "{path}{type} ... {suc}".format(
                    suc=item.get('status'),
                    path=item.get('path'),
                    type=' [{}]'.format(item['type'])
                    if 'type' in item else '')
                for item in res])
            ui.message(msg)
class Uninstall(Interface):
    """Uninstall subdatasets

    This command can be used to uninstall any number of installed subdataset.
    If a to-be-uninstalled subdataset contains presently installed subdatasets
    itself, their recursive removal has to be enabled explicitly to avoid the
    command to exit with an error. This command will error if individual files
    or non-dataset directories are given as input (use the drop or remove
    command depending in the desired goal), nor will it uninstall top-level
    datasets (i.e. datasets that or not a subdataset in another dataset; use
    the remove command for this purpose).

    By default, the availability of at least one remote copy for each currently
    available file in any dataset is verified. As these checks could lead to
    slow operation (network latencies, etc), they can be disabled.

    Any number of paths to process can be given as input. Recursion into
    subdatasets needs to be explicitly enabled, while recursion in
    subdirectories within a dataset as always done automatically. An optional
    recursion limit is applied relative to each given input path.

    Examples:

      Uninstall a subdataset (undo installation)::

        ~/some/dataset$ datalad uninstall somesubdataset1
    """
    # name under which this command is registered (also used by the
    # @datasetmethod decorator below)
    _action = 'uninstall'

    _params_ = dict(
        dataset=dataset_argument,
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="path/name of the component to be uninstalled",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        check=check_argument,
        if_dirty=if_dirty_opt,
    )

    @staticmethod
    @datasetmethod(name=_action)
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            check=True,
            if_dirty='save-before'):
        # Generator command: annotates the requested paths, rejects anything
        # that is not an uninstallable subdataset, then uninstalls bottom-up.
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='uninstall', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `uninstall`: requires at least a path or dataset")

        to_uninstall = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                action='uninstall',
                # justification for status:
                # content need not be uninstalled where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # upfront sanity and compliance checks
            # check that we have no top-level datasets and not files to process
            if ap.get('type') == 'dataset' and \
                    not ap.get('state', None) == 'absent' and \
                    path_is_under([ap['path']]):  # wants a sequence!
                ap.update(
                    status='error',
                    message="refusing to uninstall current or parent directory")
                yield ap
                continue
            if not ap.get('type', None) == 'dataset':
                ap.update(
                    status='impossible',
                    message="can only uninstall datasets (consider the `drop` command)")
                yield ap
                continue
            # we only have dataset from here
            if not ap.get('parentds', None):
                # a dataset without a parent is a top-level dataset, which
                # this command deliberately refuses to touch
                ap.update(
                    status='error',
                    message="will not uninstall top-level dataset (consider `remove` command)")
                yield ap
                continue
            if not ap['path'] == refds_path:
                ap['process_content'] = True
            to_uninstall.append(ap)

        # iterate over all datasets, starting at the bottom
        # to deinit contained submodules first
        for ap in sorted(to_uninstall, key=lambda x: x['path'], reverse=True):
            if ap.get('state', None) == 'absent':
                # already gone
                continue
            ds = Dataset(ap['path'])
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # we confirmed the super dataset presence above
            for r in _uninstall_dataset(ds, check=check, has_super=True,
                                        **res_kwargs):
                yield r
class AddReadme(Interface):
    """Add basic information about DataLad datasets to a README file

    The README file is added to the dataset and the addition is saved
    in the dataset.
    Note: Make sure that no unsaved modifications to your dataset's
    .gitattributes file exist.
    """
    # NOTE(review): class-level imports keep these names in the class
    # namespace only; module-level names used below (e.g. AnnexRepo,
    # Interface) are presumably imported at the top of the file — not
    # visible in this chunk, confirm against the full module.
    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import (
        EnsureChoice,
        EnsureNone,
        EnsureStr,
    )

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""Dataset to add information to. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        filename=Parameter(
            args=("filename", ),
            metavar="PATH",
            nargs='?',
            doc="""Path of the README file within the dataset.""",
            constraints=EnsureStr()),
        existing=Parameter(
            args=("--existing", ),
            doc="""How to react if a file with the target name already exists:
            'skip': do nothing; 'append': append information to the existing
            file; 'replace': replace the existing file with new content.""",
            constraints=EnsureChoice("skip", "append", "replace")),
    )

    @staticmethod
    @datasetmethod(name='add_readme')
    @eval_results
    def __call__(filename='README.md', *, dataset=None, existing='skip'):
        # imports are function-local to keep module import time low
        from os.path import lexists
        from os.path import join as opj
        # NOTE(review): `from io import open` is a Python 2 compatibility
        # leftover; in Python 3 it is identical to the builtin open()
        from io import open
        import logging
        lgr = logging.getLogger('datalad.local.add_readme')

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import ensure_list

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='add README')

        # absolute path of the README inside the dataset
        fpath = opj(dataset.path, filename)
        # properties shared by the result records yielded below
        res_kwargs = dict(action='add_readme', path=fpath)

        if lexists(fpath) and existing == 'skip':
            # target exists and the caller asked to leave it alone
            yield dict(
                res_kwargs,
                status='notneeded',
                message='file already exists, and not appending content')
            return

        # unlock, file could be annexed
        if lexists(fpath):
            yield from dataset.unlock(fpath,
                                      return_type='generator',
                                      result_renderer='disabled')
        if not lexists(fpath):
            # if we have an annex repo, shall the README go to Git or annex?
            if isinstance(dataset.repo, AnnexRepo) \
                    and 'annex.largefiles' not in \
                    dataset.repo.get_gitattributes(filename).get(filename, {}):
                # configure the README to go into Git
                dataset.repo.set_gitattributes([(filename, {
                    'annex.largefiles': 'nothing'
                })])
                yield from dataset.save(
                    path='.gitattributes',
                    message="[DATALAD] Configure README to be in Git",
                    to_git=True,
                    return_type='generator',
                    result_renderer='disabled')

        # get any metadata on the dataset itself
        dsinfo = dataset.metadata('.',
                                  reporton='datasets',
                                  return_type='item-or-list',
                                  result_renderer='disabled',
                                  on_failure='ignore')
        meta = {}
        if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok':
            # metadata is optional; proceed with a generic README
            lgr.warning(
                "Could not obtain dataset metadata, proceeding without")
            dsinfo = {}
        else:
            # flatten possibly existing multiple metadata sources
            for src in dsinfo['metadata']:
                if src.startswith('@'):
                    # not a source
                    continue
                meta.update(dsinfo['metadata'][src])

        # assemble an optional markdown section per known metadata field;
        # an empty label means the content goes in without a heading
        metainfo = ''
        for label, content in (
            ('', meta.get('description', meta.get('shortdescription', ''))),
            ('Author{}'.format(
                's' if isinstance(meta.get('author', None), list) else ''),
             u'\n'.join([
                 u'- {}'.format(a)
                 for a in ensure_list(meta.get('author', []))
             ])),
            ('Homepage', meta.get('homepage', '')),
            ('Reference', meta.get('citation', '')),
            ('License', meta.get('license', '')),
            ('Keywords', u', '.join([
                u'`{}`'.format(k) for k in ensure_list(meta.get('tag', []))
            ])),
            ('Funding', meta.get('fundedby', '')),
        ):
            if label and content:
                metainfo += u'\n\n### {}\n\n{}'.format(label, content)
            elif content:
                metainfo += u'\n\n{}'.format(content)

        # derive meta['title'] from the first available fallback key;
        # the leading `break` stops as soon as a title is present
        for key in 'title', 'name', 'shortdescription':
            if 'title' in meta:
                break
            if key in meta:
                meta['title'] = meta[key]

        # README body; {title}, {metainfo} and {id} are filled via .format()
        # below — the literal backtick fences are plain text in this string
        default_content = u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

## DataLad datasets and how to use them

This repository is a [DataLad](https://www.datalad.org/) dataset. It provides
fine-grained data access down to the level of individual files, and allows for
tracking future updates. In order to use this repository for data retrieval,
[DataLad](https://www.datalad.org/) is required. It is a free and
open source command line tool, available for all major operating
systems, and builds up on Git and [git-annex](https://git-annex.branchable.com/)
to allow sharing, synchronizing, and version controlling collections of
large files. More information on how to install DataLad and
[how to install](http://handbook.datalad.org/en/latest/intro/installation.html)
it can be found in the [DataLad Handbook](https://handbook.datalad.org/en/latest/index.html).

### Get the dataset

A DataLad dataset can be `cloned` by running

```
datalad clone <url>
```

Once a dataset is cloned, it is a light-weight directory on your local machine.
At this point, it contains only small metadata and information on the identity
of the files in the dataset, but not actual *content* of the (sometimes large)
data files.

### Retrieve dataset content

After cloning a dataset, you can retrieve file contents by running

```
datalad get <path/to/directory/or/file>
```

This command will trigger a download of the files, directories, or
subdatasets you have specified.

DataLad datasets can contain other datasets, so called *subdatasets*. If you
clone the top-level dataset, subdatasets do not yet contain metadata and
information on the identity of files, but appear to be empty directories. In
order to retrieve file availability metadata in subdatasets, run

```
datalad get -n <path/to/subdataset>
```

Afterwards, you can browse the retrieved metadata to find out about
subdataset contents, and retrieve individual files with `datalad get`. If you
use `datalad get <path/to/subdataset>`, all contents of the subdataset will
be downloaded at once.

### Stay up-to-date

DataLad datasets can be updated. The command `datalad update` will *fetch*
updates and store them on a different branch (by default
`remotes/origin/master`). Running

```
datalad update --merge
```

will *pull* available updates and integrate them in one go.

### Find out what has been done

DataLad datasets contain their history in the ``git log``.
By running ``git log`` (or a tool that displays Git history) in the dataset or
on specific files, you can find out what has been done to the dataset or to
individual files by whom, and when.
""".format(
            title='Dataset "{}"'.format(meta['title'])
            if 'title' in meta else 'About this dataset',
            metainfo=metainfo,
            id=u' (id: {})'.format(dataset.id) if dataset.id else '',
        )

        # 'replace' (and a fresh file) truncates; 'append' adds to the end
        with open(fpath,
                  'a' if existing == 'append' else 'w',
                  encoding='utf-8') as fp:
            fp.write(default_content)
            yield dict(status='ok', path=fpath, type='file',
                       action='add_readme')

        yield from dataset.save(fpath,
                                message='[DATALAD] added README',
                                result_filter=None,
                                result_xfm=None,
                                return_type='generator',
                                result_renderer='disabled')