class Run(Interface): """Run an arbitrary shell command and record its impact on a dataset. It is recommended to craft the command such that it can run in the root directory of the dataset that the command will be recorded in. However, as long as the command is executed somewhere underneath the dataset root, the exact location will be recorded relative to the dataset root. If the executed command did not alter the dataset in any way, no record of the command execution is made. If the given command errors, a `CommandError` exception with the same exit code will be raised, and no modifications will be saved. *Command format* || REFLOW >> A few placeholders are supported in the command via Python format specification. "{pwd}" will be replaced with the full path of the current working directory. "{dspath}" will be replaced with the full path of the dataset that run is invoked on. "{tmpdir}" will be replaced with the full path of a temporary directory. "{inputs}" and "{outputs}" represent the values specified by [CMD: --input and --output CMD][PY: `inputs` and `outputs` PY]. If multiple values are specified, the values will be joined by a space. The order of the values will match that order from the command line, with any globs expanded in alphabetical order (like bash). Individual values can be accessed with an integer index (e.g., "{inputs[0]}"). << REFLOW || || REFLOW >> Note that the representation of the inputs or outputs in the formatted command string depends on whether the command is given as a list of arguments or as a string[CMD: (quotes surrounding the command) CMD]. The concatenated list of inputs or outputs will be surrounded by quotes when the command is given as a list but not when it is given as a string. This means that the string form is required if you need to pass each input as a separate argument to a preceding script (i.e., write the command as "./script {inputs}", quotes included). The string form should also be used if the input or output paths contain spaces or other characters that need to be escaped. << REFLOW || To escape a brace character, double it (i.e., "{{" or "}}"). Custom placeholders can be added as configuration variables under "datalad.run.substitutions". As an example: Add a placeholder "name" with the value "joe":: % git config --file=.datalad/config datalad.run.substitutions.name joe % datalad add -m "Configure name placeholder" .datalad/config Access the new placeholder in a command:: % datalad run "echo my name is {name} >me" """ _params_ = dict( cmd=Parameter( args=("cmd",), nargs=REMAINDER, metavar='COMMAND', doc="""command for execution. A leading '--' can be used to disambiguate this command from the preceding options to DataLad."""), dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to record the command results in. An attempt is made to identify the dataset based on the current working directory. If a dataset is given, the command will be executed in the root directory of this dataset.""", constraints=EnsureDataset() | EnsureNone()), inputs=Parameter( args=("-i", "--input"), dest="inputs", metavar=("PATH"), action='append', doc="""A dependency for the run. Before running the command, the content of this file will be retrieved. A value of "." means "run :command:`datalad get .`". The value can also be a glob. [CMD: This option can be given more than once. CMD]"""), outputs=Parameter( args=("-o", "--output"), dest="outputs", metavar=("PATH"), action='append', doc="""Prepare this file to be an output file of the command. 
A value of "." means "run :command:`datalad unlock .`" (and will fail if some content isn't present). For any other value, if the content of this file is present, unlock the file. Otherwise, remove it. The value can also be a glob. [CMD: This option can be given more than once. CMD]"""), expand=Parameter( args=("--expand",), metavar=("WHICH"), doc="""Expand globs when storing inputs and/or outputs in the commit message.""", constraints=EnsureNone() | EnsureChoice("inputs", "outputs", "both")), explicit=Parameter( args=("--explicit",), action="store_true", doc="""Consider the specification of inputs and outputs to be explicit. Don't warn if the repository is dirty, and only save modifications to the listed outputs."""), message=save_message_opt, sidecar=Parameter( args=('--sidecar',), metavar="yes|no", doc="""By default, the configuration variable 'datalad.run.record-sidecar' determines whether a record with information on a command's execution is placed into a separate record file instead of the commit message (default: off). This option can be used to override the configured behavior on a case-by-case basis. Sidecar files are placed into the dataset's '.datalad/runinfo' directory (customizable via the 'datalad.run.record-directory' configuration variable).""", constraints=EnsureNone() | EnsureBool()), ) @staticmethod @datasetmethod(name='run') @eval_results def __call__( cmd=None, dataset=None, inputs=None, outputs=None, expand=None, explicit=False, message=None, sidecar=None): for r in run_command(cmd, dataset=dataset, inputs=inputs, outputs=outputs, expand=expand, explicit=explicit, message=message, sidecar=sidecar): yield r
class WTF(Interface): """Generate a report about the DataLad installation and configuration IMPORTANT: Sharing this report with untrusted parties (e.g. on the web) should be done with care, as it may include identifying information, credentials, or access tokens. """ result_renderer = 'tailored' from datalad.support.param import Parameter from datalad.distribution.dataset import datasetmethod from datalad.interface.utils import eval_results from datalad.distribution.dataset import EnsureDataset from datalad.support.constraints import EnsureNone, EnsureChoice _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to report on. If no dataset is given, an attempt is made to identify the dataset based on the current working directory.""", constraints=EnsureDataset() | EnsureNone()), sensitive=Parameter( args=( "-s", "--sensitive", ), constraints=EnsureChoice(None, 'some', 'all'), doc="""if set to 'some' or 'all', display sections such as config and metadata which could potentially contain sensitive information (credentials, names, etc.). If 'some', the fields which are known to be sensitive will still be masked out"""), sections=Parameter( args=("-S", "--section"), action='append', dest='sections', metavar="SECTION", constraints=EnsureChoice(*sorted(SECTION_CALLABLES) + ['*']) | EnsureNone(), doc="""section to include. If not set, the reported sections depend on the flavor. '*' can be used to force all sections. [CMD: This option can be given multiple times. CMD]"""), flavor=Parameter( args=("--flavor", ), constraints=EnsureChoice('full', 'short'), doc= """Flavor of WTF. 'full' produces markdown with an exhaustive list of sections. 'short' provides only a condensed summary of datalad and its dependencies by default. Use [CMD: --section CMD][PY: `section` PY] to list other sections""" ), decor=Parameter( args=("-D", "--decor"), constraints=EnsureChoice('html_details') | EnsureNone(), doc="""decoration around the rendering to facilitate embedding into issues etc., e.g. use 'html_details' for posting a collapsible entry to GitHub issues."""), clipboard=Parameter( args=( "-c", "--clipboard", ), action="store_true", doc="""if set, do not print but copy to clipboard (requires the pyperclip module)"""), ) @staticmethod @datasetmethod(name='wtf') @eval_results def __call__(dataset=None, sensitive=None, sections=None, flavor="full", decor=None, clipboard=None): from datalad.distribution.dataset import require_dataset from datalad.support.exceptions import NoDatasetFound from datalad.interface.results import get_status_dict ds = None try: ds = require_dataset(dataset, check_installed=False, purpose='reporting') except NoDatasetFound: # failure is already logged pass if ds and not ds.is_installed(): # warn that the dataset is bogus yield dict( action='wtf', path=ds.path, status='impossible', message=('No dataset found at %s. Reporting on the dataset is ' 'not attempted.', ds.path), logger=lgr) # we don't deal with absent datasets ds = None if sensitive: if ds is None: from datalad import cfg else: cfg = ds.config else: cfg = None from datalad.ui import ui from datalad.support.external_versions import external_versions infos = OrderedDict() res = get_status_dict( action='wtf', path=ds.path if ds else ensure_unicode(op.abspath(op.curdir)), type='dataset' if ds else 'directory', status='ok', logger=lgr, decor=decor, infos=infos, flavor=flavor, ) # Define section callables which require variables.
# so there is no side-effect on module level original section_callables = SECTION_CALLABLES.copy() section_callables['location'] = partial(_describe_location, res) section_callables['configuration'] = \ partial(_describe_configuration, cfg, sensitive) if ds: section_callables['dataset'] = \ partial(_describe_dataset, ds, sensitive) else: section_callables.pop('dataset') assert all(section_callables.values()) # check if none was missed asked_for_all_sections = sections is not None and any( s == '*' for s in sections) if sections is None or asked_for_all_sections: if flavor == 'full' or asked_for_all_sections: sections = sorted(list(section_callables)) elif flavor == 'short': sections = ['datalad', 'dependencies'] else: raise ValueError(flavor) for s in sections: infos[s] = section_callables[s]() if clipboard: external_versions.check( 'pyperclip', msg="It is needed to be able to use clipboard") import pyperclip report = _render_report(res) pyperclip.copy(report) ui.message("WTF information of length %s copied to clipboard" % len(report)) yield res return @staticmethod def custom_result_renderer(res, **kwargs): from datalad.ui import ui out = _render_report(res) ui.message(out)
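# Editorial usage sketch (hedged, not part of the original module): the command is
# also reachable from the Python API (registered above via @datasetmethod(name='wtf')
# and assumed to be exposed as `datalad.api.wtf`); the keyword arguments used here
# mirror the parameters documented above.
def _example_wtf_usage():
    import datalad.api as dl
    # 'short' limits the report to the 'datalad' and 'dependencies' sections
    # (see the flavor handling above); sections=['*'] would force all sections.
    for res in dl.wtf(flavor='short', return_type='generator'):
        print(res['status'], res['path'])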
class CreateSibling(Interface): """Create a dataset sibling on a UNIX-like SSH-accessible machine Given a local dataset and SSH login information, this command creates a remote dataset repository and configures it as a dataset sibling to be used as a publication target (see `publish` command). Various properties of the remote sibling can be configured (e.g., name, location on the server, read and write access URLs, and access permissions). Optionally, a basic web-viewer for DataLad datasets can be installed at the remote location. This command supports recursive processing of dataset hierarchies, creating a remote sibling for each dataset in the hierarchy. By default, remote siblings are created in a hierarchical structure that reflects the organization on the local file system. However, a simple templating mechanism is provided to produce a flat list of datasets (see --target-dir). """ # XXX prevent common args from being added to the docstring _no_eval_results = True _params_ = dict( # TODO: Figure out, whether (and when) to use `sshurl` as push url dataset=Parameter( args=( "--dataset", "-d", ), doc="""specify the dataset to create the publication target for. If no dataset is given, an attempt is made to identify the dataset based on the current working directory""", constraints=EnsureDataset() | EnsureNone()), sshurl=Parameter( args=("sshurl", ), metavar='SSHURL', nargs='?', doc="""Login information for the target server. This can be given as a URL (ssh://host/path) or SSH-style (user@host:path). Unless overridden, this also serves as the future dataset's access URL and path on the server.""", constraints=EnsureStr()), name=Parameter( args=( '-s', '--name', ), metavar='NAME', doc="""sibling name to create for this publication target. If `recursive` is set, the same name will be used to label all the subdatasets' siblings. When creating a target dataset fails, no sibling is added""", constraints=EnsureStr() | EnsureNone(), nargs="?"), target_dir=Parameter( args=('--target-dir', ), metavar='PATH', doc="""path to the directory *on the server* where the dataset shall be created. By default the SSH access URL is used to identify this directory. If a relative path is provided here, it is interpreted as being relative to the user's home directory on the server.\n Additional features are relevant for recursive processing of datasets with subdatasets. By default, the local dataset structure is replicated on the server. However, it is possible to provide a template for generating different target directory names for all (sub)datasets. Templates can contain certain placeholders that are substituted for each (sub)dataset. For example: "/mydirectory/dataset%%RELNAME".\nSupported placeholders:\n %%RELNAME - the name of the dataset, with any slashes replaced by dashes\n""", constraints=EnsureStr() | EnsureNone()), target_url=Parameter( args=('--target-url', ), metavar='URL', doc=""""public" access URL of the to-be-created target dataset(s) (default: `sshurl`). Accessibility of this URL determines the access permissions of potential consumers of the dataset. As with `target_dir`, templates (same set of placeholders) are supported. Also, if specified, it is provided as the annex description\n""", constraints=EnsureStr() | EnsureNone()), target_pushurl=Parameter( args=('--target-pushurl', ), metavar='URL', doc="""In case the `target_url` cannot be used to publish to the dataset, this option specifies an alternative URL for this purpose.
As with `target_url`, templates (same set of placeholders) are supported.\n""", constraints=EnsureStr() | EnsureNone()), recursive=recursion_flag, recursion_limit=recursion_limit, existing=Parameter( args=("--existing", ), constraints=EnsureChoice('skip', 'replace', 'error', 'reconfigure'), metavar='MODE', doc= """action to perform, if a sibling is already configured under the given name and/or a target directory already exists. In this case, a dataset can be skipped ('skip'), an existing target directory be forcefully re-initialized, and the sibling (re-)configured ('replace', implies 'reconfigure'), the sibling configuration be updated only ('reconfigure'), or to error ('error').""", ), inherit=inherit_opt, shared=Parameter( args=("--shared", ), metavar='false|true|umask|group|all|world|everybody|0xxx', doc="""if given, configures the access permissions on the server for multi-users (this could include access by a webserver!). Possible values for this option are identical to those of `git init --shared` and are described in its documentation.""", constraints=EnsureStr() | EnsureBool() | EnsureNone()), group=Parameter( args=("--group", ), metavar="GROUP", doc="""Filesystem group for the repository. Specifying the group is particularly important when [CMD: --shared=group CMD][PY: shared="group" PY]""", constraints=EnsureStr() | EnsureNone()), ui=Parameter(args=("--ui", ), metavar='false|true|html_filename', doc="""publish a web interface for the dataset with an optional user-specified name for the html at publication target. defaults to `index.html` at dataset root""", constraints=EnsureBool() | EnsureStr()), as_common_datasrc=as_common_datasrc, publish_depends=publish_depends, publish_by_default=publish_by_default, annex_wanted=annex_wanted_opt, annex_group=annex_group_opt, annex_groupwanted=annex_groupwanted_opt, since=Parameter( args=("--since", ), constraints=EnsureStr() | EnsureNone(), doc= """limit processing to datasets that have been changed since a given state (by tag, branch, commit, etc). This can be used to create siblings for recently added subdatasets."""), ) @staticmethod @datasetmethod(name='create_sibling') @eval_results def __call__(sshurl, name=None, target_dir=None, target_url=None, target_pushurl=None, dataset=None, recursive=False, recursion_limit=None, existing='error', shared=None, group=None, ui=False, as_common_datasrc=None, publish_by_default=None, publish_depends=None, annex_wanted=None, annex_group=None, annex_groupwanted=None, inherit=False, since=None): # # nothing without a base dataset # ds = require_dataset(dataset, check_installed=True, purpose='creating a sibling') refds_path = ds.path # # all checks that are possible before we start parsing the dataset # # possibly use sshurl to get the name in case if not specified if not sshurl: if not inherit: raise InsufficientArgumentsError( "needs at least an SSH URL, if no inherit option") if name is None: raise ValueError( "Neither SSH URL, nor the name of sibling to inherit from " "was specified") # It might well be that we already have this remote setup try: sshurl = CreateSibling._get_remote_url(ds, name) except Exception as exc: lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc)) elif inherit: raise ValueError( "For now, for clarity not allowing specifying a custom sshurl " "while inheriting settings") # may be could be safely dropped -- still WiP if not sshurl: # TODO: may be more back up before _prep? 
super_ds = ds.get_superdataset() if not super_ds: raise ValueError( "Could not determine super dataset for %s to inherit URL" % ds) super_url = CreateSibling._get_remote_url(super_ds, name) # for now assuming hierarchical setup # (TODO: to be able to destinguish between the two, probably # needs storing datalad.*.target_dir to have %RELNAME in there) sshurl = slash_join(super_url, relpath(ds.path, super_ds.path)) # check the login URL sshri = RI(sshurl) if not is_ssh(sshri): raise ValueError( "Unsupported SSH URL: '{0}', " "use ssh://host/path or host:path syntax".format(sshurl)) if not name: # use the hostname as default remote name name = sshri.hostname lgr.debug( "No sibling name given, use URL hostname '%s' as sibling name", name) if since == '': # consider creating siblings only since the point of # the last update # XXX here we assume one to one mapping of names from local branches # to the remote active_branch = ds.repo.get_active_branch() since = '%s/%s' % (name, active_branch) # # parse the base dataset to find all subdatasets that need processing # to_process = [] for ap in AnnotatePaths.__call__( dataset=refds_path, # only a single path! path=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='create_sibling', # both next should not happen anyways unavailable_path_status='impossible', nondataset_path_status='error', modified=since, return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent': # this can happen when there is `since`, but we have no # use for anything but datasets here continue checkds_remotes = Dataset(ap['path']).repo.get_remotes() \ if ap.get('state', None) != 'absent' \ else [] if publish_depends: # make sure dependencies are valid # TODO: inherit -- we might want to automagically create # those dependents as well??? unknown_deps = set( assure_list(publish_depends)).difference(checkds_remotes) if unknown_deps: ap['status'] = 'error' ap['message'] = ( 'unknown sibling(s) specified as publication dependency: %s', unknown_deps) yield ap continue if name in checkds_remotes and existing in ('error', 'skip'): ap['status'] = 'error' if existing == 'error' else 'notneeded' ap['message'] = ( "sibling '%s' already configured (specify alternative name, or force " "reconfiguration via --existing", name) yield ap continue to_process.append(ap) if not to_process: # we ruled out all possibilities # TODO wait for gh-1218 and make better return values lgr.info("No datasets qualify for sibling creation. " "Consider different settings for --existing " "or --since if this is unexpected") return if target_dir is None: if sshri.path: target_dir = sshri.path else: target_dir = '.' 
# TODO: centralize and generalize template symbol handling replicate_local_structure = "%RELNAME" not in target_dir # request ssh connection: lgr.info("Connecting ...") assert (sshurl is not None) # delayed anal verification ssh = ssh_manager.get_connection(sshurl) if not ssh.get_annex_version(): raise MissingExternalDependency('git-annex', msg='on the remote system') # # all checks done and we have a connection, now do something # # loop over all datasets, ordered from top to bottom to make test # below valid (existing directories would cause the machinery to halt) # But we need to run post-update hook in depth-first fashion, so # would only collect first and then run (see gh #790) yielded = set() remote_repos_to_run_hook_for = [] for currentds_ap in \ sorted(to_process, key=lambda x: x['path'].count('/')): current_ds = Dataset(currentds_ap['path']) path = _create_dataset_sibling( name, current_ds, ds.path, ssh, replicate_local_structure, sshri, target_dir, target_url, target_pushurl, existing, shared, group, publish_depends, publish_by_default, ui, as_common_datasrc, annex_wanted, annex_group, annex_groupwanted, inherit) if not path: # nothing new was created # TODO is 'notneeded' appropriate in this case? currentds_ap['status'] = 'notneeded' # TODO explain status in 'message' yield currentds_ap yielded.add(currentds_ap['path']) continue remote_repos_to_run_hook_for.append((path, currentds_ap)) # publish web-interface to root dataset on publication server if current_ds.path == ds.path and ui: lgr.info("Uploading web interface to %s" % path) try: CreateSibling.upload_web_interface(path, ssh, shared, ui) except CommandError as e: currentds_ap['status'] = 'error' currentds_ap['message'] = ( "failed to push web interface to the remote datalad repository (%s)", exc_str(e)) yield currentds_ap yielded.add(currentds_ap['path']) continue # in reverse order would be depth first lgr.info("Running post-update hooks in all created siblings") # TODO: add progressbar for path, currentds_ap in remote_repos_to_run_hook_for[::-1]: # Trigger the hook lgr.debug("Running hook for %s (if exists and executable)", path) try: ssh("cd {} " "&& ( [ -x hooks/post-update ] && hooks/post-update || : )" "".format(sh_quote(_path_(path, ".git")))) except CommandError as e: currentds_ap['status'] = 'error' currentds_ap['message'] = ( "failed to run post-update hook under remote path %s (%s)", path, exc_str(e)) yield currentds_ap yielded.add(currentds_ap['path']) continue if not currentds_ap['path'] in yielded: # if we were silent until now everything is just splendid currentds_ap['status'] = 'ok' yield currentds_ap @staticmethod def _run_on_ds_ssh_remote(ds, name, ssh, cmd): """Given a dataset, and name of the remote, run command via ssh Parameters ---------- cmd: str Will be .format()'ed given the `path` to the dataset on remote Returns ------- out Raises ------ CommandError """ remote_url = CreateSibling._get_remote_url(ds, name) remote_ri = RI(remote_url) out, err = ssh(cmd.format(path=sh_quote(remote_ri.path))) if err: lgr.warning("Got stderr while calling ssh: %s", err) return out @staticmethod def _get_ds_remote_shared_setting(ds, name, ssh): """Figure out setting of sharedrepository for dataset's `name` remote""" shared = None try: # TODO -- we might need to expanduser taking .user into account # but then it must be done also on remote side out = CreateSibling._run_on_ds_ssh_remote( ds, name, ssh, 'git -C {path} config --get core.sharedrepository') shared = out.strip() except CommandError as e: lgr.debug( "Could 
not figure out remote shared setting of %s for %s due " "to %s", ds, name, exc_str(e)) # could well be ok if e.g. not shared # TODO: more detailed analysis may be? return shared @staticmethod def _has_active_postupdate(ds, name, ssh): """Figure out either has active post-update hook Returns ------- bool or None None if something went wrong and we could not figure out """ has_active_post_update = None try: # TODO -- we might need to expanduser taking .user into account # but then it must be done also on remote side out = CreateSibling._run_on_ds_ssh_remote( ds, name, ssh, 'cd {path} && [ -x .git/hooks/post-update ] && echo yes || echo no' ) out = out.strip() assert out in ('yes', 'no') has_active_post_update = out == "yes" except CommandError as e: lgr.debug( "Could not figure out either %s on remote %s has active " "post_update hook due to %s", ds, name, exc_str(e)) return has_active_post_update @staticmethod def _get_remote_url(ds, name): """A little helper to get url from pushurl or from url if not defined""" # take pushurl if present, if not -- just a url url = ds.config.get('remote.%s.pushurl' % name) or \ ds.config.get('remote.%s.url' % name) if not url: raise ValueError("%s had neither pushurl or url defined for %s" % (ds, name)) return url @staticmethod def init_remote_repo(path, ssh, shared, dataset, description=None): cmd = "git -C {} init{}".format( sh_quote(path), " --shared='{}'".format(sh_quote(shared)) if shared else '') try: ssh(cmd) except CommandError as e: lgr.error("Initialization of remote git repository failed at %s." "\nError: %s\nSkipping ..." % (path, exc_str(e))) return False if isinstance(dataset.repo, AnnexRepo): # init remote git annex repo (part fix of #463) try: ssh("git -C {} annex init {}".format( sh_quote(path), sh_quote(description) if description else '')) except CommandError as e: lgr.error( "Initialization of remote git annex repository failed at %s." "\nError: %s\nSkipping ..." % (path, exc_str(e))) return False return True @staticmethod def create_postupdate_hook(path, ssh, dataset): # location of post-update hook file, logs folder on remote target hooks_remote_dir = opj(path, '.git', 'hooks') # make sure hooks directory exists (see #1251) ssh('mkdir -p {}'.format(sh_quote(hooks_remote_dir))) hook_remote_target = opj(hooks_remote_dir, 'post-update') # create json command for current dataset log_filename = 'datalad-publish-hook-$(date +%s).log' % TIMESTAMP_FMT hook_content = r'''#!/bin/bash git update-server-info # # DataLad # # (Re)generate meta-data for DataLad Web UI and possibly init new submodules dsdir="$(dirname $0)/../.." logfile="$dsdir/{WEB_META_LOG}/{log_filename}" if [ ! 
-e "$dsdir/.git" ]; then echo Assumption of being under .git has failed >&2 exit 1 fi mkdir -p "$dsdir/{WEB_META_LOG}" # assure logs directory exists ( which datalad > /dev/null \ && ( cd "$dsdir"; GIT_DIR="$PWD/.git" datalad ls -a --json file .; ) \ || echo "E: no datalad found - skipping generation of indexes for web frontend"; \ ) &> "$logfile" '''.format(WEB_META_LOG=WEB_META_LOG, **locals()) with make_tempfile(content=hook_content) as tempf: # create post_update hook script # upload hook to dataset ssh.copy(tempf, hook_remote_target) # and make it executable ssh('chmod +x {}'.format(sh_quote(hook_remote_target))) @staticmethod def upload_web_interface(path, ssh, shared, ui): # path to web interface resources on local webui_local = opj(dirname(datalad.__file__), 'resources', 'website') # local html to dataset html_local = opj(webui_local, "index.html") # name and location of web-interface html on target html_targetname = {True: ui, False: "index.html"}[isinstance(ui, str)] html_target = opj(path, html_targetname) # upload ui html to target ssh.copy(html_local, html_target) # upload assets to the dataset webresources_local = opj(webui_local, 'assets') webresources_remote = opj(path, WEB_HTML_DIR) ssh('mkdir -p {}'.format(sh_quote(webresources_remote))) ssh.copy(webresources_local, webresources_remote, recursive=True) # minimize and upload js assets for js_file in glob(opj(webresources_local, 'js', '*.js')): with open(js_file) as asset: try: from jsmin import jsmin # jsmin = lambda x: x # no minimization minified = jsmin(asset.read()) # minify asset except ImportError: lgr.warning( "Will not minify web interface javascript, no jsmin available" ) minified = asset.read() # no minify available with make_tempfile(content=minified ) as tempf: # write minified to tempfile js_name = js_file.split('/')[-1] ssh.copy(tempf, opj(webresources_remote, 'assets', 'js', js_name)) # and upload js # explicitly make web+metadata dir of dataset world-readable, if shared set to 'all' mode = None if shared in (True, 'true', 'all', 'world', 'everybody'): mode = 'a+rX' elif shared == 'group': mode = 'g+rX' elif str(shared).startswith('0'): mode = shared if mode: ssh('chmod {} -R {} {}'.format( mode, sh_quote(dirname(webresources_remote)), sh_quote(opj(path, 'index.html'))))
class ExportArchiveORA(Interface): """Export an archive of a local annex object store for the ORA remote. Keys in the local annex object store are reorganized in a temporary directory (using links to avoid storage duplication) to use the 'hashdirlower' setup used by git-annex for bare repositories and the directory-type special remote. This alternative object store is then moved into a 7zip archive that is suitable for use in an ORA remote dataset store. Placing such an archive into:: <dataset location>/archives/archive.7z This enables the ORA special remote to locate and retrieve all keys contained in the archive. """ _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to process. If no dataset is given, an attempt is made to identify the dataset based on the current working directory""", constraints=EnsureDataset() | EnsureNone()), target=Parameter( args=("target", ), metavar="TARGET", doc="""if an existing directory, an 'archive.7z' is placed into it, otherwise this is the path to the target archive""", constraints=EnsureStr() | EnsureNone()), opts=Parameter( args=("opts", ), nargs=REMAINDER, metavar="...", doc="""list of options for 7z to replace the default '-mx0' to generate an uncompressed archive"""), ) @staticmethod @datasetmethod(name='export_archive_ora') @eval_results def __call__(target, opts=None, dataset=None): # only non-bare repos have hashdirmixed, so require one ds = require_dataset(dataset, check_installed=True, purpose='ORA archive export') ds_repo = ds.repo # TODO remove once datalad 0.12rc7 or later is released if not hasattr(ds_repo, 'dot_git'): from datalad.support.gitrepo import GitRepo ds_repo.dot_git = ds_repo.pathobj / GitRepo.get_git_dir(ds_repo) annex_objs = ds_repo.dot_git / 'annex' / 'objects' archive = resolve_path(target, dataset) if archive.is_dir(): archive = archive / 'archive.7z' else: archive.parent.mkdir(exist_ok=True, parents=True) if not opts: # uncompressed by default opts = ['-mx0'] res_kwargs = dict( action="export-archive-ora", logger=lgr, ) if not annex_objs.is_dir(): yield get_status_dict( ds=ds, status='notneeded', message='no annex keys present', **res_kwargs, ) return exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ora_archive' if exportdir.exists(): yield get_status_dict( ds=ds, status='error', message=( 'export directory already exists, please remove first: %s', str(exportdir)), **res_kwargs, ) return keypaths = [ k for k in annex_objs.glob(op.join('**', '*')) if k.is_file() ] log_progress( lgr.info, 'oraarchiveexport', 'Start ORA archive export %s', ds, total=len(keypaths), label='ORA archive export', unit=' Keys', ) link_fx = os.link for keypath in keypaths: key = keypath.name hashdir = op.join(keypath.parts[-4], keypath.parts[-3]) log_progress(lgr.info, 'oraarchiveexport', 'Export key %s to %s', key, hashdir, update=1, increment=True) keydir = exportdir / hashdir / key keydir.mkdir(parents=True, exist_ok=True) try: link_fx(str(keypath), str(keydir / key)) except OSError: lgr.warning( 'No hard links supported at %s, will copy files instead', str(keydir)) # no hard links supported # switch function after first error link_fx = shutil.copyfile link_fx(str(keypath), str(keydir / key)) log_progress(lgr.info, 'oraarchiveexport', 'Finished ORA archive export from %s', ds) try: subprocess.run( ['7z', 'u', str(archive), '.'] + opts, cwd=str(exportdir), ) yield get_status_dict(path=str(archive), type='file', status='ok', **res_kwargs) except Exception as e: yield get_status_dict(path=str(archive), type='file',
status='error', message=('7z failed: %s', exc_str(e)), **res_kwargs) return finally: rmtree(str(exportdir))
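# Editorial usage sketch (hedged, not part of the original module): exporting the
# local annex object store into a 7z archive for an ORA store, via the binding
# registered above (@datasetmethod(name='export_archive_ora')). The target path is a
# hypothetical placeholder; the 7z executable must be available on the system.
def _example_export_archive_ora_usage():
    from datalad.distribution.dataset import Dataset
    ds = Dataset('.')  # assumes a dataset with annexed content
    # an existing directory receives 'archive.7z'; any other path names the archive
    for res in ds.export_archive_ora('/tmp/ora-store/demo/archives'):
        print(res['status'], res.get('path'))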
class Drop(Interface): """Drop file content from datasets This command takes any number of paths of files and/or directories. If a common (super)dataset is given explicitly, the given paths are interpreted relative to this dataset. Recursion into subdatasets needs to be explicitly enabled, while recursion into subdirectories within a dataset is done automatically. An optional recursion limit is applied relative to each given input path. By default, the availability of at least one remote copy is verified before file content is dropped. As these checks could lead to slow operation (network latencies, etc), they can be disabled. """ _examples_ = [ dict(text="Drop single file content", code_py="drop('path/to/file')", code_cmd="datalad drop <path/to/file>"), dict(text="Drop all file content in the current dataset", code_py="drop('.')", code_cmd="datalad drop"), dict(text="Drop all file content in a dataset and all its subdatasets", code_py="drop(dataset='.', recursive=True)", code_cmd="datalad drop -d <path/to/dataset> -r"), dict(text="Disable check to ensure the configured minimum number of " "remote sources for dropped data", code_py="drop(path='path/to/content', check=False)", code_cmd="datalad drop <path/to/content> --nocheck"), ] _action = 'drop' _params_ = dict( dataset=dataset_argument, path=Parameter(args=("path", ), metavar="PATH", doc="path/name of the component to be dropped", nargs="*", constraints=EnsureStr() | EnsureNone()), recursive=recursion_flag, recursion_limit=recursion_limit, check=check_argument, if_dirty=if_dirty_opt, ) @staticmethod @datasetmethod(name=_action) @eval_results def __call__(path=None, dataset=None, recursive=False, recursion_limit=None, check=True, if_dirty='save-before'): if not dataset and not path: raise InsufficientArgumentsError( "insufficient information for `drop`: requires at least a path or dataset" ) refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(action='drop', logger=lgr, refds=refds_path) if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path to_drop = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='drop', # justification for status: # content need not be dropped where there is none unavailable_path_status='notneeded', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) == 'dataset' and \ GitRepo.is_valid_repo(ap['path']) and \ not ap['path'] == refds_path: ap['process_content'] = True if ap.get('registered_subds', False) and ap.get('state', None) == 'absent': # nothing to drop in an absent subdataset, don't be annoying # and skip silently continue to_drop.append(ap) content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_drop, refds_path=refds_path) assert (not completed) # iterate over all datasets, order doesn't matter for ds_path in content_by_ds: ds = Dataset(ds_path) # TODO generator # this should yield what it did handle_dirty_dataset(ds, mode=if_dirty) # ignore submodule entries content = [ ap['path'] for ap in content_by_ds[ds_path] if ap.get('type', None) != 'dataset' or ap['path'] == ds.path ] if not content: continue for r in _drop_files(ds, content, check=check, **res_kwargs): yield r
class Addurls(Interface): """Create and update a dataset from a list of URLs. *Format specification* Several arguments take format strings. These are similar to normal Python format strings where the names from `URL-FILE` (column names for a CSV or properties for JSON) are available as placeholders. If `URL-FILE` is a CSV file, a positional index can also be used (i.e., "{0}" for the first column). Note that a placeholder cannot contain a ':' or '!'. In addition, the `FILENAME-FORMAT` argument has a few special placeholders. - _repindex The constructed file names must be unique across all rows. To avoid collisions, the special placeholder "_repindex" can be added to the formatter. Its value will start at 0 and increment every time a file name repeats. - _url_hostname, _urlN, _url_basename* Various parts of the formatted URL are available. Take "http://datalad.org/asciicast/seamless_nested_repos.sh" as an example. "datalad.org" is stored as "_url_hostname". Components of the URL's path can be referenced as "_urlN". "_url0" and "_url1" would map to "asciicast" and "seamless_nested_repos.sh", respectively. The final part of the path is also available as "_url_basename". This name is broken down further. "_url_basename_root" and "_url_basename_ext" provide access to the root name and extension. These values are similar to the result of os.path.splitext, but, in the case of multiple periods, the extension is identified using the same length heuristic that git-annex uses. As a result, the extension of "file.tar.gz" would be ".tar.gz", not ".gz". In addition, the fields "_url_basename_root_py" and "_url_basename_ext_py" provide access to the result of os.path.splitext. - _url_filename* These are similar to _url_basename* fields, but they are obtained with a server request. This is useful if the file name is set in the Content-Disposition header. *Examples* Consider a file "avatars.csv" that contains:: who,ext,link neurodebian,png,https://avatars3.githubusercontent.com/u/260793 datalad,png,https://avatars1.githubusercontent.com/u/8927200 To download each link into a file name composed of the 'who' and 'ext' fields, we could run:: $ datalad addurls -d avatar_ds --fast avatars.csv '{link}' '{who}.{ext}' The `-d avatar_ds` is used to create a new dataset in "$PWD/avatar_ds". If we were already in a dataset and wanted to create a new subdataset in an "avatars" subdirectory, we could use "//" in the `FILENAME-FORMAT` argument:: $ datalad addurls --fast avatars.csv '{link}' 'avatars//{who}.{ext}' .. note:: For users familiar with 'git annex addurl': A large part of this plugin's functionality can be viewed as transforming data from `URL-FILE` into a "url filename" format that is fed to 'git annex addurl --batch --with-files'. """ from datalad.distribution.dataset import datasetmethod from datalad.interface.utils import eval_results from datalad.distribution.dataset import EnsureDataset from datalad.support.constraints import EnsureChoice, EnsureNone, EnsureStr from datalad.support.param import Parameter _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""Add the URLs to this dataset (or possibly subdatasets of this dataset). An empty or non-existent directory can be passed to create a new dataset. New subdatasets can be specified with `FILENAME-FORMAT`.""", constraints=EnsureDataset() | EnsureNone()), urlfile=Parameter( args=("urlfile", ), metavar="URL-FILE", doc="""A file that contains URLs or information that can be used to construct URLs.
Depending on the value of --input-type, this should be a CSV file (with a header as the first row) or a JSON file (structured as a list of objects with string values)."""), urlformat=Parameter( args=("urlformat", ), metavar="URL-FORMAT", doc="""A format string that specifies the URL for each entry. See the 'Format Specification' section above."""), filenameformat=Parameter( args=("filenameformat", ), metavar="FILENAME-FORMAT", doc="""Like `URL-FORMAT`, but this format string specifies the file to which the URL's content will be downloaded. The name should be a relative path and will be taken as relative to the top-level dataset, regardless of whether it is specified via [PY: `dataset` PY][CMD: --dataset CMD]) or inferred. The file name may contain directories. The separator "//" can be used to indicate that the left-side directory should be created as a new subdataset. See the 'Format Specification' section above."""), input_type=Parameter( args=("-t", "--input-type"), metavar="TYPE", doc="""Whether `URL-FILE` should be considered a CSV file or a JSON file. The default value, "ext", means to consider `URL-FILE` as a JSON file if it ends with ".json". Otherwise, treat it as a CSV file.""", constraints=EnsureChoice("ext", "csv", "json")), exclude_autometa=Parameter( args=("-x", "--exclude_autometa"), metavar="REGEXP", doc="""By default, metadata field=value pairs are constructed with each column in `URL-FILE`, excluding any single column that is specified via `URL-FORMAT`. This argument can be used to exclude columns that match a regular expression. If set to '*' or an empty string, automatic metadata extraction is disabled completely. This argument does not affect metadata set explicitly with --meta."""), meta=Parameter( args=( "-m", "--meta", ), metavar="FORMAT", action="append", doc="""A format string that specifies metadata. It should be structured as "<field>=<value>". As an example, "location={3}" would mean that the value for the "location" metadata field should be set the value of the fourth column. This option can be given multiple times."""), message=Parameter( args=("--message", ), metavar="MESSAGE", doc="""Use this message when committing the URL additions.""", constraints=EnsureNone() | EnsureStr()), dry_run=Parameter( args=("-n", "--dry-run"), action="store_true", doc="""Report which URLs would be downloaded to which files and then exit."""), fast=Parameter( args=("--fast", ), action="store_true", doc="""If True, add the URLs, but don't download their content. Underneath, this passes the --fast flag to `git annex addurl`."""), ifexists=Parameter( args=("--ifexists", ), doc="""What to do if a constructed file name already exists. The default behavior is to proceed with the `git annex addurl`, which will fail if the file size has changed. If set to 'overwrite', remove the old file before adding the new one. If set to 'skip', do not add the new file.""", constraints=EnsureChoice(None, "overwrite", "skip")), missing_value=Parameter( args=("--missing-value", ), metavar="VALUE", doc="""When an empty string is encountered, use this value instead.""", constraints=EnsureNone() | EnsureStr()), save=nosave_opt, version_urls=Parameter( args=("--version-urls", ), action="store_true", doc="""Try to add a version ID to the URL. This currently only has an effect on HTTP URLs for AWS S3 buckets. 
s3:// URL versioning is not yet supported, but any URL that already contains a "versionId=" parameter will be used as is."""), cfg_proc=Parameter( args=("-c", "--cfg-proc"), metavar="PROC", action='append', doc="""Pass this [PY: cfg_proc PY][CMD: --cfg_proc CMD] value when calling `create` to make datasets."""), ) @staticmethod @datasetmethod(name='addurls') @eval_results def __call__(dataset, urlfile, urlformat, filenameformat, input_type="ext", exclude_autometa=None, meta=None, message=None, dry_run=False, fast=False, ifexists=None, missing_value=None, save=True, version_urls=False, cfg_proc=None): # Temporarily work around gh-2269. url_file = urlfile url_format, filename_format = urlformat, filenameformat from requests.exceptions import RequestException from datalad.distribution.dataset import Dataset, require_dataset from datalad.interface.results import get_status_dict from datalad.support.annexrepo import AnnexRepo lgr = logging.getLogger("datalad.plugin.addurls") ds = require_dataset(dataset, check_installed=False) if ds.repo and not isinstance(ds.repo, AnnexRepo): yield get_status_dict(action="addurls", ds=ds, status="error", message="not an annex repo") return url_file = str(resolve_path(url_file, dataset)) if input_type == "ext": extension = os.path.splitext(url_file)[1] input_type = "json" if extension == ".json" else "csv" with open(url_file) as fd: try: rows, subpaths = extract(fd, input_type, url_format, filename_format, exclude_autometa, meta, dry_run, missing_value) except (ValueError, RequestException) as exc: yield get_status_dict(action="addurls", ds=ds, status="error", message=exc_str(exc)) return if not rows: yield get_status_dict(action="addurls", ds=ds, status="notneeded", message="No rows to process") return if len(rows) != len(set(row["filename"] for row in rows)): yield get_status_dict(action="addurls", ds=ds, status="error", message=("There are file name collisions; " "consider using {_repindex}")) return if dry_run: for subpath in subpaths: lgr.info("Would create a subdataset at %s", subpath) for row in rows: lgr.info("Would download %s to %s", row["url"], os.path.join(ds.path, row["filename"])) lgr.info( "Metadata: %s", sorted(u"{}={}".format(k, v) for k, v in row["meta_args"].items())) yield get_status_dict(action="addurls", ds=ds, status="ok", message="dry-run finished") return if not ds.repo: # Populate a new dataset with the URLs. for r in ds.create(result_xfm=None, return_type='generator', cfg_proc=cfg_proc): yield r annex_options = ["--fast"] if fast else [] for spath in subpaths: if os.path.exists(os.path.join(ds.path, spath)): lgr.warning("Not creating subdataset at existing path: %s", spath) else: for r in ds.create(spath, result_xfm=None, cfg_proc=cfg_proc, return_type='generator'): yield r for row in rows: # Add additional information that we'll need for various # operations. 
filename_abs = os.path.join(ds.path, row["filename"]) if row["subpath"]: ds_current = Dataset(os.path.join(ds.path, row["subpath"])) ds_filename = os.path.relpath(filename_abs, ds_current.path) else: ds_current = ds ds_filename = row["filename"] row.update({ "filename_abs": filename_abs, "ds": ds_current, "ds_filename": ds_filename }) if version_urls: num_urls = len(rows) log_progress(lgr.info, "addurls_versionurls", "Versioning %d URLs", num_urls, label="Versioning URLs", total=num_urls, unit=" URLs") for row in rows: url = row["url"] try: row["url"] = get_versioned_url(url) except (ValueError, NotImplementedError) as exc: # We don't expect this to happen because get_versioned_url # should return the original URL if it isn't an S3 bucket. # It only raises exceptions if it doesn't know how to # handle the scheme for what looks like an S3 bucket. lgr.warning("error getting version of %s: %s", row["url"], exc_str(exc)) log_progress(lgr.info, "addurls_versionurls", "Versioned result for %s: %s", url, row["url"], update=1, increment=True) log_progress(lgr.info, "addurls_versionurls", "Finished versioning URLs") files_to_add = set() for r in add_urls(rows, ifexists=ifexists, options=annex_options): if r["status"] == "ok": files_to_add.add(r["path"]) yield r msg = message or """\ [DATALAD] add files from URLs url_file='{}' url_format='{}' filename_format='{}'""".format(url_file, url_format, filename_format) if files_to_add: meta_rows = [r for r in rows if r["filename_abs"] in files_to_add] for r in add_meta(meta_rows): yield r if save: for r in ds.save(path=files_to_add, message=msg, recursive=True): yield r
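# Editorial usage sketch (hedged, not part of the original module): Python-API
# counterpart of the CLI example in the docstring above, assuming `datalad.api`
# exposes `addurls` and that "avatars.csv" exists as shown there.
def _example_addurls_usage():
    import datalad.api as dl
    dl.addurls(dataset='avatar_ds',            # created if the directory does not exist yet
               urlfile='avatars.csv',
               urlformat='{link}',
               filenameformat='{who}.{ext}',
               fast=True)                      # register URLs without downloading content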
class CreateSibling(Interface): """Create dataset(s)'s sibling (e.g., on a web server). Those (empty) datasets can then serve as a target for the `publish` command. """ _params_ = dict( # TODO: Figure out, whether (and when) to use `sshurl` as push url dataset=Parameter( args=( "--dataset", "-d", ), doc="""specify the dataset to create the publication target for. If no dataset is given, an attempt is made to identify the dataset based on the current working directory""", constraints=EnsureDataset() | EnsureNone()), sshurl=Parameter( args=("sshurl", ), metavar='SSHURL', doc="""Login information for the target server. This can be given as a URL (ssh://host/path) or SSH-style (user@host:path). Unless overridden, this also serves the future dataset's access URL and path on the server.""", constraints=EnsureStr()), target=Parameter( args=('target', ), metavar='TARGETNAME', doc="""sibling name to create for this publication target. If `recursive` is set, the same name will be used to label all the subdatasets' siblings. Note, this is just a convenience option, siblings can also be added at a later point in time. When creation target datasets fails, no siblings are added""", constraints=EnsureStr() | EnsureNone(), nargs="?"), target_dir=Parameter( args=('--target-dir', ), metavar='PATH', doc="""path to the directory *on the server* where the dataset shall be created. By default the SSH access URL is used to identify this directory. If a relative path is provided here, it is interpreted as being relative to the user's home directory on the server.\n Additional features are relevant for recursive processing of datasets with subdatasets. By default, the local dataset structure is replicated on the server. However, it is possible to provide a template for generating different target directory names for all (sub)datasets. Templates can contain certain placeholder that are substituted for each (sub)dataset. For example: "/mydirectory/dataset-%%NAME".\nSupported placeholders:\n %%NAME - the name of the datasets, with any slashes replaced by dashes\n""", constraints=EnsureStr() | EnsureNone()), target_url=Parameter( args=('--target-url', ), metavar='URL', doc=""""public" access URL of the to-be-created target dataset(s) (default: `sshurl`). Accessibility of this URL determines the access permissions of potential consumers of the dataset. As with `target_dir`, templates (same set of placeholders) are supported. Also, if specified, it is provided as the annex description\n""", constraints=EnsureStr() | EnsureNone()), target_pushurl=Parameter( args=('--target-pushurl', ), metavar='URL', doc="""In case the `target_url` cannot be used to publish to the dataset, this option specifies an alternative URL for this purpose. As with `target_url`, templates (same set of placeholders) are supported.\n""", constraints=EnsureStr() | EnsureNone()), recursive=recursion_flag, existing=Parameter( args=("--existing", ), constraints=EnsureChoice('skip', 'replace', 'error', 'reconfigure'), metavar='MODE', doc="""action to perform, if target directory exists already. Dataset is skipped if 'skip'. 'replace' forces to (re-)init the dataset, and to (re-)configure the dataset sibling, i.e. its URL(s), in case it already exists. 'reconfigure' updates metadata of the dataset sibling. 
'error' causes an exception to be raised.""", ), shared=Parameter( args=("--shared", ), metavar='false|true|umask|group|all|world|everybody|0xxx', doc="""if given, configures the access permissions on the server for multi-users (this could include access by a webserver!). Possible values for this option are identical to those of `git init --shared` and are described in its documentation.""", constraints=EnsureStr() | EnsureBool()), ui=Parameter(args=("--ui", ), metavar='false|true|html_filename', doc="""publish a web interface for the dataset with an optional user-specified name for the html at publication target. defaults to `index.html` at dataset root""", constraints=EnsureBool() | EnsureStr()), as_common_datasrc=as_common_datasrc, publish_depends=publish_depends, publish_by_default=publish_by_default, ) @staticmethod @datasetmethod(name='create_sibling') def __call__(sshurl, target=None, target_dir=None, target_url=None, target_pushurl=None, dataset=None, recursive=False, existing='error', shared=False, ui=False, as_common_datasrc=None, publish_by_default=None, publish_depends=None): if sshurl is None: raise ValueError("""insufficient information for target creation (needs at least a dataset and a SSH URL).""") if target is None and (target_url is not None or target_pushurl is not None): raise ValueError("""insufficient information for adding the target as a sibling (needs at least a name)""") # shortcut ds = require_dataset(dataset, check_installed=True, purpose='creating a sibling') assert (ds is not None and sshurl is not None and ds.repo is not None) # determine target parameters: sshri = RI(sshurl) if not isinstance(sshri, SSHRI) \ and not (isinstance(sshri, URL) and sshri.scheme == 'ssh'): raise ValueError( "Unsupported SSH URL: '{0}', use ssh://host/path or host:path syntax" .format(sshurl)) if target_dir is None: if sshri.path: target_dir = sshri.path else: target_dir = '.' # TODO: centralize and generalize template symbol handling replicate_local_structure = False if "%NAME" not in target_dir: replicate_local_structure = True # collect datasets to use: datasets = dict() datasets[basename(ds.path)] = ds if recursive: for subds in ds.get_subdatasets(recursive=True): sub_path = opj(ds.path, subds) # TODO: when enhancing Dataset/*Repo classes and therefore # adapt to moved code, make proper distinction between name and # path of a submodule, which are technically different. This # probably will become important on windows as well as whenever # we want to allow for moved worktrees. datasets[basename(ds.path) + '/' + subds] = \ Dataset(sub_path) # request ssh connection: not_supported_on_windows("TODO") lgr.info("Connecting ...") ssh = ssh_manager.get_connection(sshurl) ssh.open() # flag to check if at dataset_root at_root = True # loop over all datasets, ordered from top to bottom to make test # below valid (existing directories would cause the machinery to halt) # But we need to run post-update hook in depth-first fashion, so # would only collect first and then run (see gh #790) remote_repos_to_run_hook_for = [] for current_dspath in \ sorted(datasets.keys(), key=lambda x: x.count('/')): current_ds = datasets[current_dspath] if not current_ds.is_installed(): lgr.info("Skipping %s since not installed locally", current_dspath) continue if not replicate_local_structure: path = target_dir.replace("%NAME", current_dspath.replace("/", "-")) else: # TODO: opj depends on local platform, not the remote one. # check how to deal with it. Does windows ssh server accept # posix paths? 
vice versa? Should planned SSH class provide # tools for this issue? path = normpath( opj(target_dir, relpath(datasets[current_dspath].path, start=ds.path))) lgr.info("Creating target dataset {0} at {1}".format( current_dspath, path)) # Must be set to True only if exists and existing='reconfigure' # otherwise we might skip actions if we say existing='reconfigure' # but it did not even exist before only_reconfigure = False if path != '.': # check if target exists # TODO: Is this condition valid for != '.' only? path_exists = True try: out, err = ssh(["ls", path]) except CommandError as e: if "No such file or directory" in e.stderr and \ path in e.stderr: path_exists = False else: raise # It's an unexpected failure here if path_exists: if existing == 'error': raise RuntimeError( "Target directory %s already exists." % path) elif existing == 'skip': continue elif existing == 'replace': ssh([ "chmod", "+r+w", "-R", path ]) # enable write permissions to allow removing dir ssh(["rm", "-rf", path]) # remove target at path path_exists = False # if we succeeded in removing it elif existing == 'reconfigure': only_reconfigure = True else: raise ValueError( "Do not know how to handle existing=%s" % repr(existing)) if not path_exists: try: ssh(["mkdir", "-p", path]) except CommandError as e: lgr.error( "Remotely creating target directory failed at " "%s.\nError: %s" % (path, exc_str(e))) continue # don't (re-)initialize dataset if existing == reconfigure if not only_reconfigure: # init git and possibly annex repo if not CreateSibling.init_remote_repo(path, ssh, shared, datasets[current_dspath], description=target_url): continue # check git version on remote end lgr.info("Adjusting remote git configuration") remote_git_version = CreateSibling.get_remote_git_version(ssh) if remote_git_version and remote_git_version >= "2.4": # allow for pushing to checked out branch try: ssh(["git", "-C", path] + [ "config", "receive.denyCurrentBranch", "updateInstead" ]) except CommandError as e: lgr.error( "git config failed at remote location %s.\n" "You will not be able to push to checked out " "branch. Error: %s", path, exc_str(e)) else: lgr.error( "Git version >= 2.4 needed to configure remote." " Version detected on server: %s\nSkipping configuration" " of receive.denyCurrentBranch - you will not be able to" " publish updates to this repository. Upgrade your git" " and run with --existing=reconfigure" % remote_git_version) # enable metadata refresh on dataset updates to publication server lgr.info("Enabling git post-update hook ...") try: CreateSibling.create_postupdate_hook(path, ssh, datasets[current_dspath]) except CommandError as e: lgr.error("Failed to add json creation command to post update " "hook.\nError: %s" % exc_str(e)) # publish web-interface to root dataset on publication server if at_root and ui: lgr.info("Uploading web interface to %s" % path) at_root = False try: CreateSibling.upload_web_interface(path, ssh, shared, ui) except CommandError as e: lgr.error("Failed to push web interface to the remote " "datalad repository.\nError: %s" % exc_str(e)) remote_repos_to_run_hook_for.append(path) # in reverse order would be depth first lgr.debug("Running post-update hooks in all created siblings") for path in remote_repos_to_run_hook_for[::-1]: # Trigger the hook try: ssh( ["cd '" + _path_(path, ".git") + "' && hooks/post-update"], wrap_args=False # we wrapped here manually ) except CommandError as e: lgr.error("Failed to run post-update hook under path %s. 
" "Error: %s" % (path, exc_str(e))) if target: # add the sibling(s): lgr.debug("Adding the siblings") if target_url is None: target_url = sshurl if target_pushurl is None and sshurl != target_url: target_pushurl = sshurl AddSibling()(dataset=ds, name=target, url=target_url, pushurl=target_pushurl, recursive=recursive, fetch=True, force=existing in {'replace'}, as_common_datasrc=as_common_datasrc, publish_by_default=publish_by_default, publish_depends=publish_depends) # TODO: Return value!? # => [(Dataset, fetch_url)] @staticmethod def init_remote_repo(path, ssh, shared, dataset, description=None): cmd = ["git", "-C", path, "init"] if shared: cmd.append("--shared=%s" % shared) try: ssh(cmd) except CommandError as e: lgr.error("Initialization of remote git repository failed at %s." "\nError: %s\nSkipping ..." % (path, exc_str(e))) return False if isinstance(dataset.repo, AnnexRepo): # init remote git annex repo (part fix of #463) try: ssh(["git", "-C", path, "annex", "init"] + ([description] if description else [])) except CommandError as e: lgr.error( "Initialization of remote git annex repository failed at %s." "\nError: %s\nSkipping ..." % (path, exc_str(e))) return False return True @staticmethod def get_remote_git_version(ssh): try: # options to disable all auto so we don't trigger them while testing # for absent changes out, err = ssh(["git"] + ["version"]) assert out.strip().startswith("git version") git_version = out.strip().split()[2] lgr.debug("Detected git version on server: %s" % git_version) return LooseVersion(git_version) except CommandError as e: lgr.warning("Failed to determine git version on remote.\n" "Error: {0}\nTrying to configure anyway " "...".format(exc_str(e))) return None @staticmethod def create_postupdate_hook(path, ssh, dataset): # location of post-update hook file, logs folder on remote target hooks_remote_dir = opj(path, '.git', 'hooks') hook_remote_target = opj(hooks_remote_dir, 'post-update') # post-update hook should create its log directory if doesn't exist logs_remote_dir = opj(path, WEB_META_LOG) make_log_dir = 'mkdir -p "{}"'.format(logs_remote_dir) # create json command for current dataset json_command = r''' mkdir -p {}; ( which datalad > /dev/null \ && ( cd ..; GIT_DIR=$PWD/.git datalad ls -a --json file '{}'; ) \ || echo "no datalad found - skipping generation of indexes for web frontend"; \ ) &> "{}/{}" '''.format(logs_remote_dir, str(path), logs_remote_dir, 'datalad-publish-hook-$(date +%s).log' % TIMESTAMP_FMT) # collate content for post_update hook hook_content = '\n'.join([ '#!/bin/bash', 'git update-server-info', make_log_dir, json_command ]) with make_tempfile(content=hook_content ) as tempf: # create post_update hook script ssh.copy(tempf, hook_remote_target) # upload hook to dataset ssh(['chmod', '+x', hook_remote_target]) # and make it executable @staticmethod def upload_web_interface(path, ssh, shared, ui): # path to web interface resources on local webui_local = opj(dirname(datalad.__file__), 'resources', 'website') # local html to dataset html_local = opj(webui_local, "index.html") # name and location of web-interface html on target html_targetname = {True: ui, False: "index.html"}[isinstance(ui, str)] html_target = opj(path, html_targetname) # upload ui html to target ssh.copy(html_local, html_target) # upload assets to the dataset webresources_local = opj(webui_local, 'assets') webresources_remote = opj(path, WEB_HTML_DIR) ssh(['mkdir', '-p', webresources_remote]) ssh.copy(webresources_local, webresources_remote, recursive=True) # 
minimize and upload js assets for js_file in glob(opj(webresources_local, 'js', '*.js')): with open(js_file) as asset: try: from jsmin import jsmin minified = jsmin(asset.read()) # minify asset except ImportError: lgr.warning( "Will not minify web interface javascript, no jsmin available" ) minified = asset.read() # no minify available with make_tempfile(content=minified ) as tempf: # write minified to tempfile js_name = js_file.split('/')[-1] ssh.copy(tempf, opj(webresources_remote, 'assets', 'js', js_name)) # and upload js # explicitly make web+metadata dir of dataset world-readable, if shared set to 'all' mode = None if shared in (True, 'true', 'all', 'world', 'everybody'): mode = 'a+rX' elif shared == 'group': mode = 'g+rX' elif str(shared).startswith('0'): mode = shared if mode: ssh([ 'chmod', mode, '-R', dirname(webresources_remote), opj(path, 'index.html') ])
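The permission handling at the end of upload_web_interface() boils down to mapping the `shared` setting onto a chmod mode string. A minimal, self-contained sketch of that mapping (the function name is illustrative only, not part of the DataLad API)::

    def shared_to_chmod_mode(shared):
        # mirror the mapping used above: 'all'-style values become
        # world-readable, 'group' becomes group-readable, and an octal
        # string (e.g. '0775') is passed through verbatim
        if shared in (True, 'true', 'all', 'world', 'everybody'):
            return 'a+rX'
        if shared == 'group':
            return 'g+rX'
        if str(shared).startswith('0'):
            return shared
        return None  # nothing to adjust

    assert shared_to_chmod_mode('everybody') == 'a+rX'
    assert shared_to_chmod_mode('0775') == '0775'
    assert shared_to_chmod_mode(None) is None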
class Update(Interface): """Update a dataset from a sibling. """ # TODO: adjust docs to say: # - update from just one sibling at a time _params_ = dict( path=Parameter(args=("path", ), metavar="PATH", doc="path to be updated", nargs="*", constraints=EnsureStr() | EnsureNone()), sibling=Parameter(args=( "-s", "--sibling", ), doc="""name of the sibling to update from""", constraints=EnsureStr() | EnsureNone()), dataset=Parameter(args=("-d", "--dataset"), doc=""""specify the dataset to update. If no dataset is given, an attempt is made to identify the dataset based on the input and/or the current working directory""", constraints=EnsureDataset() | EnsureNone()), merge=Parameter( args=("--merge", ), action="store_true", doc="""merge obtained changes from the given or the default sibling""", ), recursive=recursion_flag, recursion_limit=recursion_limit, fetch_all=Parameter( args=("--fetch-all", ), action="store_true", doc="fetch updates from all known siblings", ), reobtain_data=Parameter(args=("--reobtain-data", ), action="store_true", doc="TODO"), ) @staticmethod @datasetmethod(name='update') @eval_results def __call__(path=None, sibling=None, merge=False, dataset=None, recursive=False, recursion_limit=None, fetch_all=False, reobtain_data=False): """ """ if not dataset and not path: # try to find a dataset in PWD dataset = require_dataset(None, check_installed=True, purpose='updating') refds_path = Interface.get_refds_path(dataset) if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path for ap in AnnotatePaths.__call__(dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='update', unavailable_path_status='impossible', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if not ap.get('type', None) == 'dataset': ap.update(status='impossible', message="can only update datasets") yield ap continue # this is definitely as dataset from here on ds = Dataset(ap['path']) if not ds.is_installed(): lgr.debug("Skipping update since not installed %s", ds) continue repo = ds.repo # prepare return value # TODO reuse AP for return props res = get_status_dict('update', ds=ds, logger=lgr, refds=refds_path) # get all remotes which have references (would exclude # special remotes) remotes = repo.get_remotes(**({ 'exclude_special_remotes': True } if isinstance(repo, AnnexRepo) else {})) if not remotes: res['message'] = ( "No siblings known to dataset at %s\nSkipping", repo.path) res['status'] = 'notneeded' yield res continue if not sibling: # nothing given, look for tracking branch sibling_ = repo.get_tracking_branch()[0] else: sibling_ = sibling if sibling_ and sibling_ not in remotes: res['message'] = ("'%s' not known to dataset %s\nSkipping", sibling_, repo.path) res['status'] = 'impossible' yield res continue if not sibling_ and len(remotes) == 1: # there is only one remote, must be this one sibling_ = remotes[0] if not sibling_ and len(remotes) > 1 and merge: lgr.debug("Found multiple siblings:\n%s" % remotes) res['status'] = 'impossible' res['error'] = NotImplementedError( "Multiple siblings, please specify from which to update.") yield res continue lgr.info("Updating dataset '%s' ..." 
% repo.path) # fetch remote fetch_kwargs = dict( remote=None if fetch_all else sibling_, all_=fetch_all, prune=True) # prune to not accumulate a mess over time try: repo.fetch(**fetch_kwargs) except BadName: # pragma: no cover # Workaround for # https://github.com/gitpython-developers/GitPython/issues/768 # also see https://github.com/datalad/datalad/issues/2550 # Let's try to precommit (to flush anything flushable) and do # it again repo.precommit() repo.fetch(**fetch_kwargs) # NOTE if any further access to `repo` is needed, reevaluate # ds.repo again, as it might have been converted from a GitRepo # to an AnnexRepo if merge: for fr in _update_repo(ds, sibling_, reobtain_data): yield fr res['status'] = 'ok' yield res
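For reference, a hedged usage sketch of the Update interface above. It assumes the command is exposed as `datalad.api.update` (the `@datasetmethod(name='update')` decorator also binds it as a Dataset method); the dataset path and sibling name are placeholders::

    import datalad.api as dl

    # fetch from the sibling 'origin' and merge the obtained changes,
    # recursing into installed subdatasets; each result is a status dict
    for res in dl.update(dataset='/path/to/dataset', sibling='origin',
                         merge=True, recursive=True):
        print(res['status'], res.get('path'))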
class OSFCredentials(Interface): """Gather OSF credentials for subsequent non-interactive use This command enables (re-)entry of OSF credentials for storage in a credential manager. Once credentials are known, they will be retrieved automatically on demand, and enable non-interactive use for the purpose of data transfer to and from OSF. Credentials will be verified to enable successful authentication before being stored. """ _params_ = dict( method=Parameter( args=("--method", ), doc="""authentication method to use. 'token' authentication is strongly recommended.""", constraints=EnsureChoice("token", "userpassword")), reset=Parameter( args=("--reset", ), doc="""reset existing credentials and force re-entry""", action='store_true', ), ) @staticmethod @datasetmethod(name='osf_credentials') @eval_results def __call__(method="token", reset=False): auth = None cred_spec = [] if method == 'token': cred_spec = dict(token='token') auth = Token( name='https://osf.io', url='https://osf.io/settings/tokens', ) elif method == 'userpassword': cred_spec = dict(user='******', password='******') auth = UserPassword( name='https://osf.io', url='https://osf.io/settings/account', ) else: raise ValueError( 'Unknown authentication method: {}'.format(method)) if reset and auth.is_known: auth.delete() cred = {v: auth().get(k, None) for k, v in cred_spec.items()} # now verify that the credentials work by querying the # logged in user osf = OSF(**cred) try: req = osf.session.get('https://api.osf.io/v2/users/me/') req.raise_for_status() except UnauthorizedException: auth.delete() yield dict( action='osf_credentials', status='error', message='Invalid credentials', path=None, ) return except Exception as e: yield dict( action='osf_credentials', status='impossible', message='Could not verify credentials, ' 'please try again: {}'.format(exc_str(e)), # needed to pacify DataLad 0.13.0 and earlier path=None, ) return # if we get here auth has worked fine # get some attributes for an informative message attrs = req.json().get('data', {}).get('attributes', {}) yield dict( action='osf_credentials', status='ok', message='authenticated{}{}{}'.format( ' as ' if any( attrs.get(k, None) for k in ('email', 'full_name')) else '', attrs.get('full_name', ''), ' <{}>'.format(attrs['email']) if attrs.get('email', None) else ''), # needed to pacify DataLad 0.13.0 and earlier path=None, # report effective credentials **cred, )
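A brief usage sketch for the credential entry implemented above, assuming the extension providing this class is installed and exposes it as `datalad.api.osf_credentials`::

    import datalad.api as dl

    # (re-)enter and verify an OSF access token; the result reports
    # 'ok' on successful authentication, 'error' on invalid credentials
    for res in dl.osf_credentials(method='token', reset=True):
        print(res['status'], res['message'])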
class ExtractMetadata(Interface): """Run one or more of DataLad's metadata extractors on a dataset or file. The result(s) are structured like the metadata DataLad would extract during metadata aggregation. There is one result per dataset/file. Examples: Extract metadata with two extractors from a dataset in the current directory and also from all its files:: $ datalad extract-metadata -d . --type frictionless_datapackage --type datalad_core Extract XMP metadata from a single PDF that is not part of any dataset:: $ datalad extract-metadata --type xmp Downloads/freshfromtheweb.pdf """ _params_ = dict( types=Parameter(args=("--type", ), dest="types", metavar=("NAME"), action='append', required=True, doc="""Name of a metadata extractor to be executed. [CMD: This option can be given more than once CMD]"""), files=Parameter(args=("files", ), metavar="FILE", nargs="*", doc="Path of a file to extract metadata from.", constraints=EnsureStr() | EnsureNone()), dataset=Parameter( args=("-d", "--dataset"), doc=""""Dataset to extract metadata from. If no `file` is given, metadata is extracted from all files of the dataset.""", constraints=EnsureDataset() | EnsureNone()), ) @staticmethod @datasetmethod(name='extract_metadata') @eval_results def __call__(types, files=None, dataset=None): dataset = require_dataset(dataset or curdir, purpose="extract metadata", check_installed=not files) if not files: ds = require_dataset(dataset, check_installed=True) subds = ds.subdatasets(recursive=False, result_xfm='relpaths') files = list(_get_metadatarelevant_paths(ds, subds)) dsmeta, contentmeta, error = _get_metadata(dataset, types, global_meta=True, content_meta=bool(files), paths=files) if dataset is not None and dataset.is_installed(): res = get_status_dict(action='metadata', ds=dataset, refds=dataset.path, metadata=dsmeta, status='error' if error else 'ok') yield res for p in contentmeta: res = get_status_dict(action='metadata', path=opj(dataset.path, p) if dataset else p, refds=dataset.path, metadata=contentmeta[p], type='file', status='error' if error else 'ok') if dataset: res['parentds'] = dataset.path yield res
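The docstring above gives the command-line form; the Python-API counterpart would look roughly like this, assuming exposure as `datalad.api.extract_metadata` (the file path is only a placeholder)::

    import datalad.api as dl

    # extract XMP metadata from a single PDF outside any dataset
    for res in dl.extract_metadata(types=['xmp'],
                                   files=['Downloads/freshfromtheweb.pdf']):
        print(res.get('metadata', {}))

    # run two extractors on the dataset in the current directory
    for res in dl.extract_metadata(types=['frictionless_datapackage',
                                          'datalad_core'],
                                   dataset='.'):
        print(res['action'], res['status'])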
class Diff(Interface): """Report changes of dataset components. Reports can be generated for changes between recorded revisions, or between a revision and the state of a dataset's work tree. Unlike 'git diff', this command also reports untracked content when comparing a revision to the state of the work tree. Such content is marked with the property `state='untracked'` in the command results. The following types of changes are distinguished and reported via the `state` result property: - added - copied - deleted - modified - renamed - typechange - unmerged - untracked Whenever applicable, source and/or destination revisions are reported to indicate when exactly within the requested revision range a particular component changed its status. Optionally, the reported changes can be limited to a subset of paths within a dataset. """ # make the custom renderer the default one, as the global default renderer # does not yield meaningful output for this command result_renderer = 'tailored' _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to query. If no dataset is given, an attempt is made to identify the dataset based on the input and/or the current working directory""", constraints=EnsureDataset() | EnsureNone()), path=Parameter(args=("path", ), metavar="PATH", doc="""path to be evaluated""", nargs="*", constraints=EnsureStr() | EnsureNone()), revision=Parameter( args=('--revision', ), metavar='REVISION EXPRESSION', nargs='?', doc="""comparison reference specification. Three modes are supported: 1) <revision> changes you have in your working tree relative to the named revision (this can also be a branch name, tag, commit or any label Git can understand). 2) <revision>..<revision> changes between two arbitrary revisions. 3) <revision>...<revision> changes on the branch containing and up to the second <revision>, starting at a common ancestor of both revisions."""), staged=Parameter( args=("--staged", ), action="store_true", doc="""get the changes already staged for a commit relative to an optionally given revision (by default the most recent one)""" ), ignore_subdatasets=Parameter( args=('--ignore-subdatasets', ), constraints=EnsureChoice('none', 'untracked', 'dirty', 'all'), doc="""speed up execution by (partially) not evaluating the state of subdatasets in a parent dataset. With "none" a subdataset is considered modified when it either contains untracked or modified content or its last saved state differs from that recorded in the parent dataset. When "untracked" is used subdatasets are not considered modified when they only contain untracked content (but they are still scanned for modified content). Using "dirty" ignores all changes to the work tree of subdatasets, only changes to the revisions stored in the parent dataset are shown. Using "all" hides all changes to subdatasets. Note, even with "all" recursive execution will still report other changes in any existing subdataset, only the subdataset record in a parent dataset is not evaluated."""), report_untracked=Parameter( args=('--report-untracked', ), constraints=EnsureChoice('no', 'normal', 'all'), doc="""If and how untracked content is reported when comparing a revision to the state of the work tree. 
'no': no untracked files are reported; 'normal': untracked files and entire untracked directories are reported as such; 'all': report individual files even in fully untracked directories."""), recursive=recursion_flag, recursion_limit=recursion_limit) @staticmethod @datasetmethod(name='diff') @eval_results def __call__(path=None, dataset=None, revision=None, staged=False, ignore_subdatasets='none', report_untracked='normal', recursive=False, recursion_limit=None): if not dataset and not path: # act on the whole dataset if nothing else was specified dataset = curdir refds_path = Interface.get_refds_path(dataset) to_process = [] # tracked what commit ranges we want to diff per dataset ds_diffies = {} for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='diff', # unavailable is OK, because we might query for a deleted file unavailable_path_status='', nondataset_path_status='impossible', # must not use `modified`, infinite loop otherwise modified=None, return_type='generator', on_failure='ignore'): if ap.get('status', None): # we know what to report already yield ap continue if ap.get('type', None) == 'dataset': ap['process_content'] = True if ap.get('raw_input', False) or ap['path'] == refds_path: # prepopulate the revision specs for all input paths ds_diffies[ap['path'] if ap.get('type', None) == 'dataset' else ap['parentds']] = revision to_process.append(ap) # sort into datasets content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_process, refds_path=refds_path) assert (not completed) for ds_path in sorted(content_by_ds.keys()): if ds_path not in ds_diffies: # we don't know how to diff # this was not neither an input path, not did we see it # when diffing its parent continue content_paths = content_by_ds[ds_path] revision = ds_diffies[ds_path] for r in _parse_git_diff(ds_path, diff_thingie=ds_diffies[ds_path], paths=content_paths, ignore_submodules=ignore_subdatasets, staged=staged): r.update(dict(action='diff', logger=lgr)) if refds_path: r['refds'] = refds_path if 'status' not in r: r['status'] = 'ok' if r.get('type', None) == 'dataset': # this is a subdataset report # we need to use the reported commit range to properly adjust the # query once we hit that subdataset from_rev = r.get('revision_src', '') to_rev = r.get('revision', '') subrev = '{}..{}'.format( from_rev if from_rev else PRE_INIT_COMMIT_SHA, to_rev if to_rev else '', ) if from_rev and from_rev == to_rev: # this is a special case, where subdataset reported changes without # a change in state/commit -- this is code for uncommited changes # in the subdataset (including staged ones). In such a case, we # must not provide a diff range, but only the source commit we want # to diff against # XXX if this is changed, likely the same logic in annotate_paths needs # changing too! subrev = from_rev ds_diffies[r['path']] = subrev yield r if (revision and '..' 
in revision) or report_untracked == 'no': # don't look for untracked content, we got a revision range continue for r in _get_untracked_content(ds_path, report_untracked, paths=content_paths): r.update(dict(action='diff', logger=lgr)) if refds_path: r['refds'] = refds_path if 'status' not in r: r['status'] = 'ok' yield r @staticmethod def custom_result_renderer(res, **kwargs): from datalad.ui import ui if not res['status'] == 'ok': # logging reported already return path = relpath(res['path'], start=res['refds']) \ if res.get('refds', None) else res['path'] type_ = res.get('type', res.get('type_src', '')) max_len = len('untracked(directory)') state_msg = '{}{}'.format(res['state'], '({})'.format(type_ if type_ else '')) ui.message('{fill}{state_msg}: {path}'.format( fill=' ' * max(0, max_len - len(state_msg)), state_msg=state_msg, path=path))
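To illustrate the three `revision` modes described in the docstring above, a hedged sketch using this variant's signature (assuming it is the one bound as `datalad.api.diff`; revision labels are placeholders)::

    import datalad.api as dl

    # 1) working tree relative to a named revision
    dl.diff(dataset='.', revision='HEAD~2')
    # 2) changes between two arbitrary revisions
    dl.diff(dataset='.', revision='v1.0..v2.0')
    # 3) changes on the branch containing the second revision,
    #    starting at a common ancestor of both
    dl.diff(dataset='.', revision='master...topic')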
class Save(Interface): """Save the current state of a dataset Saving the state of a dataset records all changes that have been made to it. This change record is annotated with a user-provided description. Optionally, an additional tag, such as a version, can be assigned to the saved state. Such tag enables straightforward retrieval of past versions at a later point in time. || PYTHON >> Returns ------- commit or None `None` if nothing was saved, the resulting commit otherwise. << PYTHON || """ _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc=""""specify the dataset to save. If a dataset is given, but no `files`, the entire dataset will be saved.""", constraints=EnsureDataset() | EnsureNone()), files=Parameter( args=("files", ), metavar='FILES', doc="""list of files to consider. If given, only changes made to those files are recorded in the new state.""", nargs='*', constraints=EnsureStr() | EnsureNone()), message=Parameter(args=( "-m", "--message", ), metavar='MESSAGE', doc="""a message to annotate the saved state.""", constraints=EnsureStr() | EnsureNone()), all_changes=Parameter( args=("-a", "--all-changes"), doc="""save changes of all known components in datasets that contain any of the given paths.""", action="store_true"), version_tag=Parameter(args=("--version-tag", ), metavar='ID', doc="""an additional marker for that state.""", constraints=EnsureStr() | EnsureNone()), super_datasets=super_datasets_flag, recursive=recursion_flag, recursion_limit=recursion_limit, ) @staticmethod @datasetmethod(name='save') def __call__(message=None, files=None, dataset=None, all_changes=False, version_tag=None, recursive=False, recursion_limit=None, super_datasets=False): if dataset: dataset = require_dataset(dataset, check_installed=True, purpose='saving') content_by_ds, unavailable_paths = Interface._prep( path=files, dataset=dataset, recursive=recursive, recursion_limit=recursion_limit) if unavailable_paths: lgr.warning("ignoring non-existent path(s): %s", unavailable_paths) # here we know all datasets associated with any inputs # so we can expand "all_changes" right here to avoid confusion # wrt to "super" and "intermediate" datasets discovered later on if all_changes: # and we do this by replacing any given paths with the respective # datasets' base path for ds in content_by_ds: content_by_ds[ds] = [ds] if super_datasets: content_by_ds = amend_pathspec_with_superdatasets( content_by_ds, # save up to and including the base dataset (if one is given) # otherwise up to the very top topmost=dataset if dataset else True, limit_single=False) if dataset: # stuff all paths also into the base dataset slot to make sure # we get all links between relevant subdatasets bp = content_by_ds.get(dataset.path, []) for c in content_by_ds: bp.extend(content_by_ds[c]) content_by_ds[dataset.path] = list(set(bp)) saved_ds = save_dataset_hierarchy( content_by_ds, base=dataset.path if dataset and dataset.is_installed() else None, message=message, version_tag=version_tag) return saved_ds @staticmethod def result_renderer_cmdline(res, args): from datalad.ui import ui if not res: return for ds in res: commit = ds.repo.repo.head.commit ui.message('Saved state: {0} for {1}'.format(commit.hexsha, ds))
class Publish(Interface): """Publish a dataset to a known :term:`sibling`. This makes the last saved state of a dataset available to a sibling or special remote data store of a dataset. Any target sibling must already exist and be known to the dataset. Optionally, it is possible to limit publication to change sets relative to a particular point in the version history of a dataset (e.g. a release tag). By default, the state of the local dataset is evaluated against the last known state of the target sibling. Actual publication is only attempted if there was a change compared to the reference state, in order to speed up processing of large collections of datasets. Evaluation with respect to a particular "historic" state is only supported in conjunction with a specified reference dataset. Change sets are also evaluated recursively, i.e. only those subdatasets are published where a change was recorded that is reflected in to current state of the top-level reference dataset. See "since" option for more information. Only publication of saved changes is supported. Any unsaved changes in a dataset (hierarchy) have to be saved before publication. .. note:: Power-user info: This command uses :command:`git push`, and :command:`git annex copy` to publish a dataset. Publication targets are either configured remote Git repositories, or git-annex special remotes (if their support data upload). """ # XXX prevent common args from being added to the docstring _no_eval_results = True # TODO: Figure out, how to tell about tracking branch/upstream # (and the respective remote) # - it is used, when no destination is given # - it is configured to be the given destination, if there was no # upstream set up before, so you can use just "datalad publish" next # time. _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), metavar='DATASET', doc="""specify the (top-level) dataset to be published. If no dataset is given, the datasets are determined based on the input arguments""", constraints=EnsureDataset() | EnsureNone()), to=Parameter( args=("--to", ), metavar='LABEL', doc="""name of the target sibling. If no name is given an attempt is made to identify the target based on the dataset's configuration (i.e. a configured tracking branch, or a single sibling that is configured for publication)""", # TODO: See TODO at top of class! constraints=EnsureStr() | EnsureNone()), since=Parameter( args=("--since", ), constraints=EnsureStr() | EnsureNone(), doc= """When publishing dataset(s), specifies commit (treeish, tag, etc) from which to look for changes to decide either updated publishing is necessary for this and which children. If empty argument is provided, then we will always run publish command. By default, would take from the previously published to that remote/sibling state (for the current branch)"""), # since: commit => .gitmodules diff to head => submodules to publish missing=missing_sibling_opt, path=Parameter( args=("path", ), metavar='PATH', doc="path(s), that may point to file handle(s) to publish including " "their actual content or to subdataset(s) to be published. If a " "file handle is published with its data, this implicitly means " "to also publish the (sub)dataset it belongs to. '.' 
as a path " "is treated in a special way in the sense, that it is passed " "to subdatasets in case `recursive` is also given.", constraints=EnsureStr() | EnsureNone(), nargs='*'), force=Parameter( args=( "-f", "--force", ), doc="""enforce doing publish activities (git push etc) regardless of the analysis if they seemed needed""", action='store_true'), recursive=recursion_flag, recursion_limit=recursion_limit, git_opts=git_opts, annex_opts=annex_opts, annex_copy_opts=annex_copy_opts, jobs=jobs_opt, ) @staticmethod @datasetmethod(name='publish') def __call__(path=None, dataset=None, to=None, since=None, missing='fail', force=False, recursive=False, recursion_limit=None, git_opts=None, annex_opts=None, annex_copy_opts=None, jobs=None): # if ever we get a mode, for "with-data" we would need this #if dataset and not path: # # act on the whole dataset if nothing else was specified # path = dataset.path if isinstance(dataset, Dataset) else dataset if not dataset and not path: # try to find a dataset in PWD dataset = require_dataset(None, check_installed=True, purpose='publishing') if since and not dataset: raise InsufficientArgumentsError( 'Modification detection (--since) without a base dataset ' 'is not supported') content_by_ds, unavailable_paths = Interface._prep( path=path, dataset=dataset, recursive=recursive, recursion_limit=recursion_limit, # we do not want for this command state that we want to publish # content by default by assigning paths for each sub-dataset # automagically. But if paths were provided -- sorting would # happen to point only to the submodules under those paths, and # then to stay consistent we want to copy those paths data sub_paths=bool(path)) if unavailable_paths: raise ValueError( 'cannot publish content that is not available locally: %s' % ', '.join(unavailable_paths)) # here is the plan # 1. figure out remote to publish to # 2. figure out which content needs to be published to this remote # 3. look for any pre-publication dependencies of that remote # (i.e. remotes that need to be published to before) # 4. publish the content needed to go to the primary remote to # the dependencies first, and to the primary afterwards ds_remote_info = {} lgr.debug("Evaluating %i dataset publication candidate(s)", len(content_by_ds)) # TODO: fancier sorting, so we still follow somewhat the hierarchy # in sorted order, e.g. # d1/sub1/sub1 # d1/sub1 # d1 # d2/sub1 # d2 content_by_ds = OrderedDict( (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True)) for ds_path in content_by_ds: ds = Dataset(ds_path) if to is None: # we need an upstream remote, if there's none given. We could # wait for git push to complain, but we need to explicitly # figure it out for pushing annex branch anyway and we might as # well fail right here. 
track_remote, track_refspec = ds.repo.get_tracking_branch() if not track_remote: # no tracking remote configured, but let try one more # if we only have one remote, and it has a push target # configured that is "good enough" for us cand_remotes = [ r for r in ds.repo.get_remotes() if 'remote.{}.push'.format(r) in ds.config ] if len(cand_remotes) > 1: lgr.warning( 'Target sibling ambiguous, please specific via --to' ) elif len(cand_remotes) == 1: track_remote = cand_remotes[0] else: lgr.warning( 'No target sibling configured for default publication, ' 'please specific via --to') if track_remote: ds_remote_info[ds_path] = dict( zip(('remote', 'refspec'), (track_remote, track_refspec))) elif missing == 'skip': lgr.warning('Cannot determine target sibling, skipping %s', ds) ds_remote_info[ds_path] = None else: # we have no remote given and no upstream => fail raise InsufficientArgumentsError( 'Cannot determine target sibling for %s' % (ds, )) elif to not in ds.repo.get_remotes(): # unknown given remote if missing == 'skip': lgr.warning("Unknown target sibling '%s', skipping %s", to, ds) ds_remote_info[ds_path] = None elif missing == 'inherit': superds = ds.get_superdataset() if not superds: raise RuntimeError( "%s has no super-dataset to inherit settings for the remote %s" % (ds, to)) # XXX due to difference between create-sibling and create-sibling-github # would not be as transparent to inherit for -github lgr.info( "Will try to create a sibling inheriting settings from %s", superds) # XXX explicit None as sshurl for now ds.create_sibling(None, name=to, inherit=True) ds_remote_info[ds_path] = {'remote': to} else: raise ValueError("Unknown target sibling '%s' for %s" % (to, ds)) else: # all good: remote given and is known ds_remote_info[ds_path] = {'remote': to} if dataset and since: # remove all unmodified components from the spec lgr.debug("Testing %i dataset(s) for modifications since '%s'", len(content_by_ds), since) content_by_ds = filter_unmodified(content_by_ds, dataset, since) lgr.debug("Attempt to publish %i datasets", len(content_by_ds)) published, skipped = [], [] for ds_path in content_by_ds: remote_info = ds_remote_info[ds_path] if not remote_info: # in case we are skipping lgr.debug("Skipping dataset at '%s'", ds_path) continue # and publish ds = Dataset(ds_path) pblsh, skp = _publish_dataset(ds, remote=remote_info['remote'], refspec=remote_info.get( 'refspec', None), paths=content_by_ds[ds_path], annex_copy_options=annex_copy_opts, force=force, jobs=jobs) published.extend(pblsh) skipped.extend(skp) return published, skipped @staticmethod def result_renderer_cmdline(results, args): from datalad.ui import ui for res, res_label in zip(results, ('published', 'skipped')): if not res: if res_label == 'published': ui.message("Nothing was %s" % res_label) continue msg = "{n} {obj} {res_label}:\n".format( obj='items were' if len(res) > 1 else 'item was', n=len(res), res_label=res_label) for item in res: if isinstance(item, Dataset): msg += "Dataset: %s\n" % item.path else: msg += "File: %s\n" % item ui.message(msg)
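A hedged usage sketch for the Publish interface above, assuming exposure as `datalad.api.publish`; sibling name, tag, and path are placeholders. Note that `__call__` returns the published and skipped items as a tuple::

    import datalad.api as dl

    # push saved changes (and annex'ed data under the given path) to the
    # sibling 'public', limited to datasets changed since the tag '1.0'
    published, skipped = dl.publish(dataset='.', to='public', since='1.0',
                                    recursive=True, path=['data'])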
class Save(Interface): """Save the current state of a dataset Saving the state of a dataset records changes that have been made to it. This change record is annotated with a user-provided description. Optionally, an additional tag, such as a version, can be assigned to the saved state. Such tag enables straightforward retrieval of past versions at a later point in time. Examples: Save any content underneath the current directory, without altering any potential subdataset (use --recursive for that):: % datalad save . Save any modification of known dataset content, but leave untracked files (e.g. temporary files) untouched:: % dataset save -u -d <path_to_dataset> Tag the most recent saved state of a dataset:: % dataset save -d <path_to_dataset> --version-tag bestyet .. note:: For performance reasons, any Git repository without an initial commit located inside a Dataset is ignored, and content underneath it will be saved to the respective superdataset. DataLad datasets always have an initial commit, hence are not affected by this behavior. """ # note above documents that out behavior is like that of `git add`, but # does not explicitly mention the connection to keep it simple. _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc=""""specify the dataset to save""", constraints=EnsureDataset() | EnsureNone()), path=Parameter( args=("path",), metavar='PATH', doc="""path/name of the dataset component to save. If given, only changes made to those components are recorded in the new state.""", nargs='*', constraints=EnsureStr() | EnsureNone()), message=save_message_opt, message_file=Parameter( args=("-F", "--message-file"), doc="""take the commit message from this file. This flag is mutually exclusive with -m.""", constraints=EnsureStr() | EnsureNone()), version_tag=Parameter( args=("-t", "--version-tag",), metavar='ID', doc="""an additional marker for that state. Every dataset that is touched will receive the tag.""", constraints=EnsureStr() | EnsureNone()), recursive=recursion_flag, recursion_limit=recursion_limit, updated=Parameter( args=('-u', '--updated',), action='store_true', doc="""if given, only saves previously tracked paths."""), to_git=Parameter( args=("--to-git",), action='store_true', doc="""flag whether to add data directly to Git, instead of tracking data identity only. Usually this is not desired, as it inflates dataset sizes and impacts flexibility of data transport. If not specified - it will be up to git-annex to decide, possibly on .gitattributes options. Use this flag with a simultaneous selection of paths to save. In general, it is better to pre-configure a dataset to track particular paths, file types, or file sizes with either Git or git-annex. 
See https://git-annex.branchable.com/tips/largefiles/"""), ) @staticmethod @datasetmethod(name='save') @eval_results def __call__(path=None, message=None, dataset=None, version_tag=None, recursive=False, recursion_limit=None, updated=False, message_file=None, to_git=None, ): if message and message_file: raise ValueError( "Both a message and message file were specified for save()") path = assure_list(path) if message_file: with open(message_file) as mfh: message = mfh.read() # we want 'normal' to achieve the most compact argument list # for git calls # untracked_mode = 'no' if updated else 'normal' # TODO however, Repo.add() would refuse to add any dotfiles # in a directory that is itself untracked, hence the only # choice is to go with potentially crazy long lists # until https://github.com/datalad/datalad/issues/1454 # has a resolution untracked_mode = 'no' if updated else 'all' # there are three basic scenarios: # 1. save modifications to any already tracked content # 2. save any content (including removal of deleted content) # to bring things to a clean state # 3. like (2), but only operate on a given subset of content # identified by paths # - all three have to work in conjunction with --recursive # - the difference between (1) and (2) should be no more # that a switch from --untracked=no to --untracked=all # in Repo.save() # we do not support # - simultaneous operations on multiple datasets from disjoint # dataset hierarchies, hence a single reference dataset must be # identifiable from the either # - curdir or # - the `dataset` argument. # This avoids complex annotation loops and hierarchy tracking. # - any modification upwards from the root dataset ds = require_dataset(dataset, check_installed=True, purpose='saving') # use status() to do all discovery and annotation of paths paths_by_ds = {} for s in Status()( # ATTN: it is vital to pass the `dataset` argument as it, # and not a dataset instance in order to maintain the path # semantics between here and the status() call dataset=dataset, path=path, untracked=untracked_mode, recursive=recursive, recursion_limit=recursion_limit, result_renderer='disabled'): # fish out status dict for this parent dataset ds_status = paths_by_ds.get(s['parentds'], {}) # reassemble path status info as repo.status() would have made it ds_status[ut.Path(s['path'])] = \ {k: v for k, v in iteritems(s) if k not in ( 'path', 'parentds', 'refds', 'status', 'action', 'logger')} paths_by_ds[s['parentds']] = ds_status lgr.debug('Determined %i datasets for saving from input arguments', len(paths_by_ds)) # figure out what datasets to process, start with the ones containing # the paths that were given as arguments discovered_datasets = list(paths_by_ds.keys()) if dataset: # if a reference dataset was given we want to save all the way up # to it, so let's throw it into the mix discovered_datasets.append(ds.path) # sort the datasets into (potentially) disjoint hierarchies, # or a single one, if a reference dataset was given dataset_hierarchies = get_tree_roots(discovered_datasets) for rootds, children in iteritems(dataset_hierarchies): edges = {} discover_dataset_trace_to_targets( rootds, children, [], edges, includeds=children) for superds, subdss in iteritems(edges): superds_status = paths_by_ds.get(superds, {}) for subds in subdss: # TODO actually start from an entry that may already # exist in the status record superds_status[ut.Path(subds)] = dict( # shot from the hip, some status config # to trigger this specific super/sub # relation to be saved 
state='untracked', type='dataset') paths_by_ds[superds] = superds_status # TODO parallelize, whenever we have multiple subdataset of a single # dataset they can all be processed simultaneously # sort list of dataset to handle, starting with the ones deep down for pdspath in sorted(paths_by_ds, reverse=True): pds = Dataset(pdspath) # pop status for this dataset, we are not coming back to it pds_status = { # for handing over to the low-level code, we recode any # path relative to the real repo location, this avoid # cumbersome symlink handling without context in the # lower levels pds.repo.pathobj / p.relative_to(pdspath): props for p, props in iteritems(paths_by_ds.pop(pdspath))} start_commit = pds.repo.get_hexsha() if not all(p['state'] == 'clean' for p in pds_status.values()): for res in pds.repo.save_( message=message, # make sure to have the `path` arg be None, as we want # to prevent and bypass any additional repo.status() # calls paths=None, # prevent whining of GitRepo git=True if not hasattr(ds.repo, 'annexstatus') else to_git, # we are supplying the full status already, do not # detect anything else untracked='no', _status=pds_status): # TODO remove stringification when datalad-core can handle # path objects, or when PY3.6 is the lowest supported # version for k in ('path', 'refds'): if k in res: res[k] = text_type( # recode path back to dataset path anchor pds.pathobj / res[k].relative_to( pds.repo.pathobj) ) yield res # report on the dataset itself dsres = dict( action='save', type='dataset', path=pds.path, refds=ds.path, status='ok' if start_commit != pds.repo.get_hexsha() else 'notneeded', logger=lgr, ) if not version_tag: yield dsres continue try: pds.repo.tag(version_tag) dsres.update( status='ok', version_tag=version_tag) yield dsres except CommandError as e: if dsres['status'] == 'ok': # first we yield the result for the actual save yield dsres.copy() # and now complain that tagging didn't work dsres.update( status='error', message=('cannot tag this version: %s', e.stderr.strip())) yield dsres
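A hedged usage sketch for the Save interface above, assuming exposure as `datalad.api.save`; path, message, and tag are placeholders::

    import datalad.api as dl

    # save modifications to one file and tag the resulting dataset state;
    # with updated=True only already-tracked paths would be considered
    for res in dl.save(dataset='.', path=['code/analysis.py'],
                       message='Fix off-by-one in analysis',
                       version_tag='v0.2'):
        print(res['status'], res.get('path'))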
class Diff(Interface): """Report differences between two states of a dataset (hierarchy) The two to-be-compared states are given via the --from and --to options. These state identifiers are evaluated in the context of the (specified or detected) dataset. In the case of a recursive report on a dataset hierarchy, corresponding state pairs for any subdataset are determined from the subdataset record in the respective superdataset. Only changes recorded in a subdataset between these two states are reported, and so on. Any paths given as additional arguments will be used to constrain the difference report. As with Git's diff, it will not result in an error when a path is specified that does not exist on the filesystem. Reports are very similar to those of the `status` command, with the distinguished content types and states being identical. """ # make the custom renderer the default one, as the global default renderer # does not yield meaningful output for this command result_renderer = 'tailored' _params_ = dict( _common_diffstatus_params, path=Parameter( args=("path",), metavar="PATH", doc="""path to contrain the report to""", nargs="*", constraints=EnsureStr() | EnsureNone()), fr=Parameter( args=("-f", "--from",), dest='fr', metavar="REVISION", doc="""original state to compare to, as given by any identifier that Git understands.""", constraints=EnsureStr()), to=Parameter( args=("-t", "--to",), metavar="REVISION", doc="""state to compare against the original state, as given by any identifier that Git understands. If none is specified, the state of the working tree will be compared.""", constraints=EnsureStr() | EnsureNone()), ) _examples_ = [ dict(text="Show unsaved changes in a dataset", code_py="diff()", code_cmd="datalad diff"), dict(text="Compare a previous dataset state identified by shasum " "against current worktree", code_py="diff(fr='SHASUM')", code_cmd="datalad diff --from <SHASUM>"), dict(text="Compare two branches against each other", code_py="diff(fr='branch1', to='branch2')", code_cmd="datalad diff --from branch1 --to branch2"), dict(text="Show unsaved changes in the dataset and potential subdatasets", code_py="diff(recursive=True)", code_cmd="datalad diff -r"), dict(text="Show unsaved changes made to a particular file", code_py="diff(path='path/to/file')", code_cmd="datalad diff <path/to/file>"), ] @staticmethod @datasetmethod(name='diff') @eval_results def __call__( path=None, fr='HEAD', to=None, dataset=None, annex=None, untracked='normal', recursive=False, recursion_limit=None): yield from diff_dataset( dataset=dataset, fr=ensure_unicode(fr), to=ensure_unicode(to), constant_refs=False, path=path, annex=annex, untracked=untracked, recursive=recursive, recursion_limit=recursion_limit) @staticmethod def custom_result_renderer(res, **kwargs): # pragma: more cover Status.custom_result_renderer(res, **kwargs)
class Search(Interface): """Search dataset metadata DataLad can search metadata extracted from a dataset and/or aggregated into a superdataset (see the `aggregate-metadata` command). This makes it possible to discover datasets, or individual files in a dataset even when they are not available locally. Ultimately DataLad metadata are a graph of linked data structures. However, this command does not (yet) support queries that can exploit all information stored in the metadata. At the moment the following search modes are implemented that represent different trade-offs between the expressiveness of a query and the computational and storage resources required to execute a query. - egrep (default) - egrepcs [case-sensitive egrep] - textblob - autofield An alternative default mode can be configured by tuning the configuration variable 'datalad.search.default-mode':: [datalad "search"] default-mode = egrepcs Each search mode has its own default configuration for what kind of documents to query. The respective default can be changed via configuration variables:: [datalad "search"] index-<mode_name>-documenttype = (all|datasets|files) *Mode: egrep/egrepcs* These search modes are largely ignorant of the metadata structure, and simply perform matching of a search pattern against a flat string-representation of metadata. This is advantageous when the query is simple and the metadata structure is irrelevant, or precisely known. Moreover, it does not require a search index, hence results can be reported without an initial latency for building a search index when the underlying metadata has changed (e.g. due to a dataset update). By default, these search modes only consider datasets and do not investigate records for individual files for speed reasons. Search results are reported in the order in which they were discovered. Queries can make use of Python regular expression syntax (https://docs.python.org/3/library/re.html). In `egrep` mode, matching is case-insensitive when the query does not contain upper case characters, but is case-sensitive when it does. In `egrepcs` mode, matching is always case-sensitive. Expressions will match anywhere in a metadata string, not only at the start. When multiple queries are given, all queries have to match for a search hit (AND behavior). It is possible to search individual metadata key/value items by prefixing the query with a metadata key name, separated by a colon (':'). The key name can also be a regular expression to match multiple keys. A query match happens when any value of an item with a matching key name matches the query (OR behavior). See examples for more information. Examples: Query for (what happens to be) an author:: % datalad search haxby Queries are case-INsensitive when the query contains no upper case characters, and can be regular expressions. Use `egrepcs` mode when it is desired to perform a case-sensitive lowercase match:: % datalad search --mode egrepcs halchenko.*haxby This search mode performs NO analysis of the metadata content. Therefore queries can easily fail to match. For example, the above query implicitly assumes that authors are listed in alphabetical order. If that is the case (which may or may not be true), the following query would yield NO hits:: % datalad search Haxby.*Halchenko The ``textblob`` search mode represents an alternative that is more robust in such cases. For more complex queries multiple query expressions can be provided that all have to match to be considered a hit (AND behavior). 
This query discovers all files (non-default behavior) that match 'bids.type=T1w' AND 'nifti1.qform_code=scanner':: % datalad -c datalad.search.index-egrep-documenttype=all search bids.type:T1w nifti1.qform_code:scanner Key name selectors can also be expressions, which can be used to select multiple keys or construct "fuzzy" queries. In such cases a query matches when any item with a matching key matches the query (OR behavior). However, multiple queries are always evaluated using an AND conjunction. The following query extends the example above to match any files that have either 'nifti1.qform_code=scanner' or 'nifti1.sform_code=scanner':: % datalad -c datalad.search.index-egrep-documenttype=all search bids.type:T1w nifti1.(q|s)form_code:scanner *Mode: textblob* This search mode is very similar to the ``egrep`` mode, but with a few key differences. A search index is built from the string-representation of metadata records. By default, only datasets are included in this index, hence the indexing is usually completed within a few seconds, even for hundreds of datasets. This mode uses its own query language (not regular expressions) that is similar to other search engines. It supports logical conjunctions and fuzzy search terms. More information on this is available from the Whoosh project (search engine implementation): - Description of the Whoosh query language: http://whoosh.readthedocs.io/en/latest/querylang.html) - Description of a number of query language customizations that are enabled in DataLad, such as, fuzzy term matching: http://whoosh.readthedocs.io/en/latest/parsing.html#common-customizations Importantly, search hits are scored and reported in order of descending relevance, hence limiting the number of search results is more meaningful than in the 'egrep' mode and can also reduce the query duration. Examples: Search for (what happens to be) two authors, regardless of the order in which those names appear in the metadata:: % datalad search --mode textblob halchenko haxby Fuzzy search when you only have an approximate idea what you are looking for or how it is spelled:: % datalad search --mode textblob haxbi~ Very fuzzy search, when you are basically only confident about the first two characters and how it sounds approximately (or more precisely: allow for three edits and require matching of the first two characters):: % datalad search --mode textblob haksbi~3/2 Combine fuzzy search with logical constructs:: % datalad search --mode textblob 'haxbi~ AND (hanke OR halchenko)' *Mode: autofield* This mode is similar to the 'textblob' mode, but builds a vastly more detailed search index that represents individual metadata variables as individual fields. By default, this search index includes records for datasets and individual fields, hence it can grow very quickly into a huge structure that can easily take an hour or more to build and require more than a GB of storage. However, limiting it to documents on datasets (see above) retains the enhanced expressiveness of queries while dramatically reducing the resource demands. 
Examples: List names of search index fields (auto-discovered from the set of indexed datasets):: % datalad search --mode autofield --show-keys name Fuzzy search for datasets with an author that is specified in a particular metadata field:: % datalad search --mode autofield bids.author:haxbi~ type:dataset Search for individual files that carry a particular description prefix in their 'nifti1' metadata:: % datalad search --mode autofield nifti1.description:FSL* type:file *Reporting* Search hits are returned as standard DataLad results. On the command line the '--output-format' (or '-f') option can be used to tweak results for further processing. Examples: Format search hits as a JSON stream (one hit per line):: % datalad -f json search haxby Custom formatting: which terms matched the query of particular results. Useful for investigating fuzzy search results:: $ datalad -f '{path}: {query_matched}' search --mode autofield bids.author:haxbi~ """ _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to perform the query operation on. If no dataset is given, an attempt is made to identify the dataset based on the current working directory and/or the `path` given""", constraints=EnsureDataset() | EnsureNone()), query=Parameter( args=("query",), metavar='QUERY', nargs="*", doc="""query string, supported syntax and features depends on the selected search mode (see documentation)"""), force_reindex=Parameter( args=("--reindex",), dest='force_reindex', action='store_true', doc="""force rebuilding the search index, even if no change in the dataset's state has been detected, for example, when the index documenttype configuration has changed."""), max_nresults=Parameter( args=("--max-nresults",), doc="""maxmimum number of search results to report. Setting this to 0 will report all search matches. Depending on the mode this can search substantially slower. If not specified, a mode-specific default setting will be used.""", constraints=EnsureInt() | EnsureNone()), mode=Parameter( args=("--mode",), choices=('egrep', 'textblob', 'autofield'), doc="""Mode of search index structure and content. See section SEARCH MODES for details."""), full_record=Parameter( args=("--full-record", '-f'), action='store_true', doc="""If set, return the full metadata record for each search hit. Depending on the search mode this might require additional queries. By default, only data that is available to the respective search modes is returned. This always includes essential information, such as the path and the type."""), show_keys=Parameter( args=('--show-keys',), choices=('name', 'short', 'full'), default=None, doc="""if given, a list of known search keys is shown. If 'name' - only the name is printed one per line. If 'short' or 'full', statistics (in how many datasets, and how many unique values) are printed. 'short' truncates the listing of unique values. No other action is performed (except for reindexing), even if other arguments are given. Each key is accompanied by a term definition in parenthesis (TODO). In most cases a definition is given in the form of a URL. If an ontology definition for a term is known, this URL can resolve to a webpage that provides a comprehensive definition of the term. 
However, for speed reasons term resolution is solely done on information contained in a local dataset's metadata, and definition URLs might be outdated or point to no longer existing resources."""), show_query=Parameter( args=('--show-query',), action='store_true', doc="""if given, the formal query that was generated from the given query string is shown, but not actually executed. This is mostly useful for debugging purposes."""), ) @staticmethod @datasetmethod(name='search') @eval_results def __call__(query=None, dataset=None, force_reindex=False, max_nresults=None, mode=None, full_record=False, show_keys=None, show_query=False): try: ds = require_dataset(dataset, check_installed=True, purpose='dataset search') if ds.id is None: raise NoDatasetArgumentFound( "This does not seem to be a dataset (no DataLad dataset ID " "found). 'datalad create --force %s' can initialize " "this repository as a DataLad dataset" % ds.path) except NoDatasetArgumentFound: for r in _search_from_virgin_install(dataset, query): yield r return if mode is None: # let's get inspired by what the dataset/user think is # default mode = ds.config.obtain('datalad.search.default-mode') if mode == 'egrep': searcher = _EGrepSearch elif mode == 'egrepcs': searcher = _EGrepCSSearch elif mode == 'textblob': searcher = _BlobSearch elif mode == 'autofield': searcher = _AutofieldSearch else: raise ValueError( 'unknown search mode "{}"'.format(mode)) searcher = searcher(ds, force_reindex=force_reindex) if show_keys: searcher.show_keys(show_keys) return if not query: return if show_query: print(repr(searcher.get_query(query))) return nhits = 0 for r in searcher( query, max_nresults=max_nresults, full_record=full_record): nhits += 1 yield r if not nhits: lgr.info(searcher.get_nohits_msg() or 'no hits')
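The extensive command-line examples in the docstring above translate directly to the Python API; a minimal sketch, assuming exposure as `datalad.api.search`::

    import datalad.api as dl

    # default egrep mode, dataset-level metadata only
    hits = dl.search(query=['haxby'])

    # key-constrained queries: all expressions must match (AND), while any
    # value under a key matching the selector may satisfy each one (OR)
    hits = dl.search(query=['bids.type:T1w', 'nifti1.qform_code:scanner'],
                     mode='egrep')

    # fuzzy textblob query, limited to the ten most relevant hits
    hits = dl.search(query=['haxbi~'], mode='textblob', max_nresults=10)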
class Export(Interface): """Export a dataset to another representation """ _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to export. If no dataset is given, an attempt is made to identify the dataset based on the current working directory.""", constraints=EnsureDataset() | EnsureNone()), astype=Parameter( args=("astype", ), choices=_get_exporter_names(), doc="""label of the type or format the dataset shall be exported to."""), output=Parameter( args=('-o', '--output'), doc="""output destination specification to be passes to the exporter. The particular semantics of the option value depend on the actual exporter. Typically, this will be a file name or a path to a directory."""), getcmdhelp=Parameter( args=('--help-type', ), dest='getcmdhelp', action='store_true', doc="""show help for a specific export type/format"""), ) @staticmethod @datasetmethod(name='export') def __call__(astype, dataset, getcmdhelp=False, output=None, **kwargs): # get a handle on the relevant plugin module import datalad.export as export_mod try: exmod = import_module('.%s' % (astype, ), package=export_mod.__package__) except ImportError as e: raise ValueError("cannot load exporter '{}': {}".format( astype, exc_str(e))) if getcmdhelp: # no result, but return the module to make the renderer do the rest return (exmod, None) ds = require_dataset(dataset, check_installed=True, purpose='exporting') # call the plugin, either with the argv array from the cmdline call # or directly with the kwargs if 'datalad_unparsed_args' in kwargs: result = exmod._datalad_export_plugin_call( ds, argv=kwargs['datalad_unparsed_args'], output=output) else: result = exmod._datalad_export_plugin_call(ds, output=output, **kwargs) return (exmod, result) @staticmethod def result_renderer_cmdline(res, args): exmod, result = res if args.getcmdhelp: # the function that prints the help was returned as result if not hasattr(exmod, '_datalad_get_cmdline_help'): lgr.error( "export plugin '{}' does not provide help".format(exmod)) return replacement = [] help = exmod._datalad_get_cmdline_help() if isinstance(help, tuple): help, replacement = help if replacement: for in_s, out_s in replacement: help = help.replace( in_s, out_s + ' ' * max(0, len(in_s) - len(out_s))) print(help) return
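A hedged usage sketch for the Export interface above. The exporter label must be one of the names returned by `_get_exporter_names()`; 'tarball' and the output path below are purely hypothetical examples::

    import datalad.api as dl

    # export the dataset in the current directory via a (hypothetical)
    # 'tarball' exporter, writing the result to the given output path
    dl.export('tarball', dataset='.', output='/tmp/mydataset.tar.gz')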
class RewriteURLs(Interface): """Rewrite the URLs of sub-datasets of a dataset """ _params_ = dict( url=Parameter( args=("url", ), doc="a template for building the URLs of the subdatasets. " "List of currently available placeholders:\n" "%%NAME\tthe name of the subdataset, where slashes are replaced by " "dashes", constraints=EnsureStr()), dataset=Parameter(args=( "-d", "--dataset", ), doc="""specify the dataset to update. If no dataset is given, an attempt is made to identify the dataset based on the current working directory""", constraints=EnsureDataset() | EnsureNone()), recursive=Parameter( args=("-r", "--recursive"), action="store_true", doc="recursively modify all subdataset URLs of `dataset` "), ) # TODO: User interaction. Allow for skipping and editing on a per # subdataset basis. Therefore some --mode option (see below). Additionally, # this leads to URL being optional, so no URL given means to # edit per subdataset # mode=Parameter( # args=("--mode",), # doc="", # constraints=EnsureChoice(["all", "ask"]),) @staticmethod @datasetmethod(name='rewrite_urls') def __call__(url, dataset=None, recursive=False): # shortcut ds = require_dataset(dataset, check_installed=True, purpose='modifying subdataset URLs') assert (ds.repo is not None) repos_to_update = [ds.repo] if recursive: repos_to_update += [ GitRepo(opj(ds.path, sub_path)) for sub_path in ds.get_subdatasets(recursive=True) ] for dataset_repo in repos_to_update: parser = get_module_parser(dataset_repo) for submodule_section in parser.sections(): submodule_name = submodule_section[11:-1] parser.set_value( submodule_section, "url", url.replace("%NAME", submodule_name.replace("/", "-"))) return # TODO: return value?
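A short usage sketch for RewriteURLs, assuming exposure as `datalad.api.rewrite_urls`; the mirror URL is a placeholder and uses the documented %NAME substitution::

    import datalad.api as dl

    # point every subdataset URL at a (hypothetical) mirror; '%NAME' is
    # replaced with the subdataset name, with slashes turned into dashes
    dl.rewrite_urls('https://example.com/mirror/%NAME.git',
                    dataset='.', recursive=True)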
class Install(Interface): """Install a dataset from a (remote) source. This command creates a local :term:`sibling` of an existing dataset from a (remote) location identified via a URL or path. Optional recursion into potential subdatasets, and download of all referenced data is supported. The new dataset can be optionally registered in an existing :term:`superdataset` by identifying it via the `dataset` argument (the new dataset's path needs to be located within the superdataset for that). It is recommended to provide a brief description to label the dataset's nature *and* location, e.g. "Michael's music on black laptop". This helps humans to identify data locations in distributed scenarios. By default an identifier comprised of user and machine name, plus path will be generated. When only partial dataset content shall be obtained, it is recommended to use this command without the `get-data` flag, followed by a :func:`~datalad.api.get` operation to obtain the desired data. .. note:: Power-user info: This command uses :command:`git clone`, and :command:`git annex init` to prepare the dataset. Registering to a superdataset is performed via a :command:`git submodule add` operation in the discovered superdataset. """ # very frequently this command will yield exactly one installed dataset # spare people the pain of going through a list by default return_type = 'item-or-list' # as discussed in #1409 and #1470, we want to return dataset instances # matching what is actually available after command completion (and # None for any failed dataset installation) # TODO actually need success(containing)dataset-or-none result_xfm = 'successdatasets-or-none' # we also want to limit the returned result to explicit input arguments # (paths/source) and not report any implicit action, like intermediate # datasets result_filter = is_result_matching_pathsource_argument _examples_ = [ dict(text="Install a dataset from Github into the current directory", code_py="install(" "source='https://github.com/datalad-datasets/longnow" "-podcasts.git')", code_cmd="datalad install " "https://github.com/datalad-datasets/longnow-podcasts.git"), dict(text="Install a dataset as a subdataset into the current dataset", code_py="""\ install(dataset='.', source='https://github.com/datalad-datasets/longnow-podcasts.git')""", code_cmd="""\ datalad install -d . \\ --source='https://github.com/datalad-datasets/longnow-podcasts.git'""" ), dict(text="Install a dataset, and get all content right away", code_py="""\ install(source='https://github.com/datalad-datasets/longnow-podcasts.git', get_data=True)""", code_cmd="""\ datalad install --get-data \\ -s https://github.com/datalad-datasets/longnow-podcasts.git"""), dict(text="Install a dataset with all its subdatasets", code_py="""\ install(source='https://github.com/datalad-datasets/longnow-podcasts.git', recursive=True)""", code_cmd="""\ datalad install -r \\ https://github.com/datalad-datasets/longnow-podcasts.git"""), ] _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), # TODO: this probably changes to install into the dataset (add_to_super) # and to install the thing 'just there' without operating 'on' a dataset. # Adapt doc. # MIH: `shouldn't this be the job of `add`? doc="""specify the dataset to perform the install operation on. 
If no dataset is given, an attempt is made to identify the dataset in a parent directory of the current working directory and/or the `path` given""", constraints=EnsureDataset() | EnsureNone()), path=Parameter( args=("path", ), metavar='PATH', nargs="*", # doc: TODO doc="""path/name of the installation target. If no `path` is provided a destination path will be derived from a source URL similar to :command:`git clone`"""), source=Parameter(args=("-s", "--source"), metavar='SOURCE', doc="URL or local path of the installation source", constraints=EnsureStr() | EnsureNone()), get_data=Parameter(args=( "-g", "--get-data", ), doc="""if given, obtain all data content too""", action="store_true"), description=location_description, recursive=recursion_flag, recursion_limit=recursion_limit, reckless=reckless_opt, jobs=jobs_opt, ) @staticmethod @datasetmethod(name='install') @eval_results def __call__(path=None, source=None, dataset=None, get_data=False, description=None, recursive=False, recursion_limit=None, reckless=None, jobs="auto"): # normalize path argument to be equal when called from cmdline and # python and nothing was passed into `path` path = ensure_list(path) if not source and not path: raise InsufficientArgumentsError( "Please provide at least a source or a path") # Common kwargs to pass to underlying git/install calls. # They might need adjustments (e.g. for recursion_limit, but # otherwise would be applicable throughout # # There should have been more of common options! # since underneath get could do similar installs common_kwargs = dict( get_data=get_data, recursive=recursive, recursion_limit=recursion_limit, # git_opts=git_opts, # annex_opts=annex_opts, reckless=reckless, jobs=jobs, ) # did we explicitly get a dataset to install into? # if we got a dataset, path will be resolved against it. # Otherwise path will be resolved first. ds = None if dataset is not None: ds = require_dataset(dataset, check_installed=True, purpose='installation') common_kwargs['dataset'] = dataset # pre-compute for results below refds_path = Interface.get_refds_path(ds) # switch into the two scenarios without --source: # 1. list of URLs # 2. list of (sub)dataset content if source is None: # we need to collect URLs and paths to_install = [] to_get = [] # TODO: this approach is problematic, it disrupts the order of input args. # consequently results will be returned in an unexpected order when a # mixture of source URL and paths is given. Reordering is only possible when # everything in here is fully processed before any results can be yielded. # moreover, I think the semantics of the status quo implementation are a # bit complicated: in a mixture list a source URL will lead to a new dataset # at a generated default location, but a path will lead to a subdataset # at that exact location for urlpath in path: ri = RI(urlpath) (to_get if isinstance(ri, PathRI) else to_install).append(urlpath) # 1. multiple source URLs for s in to_install: lgr.debug("Install passes into install source=%s", s) for r in Install.__call__( source=s, description=description, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, result_filter=None, **common_kwargs): # no post-processing of the installed content on disk # should be necessary here, all done by code further # down that deals with an install from an actuall `source` # any necessary fixes should go there too! 
r['refds'] = refds_path yield r # 2. one or more dataset content paths if to_get: lgr.debug("Install passes into get %d items", len(to_get)) # all commented out hint on inability to pass those options # into underlying install-related calls. # Also need to pass from get: # annex_get_opts for r in Get.__call__( to_get, # TODO should pass-through description, not sure why disabled # description=description, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, result_filter=None, **common_kwargs): # no post-processing of get'ed content on disk should be # necessary here, this is the responsibility of `get` # (incl. adjusting parent's gitmodules when submodules end # up in an "updated" state (done in get helpers) # any required fixes should go there! r['refds'] = refds_path yield r # we are done here # the rest is about install from a `source` return # an actual `source` was given if source and path and len(path) > 1: # exception is ok here, if this fails it is either direct user error # or we f****d up one of our internal calls raise ValueError( "install needs a single PATH when source is provided. " "Was given mutliple PATHs: %s" % str(path)) # parameter constraints: if not source: # exception is ok here, if this fails it is either direct user error # or we f****d up one of our internal calls raise InsufficientArgumentsError( "a `source` is required for installation") # code below deals with a single path only path = path[0] if path else None if source == path: # even if they turn out to be identical after resolving symlinks # and more sophisticated witchcraft, it would still happily say # "it appears to be already installed", so we just catch an # obviously pointless input combination yield get_status_dict( 'install', path=path, status='impossible', logger=lgr, source_url=source, refds=refds_path, message= "installation `source` and destination `path` are identical. " "If you are trying to add a subdataset simply use the `save` command" ) return # resolve the target location (if local) against the provided dataset # or CWD: if path is not None: # MIH everything in here is highly similar to what common # interface helpers do (or should/could do), but at the same # is very much tailored to just apply to `install` -- I guess # it has to stay special # Should work out just fine for regular paths, so no additional # conditioning is necessary try: path_ri = RI(path) except Exception as e: raise ValueError("invalid path argument {}: ({})".format( path, exc_str(e))) try: # Wouldn't work for SSHRI ATM, see TODO within SSHRI # yoh: path should be a local path, and mapping note within # SSHRI about mapping localhost:path to path is kinda # a peculiar use-case IMHO # TODO Stringification can be removed once PY35 is no longer # supported path = str(resolve_path(path_ri.localpath, dataset)) # any `path` argument that point to something local now # resolved and is no longer a URL except ValueError: # `path` is neither a valid source nor a local path. # TODO: The only thing left is a known subdataset with a # name, that is not a path; Once we correctly distinguish # between path and name of a submodule, we need to consider # this. # For now: Just raise raise ValueError("Invalid path argument {0}".format(path)) # `path` resolved, if there was any. 
# clone dataset, will also take care of adding to superdataset, if one # is given res = Clone.__call__( source, path, dataset=ds, description=description, reckless=reckless, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" result_xfm=None, return_type='generator', result_filter=None, on_failure='ignore') # helper as_ds = YieldDatasets() destination_dataset = None for r in res: if r['action'] == 'install' and r['type'] == 'dataset': # make sure logic below is valid, only one dataset result is # coming back assert (destination_dataset is None) destination_dataset = as_ds(r) r['refds'] = refds_path yield r assert (destination_dataset) # Now, recursive calls: if recursive or get_data: # dataset argument must not be passed inside since we use bound .get # It is ok to do "inplace" as long as we still return right # after the loop ends common_kwargs.pop('dataset', '') for r in destination_dataset.get( curdir, description=description, # we need to disable error handling in order to have it done at # the very top, otherwise we are not able to order a global # "ignore-and-keep-going" on_failure='ignore', return_type='generator', result_xfm=None, **common_kwargs): r['refds'] = refds_path yield r # at this point no futher post-processing should be necessary, # `clone` and `get` must have done that (incl. parent handling) # if not, bugs should be fixed in those commands return
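# Hedged usage sketch (not part of the original module), mirroring the entries
# in the command's own `_examples_` list above; it assumes the standard
# `datalad.api` entry point is available.
def _example_install():
    from datalad.api import install
    # clone a public dataset, fetch all annexed content, and recurse into
    # any subdatasets it may define
    ds = install(
        source='https://github.com/datalad-datasets/longnow-podcasts.git',
        get_data=True, recursive=True)
    return ds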
class Siblings(Interface): """Manage sibling configuration This command offers four different actions: 'query', 'add', 'remove', 'configure', 'enable'. 'query' is the default action and can be used to obtain information about (all) known siblings. 'add' and 'configure' are highly similar actions, the only difference being that adding a sibling with a name that is already registered will fail, whereas re-configuring a (different) sibling under a known name will not be considered an error. 'enable' can be used to complete access configuration for non-Git sibling (aka git-annex special remotes). Lastly, the 'remove' action allows for the removal (or de-configuration) of a registered sibling. For each sibling (added, configured, or queried) all known sibling properties are reported. This includes: "name" Name of the sibling "path" Absolute path of the dataset "url" For regular siblings at minimum a "fetch" URL, possibly also a "pushurl" Additionally, any further configuration will also be reported using a key that matches that in the Git configuration. By default, sibling information is rendered as one line per sibling following this scheme:: <dataset_path>: <sibling_name>(<+|->) [<access_specification] where the `+` and `-` labels indicate the presence or absence of a remote data annex at a particular remote, and `access_specification` contains either a URL and/or a type label for the sibling. """ # make the custom renderer the default, path reporting isn't the top # priority here result_renderer = 'tailored' _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to configure. If no dataset is given, an attempt is made to identify the dataset based on the input and/or the current working directory""", constraints=EnsureDataset() | EnsureNone()), name=Parameter( args=( '-s', '--name', ), metavar='NAME', doc="""name of the sibling. For addition with path "URLs" and sibling removal this option is mandatory, otherwise the hostname part of a given URL is used as a default. This option can be used to limit 'query' to a specific sibling.""", constraints=EnsureStr() | EnsureNone()), action=Parameter( args=('action', ), nargs='?', metavar='ACTION', doc="""command action selection (see general documentation)""", constraints=EnsureChoice('query', 'add', 'remove', 'configure', 'enable') | EnsureNone()), url=Parameter(args=('--url', ), doc="""the URL of or path to the dataset sibling named by `name`. 
For recursive operation it is required that a template string for building subdataset sibling URLs is given.\n List of currently available placeholders:\n %%NAME\tthe name of the dataset, where slashes are replaced by dashes.""", constraints=EnsureStr() | EnsureNone(), nargs="?"), pushurl=Parameter( args=('--pushurl', ), doc="""in case the `url` cannot be used to publish to the dataset sibling, this option specifies a URL to be used instead.\nIf no `url` is given, `pushurl` serves as `url` as well.""", constraints=EnsureStr() | EnsureNone()), description=location_description, ## info options # --template/cfgfrom gh-1462 (maybe also for a one-time inherit) # --wanted gh-925 (also see below for add_sibling approach) fetch=Parameter(args=("--fetch", ), action="store_true", doc="""fetch the sibling after configuration"""), as_common_datasrc=as_common_datasrc, publish_depends=publish_depends, publish_by_default=publish_by_default, annex_wanted=annex_wanted_opt, annex_required=annex_required_opt, annex_group=annex_group_opt, annex_groupwanted=annex_groupwanted_opt, inherit=inherit_opt, get_annex_info=Parameter( args=("--no-annex-info", ), dest='get_annex_info', action="store_false", doc= """Whether to query all information about the annex configurations of siblings. Can be disabled if speed is a concern"""), recursive=recursion_flag, recursion_limit=recursion_limit) @staticmethod @datasetmethod(name='siblings') @eval_results def __call__( action='query', dataset=None, name=None, url=None, pushurl=None, description=None, # TODO consider true, for now like add_sibling fetch=False, as_common_datasrc=None, publish_depends=None, publish_by_default=None, annex_wanted=None, annex_required=None, annex_group=None, annex_groupwanted=None, inherit=False, get_annex_info=True, recursive=False, recursion_limit=None): # TODO: Detect malformed URL and fail? # XXX possibly fail if fetch is False and as_common_datasrc if annex_groupwanted and not annex_group: raise InsufficientArgumentsError( "To set groupwanted, you need to provide annex_group option") # TODO catch invalid action specified action_worker_map = { 'query': _query_remotes, 'add': _add_remote, 'configure': _configure_remote, 'remove': _remove_remote, 'enable': _enable_remote, } # all worker strictly operate on a single dataset # anything that deals with hierarchies and/or dataset # relationships in general should be dealt with in here # at the top-level and vice versa worker = action_worker_map[action] dataset = require_dataset(dataset, check_installed=False, purpose='sibling configuration') refds_path = dataset.path res_kwargs = dict(refds=refds_path, logger=lgr) ds_name = op.basename(dataset.path) # do not form single list of datasets (with recursion results) to # give fastest possible response, for the precise of a long-all # function call ds = dataset for r in worker( # always copy signature to below to avoid bugs! ds, name, ds.repo.get_remotes(), # for top-level dataset there is no layout questions _mangle_urls(url, ds_name), _mangle_urls(pushurl, ds_name), fetch, description, as_common_datasrc, publish_depends, publish_by_default, annex_wanted, annex_required, annex_group, annex_groupwanted, inherit, get_annex_info, **res_kwargs): yield r if not recursive: return # do we have instructions to register siblings with some alternative # layout? 
replicate_local_structure = url and "%NAME" not in url subds_pushurl = None for subds in dataset.subdatasets(fulfilled=True, recursive=recursive, recursion_limit=recursion_limit, result_xfm='datasets'): subds_name = op.relpath(subds.path, start=dataset.path) if replicate_local_structure: subds_url = slash_join(url, subds_name) if pushurl: subds_pushurl = slash_join(pushurl, subds_name) else: subds_url = \ _mangle_urls(url, '/'.join([ds_name, subds_name])) subds_pushurl = \ _mangle_urls(pushurl, '/'.join([ds_name, subds_name])) for r in worker( # always copy signature from above to avoid bugs subds, name, subds.repo.get_remotes(), subds_url, subds_pushurl, fetch, description, as_common_datasrc, publish_depends, publish_by_default, annex_wanted, annex_required, annex_group, annex_groupwanted, inherit, get_annex_info, **res_kwargs): yield r @staticmethod def custom_result_renderer(res, **kwargs): from datalad.ui import ui # should we attempt to remove an unknown sibling, complain like Git does if res['status'] == 'notneeded' and res['action'] == 'remove-sibling': ui.message('{warn}: No sibling "{name}" in dataset {path}'.format( warn=ac.color_word('Warning', ac.LOG_LEVEL_COLORS['WARNING']), **res)) return if res['status'] != 'ok' or not res.get('action', '').endswith('-sibling'): # logging complained about this already return path = op.relpath(res['path'], res['refds']) if res.get( 'refds', None) else res['path'] got_url = 'url' in res spec = '{}{}{}{}'.format(res.get('url', ''), ' (' if got_url else '', res.get('annex-externaltype', 'git'), ')' if got_url else '') ui.message('{path}: {name}({with_annex}) [{spec}]'.format( **dict( res, path=path, # TODO report '+' for special remotes with_annex='+' if 'annex-uuid' in res \ else ('-' if res.get('annex-ignore', None) else '?'), spec=spec)))
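# Hedged usage sketch (not part of the original module): querying and adding
# siblings via the bound dataset method defined above. The dataset path,
# sibling name, and URLs are hypothetical.
def _example_siblings():
    from datalad.api import Dataset
    ds = Dataset('/tmp/some-dataset')  # hypothetical dataset location
    # the default action is 'query': report all known siblings
    for sibling in ds.siblings():
        print(sibling['name'], sibling.get('url', ''))
    # register a new sibling with separate fetch and push URLs
    ds.siblings('add', name='public',
                url='https://example.org/ds.git',
                pushurl='ssh://example.org/srv/ds.git')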
class Save(Interface): """Save the current state of a dataset Saving the state of a dataset records all changes that have been made to it. This change record is annotated with a user-provided description. Optionally, an additional tag, such as a version, can be assigned to the saved state. Such a tag enables straightforward retrieval of past versions at a later point in time. || PYTHON >> Returns ------- commit or None `None` if nothing was saved, the resulting commit otherwise. << PYTHON || """ _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to save. If a dataset is given, but no `files`, the entire dataset will be saved.""", constraints=EnsureDataset() | EnsureNone()), files=Parameter( args=("files", ), metavar='FILES', doc="""list of files to consider. If given, only changes made to those files are recorded in the new state.""", nargs='*', constraints=EnsureStr() | EnsureNone()), message=save_message_opt, all_changes=Parameter( args=("-a", "--all-changes"), doc= """save all changes (even to not yet added files) of all components in datasets that contain any of the given paths [DEPRECATED!].""", action="store_true"), all_updated=Parameter( args=("-u", "--all-updated"), doc="""if no explicit paths are given, save changes of all known components in a dataset""", action="store_true"), version_tag=Parameter(args=("--version-tag", ), metavar='ID', doc="""an additional marker for that state.""", constraints=EnsureStr() | EnsureNone()), super_datasets=super_datasets_flag, recursive=recursion_flag, recursion_limit=recursion_limit, ) @staticmethod @datasetmethod(name='save') @eval_results # TODO files -> path def __call__(message=None, files=None, dataset=None, all_updated=True, all_changes=None, version_tag=None, recursive=False, recursion_limit=None, super_datasets=False): if all_changes is not None: from datalad.support.exceptions import DeprecatedError raise DeprecatedError( new="all_updated option where fits and/or datalad add", version="0.5.0", msg="RF: all_changes option passed to the save") if not dataset and not files: # we got nothing at all -> save what is staged in the repo in "this" directory?
# we verify that there is an actual repo next dataset = abspath(curdir) refds_path = Interface.get_refds_path(dataset) to_process = [] for ap in AnnotatePaths.__call__( path=files, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='save', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): # next check should not be done during annotation, as it is possibly expensive # and not generally useful if ap.get('status', None) == 'impossible' and \ ap.get('state', None) == 'absent' and \ ap.get('parentds', None): # this is not here anymore, but it might actually have been a deleted # component if relpath(ap['path'], start=ap['parentds']) \ in Dataset(ap['parentds']).repo.get_deleted_files(): # ok, this is a staged deletion that we want to save ap['status'] = '' del ap['message'] if ap.get('status', None): # this is done yield ap continue # for things like: `ds.save()` # or recursively discovered datasets if ap['path'] == refds_path or \ (ap.get('type', None) == 'dataset' and not ap.get('raw_input', False) and not ap.get('state', None) == 'absent'): ap['process_content'] = True ap['process_updated_only'] = all_updated to_process.append(ap) if not to_process: # nothing left to do, potentially all errored before return if super_datasets: # search for the topmost superdatasets of any path dss = [ Dataset(ap.get('parentds', ap['path'])) for ap in to_process ] superdss = [ds.get_superdataset(topmost=True) for ds in dss] superdss = get_tree_roots( unique(ds.path for ds in dss + superdss if ds)) if dataset: # need to adjust the reference to the new superds # if we had one ref before, we should still have exactly one assert len(superdss) <= 1 dataset = list(superdss.keys())[0] refds_path = dataset elif refds_path: # there is a single superdataset superdss = { refds_path: unique( [ap['parentds'] for ap in to_process if 'parentds' in ap]) } else: # sort all datasets under their potential superdatasets # start from the top to get all subdatasets down the line # and collate them into as few superdatasets as possible # this is quick, just string operations superdss = get_tree_roots( unique( [ap['parentds'] for ap in to_process if 'parentds' in ap])) # for each "superdataset" check the tree of subdatasets and make sure # we gather all datasets between the super and any subdataset # so we can save them all bottom-up in order to be able to properly # save the superdataset # if this is called from e.g. `add` this is actually not necessary, # but in the general case we cannot avoid it # TODO maybe introduce a switch? 
discovered = {} for superds_path in superdss: target_subs = superdss[superds_path] discover_dataset_trace_to_targets( # from here superds_path, # to all target_subs, [], discovered) # create a new minimally annotated path for each discovered dataset discovered_added = set() for parentds in discovered: for subds in discovered[parentds]: to_process.append( dict(path=subds, parentds=parentds, type='dataset')) discovered_added.add(subds) # make sure we have an entry for each dataset, including those # tha are just parents for parentds in discovered: if parentds not in discovered_added: to_process.append( dict( path=parentds, type='dataset', # make sure we save content of superds later on process_content=True)) # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, deduplication happens here too annotated_paths = AnnotatePaths.__call__( path=to_process, dataset=dataset, # never recursion, done already recursive=False, action='save', unavailable_path_status='', nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') # now sort into datasets so we can process them one by one content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path, path_only=False) assert (not completed) # iterate over all datasets, starting at the bottom for dspath in sorted(content_by_ds.keys(), reverse=True): ds = Dataset(dspath) res = get_status_dict('save', ds=ds, logger=lgr) if not ds.is_installed(): # TODO This is likely impossible now res['status'] = 'impossible' res['message'] = ('dataset %s is not installed', ds) yield res continue saved_state = save_dataset(ds, content_by_ds[dspath], message=message, version_tag=version_tag) if saved_state: res['status'] = 'ok' else: res['status'] = 'notneeded' yield res @staticmethod def custom_result_renderer(res, **kwargs): from datalad.ui import ui if not res or res.get('type', None) != 'dataset' or 'path' not in res: return ds = Dataset(res['path']) commit = ds.repo.get_hexsha() ui.message('Saved state: {0} for {1}'.format(commit, ds))
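# Hedged usage sketch (not part of the original module): recording a dataset's
# state with the `save` command above. The dataset path, file names, and tag
# are hypothetical.
def _example_save():
    from datalad.api import Dataset
    ds = Dataset('/tmp/some-dataset')  # hypothetical dataset location
    # record modifications to two specific files only
    ds.save(message="update analysis scripts",
            files=['code/analysis.py', 'code/plotting.py'])
    # record all modifications known to the dataset and tag the saved state
    ds.save(message="release", all_updated=True, version_tag='v1.0')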
class ExportArchive(Interface): """Export the content of a dataset as a TAR/ZIP archive. """ from datalad.support.param import Parameter from datalad.distribution.dataset import datasetmethod from datalad.interface.utils import eval_results from datalad.distribution.dataset import EnsureDataset from datalad.support.constraints import ( EnsureChoice, EnsureNone, EnsureStr, ) _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc=""""specify the dataset to export. If no dataset is given, an attempt is made to identify the dataset based on the current working directory.""", constraints=EnsureDataset() | EnsureNone()), filename=Parameter( args=("filename", ), metavar="PATH", nargs='?', doc="""File name of the generated TAR archive. If no file name is given the archive will be generated in the current directory and will be named: datalad_<dataset_uuid>.(tar.*|zip). To generate that file in a different directory, provide an existing directory as the file name.""", constraints=EnsureStr() | EnsureNone()), archivetype=Parameter(args=("-t", "--archivetype"), doc="""Type of archive to generate.""", constraints=EnsureChoice("tar", "zip")), compression=Parameter( args=("-c", "--compression"), doc="""Compression method to use. 'bz2' is not supported for ZIP archives. No compression is used when an empty string is given.""", constraints=EnsureChoice("gz", "bz2", "")), missing_content=Parameter( args=("--missing-content", ), doc="""By default, any discovered file with missing content will result in an error and the export is aborted. Setting this to 'continue' will issue warnings instead of failing on error. The value 'ignore' will only inform about problem at the 'debug' log level. The latter two can be helpful when generating a TAR archive from a dataset where some file content is not available locally.""", constraints=EnsureChoice("error", "continue", "ignore")), ) @staticmethod @datasetmethod(name='export_archive') @eval_results def __call__(dataset, filename=None, archivetype='tar', compression='gz', missing_content='error'): import os import tarfile import zipfile from unittest.mock import patch from os.path import join as opj, dirname, normpath, isabs import os.path as op from datalad.distribution.dataset import require_dataset from datalad.utils import file_basename from datalad.support.annexrepo import AnnexRepo import logging lgr = logging.getLogger('datalad.local.export_archive') dataset = require_dataset(dataset, check_installed=True, purpose='export archive') repo = dataset.repo committed_date = repo.get_commit_date() # could be used later on to filter files by some criterion def _filter_tarinfo(ti): # Reset the date to match the one of the last commit, not from the # filesystem since git doesn't track those at all # TODO: use the date of the last commit when any particular # file was changed -- would be the most kosher yoh thinks to the # degree of our abilities ti.mtime = committed_date return ti tar_args = dict(recursive=False, filter=_filter_tarinfo) file_extension = '.{}{}'.format( archivetype, '{}{}'.format('.' 
if compression else '', compression) if archivetype == 'tar' else '') default_filename = "datalad_{.id}".format(dataset) if filename is None: filename = default_filename # in current directory elif op.exists(filename) and op.isdir(filename): filename = opj(filename, default_filename) # under given directory if not filename.endswith(file_extension): filename += file_extension root = dataset.path # use dir inside matching the output filename # TODO: could be an option to the export plugin allowing empty value # for no leading dir leading_dir = file_basename(filename) # workaround for inability to pass down the time stamp with patch('time.time', return_value=committed_date), \ tarfile.open(filename, "w:{}".format(compression)) \ if archivetype == 'tar' \ else zipfile.ZipFile( filename, 'w', zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \ as archive: add_method = archive.add if archivetype == 'tar' else archive.write repo_files = sorted(repo.get_indexed_files()) if isinstance(repo, AnnexRepo): annexed = repo.is_under_annex(repo_files, allow_quick=True, batch=True) # remember: returns False for files in Git! has_content = repo.file_has_content(repo_files, allow_quick=True, batch=True) else: annexed = [False] * len(repo_files) has_content = [True] * len(repo_files) for i, rpath in enumerate(repo_files): fpath = opj(root, rpath) if annexed[i]: if not has_content[i]: if missing_content in ('ignore', 'continue'): (lgr.warning if missing_content == 'continue' else lgr.debug)( 'File %s has no content available, skipped', fpath) continue else: raise IOError('File %s has no content available' % fpath) # resolve to possible link target if op.islink(fpath): link_target = os.readlink(fpath) if not isabs(link_target): link_target = normpath( opj(dirname(fpath), link_target)) fpath = link_target # name in the archive aname = normpath(opj(leading_dir, rpath)) add_method(fpath, arcname=aname, **(tar_args if archivetype == 'tar' else {})) if not isabs(filename): filename = opj(os.getcwd(), filename) yield dict(status='ok', path=filename, type='file', action='export_archive', logger=lgr)
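# Hedged usage sketch (not part of the original module): exporting a dataset
# snapshot with the `export_archive` command above. The dataset and output
# paths are hypothetical.
def _example_export_archive():
    from datalad.api import Dataset
    ds = Dataset('/tmp/some-dataset')  # hypothetical dataset location
    # write a gzipped tarball, only warning about annexed files whose
    # content is not available locally instead of failing
    ds.export_archive(filename='/tmp/some-dataset-export',
                      archivetype='tar', compression='gz',
                      missing_content='continue')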
from datalad.interface.common_opts import recursion_flag from datalad.interface.common_opts import recursion_limit from datalad.interface.results import get_status_dict from datalad.interface.results import annexjson2result from datalad.interface.results import success_status_map from datalad.interface.results import results_from_annex_noinfo from datalad.interface.utils import handle_dirty_dataset from datalad.interface.utils import eval_results from datalad.interface.base import build_doc lgr = logging.getLogger('datalad.distribution.drop') dataset_argument = Parameter( args=("-d", "--dataset"), metavar="DATASET", doc="""specify the dataset to perform the operation on. If no dataset is given, an attempt is made to identify a dataset based on the `path` given""", constraints=EnsureDataset() | EnsureNone()) check_argument = Parameter( args=("--nocheck", ), doc="""whether to perform checks to ensure that the configured minimum number of (remote) sources for data is maintained.[CMD: Give this option to skip checks CMD]""", action="store_false", dest='check') def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs): """Helper to drop content in datasets.
class CreateSiblingOSF(Interface): """Create a dataset representation at OSF """ result_renderer = 'tailored' _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to create a sibling for. If no dataset is given, an attempt is made to identify the dataset based on the current working directory.""", constraints=EnsureDataset() | EnsureNone()), title=Parameter(args=("title", ), doc="""title of the OSF project to be created""", constraints=EnsureStr()), sibling=Parameter(args=("sibling", ), doc="""name under which the OSF project will be registered as a sibling (git-annex special remote) of the dataset""", constraints=EnsureStr()), ) @staticmethod @datasetmethod(name='create_sibling_osf') @eval_results def __call__(title, sibling, dataset=None): ds = require_dataset(dataset, purpose="create OSF remote", check_installed=True) # we need an annex if not isinstance(ds.repo, AnnexRepo): yield get_status_dict(action="create-sibling-osf", type="dataset", status="impossible", message="dataset has no annex") return # NOTES: # - we prob. should check osf-special-remote availability upfront to # fail early # - publish-depends option? # - (try to) detect github/gitlab/bitbucket to suggest linking it on # OSF and configure publish dependency # -> prob. overkill; just make it clear in the doc # - add --recursive option # - recursive won't work easily. Need to think that through. # - would need a naming scheme for subdatasets # - flat on OSF or a tree? # - how do we detect something is there already, so we can skip # rather than duplicate (with a new name)? # osf-type-special-remote sufficient to decide it's not needed? # - adapt to conclusions in issue #30 # -> create those subcomponents # - results need to report URL for created projects suitable for datalad # output formatting! # -> result_renderer # -> needs to be returned by create_project # - option: Make public! cred = _get_credentials() osf = OSF(**cred) proj_id, proj_url = create_project(osf_session=osf.session, title=title) yield get_status_dict(action="create-project-osf", type="dataset", url=proj_url, id=proj_id, status="ok") init_opts = [ "encryption=none", "type=external", "externaltype=osf", "autoenable=true", "project={}".format(proj_id) ] ds.repo.init_remote(sibling, options=init_opts) # TODO: add special remote name to result? # need to check w/ datalad-siblings conventions yield get_status_dict(action="add-sibling-osf", type="dataset", status="ok") @staticmethod def custom_result_renderer(res, **kwargs): from datalad.ui import ui status_str = "{action}({status}): " if res['action'] == "create-project-osf": ui.message("{action}({status}): {url}".format( action=ac.color_word(res['action'], ac.BOLD), status=ac.color_status(res['status']), url=res['url'])) elif res['action'] == "add-sibling-osf": ui.message("{action}({status})".format( action=ac.color_word(res['action'], ac.BOLD), status=ac.color_status(res['status']))) else: from datalad.interface.utils import default_result_renderer default_result_renderer(res, **kwargs)
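# Hedged usage sketch (not part of the original module): creating an OSF
# sibling with the command above. The dataset path, project title, and sibling
# name are hypothetical, and valid OSF credentials are assumed to be available
# to DataLad.
def _example_create_sibling_osf():
    from datalad.api import Dataset
    ds = Dataset('/tmp/some-annex-dataset')  # hypothetical annexed dataset
    # create an OSF project and register it as a git-annex special remote
    ds.create_sibling_osf(title='My dataset on OSF', sibling='osf')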
class CreateSiblingRia(Interface): """Creates a sibling to a dataset in a RIA store Communication with a dataset in a RIA store is implemented via two siblings. A regular Git remote (repository sibling) and a git-annex special remote for data transfer (storage sibling) -- with the former having a publication dependency on the latter. By default, the name of the storage sibling is derived from the repository sibling's name by appending "-storage". The store's base path is expected to not exist, be an empty directory, or a valid RIA store. RIA store layout ~~~~~~~~~~~~~~~~ A RIA store is a directory tree with a dedicated subdirectory for each dataset in the store. The subdirectory name is constructed from the DataLad dataset ID, e.g. '124/68afe-59ec-11ea-93d7-f0d5bf7b5561', where the first three characters of the ID are used for an intermediate subdirectory in order to mitigate files system limitations for stores containing a large number of datasets. Each dataset subdirectory contains a standard bare Git repository for the dataset. In addition, a subdirectory 'annex' hold a standard Git-annex object store. However, instead of using the 'dirhashlower' naming scheme for the object directories, like Git-annex would do, a 'dirhashmixed' layout is used -- the same as for non-bare Git repositories or regular DataLad datasets. Optionally, there can be a further subdirectory 'archives' with (compressed) 7z archives of annex objects. The storage remote is able to pull annex objects from these archives, if it cannot find in the regular annex object store. This feature can be useful for storing large collections of rarely changing data on systems that limit the number of files that can be stored. Each dataset directory also contains a 'ria-layout-version' file that identifies the data organization (as, for example, described above). Lastly, there is a global 'ria-layout-version' file at the store's base path that identifies where dataset subdirectories themselves are located. At present, this file must contain a single line stating the version (currently "1"). This line MUST end with a newline character. It is possible to define an alias for an individual dataset in a store by placing a symlink to the dataset location into an 'alias/' directory in the root of the store. This enables dataset access via URLs of format: 'ria+<protocol>://<storelocation>#~<aliasname>'. Error logging ~~~~~~~~~~~~~ To enable error logging at the remote end, append a pipe symbol and an "l" to the version number in ria-layout-version (like so '1|l\\n'). Error logging will create files in an "error_log" directory whenever the git-annex special remote (storage sibling) raises an exception, storing the Python traceback of it. The logfiles are named according to the scheme '<dataset id>.<annex uuid of the remote>.log' showing "who" ran into this issue with which dataset. Because logging can potentially leak personal data (like local file paths for example), it can be disabled client-side by setting the configuration variable "annex.ora-remote.<storage-sibling-name>.ignore-remote-config". """ # TODO: description? _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to process. If no dataset is given, an attempt is made to identify the dataset based on the current working directory""", constraints=EnsureDataset() | EnsureNone()), url=Parameter( args=("url",), metavar="ria+<ssh|file>://<host>[/path]", doc="""URL identifying the target RIA store and access protocol. 
""", constraints=EnsureStr() | EnsureNone()), name=Parameter( args=('-s', '--name',), metavar='NAME', doc="""Name of the sibling. With `recursive`, the same name will be used to label all the subdatasets' siblings.""", constraints=EnsureStr() | EnsureNone(), required=True), storage_name=Parameter( args=("--storage-name",), metavar="NAME", doc="""Name of the storage sibling (git-annex special remote). Must not be identical to the sibling name. If not specified, defaults to the sibling name plus '-storage' suffix. If only a storage sibling is created, this setting is ignored, and the primary sibling name is used.""", constraints=EnsureStr() | EnsureNone()), post_update_hook=Parameter( args=("--post-update-hook",), doc="""Enable git's default post-update-hook for the created sibling.""", action="store_true"), shared=Parameter( args=("--shared",), metavar='{false|true|umask|group|all|world|everybody|0xxx}', doc="""If given, configures the permissions in the RIA store for multi-users access. Possible values for this option are identical to those of `git init --shared` and are described in its documentation.""", constraints=EnsureStr() | EnsureBool() | EnsureNone()), group=Parameter( args=("--group",), metavar="GROUP", doc="""Filesystem group for the repository. Specifying the group is crucial when [CMD: --shared=group CMD][PY: shared="group" PY]""", constraints=EnsureStr() | EnsureNone()), storage_sibling=Parameter( args=("--storage-sibling",), dest='storage_sibling', metavar='MODE', constraints=EnsureChoice('only') | EnsureBool() | EnsureNone(), doc="""By default, an ORA storage sibling and a Git repository sibling are created ([CMD: on CMD][PY: True|'on' PY]). Alternatively, creation of the storage sibling can be disabled ([CMD: off CMD][PY: False|'off' PY]), or a storage sibling created only and no Git sibling ([CMD: only CMD][PY: 'only' PY]). In the latter mode, no Git installation is required on the target host."""), existing=Parameter( args=("--existing",), constraints=EnsureChoice( 'skip', 'error', 'reconfigure') | EnsureNone(), metavar='MODE', doc="""Action to perform, if a (storage) sibling is already configured under the given name and/or a target already exists. In this case, a dataset can be skipped ('skip'), an existing target repository be forcefully re-initialized, and the sibling (re-)configured ('reconfigure'), or the command be instructed to fail ('error').""", ), recursive=recursion_flag, recursion_limit=recursion_limit, trust_level=Parameter( args=("--trust-level",), metavar="TRUST-LEVEL", constraints=EnsureChoice( 'trust', 'semitrust', 'untrust') | EnsureNone(), doc="""specify a trust level for the storage sibling. If not specified, the default git-annex trust level is used.""",), disable_storage__=Parameter( args=("--no-storage-sibling",), dest='disable_storage__', doc="""This option is deprecated. 
Use '--storage-sibling off' instead.""", action="store_false"), ) @staticmethod @datasetmethod(name='create_sibling_ria') @eval_results def __call__(url, name, dataset=None, storage_name=None, post_update_hook=False, shared=None, group=None, storage_sibling=True, existing='error', trust_level=None, recursive=False, recursion_limit=None, disable_storage__=None, ): if disable_storage__ is not None: import warnings warnings.warn("datalad-create-sibling-ria --no-storage-sibling " "is deprecated, use --storage-sibling off instead.", DeprecationWarning) # recode to new setup disable_storage__ = None storage_sibling = False if storage_sibling == 'only' and storage_name: lgr.warning( "Sibling name will be used for storage sibling in " "storage-sibling-only mode, but a storage sibling name " "was provided" ) ds = require_dataset( dataset, check_installed=True, purpose='create sibling RIA') res_kwargs = dict( ds=ds, action="create-sibling-ria", logger=lgr, ) # parse target URL try: ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config) except ValueError as e: yield get_status_dict( status='error', message=str(e), **res_kwargs ) return if ds.repo.get_hexsha() is None or ds.id is None: raise RuntimeError( "Repository at {} is not a DataLad dataset, " "run 'datalad create [--force]' first.".format(ds.path)) if not storage_sibling and storage_name: lgr.warning( "Storage sibling setup disabled, but a storage sibling name " "was provided" ) if storage_sibling and not storage_name: storage_name = "{}-storage".format(name) if storage_sibling and name == storage_name: # leads to unresolvable, circular dependency with publish-depends raise ValueError("sibling names must not be equal") if not isinstance(url, str): raise TypeError("url is not a string, but %s" % type(url)) # Query existing siblings upfront in order to fail early on # existing=='error', since misconfiguration (particularly of special # remotes) only to fail in a subdataset later on with that config, can # be quite painful. # TODO: messages - this is "create-sibling". Don't confuse existence of # local remotes with existence of the actual remote sibling # in wording if existing == 'error': # in recursive mode this check could take a substantial amount of # time: employ a progress bar (or rather a counter, because we don't # know the total in advance pbar_id = 'check-siblings-{}'.format(id(ds)) log_progress( lgr.info, pbar_id, 'Start checking pre-existing sibling configuration %s', ds, label='Query siblings', unit=' Siblings', ) # even if we have to fail, let's report all conflicting siblings # in subdatasets failed = False for r in ds.siblings(result_renderer=None, recursive=recursive, recursion_limit=recursion_limit): log_progress( lgr.info, pbar_id, 'Discovered sibling %s in dataset at %s', r['name'], r['path'], update=1, increment=True) if not r['type'] == 'sibling' or r['status'] != 'ok': # this is an internal status query that has not consequence # for the outside world. 
Be silent unless something useful # can be said #yield r continue if r['name'] == name: res = get_status_dict( status='error', message="a sibling '{}' is already configured in " "dataset {}".format(name, r['path']), **res_kwargs, ) failed = True yield res continue if storage_name and r['name'] == storage_name: res = get_status_dict( status='error', message="a sibling '{}' is already configured in " "dataset {}".format(storage_name, r['path']), **res_kwargs, ) failed = True yield res continue log_progress( lgr.info, pbar_id, 'Finished checking pre-existing sibling configuration %s', ds, ) if failed: return # TODO: - URL parsing + store creation needs to be RF'ed based on # command abstractions # - more generally consider store creation a dedicated command or # option # Note: URL parsing is done twice ATM (for top-level ds). This can't be # reduced to single instance, since rewriting url based on config could # be different for subdatasets. create_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(), Path(base_path), '1') yield from _create_sibling_ria( ds, url, name, storage_sibling, storage_name, existing, shared, group, post_update_hook, trust_level, res_kwargs) if recursive: # Note: subdatasets can be treated independently, so go full # recursion when querying for them and _no_recursion with the # actual call. Theoretically this can be parallelized. for subds in ds.subdatasets(fulfilled=True, recursive=True, recursion_limit=recursion_limit, result_xfm='datasets'): yield from _create_sibling_ria( subds, url, name, storage_sibling, storage_name, existing, shared, group, post_update_hook, trust_level, res_kwargs)
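# Hedged usage sketch (not part of the original module): creating a sibling in
# a RIA store with the command above. The dataset path, store host, store
# location, and sibling name are hypothetical.
def _example_create_sibling_ria():
    from datalad.api import Dataset
    ds = Dataset('/tmp/some-dataset')  # hypothetical dataset location
    # create a Git sibling 'ria' plus an ORA storage sibling 'ria-storage'
    # in a RIA store reachable over SSH, covering all installed subdatasets
    # and skipping any dataset that already has such a sibling configured
    ds.create_sibling_ria('ria+ssh://store.example.org/data/store',
                          name='ria', recursive=True, existing='skip')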
class ExportToFigshare(Interface): """Export the content of a dataset as a ZIP archive to figshare Very quick and dirty approach. Ideally figshare should be supported as a proper git annex special remote. Unfortunately, figshare does not support having directories, and can store only a flat list of files. That makes it impossible for any sensible publishing of complete datasets. The only workaround is to publish dataset as a zip-ball, where the entire content is wrapped into a .zip archive for which figshare would provide a navigator. """ from datalad.support.param import Parameter from datalad.distribution.dataset import datasetmethod from datalad.interface.utils import eval_results from datalad.distribution.dataset import EnsureDataset from datalad.support.constraints import EnsureNone, EnsureInt, EnsureStr _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc=""""specify the dataset to export. If no dataset is given, an attempt is made to identify the dataset based on the current working directory.""", constraints=EnsureDataset() | EnsureNone()), filename=Parameter( args=("filename",), metavar="PATH", nargs='?', doc="""File name of the generated ZIP archive. If no file name is given the archive will be generated in the top directory of the dataset and will be named: datalad_<dataset_uuid>.zip.""", constraints=EnsureStr() | EnsureNone()), no_annex=Parameter( args=("--no-annex",), action="store_true", doc="""By default the generated .zip file would be added to annex, and all files would get registered in git-annex to be available from such a tarball. Also upon upload we will register for that archive to be a possible source for it in annex. Setting this flag disables this behavior."""), missing_content=Parameter( args=("--missing-content",), metavar="error|continue|ignore", doc="""By default, any discovered file with missing content will result in an error and the plugin is aborted. Setting this to 'continue' will issue warnings instead of failing on error. The value 'ignore' will only inform about problem at the 'debug' log level. 
The latter two can be helpful when generating a TAR archive from a dataset where some file content is not available locally.""", constraints=EnsureStr()), # article_id=Parameter( # args=("--project-id",), # metavar="ID", # doc="""If given, article (if article_id is not provided) will be # created in that project.""", # constraints=EnsureInt() | EnsureNone()), article_id=Parameter( args=("--article-id",), metavar="ID", doc="""Which article to publish to.""", constraints=EnsureInt() | EnsureNone()), ) @staticmethod @datasetmethod(name='export_to_figshare') @eval_results def __call__(dataset, filename=None, missing_content='error', no_annex=False, # TODO: support working with projects and articles within them # project_id=None, article_id=None): import os import logging lgr = logging.getLogger('datalad.plugin.export_to_figshare') from datalad.ui import ui from datalad.api import add_archive_content from datalad.api import export_archive from datalad.distribution.dataset import require_dataset from datalad.support.annexrepo import AnnexRepo dataset = require_dataset(dataset, check_installed=True, purpose='export to figshare') if not isinstance(dataset.repo, AnnexRepo): raise ValueError( "%s is not an annex repo, so annexification could be done" % dataset ) if dataset.repo.is_dirty(): raise RuntimeError( "Paranoid authors of DataLad refuse to proceed in a dirty repository" ) if filename is None: filename = dataset.path lgr.info( "Exporting current tree as an archive under %s since figshare " "does not support directories", filename ) archive_out = next( export_archive( dataset, filename=filename, archivetype='zip', missing_content=missing_content, return_type="generator" ) ) assert archive_out['status'] == 'ok' fname = archive_out['path'] lgr.info("Uploading %s to figshare", fname) figshare = FigshareRESTLaison() if not article_id: # TODO: ask if it should be an article within a project if ui.is_interactive: # or should we just upload to a new article? if ui.yesno( "Would you like to create a new article to upload to? " "If not - we will list existing articles", title="Article" ): article = figshare.create_article( title=os.path.basename(dataset.path) ) lgr.info( "Created a new (private) article %(id)s at %(url_private_html)s. 
" "Please visit it, enter additional meta-data and make public", article ) article_id = article['id'] else: article_id = int(ui.question( "Which of the articles should we upload to.", choices=list(map(str, figshare.get_article_ids())) )) if not article_id: raise ValueError("We need an article to upload to.") file_info = figshare.upload_file( fname, files_url='account/articles/%s/files' % article_id ) if no_annex: lgr.info("Removing generated tarball") unlink(fname) else: # I will leave all the complaining etc to the dataset add if path # is outside etc lgr.info("'Registering' %s within annex", fname) repo = dataset.repo repo.add(fname, git=False) key = repo.get_file_key(fname) lgr.info("Adding URL %(download_url)s for it", file_info) repo._annex_custom_command([], [ "git", "annex", "registerurl", '-c', 'annex.alwayscommit=false', key, file_info['download_url'] ] ) lgr.info("Registering links back for the content of the archive") add_archive_content( fname, annex=dataset.repo, delete_after=True, # just remove extracted into a temp dir allow_dirty=True, # since we have a tarball commit=False # we do not want to commit anything we have done here ) lgr.info("Removing generated and now registered in annex archive") repo.drop(key, key=True, options=['--force']) repo.remove(fname, force=True) # remove the tarball # if annex in {'delete'}: # dataset.repo.remove(fname) # else: # # kinda makes little sense I guess. # # Made more sense if export_archive could export an arbitrary treeish # # so we could create a branch where to dump and export to figshare # # (kinda closer to my idea) # dataset.save(fname, message="Added the entire dataset into a zip file") # TODO: add to downloader knowledge about figshare token so it could download-url # those zipballs before they go public yield dict( status='ok', # TODO: add article url (which needs to be queried if only ID is known message="Published archive {}".format( file_info['download_url']), file_info=file_info, path=dataset, action='export_to_figshare', logger=lgr )
class Save(Interface): """Save the current state of a dataset Saving the state of a dataset records all changes that have been made to it. This change record is annotated with a user-provided description. Optionally, an additional tag, such as a version, can be assigned to the saved state. Such tag enables straightforward retrieval of past versions at a later point in time. || PYTHON >> Returns ------- commit or None `None` if nothing was saved, the resulting commit otherwise. << PYTHON || """ _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc=""""specify the dataset to save. If no dataset is given, an attempt is made to identify the dataset based on the current working directory.""", constraints=EnsureDataset() | EnsureNone()), files=Parameter( args=("files", ), metavar='FILES', doc="""list of files to consider. If given, only changes made to those files are recorded in the new state.""", nargs='*', constraints=EnsureStr() | EnsureNone()), message=Parameter(args=( "-m", "--message", ), metavar='MESSAGE', doc="""a message to annotate the saved state.""", constraints=EnsureStr() | EnsureNone()), auto_add_changes=Parameter( args=("-a", "--auto-add-changes"), doc="""automatically include all changes in the entire dataset, independent of the current working directory.""", action="store_true"), version_tag=Parameter(args=("--version-tag", ), metavar='ID', doc="""an additional marker for that state.""", constraints=EnsureStr() | EnsureNone()), super_datasets=super_datasets_flag, recursive=recursion_flag, recursion_limit=recursion_limit, ) @staticmethod @datasetmethod(name='save') def __call__(message=None, files=None, dataset=None, auto_add_changes=False, version_tag=None, recursive=False, recursion_limit=None, super_datasets=False): # shortcut ds = require_dataset(dataset, check_installed=True, purpose='saving') if not ds.repo.repo.is_dirty(index=True, working_tree=True, untracked_files=True, submodules=True): # if we cannot see anything dirty at all, the only things we could # do is tag if version_tag: ds.repo.tag(version_tag) # take the easy one out return # always yields list; empty if None files = assure_list(files) # track what to be committed, so it becomes # possible to decide when/what to save further down # and one level up orig_hexsha = ds.repo.get_hexsha() to_commit = [] # before anything, let's deal with missing submodules that may have # been rm'ed by the user # this will not alter/amend the history of the dataset deinit_deleted_submodules(ds) # XXX path resolution needs to happen on the input argument, not the # resolved dataset! # otherwise we will not be able to figure out, whether there was an # explicit dataset provided, or just a matching one resolved # automatically. # if files are provided but no dataset, we interpret them as # CWD-related if auto_add_changes: # use the dataset's base path to indicate that everything # should be saved if files: lgr.warning( "List of paths was provided to save but auto_add_changes " "was specified, so list of paths was ignored") files = [ds.path] else: # make sure we apply the usual path interpretation logic files = [resolve_path(p, dataset) for p in files] new_submodules = untracked_subdatasets_to_submodules(ds, files) if new_submodules: # make sure that .gitmodules is added to the list of files # to be committed. 
Adding to index might not be enough iff # custom files was provided to_commit.append('.gitmodules') to_commit.extend(new_submodules) # now we should have a complete list of submodules to potentially # recurse into if recursive and (recursion_limit is None or recursion_limit > 0): # what subdataset to touch? subdss = [] if auto_add_changes: # all installed 1st-level ones # we only want immediate subdatasets, higher depths will come # via recursion subdss = [ Dataset(opj(ds.path, subds_path)) for subds_path in ds.get_subdatasets(recursive=False) ] elif files is not None: # only subdatasets that contain any of the to-be-considered # paths # TODO: the same deductions will be redone later again # very inefficient. Should be just sorted into subds # once! subdss = [ ds.get_containing_subdataset(p, recursion_limit=1) for p in files ] # skip anything that isn't installed, or this dataset subdss = [d for d in subdss if d.is_installed() and d != ds] prop_recursion_limit = \ None if recursion_limit is None else max(recursion_limit - 1, 0) for subds in subdss: # TODO: just make use of get._sort_paths_into_datasets # currently it is very inefficient since for the same ds # it asks about subdatasets for every file! subds_files = [] # files belonging to the subds todo_files = [] # leftover files for f in files: if ds.get_containing_subdataset( f, recursion_limit=1) == subds: subds_files.append(f) else: todo_files.append(f) files = todo_files subds_modified = Save.__call__( message=message, files=subds_files, dataset=subds, auto_add_changes=auto_add_changes, version_tag=version_tag, recursive=recursive and (prop_recursion_limit is None or prop_recursion_limit > 0), recursion_limit=prop_recursion_limit, ) if subds_modified: # stage changes in this submodule subdspath = relpath(subds.path, ds.path) ds.repo.add(subdspath, git=True) to_commit.append(subdspath) if files: # could still be none without auto add changes ds_subdatasets = ds.get_subdatasets(recursive=False) subdatasets_paths = {opj(ds.path, f) for f in ds_subdatasets} # TODO: also use some centralized sorting into sub-datasets # e.g. one used in get ds_files = [ f for f in files if f in subdatasets_paths or ds.get_containing_subdataset(f, recursion_limit=1) == ds ] if len(ds_files): # XXX Is there a better way to handle files in mixed repos? ds.repo.add(ds_files) ds.repo.add(ds_files, git=True) to_commit.extend(ds_files) # it might be that the file itself is the submodule, so we might # need to commit .gitmodules for f in files: for subds in subdatasets_paths: if subds.rstrip('/') == f.rstrip('/'): to_commit.append('.gitmodules') break _datalad_msg = False if not message: message = 'Recorded existing changes' _datalad_msg = True # extend with files yet to be committed in this dataset to_commit.extend(files) # anything should be staged by now # however, staged submodule changes are not considered as # `index`, hence `submodules` needs to be True too # we can have an explicit list of stuff to save or (if no `files` # provided) have staged stuff if ds.repo.repo.is_dirty(index=True, working_tree=False, untracked_files=False, submodules=True): # Analyze list of known to be committed files/submodules, # see if nothing points outside, and then convert to relative paths to_commit_rel = [] if to_commit: repopath = ds.repo.path for f in to_commit: if isabs(f): frel = relpath(f, repopath) if frel.startswith(pardir): # XXX may be just a warning and skip? raise RuntimeError( "Path %s outside of the dataset %s. 
Can't commit" % (f, ds)) f = frel to_commit_rel.append(f) to_commit_rel = sorted(set(to_commit_rel)) if '.' in to_commit_rel: # we need to commit everything to_commit_rel = [] ds.repo.commit(message, options=to_commit_rel, _datalad_msg=_datalad_msg) elif to_commit: lgr.warning( "Was instructed to commit %s files but repository is not dirty", to_commit) elif not auto_add_changes: lgr.info('Nothing to save, consider auto-detection of changes, ' 'if this is unexpected.') # MIH: let's tag even if there was nothing commit. I'd forget this # option too often... if version_tag: ds.repo.tag(version_tag) _was_modified = ds.repo.get_hexsha() != orig_hexsha # and now we could consider saving our changes within super-datasets # Let's float up until we get to a non-dataset if super_datasets: if _was_modified: if version_tag: lgr.info( "Version tag %s will not be applied to super datasets", version_tag) superds = ds while True: supersubds = superds superds = superds.get_superdataset(datalad_only=True) if not superds: break Save.__call__( message=message + " [origin: %s]" % relpath(ds.path, superds.path), files=[relpath(supersubds.path, superds.path)], dataset=superds, auto_add_changes=False, version_tag=None, recursive=False, ) else: lgr.info( "Not trying to save super-datasets since no modifications") # TODO: figure out what we should return for recursive/super_datasets # shouldn't we return all commits??? return ds.repo.repo.head.commit if _was_modified else None @staticmethod def result_renderer_cmdline(res, args): from datalad.ui import ui if res: ui.message('Saved state: "{0}" by {1} [{2}]'.format( res.message.splitlines()[0], res.committer, res.hexsha))
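# Hedged usage sketch (not part of the original module): the legacy `save`
# variant above exposes an `-a/--auto-add-changes` switch instead of
# `--all-updated`. It is only meaningful if this legacy variant is the one
# registered; dataset path and tag are hypothetical.
def _example_save_legacy():
    from datalad.api import Dataset
    ds = Dataset('/tmp/some-dataset')  # hypothetical dataset location
    # save every change in the entire dataset, regardless of the current
    # working directory, and tag the resulting commit
    ds.save(message="checkpoint", auto_add_changes=True, version_tag='snap-1')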
class CreateSiblingRia(Interface): """Creates a sibling to a dataset in a RIA store This creates a representation of a dataset in a ria-remote compliant storage location. For access to it, two siblings are configured for the dataset by default: a "regular" one and a RIA remote (a git-annex special remote). Furthermore, the former is configured to have a publication dependency on the latter. If not given, a default name for the RIA remote is derived from the sibling's name by appending "-ria". The store's base path is currently expected to either: - not yet exist or - be empty or - have a valid `ria-layout-version` file and an `error_logs` directory. In the first two cases, said file and directory are created by this command. Alternatively, you can manually create the third case, of course. Please note that `ria-layout-version` needs to contain a line stating the version (currently '1') and optionally enable error logging (append '|l' in that case). Currently, this line MUST end with a newline! Error logging will create files in the `error_log` directory whenever the RIA special remote (storage sibling) raises an exception, storing its Python traceback. The logfiles are named according to the scheme <dataset id>.<annex uuid of the remote>.log showing 'who' ran into this issue with what dataset. Since this logging can potentially leak personal data (like local file paths for example) it can be disabled from the client side via `annex.ria-remote.<RIAREMOTE>.ignore-remote-config`. Todo ---- Where to put the description of a RIA store (see below)? The targeted layout of such a store is a tree of datasets, starting at the configured base path. The first level of subdirectories is named for the first three characters of the datasets' ids, the second level is the remainder of those ids. The thereby created dataset directories contain a bare git repository. Those bare repositories are slightly different from plain git-annex bare repositories in that they use the standard dirhashmixed layout beneath annex/objects as opposed to dirhashlower, which is git-annex's default for bare repositories. Furthermore, there is an additional directory 'archives' within the dataset directories, which may or may not contain archives with annexed content. Note that this helps to reduce the number of inodes consumed (no checkout + potential archive) and also makes it possible to resolve dependencies (that is, (sub)datasets) merely by their id. Finally, there is a file `ria-layout-version` put beneath the store's base path, determining the version of the dataset tree layout, and a file of the same name in each dataset directory, determining the object tree layout version (we already switched from dirhashlower to dirhashmixed, for example), and an additional directory `error_logs` at the toplevel. """ # TODO: description? _params_ = dict( dataset=Parameter(args=("-d", "--dataset"), doc="""specify the dataset to process. If no dataset is given, an attempt is made to identify the dataset based on the current working directory""", constraints=EnsureDataset() | EnsureNone()), url=Parameter( args=("url", ), metavar="ria+<ssh|file>://<host>[/path]", doc="""URL identifying the target RIA store and access protocol. """, constraints=EnsureStr() | EnsureNone()), name=Parameter(args=( '-s', '--name', ), metavar='NAME', doc="""Name of the sibling.
With `recursive`, the same name will be used to label all the subdatasets' siblings.""", constraints=EnsureStr() | EnsureNone(), required=True), ria_remote_name=Parameter( args=("--ria-remote-name", ), metavar="NAME", doc="""Name of the RIA remote (a git-annex special remote). Must not be identical to the sibling name. If not specified, defaults to the sibling name plus a '-ria' suffix.""", constraints=EnsureStr() | EnsureNone()), post_update_hook=Parameter( args=("--post-update-hook", ), doc="""Enable git's default post-update-hook for the created sibling.""", action="store_true"), shared=Parameter( args=("--shared", ), metavar='{false|true|umask|group|all|world|everybody|0xxx}', doc="""If given, configures the permissions in the RIA store for multi-user access. Possible values for this option are identical to those of `git init --shared` and are described in its documentation.""", constraints=EnsureStr() | EnsureBool() | EnsureNone()), group=Parameter( args=("--group", ), metavar="GROUP", doc="""Filesystem group for the repository. Specifying the group is crucial when using [CMD: --shared=group CMD][PY: shared="group" PY]""", constraints=EnsureStr() | EnsureNone()), ria_remote=Parameter( args=("--no-ria-remote", ), dest='ria_remote', doc="""Whether to establish remote indexed archive (RIA) capabilities for the created sibling. If enabled, git-annex special remote access will be configured to enable regular git-annex key storage, and also retrieval of keys from (compressed) 7z archives that might be provided by the dataset store. If disabled, git-annex is instructed to ignore the sibling.""", action="store_false"), existing=Parameter( args=("--existing", ), constraints=EnsureChoice('skip', 'replace', 'error', 'reconfigure'), metavar='MODE', doc="""Action to perform if a sibling or ria-remote is already configured under the given name and/or a target already exists. In this case, a dataset can be skipped ('skip'), an existing target directory can be forcefully re-initialized and the sibling (re-)configured ('replace', implies 'reconfigure'), only the sibling configuration can be updated ('reconfigure'), or an error can be raised ('error').""", ), recursive=recursion_flag, recursion_limit=recursion_limit, ) @staticmethod @datasetmethod(name='create_sibling_ria') @eval_results def __call__(url, name, dataset=None, ria_remote_name=None, post_update_hook=False, shared=None, group=None, ria_remote=True, existing='error', recursive=False, recursion_limit=None): ds = require_dataset(dataset, check_installed=True, purpose='create sibling RIA') res_kwargs = dict( ds=ds, action="create-sibling-ria", logger=lgr, ) if ds.repo.get_hexsha() is None or ds.id is None: raise RuntimeError("Repository at {} is not a DataLad dataset, " "run 'datalad create [--force]' first.".format( ds.path)) if not ria_remote and ria_remote_name: lgr.warning( "RIA remote setup disabled, but a ria-remote name was provided" ) if ria_remote and not ria_remote_name: ria_remote_name = "{}-ria".format(name) if ria_remote and name == ria_remote_name: # leads to unresolvable, circular dependency with publish-depends raise ValueError("sibling names must not be equal") if not isinstance(url, str): raise TypeError("url is not a string, but %s" % type(url)) # Query existing siblings upfront in order to fail early on # existing=='error', since misconfiguration (particularly of special # remotes) that only fails in a subdataset later on with that config can # be quite painful. # TODO: messages - this is "create-sibling".
Don't confuse existence of # local remotes with existence of the actual remote sibling # in wording if existing == 'error': # in recursive mode this check could take a substantial amount of # time: employ a progress bar (or rather a counter, because we don't # know the total in advance) pbar_id = 'check-siblings-{}'.format(id(ds)) log_progress( lgr.info, pbar_id, 'Start checking pre-existing sibling configuration %s', ds, label='Query siblings', unit=' Siblings', ) # even if we have to fail, let's report all conflicting siblings # in subdatasets failed = False for r in ds.siblings(result_renderer=None, recursive=recursive, recursion_limit=recursion_limit): log_progress(lgr.info, pbar_id, 'Discovered sibling %s in dataset at %s', r['name'], r['path'], update=1, increment=True) if not r['type'] == 'sibling' or r['status'] != 'ok': # this is an internal status query that has no consequence # for the outside world. Be silent unless something useful # can be said #yield r continue if r['name'] == name: res = get_status_dict( status='error', message="a sibling '{}' is already configured in " "dataset {}".format(name, r['path']), **res_kwargs, ) failed = True yield res continue if ria_remote_name and r['name'] == ria_remote_name: res = get_status_dict( status='error', message="a sibling '{}' is already configured in " "dataset {}".format(ria_remote_name, r['path']), **res_kwargs, ) failed = True yield res continue log_progress( lgr.info, pbar_id, 'Finished checking pre-existing sibling configuration %s', ds, ) if failed: return yield from _create_sibling_ria(ds, url, name, ria_remote, ria_remote_name, existing, shared, group, post_update_hook, res_kwargs) if recursive: # Note: subdatasets can be treated independently, so go full # recursion when querying for them and _no_recursion with the # actual call. Theoretically this can be parallelized. for subds in ds.subdatasets(fulfilled=True, recursive=True, recursion_limit=recursion_limit, result_xfm='datasets'): yield from _create_sibling_ria(subds, url, name, ria_remote, ria_remote_name, existing, shared, group, post_update_hook, res_kwargs)
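# --- Illustrative sketches (not part of the original module) --------------
# The docstring above describes the store layout: a dataset lives at
# <store-base>/<first three characters of its id>/<remaining characters>.
# This hypothetical helper only demonstrates that mapping.
def _example_ria_dataset_path(store_base, dataset_id):
    from os.path import join
    return join(store_base, dataset_id[:3], dataset_id[3:])


# Hedged usage sketch of the command itself via the dataset method bound by
# @datasetmethod(name='create_sibling_ria') above. The store URL and the
# sibling name are placeholders; by default this also configures a special
# remote named '<name>-ria' and a publication dependency on it.
def _example_create_sibling_ria():
    ds = require_dataset(None, check_installed=True, purpose='demonstration')
    for res in ds.create_sibling_ria(
            'ria+ssh://storehost.example.org/data/store',
            name='mystore'):
        print(res)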
class Run(Interface): """Run an arbitrary shell command and record its impact on a dataset. It is recommended to craft the command such that it can run in the root directory of the dataset that the command will be recorded in. However, as long as the command is executed somewhere underneath the dataset root, the exact location will be recorded relative to the dataset root. If the executed command did not alter the dataset in any way, no record of the command execution is made. If the given command errors, a `CommandError` exception with the same exit code will be raised, and no modifications will be saved. *Command format* || REFLOW >> A few placeholders are supported in the command via Python format specification. "{pwd}" will be replaced with the full path of the current working directory. "{dspath}" will be replaced with the full path of the dataset that run is invoked on. "{inputs}" and "{outputs}" represent the values specified by [CMD: --input and --output CMD][PY: `inputs` and `outputs` PY]. If multiple values are specified, the values will be joined by a space. The order of the values will match that order from the command line, with any globs expanded in alphabetical order (like bash). Individual values can be accessed with an integer index (e.g., "{inputs[0]}"). << REFLOW || To escape a brace character, double it (i.e., "{{" or "}}"). """ _params_ = dict( cmd=Parameter( args=("cmd",), nargs=REMAINDER, metavar='COMMAND', doc="command for execution"), dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to record the command results in. An attempt is made to identify the dataset based on the current working directory. If a dataset is given, the command will be executed in the root directory of this dataset.""", constraints=EnsureDataset() | EnsureNone()), inputs=Parameter( args=("--input",), dest="inputs", metavar=("PATH"), action='append', doc="""A dependency for the run. Before running the command, the content of this file will be retrieved. A value of "." means "run :command:`datalad get .`". The value can also be a glob. [CMD: This option can be given more than once. CMD]"""), outputs=Parameter( args=("--output",), dest="outputs", metavar=("PATH"), action='append', doc="""Prepare this file to be an output file of the command. A value of "." means "run :command:`datalad unlock .`" (and will fail if some content isn't present). For any other value, if the content of this file is present, unlock the file. Otherwise, remove it. The value can also be a glob. [CMD: This option can be given more than once. CMD]"""), expand=Parameter( args=("--expand",), metavar=("WHICH"), doc="""Expand globs when storing inputs and/or outputs in the commit message.""", constraints=EnsureNone() | EnsureChoice("inputs", "outputs", "both")), explicit=Parameter( args=("--explicit",), action="store_true", doc="""Consider the specification of inputs and outputs to be explicit. Don't warn if the repository is dirty, and only save modifications to the listed outputs."""), message=save_message_opt, sidecar=Parameter( args=('--sidecar',), metavar="yes|no", doc="""By default, the configuration variable 'datalad.run.record-sidecar' determines whether a record with information on a command's execution is placed into a separate record file instead of the commit message (default: off). This option can be used to override the configured behavior on a case-by-case basis. 
Sidecar files are placed into the dataset's '.datalad/runinfo' directory (customizable via the 'datalad.run.record-directory' configuration variable).""", constraints=EnsureNone() | EnsureBool()), rerun=Parameter( args=('--rerun',), action='store_true', doc="""re-run the command recorded in the last saved change (if any). Note: This option is deprecated since version 0.9.2 and will be removed in a later release. Use `datalad rerun` instead."""), ) @staticmethod @datasetmethod(name='run') @eval_results def __call__( cmd=None, dataset=None, inputs=None, outputs=None, expand=None, explicit=False, message=None, sidecar=None, rerun=False): if rerun: if cmd: lgr.warning("Ignoring provided command in --rerun mode") lgr.warning("The --rerun option is deprecated since version 0.9.2. " "Use `datalad rerun` instead.") from datalad.interface.rerun import Rerun for r in Rerun.__call__(dataset=dataset, message=message): yield r else: if cmd: for r in run_command(cmd, dataset=dataset, inputs=inputs, outputs=outputs, expand=expand, explicit=explicit, message=message, sidecar=sidecar): yield r else: lgr.warning("No command given")
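# --- Illustrative usage sketch (not part of the original module) ----------
# Demonstrates the "{inputs}"/"{outputs}" placeholders documented above via
# the dataset method bound by @datasetmethod(name='run'). File and script
# names are hypothetical; the import path for Dataset is assumed.
def _example_run_usage():
    from datalad.distribution.dataset import Dataset
    ds = Dataset('.')
    # 'raw/data.csv' is retrieved first, 'out/summary.csv' is unlocked or
    # removed, then the expanded command is executed and its effect recorded
    for res in ds.run(
            "python analyze.py {inputs} {outputs}",
            inputs=['raw/data.csv'],
            outputs=['out/summary.csv']):
        print(res)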
class Add(Interface): """Add files/directories to an existing dataset. Typically, files and directories to be added to a dataset would be placed into a directory of a dataset, and subsequently this command can be used to register this new content with the dataset. With recursion enabled, files will be added to their respective subdatasets as well. || REFLOW >> By default all files are added to the dataset's :term:`annex`, i.e. only their content identity and availability information is tracked with Git. This results in lightweight datasets. If desired, the [PY: `to_git` PY][CMD: --to-git CMD] flag can be used to tell datalad to inject files directly into Git. While this is not recommended for binary data or large files, it can be used for source code and meta-data to be able to benefit from Git's track and merge capabilities. Files checked directly into Git are always and unconditionally available immediately after installation of a dataset. << REFLOW || .. note:: Power-user info: This command uses :command:`git annex add`, or :command:`git add` to incorporate new dataset content. """ _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), metavar='PATH', doc="""specify the dataset to perform the add operation on. If no dataset is given, an attempt is made to identify the dataset based on the current working directory and/or the `path` given""", constraints=EnsureDataset() | EnsureNone()), path=Parameter( args=("path", ), metavar='PATH', doc="""path/name of the component to be added. The component must either exist on the filesystem already, or a `source` has to be provided.""", nargs="+", constraints=EnsureStr() | EnsureNone()), to_git=Parameter( args=("--to-git", ), action='store_true', doc="""flag whether to add data directly to Git, instead of tracking data identity only. Usually this is not desired, as it inflates dataset sizes and impacts flexibility of data transport. If not specified - it will be up to git-annex to decide, possibly on .gitattributes options."""), to_annex=Parameter( args=("--to-annex", ), action='store_false', dest='to_git', doc="""flag whether to force adding data to Annex, instead of git. It might be that .gitattributes instructs for a file to be added to git, but for some particular files it is desired to be added to annex (e.g. sensitive files etc). If not specified - it will be up to git-annex to decide, possibly on .gitattributes options."""), recursive=recursion_flag, recursion_limit=recursion_limit, # TODO not functional anymore ds2super=Parameter( args=( "-S", "--ds2super", "--datasets-to-super", ), action='store_true', doc="""given paths of dataset (toplevel) locations will cause these datasets to be added to their respective superdatasets underneath a given base `dataset` (instead of all their content to themselves). If no base `dataset` is provided, this flag has no effect. 
Regular files and directories are always added to their respective datasets, regardless of this setting."""), save=nosave_opt, message=save_message_opt, git_opts=git_opts, annex_opts=annex_opts, annex_add_opts=annex_add_opts, jobs=jobs_opt) @staticmethod @datasetmethod(name='add') @eval_results def __call__( path=None, dataset=None, # support passing this through on a path by path basis to_git=None, save=True, message=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path" ) refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) # TODO check if next isn't covered by discover_dataset_trace_to_targets already??
if dataset and ap.get('type', None) == 'dataset': # duplicates not possible, annotated_paths returns unique paths subds_to_add[ap['path']] = ap if got_nothing: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict('add', status='notneeded', path=refds_path, type='dataset', logger=lgr) return for subds in ds_to_annotate_from_recursion: if subds not in subds_to_add: # always prefer the already annotated path subds_to_add[subds] = ds_to_annotate_from_recursion[subds] if dataset: # we have a base dataset, discover any intermediate datasets between # the base and any already discovered dataset discovered = {} discover_dataset_trace_to_targets( # from here dataset.path, # to any dataset we are aware of subds_to_add.keys(), [], discovered) for parentds in discovered: for subds in discovered[parentds]: subds_to_add[subds] = subds_to_add.get( subds, dict(path=subds, parentds=parentds, type='dataset')) # merge custom paths and discovered dataset records, paths need to go first, # because we know most about them, and in the subsequent annotation call we skip the # later duplicate ones to_add.extend(subds_to_add.values()) # and compact, this should be OK as all the info is in each ap dict to_add = unique(to_add, lambda x: x['path']) if not to_add: # nothing left to do, potentially all errored before return # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, it will also deduplicate annotated_paths = AnnotatePaths.__call__( path=to_add, dataset=dataset, # never recursion, done already recursive=False, action='add', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path, path_only=False) assert (not completed) if not content_by_ds: # we should have complained about any inappropriate path argument # above, so if nothing is left, we can simply exit return # simple loop over datasets -- save happens later # start deep down to_save = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) torepoadd = {} respath_by_status = {} for ap in content_by_ds[ds_path]: # we have a new story ap.pop('status', None) torepoadd[ap['path']] = ap # skip anything that doesn't look like a wannabe subdataset if not ap.get('type', None) == 'dataset' or \ ap['path'] == ds_path: continue if ap.get('registered_subds', False): # subdataset that might be in this list because of the # need to save all the way up to a super dataset respath_by_status['success'] = \ respath_by_status.get('success', []) + [ap['path']] yield get_status_dict(status='notneeded', message="already known subdataset", **dict(common_report, **ap)) continue subds = Dataset(ap['path']) # check that the subds has a commit, and refuse # to operate on it otherwise, or we would get a bastard # submodule that cripples git operations if not subds.repo.get_hexsha(): yield get_status_dict( ds=subds, status='impossible', message='cannot add subdataset with no commits', **dict(common_report, **ap)) continue subds_relpath = relpath(ap['path'], ds_path) # make an attempt to configure a submodule source URL based on the # discovered remote configuration remote, branch = subds.repo.get_tracking_branch()
subds_url = subds.repo.get_remote_url( remote) if remote else None # Register the repository in the repo tree as a submodule try: ds.repo.add_submodule(subds_relpath, url=subds_url, name=None) except CommandError as e: yield get_status_dict(ds=subds, status='error', message=e.stderr, **dict(common_report, **ap)) continue # queue for saving using the updated annotated path ap['registered_subds'] = True # I hope this is true in direct mode too # TODO this is disabled, because in some circumstances # staging just doesn't happen, and it is unclear when # exactly -- the case that prompted disabling was a submodule # that had no content except for other submodules and was not staged, # whereas another submodule on the same level in the same # superdataset which also has one file in it was staged # disabled so that things work correctly, while paying a little bit of # slowdown #ap['staged'] = True to_save.append(ap) _fixup_submodule_dotgit_setup(ds, subds_relpath) # report added subdatasets -- `annex add` below won't do it yield get_status_dict(ds=subds, status='ok', message='added new subdataset', **dict(common_report, **ap)) # make sure that .gitmodules is added to the list of files gitmodules_path = opj(ds.path, '.gitmodules') # for git torepoadd[gitmodules_path] = dict(path=gitmodules_path) # and for save to_save.append( dict(path=gitmodules_path, parentds=ds_path, type='file')) # make sure any last minute additions make it to the saving stage # XXX? should content_by_ds become OrderedDict so that possible # super here gets processed last? lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd) added = ds.repo.add( list(torepoadd.keys()), git=to_git if isinstance(ds.repo, AnnexRepo) else True, commit=False) for a in added: res = annexjson2result(a, ds, type='file', **common_report) success = success_status_map[res['status']] respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] # produce best possible path/result annotation if res['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report res = dict(torepoadd[res['path']], **res) # override this in all cases to be safe res['parentds'] = ds.path if success: # this was successfully added, queue for saving this very path # in the dataset ap = {k: v for k, v in res.items() if k != 'status'} ap['staged'] = True # strip any status and state info (e.g. save will refuse to save # stuff that is marked state='untracked') to_save.append({ k: v for k, v in res.items() if k not in ('status', 'state') }) if a['file'] == '.gitmodules': # filter out .gitmodules, because this is only included for # technical reasons and has nothing to do with the actual content continue if GitRepo.is_valid_repo(res['path']): # more accurate report in case of an added submodule # mountpoint. # XXX Actually not sure if this can really happen # (depends on what our low-level code would do) # but worst case is that we lose a little bit of # coverage...
res['type'] = 'dataset' res['message'] = 'added new state as submodule' yield res for r in results_from_annex_noinfo( ds, torepoadd, respath_by_status, dir_fail_msg='could not add some content in %s %s', noinfo_dir_msg='nothing to add from %s', noinfo_file_msg='already included in the dataset', action='add', logger=lgr, refds=refds_path): if r['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report r = dict(r, **torepoadd[r['path']]) if r['status'] == 'notneeded': # this could be a file that was staged already, it doesn't need # to be added, but it should be saved/committed if so desired to_save.append({ k: v for k, v in r.items() if k not in ('status', 'state') }) # XXX something is fishy with the next one, rethink when sober.... if r['path'] == ds_path and r['status'] == 'ok': # this is for the entire dataset itself which was explicitly requested # make sure to save all r['type'] = 'dataset' r['process_content'] = True to_save.append( {k: v for k, v in r.items() if k != 'status'}) yield r if refds_path and ds_path != refds_path and len( respath_by_status.get('success', [])): # TODO XXX we have an issue here when `add('.')` is used and annex ignores any # dotfiles. In this case we end up not saving a dataset completely, because # we rely on accurate reporting. There is an issue about this already # TODO look up the issue ID # if there is a base dataset, but we are below it, and we have anything done to this # dataset -> queue dataset itself for saving its state in the parent ds_ap = dict( path=ds.path, # we have to look for the parent here, as we must save the # subdataset in the parent and not the whole subdataset itself type='dataset') parentds = get_dataset_root(normpath(opj(ds.path, pardir))) if parentds: ds_ap['parentds'] = parentds if dataset: ds_ap['refds'] = refds_path to_save.append(ds_ap) if not save: lgr.debug('Not calling `save` as instructed') return # TODO tell save what was staged already! Set 'staged=True' for # respective annotated paths that are fed into `save` # do not reuse any of the sorting done in here for saving, but instead # pass on all the annotated paths to have `save` figure out what to do with # them -- this costs something, but should be safer, and frankly is # more comprehensible for res in Save.__call__( # hand-selected annotated paths path=to_save, dataset=refds_path, message=message if message else '[DATALAD] added content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
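# --- Illustrative usage sketch (not part of the original module) ----------
# Adding new content via the dataset method bound by @datasetmethod(name='add')
# above. Paths and messages are hypothetical placeholders.
def _example_add_usage():
    ds = Dataset('.')  # dataset in the current directory
    # data file goes to the annex (the default), the script goes directly to Git
    for res in ds.add(path=['data/raw.dat'], message='add raw data'):
        print(res)
    for res in ds.add(path=['code/analyze.py'], to_git=True,
                      message='add analysis script'):
        print(res)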