def test_logging_to_a_file(dst):
    """Verify that a logger with a file target writes one plain log line.

    Parameters
    ----------
    dst : str
      Path of the log file to use. It must not exist when the test starts.
    """
    ok_(not exists(dst))

    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    try:
        # the log file is expected to exist right after the logger was
        # initialized, i.e. before anything was logged
        ok_(exists(dst))

        msg = "Oh my god, they killed Kenny"
        lgr.error(msg)
        with open(dst) as f:
            lines = f.readlines()
        assert_equal(len(lines), 1,
                     "Read more than a single log line: %s" % lines)
        line = lines[0]
        ok_(msg in line)
        ok_('\033[' not in line,
            msg="There should be no color formatting in log files. Got: %s"
                % line)
        # verify that time stamp and level are present in the log line
        # do not want to rely on not having race conditions around date/time
        # changes so matching just with regexp
        # (...)? is added to swallow possible traceback logs
        # NOTE: raw strings, so that the backslashes reach the regex engine
        # without relying on Python tolerating invalid escape sequences
        regex = r"\[ERROR\]"
        if EnsureBool()(dl_cfg.get('datalad.log.timestamp', False)):
            regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} " + regex
        if EnsureBool()(dl_cfg.get('datalad.log.vmem', False)):
            regex += r' RSS/VMS: \S+/\S+( \S+)?\s*'
        regex += r"(\s+\S+\s*)? " + msg
        assert_re_in(regex, line, match=True)
    finally:
        # Close all handlers so windows is happy -- apparently not closed
        # fast enough. Done in `finally` so that a failing assertion above
        # does not leave the log file open.
        for handler in lgr.handlers:
            handler.close()
def __init__(self, *args, **kwargs):
    """Initialize the resource and configure its request argument parser.

    All positional and keyword arguments are handed to the parent class
    unchanged. The parser is stored in `self.rp`.
    """
    super(FileResource, self).__init__(*args, **kwargs)

    # constraint instances serve as the `type` converters of the arguments
    as_bool = EnsureBool()
    as_json_mode = EnsureChoice('yes', 'no', 'stream')

    self.rp = parser = reqparse.RequestParser()

    parser.add_argument(
        'path',
        type=str,
        help="path to file. If none is given, or the path contains a "
             "wildcard character '*', a list of (matching) files in the "
             "dataset is returned.",
        location=['args', 'json', 'form'])

    json_help = '%s. {error_msg}' % repr(as_json_mode)
    parser.add_argument(
        'json',
        type=as_json_mode,
        default='no',
        help=json_help,
        location=['args', 'json', 'form'])

    verify_help = '%s. {error_msg}' % repr(as_bool)
    parser.add_argument(
        'verify_availability',
        type=as_bool,
        default='yes',
        help=verify_help,
        location=['args', 'json', 'form'])

    parser.add_argument(
        'content',
        help='file content',
        location=['form', 'json'])

    togit_help = (
        "flag whether to add files to git, instead of making a decision "
        "based on the dataset configuration. %s. {error_msg}"
    ) % repr(as_bool)
    parser.add_argument(
        'togit',
        type=as_bool,
        help=togit_help,
        location=['json', 'form'])
def test_logging_to_a_file(dst):
    """Verify that a logger with a file target writes one plain log line.

    Parameters
    ----------
    dst : str
      Path of the log file to use. It must not exist when the test starts.
    """
    ok_(not exists(dst))

    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    # the log file is expected to exist right after the logger was
    # initialized, i.e. before anything was logged
    ok_(exists(dst))

    msg = "Oh my god, they killed Kenny"
    lgr.error(msg)
    with open(dst) as f:
        lines = f.readlines()
    assert_equal(len(lines), 1,
                 "Read more than a single log line: %s" % lines)
    line = lines[0]
    ok_(msg in line)
    ok_('\033[' not in line,
        msg="There should be no color formatting in log files. Got: %s"
            % line)
    # verify that time stamp and level are present in the log line
    # do not want to rely on not having race conditions around date/time
    # changes so matching just with regexp
    # (\s+\S+\s*)? is added to swallow possible traceback logs
    # NOTE: raw strings, so that the backslashes reach the regex engine
    # without relying on Python tolerating invalid escape sequences
    regex = r"\[ERROR\](\s+\S+\s*)? %s" % msg
    if EnsureBool()(cfg.get('datalad.log.timestamp', False)):
        regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} " + regex
    ok_(re.match(regex, line))
def _get_commit_info(ds, refcommit, status):
    """Get info about all commits, up to (and incl. the refcommit).

    A single `git log` call on `refcommit` yields one record per commit
    (author name, author email, author date, shasum). From these records a
    version label is built, and, depending on the dataset configuration,
    the contributors and the creation/modification dates are reported.

    Parameters
    ----------
    ds : Dataset
      Dataset whose repository and configuration are queried.
    refcommit : str
      Commit-ish passed to `git log`.
    status
      Not used in this function.

    Returns
    -------
    dict
      Always has a 'version' key; 'contributors', 'dateCreated' and
      'dateModified' are added when the respective switches are enabled.
    """
    # each record: [name, email, timestamp, shasum]
    log_cmd = ['log', '--pretty=format:%aN%x00%aE%x00%aI%x00%H', refcommit]
    history = []
    for record in ds.repo.call_git_items_(log_cmd):
        history.append(record.split('\0'))

    # the version is a plain commit count plus an abbreviated shasum (like
    # git-describe); no tags are involved, because tags could move and make
    # the integer commit count ambiguous
    short_sha = ds.repo.get_hexsha(history[0][3], short=True)
    meta = {'version': '0-{}-g{}'.format(len(history), short_sha)}

    report_contributors = ds.config.obtain(
        'datalad.metadata.datalad-core.report-contributors',
        True,
        valtype=EnsureBool())
    if report_contributors:
        # unique (name, email) pairs in a stable order
        meta['contributors'] = sorted({tuple(rec[:2]) for rec in history})

    report_dates = ds.config.obtain(
        'datalad.metadata.datalad-core.report-modification-dates',
        True,
        valtype=EnsureBool())
    if report_dates:
        # last record -> creation, first record -> latest modification
        meta['dateCreated'] = history[-1][2]
        meta['dateModified'] = history[0][2]

    return meta
def get_state(self, dataset):
    """Return the settings that determine this extractor's output.

    Parameters
    ----------
    dataset : Dataset
      Dataset whose configuration is queried for the report switches.

    Returns
    -------
    dict
      Format version, the list of excluded unique keys, and the three
      boolean `datalad.metadata.datalad-core.report-*` switches.
    """
    obtain = dataset.config.obtain
    state = {
        # increment when output format changes
        'version': 1,
        'unique_exclude': list(self._unique_exclude),
    }
    # state key == suffix of the corresponding configuration variable
    for switch in ('remotes', 'contributors', 'modification-dates'):
        state[switch] = obtain(
            'datalad.metadata.datalad-core.report-' + switch,
            True,
            valtype=EnsureBool())
    return state
class CreateSibling(Interface):
    """Create dataset(s)'s sibling (e.g., on a web server).

    Those (empty) datasets can then serve as a target for the `publish`
    command.
    """

    _params_ = dict(
        # TODO: Figure out, whether (and when) to use `sshurl` as push url
        dataset=Parameter(
            args=("--dataset", "-d",),
            doc="specify the dataset to create the publication target for. "
                "If no dataset is given, an attempt is made to identify the "
                "dataset based on the current working directory",
            constraints=EnsureDataset() | EnsureNone()),
        sshurl=Parameter(
            args=("sshurl",),
            metavar='SSHURL',
            doc="Login information for the target server. This can be given "
                "as a URL (ssh://host/path) or SSH-style (user@host:path). "
                "Unless overridden, this also serves the future dataset's "
                "access URL and path on the server.",
            constraints=EnsureStr()),
        target=Parameter(
            args=('target',),
            metavar='TARGETNAME',
            doc="sibling name to create for this publication target. "
                "If `recursive` is set, the same name will be used to label "
                "all the subdatasets' siblings. Note, this is just a "
                "convenience option, siblings can also be added at a later "
                "point in time. When creation target datasets fails, no "
                "siblings are added",
            constraints=EnsureStr() | EnsureNone(),
            nargs="?"),
        target_dir=Parameter(
            args=('--target-dir',),
            metavar='PATH',
            doc="path to the directory *on the server* where the dataset "
                "shall be created. By default the SSH access URL is used to "
                "identify this directory. If a relative path is provided "
                "here, it is interpreted as being relative to the user's "
                "home directory on the server.\n "
                "Additional features are relevant for recursive processing "
                "of datasets with subdatasets. By default, the local dataset "
                "structure is replicated on the server. However, it is "
                "possible to provide a template for generating different "
                "target directory names for all (sub)datasets. Templates can "
                "contain certain placeholder that are substituted for each "
                "(sub)dataset. "
                'For example: "/mydirectory/dataset-%%NAME".\n'
                "Supported placeholders:\n "
                "%%NAME - the name of the datasets, with any slashes "
                "replaced by dashes\n",
            constraints=EnsureStr() | EnsureNone()),
        target_url=Parameter(
            args=('--target-url',),
            metavar='URL',
            doc='"public" access URL of the to-be-created target dataset(s) '
                "(default: `sshurl`). Accessibility of this URL determines "
                "the access permissions of potential consumers of the "
                "dataset. As with `target_dir`, templates (same set of "
                "placeholders) are supported. Also, if specified, it is "
                "provided as the annex description\n",
            constraints=EnsureStr() | EnsureNone()),
        target_pushurl=Parameter(
            args=('--target-pushurl',),
            metavar='URL',
            doc="In case the `target_url` cannot be used to publish to the "
                "dataset, this option specifies an alternative URL for this "
                "purpose. As with `target_url`, templates (same set of "
                "placeholders) are supported.\n",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        existing=Parameter(
            args=("--existing",),
            constraints=EnsureChoice('skip', 'replace', 'error',
                                     'reconfigure'),
            metavar='MODE',
            doc="action to perform, if target directory exists already. "
                "Dataset is skipped if 'skip'. 'replace' forces to (re-)init "
                "the dataset, and to (re-)configure the dataset sibling, "
                "i.e. its URL(s), in case it already exists. 'reconfigure' "
                "updates metadata of the dataset sibling. 'error' causes an "
                "exception to be raised.",
        ),
        shared=Parameter(
            args=("--shared",),
            metavar='false|true|umask|group|all|world|everybody|0xxx',
            doc="if given, configures the access permissions on the server "
                "for multi-users (this could include access by a "
                "webserver!). Possible values for this option are identical "
                "to those of `git init --shared` and are described in its "
                "documentation.",
            constraints=EnsureStr() | EnsureBool()),
        ui=Parameter(
            args=("--ui",),
            metavar='false|true|html_filename',
            doc="publish a web interface for the dataset with an optional "
                "user-specified name for the html at publication target. "
                "defaults to `index.html` at dataset root",
            constraints=EnsureBool() | EnsureStr()),
        as_common_datasrc=as_common_datasrc,
        publish_depends=publish_depends,
        publish_by_default=publish_by_default,
    )

    @staticmethod
    @datasetmethod(name='create_sibling')
    def __call__(sshurl, target=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None, recursive=False,
                 existing='error', shared=False, ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None):
        """Create the remote dataset(s) and optionally register the sibling.

        For the dataset (and, with `recursive`, all its subdatasets) a
        target directory is created via SSH, a git (and, for annex
        datasets, a git-annex) repository is initialized in it, and a
        post-update hook is installed and run. If `target` is given, the
        sibling is finally added locally via `AddSibling`.

        The meaning of all arguments is documented in `_params_`.

        Raises
        ------
        ValueError
          If `sshurl` is missing or unsupported, if a URL is given without
          a sibling name, or if `existing` has an unknown value.
        RuntimeError
          If a target directory exists and `existing` is 'error'.
        """
        if sshurl is None:
            raise ValueError(
                "insufficient information for target creation "
                "(needs at least a dataset and a SSH URL).")

        if target is None and (target_url is not None
                               or target_pushurl is not None):
            raise ValueError(
                "insufficient information for adding the target as a "
                "sibling (needs at least a name)")

        # shortcut
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')

        assert (ds is not None and sshurl is not None
                and ds.repo is not None)

        # determine target parameters:
        sshri = RI(sshurl)

        if not isinstance(sshri, SSHRI) \
                and not (isinstance(sshri, URL) and sshri.scheme == 'ssh'):
            raise ValueError(
                "Unsupported SSH URL: '{0}', use ssh://host/path or "
                "host:path syntax".format(sshurl))

        if target_dir is None:
            # fall back on the path of the SSH URL, or the remote default dir
            target_dir = sshri.path if sshri.path else '.'

        # TODO: centralize and generalize template symbol handling
        # without a template placeholder the local layout is mirrored
        replicate_local_structure = "%NAME" not in target_dir

        # collect datasets to use:
        datasets = dict()
        datasets[basename(ds.path)] = ds
        if recursive:
            for subds in ds.get_subdatasets(recursive=True):
                sub_path = opj(ds.path, subds)
                # TODO: when enhancing Dataset/*Repo classes and therefore
                # adapt to moved code, make proper distinction between name
                # and path of a submodule, which are technically different.
                # This probably will become important on windows as well as
                # whenever we want to allow for moved worktrees.
                datasets[basename(ds.path) + '/' + subds] = \
                    Dataset(sub_path)

        # request ssh connection:
        not_supported_on_windows("TODO")
        lgr.info("Connecting ...")
        ssh = ssh_manager.get_connection(sshurl)
        ssh.open()

        # flag to check if at dataset_root
        # NOTE(review): the flag is only cleared once the web UI was
        # uploaded. If the top-level dataset is skipped below (e.g. not
        # installed, or existing='skip'), the UI goes to the next processed
        # dataset -- confirm whether that is intended.
        at_root = True

        # loop over all datasets, ordered from top to bottom to make test
        # below valid (existing directories would cause the machinery to
        # halt)
        # But we need to run post-update hook in depth-first fashion, so
        # would only collect first and then run (see gh #790)
        remote_repos_to_run_hook_for = []
        for current_dspath in \
                sorted(datasets.keys(), key=lambda x: x.count('/')):
            current_ds = datasets[current_dspath]
            if not current_ds.is_installed():
                lgr.info("Skipping %s since not installed locally",
                         current_dspath)
                continue
            if not replicate_local_structure:
                path = target_dir.replace("%NAME",
                                          current_dspath.replace("/", "-"))
            else:
                # TODO: opj depends on local platform, not the remote one.
                # check how to deal with it. Does windows ssh server accept
                # posix paths? vice versa? Should planned SSH class provide
                # tools for this issue?
                path = normpath(opj(target_dir,
                                    relpath(datasets[current_dspath].path,
                                            start=ds.path)))

            lgr.info("Creating target dataset {0} at {1}".format(
                current_dspath, path))
            # Must be set to True only if exists and existing='reconfigure'
            # otherwise we might skip actions if we say
            # existing='reconfigure' but it did not even exist before
            only_reconfigure = False
            if path != '.':
                # check if target exists
                # TODO: Is this condition valid for != '.' only?
                path_exists = True
                try:
                    out, err = ssh(["ls", path])
                except CommandError as e:
                    if "No such file or directory" in e.stderr and \
                            path in e.stderr:
                        path_exists = False
                    else:
                        raise  # It's an unexpected failure here

                if path_exists:
                    if existing == 'error':
                        raise RuntimeError(
                            "Target directory %s already exists." % path)
                    elif existing == 'skip':
                        continue
                    elif existing == 'replace':
                        # enable write permissions to allow removing dir
                        ssh(["chmod", "+r+w", "-R", path])
                        # remove target at path
                        ssh(["rm", "-rf", path])
                        # if we succeeded in removing it
                        path_exists = False
                    elif existing == 'reconfigure':
                        only_reconfigure = True
                    else:
                        raise ValueError(
                            "Do not know how to handle existing=%s"
                            % repr(existing))

                if not path_exists:
                    try:
                        ssh(["mkdir", "-p", path])
                    except CommandError as e:
                        lgr.error(
                            "Remotely creating target directory failed at "
                            "%s.\nError: %s" % (path, exc_str(e)))
                        continue

            # don't (re-)initialize dataset if existing == reconfigure
            if not only_reconfigure:
                # init git and possibly annex repo
                if not CreateSibling.init_remote_repo(
                        path, ssh, shared, datasets[current_dspath],
                        description=target_url):
                    continue

            # check git version on remote end
            lgr.info("Adjusting remote git configuration")
            remote_git_version = CreateSibling.get_remote_git_version(ssh)
            if remote_git_version and remote_git_version >= "2.4":
                # allow for pushing to checked out branch
                try:
                    ssh(["git", "-C", path] +
                        ["config", "receive.denyCurrentBranch",
                         "updateInstead"])
                except CommandError as e:
                    lgr.error(
                        "git config failed at remote location %s.\n"
                        "You will not be able to push to checked out "
                        "branch. Error: %s", path, exc_str(e))
            else:
                lgr.error(
                    "Git version >= 2.4 needed to configure remote."
                    " Version detected on server: %s\nSkipping configuration"
                    " of receive.denyCurrentBranch - you will not be able to"
                    " publish updates to this repository. Upgrade your git"
                    " and run with --existing=reconfigure"
                    % remote_git_version)

            # enable metadata refresh on dataset updates to publication
            # server
            lgr.info("Enabling git post-update hook ...")
            try:
                CreateSibling.create_postupdate_hook(
                    path, ssh, datasets[current_dspath])
            except CommandError as e:
                lgr.error("Failed to add json creation command to post update "
                          "hook.\nError: %s" % exc_str(e))

            # publish web-interface to root dataset on publication server
            if at_root and ui:
                lgr.info("Uploading web interface to %s" % path)
                at_root = False
                try:
                    CreateSibling.upload_web_interface(path, ssh, shared, ui)
                except CommandError as e:
                    lgr.error("Failed to push web interface to the remote "
                              "datalad repository.\nError: %s" % exc_str(e))

            remote_repos_to_run_hook_for.append(path)

        # in reverse order would be depth first
        lgr.debug("Running post-update hooks in all created siblings")
        for path in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            try:
                ssh(
                    ["cd '" + _path_(path, ".git") + "' && hooks/post-update"],
                    wrap_args=False  # we wrapped here manually
                )
            except CommandError as e:
                lgr.error("Failed to run post-update hook under path %s. "
                          "Error: %s" % (path, exc_str(e)))

        if target:
            # add the sibling(s):
            lgr.debug("Adding the siblings")
            if target_url is None:
                target_url = sshurl
            if target_pushurl is None and sshurl != target_url:
                target_pushurl = sshurl
            AddSibling()(dataset=ds,
                         name=target,
                         url=target_url,
                         pushurl=target_pushurl,
                         recursive=recursive,
                         fetch=True,
                         force=existing in {'replace'},
                         as_common_datasrc=as_common_datasrc,
                         publish_by_default=publish_by_default,
                         publish_depends=publish_depends)

        # TODO: Return value!?
        #       => [(Dataset, fetch_url)]

    @staticmethod
    def init_remote_repo(path, ssh, shared, dataset, description=None):
        """Initialize a git (and possibly git-annex) repository remotely.

        Parameters
        ----------
        path : str
          Directory on the remote end in which to run `git init`.
        ssh
          Callable SSH connection used to execute the commands.
        shared
          If true-ish, passed to `git init` as `--shared=<value>`.
        dataset : Dataset
          Local dataset; `git annex init` is only run remotely if its
          repository is an `AnnexRepo`.
        description : str, optional
          Description passed to `git annex init`.

        Returns
        -------
        bool
          False if any remote command failed (the error is logged),
          True otherwise.
        """
        cmd = ["git", "-C", path, "init"]
        if shared:
            cmd.append("--shared=%s" % shared)
        try:
            ssh(cmd)
        except CommandError as e:
            lgr.error("Initialization of remote git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, exc_str(e)))
            return False

        if isinstance(dataset.repo, AnnexRepo):
            # init remote git annex repo (part fix of #463)
            try:
                ssh(
                    ["git", "-C", path, "annex", "init"] +
                    ([description] if description else [])
                )
            except CommandError as e:
                lgr.error(
                    "Initialization of remote git annex repository failed at %s."
                    "\nError: %s\nSkipping ..." % (path, exc_str(e)))
                return False
        return True

    @staticmethod
    def get_remote_git_version(ssh):
        """Return the git version reported by the remote end.

        Returns
        -------
        LooseVersion or None
          None if running `git version` remotely raised a `CommandError`
          (a warning is logged in that case).
        """
        try:
            out, err = ssh(["git", "version"])
            assert out.strip().startswith("git version")
            # output has the form "git version X.Y.Z"
            git_version = out.strip().split()[2]
            lgr.debug("Detected git version on server: %s" % git_version)
            return LooseVersion(git_version)

        except CommandError as e:
            lgr.warning(
                "Failed to determine git version on remote.\n"
                "Error: {0}\nTrying to configure anyway "
                "...".format(exc_str(e)))
        return None

    @staticmethod
    def create_postupdate_hook(path, ssh, dataset):
        """Upload an executable `post-update` hook to the remote repository.

        The hook runs `git update-server-info`, creates the log directory,
        and, if `datalad` is found on the remote end, runs
        `datalad ls -a --json file` with its output sent to a log file.

        Parameters
        ----------
        path : str
          Directory of the dataset on the remote end.
        ssh
          SSH connection used to copy the hook and make it executable.
        dataset : Dataset
          Not used in this method.
        """
        # location of post-update hook file, logs folder on remote target
        hooks_remote_dir = opj(path, '.git', 'hooks')
        hook_remote_target = opj(hooks_remote_dir, 'post-update')
        # post-update hook should create its log directory if doesn't exist
        logs_remote_dir = opj(path, WEB_META_LOG)
        make_log_dir = 'mkdir -p "{}"'.format(logs_remote_dir)

        # create json command for current dataset
        # NOTE: the trailing backslashes are shell line continuations and
        # therefore must be directly followed by a newline
        json_command = r'''
        mkdir -p {};
        ( which datalad > /dev/null \
        && ( cd ..; GIT_DIR=$PWD/.git datalad ls -a --json file '{}'; ) \
        || echo "no datalad found - skipping generation of indexes for web frontend"; \
        ) &> "{}/{}"
        '''.format(logs_remote_dir,
                   str(path),
                   logs_remote_dir,
                   'datalad-publish-hook-$(date +%s).log' % TIMESTAMP_FMT)

        # collate content for post_update hook
        hook_content = '\n'.join(['#!/bin/bash', 'git update-server-info',
                                  make_log_dir, json_command])

        # create post_update hook script
        with make_tempfile(content=hook_content) as tempf:
            # upload hook to dataset
            ssh.copy(tempf, hook_remote_target)
        # and make it executable
        ssh(['chmod', '+x', hook_remote_target])

    @staticmethod
    def upload_web_interface(path, ssh, shared, ui):
        """Upload the web interface (html and assets) to the remote dataset.

        Parameters
        ----------
        path : str
          Directory of the dataset on the remote end.
        ssh
          SSH connection used for copying files and running commands.
        shared
          Sharing mode; selects the permissions applied to the uploaded
          files (none are changed for unrecognized values).
        ui : bool or str
          If a string, it is used as the name of the html file on the
          remote end; otherwise "index.html" is used.
        """
        # path to web interface resources on local
        webui_local = opj(dirname(datalad.__file__), 'resources', 'website')
        # local html to dataset
        html_local = opj(webui_local, "index.html")

        # name and location of web-interface html on target
        html_targetname = ui if isinstance(ui, str) else "index.html"
        html_target = opj(path, html_targetname)

        # upload ui html to target
        ssh.copy(html_local, html_target)

        # upload assets to the dataset
        webresources_local = opj(webui_local, 'assets')
        webresources_remote = opj(path, WEB_HTML_DIR)
        ssh(['mkdir', '-p', webresources_remote])
        ssh.copy(webresources_local, webresources_remote, recursive=True)

        # minimize and upload js assets
        for js_file in glob(opj(webresources_local, 'js', '*.js')):
            with open(js_file) as asset:
                content = asset.read()
            # only the import is guarded, so that an ImportError raised
            # while minifying is not mistaken for a missing jsmin
            try:
                from jsmin import jsmin
            except ImportError:
                lgr.warning(
                    "Will not minify web interface javascript, no jsmin available")
                minified = content  # no minify available
            else:
                minified = jsmin(content)  # minify asset
            # write minified to tempfile
            with make_tempfile(content=minified) as tempf:
                # basename() instead of splitting on '/', which would not
                # work with a different local path separator
                js_name = basename(js_file)
                # and upload js
                ssh.copy(tempf,
                         opj(webresources_remote, 'assets', 'js', js_name))

        # explicitly make web+metadata dir of dataset world-readable, if
        # shared set to 'all'
        mode = None
        if shared in (True, 'true', 'all', 'world', 'everybody'):
            mode = 'a+rX'
        elif shared == 'group':
            mode = 'g+rX'
        elif str(shared).startswith('0'):
            mode = shared

        if mode:
            # adjust the html file that was actually uploaded, which is not
            # necessarily named index.html
            ssh(['chmod', mode, '-R',
                 dirname(webresources_remote),
                 html_target])
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
      Names of the metadata extractors to run; each is imported as a
      submodule of the `extractors` package.
    global_meta : bool, optional
      Whether to extract dataset-level metadata. If None, the
      `datalad.metadata.aggregate-dataset-<type>` setting decides.
    content_meta : bool, optional
      Whether to extract content metadata. If None, the
      `datalad.metadata.aggregate-content-<type>` setting decides.
    paths : list, optional
      Paths to hand to the extractors. For annex repositories, annexed
      files without content present are removed from this list.

    Returns
    -------
    tuple
      (dataset metadata, content metadata by location, error flag)
    """
    errored = False
    dsmeta = MetadataDict()
    # each item in here will be a MetadataDict, but not the whole thing
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab': 'http://docs.datalad.org/schema_v{}.json'.format(
            vocabulary_version)}

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        # materialize: the records are iterated twice below, and a plain
        # zip() iterator would be exhausted after the first pass
        content_info = list(zip(paths,
                                ds.repo.file_has_content(paths),
                                ds.repo.is_under_annex(paths)))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file
            # as no present
            lgr.warning(
                '{} files have no content present, skipped metadata extraction for {}'.format(
                    nocontent,
                    'them' if nocontent > 10
                    else [p for p, c, a in content_info if not c and a]))

    # pull out potential metadata field blacklist config settings
    blacklist = [re.compile(bl) for bl in assure_list(ds.config.obtain(
        'datalad.metadata.aggregate-ignore-fields',
        default=[]))]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from . import extractors
    lgr.info('Engage metadata extractors: %s', types)
    for mtype in types:
        mtype_key = mtype
        try:
            pmod = import_module('.{}'.format(mtype),
                                 package=extractors.__package__)
        except ImportError as e:
            lgr.warning(
                "Failed to import metadata extractor for '%s', "
                "broken dataset configuration (%s)? "
                "This type of metadata will be ignored: %s",
                mtype, ds, exc_str(e))
            if cfg.get('datalad.runtime.raiseonerror'):
                raise
            errored = True
            continue
        extractor = pmod.MetadataExtractor(ds, paths=paths)
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta if global_meta is not None
                else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta if content_meta is not None
                else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                raise
            errored = True
            # if we dont get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(
                    dsmeta_t,
                    maxsize=max_fieldsize,
                    blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        for loc, meta in contentmeta_t or {}:
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                continue
            # we also want to store info that there was no metadata (e.g. to
            # get a list of files that have no metadata)
            # if there is an issue that a extractor needlessly produces
            # empty records, the extractor should be fixed and not a general
            # switch. For example the datalad_core issues empty records to
            # document the presence of a file
            #elif not meta:
            #    continue

            meta = MetadataDict(meta)
            # apply filters
            meta = _filter_metadata_fields(
                meta,
                maxsize=max_fieldsize,
                blacklist=blacklist)

            # assign
            # only ask each metadata extractor once, hence no conflict
            # possible
            loc_dict = contentmeta.get(loc, {})
            if meta:
                # do not store empty stuff
                loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain(
                    'datalad.metadata.generate-unique-{}'.format(
                        mtype_key.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()):
                # go through content metadata and inject report of unique
                # keys and values into `dsmeta`
                for k, v in meta.items():
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: list of samples data
                        # were recorded from. If the dataset has such under
                        # a 'sample' key, we should prefer that, over an
                        # aggregated list of a hopefully-kinda-ok structure
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})
            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the challenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we known if some unique value
            # was a list, as opposed to a list of unique values
            ucp[mtype_key] = {
                k: [dict(i) if isinstance(i, ReadOnlyDict) else i
                    for i in sorted(v, key=_unique_value_key)]
                for k, v in unique_cm.items()}
            dsmeta['datalad_unique_content_properties'] = ucp

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
class Add(Interface):
    r"""Add metadata to metadata model instance.

    This command reads metadata from a source and adds this metadata
    to a metadata model instance. A source can be: arguments, standard
    input, or a local file. The metadata format is a string with the
    JSON-serialized dictionary that describes the metadata

    [TODO: add a schema]

    If metadata is read from a source, parameter can overwrite or
    amend information that is stored in the source.

    The METADATA and the ADDITIONAL_VALUES arguments can be pre-fixed by
    '@', in which case the pre-fixed argument is interpreted as a file-name
    and the argument value is read from the file.
    """

    _examples_ = [
        dict(
            text='Add metadata stored in the file "metadata-123.json" to the '
                 'metadata model instance in the current directory.',
            code_cmd="datalad meta-add metadata-123.json"),
        dict(
            text='Add metadata stored in the file "metadata-123.json" to the '
                 'metadata stored in the git-repository "/home/user/dataset_0"',
            code_cmd="datalad meta-add --metadata-store /home/user/dataset_0 "
                     "metadata-123.json"),
        dict(
            text='Add metadata stored in the file "metadata-123.json" to the '
                 'metadata model instance in the current directory and '
                 'overwrite the "dataset_id" value stored in '
                 '"metadata-123.json"',
            code_cmd='datalad meta-add --metadata-store /home/user/dataset_0 '
                     'metadata-123.json \'{"dataset_id": '
                     '"00010203-1011-2021-3031-404142434445"}\''),
        dict(
            text='Add metadata read from standard input to the metadata model '
                 'instance in the current directory',
            # "-" as METADATA selects standard input (see `metadata` below)
            code_cmd='datalad meta-add -'),
        dict(
            text='Add metadata stored in the file "metadata-123.json" to the '
                 'metadata model instance in the current directory and '
                 'overwrite metadata values with the values stored in '
                 '"extra-info.json"',
            code_cmd='datalad meta-add --metadata-store /home/user/dataset_0 '
                     'metadata-123.json @extra-info.json')
    ]

    # keys that every metadata dictionary has to provide
    required_keys = (
        "type",
        "extractor_name",
        "extractor_version",
        "extraction_parameter",
        "extraction_time",
        "agent_name",
        "agent_email",
        "dataset_id",
        "dataset_version",
        "extracted_metadata")

    # key that marks a metadata dictionary as file-level metadata
    optional_keys = (
        "path",)

    # keys used to add metadata as a sub-dataset element; either all or
    # none of them are expected
    required_additional_keys = (
        "root_dataset_id",
        "root_dataset_version",
        "dataset_path")

    # one quoted key per line, for use in the parameter documentation
    required_keys_lines = "\n".join(map(repr, required_keys))
    required_additional_keys_lines = "\n".join(
        map(repr, required_additional_keys))

    _params_ = dict(
        metadata=Parameter(
            args=("metadata",),
            metavar="METADATA",
            doc="Path of a file that contains the metadata that should be "
                "added to the metadata model instance (the metadata must be "
                "provided as a JSON-serialized metadata dictionary). If the "
                'path is "-", metadata is read from standard input. The '
                "dictionary must contain the following keys: "
                f"{required_keys_lines} "
                "If the metadata is associated with a file, the following "
                "key indicates the file path: 'path' It may in addition "
                "contain either all or none of the following keys (they are "
                "used to add metadata element as a sub-dataset element, i.e. "
                "perform aggregation): "
                f"{required_additional_keys_lines} ",
            constraints=EnsureStr() | EnsureNone()),
        metadata_store=Parameter(
            args=("-m", "--metadata-store"),
            metavar="METADATA_STORE",
            doc="Directory in which the metadata model instance is stored. "
                "If no directory name is provided, the current working "
                "directory is used.",
            constraints=EnsureStr() | EnsureNone()),
        additionalvalues=Parameter(
            args=("additionalvalues",),
            metavar="ADDITIONAL_VALUES",
            doc="A string that contains a JSON serialized dictionary of key "
                "value-pairs. These key values-pairs are used in addition to "
                "the key value pairs in the metadata dictionary to describe "
                "the metadata that should be added. If an additional key is "
                "already present in the metadata, an error is raised, unless "
                "-o, --allow-override is provided. In this case, the "
                "additional values will override the value in metadata and a "
                "warning is issued.",
            nargs="?",
            constraints=EnsureStr() | EnsureNone()),
        allow_override=Parameter(
            args=("-o", "--allow-override"),
            doc="Allow the additional values to override values given in "
                "metadata.",
            default=False,
            constraints=EnsureBool() | EnsureNone()),
        allow_unknown=Parameter(
            args=("-u", "--allow-unknown"),
            doc="Allow unknown keys. By default, unknown keys generate an "
                "errors. If this switch is True, unknown keys will only be "
                "reported.",
            default=False,
            constraints=EnsureBool() | EnsureNone()))

    @staticmethod
    @datasetmethod(name="meta_add")
    @eval_results
    def __call__(
            metadata: Union[str, JSONObject],
            metadata_store: Optional[str] = None,
            additionalvalues: Optional[Union[str, JSONObject]] = None,
            allow_override: bool = False,
            allow_unknown: bool = False):
        """Add one metadata record to the metadata store.

        The record is read from `metadata`, combined with
        `additionalvalues`, and then added either as file-level metadata
        (if it has a "path") or as dataset-level metadata. Results of the
        respective add operation are yielded.

        The meaning of all arguments is documented in `_params_`.
        """
        additionalvalues = additionalvalues or {}
        metadata_store = Path(metadata_store or curdir)

        metadata = process_parameters(
            metadata=read_json_object(metadata),
            additional_values=get_json_object(additionalvalues),
            allow_override=allow_override,
            allow_unknown=allow_unknown)

        lgr.debug(f"attempting to add metadata: {json.dumps(metadata)}")

        # the root-dataset keys and "path" are optional in the record
        add_parameter = AddParameter(
            dataset_id=UUID(metadata["dataset_id"]),
            dataset_version=metadata["dataset_version"],
            file_path=(
                MetadataPath(metadata["path"])
                if "path" in metadata
                else None),

            root_dataset_id=(
                UUID(metadata["root_dataset_id"])
                if "root_dataset_id" in metadata
                else None),
            root_dataset_version=metadata.get("root_dataset_version"),
            dataset_path=MetadataPath(
                metadata.get("dataset_path", "")),

            extractor_name=metadata["extractor_name"],
            extractor_version=metadata["extractor_version"],
            extraction_time=metadata["extraction_time"],
            extraction_parameter=metadata["extraction_parameter"],
            agent_name=metadata["agent_name"],
            agent_email=metadata["agent_email"],

            extracted_metadata=metadata["extracted_metadata"])

        # If the key "path" is present in the metadata
        # dictionary, we assume that the metadata-dictionary describes
        # file-level metadata. Otherwise, we assume that the
        # metadata-dictionary contains dataset-level metadata.
        if add_parameter.file_path:
            yield from add_file_metadata(metadata_store, add_parameter)
        else:
            yield from add_dataset_metadata(metadata_store, add_parameter)
class CreateSiblingRia(Interface):
    """Creates a sibling to a dataset in a RIA store

    This creates a representation of a dataset in a ria-remote compliant
    storage location. For access to it two siblings are configured for the
    dataset by default. A "regular" one and a RIA remote (git-annex
    special remote). Furthermore, the former is configured to have a
    publication dependency on the latter. If not given a default name for
    the RIA remote is derived from the sibling's name by appending "-ria".

    The store's base path currently is expected to either:

      - not yet exist or
      - be empty or
      - have a valid `ria-layout-version` file and an `error_logs` directory.

    In the first two cases, said file and directory are created by this
    command. Alternatively you can manually create the third case, of course.
    Please note, that `ria-layout-version` needs to contain a line stating the
    version (currently '1') and optionally enable error logging (append '|l'
    in that case). Currently, this line MUST end with a newline!

    Error logging will create files in the `error_logs` directory whenever the
    RIA special remote (storage sibling) raises an exception, storing the
    python traceback of it. The logfiles are named according to the scheme
    <dataset id>.<annex uuid of the remote>.log showing 'who' ran into this
    issue with what dataset. Since this logging can potentially leak personal
    data (like local file paths for example) it can be disabled from the
    client side via `annex.ria-remote.<RIAREMOTE>.ignore-remote-config`.

    Todo
    ----
    Where to put the description of a RIA store (see below)?

    The targeted layout of such a store is a tree of datasets, starting at the
    configured base path. First level of subdirectories are named for the
    first three characters of the datasets' id, second level is the remainder
    of those ids. The thereby created dataset directories contain a bare git
    repository. Those bare repositories are slightly different from plain
    git-annex bare repositories in that they use the standard dirhashmixed
    layout beneath annex/objects as opposed to dirhashlower, which is
    git-annex's default for bare repositories. Furthermore, there is an
    additional directory 'archives' within the dataset directories, which may
    or may not contain archives with annexed content. Note, that this helps to
    reduce the number of inodes consumed (no checkout + potential archive) as
    well as it allows to resolve dependencies (that is (sub)datasets) merely
    by their id. Finally, there is a file `ria-layout-version` put beneath the
    store's base path, determining the version of the dataset tree layout and
    a file of the same name per each dataset directory determining object tree
    layout version (we already switch from dirhashlower to dirhashmixed for
    example) and an additional directory `error_logs` at the toplevel.
    """

    # TODO: description?
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to process. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        url=Parameter(
            args=("url",),
            metavar="ria+<ssh|file>://<host>[/path]",
            doc="""URL identifying the target RIA store and access protocol.
            """,
            constraints=EnsureStr() | EnsureNone()),
        name=Parameter(
            args=('-s', '--name',),
            metavar='NAME',
            doc="""Name of the sibling.
            With `recursive`, the same name will be used to label all
            the subdatasets' siblings.""",
            constraints=EnsureStr() | EnsureNone(),
            required=True),
        ria_remote_name=Parameter(
            args=("--ria-remote-name",),
            metavar="NAME",
            doc="""Name of the RIA remote (a git-annex special remote).
            Must not be identical to the sibling name. If not specified,
            defaults to the sibling name plus a '-ria' suffix.""",
            constraints=EnsureStr() | EnsureNone()),
        post_update_hook=Parameter(
            args=("--post-update-hook",),
            doc="""Enable git's default post-update-hook for the created
            sibling.""",
            action="store_true"),
        shared=Parameter(
            args=("--shared",),
            metavar='{false|true|umask|group|all|world|everybody|0xxx}',
            doc="""If given, configures the permissions in the
            RIA store for multi-users access.
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool() | EnsureNone()),
        group=Parameter(
            args=("--group",),
            metavar="GROUP",
            doc="""Filesystem group for the repository. Specifying the group is
            crucial when [CMD: --shared=group CMD][PY: shared="group" PY]""",
            constraints=EnsureStr() | EnsureNone()),
        ria_remote=Parameter(
            args=("--no-ria-remote",),
            dest='ria_remote',
            doc="""Whether to establish remote indexed archive (RIA) capabilties
            for the created sibling. If enabled, git-annex special remote access
            will be configured to enable regular git-annex key storage, and
            also retrieval of keys from (compressed) 7z archives that might be
            provided by the dataset store. If disabled, git-annex is instructed
            to ignore the sibling.""",
            action="store_false"),
        existing=Parameter(
            args=("--existing",),
            constraints=EnsureChoice(
                'skip', 'replace', 'error', 'reconfigure'),
            metavar='MODE',
            doc="""Action to perform, if a sibling or ria-remote is already
            configured under the given name and/or a target already exists.
            In this case, a dataset can be skipped ('skip'), an existing target
            directory be forcefully re-initialized, and the sibling
            (re-)configured ('replace', implies 'reconfigure'), the sibling
            configuration be updated only ('reconfigure'), or to error
            ('error').""", ),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='create_sibling_ria')
    @eval_results
    def __call__(url,
                 name,
                 dataset=None,
                 ria_remote_name=None,
                 post_update_hook=False,
                 shared=None,
                 group=None,
                 ria_remote=True,
                 existing='error',
                 recursive=False,
                 recursion_limit=None):

        ds = require_dataset(
            dataset, check_installed=True, purpose='create sibling RIA')
        # keyword arguments shared by every result record produced below
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError(
                "Repository at {} is not a DataLad dataset, "
                "run 'datalad create [--force]' first.".format(ds.path))

        if ria_remote:
            if not ria_remote_name:
                # derive the default special remote name from the sibling
                ria_remote_name = "{}-ria".format(name)
            if name == ria_remote_name:
                # leads to unresolvable, circular dependency with
                # publish-depends
                raise ValueError("sibling names must not be equal")
        elif ria_remote_name:
            lgr.warning(
                "RIA remote setup disabled, but a ria-remote name was provided"
            )

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since misconfiguration (particularly of special
        # remotes) only to fail in a subdataset later on with that config, can
        # be quite painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence
        #       of local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            # in recursive mode this check could take a substantial amount of
            # time: employ a progress bar (or rather a counter, because we
            # don't know the total in advance)
            pbar_id = 'check-siblings-{}'.format(id(ds))
            log_progress(
                lgr.info, pbar_id,
                'Start checking pre-existing sibling configuration %s', ds,
                label='Query siblings',
                unit=' Siblings',
            )
            # even if we have to fail, let's report all conflicting siblings
            # in subdatasets
            failed = False
            sibling_query = ds.siblings(
                result_renderer=None,
                recursive=recursive,
                recursion_limit=recursion_limit)
            for r in sibling_query:
                log_progress(
                    lgr.info, pbar_id,
                    'Discovered sibling %s in dataset at %s',
                    r['name'], r['path'],
                    update=1,
                    increment=True)
                if r['type'] != 'sibling' or r['status'] != 'ok':
                    # this is an internal status query that has not
                    # consequence for the outside world. Be silent unless
                    # something useful can be said
                    continue
                # decide which of the two requested names (if any) clashes
                # with the sibling that was just discovered
                if r['name'] == name:
                    clash = name
                elif ria_remote_name and r['name'] == ria_remote_name:
                    clash = ria_remote_name
                else:
                    continue
                failed = True
                yield get_status_dict(
                    status='error',
                    message="a sibling '{}' is already configured in "
                            "dataset {}".format(clash, r['path']),
                    **res_kwargs,
                )
            log_progress(
                lgr.info, pbar_id,
                'Finished checking pre-existing sibling configuration %s', ds,
            )
            if failed:
                return

        def _create_for(target_ds):
            # one sibling-creation run for a single dataset; all settings are
            # identical for the root dataset and any subdataset
            return _create_sibling_ria(
                target_ds,
                url,
                name,
                ria_remote,
                ria_remote_name,
                existing,
                shared,
                group,
                post_update_hook,
                res_kwargs)

        yield from _create_for(ds)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.
            for subds in ds.subdatasets(fulfilled=True,
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        result_xfm='datasets'):
                yield from _create_for(subds)
class Run(Interface):
    """Run an arbitrary shell command and record its impact on a dataset.

    It is recommended to craft the command such that it can run in the root
    directory of the dataset that the command will be recorded in. However,
    as long as the command is executed somewhere underneath the dataset root,
    the exact location will be recorded relative to the dataset root.

    If the executed command did not alter the dataset in any way, no record of
    the command execution is made.

    If the given command errors, a `CommandError` exception with the same exit
    code will be raised, and no modifications will be saved.

    *Command format*

    || REFLOW >>
    A few placeholders are supported in the command via Python format
    specification. "{pwd}" will be replaced with the full path of the current
    working directory. "{dspath}" will be replaced with the full path of the
    dataset that run is invoked on. "{tmpdir}" will be replaced with the full
    path of a temporary directory. "{inputs}" and "{outputs}" represent the
    values specified by [CMD: --input and --output CMD][PY: `inputs` and
    `outputs` PY]. If multiple values are specified, the values will be joined
    by a space. The order of the values will match that order from the command
    line, with any globs expanded in alphabetical order (like bash). Individual
    values can be accessed with an integer index (e.g., "{inputs[0]}").
    << REFLOW ||

    || REFLOW >>
    Note that the representation of the inputs or outputs in the formatted
    command string depends on whether the command is given as a list of
    arguments or as a string[CMD:  (quotes surrounding the command) CMD]. The
    concatenated list of inputs or outputs will be surrounded by quotes when
    the command is given as a list but not when it is given as a string. This
    means that the string form is required if you need to pass each input as a
    separate argument to a preceding script (i.e., write the command as
    "./script {inputs}", quotes included). The string form should also be used
    if the input or output paths contain spaces or other characters that need
    to be escaped.
    << REFLOW ||

    To escape a brace character, double it (i.e., "{{" or "}}").

    Custom placeholders can be added as configuration variables under
    "datalad.run.substitutions".  As an example:

      Add a placeholder "name" with the value "joe"::

        % git config --file=.datalad/config datalad.run.substitutions.name joe
        % datalad add -m "Configure name placeholder" .datalad/config

      Access the new placeholder in a command::

        % datalad run "echo my name is {name} >me"
    """

    # usage examples, each given for both the Python API and the command line
    _examples_ = [
        dict(
            text="Run an executable script and record the impact on a dataset",
            code_py="run(message='run my script', cmd='code/script.sh')",
            code_cmd="datalad run -m 'run my script' 'code/script.sh'"),
        dict(
            text="Run a command and specify a directory as a dependency "
                 "for the run. The contents of the dependency will be retrieved "
                 "prior to running the script",
            code_cmd="datalad run -m 'run my script' -i 'data/*' "
                     "'code/script.sh'",
            code_py="""\
            run(cmd='code/script.sh', message='run my script',
                inputs=['data/*'])"""),
        dict(
            text="Run an executable script and specify output files of the "
                 "script to be unlocked prior to running the script",
            code_py="""\
            run(cmd='code/script.sh', message='run my script',
                inputs=['data/*'], outputs=['output_dir'])""",
            code_cmd="""\
            datalad run -m 'run my script' -i 'data/*' \\
            -o 'output_dir/*' 'code/script.sh'"""),
        dict(
            text="Specify multiple inputs and outputs",
            code_py="""\
            run(cmd='code/script.sh',
                message='run my script',
                inputs=['data/*', 'datafile.txt'],
                outputs=['output_dir', 'outfile.txt'])""",
            code_cmd="""\
            datalad run -m 'run my script' -i 'data/*' \\
            -i 'datafile.txt' -o 'output_dir/*' -o \\
            'outfile.txt' 'code/script.sh'"""),
    ]

    _params_ = dict(
        cmd=Parameter(
            args=("cmd",),
            nargs=REMAINDER,
            metavar='COMMAND',
            doc="""command for execution. A leading '--' can be used to
            disambiguate this command from the preceding options to
            DataLad."""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to record the command results in.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        inputs=Parameter(
            args=("-i", "--input"),
            dest="inputs",
            metavar="PATH",
            action='append',
            doc="""A dependency for the run. Before running the command, the
            content of this file will be retrieved. A value of "." means "run
            :command:`datalad get .`". The value can also be a glob. [CMD: This
            option can be given more than once. CMD]"""),
        outputs=Parameter(
            args=("-o", "--output"),
            dest="outputs",
            metavar="PATH",
            action='append',
            doc="""Prepare this file to be an output file of the command. A
            value of "." means "run :command:`datalad unlock .`" (and will fail
            if some content isn't present). For any other value, if the content
            of this file is present, unlock the file. Otherwise, remove it. The
            value can also be a glob. [CMD: This option can be given more than
            once. CMD]"""),
        expand=Parameter(
            args=("--expand",),
            doc="""Expand globs when storing inputs and/or outputs in the
            commit message.""",
            constraints=EnsureChoice(None, "inputs", "outputs", "both")),
        explicit=Parameter(
            args=("--explicit",),
            action="store_true",
            doc="""Consider the specification of inputs and outputs to be
            explicit. Don't warn if the repository is dirty, and only save
            modifications to the listed outputs."""),
        message=save_message_opt,
        sidecar=Parameter(
            args=('--sidecar',),
            metavar="{yes|no}",
            doc="""By default, the configuration variable
            'datalad.run.record-sidecar' determines whether a record with
            information on a command's execution is placed into a separate
            record file instead of the commit message (default: off). This
            option can be used to override the configured behavior on a
            case-by-case basis. Sidecar files are placed into the dataset's
            '.datalad/runinfo' directory (customizable via the
            'datalad.run.record-directory' configuration variable).""",
            constraints=EnsureNone() | EnsureBool()),
    )

    @staticmethod
    @datasetmethod(name='run')
    @eval_results
    def __call__(
            cmd=None,
            dataset=None,
            inputs=None,
            outputs=None,
            expand=None,
            explicit=False,
            message=None,
            sidecar=None):
        # thin front-end: all work happens in run_command(); its result
        # records are relayed unchanged
        yield from run_command(
            cmd,
            dataset=dataset,
            inputs=inputs,
            outputs=outputs,
            expand=expand,
            explicit=explicit,
            message=message,
            sidecar=sidecar)
class WebApp(Interface):
    """Serve a dataset through a Flask-based web application

    Static files are served from `static_root`, or from the directory
    provided by a webapp registered under the 'datalad.webapp.apps' entry
    point group. Every resource registered under the
    'datalad.webapp.resources' entry point group is exposed below the
    '/api/v1' URL prefix.

    THIS IS NOT A PRODUCTION-READY TOOL: only use it in a trusted
    environment and do not expose the service on public network interfaces.
    """
    _params_ = dict(
        app=Parameter(
            args=('app',),
            nargs='?',
            metavar='APPNAME',
            doc="""Name of a registered webapp to start"""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to serve as the anchor of the webapp.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        read_only=Parameter(
            args=("--read-only",),
            constraints=EnsureBool(),
            doc="""do not perform operations other then read-only access
            to dataset. It is up to the individual resources to interpret
            this flag and act accordingly."""),
        mode=Parameter(
            args=("--mode",),
            constraints=EnsureChoice('normal', 'daemon', 'dry-run', 'debug'),
            doc="""Execution mode: regular foreground process (normal);
            background process (daemon); no server is started, but all
            configuration is perform (dry-run); like normal, but in debug
            mode (debug)"""),
        static_root=Parameter(
            args=("--static-root",),
            doc="""path to static (HTML) files that should be served in
            root of the webapp. Defaults to the current directory."""),
        get_apps=Parameter(
            args=('--get-apps',),
            action='store_true',
            doc="""if set, yields all registered webapp."""),
    )

    @staticmethod
    @datasetmethod(name='webapp')
    @eval_results
    def __call__(app=None, dataset=None, read_only=False, mode='normal',
                 static_root=None, get_apps=False):
        """Configure and (unless in 'dry-run' mode) start the webapp server.

        Yields result records (dicts). With `get_apps`, one record per
        registered webapp is yielded and nothing is served. In 'dry-run'
        mode a single 'ok' record carrying the configured Flask application
        under the key 'app' is yielded and no server is started. Error
        records are yielded when the requested `app` is not registered or
        its entry point does not resolve to a directory.
        """
        # NOTE(review): `read_only` is accepted but not used anywhere in this
        # function, and it is not forwarded to the resources -- confirm how
        # resources are supposed to learn about it.
        if get_apps:
            for ep in iter_entry_points('datalad.webapp.apps'):
                yield dict(
                    action='webapp',
                    status='ok'
                    if resource_isdir(ep.module_name, ep.load())
                    else 'error',
                    path=ep.name,
                    logger=lgr,
                    message=("provided by '%s'", ep.module_name))
            return

        from datalad.distribution.dataset import require_dataset
        dataset = require_dataset(
            dataset, check_installed=True, purpose='serving')

        if static_root is None and app:
            # resolve the static file location from the registered webapp
            for ep in iter_entry_points('datalad.webapp.apps'):
                if ep.name != app:
                    continue
                # load the entry point once and reuse the result
                resource = ep.load()
                app_path = resource_filename(ep.module_name, resource)
                if not resource_isdir(ep.module_name, resource):
                    yield dict(
                        action='webapp',
                        status='error',
                        path=dataset.path,
                        # one placeholder per argument, otherwise
                        # %-interpolation of the message fails
                        message=(
                            "app entrypoint '%s' does not point to "
                            "directory: %s",
                            app, app_path))
                    return
                static_root = app_path
                break
            if static_root is None:
                yield dict(
                    action='webapp',
                    status='error',
                    path=dataset.path,
                    message=("no registered webapp with name '%s'", app))
                return
        elif static_root is None:
            static_root = op.curdir

        from flask import Flask
        # use a dedicated name for the Flask object so that it does not
        # shadow the `app` (webapp name) argument
        webapp = Flask(
            __name__,
            root_path=dataset.path,
            static_url_path='',
            static_folder=op.abspath(static_root),
        )
        webapp.secret_key = os.urandom(64)
        # expose via arg
        webapp.config['api_key'] = 'dummy'
        webapp_props['config'] = webapp.config

        from flask_restful import Api
        api = Api(webapp, prefix="/api/v1")

        # TODO add default route to static index.html, if one exists
        # TODO use opt-in model for endpoints to limit exposure of
        # functionality to what is really needed
        for ep in iter_entry_points('datalad.webapp.resources'):
            # Logger.warn() is a deprecated alias of Logger.warning()
            lgr.warning("Available webapp resource'%s'", ep.name)
            cls = ep.load()
            urls = ['/{}'.format(ep.name)]
            if hasattr(cls, '_urlarg_spec'):
                urls.append('/{}/{}'.format(ep.name, cls._urlarg_spec))

            api.add_resource(
                cls,
                *urls,
                resource_class_kwargs=dict(
                    dataset=dataset,
                )
            )

        if op.exists(op.join(static_root, 'index.html')):
            from flask import send_from_directory

            @webapp.route('/')
            def serve_index():
                return send_from_directory(static_root, 'index.html')

        if mode == 'dry-run':
            # hand the fully configured application to the caller without
            # starting a server
            yield dict(
                action='webapp',
                status='ok',
                app=webapp,
                path=dataset.path,
            )
            return

        print("""
*************************************************
*************************************************

      THIS IS NOT A PRODUCTION-READY TOOL

      - only use in a trusted environment
      - do not expose service on public network
        interfaces

*************************************************
*************************************************
""")
        # TODO expose flags, or use FLASK config vars
        webapp.run(debug=mode == 'debug')
}, 'datalad.exc.str.tblimit': { 'ui': ('question', { 'title': 'This flag is used by datalad to cap the number of traceback steps included in exception logging and result reporting to DATALAD_EXC_STR_TBLIMIT of pre-processed entries from traceback.' }), }, 'datalad.fake-dates': { 'ui': ('yesno', { 'title': 'Fake (anonymize) dates', 'text': 'Should the dates in the logs be faked?' }), 'destination': 'local', 'type': EnsureBool(), 'default': False, }, 'datalad.fake-dates-start': { 'ui': ('question', { 'title': 'Initial fake date', 'text': 'When faking dates and there are no commits in any local branches, generate the date by adding one second to this value (Unix epoch time). The value must be positive.' }), 'type': EnsureInt(), 'default': 1112911993, },
class CreateSibling(Interface): """Create a dataset sibling on a UNIX-like SSH-accessible machine Given a local dataset, and SSH login information this command creates a remote dataset repository and configures it as a dataset sibling to be used as a publication target (see `publish` command). Various properties of the remote sibling can be configured (e.g. name location on the server, read and write access URLs, and access permissions. Optionally, a basic web-viewer for DataLad datasets can be installed at the remote location. This command supports recursive processing of dataset hierarchies, creating a remote sibling for each dataset in the hierarchy. By default, remote siblings are created in hierarchical structure that reflects the organization on the local file system. However, a simple templating mechanism is provided to produce a flat list of datasets (see --target-dir). """ # XXX prevent common args from being added to the docstring _no_eval_results = True _params_ = dict( # TODO: Figure out, whether (and when) to use `sshurl` as push url dataset=Parameter( args=("--dataset", "-d",), doc="""specify the dataset to create the publication target for. If no dataset is given, an attempt is made to identify the dataset based on the current working directory""", constraints=EnsureDataset() | EnsureNone()), sshurl=Parameter( args=("sshurl",), metavar='SSHURL', nargs='?', doc="""Login information for the target server. This can be given as a URL (ssh://host/path) or SSH-style (user@host:path). Unless overridden, this also serves the future dataset's access URL and path on the server.""", constraints=EnsureStr()), name=Parameter( args=('-s', '--name',), metavar='NAME', doc="""sibling name to create for this publication target. If `recursive` is set, the same name will be used to label all the subdatasets' siblings. 
When creating a target dataset fails, no sibling is added""", constraints=EnsureStr() | EnsureNone(), nargs="?"), target_dir=Parameter( args=('--target-dir',), metavar='PATH', doc="""path to the directory *on the server* where the dataset shall be created. By default the SSH access URL is used to identify this directory. If a relative path is provided here, it is interpreted as being relative to the user's home directory on the server.\n Additional features are relevant for recursive processing of datasets with subdatasets. By default, the local dataset structure is replicated on the server. However, it is possible to provide a template for generating different target directory names for all (sub)datasets. Templates can contain certain placeholder that are substituted for each (sub)dataset. For example: "/mydirectory/dataset%%RELNAME".\nSupported placeholders:\n %%RELNAME - the name of the datasets, with any slashes replaced by dashes\n""", constraints=EnsureStr() | EnsureNone()), target_url=Parameter( args=('--target-url',), metavar='URL', doc=""""public" access URL of the to-be-created target dataset(s) (default: `sshurl`). Accessibility of this URL determines the access permissions of potential consumers of the dataset. As with `target_dir`, templates (same set of placeholders) are supported. Also, if specified, it is provided as the annex description\n""", constraints=EnsureStr() | EnsureNone()), target_pushurl=Parameter( args=('--target-pushurl',), metavar='URL', doc="""In case the `target_url` cannot be used to publish to the dataset, this option specifies an alternative URL for this purpose. 
As with `target_url`, templates (same set of placeholders) are supported.\n""", constraints=EnsureStr() | EnsureNone()), recursive=recursion_flag, recursion_limit=recursion_limit, existing=Parameter( args=("--existing",), constraints=EnsureChoice('skip', 'replace', 'error', 'reconfigure'), metavar='MODE', doc="""action to perform, if a sibling is already configured under the given name and/or a target directory already exists. In this case, a dataset can be skipped ('skip'), an existing target directory be forcefully re-initialized, and the sibling (re-)configured ('replace', implies 'reconfigure'), the sibling configuration be updated only ('reconfigure'), or to error ('error').""",), inherit=inherit_opt, shared=Parameter( args=("--shared",), metavar='{false|true|umask|group|all|world|everybody|0xxx}', doc="""if given, configures the access permissions on the server for multi-users (this could include access by a webserver!). Possible values for this option are identical to those of `git init --shared` and are described in its documentation.""", constraints=EnsureStr() | EnsureBool() | EnsureNone()), group=Parameter( args=("--group",), metavar="GROUP", doc="""Filesystem group for the repository. Specifying the group is particularly important when [CMD: --shared=group CMD][PY: shared="group" PY]""", constraints=EnsureStr() | EnsureNone() ), ui=Parameter( args=("--ui",), metavar='{false|true|html_filename}', doc="""publish a web interface for the dataset with an optional user-specified name for the html at publication target. 
defaults to `index.html` at dataset root""", constraints=EnsureBool() | EnsureStr()), as_common_datasrc=as_common_datasrc, publish_depends=publish_depends, publish_by_default=publish_by_default, annex_wanted=annex_wanted_opt, annex_group=annex_group_opt, annex_groupwanted=annex_groupwanted_opt, since=Parameter( args=("--since",), constraints=EnsureStr() | EnsureNone(), doc="""limit processing to datasets that have been changed since a given state (by tag, branch, commit, etc). This can be used to create siblings for recently added subdatasets."""), ) @staticmethod @datasetmethod(name='create_sibling') @eval_results def __call__(sshurl, name=None, target_dir=None, target_url=None, target_pushurl=None, dataset=None, recursive=False, recursion_limit=None, existing='error', shared=None, group=None, ui=False, as_common_datasrc=None, publish_by_default=None, publish_depends=None, annex_wanted=None, annex_group=None, annex_groupwanted=None, inherit=False, since=None): # # nothing without a base dataset # ds = require_dataset(dataset, check_installed=True, purpose='creating a sibling') refds_path = ds.path # # all checks that are possible before we start parsing the dataset # # possibly use sshurl to get the name in case if not specified if not sshurl: if not inherit: raise InsufficientArgumentsError( "needs at least an SSH URL, if no inherit option" ) if name is None: raise ValueError( "Neither SSH URL, nor the name of sibling to inherit from " "was specified" ) # It might well be that we already have this remote setup try: sshurl = CreateSibling._get_remote_url(ds, name) except Exception as exc: lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc)) elif inherit: raise ValueError( "For now, for clarity not allowing specifying a custom sshurl " "while inheriting settings" ) # may be could be safely dropped -- still WiP if not sshurl: # TODO: may be more back up before _prep? 
super_ds = ds.get_superdataset() if not super_ds: raise ValueError( "Could not determine super dataset for %s to inherit URL" % ds ) super_url = CreateSibling._get_remote_url(super_ds, name) # for now assuming hierarchical setup # (TODO: to be able to destinguish between the two, probably # needs storing datalad.*.target_dir to have %RELNAME in there) sshurl = slash_join(super_url, relpath(ds.path, super_ds.path)) # check the login URL sshri = RI(sshurl) if not is_ssh(sshri): raise ValueError( "Unsupported SSH URL: '{0}', " "use ssh://host/path or host:path syntax".format(sshurl)) if not name: # use the hostname as default remote name name = sshri.hostname lgr.debug( "No sibling name given, use URL hostname '%s' as sibling name", name) if since == '': # consider creating siblings only since the point of # the last update # XXX here we assume one to one mapping of names from local branches # to the remote active_branch = ds.repo.get_active_branch() since = '%s/%s' % (name, active_branch) # # parse the base dataset to find all subdatasets that need processing # to_process = [] for ap in AnnotatePaths.__call__( dataset=refds_path, # only a single path! path=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='create_sibling', # both next should not happen anyways unavailable_path_status='impossible', nondataset_path_status='error', modified=since, return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent': # this can happen when there is `since`, but we have no # use for anything but datasets here continue checkds_remotes = Dataset(ap['path']).repo.get_remotes() \ if ap.get('state', None) != 'absent' \ else [] if publish_depends: # make sure dependencies are valid # TODO: inherit -- we might want to automagically create # those dependents as well??? 
unknown_deps = set(assure_list(publish_depends)).difference(checkds_remotes) if unknown_deps: ap['status'] = 'error' ap['message'] = ( 'unknown sibling(s) specified as publication dependency: %s', unknown_deps) yield ap continue if name in checkds_remotes and existing in ('error', 'skip'): ap['status'] = 'error' if existing == 'error' else 'notneeded' ap['message'] = ( "sibling '%s' already configured (specify alternative name, or force " "reconfiguration via --existing", name) yield ap continue to_process.append(ap) if not to_process: # we ruled out all possibilities # TODO wait for gh-1218 and make better return values lgr.info("No datasets qualify for sibling creation. " "Consider different settings for --existing " "or --since if this is unexpected") return if target_dir is None: if sshri.path: target_dir = sshri.path else: target_dir = '.' # TODO: centralize and generalize template symbol handling replicate_local_structure = "%RELNAME" not in target_dir # request ssh connection: lgr.info("Connecting ...") assert(sshurl is not None) # delayed anal verification ssh = ssh_manager.get_connection(sshurl) if not ssh.get_annex_version(): raise MissingExternalDependency( 'git-annex', msg='on the remote system') # # all checks done and we have a connection, now do something # # loop over all datasets, ordered from top to bottom to make test # below valid (existing directories would cause the machinery to halt) # But we need to run post-update hook in depth-first fashion, so # would only collect first and then run (see gh #790) yielded = set() remote_repos_to_run_hook_for = [] for currentds_ap in \ sorted(to_process, key=lambda x: x['path'].count('/')): current_ds = Dataset(currentds_ap['path']) path = _create_dataset_sibling( name, current_ds, ds.path, ssh, replicate_local_structure, sshri, target_dir, target_url, target_pushurl, existing, shared, group, publish_depends, publish_by_default, ui, as_common_datasrc, annex_wanted, annex_group, annex_groupwanted, inherit ) 
if not path: # nothing new was created # TODO is 'notneeded' appropriate in this case? currentds_ap['status'] = 'notneeded' # TODO explain status in 'message' yield currentds_ap yielded.add(currentds_ap['path']) continue remote_repos_to_run_hook_for.append((path, currentds_ap)) # publish web-interface to root dataset on publication server if current_ds.path == ds.path and ui: lgr.info("Uploading web interface to %s" % path) try: CreateSibling.upload_web_interface(path, ssh, shared, ui) except CommandError as e: currentds_ap['status'] = 'error' currentds_ap['message'] = ( "failed to push web interface to the remote datalad repository (%s)", exc_str(e)) yield currentds_ap yielded.add(currentds_ap['path']) continue # in reverse order would be depth first lgr.info("Running post-update hooks in all created siblings") # TODO: add progressbar for path, currentds_ap in remote_repos_to_run_hook_for[::-1]: # Trigger the hook lgr.debug("Running hook for %s (if exists and executable)", path) try: ssh("cd {} " "&& ( [ -x hooks/post-update ] && hooks/post-update || : )" "".format(sh_quote(_path_(path, ".git")))) except CommandError as e: currentds_ap['status'] = 'error' currentds_ap['message'] = ( "failed to run post-update hook under remote path %s (%s)", path, exc_str(e)) yield currentds_ap yielded.add(currentds_ap['path']) continue if not currentds_ap['path'] in yielded: # if we were silent until now everything is just splendid currentds_ap['status'] = 'ok' yield currentds_ap @staticmethod def _run_on_ds_ssh_remote(ds, name, ssh, cmd): """Given a dataset, and name of the remote, run command via ssh Parameters ---------- cmd: str Will be .format()'ed given the `path` to the dataset on remote Returns ------- out Raises ------ CommandError """ remote_url = CreateSibling._get_remote_url(ds, name) remote_ri = RI(remote_url) out, err = ssh(cmd.format(path=sh_quote(remote_ri.path))) if err: lgr.warning("Got stderr while calling ssh: %s", err) return out @staticmethod def 
_get_ds_remote_shared_setting(ds, name, ssh): """Figure out setting of sharedrepository for dataset's `name` remote""" shared = None try: # TODO -- we might need to expanduser taking .user into account # but then it must be done also on remote side out = CreateSibling._run_on_ds_ssh_remote( ds, name, ssh, 'git -C {path} config --get core.sharedrepository' ) shared = out.strip() except CommandError as e: lgr.debug( "Could not figure out remote shared setting of %s for %s due " "to %s", ds, name, exc_str(e) ) # could well be ok if e.g. not shared # TODO: more detailed analysis may be? return shared @staticmethod def _has_active_postupdate(ds, name, ssh): """Figure out either has active post-update hook Returns ------- bool or None None if something went wrong and we could not figure out """ has_active_post_update = None try: # TODO -- we might need to expanduser taking .user into account # but then it must be done also on remote side out = CreateSibling._run_on_ds_ssh_remote( ds, name, ssh, 'cd {path} && [ -x .git/hooks/post-update ] && echo yes || echo no' ) out = out.strip() assert out in ('yes', 'no') has_active_post_update = out == "yes" except CommandError as e: lgr.debug( "Could not figure out either %s on remote %s has active " "post_update hook due to %s", ds, name, exc_str(e) ) return has_active_post_update @staticmethod def _get_remote_url(ds, name): """A little helper to get url from pushurl or from url if not defined""" # take pushurl if present, if not -- just a url url = ds.config.get('remote.%s.pushurl' % name) or \ ds.config.get('remote.%s.url' % name) if not url: raise ValueError( "%s had neither pushurl or url defined for %s" % (ds, name) ) return url @staticmethod def init_remote_repo(path, ssh, shared, dataset, description=None): cmd = "git -C {} init{}".format( sh_quote(path), " --shared='{}'".format(sh_quote(shared)) if shared else '') try: ssh(cmd) except CommandError as e: lgr.error("Initialization of remote git repository failed at %s." 
"\nError: %s\nSkipping ..." % (path, exc_str(e))) return False if isinstance(dataset.repo, AnnexRepo): # init remote git annex repo (part fix of #463) try: ssh( "git -C {} annex init {}".format( sh_quote(path), sh_quote(description) if description else '') ) except CommandError as e: lgr.error("Initialization of remote git annex repository failed at %s." "\nError: %s\nSkipping ..." % (path, exc_str(e))) return False return True @staticmethod def create_postupdate_hook(path, ssh, dataset): # location of post-update hook file, logs folder on remote target hooks_remote_dir = opj(path, '.git', 'hooks') # make sure hooks directory exists (see #1251) ssh('mkdir -p {}'.format(sh_quote(hooks_remote_dir))) hook_remote_target = opj(hooks_remote_dir, 'post-update') # create json command for current dataset log_filename = 'datalad-publish-hook-$(date +%s).log' % TIMESTAMP_FMT hook_content = r'''#!/bin/bash git update-server-info # # DataLad # # (Re)generate meta-data for DataLad Web UI and possibly init new submodules dsdir="$(dirname $0)/../.." logfile="$dsdir/{WEB_META_LOG}/{log_filename}" if [ ! 
-e "$dsdir/.git" ]; then echo Assumption of being under .git has failed >&2 exit 1 fi mkdir -p "$dsdir/{WEB_META_LOG}" # assure logs directory exists ( which datalad > /dev/null \ && ( cd "$dsdir"; GIT_DIR="$PWD/.git" datalad ls -a --json file .; ) \ || echo "E: no datalad found - skipping generation of indexes for web frontend"; \ ) &> "$logfile" '''.format(WEB_META_LOG=WEB_META_LOG, **locals()) with make_tempfile(content=hook_content) as tempf: # create post_update hook script # upload hook to dataset ssh.put(tempf, hook_remote_target) # and make it executable ssh('chmod +x {}'.format(sh_quote(hook_remote_target))) @staticmethod def upload_web_interface(path, ssh, shared, ui): # path to web interface resources on local webui_local = opj(dirname(datalad.__file__), 'resources', 'website') # local html to dataset html_local = opj(webui_local, "index.html") # name and location of web-interface html on target html_targetname = {True: ui, False: "index.html"}[isinstance(ui, str)] html_target = opj(path, html_targetname) # upload ui html to target ssh.put(html_local, html_target) # upload assets to the dataset webresources_local = opj(webui_local, 'assets') webresources_remote = opj(path, WEB_HTML_DIR) ssh('mkdir -p {}'.format(sh_quote(webresources_remote))) ssh.put(webresources_local, webresources_remote, recursive=True) # minimize and upload js assets for js_file in glob(opj(webresources_local, 'js', '*.js')): with open(js_file) as asset: try: from jsmin import jsmin # jsmin = lambda x: x # no minimization minified = jsmin(asset.read()) # minify asset except ImportError: lgr.warning( "Will not minify web interface javascript, no jsmin available") minified = asset.read() # no minify available with make_tempfile(content=minified) as tempf: # write minified to tempfile js_name = js_file.split('/')[-1] ssh.put(tempf, opj(webresources_remote, 'assets', 'js', js_name)) # and upload js # explicitly make web+metadata dir of dataset world-readable, if shared set to 'all' mode 
= None if shared in (True, 'true', 'all', 'world', 'everybody'): mode = 'a+rX' elif shared == 'group': mode = 'g+rX' elif str(shared).startswith('0'): mode = shared if mode: ssh('chmod {} -R {} {}'.format( mode, sh_quote(dirname(webresources_remote)), sh_quote(opj(path, 'index.html'))))
class Subdatasets(Interface):
    """Report subdatasets and their properties.

    The following properties are reported (if possible) for each matching
    subdataset record.

    "name"
        Name of the subdataset in the parent (often identical with the
        relative path in the parent dataset)

    "path"
        Absolute path to the subdataset

    "parentds"
        Absolute path to the parent dataset

    "revision"
        SHA1 of the subdataset commit recorded in the parent dataset

    "state"
        Condition of the subdataset: 'clean', 'modified', 'absent', 'conflict'
        as reported by `git submodule`

    "revision_descr"
        Output of `git describe` for the subdataset

    "gitmodule_url"
        URL of the subdataset recorded in the parent

    "gitmodule_<label>"
        Any additional configuration property on record.

    Performance note: Requesting `bottomup` reporting order, or a particular
    numerical `recursion_limit` implies an internal switch to an alternative
    query implementation for recursive query that is more flexible, but also
    notably slower (performs one call to Git per dataset versus a single call
    for all combined).

    """
    # Command parameter specification; the insertion order matches the
    # signature of `__call__` below.
    _params_ = {
        'dataset': Parameter(
            args=("-d", "--dataset"),
            doc=(
                "specify the dataset to query. If no dataset is given, an "
                "attempt is made to identify the dataset based on the input "
                "and/or the current working directory"),
            constraints=EnsureDataset() | EnsureNone()),
        'fulfilled': Parameter(
            args=("--fulfilled",),
            doc=(
                "if given, must be a boolean flag indicating whether to "
                "report either only locally present or absent datasets. By "
                "default subdatasets are reported regardless of their "
                "status"),
            constraints=EnsureBool() | EnsureNone()),
        'recursive': recursion_flag,
        'recursion_limit': recursion_limit,
        'contains': Parameter(
            args=('--contains',),
            metavar='PATH',
            doc=(
                "limit report to the subdatasets containing the given path. "
                "If a root path of a subdataset is given the last reported "
                "dataset will be the subdataset itself."),
            constraints=EnsureStr() | EnsureNone()),
        'bottomup': Parameter(
            args=("--bottomup",),
            action="store_true",
            doc=(
                "whether to report subdatasets in bottom-up order along "
                "each branch in the dataset tree, and not top-down.")),
        'set_property': Parameter(
            args=('--set-property',),
            metavar='VALUE',
            nargs=2,
            action='append',
            doc=(
                "Name and value of one or more subdataset properties to be "
                "set in the parent dataset's .gitmodules file. The value "
                "can be a Python format() template string wrapped in '<>' "
                "(e.g. '<{gitmodule_name}>'). Supported keywords are any "
                "item reported in the result properties of this command, "
                "plus 'refds_relpath' and 'refds_relname': the relative "
                "path of a subdataset with respect to the base dataset of "
                "the command call, and, in the latter case, the same string "
                "with all directory separators replaced by dashes.[CMD: "
                "This option can be given multiple times. CMD]"),
            constraints=EnsureStr() | EnsureNone()),
        'delete_property': Parameter(
            args=('--delete-property',),
            metavar='NAME',
            action='append',
            doc=(
                "Name of one or more subdataset properties to be removed "
                "from the parent dataset's .gitmodules file.[CMD: This "
                "option can be given multiple times. CMD]"),
            constraints=EnsureStr() | EnsureNone()),
    }

    @staticmethod
    @datasetmethod(name='subdatasets')
    @eval_results
    def __call__(
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        # Generator of one result record per matching subdataset.
        #
        # Two query strategies exist:
        #  * a fast one (single pass over `_parse_git_submodules()`), used
        #    for plain top-down listings without filtering or modification
        #  * a slower, more flexible one (`_get_submodules()`), used for
        #    everything else and as a fallback if the fast one raises
        #    InvalidGitRepositoryError
        ds = require_dataset(
            dataset,
            check_installed=False,
            purpose='subdataset reporting/modification')
        refds_path = ds.path

        # XXX this seems strange, but is tested to be the case -- I'd rather
        # set `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # a non-positive recursion limit cannot yield anything, leave early
        if isinstance(recursion_limit, int) and recursion_limit <= 0:
            return

        # any of these requests is beyond what the fast implementation can do
        limited_recursion = recursive and recursion_limit is not None
        needs_flexible_query = (
            bottomup or contains or set_property or delete_property
            or limited_recursion)

        try:
            if not needs_flexible_query:
                # FAST IMPLEMENTATION FOR THE STRAIGHTFORWARD CASE
                # (just a single call to Git)
                # stack of candidate parents, the innermost one comes last
                parents = [refds_path]
                # parsed .gitmodules content, keyed by parent dataset path
                gitmodules = {}
                for submod in _parse_git_submodules(
                        refds_path, recursive=recursive):
                    # drop parents until one contains this submodule;
                    # this assumes that submodules come sorted
                    while not submod['path'].startswith(
                            _with_sep(parents[-1])):
                        parents.pop()
                    parentds = parents[-1]
                    if parentds not in gitmodules:
                        # first encounter of this parent, read its .gitmodules
                        gitmodules[parentds] = _parse_gitmodules(parentds)
                    # amend with URL info, etc.
                    submod.update(
                        gitmodules[parentds].get(submod['path'], {}))
                    res = get_status_dict(
                        'subdataset',
                        status='ok',
                        type='dataset',
                        refds=refds_path,
                        logger=lgr)
                    res.update(submod)
                    res['parentds'] = parentds
                    if fulfilled is None or \
                            GitRepo.is_valid_repo(submod['path']) == fulfilled:
                        yield res
                    # this subdataset may be the parent of the next one
                    parents.append(submod['path'])
                # MUST RETURN: whatever follows is the other implementation
                return
        except InvalidGitRepositoryError as e:
            lgr.debug("fast subdataset query failed, trying slow robust one (%s)",
                      exc_str(e))

        # MORE ROBUST, FLEXIBLE, BUT SLOWER IMPLEMENTATION
        # (one Git call per dataset), but deals with subdatasets in
        # direct mode
        if contains:
            contains = resolve_path(contains, ds)
        for res in _get_submodules(
                ds.path, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            res['refds'] = refds_path
            yield res
class AnnotatePaths(Interface):
    """Analyze and act upon input paths

    Given paths (or more generally location requests) are inspected and
    annotated with a number of properties. A list of recognized properties
    is provided below.

    || PYTHON >>Input `paths` for this command can either be un-annotated
    (raw) path strings, or already (partially) annotated paths. In the latter
    case, further annotation is limited to yet-unknown properties, and is
    potentially faster than initial annotation.<< PYTHON ||

    *Recognized path properties*

    {proplist}

    In the case of enabled modification detection the results may contain
    additional properties regarding the nature of the modification. See the
    documentation of the `diff` command for details.

    """
    # substitution for the `{proplist}` placeholder in the class docstring:
    # one paragraph per known property, sorted by property name
    _docs_ = dict(
        proplist='\n\n '.join(
            '"{}"\n{}'.format(
                k,
                textwrap.fill(known_props[k],
                              initial_indent=' ',
                              subsequent_indent=' '))
            for k in sorted(known_props)))

    _params_ = dict(
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path to be annotated""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""an optional reference/base dataset for the paths""",
            constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        action=Parameter(
            args=("--action",),
            metavar="LABEL",
            doc="""an "action" property value to include in the path annotation""",
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_status=Parameter(
            args=("--unavailable-path-status",),
            metavar="LABEL",
            doc=(
                'a "status" property value to include in the annotation '
                'for paths that are underneath a dataset, but do not exist '
                'on the filesystem'),
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_msg=Parameter(
            args=("--unavailable-path-msg",),
            metavar="message",
            doc=(
                'a "message" property value to include in the annotation '
                'for paths that are underneath a dataset, but do not exist '
                'on the filesystem'),
            constraints=EnsureStr() | EnsureNone()),
        nondataset_path_status=Parameter(
            args=("--nondataset-path-status",),
            metavar="LABEL",
            doc=(
                'a "status" property value to include in the annotation '
                'for paths that are not underneath any dataset'),
            constraints=EnsureStr() | EnsureNone()),
        force_parentds_discovery=Parameter(
            args=("--no-parentds-discovery",),
            dest='force_parentds_discovery',
            action='store_false',
            doc=(
                "Flag to disable reports of parent dataset information for "
                "any path, in particular dataset root paths. Disabling "
                "saves on command run time, if this information is not "
                "needed.")),
        force_subds_discovery=Parameter(
            args=("--no-subds-discovery",),
            action='store_false',
            dest='force_subds_discovery',
            doc=(
                "Flag to disable reporting type='dataset' for subdatasets, "
                "even when they are not installed, or their mount point "
                "directory doesn't exist. Disabling saves on command run "
                "time, if this information is not needed.")),
        force_untracked_discovery=Parameter(
            args=("--no-untracked-discovery",),
            action='store_false',
            dest='force_untracked_discovery',
            doc=(
                "Flag to disable discovery of untracked changes. Disabling "
                "saves on command run time, if this information is not "
                "needed.")),
        force_no_revision_change_discovery=Parameter(
            args=("--revision-change-discovery",),
            action='store_false',
            dest='force_no_revision_change_discovery',
            doc=(
                "Flag to disable discovery of changes which were not yet "
                "committed. Disabling saves on command run time, if this "
                "information is not needed.")),
        modified=Parameter(
            args=("--modified",),
            nargs='?',
            const=True,
            constraints=EnsureStr() | EnsureBool() | EnsureNone(),
            doc=(
                "comparison reference specification for modification "
                "detection. This can be (mostly) anything that `git diff` "
                "understands (commit, treeish, tag, etc). See the "
                "documentation of `datalad diff --revision` for details. "
                "Unmodified paths will not be annotated. If a requested "
                "path was not modified but some content underneath it was, "
                "then the request is replaced by the modified paths and "
                "those are annotated instead. This option can be used [PY: "
                "with `True` as PY][CMD: without CMD] an argument to test "
                "against changes that have been made, but have not yet "
                "been staged for a commit.")))

    @staticmethod
    @datasetmethod(name='annotate_paths')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        # Generator of annotated-path result records (dicts), one per
        # requested, discovered, or modified path.
        #
        # Raises ValueError if subdataset discovery is requested while
        # parent dataset discovery is disabled, or if modification detection
        # is requested without a valid reference dataset.
        #
        # Every path that was reported is remembered in `reported_paths`
        # (keyed by its path string) in order to not report it twice.

        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (
                refds_path is None or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset "
                "(none given or found)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds,
                        refds_path,
                        action,
                        recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    is_annotated = isinstance(r, dict)
                    p = resolve_path(
                        r['path'] if is_annotated else r, ds=refds_path)
                    if _with_sep(p).startswith(_with_sep(refds_path)):
                        # all good
                        continue
                    # not the refds
                    # a raw request has no properties yet, give it at least
                    # its resolved path, so that the result identifies the
                    # offending location
                    path_props = r if is_annotated else dict(path=p)
                    res = get_status_dict(
                        **dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    # key by the path string, like everywhere else in this
                    # function; the request itself may be an (unhashable)
                    # dict, and would never match the lookup in the main loop
                    reported_paths[res['path']] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be melted in with treatment of non dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = [
                r for r in requested_paths
                if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen, whenever `path` is a subdataset, that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(oneupdir)
                    # NOTE the `and refds_path` is critical, as it will determine
                    # whether a top-level dataset that was discovered gets the
                    # parent property or not, it won't get it without a common
                    # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not _with_sep(dspath).startswith(_with_sep(refds_path)):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and 'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                # if the path doesn't exist, or is labeled a directory, or a dataset even
                # a dataset (without this info) -> record whether this is a known subdataset
                # to its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
class Install(Interface):
    """Install a dataset component or entire datasets.

    This command can make arbitrary content available in a dataset. This
    includes the fulfillment of existing dataset handles or file handles
    in a dataset, as well as adding such handles for content available
    locally or remotely.
    """

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to perform the install operation on. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory and/or the `path` given""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            doc="""path/name of the installation target. If no `source` is
            provided, and no `dataset` is given or detected, this is
            interpreted as the source URL of a dataset and a destination
            path will be derived from the URL similar to 'git clone'.""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        source=Parameter(
            args=("-s", "--source",),
            doc="url or local path of the installation source",
            constraints=EnsureStr() | EnsureNone()),
        # TODO this probably needs --with-data and --recursive as a plain boolean
        recursive=Parameter(
            args=("-r", "--recursive"),
            constraints=EnsureChoice('handles', 'data') | EnsureBool(),
            doc="""If set, all content is installed recursively, including
            content of any subdatasets."""),
        add_data_to_git=Parameter(
            args=("--add-data-to-git",),
            constraints=EnsureBool(),
            doc="""Flag whether to add data directly to Git, instead of
            tracking data identity only. Usually this is not desired,
            as it inflates dataset sizes and impacts flexibility of data
            transport."""))

    @staticmethod
    @datasetmethod(name='install')
    def __call__(dataset=None, path=None, source=None, recursive=False,
                 add_data_to_git=False):
        # Entry point of the install command.
        #
        # Depending on the branch taken, this returns
        # - a list of per-target results, if `path` is a list with more than
        #   one element (one recursive call per element),
        # - the target dataset, if the dataset itself was the install target,
        # - the absolute path of an installed/known file,
        # - whatever the subdataset installation helpers return,
        # - the path(s) of newly added content, or None if nothing was added.
        #
        # Raises InsufficientArgumentsError when neither a dataset nor a
        # usable target/source can be determined, and ValueError for targets
        # outside of the dataset or unsupported source/target combinations.
        lgr.debug("Installation attempt started")
        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        if isinstance(path, list):
            if not path:
                # normalize value to expected state when nothing was provided
                path = None
            elif len(path) == 1:
                # we can simply continue with the function as called with a
                # single argument
                path = path[0]
            else:
                lgr.debug("Installation of multiple targets was requested: {0}".format(path))
                # every target must be installed with the very same settings,
                # hence `add_data_to_git` has to be forwarded too
                return [Install.__call__(
                    dataset=ds,
                    path=p,
                    source=source,
                    recursive=recursive,
                    add_data_to_git=add_data_to_git) for p in path]

        # resolve the target location against the provided dataset
        if path is not None:
            # make sure it is not a URL, `resolve_path` cannot handle that
            if is_url(path):
                try:
                    path = get_local_path_from_url(path)
                    path = resolve_path(path, ds)
                except ValueError:
                    # URL doesn't point to a local something
                    pass
            else:
                path = resolve_path(path, ds)

        # any `path` argument that point to something local now resolved and
        # is no longer a URL

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved target location (that is now guaranteed to
        # be specified, but only if path isn't a URL (anymore) -> special case,
        # handles below
        if ds is None and path is not None and not is_url(path):
            # try to find a dataset at or above the installation target
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)

        if ds is None and source is None and path is not None:
            # no dataset, no source
            # this could be a shortcut install call, where the first
            # arg identifies the source
            if is_url(path) or os.path.exists(path):
                # we have an actual URL -> this should be the source
                # OR
                # it is not a URL, but it exists locally
                lgr.debug(
                    "Single argument given to install and no dataset found. "
                    "Assuming the argument identifies a source location.")
                source = path
                path = None

        lgr.debug("Resolved installation target: {0}".format(path))

        if ds is None and path is None and source is not None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            lgr.debug(
                "Neither dataset not target installation path provided. "
                "Assuming installation of a remote dataset. "
                "Deriving destination path from given source {0}".format(
                    source))
            ds = Dataset(_installationpath_from_url(source))

        if not path and ds is None:
            # no dataset, no target location, nothing to do
            raise InsufficientArgumentsError(
                "insufficient information for installation (needs at "
                "least a dataset or an installation path")

        assert(ds is not None)

        lgr.debug("Resolved target dataset for installation: {0}".format(ds))

        vcs = ds.repo
        if vcs is None:
            # TODO check that a "ds.path" actually points to a TOPDIR
            # should be the case already, but maybe nevertheless check
            try:
                with swallow_logs():
                    vcs = Install._get_new_vcs(ds, source, vcs)
            except GitCommandError:
                lgr.debug("Cannot retrieve from URL: {0}".format(source))
                # maybe source URL was missing a '/.git'
                if source and not source.rstrip('/').endswith('/.git'):
                    source = '{0}/.git'.format(source.rstrip('/'))
                    lgr.debug("Attempt to retrieve from URL: {0}".format(source))
                    vcs = Install._get_new_vcs(ds, source, vcs)
                else:
                    lgr.debug("Unable to establish repository instance at: {0}".format(ds.path))
                    raise

        assert(ds.repo)  # is automagically re-evaluated in the .repo property

        runner = Runner()

        if path is None or path == ds.path:
            # if the goal was to install this dataset, we are done,
            # except for 'recursive'.

            # TODO: For now 'recursive' means just submodules.
            # See --with-data vs. -- recursive and figure it out
            if recursive:
                for sm in ds.repo.get_submodules():
                    _install_subds_from_flexible_source(
                        ds, sm.path, sm.url, recursive=recursive)
            return ds

        # at this point this dataset is "installed", now we can test whether to
        # install something into the dataset

        # needed by the logic below
        assert(isabs(path))

        # express the destination path relative to the root of this dataset
        relativepath = relpath(path, start=ds.path)
        # `path` is absolute here, so only the relative form can reveal that
        # the target lies above the dataset root; match a whole leading
        # component so names that merely begin with '..' are not rejected
        if relativepath == pardir or relativepath.startswith(pardir + os.sep):
            raise ValueError("installation path outside dataset")

        lgr.debug(
            "Resolved installation target relative to dataset {0}: {1}".format(
                ds, relativepath))

        # this dataset must already know everything necessary
        ###################################################
        # FLOW GUIDE
        #
        # at this point we know nothing about the
        # installation targether
        ###################################################
        try:
            # it is simplest to let annex tell us what we are dealing with
            lgr.debug("Trying to fetch file %s using annex", relativepath)
            if not isinstance(vcs, AnnexRepo):
                assert(isinstance(vcs, GitRepo))
                # FLOW GUIDE
                # this is not an annex repo, but we raise exceptions
                # to be able to treat them alike in the special case handling
                # below
                if not exists(path):
                    raise IOError("path doesn't exist yet, might need special handling")
                elif relativepath in vcs.get_indexed_files():
                    # relativepath is in git
                    raise FileInGitError("We need to handle it as known to git")
                else:
                    raise FileNotInAnnexError("We don't have yet annex repo here")
            if vcs.get_file_key(relativepath):
                # FLOW GUIDE EXIT POINT
                # this is an annex'ed file -> get it
                # TODO implement `copy --from` using `source`
                # TODO fail if `source` is something strange
                vcs.annex_get(relativepath)
                # return the absolute path to the installed file
                return path

        except FileInGitError:
            ###################################################
            # FLOW GUIDE
            #
            # `path` is either
            # - a file already checked into Git
            # - known submodule
            ###################################################
            lgr.log(5, "FileInGitError logic")
            if source is not None:
                raise FileInGitError("File %s is already in git. Specifying source (%s) makes no sense"
                                     % (path, source))
            # file is checked into git directly -> nothing to do
            # OR this is a submodule of this dataset
            submodule = [sm for sm in ds.repo.get_submodules()
                         if sm.path == relativepath]
            if not submodule:
                # FLOW GUIDE EXIT POINT
                # this is a file in Git and no submodule, just return its path
                lgr.debug("Don't act, data already present in Git")
                return path
            elif len(submodule) > 1:
                raise RuntimeError(
                    "more than one submodule registered at the same path?")
            submodule = submodule[0]

            # FLOW GUIDE EXIT POINT
            # we are dealing with a known submodule (i.e. `source`
            # doesn't matter) -> check it out
            lgr.debug("Install subdataset at: {0}".format(submodule.path))
            subds = _install_subds_from_flexible_source(
                ds, submodule.path, submodule.url, recursive=recursive)
            return subds

        except FileNotInAnnexError:
            ###################################################
            # FLOW GUIDE
            #
            # `path` is either
            # - content of a subdataset
            # - an untracked file in this dataset
            # - an entire untracked/unknown existing subdataset
            ###################################################
            lgr.log(5, "FileNotInAnnexError logic")
            subds = get_containing_subdataset(ds, relativepath)
            if ds.path != subds.path:
                # FLOW GUIDE EXIT POINT
                # target path belongs to a known subdataset, hand
                # installation over to it
                return subds.install(
                    path=relpath(path, start=subds.path),
                    source=source,
                    recursive=recursive,
                    add_data_to_git=add_data_to_git)

            # FLOW GUIDE
            # this must be an untracked/existing something, so either
            # - a file
            # - a directory
            # - an entire repository
            if exists(opj(path, '.git')):
                # FLOW GUIDE EXIT POINT
                # this is an existing repo and must be in-place turned into
                # a submodule of this dataset
                return _install_subds_inplace(
                    ds, path, relativepath, source, runner)

            # FLOW GUIDE EXIT POINT
            # - untracked file or directory in this dataset
            if isdir(path) and not recursive:
                # this is a directory and we want --recursive for it
                raise ValueError(
                    "installation of a directory requires the `recursive` flag")

            # few sanity checks
            if source and abspath(source) != path:
                # report target and source in the slots that are labeled so
                raise ValueError(
                    "installation target already exists, but `source` points to "
                    "another location (target: '{0}', source: '{1}')".format(
                        path, source))

            if not add_data_to_git and not (isinstance(vcs, AnnexRepo)):
                raise RuntimeError(
                    "Trying to install file(s) into a dataset "
                    "with a plain Git repository. First initialize annex, or "
                    "provide override flag.")

            # switch `add` procedure between Git and Git-annex according to flag
            if add_data_to_git:
                vcs.git_add(relativepath)
                added_files = resolve_path(relativepath, ds)
            else:
                # do a blunt `annex add`
                added_files = vcs.annex_add(relativepath)
                # return just the paths of the installed components
                if isinstance(added_files, list):
                    added_files = [resolve_path(i['file'], ds) for i in added_files]
                else:
                    added_files = resolve_path(added_files['file'], ds)
            return added_files if added_files else None

        except IOError:
            ###################################################
            # FLOW GUIDE
            #
            # more complicated special cases -- `path` is either
            # - a file/subdataset in a not yet initialized but known
            #   submodule
            # - an entire untracked/unknown existing subdataset
            # - non-existing content that should be installed from `source`
            ###################################################
            lgr.log(5, "IOError logic")
            # we can end up here in two cases ATM
            if (exists(path) or islink(path)) or source is None:
                # FLOW GUIDE
                # - target exists but this dataset's VCS rejects it,
                #   so it should be part of a subdataset
                # or
                # - target doesn't exist, but no source is given, so
                #   it could be a handle that is actually contained in
                #   a not yet installed subdataset
                subds = get_containing_subdataset(ds, relativepath)
                if ds.path != subds.path:
                    # FLOW GUIDE
                    # target path belongs to a subdataset, hand installation
                    # over to it
                    if not subds.is_installed():
                        # FLOW GUIDE
                        # we are dealing with a target in a not yet
                        # available but known subdataset -> install it first
                        ds.install(subds.path, recursive=recursive)
                    return subds.install(
                        path=relpath(path, start=subds.path),
                        source=source,
                        recursive=recursive,
                        add_data_to_git=add_data_to_git)

                # FLOW GUIDE EXIT POINT
                raise InsufficientArgumentsError(
                    "insufficient information for installation: the "
                    "installation target {0} doesn't exists, isn't a "
                    "known handle of dataset {1}, and no `source` "
                    "information was provided.".format(path, ds))

            if not source:
                # FLOW GUIDE EXIT POINT
                raise InsufficientArgumentsError(
                    "insufficient information for installation: the "
                    "installation target {0} doesn't exists, isn't a "
                    "known handle of dataset {1}, and no `source` "
                    "information was provided.".format(path, ds))

            source_path = expandpath(source)
            if exists(source_path):
                # FLOW GUIDE EXIT POINT
                # this could be
                # - local file
                # - local directory
                # - repository outside the dataset
                # we only want to support the last case of locally cloning
                # a repo -- fail otherwise
                if exists(opj(source_path, '.git')):
                    return _install_subds_from_flexible_source(
                        ds, relativepath, source_path, recursive)

                raise ValueError(
                    "installing individual local files or directories is not "
                    "supported, copy/move them into the dataset first")

            # FLOW GUIDE
            # `source` is non-local, it could be:
            # - repository
            # - file
            # we have no further evidence, hence we need to try
            try:
                # FLOW GUIDE EXIT POINT
                # assume it is a dataset
                return _install_subds_from_flexible_source(
                    ds, relativepath, source, recursive)
            except CommandError:
                # FLOW GUIDE EXIT POINT
                # apaarently not a repo, assume it is a file url
                vcs.annex_addurl_to_file(relativepath, source)
                return path

    @staticmethod
    def _get_new_vcs(ds, source, vcs):
        """Create the repository instance for a dataset that has none yet.

        Without a `source` an annex repository is created at `ds.path`.
        With a `source` a plain Git repository is created from it first and
        is replaced by an annex repository only if `knows_annex()` reports
        traces of an annex at `ds.path`.

        The incoming `vcs` value is not inspected, only overwritten; the
        newly created repository instance is returned.
        """
        if source is None:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", ds.path)
            vcs = AnnexRepo(ds.path, url=source, create=True)
        else:
            # when obtained from remote, try with plain Git
            lgr.info("Creating a new git repo at %s", ds.path)
            vcs = GitRepo(ds.path, url=source, create=True)
            if knows_annex(ds.path):
                # init annex when traces of a remote annex can be detected
                lgr.info("Initializing annex repo at %s", ds.path)
                vcs = AnnexRepo(ds.path, init=True)
            else:
                lgr.debug("New repository clone has no traces of an annex")
        return vcs

    @staticmethod
    def result_renderer_cmdline(res):
        """Print a summary of what was installed via the `ui` message channel.

        `res` may be None, a single result, or a list of results; each
        result is rendered with `str()` on its own line.
        """
        from datalad.ui import ui
        if res is None:
            res = []
        if not isinstance(res, list):
            res = [res]
        if not res:
            ui.message("Nothing was installed")
            return
        items = '\n'.join(map(str, res))
        msg = "{n} installed {obj} available at\n{items}".format(
            obj='items are' if len(res) > 1 else 'item is',
            n=len(res),
            items=items)
        ui.message(msg)
def dlplugin(dataset, pattern, ref_dir='.', makedirs='no'):
    # could be extended to accept actual largefile expressions
    """Configure a dataset to never put some content into the dataset's annex

    This can be useful in mixed datasets that also contain textual data, such
    as source code, which can be efficiently and more conveniently managed
    directly in Git.

    Patterns generally look like this::

      code/*

    which would match all file in the code directory. In order to match all
    files under ``code/``, including all its subdirectories use such a
    pattern::

      code/**

    Note that the plugin works incrementally, hence any existing configuration
    (e.g. from a previous plugin run) is amended, not replaced.

    Parameters
    ----------
    dataset : Dataset
      dataset to configure
    pattern : list
      list of path patterns. Any content whose path is matching any pattern
      will not be annexed when added to a dataset, but instead will be
      tracked directly in Git. Path pattern have to be relative to the
      directory given by the `ref_dir` option. By default, patterns should
      be relative to the root of the dataset.
    ref_dir : str, optional
      Relative path (within the dataset) to the directory that is to be
      configured. All patterns are interpreted relative to this path,
      and configuration is written to a ``.gitattributes`` file in this
      directory.
    makedirs : bool, optional
      If set, any missing directories will be created in order to be able
      to place a file into ``ref_dir``. Default: False.

    Yields
    ------
    dict
      Result records. A single 'notneeded' or 'error' record is yielded (and
      nothing is written) if the dataset has no annex, a pattern or `ref_dir`
      is an absolute path, or the target directory is missing and `makedirs`
      is not set. Otherwise an 'ok' record is followed by the records of
      adding the ``.gitattributes`` file to Git.
    """

    from os.path import join as opj
    from os.path import isabs
    from os.path import exists
    from os import makedirs as makedirsfx
    from datalad.distribution.dataset import require_dataset
    from datalad.support.annexrepo import AnnexRepo
    from datalad.support.constraints import EnsureBool
    from datalad.utils import assure_list

    makedirs = EnsureBool()(makedirs)
    pattern = assure_list(pattern)
    ds = require_dataset(dataset, check_installed=True,
                         purpose='no_annex configuration')

    res_kwargs = dict(
        path=ds.path,
        type='dataset',
        action='no_annex',
    )

    # all the ways we refused to cooperate
    if not isinstance(ds.repo, AnnexRepo):
        yield dict(
            res_kwargs,
            status='notneeded',
            message='dataset has no annex')
        return
    if any(isabs(p) for p in pattern):
        yield dict(
            res_kwargs,
            status='error',
            message=('path pattern for `no_annex` configuration must be relative paths: %s',
                     pattern))
        return
    if isabs(ref_dir):
        yield dict(
            res_kwargs,
            status='error',
            message=('`ref_dir` for `no_annex` configuration must be a relative path: %s',
                     ref_dir))
        return

    gitattr_dir = opj(ds.path, ref_dir)
    if not exists(gitattr_dir):
        if makedirs:
            makedirsfx(gitattr_dir)
        else:
            yield dict(
                res_kwargs,
                status='error',
                message='target directory for `no_annex` does not exist (consider makedirs=True)')
            return

    gitattr_file = opj(gitattr_dir, '.gitattributes')
    # gitattributes are line-based: if an existing file does not end with a
    # newline, the first appended rule must not be glued to its last line
    needs_leading_newline = False
    if exists(gitattr_file):
        with open(gitattr_file, 'rb') as fp:
            existing_content = fp.read()
        needs_leading_newline = \
            bool(existing_content) and not existing_content.endswith(b'\n')
    with open(gitattr_file, 'a') as fp:
        if needs_leading_newline:
            fp.write('\n')
        for p in pattern:
            # one rule per line, otherwise multiple patterns (or repeated
            # plugin runs) end up merged into a single, invalid rule
            fp.write('{} annex.largefiles=nothing\n'.format(p))
    yield dict(res_kwargs, status='ok')

    # use the resolved dataset instance, `dataset` itself may be anything
    # that `require_dataset()` accepts
    for r in ds.add(
            gitattr_file,
            to_git=True,
            message="[DATALAD] exclude paths from annex'ing",
            result_filter=None,
            result_xfm=None):
        yield r
class CreatePublicationTargetSSHWebserver(Interface):
    """Create a dataset on a web server via SSH, that may then serve as a target
    for the publish command, if added as a sibling."""

    _params_ = dict(
        # TODO: Somehow the replacement of '_' and '-' is buggy on
        # positional arguments
        # TODO: Figure out, whether (and when) to use `sshurl` as push url
        dataset=Parameter(
            args=("--dataset", "-d",),
            doc="""specify the dataset to create the publication target for. If
                no dataset is given, an attempt is made to identify the dataset
                based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        sshurl=Parameter(
            args=("sshurl",),
            doc="""SSH URL to use to log into the server and create the target
                dataset(s). This also serves as a default for the URL to be
                used to add the target as a sibling to `dataset` and as a
                default for the directory on the server, where to create the
                dataset.""",
            constraints=EnsureStr()),
        target=Parameter(
            args=('target',),
            doc="""Sibling name to create for this publication target.
                If `recursive` is set, the same name will be used to address
                the subdatasets' siblings. Note, that this is just a
                convenience function, calling add_sibling after the actual
                creation of the target dataset(s). Whenever the creation fails,
                no siblings are added.""",
            constraints=EnsureStr() | EnsureNone(),
            nargs="?"),
        target_dir=Parameter(
            args=('--target-dir',),
            doc="""Path to the directory on the server where to create the dataset.
                By default it's wherever `sshurl` points to. If a relative
                path is provided, it's interpreted as relative to the user's
                home directory on the server.
                Especially when using `recursive`, it's possible to provide a
                template for building the URLs of all (sub)datasets to be
                created by using placeholders. If you don't provide a template
                the local hierarchy with respect to `dataset` will be
                replicated on the server rooting in `target_dir`.\n
                List of currently available placeholders:\n
                %%NAME\tthe name of the datasets, where slashes are replaced by dashes.\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_url=Parameter(
            args=('--target-url',),
            doc="""The URL of the dataset sibling named by `target`. Defaults to
                `sshurl`. This URL has to be accessible to anyone, who is
                supposed to have access to the dataset later on.\n
                Especially when using `recursive`, it's possible to provide a
                template for building the URLs of all (sub)datasets to be
                created by using placeholders.\n
                List of currently available placeholders:\n
                %%NAME\tthe name of the datasets, where slashes are replaced by dashes.\n""",
            nargs="?",
            constraints=EnsureStr() | EnsureNone()),
        target_pushurl=Parameter(
            args=('--target-pushurl',),
            doc="""Defaults to `sshurl`. In case the `target_url` cannot be used to
                publish to the dataset sibling, this option specifies a URL to
                be used for the actual publication operation.""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=Parameter(
            args=("--recursive", "-r"),
            action="store_true",
            doc="""Recursively create the publication target for all
                subdatasets of `dataset`""",),
        existing=Parameter(
            args=("--existing",),
            constraints=EnsureChoice('skip', 'replace', 'raise'),
            doc="""Action to perform, if target directory exists already.
                Dataset is skipped if `skip`. `replace` forces to (re-)init
                git and to (re-)configure sibling `target` (i.e. its URL(s))
                in case it already exists. `raise` just raises an
                Exception""",),
        shared=Parameter(
            args=("--shared",),
            doc="""passed to git-init. TODO: Figure out how to communicate what
                this is about""",
            constraints=EnsureStr() | EnsureBool()),)

    @staticmethod
    @datasetmethod(name='create_publication_target_sshwebserver')
    def __call__(sshurl, target=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None, recursive=False,
                 existing='raise', shared=False):
        # Entry point of the command. For the dataset (and, with `recursive`,
        # all its dataset handles) a directory is created on the server and
        # a git repository is initialized and configured in it, all via one
        # shared SSH control master connection. Datasets for which a remote
        # step fails are logged and skipped. If `target` is given, the
        # sibling(s) are registered via AddSibling afterwards.
        #
        # Raises ValueError on insufficient/malformed arguments or a missing
        # dataset, and RuntimeError if a target directory exists already and
        # `existing` is 'raise'. Nothing is returned.

        if sshurl is None:
            raise ValueError("""insufficient information for target creation
            (needs at least a dataset and a SSH URL).""")

        if target is None and (target_url is not None
                               or target_pushurl is not None):
            raise ValueError("""insufficient information for adding the target
            as a sibling (needs at least a name)""")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if ds is None:
            # try to find a dataset at or above CWD
            dspath = GitRepo.get_toppath(abspath(getpwd()))
            if dspath is None:
                raise ValueError("""No dataset found
                                 at or above {0}.""".format(getpwd()))
            ds = Dataset(dspath)
            lgr.debug("Resolved dataset for target creation: {0}".format(ds))
        assert (ds is not None and sshurl is not None)

        if not ds.is_installed():
            raise ValueError(
                """Dataset {0} is not installed yet.""".format(ds))
        assert (ds.repo is not None)

        # determine target parameters:
        parsed_target = urlparse(sshurl)
        host_name = parsed_target.netloc

        # TODO: Sufficient to fail on this condition?
        if not parsed_target.netloc:
            raise ValueError("Malformed URL: {0}".format(sshurl))

        if target_dir is None:
            if parsed_target.path:
                target_dir = parsed_target.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = False
        if "%NAME" not in target_dir:
            replicate_local_structure = True

        # collect datasets to use:
        datasets = dict()
        datasets[basename(ds.path)] = ds
        if recursive:
            for subds in ds.get_dataset_handles(recursive=True):
                sub_path = opj(ds.path, subds)
                # TODO: when enhancing Dataset/*Repo classes and therefore
                # adapt to moved code, make proper distinction between name and
                # path of a submodule, which are technically different. This
                # probably will become important on windows as well as whenever
                # we want to allow for moved worktrees.
                datasets[basename(ds.path) + '/' + subds] = \
                    Dataset(sub_path)

        # setup SSH Connection:
        # TODO: Make the entire setup a helper to use it when pushing via
        # publish?

        # - build control master:
        from datalad.utils import assure_dir
        not_supported_on_windows("TODO")
        from os import geteuid  # Linux specific import
        var_run_user_datalad = "/var/run/user/%s/datalad" % geteuid()
        assure_dir(var_run_user_datalad)
        control_path = "%s/%s" % (var_run_user_datalad, host_name)
        control_path += ":%s" % parsed_target.port if parsed_target.port else ""

        # - start control master:
        # pass an argument list (no shell) so that the user-provided host
        # and the control path derived from it are never shell-interpreted
        cmd = ["ssh",
               "-o", "ControlMaster=yes",
               "-o", "ControlPath=%s" % control_path,
               "-o", "ControlPersist=yes",
               host_name, "exit"]
        lgr.debug("Try starting control master by calling:\n%s" % " ".join(cmd))
        import subprocess
        proc = subprocess.Popen(cmd)
        proc.communicate(input="\n")  # why the f.. this is necessary?

        runner = Runner()
        ssh_cmd = ["ssh", "-S", control_path, host_name]

        import re
        git_version_prefix = "git version"

        lgr.info("Creating target datasets ...")
        for current_dataset in datasets:
            if not replicate_local_structure:
                path = target_dir.replace("%NAME",
                                          current_dataset.replace("/", "-"))
            else:
                # TODO: opj depends on local platform, not the remote one.
                # check how to deal with it. Does windows ssh server accept
                # posix paths? vice versa? Should planned SSH class provide
                # tools for this issue?
                path = normpath(opj(target_dir,
                                    relpath(datasets[current_dataset].path,
                                            start=ds.path)))

            if path != '.':
                # check if target exists
                # TODO: Is this condition valid for != '.' only?
                path_exists = True
                cmd = ssh_cmd + ["ls", path]
                try:
                    out, err = runner.run(cmd, expect_fail=True,
                                          expect_stderr=True)
                except CommandError as e:
                    if "No such file or directory" in e.stderr and \
                            path in e.stderr:
                        path_exists = False
                    else:
                        raise  # It's an unexpected failure here

                if path_exists:
                    if existing == 'raise':
                        raise RuntimeError(
                            "Target directory %s already exists." % path)
                    elif existing == 'skip':
                        continue
                    elif existing == 'replace':
                        pass
                    else:
                        raise ValueError(
                            "Do not know how to hand existing=%s"
                            % repr(existing))

                cmd = ssh_cmd + ["mkdir", "-p", path]
                try:
                    runner.run(cmd)
                except CommandError as e:
                    lgr.error("Remotely creating target directory failed at "
                              "%s.\nError: %s" % (path, str(e)))
                    continue

            # init git repo
            cmd = ssh_cmd + ["git", "-C", path, "init"]
            if shared:
                cmd.append("--shared=%s" % shared)
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Remotely initializing git repository failed at %s."
                          "\nError: %s\nSkipping ..." % (path, str(e)))
                continue

            # check git version on remote end:
            cmd = ssh_cmd + ["git", "version"]
            try:
                out, err = runner.run(cmd)
            except CommandError as e:
                lgr.warning("Failed to determine git version on remote.\n"
                            "Error: {0}\nTrying to configure anyway "
                            "...".format(str(e)))
            else:
                # cut off the literal prefix; str.lstrip() would strip a set
                # of characters instead
                git_version = out.strip()
                if git_version.startswith(git_version_prefix):
                    git_version = \
                        git_version[len(git_version_prefix):].strip()
                lgr.debug("Detected git version on server: %s" % git_version)
                # compare numerically: as plain strings "2.10" sorts before
                # "2.4" and recent git versions would be rejected
                version_match = re.match(r"(\d+)\.(\d+)", git_version)
                if version_match is None:
                    lgr.warning("Failed to determine git version on remote.\n"
                                "Error: {0}\nTrying to configure anyway "
                                "...".format(
                                    "cannot parse version from %r" % out))
                elif tuple(int(v) for v in version_match.groups()) < (2, 4):
                    lgr.error("Git version >= 2.4 needed to configure remote."
                              " Version detected on server: %s\nSkipping ..."
                              % git_version)
                    continue

            # allow for pushing to checked out branch
            cmd = ssh_cmd + ["git", "-C", path, "config",
                             "receive.denyCurrentBranch",
                             "updateInstead"]
            try:
                runner.run(cmd)
            except CommandError:
                lgr.warning("git config failed at remote location %s.\n"
                            "You will not be able to push to checked out "
                            "branch." % path)

            # enable post-update hook:
            cmd = ssh_cmd + ["mv", opj(path, ".git/hooks/post-update.sample"),
                             opj(path, ".git/hooks/post-update")]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Failed to enable post update hook.\n"
                          "Error: %s" % str(e))

            # initially update server info "manually":
            cmd = ssh_cmd + ["git", "-C", path, "update-server-info"]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Failed to update server info.\n"
                          "Error: %s" % str(e))

        # stop controlmaster (close ssh connection):
        cmd = ["ssh", "-O", "stop", "-S", control_path, host_name]
        out, err = runner.run(cmd, expect_stderr=True)

        if target:
            # add the sibling(s):
            if target_url is None:
                target_url = sshurl
            if target_pushurl is None:
                target_pushurl = sshurl
            AddSibling()(dataset=ds,
                         name=target,
                         url=target_url,
                         pushurl=target_pushurl,
                         recursive=recursive,
                         force=existing in {'replace'})
class CreateSiblingRia(Interface):
    """Creates a sibling to a dataset in a RIA store

    Communication with a dataset in a RIA store is implemented via two
    siblings. A regular Git remote (repository sibling) and a git-annex
    special remote for data transfer (storage sibling) -- with the former
    having a publication dependency on the latter. By default, the name of the
    storage sibling is derived from the repository sibling's name by appending
    "-storage".

    The store's base path is expected to not exist, be an empty directory,
    or a valid RIA store.

    RIA store layout
    ~~~~~~~~~~~~~~~~

    A RIA store is a directory tree with a dedicated subdirectory for each
    dataset in the store. The subdirectory name is constructed from the
    DataLad dataset ID, e.g. '124/68afe-59ec-11ea-93d7-f0d5bf7b5561', where
    the first three characters of the ID are used for an intermediate
    subdirectory in order to mitigate file system limitations for stores
    containing a large number of datasets.

    Each dataset subdirectory contains a standard bare Git repository for
    the dataset.

    In addition, a subdirectory 'annex' holds a standard Git-annex object
    store. However, instead of using the 'dirhashlower' naming scheme for
    the object directories, like Git-annex would do, a 'dirhashmixed'
    layout is used -- the same as for non-bare Git repositories or regular
    DataLad datasets.

    Optionally, there can be a further subdirectory 'archives' with
    (compressed) 7z archives of annex objects. The storage remote is able to
    pull annex objects from these archives, if it cannot find them in the
    regular annex object store. This feature can be useful for storing large
    collections of rarely changing data on systems that limit the number of
    files that can be stored.

    Each dataset directory also contains a 'ria-layout-version' file that
    identifies the data organization (as, for example, described above).

    Lastly, there is a global 'ria-layout-version' file at the store's
    base path that identifies where dataset subdirectories themselves are
    located. At present, this file must contain a single line stating the
    version (currently "1"). This line MUST end with a newline character.

    It is possible to define an alias for an individual dataset in a store by
    placing a symlink to the dataset location into an 'alias/' directory
    in the root of the store. This enables dataset access via URLs of format:
    'ria+<protocol>://<storelocation>#~<aliasname>'.

    Error logging
    ~~~~~~~~~~~~~

    To enable error logging at the remote end, append a pipe symbol and an "l"
    to the version number in ria-layout-version (like so '1|l\\n').

    Error logging will create files in an "error_log" directory whenever the
    git-annex special remote (storage sibling) raises an exception, storing the
    Python traceback of it. The logfiles are named according to the scheme
    '<dataset id>.<annex uuid of the remote>.log' showing "who" ran into this
    issue with which dataset. Because logging can potentially leak personal
    data (like local file paths for example), it can be disabled client-side
    by setting the configuration variable
    "annex.ora-remote.<storage-sibling-name>.ignore-remote-config".
    """

    # TODO: description?
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to process. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        url=Parameter(
            args=("url",),
            metavar="ria+<ssh|file>://<host>[/path]",
            doc="""URL identifying the target RIA store and access protocol.
            """,
            constraints=EnsureStr() | EnsureNone()),
        name=Parameter(
            args=('-s', '--name',),
            metavar='NAME',
            doc="""Name of the sibling.
            With `recursive`, the same name will be used to label all
            the subdatasets' siblings.""",
            constraints=EnsureStr() | EnsureNone(),
            required=True),
        storage_name=Parameter(
            args=("--storage-name",),
            metavar="NAME",
            doc="""Name of the storage sibling (git-annex special remote).
            Must not be identical to the sibling name. If not specified,
            defaults to the sibling name plus '-storage' suffix. If only
            a storage sibling is created, this setting is ignored, and
            the primary sibling name is used.""",
            constraints=EnsureStr() | EnsureNone()),
        post_update_hook=Parameter(
            args=("--post-update-hook",),
            doc="""Enable git's default post-update-hook for the created
            sibling.""",
            action="store_true"),
        shared=Parameter(
            args=("--shared",),
            metavar='{false|true|umask|group|all|world|everybody|0xxx}',
            doc="""If given, configures the permissions in the
            RIA store for multi-users access.
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool() | EnsureNone()),
        group=Parameter(
            args=("--group",),
            metavar="GROUP",
            doc="""Filesystem group for the repository. Specifying the group is
            crucial when [CMD: --shared=group CMD][PY: shared="group" PY]""",
            constraints=EnsureStr() | EnsureNone()),
        storage_sibling=Parameter(
            args=("--storage-sibling",),
            dest='storage_sibling',
            metavar='MODE',
            constraints=EnsureChoice('only') | EnsureBool() | EnsureNone(),
            doc="""By default, an ORA storage sibling and a Git repository
            sibling are created ([CMD: on CMD][PY: True|'on' PY]).
            Alternatively, creation of the storage sibling can be disabled
            ([CMD: off CMD][PY: False|'off' PY]), or a storage sibling
            created only and no Git sibling
            ([CMD: only CMD][PY: 'only' PY]). In the latter mode, no Git
            installation is required on the target host."""),
        existing=Parameter(
            args=("--existing",),
            constraints=EnsureChoice(
                'skip', 'error', 'reconfigure') | EnsureNone(),
            metavar='MODE',
            doc="""Action to perform, if a (storage) sibling is already
            configured under the given name and/or a target already exists.
            In this case, a dataset can be skipped ('skip'), an existing target
            repository be forcefully re-initialized, and the sibling
            (re-)configured ('reconfigure'), or the command be instructed to
            fail ('error').""", ),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        trust_level=Parameter(
            args=("--trust-level",),
            metavar="TRUST-LEVEL",
            constraints=EnsureChoice(
                'trust', 'semitrust', 'untrust') | EnsureNone(),
            doc="""specify a trust level for the storage sibling. If not
            specified, the default git-annex trust level is used.""",),
        disable_storage__=Parameter(
            args=("--no-storage-sibling",),
            dest='disable_storage__',
            doc="""This option is deprecated. Use '--storage-sibling off'
            instead.""",
            action="store_false"),
    )

    @staticmethod
    @datasetmethod(name='create_sibling_ria')
    @eval_results
    def __call__(url,
                 name,
                 dataset=None,
                 storage_name=None,
                 post_update_hook=False,
                 shared=None,
                 group=None,
                 storage_sibling=True,
                 existing='error',
                 trust_level=None,
                 recursive=False,
                 recursion_limit=None,
                 disable_storage__=None,
                 ):
        # Generator: yields result records. It validates the request, optionally
        # checks for clashing sibling names, creates the store, and then sets up
        # the sibling(s) for the dataset and, with `recursive`, its subdatasets.
        if disable_storage__ is not None:
            import warnings
            deprecation_msg = (
                "datalad-create-sibling-ria --no-storage-sibling "
                "is deprecated, use --storage-sibling off instead.")
            warnings.warn(deprecation_msg, DeprecationWarning)
            # recode to new setup
            disable_storage__ = None
            storage_sibling = False

        if storage_sibling == 'only' and storage_name:
            lgr.warning(
                "Sibling name will be used for storage sibling in "
                "storage-sibling-only mode, but a storage sibling name "
                "was provided"
            )

        ds = require_dataset(
            dataset, check_installed=True, purpose='create sibling RIA')
        # properties shared by every result record reported from here
        res_kwargs = {
            'ds': ds,
            'action': "create-sibling-ria",
            'logger': lgr,
        }

        # parse target URL
        try:
            ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
        except ValueError as e:
            yield get_status_dict(
                status='error',
                message=str(e),
                **res_kwargs
            )
            return

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError(
                "Repository at {} is not a DataLad dataset, "
                "run 'datalad create [--force]' first.".format(ds.path))

        if storage_sibling:
            if not storage_name:
                storage_name = "{}-storage".format(name)
            if name == storage_name:
                # leads to unresolvable, circular dependency with
                # publish-depends
                raise ValueError("sibling names must not be equal")
        elif storage_name:
            lgr.warning(
                "Storage sibling setup disabled, but a storage sibling name "
                "was provided"
            )

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since misconfiguration (particularly of special
        # remotes) only to fail in a subdataset later on with that config, can
        # be quite painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            # in recursive mode this check could take a substantial amount of
            # time: employ a progress bar (or rather a counter, because we
            # don't know the total in advance
            pbar_id = 'check-siblings-{}'.format(id(ds))
            log_progress(
                lgr.info, pbar_id,
                'Start checking pre-existing sibling configuration %s', ds,
                label='Query siblings',
                unit=' Siblings',
            )
            # even if we have to fail, let's report all conflicting siblings
            # in subdatasets
            failed = False
            configured_siblings = ds.siblings(
                result_renderer=None,
                recursive=recursive,
                recursion_limit=recursion_limit)
            for r in configured_siblings:
                log_progress(
                    lgr.info, pbar_id,
                    'Discovered sibling %s in dataset at %s',
                    r['name'], r['path'],
                    update=1,
                    increment=True)
                if r['type'] != 'sibling' or r['status'] != 'ok':
                    # this is an internal status query that has not
                    # consequence for the outside world. Be silent unless
                    # something useful can be said
                    #yield r
                    continue
                # a record is reported at most once, even if it would clash
                # with both names
                if r['name'] == name:
                    clashing_name = name
                elif storage_name and r['name'] == storage_name:
                    clashing_name = storage_name
                else:
                    continue
                failed = True
                yield get_status_dict(
                    status='error',
                    message="a sibling '{}' is already configured in "
                            "dataset {}".format(clashing_name, r['path']),
                    **res_kwargs,
                )
            log_progress(
                lgr.info, pbar_id,
                'Finished checking pre-existing sibling configuration %s', ds,
            )
            if failed:
                return

        # TODO: - URL parsing + store creation needs to be RF'ed based on
        #         command abstractions
        #       - more generally consider store creation a dedicated command or
        #         option
        # Note: URL parsing is done twice ATM (for top-level ds). This can't be
        # reduced to single instance, since rewriting url based on config could
        # be different for subdatasets.
        store_io = SSHRemoteIO(ssh_host) if ssh_host else LocalIO()
        create_store(store_io, Path(base_path), '1')

        # everything but the dataset is identical for all sibling creations
        sibling_args = (
            url,
            name,
            storage_sibling,
            storage_name,
            existing,
            shared,
            group,
            post_update_hook,
            trust_level,
            res_kwargs,
        )
        yield from _create_sibling_ria(ds, *sibling_args)

        if not recursive:
            return

        # Note: subdatasets can be treated independently, so go full
        # recursion when querying for them and _no_recursion with the
        # actual call. Theoretically this can be parallelized.
        for subds in ds.subdatasets(fulfilled=True,
                                    recursive=True,
                                    recursion_limit=recursion_limit,
                                    result_xfm='datasets'):
            yield from _create_sibling_ria(subds, *sibling_args)
def _dump_extracted_metadata(agginto_ds, aggfrom_ds, db, to_save, force_extraction, agg_base_path):
    """Dump metadata from a dataset into object in the metadata store of another

    Info on the metadata objects is placed into a DB dict under the
    absolute path of the dataset whose metadata was aggregated.

    A state identifier is derived from the reference commit and the content
    of the dataset-level metadata and config files. If metadata objects for
    that state already exist they are reused (and copied into the target
    dataset if needed); otherwise extraction is delegated to
    `_extract_metadata()`.

    Parameters
    ----------
    agginto_ds : Dataset
      Dataset that receives the metadata objects.
    aggfrom_ds : Dataset
      Dataset whose metadata is being described.
    db : dict
      Aggregation records, keyed by absolute dataset path. The record for
      `aggfrom_ds.path` is (re)assigned here.
    to_save : list
      Extended in place with one dict per file copied into `agginto_ds`.
    force_extraction : bool
      If true, no lookup of existing metadata objects is performed.
    agg_base_path : str
      Path prefix that is joined with each metadata object location.

    Returns
    -------
    bool
      False when nothing was extracted (no metadata at all, or existing
      objects were reused); otherwise whatever `_extract_metadata()` returns.
    """
    subds_relpaths = aggfrom_ds.subdatasets(result_xfm='relpaths', return_type='list')
    # figure out a "state" of the dataset wrt its metadata that we are describing
    # 1. the latest commit that changed any file for which we could have native metadata
    refcommit = _get_latest_refcommit(aggfrom_ds, subds_relpaths)
    objid = refcommit if refcommit else ''
    # 2, our own dataset-global metadata and the dataset config
    for tfile in (
            op.join(aggfrom_ds.path, DATASET_METADATA_FILE),
            op.join(aggfrom_ds.path, DATASET_CONFIG_FILE)):
        if op.exists(tfile):
            # close the handle deterministically rather than relying on GC
            with open(tfile, 'r') as f:
                objid += md5(f.read().encode()).hexdigest()
    # 3. potential annex-based metadata
    # XXX TODO shouldn't this be the annex extractor?
    # NOTE(review): `aggfrom_ds` is documented as a Dataset but is tested
    # against AnnexRepo here, while `.repo` is used below -- confirm whether
    # `aggfrom_ds.repo` was meant. Left unchanged, because it affects `objid`.
    if isinstance(aggfrom_ds, AnnexRepo) and \
            aggfrom_ds.config.obtain(
                'datalad.metadata.aggregate-content-datalad-core',
                default=True,
                valtype=EnsureBool()):
        # if there is no annex metadata, this will come out empty,
        # hence hash would be same as for a plain GitRepo
        # and no, we cannot use the shasum of the annex branch,
        # because this will change even when no metadata has changed
        timestamps, _ = aggfrom_ds.repo.call_annex_oneline([
            'metadata',
            '.',
            '-g',
            'lastchanged'])
        objid += timestamps.strip()

    if not objid:
        lgr.debug('%s has no metadata-relevant content', aggfrom_ds)
    else:
        lgr.debug(
            'Dump metadata of %s into %s',
            aggfrom_ds, agginto_ds)

    # check if we already have in store what we are about to create
    old_agginfo = db.get(aggfrom_ds.path, {})

    agginfo = {}
    # dataset global
    if aggfrom_ds.id:
        agginfo['id'] = aggfrom_ds.id
    agginfo['refcommit'] = refcommit
    # put in DB
    db[aggfrom_ds.path] = agginfo

    if not objid:
        # this is no error, there is simply no metadata whatsoever
        return False

    # shorten to MD5sum
    objid = md5(objid.encode()).hexdigest()

    # assemble info on the metadata extraction and storage
    #            label  type     targetds    storage method
    metasources = {'ds': {'type': 'dataset', 'targetds': agginto_ds,
                          'dumper': json_py.dump}}

    # do not store content metadata if either the source or the target dataset
    # do not want it
    # TODO this AND was an OR before (wrong), misses a test
    if aggfrom_ds.config.obtain(
            'datalad.metadata.store-aggregate-content',
            default=True,
            valtype=EnsureBool()) and \
            agginto_ds.config.obtain(
                'datalad.metadata.store-aggregate-content',
                default=True,
                valtype=EnsureBool()):
        metasources['cn'] = {
            'type': 'content',
            'targetds': agginto_ds,
            'dumper': json_py.dump2xzstream}

    # check if we have the extracted metadata for this state already
    # either in the source or in the destination dataset
    # The situation is trickier!  Extracted metadata could change for the same
    # state (commit etc), e.g. if extractors changed.
    # The "correct" thing would be either
    #  - to inspect git history either there were changes
    #    within aggfrom_ds since agginto_ds got the metadata committed OR
    #  - check by content - if file is under git - compute checksum,
    #    if under annex -- take checksum from the key without asking for the
    #    content
    metafound = {}
    uptodatemeta = []  # record which meta not only found but matching in content

    # TODO: current fixes might break logic for when fromds is not installed
    #       when I guess we just need to skip it?
    if not force_extraction:
        for s, sprop in metasources.items():
            objloc = op.join(agg_base_path,
                             _get_obj_location(objid, s, sprop['dumper']))
            smetafound = [
                # important to test for lexists() as we do not need to
                # or want to `get()` metadata files for this test.
                # Info on identity is NOT sufficient - later compare content if
                # multiple found
                objloc if op.lexists(op.join(d.path, objloc)) else None
                # Order of dss matters later
                for d in (aggfrom_ds, agginto_ds)
            ]
            if all(smetafound):
                # both have it
                metafound[s] = smetafound
                # but are they the same?
                try:
                    if _the_same_across_datasets(objloc, aggfrom_ds, agginto_ds):
                        uptodatemeta.append(s)
                except RuntimeError as exc:
                    # TODO: dedicated test - when meta content changes
                    lgr.debug("For now will just do re-extraction since caught %s",
                              CapturedException(exc))
            # source one has it, so we might be able to copy it
            # TODO: dedicated test - when it is sufficient to copy we do not re-extract
            # NOTE(review): no branch records the "only the source has it" case
            # the comment above refers to -- confirm whether that is intended.

    if len(metafound) != len(metasources):
        # found some (either ds or cn) metadata missing entirely in both
        # from and into datasets
        lgr.debug(
            "Incomplete or absent metadata while aggregating %s <- %s: %s",
            agginto_ds, aggfrom_ds, metafound
        )
        # no metadata found -> extract
        # this places metadata dump files into the configured
        # target dataset and lists them in `to_save`, as well
        # as updates the `db` record for `aggfrom_ds`
        return _extract_metadata(
            agginto_ds,
            aggfrom_ds,
            db,
            to_save,
            objid,
            metasources,
            refcommit,
            subds_relpaths,
            agg_base_path)

    # we did not actually run an extraction, so we need to
    # assemble an aggregation record from the existing pieces
    # that we found
    # simple case: the target dataset has all the records already and they are up to date:
    if len(uptodatemeta) == len(metasources):
        lgr.debug('Sticking with up-to-date metadata for %s', aggfrom_ds)
        # no change, use old record from the target dataset
        db[aggfrom_ds.path] = old_agginfo
        # no error
        return False
    else:
        lgr.debug('Reusing previously extracted metadata for %s', aggfrom_ds)
        # we need to move the metadata dump(s) into the target dataset
        objrelpaths = {
            label: next(filter(bool, smetafound))
            for label, smetafound in metafound.items()
        }
        # make sure all the to-be-moved metadata records are present
        # locally
        aggfrom_ds.get(
            path=[op.join(aggfrom_ds.path, p)
                  for p in objrelpaths.values()],
            result_renderer='disabled')

        # actually copy dump files
        for objrelpath in objrelpaths.values():
            objpath = op.join(agginto_ds.path, objrelpath)
            objdir = op.dirname(objpath)
            if not op.exists(objdir):
                makedirs(objdir)
            if op.lexists(objpath):
                os.unlink(objpath)  # remove previous version first
                # was a wild thought as a workaround for
                # http://git-annex.branchable.com/bugs/cannot_commit___34__annex_add__34__ed_modified_file_which_switched_its_largefile_status_to_be_committed_to_git_now/#comment-bf70dd0071de1bfdae9fd4f736fd1ec1
                # agginto_ds.repo.remove(objpath)
            # XXX TODO once we have a command that can copy/move files
            # from one dataset to another including file availability
            # info, this should be used here
            shutil.copyfile(
                op.join(aggfrom_ds.path, objrelpath),
                objpath)
            # mark for saving
            to_save.append(dict(
                path=objpath,
                parentds=agginto_ds.path,
                type='file'))

        # lastly get 'self' aggregation record from source dataset and
        # use in target dataset
        db[aggfrom_ds.path] = load_ds_aggregate_db(aggfrom_ds, abspath=True)[aggfrom_ds.path]
        return False
def _extract_metadata(agginto_ds, aggfrom_ds, db, merge_native, to_save):
    """Dump metadata from a dataset into object in the metadata store of another

    Info on the metadata objects is placed into a DB dict under the
    absolute path of the dataset whose metadata was aggregated.

    Parameters
    ----------
    agginto_ds : Dataset
      Dataset the metadata is aggregated into. Here it is only consulted for
      its 'datalad.metadata.store-aggregate-content' setting and for logging.
    aggfrom_ds : Dataset
      Dataset to extract metadata from; metadata objects are written into it.
    db : dict
      Aggregation records, keyed by absolute dataset path. The record for
      `aggfrom_ds.path` is (re)assigned here.
    merge_native : str
      Merge mode.
    to_save : list
      Extended in place with one dict per metadata object file written.

    Returns
    -------
    bool
      False if the dataset has no metadata-relevant content, otherwise the
      error flag reported by `_get_metadata()`.
    """
    subds_relpaths = aggfrom_ds.subdatasets(result_xfm='relpaths', return_type='list')
    # figure out a "state" of the dataset wrt its metadata that we are describing
    # 1. the latest commit that changed any file for which we could have native metadata
    refcommit = _get_latest_refcommit(aggfrom_ds, subds_relpaths)
    objid = refcommit if refcommit else ''
    # 2, our own dataset-global metadata
    dsmetafile = opj(aggfrom_ds.path, '.datalad', 'metadata', 'dataset.json')
    if exists(dsmetafile):
        # close the handle deterministically rather than relying on GC
        with open(dsmetafile, 'r') as f:
            objid += md5(f.read().encode()).hexdigest()
    # 3. potential annex-based metadata
    # NOTE(review): `aggfrom_ds` is documented as a Dataset but is tested
    # against AnnexRepo here, while `.repo` is used below -- confirm whether
    # `aggfrom_ds.repo` was meant. Left unchanged, because it affects `objid`.
    if isinstance(aggfrom_ds, AnnexRepo) and \
            aggfrom_ds.config.obtain(
                'datalad.metadata.aggregate-content-datalad-core',
                default=True,
                valtype=EnsureBool()):
        # if there is no annex metadata, this will come out empty,
        # hence hash would be same as for a plain GitRepo
        # and no, we cannot use the shasum of the annex branch,
        # because this will change even when no metadata has changed
        timestamps, _ = aggfrom_ds.repo._run_annex_command(
            'metadata',
            '.',
            '-g',
            'lastchanged')
        objid += timestamps.strip()

    if not objid:
        lgr.debug('%s has no metadata-relevant content', aggfrom_ds)
    else:
        lgr.debug('Dump metadata of %s (merge mode: %s) into %s',
                  aggfrom_ds, merge_native, agginto_ds)

    agginfo = {}
    # dataset global
    if aggfrom_ds.id:
        agginfo['id'] = aggfrom_ds.id
    agginfo['refcommit'] = refcommit
    # put in DB
    db[aggfrom_ds.path] = agginfo

    if not objid:
        # this is no error, there is simply no metadata whatsoever
        return False

    # if there is any chance for metadata
    # obtain metadata for dataset and content
    relevant_paths = sorted(
        _get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    nativetypes = get_metadata_type(aggfrom_ds)
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        # core must come first
        ['datalad_core'] + assure_list(nativetypes),
        merge_native,
        # None indicates to honor a datasets per-parser configuration and to be
        # on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)

    # shorten to MD5sum
    objid = md5(objid.encode()).hexdigest()

    metasources = [('ds', 'dataset', dsmeta, aggfrom_ds, json_py.dump)]

    # do not store content metadata if either the source or the target dataset
    # do not want it -- i.e. store it only when BOTH datasets allow it
    if aggfrom_ds.config.obtain(
            'datalad.metadata.store-aggregate-content',
            default=True,
            valtype=EnsureBool()) and \
            agginto_ds.config.obtain(
                'datalad.metadata.store-aggregate-content',
                default=True,
                valtype=EnsureBool()):
        metasources.append((
            'cn',
            'content',
            # sort by path key to get deterministic dump content
            (dict(contentmeta[k], path=k) for k in sorted(contentmeta)),
            aggfrom_ds,
            json_py.dump2xzstream))

    # for both types of metadata
    for label, mtype, meta, dest, store in metasources:
        if not meta:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label)
        if store is json_py.dump2xzstream:
            objrelpath += '.xz'
        # place metadata object into the source dataset
        objpath = opj(dest.path, dirname(agginfo_relpath), objrelpath)

        # write obj files
        if exists(objpath):
            dest.unlock(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        store(meta, objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))

        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(mtype)] = objpath

    return errored
class Run(Interface):
    """Run an arbitrary shell command and record its impact on a dataset.

    It is recommended to craft the command such that it can run in the root
    directory of the dataset that the command will be recorded in. However,
    as long as the command is executed somewhere underneath the dataset root,
    the exact location will be recorded relative to the dataset root.

    If the executed command did not alter the dataset in any way, no record of
    the command execution is made.

    If the given command errors, a `CommandError` exception with the same exit
    code will be raised, and no modifications will be saved.

    *Command format*

    || REFLOW >>
    A few placeholders are supported in the command via Python format
    specification. "{pwd}" will be replaced with the full path of the current
    working directory. "{dspath}" will be replaced with the full path of the
    dataset that run is invoked on. "{inputs}" and "{outputs}" represent the
    values specified by [CMD: --input and --output CMD][PY: `inputs` and
    `outputs` PY]. If multiple values are specified, the values will be joined
    by a space. The order of the values will match that order from the command
    line, with any globs expanded in alphabetical order (like bash). Individual
    values can be accessed with an integer index (e.g., "{inputs[0]}").
    << REFLOW ||

    To escape a brace character, double it (i.e., "{{" or "}}").
    """
    _params_ = dict(
        cmd=Parameter(
            args=("cmd",),
            nargs=REMAINDER,
            metavar='COMMAND',
            doc="command for execution"),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to record the command results in.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        inputs=Parameter(
            args=("--input",),
            dest="inputs",
            metavar="PATH",
            action='append',
            doc="""A dependency for the run. Before running the command, the
            content of this file will be retrieved. A value of "." means "run
            :command:`datalad get .`". The value can also be a glob. [CMD: This
            option can be given more than once. CMD]"""),
        outputs=Parameter(
            args=("--output",),
            dest="outputs",
            metavar="PATH",
            action='append',
            doc="""Prepare this file to be an output file of the command. A
            value of "." means "run :command:`datalad unlock .`" (and will fail
            if some content isn't present). For any other value, if the content
            of this file is present, unlock the file. Otherwise, remove it. The
            value can also be a glob. [CMD: This option can be given more than
            once. CMD]"""),
        expand=Parameter(
            args=("--expand",),
            metavar="WHICH",
            doc="""Expand globs when storing inputs and/or outputs in the
            commit message.""",
            constraints=EnsureNone() | EnsureChoice("inputs", "outputs", "both")),
        message=save_message_opt,
        sidecar=Parameter(
            args=('--sidecar',),
            metavar="yes|no",
            doc="""By default, the configuration variable
            'datalad.run.record-sidecar' determines whether a record with
            information on a command's execution is placed into a separate
            record file instead of the commit message (default: off). This
            option can be used to override the configured behavior on a
            case-by-case basis. Sidecar files are placed into the dataset's
            '.datalad/runinfo' directory (customizable via the
            'datalad.run.record-directory' configuration variable).""",
            constraints=EnsureNone() | EnsureBool()),
        rerun=Parameter(
            args=('--rerun',),
            action='store_true',
            doc="""re-run the command recorded in the last saved change (if any).
            Note: This option is deprecated since version 0.9.2 and
            will be removed in a later release. Use `datalad rerun`
            instead."""),
    )

    @staticmethod
    @datasetmethod(name='run')
    @eval_results
    def __call__(
            cmd=None,
            dataset=None,
            inputs=None,
            outputs=None,
            expand=None,
            message=None,
            sidecar=None,
            rerun=False):
        # Generator: relays the result records of either the deprecated
        # rerun path or of `run_command()`; yields nothing without a command.
        if rerun:
            # deprecated mode: hand over to `Rerun`, any given command is
            # not used
            if cmd:
                lgr.warning("Ignoring provided command in --rerun mode")
            lgr.warning("The --rerun option is deprecated since version 0.9.2. "
                        "Use `datalad rerun` instead.")
            from datalad.interface.rerun import Rerun
            yield from Rerun.__call__(dataset=dataset, message=message)
            return

        if not cmd:
            lgr.warning("No command given")
            return

        yield from run_command(
            cmd,
            dataset=dataset,
            inputs=inputs,
            outputs=outputs,
            expand=expand,
            message=message,
            sidecar=sidecar)
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
      Names of the metadata extractors to run, in the given order. Each must
      be available as a 'datalad.metadata.extractors' entry point.
    global_meta : bool or None
      Whether to extract dataset-level metadata. With None, the per-extractor
      'datalad.metadata.aggregate-dataset-<name>' setting decides
      (default: True).
    content_meta : bool or None
      Same for content metadata, via
      'datalad.metadata.aggregate-content-<name>'.
    paths : list or None
      Paths handed to the extractors. For an annex repository, extractors
      with a true `NEEDS_CONTENT` attribute only receive the paths that are
      not annexed or have content present.

    Returns
    -------
    tuple(MetadataDict, dict, bool)
      Dataset metadata keyed by extractor name, content metadata keyed by
      location and then extractor name, and a flag whether any extractor
      failed or reported invalid metadata.

    Raises
    ------
    ValueError
      If a requested extractor is not available or cannot be loaded.
    """
    def _ensure_serializable(val):
        # recursively turn ReadOnlyDict into dict and tuple/list into list
        if isinstance(val, ReadOnlyDict):
            return {
                k: _ensure_serializable(v) for k, v in iteritems(val)
            }
        if isinstance(val, (tuple, list)):
            return [_ensure_serializable(v) for v in val]
        else:
            return val

    errored = False
    dsmeta = MetadataDict()
    # each item in here will be a MetadataDict, but not the whole thing
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab': 'http://docs.datalad.org/schema_v{}.json'.format(
            vocabulary_version)
    }

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        # materialize: the triples are iterated twice below, and a bare zip()
        # iterator would already be exhausted when building the warning
        content_info = list(zip(paths,
                                ds.repo.file_has_content(paths),
                                ds.repo.is_under_annex(paths)))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file as no present
            lgr.warning(
                '{} files have no content present, '
                'some extractors will not operate on {}'.format(
                    nocontent,
                    'them' if nocontent > 10
                    else [p for p, c, a in content_info if not c and a]))

    # pull out potential metadata field blacklist config settings
    blacklist = [
        re.compile(bl) for bl in assure_list(
            ds.config.obtain(
                'datalad.metadata.aggregate-ignore-fields',
                default=[]))
    ]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from pkg_resources import iter_entry_points  # delayed heavy import
    extractors = {
        ep.name: ep
        for ep in iter_entry_points('datalad.metadata.extractors')
    }

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s', ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(
            lgr.info,
            'metadataextractors',
            'Engage %s metadata extractor', mtype_key,
            update=1,
            increment=True)
        if mtype_key not in extractors:
            # we said that we want to fail, rather then just moan about less metadata
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                'Enable metadata extractor %s is not available in this '
                'installation' % mtype_key)
        try:
            extractor_cls = extractors[mtype_key].load()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                "Failed to load metadata extractor for '%s', "
                "broken dataset configuration (%s)?: %s"
                % (mtype, ds, exc_str(e))) from e
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta if global_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta if content_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s', mtype_key, ds,
                )
                raise
            errored = True
            # if we dont get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(
                    dsmeta_t,
                    maxsize=max_fieldsize,
                    blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude", set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        #  log_progress but if they are all generators, we could provide generic
        #  handling of the progress here.  Note also that log message is actually
        #  seems to be ignored and not used, only the label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no cound is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store info that there was no metadata(e.g. to get a list of
            # files that have no metadata)
            # if there is an issue that a extractor needlessly produces empty records, the
            # extractor should be fixed and not a general switch. For example the datalad_core
            # issues empty records to document the presence of a file
            #elif not meta:
            #    continue

            meta = MetadataDict(meta)
            # apply filters
            meta = _filter_metadata_fields(
                meta,
                maxsize=max_fieldsize,
                blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain(
                    'datalad.metadata.generate-unique-{}'.format(
                        mtype_key.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()):
                # go through content metadata and inject report of unique keys
                # and values into `dsmeta`
                for k, v in iteritems(meta):
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: list of samples data
                        # were recorded from. If the dataset has such under
                        # a 'sample' key, we should prefer that, over an
                        # aggregated list of a hopefully-kinda-ok structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the purpose
                        # of discovering whole datasets
                        # we keep the key (so we know that some file is providing this key),
                        # but ignore any value it came with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})
            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the callenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we known if some unique value
            # was a list, os opposed to a list of unique values
            ucp[mtype_key] = {
                k: [_ensure_serializable(i)
                    for i in sorted(
                        v,
                        key=_unique_value_key)] if v is not None else None
                for k, v in iteritems(unique_cm)
                # v == None (disable unique, but there was a value at some point)
                # otherwise we only want actual values, and also no single-item-lists
                # of a non-value
                # those contribute no information, but bloat the operation
                # (inflated number of keys, inflated storage, inflated search index, ...)
                if v is None or (v and not v == {''})
            }
            dsmeta['datalad_unique_content_properties'] = ucp

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s', ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
class Subdatasets(Interface):
    r"""Report subdatasets and their properties.

    The following properties are reported (if possible) for each matching
    subdataset record.

    "name"
        Name of the subdataset in the parent (often identical with the
        relative path in the parent dataset)

    "path"
        Absolute path to the subdataset

    "parentds"
        Absolute path to the parent dataset

    "revision"
        SHA1 of the subdataset commit recorded in the parent dataset

    "state"
        Condition of the subdataset: 'clean', 'modified', 'absent', 'conflict'
        as reported by `git submodule`

    "revision_descr"
        Output of `git describe` for the subdataset

    "gitmodule_url"
        URL of the subdataset recorded in the parent

    "gitmodule_<label>"
        Any additional configuration property on record.

    Performance note: Property modification, requesting `bottomup` reporting
    order, or a particular numerical `recursion_limit` implies an internal
    switch to an alternative query implementation for recursive query that is
    more flexible, but also notably slower (performs one call to Git per
    dataset versus a single call for all combined).

    The following properties for subdatasets are recognized by DataLad
    (without the 'gitmodule\_' prefix that is used in the query results):

    "datalad-recursiveinstall"
        If set to 'skip', the respective subdataset is skipped when DataLad
        is recursively installing its superdataset. However, the subdataset
        remains installable when explicitly requested, and no other features
        are impaired.
    """
    # NOTE: the docstring above is a raw string because it contains the
    # sequence `\_`, which is not a valid escape in a regular string literal.

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query. If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        fulfilled=Parameter(
            args=("--fulfilled", ),
            doc="""if given, must be a boolean flag indicating whether
            to report either only locally present or absent datasets.
            By default subdatasets are reported regardless of their
            status""",
            constraints=EnsureBool() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        contains=Parameter(
            args=('--contains', ),
            metavar='PATH',
            doc="""limit report to the subdatasets containing the
            given path. If a root path of a subdataset is given the last
            reported dataset will be the subdataset itself.""",
            constraints=EnsureStr() | EnsureNone()),
        bottomup=Parameter(
            args=("--bottomup", ),
            action="store_true",
            doc="""whether to report subdatasets in bottom-up order along
            each branch in the dataset tree, and not top-down."""),
        set_property=Parameter(
            args=('--set-property', ),
            metavar=('NAME', 'VALUE'),
            nargs=2,
            action='append',
            doc="""Name and value of one or more subdataset properties to
            be set in the parent dataset's .gitmodules file. The property name
            is case-insensitive, must start with a letter, and consist only
            of alphanumeric characters. The value can be
            a Python format() template string wrapped in '<>' (e.g.
            '<{gitmodule_name}>').
            Supported keywords are any item reported in the result properties
            of this command, plus 'refds_relpath' and 'refds_relname':
            the relative path of a subdataset with respect to the base dataset
            of the command call, and, in the latter case, the same string with
            all directory separators replaced by dashes.[CMD: This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        delete_property=Parameter(
            args=('--delete-property', ),
            metavar='NAME',
            action='append',
            doc="""Name of one or more subdataset properties to be removed
            from the parent dataset's .gitmodules file.[CMD: This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()))

    # Generator of result records, one per subdataset reported by
    # `_get_submodules()`.  Nothing is yielded when the dataset path is not
    # a valid Git repository or when `recursion_limit` is an integer <= 0.
    # Raises ValueError when a `set_property` name does not match `valid_key`.
    @staticmethod
    @datasetmethod(name='subdatasets')
    @eval_results
    def __call__(
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        dataset = require_dataset(
            dataset, check_installed=False,
            purpose='subdataset reporting/modification')
        refds_path = dataset.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            # validate all property names up front, before anything is
            # queried or modified
            for k, v in set_property:
                if valid_key.match(k) is None:
                    # interpolate the key into the message; passing it as a
                    # second exception argument would leave a literal '%s'
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, "
                        "must start with a letter)" % k)
        if contains:
            contains = resolve_path(contains, dataset)
        for r in _get_submodules(
                dataset.path, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
class Update(Interface):
    """Update a dataset from a sibling.

    """
    # TODO: adjust docs to say:
    # - update from just one sibling at a time

    _examples_ = [
        dict(text="Update from a particular sibling",
             code_py="update(sibling='siblingname')",
             code_cmd="datalad update -s <siblingname>"),
        dict(text="Update from a particular sibling and merge the changes "
                  "from a configured or matching branch from the sibling "
                  "(see [CMD: --follow CMD][PY: `follow` PY] for details)",
             code_py="update(sibling='siblingname', merge=True)",
             code_cmd="datalad update --merge -s <siblingname>"),
        dict(text="Update from the sibling 'origin', traversing into "
                  "subdatasets. For subdatasets, merge the revision "
                  "registered in the parent dataset into the current branch",
             code_py="update(sibling='origin', merge=True, "
                     "follow='parentds', recursive=True)",
             code_cmd="datalad update -s origin --merge "
                      "--follow=parentds --recursive"),
    ]

    _params_ = dict(
        path=Parameter(
            args=("path", ),
            metavar="PATH",
            doc="""constrain to-be-updated subdatasets to the given path for
            recursive operation.""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        sibling=Parameter(
            args=("-s", "--sibling", ),
            doc="""name of the sibling to update from. If no sibling
            is given, updates from all siblings are obtained.""",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to update. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        merge=Parameter(
            args=("--merge", ),
            metavar="ALLOWED",
            # const and nargs are set to map --merge to --merge=any.
            const="any",
            nargs="?",
            constraints=EnsureBool() | EnsureChoice("any", "ff-only"),
            doc="""merge obtained changes from the sibling. If a sibling is
            not explicitly given and there is only a single known sibling,
            that sibling is used. Otherwise, an unspecified sibling defaults
            to the configured remote for the current branch.
            By default, changes are fetched from the sibling but not merged
            into the current branch. With [CMD: --merge or --merge=any CMD][PY:
            merge=True or merge="any" PY], the changes will be merged into the
            current branch. A value of 'ff-only' restricts the allowed merges
            to fast-forwards."""),
        follow=Parameter(
            args=("--follow", ),
            constraints=EnsureChoice("sibling", "parentds"),
            doc="""source of updates for subdatasets. For 'sibling', the
            update will be done by merging in a branch from the (specified or
            inferred) sibling. The branch brought in will either be the
            current branch's configured branch, if it points to a branch that
            belongs to the sibling, or a sibling branch with a name that
            matches the current branch. For 'parentds', the revision
            registered in the parent dataset of the subdataset is merged in.
            Note that the current dataset is always updated according to
            'sibling'. This option has no effect unless a merge is requested
            and [CMD: --recursive CMD][PY: recursive=True PY] is
            specified.""", ),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        fetch_all=Parameter(
            args=("--fetch-all", ),
            action="store_true",
            doc="""this option has no effect and will be removed in a future
            version. When no siblings are given, an all-sibling update will
            be performed.""", ),
        reobtain_data=Parameter(
            args=("--reobtain-data", ),
            action="store_true",
            doc="""if enabled, file content that was present before an update
            will be re-obtained in case a file was changed by the update."""),
    )

    # Generator of result records.  The reference dataset is processed first,
    # followed (only with `recursive`) by its installed subdatasets.  For each
    # dataset a sibling is resolved, updates are fetched, and (only with
    # `merge`) merged.  Results of the merge function are passed through,
    # followed by one summary 'update' record per dataset.  When merging
    # recursively, updated subdataset states are finally saved in the
    # reference dataset, unless that dataset itself had a merge failure.
    @staticmethod
    @datasetmethod(name='update')
    @eval_results
    def __call__(
            path=None,
            sibling=None,
            merge=False,
            follow="sibling",
            dataset=None,
            recursive=False,
            recursion_limit=None,
            fetch_all=None,
            reobtain_data=False):
        if fetch_all is not None:
            lgr.warning(
                'update(fetch_all=...) called. Option has no effect, and will be removed'
            )
        if path and not recursive:
            lgr.warning('path constraints for subdataset updates ignored, '
                        'because `recursive` option was not given')

        refds = require_dataset(
            dataset, check_installed=True, purpose='updating')

        # paths of datasets that were updated without a merge failure
        save_paths = []
        # datasets for which a merge result with a non-'ok' status was seen
        merge_failures = set()
        # whether the loop below saw any dataset other than `refds`
        saw_subds = False
        for ds, revision in itertools.chain(
                # the reference dataset has no revision recorded by a parent
                [(refds, None)],
                refds.subdatasets(
                    path=path,
                    fulfilled=True,
                    recursive=recursive,
                    recursion_limit=recursion_limit,
                    return_type='generator',
                    result_renderer='disabled',
                    result_xfm=YieldDatasetAndRevision()) if recursive else []):
            if ds != refds:
                saw_subds = True
            repo = ds.repo
            is_annex = isinstance(repo, AnnexRepo)
            # prepare return value
            res = get_status_dict('update', ds=ds, logger=lgr, refds=refds.path)
            # get all remotes which have references (would exclude
            # special remotes)
            remotes = repo.get_remotes(
                **({'exclude_special_remotes': True} if is_annex else {}))
            if not remotes and not sibling:
                res['message'] = (
                    "No siblings known to dataset at %s\nSkipping",
                    repo.path)
                res['status'] = 'notneeded'
                yield res
                continue
            curr_branch = repo.get_active_branch()
            tracking_remote = None
            if not sibling and len(remotes) == 1:
                # there is only one remote, must be this one
                sibling_ = remotes[0]
            elif not sibling:
                # nothing given, look for tracking branch
                tracking_remote = repo.get_tracking_branch(
                    branch=curr_branch, remote_only=True)[0]
                sibling_ = tracking_remote
            else:
                sibling_ = sibling
            if sibling_ and sibling_ not in remotes:
                res['message'] = ("'%s' not known to dataset %s\nSkipping",
                                  sibling_, repo.path)
                res['status'] = 'impossible'
                yield res
                continue
            if not sibling_ and len(remotes) > 1 and merge:
                # pass `remotes` as a lazy logging argument rather than
                # %-formatting it eagerly
                lgr.debug("Found multiple siblings:\n%s", remotes)
                res['status'] = 'impossible'
                res['message'] = "Multiple siblings, please specify from which to update."
                yield res
                continue
            lgr.info("Fetching updates for %s", ds)
            # fetch remote
            fetch_kwargs = dict(
                # test against user-provided value!
                remote=None if sibling is None else sibling_,
                all_=sibling is None,
                # required to not trip over submodules that
                # were removed in the origin clone
                recurse_submodules="no",
                prune=True)  # prune to not accumulate a mess over time
            repo.fetch(**fetch_kwargs)
            # NOTE reevaluate ds.repo again, as it might have be converted from
            # a GitRepo to an AnnexRepo
            repo = ds.repo

            follow_parent = revision and follow == "parentds"
            if follow_parent and not repo.commit_exists(revision):
                # the revision recorded in the parent did not come in with
                # the regular fetch above, so ask for it explicitly
                if sibling_:
                    try:
                        lgr.debug("Fetching revision %s directly for %s",
                                  revision, repo)
                        repo.fetch(remote=sibling_, refspec=revision,
                                   git_options=["--recurse-submodules=no"])
                    except CommandError as exc:
                        yield dict(
                            res,
                            status="impossible",
                            message=(
                                "Attempt to fetch %s from %s failed: %s",
                                revision, sibling_, exc_str(exc)))
                        continue
                else:
                    yield dict(res,
                               status="impossible",
                               message=("Need to fetch %s directly "
                                        "but single sibling not resolved",
                                        revision))
                    continue

            saw_merge_failure = False
            if merge:
                if follow_parent:
                    merge_target = revision
                else:
                    merge_target = _choose_merge_target(
                        repo, curr_branch,
                        sibling_, tracking_remote)

                merge_fn = _choose_merge_fn(
                    repo,
                    is_annex=is_annex,
                    adjusted=is_annex and repo.is_managed_branch(curr_branch))

                merge_opts = None
                if merge_fn is _annex_sync:
                    if follow_parent:
                        yield dict(
                            res,
                            status="impossible",
                            message=("follow='parentds' is incompatible "
                                     "with adjusted branches"))
                        continue
                elif merge_target is None:
                    yield dict(res,
                               status="impossible",
                               message="Could not determine merge target")
                    continue
                elif merge == "ff-only":
                    merge_opts = ["--ff-only"]

                if is_annex and reobtain_data:
                    merge_fn = _reobtain(ds, merge_fn)

                for mres in merge_fn(repo, sibling_, merge_target,
                                     merge_opts=merge_opts):
                    if mres["action"] == "merge" and mres["status"] != "ok":
                        saw_merge_failure = True
                    # each merge result is reported on top of a copy of the
                    # dataset's base record
                    yield dict(res, **mres)

            if saw_merge_failure:
                merge_failures.add(ds)
                res['status'] = 'error'
                res['message'] = ("Merge of %s failed", merge_target)
            else:
                res['status'] = 'ok'
                save_paths.append(ds.path)
            yield res
        # we need to save updated states only if merge was requested -- otherwise
        # it was a pure fetch
        if merge and recursive:
            if path and not saw_subds:
                lgr.warning(
                    'path constraints did not match an installed subdataset: %s',
                    path)
            if refds in merge_failures:
                lgr.warning("Not saving because top-level dataset %s "
                            "had a merge failure",
                            refds.path)
            else:
                # the reference dataset cannot be saved "in itself"
                save_paths = [p for p in save_paths if p != refds.path]
                if not save_paths:
                    return
                lgr.debug(
                    'Subdatasets where updated state may need to be '
                    'saved in the parent dataset: %s', save_paths)
                for r in refds.save(
                        path=save_paths,
                        recursive=False,
                        message='[DATALAD] Save updated subdatasets'):
                    yield r
def __call__(title=None,
             name="osf",
             storage_name=None,
             dataset=None,
             mode="annex",
             existing='error',
             trust_level=None,
             tags=None,
             public=False,
             category='data',
             description=None,
             ):
    # Create an OSF node for a dataset and register it as sibling(s).
    #
    # Generator of result records:
    # - 'impossible' if the dataset has no annex
    # - 'error' if `existing` is 'error' and a sibling named `name` or
    #   `storage_name` is already configured
    # - unless `mode` is 'gitonly': 'ok' for the initialized storage
    #   special remote
    # - unless `mode` is 'exportonly': the results of configuring the
    #   'osf://<node id>' sibling via `ds.siblings()`
    ds = require_dataset(
        dataset, purpose="create OSF remote", check_installed=True)
    res_kwargs = dict(
        ds=ds,
        action="create-sibling-osf",
        logger=lgr,
    )
    # we need an annex
    if not isinstance(ds.repo, AnnexRepo):
        yield get_status_dict(
            type="dataset",
            status="impossible",
            message="dataset has no annex",
            **res_kwargs)
        return

    # NOTES:
    # - we prob. should check osf-special-remote availability upfront to
    #   fail early
    # - add --recursive option
    #   - recursive won't work easily. Need to think that through.
    #   - would need a naming scheme for subdatasets
    #   - flat on OSF or a tree?
    #   - how do we detect something is there already, so we can skip
    #     rather than duplicate (with a new name)?
    #     osf-type-special-remote sufficient to decide it's not needed?
    # - adapt to conclusions in issue #30
    #   -> create those subcomponents
    # - results need to report URL for created projects suitable for datalad
    #   output formatting!
    #   -> result_renderer
    #   -> needs to be returned by create_node

    if not storage_name:
        storage_name = "{}-storage".format(name)

    sibling_conflicts = sibling_exists(
        ds, [name, storage_name],
        # TODO pass through
        recursive=False, recursion_limit=None,
        # fail fast, if error is desired
        exhaustive=existing == 'error',
    )
    if existing == 'error' and sibling_conflicts:
        # we only asked for one
        conflict = sibling_conflicts[0]
        yield get_status_dict(
            status='error',
            message=(
                "a sibling '%s' is already configured in dataset %s",
                conflict[1], conflict[0]),
            **res_kwargs,
        )
        return

    if title is None:
        # use dataset root basename
        title = ds.pathobj.name

    # Work on a private copy: ensure_list() may hand back the very list
    # object the caller passed in (TODO confirm against its
    # implementation), and entries are appended below.  Without the copy
    # a caller-owned `tags` list could be modified as a side effect.
    tags = list(ensure_list(tags))
    if 'DataLad dataset' not in tags:
        tags.append('DataLad dataset')
    if ds.id and ds.id not in tags:
        tags.append(ds.id)

    if not description:
        description = \
            "This component was built from a DataLad dataset using the " \
            "datalad-osf extension " \
            "(https://github.com/datalad/datalad-osf)."
        if mode != 'exportonly':
            description += \
                " With this extension installed, this component can be " \
                "git or datalad cloned from a 'osf://ID' URL, where " \
                "'ID' is the OSF node ID that shown in the OSF HTTP " \
                "URL, e.g. https://osf.io/q8xnk/ can be cloned from " \
                "osf://q8xnk"
    cred = get_credentials(allow_interactive=True)
    osf = OSF(**cred)
    node_id, node_url = create_node(
        osf_session=osf.session,
        title=title,
        category=category,
        tags=tags if tags else None,
        public=EnsureBool()(public),
        description=description,
    )

    if mode != 'gitonly':
        init_opts = ["encryption=none",
                     "type=external",
                     "externaltype=osf",
                     "autoenable=true",
                     "node={}".format(node_id)]

        if mode in ("export", "exportonly"):
            init_opts += ["exporttree=yes"]

        ds.repo.init_remote(storage_name, options=init_opts)
        if trust_level:
            # `trust_level` is used verbatim as the git-annex subcommand
            ds.repo.call_git(['annex', trust_level, storage_name])

        yield get_status_dict(
            type="dataset",
            url=node_url,
            id=node_id,
            name=storage_name,
            status="ok",
            **res_kwargs
        )

    if mode == 'exportonly':
        return

    ds.config.set(
        'remote.{}.annex-ignore'.format(name), 'true',
        where='local')
    yield from ds.siblings(
        # use configure, not add, to not trip over the config that
        # we just made
        action='configure',
        name=name,
        url='osf://{}'.format(node_id),
        fetch=False,
        publish_depends=storage_name if mode != 'gitonly' else None,
        recursive=False,
        result_renderer=None,
    )
def _configure_remote(
        ds, name, known_remotes, url, pushurl, fetch, description,
        as_common_datasrc, publish_depends, publish_by_default,
        annex_wanted, annex_required, annex_group, annex_groupwanted,
        inherit, get_annex_info,
        **res_kwargs):
    """Configure the sibling `name` of dataset `ds` and report on it.

    This is a generator of result records (dicts).  Every record it builds
    itself carries ``action='configure-sibling'``, ``type='sibling'``, the
    dataset path, the sibling name, and whatever is given in `res_kwargs`.

    Parameters
    ----------
    ds
      Dataset whose ``repo`` and ``config`` are modified.
    name
      Sibling name.  ``None`` yields a single 'error' record.  For ``'here'``
      only the annex preferred-content and description steps are performed.
    known_remotes : list
      Names of already known remotes.  `name` is appended when a new remote
      is added.
    url, pushurl
      (Push) URL to set for the remote.
    fetch
      If true, `Update` is run for the sibling and its results are passed
      through.  This reads ``res_kwargs['refds']``.
    description
      Description to set via ``git annex describe``.  Yields an 'impossible'
      record (and stops) for a plain Git repository.
    as_common_datasrc
      Name handed to ``git annex initremote`` (``type=git``).  Only
      supported for http(s) URLs, otherwise an 'impossible' record is
      yielded and configuration continues.
    publish_depends, publish_by_default
      Value(s) stored in ``remote.<name>.datalad-publish-depends`` and
      ``remote.<name>.push``, replacing any previous setting.
    annex_wanted, annex_required, annex_group, annex_groupwanted
      Passed on to ``set_preferred_content()``/``set_groupwanted()`` of an
      annex repository.
    inherit
      If true, publication settings, and unset annex settings, are looked
      up via the `_DelayedSuper` helper.
    get_annex_info
      Passed on to `_query_remotes()` for the final report.
    **res_kwargs
      Additional properties for all result records.

    Yields
    ------
    dict
      Result records; on success the last one has ``status='ok'`` and
      contains the first record reported by `_query_remotes()`.
    """
    result_props = dict(
        action='configure-sibling',
        path=ds.path,
        type='sibling',
        name=name,
        **res_kwargs)

    if name is None:
        result_props['status'] = 'error'
        result_props['message'] = 'need sibling `name` for configuration'
        yield result_props
        return

    if name != 'here':
        # do all configure steps that are not meaningful for the 'here' sibling
        # AKA the local repo
        if name not in known_remotes:
            # this remote is fresh: make it known
            # just minimalistic name and URL, the rest is coming from `configure`
            ds.repo.add_remote(name, url)
            known_remotes.append(name)
        elif url:
            # not new, override URl if given
            ds.repo.set_remote_url(name, url)

        # make sure we have a configured fetch expression at this point
        fetchvar = 'remote.{}.fetch'.format(name)
        if fetchvar not in ds.repo.config:
            # place default fetch refspec in config
            # same as `git remote add` would have added
            ds.repo.config.add(
                fetchvar,
                '+refs/heads/*:refs/remotes/{}/*'.format(name),
                where='local')

        if pushurl:
            ds.repo.set_remote_url(name, pushurl, push=True)

        if publish_depends:
            # Check if all `deps` remotes are known to the `repo`
            unknown_deps = set(
                assure_list(publish_depends)).difference(known_remotes)
            if unknown_deps:
                result_props['status'] = 'error'
                result_props['message'] = (
                    'unknown sibling(s) specified as publication dependency: %s',
                    unknown_deps)
                yield result_props
                return

        # define config var name for potential publication dependencies
        depvar = 'remote.{}.datalad-publish-depends'.format(name)
        # and default pushes
        dfltvar = "remote.{}.push".format(name)

        if fetch:
            # fetch the remote so we are up to date
            for r in Update.__call__(
                    dataset=res_kwargs['refds'],
                    path=[dict(path=ds.path, type='dataset')],
                    sibling=name,
                    merge=False,
                    recursive=False,
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None):
                # fixup refds
                r.update(res_kwargs)
                yield r

        if inherit:
            # Adjust variables which we should inherit
            delayed_super = _DelayedSuper(ds.repo)
            publish_depends = _inherit_config_var(
                delayed_super, depvar, publish_depends)
            publish_by_default = _inherit_config_var(
                delayed_super, dfltvar, publish_by_default)
            # Copy relevant annex settings for the sibling
            # makes sense only if current AND super are annexes, so it is
            # kinda a boomer, since then forbids having a super a pure git
            if isinstance(ds.repo, AnnexRepo) and \
                    isinstance(delayed_super.repo, AnnexRepo):
                if annex_wanted is None:
                    annex_wanted = _inherit_annex_var(
                        delayed_super, name, 'wanted')
                if annex_required is None:
                    annex_required = _inherit_annex_var(
                        delayed_super, name, 'required')
                if annex_group is None:
                    # I think it might be worth inheritting group regardless what
                    # value is
                    #if annex_wanted in {'groupwanted', 'standard'}:
                    annex_group = _inherit_annex_var(
                        delayed_super, name, 'group')
                if annex_wanted == 'groupwanted' and annex_groupwanted is None:
                    # we better have a value for the expression for that group
                    annex_groupwanted = _inherit_annex_var(
                        delayed_super, name, 'groupwanted')

        if publish_depends:
            if depvar in ds.config:
                # config vars are incremental, so make sure we start from
                # scratch
                ds.config.unset(depvar, where='local', reload=False)
            for d in assure_list(publish_depends):
                lgr.info(
                    'Configure additional publication dependency on "%s"',
                    d)
                ds.config.add(depvar, d, where='local', reload=False)
            ds.config.reload()

        if publish_by_default:
            if dfltvar in ds.config:
                ds.config.unset(dfltvar, where='local', reload=False)
            for refspec in assure_list(publish_by_default):
                lgr.info(
                    'Configure additional default publication refspec "%s"',
                    refspec)
                ds.config.add(dfltvar, refspec, 'local')
            ds.config.reload()

        assert isinstance(ds.repo, GitRepo)  # just against silly code
        if isinstance(ds.repo, AnnexRepo):
            # we need to check if added sibling an annex, and try to enable it
            # another part of the fix for #463 and #432
            try:
                if not ds.config.obtain(
                        'remote.{}.annex-ignore'.format(name),
                        default=False,
                        valtype=EnsureBool(),
                        store=False):
                    ds.repo.enable_remote(name)
            except CommandError as exc:
                # TODO yield
                # this is unlikely to ever happen, now done for AnnexRepo instances
                # only
                lgr.info("Failed to enable annex remote %s, "
                         "could be a pure git", name)
                lgr.debug("Exception was: %s", exc_str(exc))
            if as_common_datasrc:
                ri = RI(url)
                if isinstance(ri, URL) and ri.scheme in ('http', 'https'):
                    # XXX what if there is already a special remote
                    # of this name? Above check for remotes ignores special
                    # remotes. we need to `git annex dead REMOTE` on reconfigure
                    # before we can init a new one
                    # XXX except it is not enough

                    # make special remote of type=git (see #335)
                    ds.repo._run_annex_command(
                        'initremote',
                        annex_options=[
                            as_common_datasrc,
                            'type=git',
                            'location={}'.format(url),
                            'autoenable=true'])
                else:
                    # `result_props` already carries `name`; passing it as
                    # a keyword next to `**result_props` would raise
                    # "TypeError: got multiple values for keyword argument",
                    # so layer the status on top of a copy instead
                    yield dict(
                        result_props,
                        status='impossible',
                        message='cannot configure as a common data source, '
                                'URL protocol is not http or https')
    #
    # place configure steps that also work for 'here' below
    #
    if isinstance(ds.repo, AnnexRepo):
        for prop, var in (('wanted', annex_wanted),
                          ('required', annex_required),
                          ('group', annex_group)):
            if var is not None:
                ds.repo.set_preferred_content(
                    prop, var, '.' if name == 'here' else name)
        if annex_groupwanted:
            ds.repo.set_groupwanted(annex_group, annex_groupwanted)

    if description:
        if not isinstance(ds.repo, AnnexRepo):
            result_props['status'] = 'impossible'
            result_props['message'] = 'cannot set description of a plain Git repository'
            yield result_props
            return
        ds.repo._run_annex_command('describe', annex_options=[name, description])

    # report all we know at once
    info = list(_query_remotes(
        ds, name, known_remotes, get_annex_info=get_annex_info))[0]
    info.update(dict(status='ok', **result_props))
    yield info
class Subdatasets(Interface):
    r"""Report subdatasets and their properties.

    The following properties are reported (if possible) for each matching
    subdataset record.

    "name"
        Name of the subdataset in the parent (often identical with the
        relative path in the parent dataset)

    "path"
        Absolute path to the subdataset

    "parentds"
        Absolute path to the parent dataset

    "gitshasum"
        SHA1 of the subdataset commit recorded in the parent dataset

    "state"
        Condition of the subdataset: 'clean', 'modified', 'absent', 'conflict'
        as reported by `git submodule`

    "gitmodule_url"
        URL of the subdataset recorded in the parent

    "gitmodule_name"
        Name of the subdataset recorded in the parent

    "gitmodule_<label>"
        Any additional configuration property on record.

    Performance note: Property modification, requesting `bottomup` reporting
    order, or a particular numerical `recursion_limit` implies an internal
    switch to an alternative query implementation for recursive query that is
    more flexible, but also notably slower (performs one call to Git per
    dataset versus a single call for all combined).

    The following properties for subdatasets are recognized by DataLad
    (without the 'gitmodule\_' prefix that is used in the query results):

    "datalad-recursiveinstall"
        If set to 'skip', the respective subdataset is skipped when DataLad
        is recursively installing its superdataset. However, the subdataset
        remains installable when explicitly requested, and no other features
        are impaired.
    """

    # command parameters, in the order in which they are declared
    _params_ = {
        'dataset': Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query. If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        'path': Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path/name to query for subdatasets. Defaults to the
            current directory[PY: , or the entire dataset if called as
            a dataset method PY].""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        'fulfilled': Parameter(
            args=("--fulfilled", ),
            doc="""if given, must be a boolean flag indicating whether
            to report either only locally present or absent datasets.
            By default subdatasets are reported regardless of their
            status""",
            constraints=EnsureBool() | EnsureNone()),
        'recursive': recursion_flag,
        'recursion_limit': recursion_limit,
        'contains': Parameter(
            args=('--contains', ),
            metavar='PATH',
            action='append',
            doc="""limit report to the subdatasets containing the
            given path. If a root path of a subdataset is given the last
            reported dataset will be the subdataset itself.[CMD: This
            option can be given multiple times CMD][PY: Can be a list with
            multiple paths PY], in which case datasets will be reported that
            contain any of the given paths.""",
            constraints=EnsureStr() | EnsureNone()),
        'bottomup': Parameter(
            args=("--bottomup", ),
            action="store_true",
            doc="""whether to report subdatasets in bottom-up order along
            each branch in the dataset tree, and not top-down."""),
        'set_property': Parameter(
            args=('--set-property', ),
            metavar=('NAME', 'VALUE'),
            nargs=2,
            action='append',
            doc="""Name and value of one or more subdataset properties to
            be set in the parent dataset's .gitmodules file. The property name
            is case-insensitive, must start with a letter, and consist only
            of alphanumeric characters. The value can be
            a Python format() template string wrapped in '<>' (e.g.
            '<{gitmodule_name}>').
            Supported keywords are any item reported in the result properties
            of this command, plus 'refds_relpath' and 'refds_relname':
            the relative path of a subdataset with respect to the base dataset
            of the command call, and, in the latter case, the same string with
            all directory separators replaced by dashes.[CMD: This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        'delete_property': Parameter(
            args=('--delete-property', ),
            metavar='NAME',
            action='append',
            doc="""Name of one or more subdataset properties to be removed
            from the parent dataset's .gitmodules file.[CMD: This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
    }

    # Generator of result records: one per subdataset reported by
    # `_get_submodules()`, followed by one 'impossible' record for every
    # `contains` path that no reported subdataset matched.
    @staticmethod
    @datasetmethod(name='subdatasets')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        ds = require_dataset(
            dataset, check_installed=True,
            purpose='subdataset reporting/modification')

        if path:
            paths = resolve_path(ensure_list(path), dataset, ds)
        else:
            paths = None

        if not paths and dataset is None:
            # neither a path nor a dataset was given: limit the query to
            # the current directory, unless that is the dataset root
            here = Path(getpwd())
            if here == ds.pathobj:
                paths = None
            else:
                paths = [here]

        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # nothing to report at all for a non-positive recursion limit
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            # refuse any invalid property name before touching anything
            for prop_name, _prop_value in set_property:
                if valid_key.match(prop_name) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % prop_name)

        expanded_contains = []
        if contains:
            contains = resolve_path(ensure_list(contains), dataset, ds)
            # pre-compute each path together with all of its parents; doing
            # this once up front is reported to give a ~20% speedup per loop
            # iteration of a non-match
            expanded_contains = [
                [target, *target.parents] for target in contains
            ]

        # all `contains` paths that were matched by some reported subdataset
        matched = set()
        submodules = _get_submodules(
            ds, paths, fulfilled, recursive, recursion_limit,
            expanded_contains, bottomup, set_property, delete_property,
            refds_path)
        for record in submodules:
            # plenty of old consumers of these results cannot handle Path
            # objects
            record['path'] = str(record['path'])
            # eval_results needs the reference dataset path in order to
            # render/convert paths relative to it
            record['refds'] = refds_path
            if 'contains' in record:
                matched.update(record['contains'])
                record['contains'] = [str(hit) for hit in record['contains']]
            yield record

        if not contains:
            return
        for missed in set(contains).difference(matched):
            yield get_status_dict(
                'subdataset',
                path=str(missed),
                status='impossible',
                message='path not contained in any matching subdataset',
                # deliberately no logger: asking whether a path belongs to
                # any subdataset is a legitimate query made for further
                # decision making; front-ends communicate the outcome via
                # result rendering
                #logger=lgr
            )
class Update(Interface):
    """Update a dataset from a sibling.

    """
    # TODO: adjust docs to say:
    # - update from just one sibling at a time

    # Usage examples. Each entry pairs a description with the equivalent
    # Python call (`code_py`) and command line (`code_cmd`). Adjacent string
    # literals are joined without any separator, so every continued line
    # must carry its own trailing space.
    _examples_ = [
        dict(text="Update from a particular sibling",
             code_py="update(sibling='siblingname')",
             code_cmd="datalad update -s <siblingname>"),
        dict(text="Update from a particular sibling and merge the changes "
                  "from a configured or matching branch from the sibling "
                  "(see [CMD: --follow CMD][PY: `follow` PY] for details)",
             code_py="update(sibling='siblingname', how='merge')",
             code_cmd="datalad update --how=merge -s <siblingname>"),
        dict(text="Update from the sibling 'origin', traversing into "
                  "subdatasets. For subdatasets, merge the revision "
                  "registered in the parent dataset into the current branch",
             code_py="update(sibling='origin', how='merge', "
                     "follow='parentds', recursive=True)",
             code_cmd="datalad update -s origin --how=merge "
                      "--follow=parentds -r"),
        dict(text="Fetch and merge the remote tracking branch "
                  "into the current dataset. Then update each subdataset "
                  "by resetting its current branch to the revision "
                  "registered in the parent dataset, fetching only if "
                  "the revision isn't already present",
             code_py="update(how='merge', how_subds='reset', "
                     "follow='parentds-lazy', recursive=True)",
             # The trailing space after "reset" is required; without it the
             # two options fuse into one invalid argument.
             code_cmd="datalad update --how=merge --how-subds=reset "
                      "--follow=parentds-lazy -r"),
    ]

    # Parameter declarations, one per keyword argument of `__call__`.
    _params_ = dict(
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""constrain to-be-updated subdatasets to the given path for
            recursive operation.""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        sibling=Parameter(
            args=("-s", "--sibling",),
            doc="""name of the sibling to update from. When unspecified,
            updates from all siblings are fetched. If there is more than one
            sibling and changes will be brought into the working tree (as
            requested via [CMD: --merge, --how, or --how-subds CMD][PY:
            `merge`, `how`, or `how_subds` PY]), a sibling will be chosen
            based on the configured remote for the current branch.""",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to update. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        merge=Parameter(
            args=("--merge",),
            metavar="ALLOWED",
            # const and nargs are set to map --merge to --merge=any.
            const="any",
            nargs="?",
            constraints=EnsureBool() | EnsureChoice("any", "ff-only"),
            # TODO: Decide whether this should be removed eventually.
            doc="""merge obtained changes from the sibling. This is a subset
            of the functionality that can be achieved via the newer
            [CMD: --how CMD][PY: `how` PY].
            [CMD: --merge or --merge=any CMD][PY: merge=True or merge="any" PY]
            is equivalent to [CMD: --how=merge CMD][PY: how="merge" PY].
            [CMD: --merge=ff-only CMD][PY: merge="ff-only" PY] is equivalent
            to [CMD: --how=ff-only CMD][PY: how="ff-only" PY]."""),
        how=Parameter(
            args=("--how",),
            nargs="?",
            constraints=_how_constraints,
            doc="""how to update the dataset. The default ("fetch") simply
            fetches the changes from the sibling but doesn't incorporate them
            into the working tree. A value of "merge" or "ff-only" merges in
            changes, with the latter restricting the allowed merges to
            fast-forwards. "reset" incorporates the changes with
            'git reset --hard <target>', staying on the current branch but
            discarding any changes that aren't shared with the target.
            "checkout", on the other hand, runs 'git checkout <target>',
            switching from the current branch to a detached state. When
            [CMD: --recursive CMD][PY: recursive=True PY] is specified, this
            action will also apply to subdatasets unless overridden by
            [CMD: --how-subds CMD][PY: `how_subds` PY]."""),
        how_subds=Parameter(
            args=("--how-subds",),
            nargs="?",
            constraints=_how_constraints,
            doc="""Override the behavior of [CMD: --how CMD][PY: `how` PY] in
            subdatasets."""),
        follow=Parameter(
            args=("--follow",),
            constraints=EnsureChoice("sibling", "parentds", "parentds-lazy"),
            doc="""source of updates for subdatasets. For 'sibling', the
            update will be done by merging in a branch from the (specified or
            inferred) sibling. The branch brought in will either be the
            current branch's configured branch, if it points to a branch that
            belongs to the sibling, or a sibling branch with a name that
            matches the current branch. For 'parentds', the revision
            registered in the parent dataset of the subdataset is merged in.
            'parentds-lazy' is like 'parentds', but prevents fetching from a
            subdataset's sibling if the registered revision is present in the
            subdataset. Note that the current dataset is always updated
            according to 'sibling'. This option has no effect unless a merge
            is requested and [CMD: --recursive CMD][PY: recursive=True PY] is
            specified.""", ),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        fetch_all=Parameter(
            args=("--fetch-all",),
            action="store_true",
            doc="""this option has no effect and will be removed in a future
            version. When no siblings are given, an all-sibling update will
            be performed.""", ),
        reobtain_data=Parameter(
            args=("--reobtain-data",),
            action="store_true",
            doc="""if enabled, file content that was present before an update
            will be re-obtained in case a file was changed by the update."""),
    )

    @staticmethod
    @datasetmethod(name='update')
    @eval_results
    def __call__(
            path=None,
            *,
            sibling=None,
            merge=False,
            how=None,
            how_subds=None,
            follow="sibling",
            dataset=None,
            recursive=False,
            recursion_limit=None,
            fetch_all=None,
            reobtain_data=False):
        # Generator: yields one or more result records per processed dataset
        # (the reference dataset first, then subdatasets when `recursive`),
        # followed by whatever `_save_after_update` yields.
        if fetch_all is not None:
            lgr.warning(
                'update(fetch_all=...) called. Option has no effect, and will be removed'
            )
        if path and not recursive:
            lgr.warning('path constraints for subdataset updates ignored, '
                        'because `recursive` option was not given')

        how, how_subds = _process_how_args(merge, how, how_subds)
        # `merge` should be considered through `how` and `how_subds` only.
        # Unbind `merge` to ensure that downstream code doesn't look at it.
        del merge

        refds = require_dataset(
            dataset, check_installed=True, purpose='update')

        # Paths of datasets whose processing ended with status 'ok'.
        save_paths = []
        # Datasets for which a merge/update.* step did not report 'ok'.
        update_failures = set()
        saw_subds = False
        # The reference dataset is always processed first (with no registered
        # revision); subdatasets are only queried when `recursive` is set.
        for ds, revision in itertools.chain(
                [(refds, None)],
                refds.subdatasets(
                    path=path,
                    state='present',
                    recursive=recursive,
                    recursion_limit=recursion_limit,
                    return_type='generator',
                    result_renderer='disabled',
                    result_xfm=YieldDatasetAndRevision())
                if recursive else []):
            if ds != refds:
                saw_subds = True
            repo = ds.repo
            is_annex = isinstance(repo, AnnexRepo)
            # prepare return value
            res = get_status_dict('update', ds=ds, logger=lgr,
                                  refds=refds.path)

            # Both flags are falsy for the reference dataset, because its
            # `revision` is None.
            follow_parent = revision and follow.startswith("parentds")
            follow_parent_lazy = revision and follow == "parentds-lazy"
            if (follow_parent_lazy
                    and repo.get_hexsha(repo.get_corresponding_branch())
                    == revision):
                res["message"] = (
                    "Dataset already at commit registered in parent: %s",
                    repo.path)
                res["status"] = "notneeded"
                yield res
                continue

            # Subdatasets (those with a registered revision) follow
            # `how_subds`; the reference dataset follows `how`.
            how_curr = how_subds if revision else how
            # get all remotes which have references (would exclude
            # special remotes)
            remotes = repo.get_remotes(
                **({'exclude_special_remotes': True} if is_annex else {}))
            if not remotes and not sibling:
                res['message'] = (
                    "No siblings known to dataset at %s\nSkipping",
                    repo.path)
                res['status'] = 'notneeded'
                yield res
                continue
            curr_branch = repo.get_active_branch()
            tracking_remote = None
            if not sibling and len(remotes) == 1:
                # there is only one remote, must be this one
                sibling_ = remotes[0]
            elif not sibling:
                # nothing given, look for tracking branch
                tracking_remote = repo.get_tracking_branch(
                    branch=curr_branch, remote_only=True)[0]
                sibling_ = tracking_remote
            else:
                sibling_ = sibling
            if sibling_ and sibling_ not in remotes:
                res['message'] = ("'%s' not known to dataset %s\nSkipping",
                                  sibling_, repo.path)
                res['status'] = 'impossible'
                yield res
                continue
            # Without a resolved sibling a plain fetch can still cover all
            # remotes, but anything beyond fetching needs a single source.
            if not sibling_ and len(remotes) > 1 and how_curr:
                lgr.debug("Found multiple siblings:\n%s", remotes)
                res['status'] = 'impossible'
                res['message'] = "Multiple siblings, please specify from which to update."
                yield res
                continue
            lgr.info("Fetching updates for %s", ds)
            # fetch remote
            fetch_kwargs = dict(
                # test against user-provided value!
                remote=None if sibling is None else sibling_,
                all_=sibling is None,
                git_options=[
                    # required to not trip over submodules that were removed in
                    # the origin clone
                    "--no-recurse-submodules",
                    # prune to not accumulate a mess over time
                    "--prune"])
            # In lazy mode the fetch is skipped when the registered revision
            # is already available locally.
            if not (follow_parent_lazy and repo.commit_exists(revision)):
                try:
                    repo.fetch(**fetch_kwargs)
                except CommandError as exc:
                    ce = CapturedException(exc)
                    yield get_status_dict(
                        status="error",
                        message=("Fetch failed: %s", ce),
                        exception=ce,
                        **res,
                    )
                    continue

            # NOTE reevaluate ds.repo again, as it might have been converted
            # from a GitRepo to an AnnexRepo
            repo = ds.repo

            # The regular fetch may not have brought in the registered
            # revision; try to fetch that exact revision from the sibling.
            if follow_parent and not repo.commit_exists(revision):
                if sibling_:
                    try:
                        lgr.debug("Fetching revision %s directly for %s",
                                  revision, repo)
                        repo.fetch(remote=sibling_, refspec=revision,
                                   git_options=["--recurse-submodules=no"])
                    except CommandError as exc:
                        ce = CapturedException(exc)
                        yield dict(
                            res,
                            status="impossible",
                            message=(
                                "Attempt to fetch %s from %s failed: %s",
                                revision, sibling_, ce),
                            exception=ce)
                        continue
                else:
                    yield dict(res,
                               status="impossible",
                               message=("Need to fetch %s directly "
                                        "but single sibling not resolved",
                                        revision))
                    continue

            saw_update_failure = False
            if how_curr:
                if follow_parent:
                    target = revision
                else:
                    target = _choose_update_target(
                        repo, curr_branch, sibling_, tracking_remote)

                adjusted = is_annex and repo.is_managed_branch(curr_branch)
                if adjusted:
                    if follow_parent:
                        yield dict(
                            res,
                            status="impossible",
                            message=("follow='parentds' is incompatible "
                                     "with adjusted branches"))
                        continue
                    if how_curr != "merge":
                        yield dict(
                            res,
                            status="impossible",
                            message=("Updating via '%s' is incompatible "
                                     "with adjusted branches",
                                     how_curr))
                        continue

                update_fn = _choose_update_fn(
                    repo, how_curr, is_annex=is_annex, adjusted=adjusted)

                fn_opts = ["--ff-only"] if how_curr == "ff-only" else None
                # Every update function except `_annex_sync` is only invoked
                # with a resolved target.
                if update_fn is not _annex_sync:
                    if target is None:
                        yield dict(res,
                                   status="impossible",
                                   message="Could not determine update target")
                        continue

                if is_annex and reobtain_data:
                    update_fn = _reobtain(ds, update_fn)

                for ures in update_fn(repo, sibling_, target, opts=fn_opts):
                    # NOTE: Ideally the "merge" action would also be prefixed
                    # with "update.", but a plain "merge" is used for backward
                    # compatibility.
                    if ures["status"] != "ok" and (
                            ures["action"] == "merge" or
                            ures["action"].startswith("update.")):
                        saw_update_failure = True
                    yield dict(res, **ures)

            if saw_update_failure:
                update_failures.add(ds)
                res['status'] = 'error'
                res['message'] = ("Update of %s failed", target)
            else:
                res['status'] = 'ok'
                save_paths.append(ds.path)
            yield res
        # we need to save updated states only if merge was requested -- otherwise
        # it was a pure fetch
        # NOTE(review): `how_curr` holds the value assigned in the last loop
        # iteration that got past the lazy-parent shortcut -- confirm this is
        # the intended condition when `how` and `how_subds` differ.
        if how_curr and recursive:
            yield from _save_after_update(
                refds, save_paths, update_failures, path, saw_subds)
action='append', doc="""limit to the subdatasets containing the given path. If a root path of a subdataset is given, the last considered dataset will be the subdataset itself.[CMD: This option can be given multiple times CMD][PY: Can be a list with multiple paths PY], in which case datasets that contain any of the given paths will be considered.""", constraints=EnsureStr() | EnsureNone()) fulfilled = Parameter(args=("--fulfilled", ), doc="""DEPRECATED: use [CMD: --state CMD][PY: `state` PY] instead. If given, must be a boolean flag indicating whether to consider either only locally present or absent datasets. By default all subdatasets are considered regardless of their status.""", constraints=EnsureBool() | EnsureNone()) dataset_state = Parameter( args=("--state", ), doc="""indicate which (sub)datasets to consider: either only locally present, absent, or any of those two kinds. """, # yoh: intentionally left out the description of default since might be # command specific constraints=EnsureChoice('present', 'absent', 'any')) shared_access_opt = Parameter( args=('--shared-access', ), metavar='MODE', doc="""configure shared access to a dataset, see `git init --shared` documentation for complete details on the supported scenarios. Possible