def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                extra_info=None, rerun_info=None, extra_inputs=None,
                rerun_outputs=None, inject=False, saver=_save_outputs):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try
        to use fairly specific key names and are encouraged to nest fields
        under a top-level "namespace" key (e.g., the project or extension
        name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.
    saver : callable, optional
        Must take a dataset instance, a list of paths to save, and a message
        string as arguments and must record any changes done to any content
        matching an entry in the path list. Must yield result dictionaries as
        a generator.

    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=('unsaved modifications present, '
                         'cannot detect changes by command'))
            return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(extra_inputs, pwd=pwd,
                                # Follow same expansion rules as `inputs`.
                                expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])

    if not inject:
        for res in prepare_inputs(ds, inputs, extra_inputs):
            yield res

        if outputs:
            for res in _install_and_reglob(ds, outputs):
                yield res
            for res in _unlock_or_remove(ds, outputs.expand(full=True)):
                yield res

        if rerun_outputs is not None:
            # These are files we need to unlock/remove for a rerun that aren't
            # included in the explicit outputs. Unlike inputs/outputs, these
            # are full paths, so we can pass them directly to unlock.
            for res in _unlock_or_remove(ds, rerun_outputs):
                yield res
    else:
        # If an inject=True caller wants to override the exit code, they can
        # do so in extra_info.
        cmd_exitcode = 0
        exc = None

    try:
        cmd_expanded = format_command(
            ds, cmd,
            pwd=pwd,
            dspath=ds.path,
            # Check if the command contains "{tmpdir}" to avoid creating an
            # unnecessary temporary directory in most but not all cases.
            tmpdir=mkdtemp(prefix="datalad-run-") if "{tmpdir}" in cmd else "",
            inputs=inputs,
            outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(
            cmd_expanded, pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar',
                                    default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory',
                                   default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much
            # difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    outputs_to_save = outputs.expand(full=True) if explicit else '.'
    if not rerun_info and cmd_exitcode:
        if outputs_to_save:
            msg_path = relpath(opj(ds.repo.path,
                                   ds.repo.get_git_dir(ds.repo),
                                   "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info("The command had a non-zero exit code. "
                     "If this is expected, you can save the changes with "
                     "'datalad rev-save -d . -r -F %s'",
                     msg_path)
        raise exc
    elif outputs_to_save:
        for r in saver(ds, outputs_to_save, msg):
            yield r
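
# --- Usage sketch (not part of the original code above) ---------------------
# A minimal, hypothetical illustration of how a caller might drive
# run_command() and surface its result records. The dataset path, command,
# and file names are made up; `Dataset` is assumed to be importable from
# datalad.distribution.dataset and to point at an installed, clean dataset.
def _example_run_command_usage():
    from datalad.distribution.dataset import Dataset

    ds = Dataset('/tmp/myds')  # hypothetical dataset location
    results = []
    for res in run_command('python compute.py {inputs} > {outputs}',
                           dataset=ds,
                           inputs=['data/raw.csv'],   # hypothetical input
                           outputs=['results.txt'],   # hypothetical output
                           expand='both',
                           message='recompute results'):
        # each `res` is a result dict of the kind produced by
        # get_status_dict()
        results.append((res.get('action'), res.get('status'),
                        res.get('path')))
    return results
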
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                extra_info=None, rerun_info=None, extra_inputs=None,
                rerun_outputs=None, inject=False, saver=_save_outputs):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try
        to use fairly specific key names and are encouraged to nest fields
        under a top-level "namespace" key (e.g., the project or extension
        name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.
    saver : callable, optional
        Must take a dataset instance, a list of paths to save, and a message
        string as arguments and must record any changes done to any content
        matching an entry in the path list. Must yield result dictionaries as
        a generator.

    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=('unsaved modifications present, '
                         'cannot detect changes by command'))
            return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(extra_inputs, pwd=pwd,
                                # Follow same expansion rules as `inputs`.
                                expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])

    if not inject:
        for res in prepare_inputs(ds, inputs, extra_inputs):
            yield res

        if outputs:
            for res in _install_and_reglob(ds, outputs):
                yield res
            for res in _unlock_or_remove(ds, outputs.expand(full=True)):
                yield res

        if rerun_outputs is not None:
            # These are files we need to unlock/remove for a rerun that aren't
            # included in the explicit outputs. Unlike inputs/outputs, these
            # are full paths, so we can pass them directly to unlock.
            for res in _unlock_or_remove(ds, rerun_outputs):
                yield res
    else:
        # If an inject=True caller wants to override the exit code, they can
        # do so in extra_info.
        cmd_exitcode = 0
        exc = None

    try:
        cmd_expanded = format_command(ds, cmd,
                                      pwd=pwd,
                                      dspath=ds.path,
                                      inputs=inputs,
                                      outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(
            cmd_expanded, pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar',
                                    default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory',
                                   default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much
            # difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    outputs_to_save = outputs.expand(full=True) if explicit else '.'
    if not rerun_info and cmd_exitcode:
        if outputs_to_save:
            msg_path = relpath(opj(ds.repo.path,
                                   ds.repo.get_git_dir(ds.repo),
                                   "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info("The command had a non-zero exit code. "
                     "If this is expected, you can save the changes with "
                     "'datalad add -d . -r -F %s .'",
                     msg_path)
        raise exc
    elif outputs_to_save:
        for r in saver(ds, outputs_to_save, msg):
            yield r
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                rerun_info=None, rerun_outputs=None):
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    # delayed imports
    from datalad.cmd import Runner

    lgr.debug('tracking command output underneath %s', ds)

    if not rerun_info:  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=('unsaved modifications present, '
                         'cannot detect changes by command'))
            return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    if inputs:
        for res in _install_and_reglob(ds, inputs):
            yield res
        for res in ds.get(inputs.expand(full=True), on_failure="ignore"):
            if res.get("state") == "absent":
                lgr.warning("Input does not exist: %s", res["path"])
            else:
                yield res

    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])
    if outputs:
        for res in _install_and_reglob(ds, outputs):
            yield res
        for res in _unlock_or_remove(ds, outputs.expand(full=True)):
            yield res

    if rerun_outputs is not None:
        # These are files we need to unlock/remove for a rerun that aren't
        # included in the explicit outputs. Unlike inputs/outputs, these are
        # full paths, so we can pass them directly to unlock.
        for res in _unlock_or_remove(ds, rerun_outputs):
            yield res

    sub_namespace = {
        k.replace("datalad.run.substitutions.", ""): v
        for k, v in ds.config.items("datalad.run.substitutions")
    }
    try:
        cmd_expanded = format_command(cmd,
                                      pwd=pwd,
                                      dspath=ds.path,
                                      inputs=inputs,
                                      outputs=outputs,
                                      **sub_namespace)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    # we have a clean dataset, let's run things
    exc = None
    cmd_exitcode = None
    runner = Runner(cwd=pwd)
    try:
        lgr.info("== Command start (output follows) =====")
        runner.run(
            cmd_expanded,
            # immediate output
            log_online=True,
            # not yet sure what we should do with the command output
            # IMHO `run` itself should be very silent and let the command talk
            log_stdout=False,
            log_stderr=False,
            expect_stderr=True,
            expect_fail=True,
            # TODO stdin
        )
    except CommandError as e:
        # strip our own info from the exception. The original command output
        # went to stdout/err -- we just pass the exit code on in the same way
        exc = e
        cmd_exitcode = e.code

        if rerun_info and rerun_info.get("exit", 0) != cmd_exitcode:
            # we failed in a different way during a rerun. This can easily
            # happen if we try to alter a locked file
            #
            # TODO add the ability to `git reset --hard` the dataset tree
            # on failure. We know that we started clean, so we could easily
            # go back; needs gh-1424 to be able to do it recursively
            raise exc

    lgr.info("== Command exit (modification check follows) =====")

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode if cmd_exitcode is not None else 0,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    use_sidecar = sidecar or (
        sidecar is None and
        ds.config.get('datalad.run.record-sidecar', default=False))

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory',
                                   default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much
            # difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd),
        '"{}"'.format(record_id) if use_sidecar else record)
    msg = assure_bytes(msg)

    outputs_to_save = outputs.expand(full=True) if explicit else '.'
    if not rerun_info and cmd_exitcode:
        if outputs_to_save:
            msg_path = relpath(
                opj(ds.repo.path, ds.repo.get_git_dir(ds.repo),
                    "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(msg)
            lgr.info("The command had a non-zero exit code. "
                     "If this is expected, you can save the changes with "
                     "'datalad save -r -F %s .'",
                     msg_path)
        raise exc
    elif outputs_to_save:
        for r in ds.add(outputs_to_save, recursive=True, message=msg):
            yield r
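
# --- Substitution sketch (not part of the original code above) --------------
# Hypothetical illustration of the `datalad.run.substitutions.*` mechanism
# consumed above: each config item under that namespace is stripped of its
# prefix and handed to format_command() as an extra placeholder. The config
# key, value, and command below are made up, and ConfigManager.set() with
# where='local' is assumed to be available on ds.config.
def _example_substitution_usage(ds):
    # equivalent to: git config datalad.run.substitutions.subject sub-01
    ds.config.set('datalad.run.substitutions.subject', 'sub-01',
                  where='local')
    # with that setting, a command template such as
    cmd = 'cp data/{subject}.nii out/{subject}.nii'
    # is effectively expanded by the sub_namespace logic above to
    # 'cp data/sub-01.nii out/sub-01.nii'
    return cmd.format(subject='sub-01')
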
def __call__(cmd=None,
             dataset=None,
             inputs=None,
             outputs=None,
             expand=None,
             explicit=False,
             message=None,
             sidecar=None,
             jobcfg='default',
             submit=False):
    # TODO make sure a different rel_pwd is handled properly on the remote end
    pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='preparing a remote command execution')

    try:
        cmd_expanded = format_command(ds, cmd,
                                      pwd=pwd,
                                      dspath=ds.path,
                                      inputs=inputs,
                                      outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'htcprepare',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    transfer_files_list = ['pre.sh', 'post.sh']

    # where all the submission packs live
    subroot_dir = get_submissions_dir(ds)
    subroot_dir.mkdir(parents=True, exist_ok=True)

    # location of to-be-created submission
    submission_dir = ut.Path(
        tempfile.mkdtemp(prefix='submit_', dir=text_type(subroot_dir)))
    submission = submission_dir.name[7:]

    split_cmd = shlex.split(cmd_expanded)
    # is this a singularity job?
    singularity_job = get_singularity_jobspec(split_cmd)
    if not singularity_job:
        with (submission_dir / 'runner.sh').open('wb') as f:
            f.write(
                resource_string(
                    'datalad_htcondor',
                    'resources/scripts/runner_direct.sh'))
        job_args = split_cmd
    else:
        # link the container into the submission dir
        (submission_dir / 'singularity.simg').symlink_to(
            ut.Path(singularity_job[0]).resolve())
        transfer_files_list.append('singularity.simg')
        # arguments of the job
        job_args = singularity_job[1]
        job_args.insert(0, 'singularity.simg')

        # TODO conditional on run_as_user=false
        with (submission_dir / 'runner.sh').open('wb') as f:
            f.write(
                resource_string(
                    'datalad_htcondor',
                    'resources/scripts/runner_singularity_anon.sh'))
    make_executable(submission_dir / 'runner.sh')

    # htcondor wants the log dir to exist at submit time
    # TODO ATM we only support a single job per cluster submission
    (submission_dir / 'job_0' / 'logs').mkdir(parents=True)

    # TODO make job pre/post script selection configurable
    with (submission_dir / 'pre.sh').open('wb') as f:
        f.write(
            resource_string(
                'datalad_htcondor',
                'resources/scripts/pre_posix_chirp.sh'))
    make_executable(submission_dir / 'pre.sh')

    with (submission_dir / 'post.sh').open('wb') as f:
        f.write(
            resource_string(
                'datalad_htcondor',
                'resources/scripts/post_posix.sh'))
    make_executable(submission_dir / 'post.sh')

    # API support selection (bound dataset methods and such)
    # internal import to avoid circularities
    from datalad.api import (
        rev_status as status,
    )

    inputs = GlobbedPaths(inputs, pwd=pwd)
    prepare_inputs(ds, inputs)

    # it could be that an input expression does not expand,
    # because it doesn't match anything. In such a case
    # we need to filter out such globs to not confuse
    # the status() call below that only takes real paths
    inputs = [p for p in inputs.expand(full=True) if op.lexists(p)]
    # now figure out what matches the remaining paths in the
    # entire repo and dump a list of files to transfer
    if inputs:
        with (submission_dir / 'input_files').open('w') as f:
            # TODO disable output renderer
            for p in ds.rev_status(
                    path=inputs,
                    # TODO do we really want that True? I doubt it
                    # this might pull in the world
                    recursive=False,
                    # we would have otherwise no idea
                    untracked='no',
                    result_renderer=None):
                f.write(text_type(p['path']))
                f.write(u'\0')
        transfer_files_list.append('input_files')

    if outputs:
        # write the output globs to a file for eval on the execute side
        # XXX we may not want to eval them on the remote side
        # at all, however. This would make things different
        # than with local execute, where we also just write to
        # a dataset and do not have an additional filter
        (submission_dir / 'output_globs').write_text(
            # we need a final trailing delimiter as a terminator
            u'\0'.join(outputs) + u'\0')
        transfer_files_list.append('output_globs')

    (submission_dir / 'source_dataset_location').write_text(
        text_type(ds.pathobj) + op.sep)
    transfer_files_list.append('source_dataset_location')

    with (submission_dir / 'cluster.submit').open('w') as f:
        f.write(
            submission_template.format(
                executable='runner.sh',
                # TODO if singularity_job else 'job.sh',
                transfer_files_list=','.join(
                    op.join(op.pardir, f) for f in transfer_files_list),
                **submission_defaults))
        f.write(u'\narguments = "{}"\nqueue\n'.format(
            # TODO deal with single quotes in the args
            ' '.join("'{}'".format(a) for a in job_args)))

    # dump the run command args into a file for re-use
    # when the result is merged
    # include even args that are already evaluated and
    # acted upon, to be able to convince `run` to create
    # a full run record that maybe could be re-run
    # locally
    json_py.dump(
        dict(
            cmd=cmd,
            inputs=inputs,
            outputs=outputs,
            expand=expand,
            explicit=explicit,
            message=message,
            sidecar=sidecar,
            # report the PWD too, to give `run` a chance
            # to be correct after the fact
            pwd=pwd,
        ),
        text_type(submission_dir / 'runargs.json'))

    # we use this file to inspect what state this submission is in
    (submission_dir / 'status').write_text(u'prepared')
    yield get_status_dict(
        action='htc_prepare',
        status='ok',
        refds=text_type(ds.pathobj),
        submission=submission,
        path=text_type(submission_dir),
        logger=lgr)

    if submit:
        try:
            Runner(cwd=text_type(submission_dir)).run(
                ['condor_submit', 'cluster.submit'],
                log_stdout=False,
                log_stderr=False,
                expect_stderr=True,
                expect_fail=True,
            )
            (submission_dir / 'status').write_text(u'submitted')
            yield get_status_dict(
                action='htc_submit',
                status='ok',
                submission=submission,
                refds=text_type(ds.pathobj),
                path=text_type(submission_dir),
                logger=lgr)
        except CommandError as e:
            yield get_status_dict(
                action='htc_submit',
                status='error',
                submission=submission,
                message=('condor_submit failed: %s', exc_str(e)),
                refds=text_type(ds.pathobj),
                path=text_type(submission_dir),
                logger=lgr)
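
# --- Inspection sketch (not part of the original code above) ----------------
# A small, hypothetical helper for looking at what the prepare step above
# leaves behind in a submission directory: the 'status' marker plus the files
# assembled for transfer (runner.sh, pre.sh, post.sh, cluster.submit,
# runargs.json, and optionally input_files/output_globs/singularity.simg).
# It assumes runargs.json is plain, uncompressed JSON and uses only the
# standard library; the directory layout mirrors what the code above writes.
def summarize_submission(submission_dir):
    import json
    from pathlib import Path

    subdir = Path(submission_dir)
    return {
        # single-word state marker ('prepared' or 'submitted')
        'status': (subdir / 'status').read_text(),
        # files present in the submission pack
        'files': sorted(p.name for p in subdir.iterdir() if p.is_file()),
        # the original `run`-style arguments recorded for later re-use
        'runargs': json.loads((subdir / 'runargs.json').read_text()),
    }
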