Beispiel #1
0
def format_command(dset, command, **kwds):
    """Fill in the placeholders of `command`.

    Parameters
    ----------
    dset : Dataset
        Its "datalad.run.substitutions" configuration items are used as
        values for any placeholder not already supplied through `kwds`.
    command : str or list
        Command to format. It is passed through `normalize_command` first.

    `kwds` is handed to the `format` call. The `inputs` and `outputs` values
    are wrapped in GlobbedPaths (using `kwds["pwd"]`, if present) unless they
    already are GlobbedPaths instances. They are then replaced by the list of
    their shell-quoted expansion, obtained with ``dot=False``.

    Returns
    -------
    formatted command (str)
    """
    command = normalize_command(command)
    sfmt = SequenceFormatter()

    config_items = dset.config.items("datalad.run.substitutions")
    for cfg_key, cfg_value in config_items:
        placeholder = cfg_key.replace("datalad.run.substitutions.", "")
        # Explicitly passed keywords take precedence over the configuration.
        kwds.setdefault(placeholder, cfg_value)

    for io_name in ("inputs", "outputs"):
        paths = kwds.pop(io_name, None)
        if not isinstance(paths, GlobbedPaths):
            paths = GlobbedPaths(paths, pwd=kwds.get("pwd"))
        kwds[io_name] = [shlex_quote(p) for p in paths.expand(dot=False)]

    return sfmt.format(command, **kwds)
Beispiel #2
0
def test_globbedpaths_get_sub_patterns():
    """Check `GlobbedPaths._get_sub_patterns` against a table of patterns."""
    gp = GlobbedPaths([], "doesn't matter")
    sep = op.sep
    join = op.join

    # Without a glob pattern in the directory component there are no
    # sub-patterns.
    without_sub_patterns = [
        ("", []),
        ("nodir", []),
        (join("nomagic", "path"), []),
        (join("nomagic", "path*"), []),
    ]
    # Sub-patterns are built from the leading path by successively dropping
    # the right-most component.
    with_sub_patterns = [
        (join("s*", "path"), ["s*" + sep]),
        (join("s", "ss*", "path"), [join("s", "ss*") + sep]),
        (join("s", "ss*", "path*"), [join("s", "ss*") + sep]),
        (join("s", "ss*" + sep), []),
        (join("s*", "ss", "path*"),
         [join("s*", "ss") + sep, "s*" + sep]),
        (join("s?", "ss", "sss*", "path*"),
         [join("s?", "ss", "sss*") + sep,
          join("s?", "ss") + sep,
          "s?" + sep]),
    ]

    for pattern, sub_patterns in without_sub_patterns + with_sub_patterns:
        eq_(gp._get_sub_patterns(pattern), sub_patterns)
Beispiel #3
0
def test_globbedpaths_get_sub_patterns():
    """Verify the sub-patterns `GlobbedPaths` derives from a glob pattern."""
    gp = GlobbedPaths([], "doesn't matter")

    def as_dir(*components):
        # Join the components and terminate the result with a separator.
        return op.join(*components) + op.sep

    cases = []
    # Patterns whose directory component has no glob characters yield no
    # sub-patterns.
    cases.append(("", []))
    cases.append(("nodir", []))
    cases.append((op.join("nomagic", "path"), []))
    cases.append((op.join("nomagic", "path*"), []))
    # Otherwise the leading path is used, dropping the right-most component
    # one step at a time.
    cases.append((op.join("s*", "path"), [as_dir("s*")]))
    cases.append((op.join("s", "ss*", "path"), [as_dir("s", "ss*")]))
    cases.append((op.join("s", "ss*", "path*"), [as_dir("s", "ss*")]))
    cases.append((op.join("s", "ss*" + op.sep), []))
    cases.append((op.join("s*", "ss", "path*"),
                  [as_dir("s*", "ss"), as_dir("s*")]))
    cases.append((op.join("s?", "ss", "sss*", "path*"),
                  [as_dir("s?", "ss", "sss*"),
                   as_dir("s?", "ss"),
                   as_dir("s?")]))

    for pat, expected in cases:
        eq_(gp._get_sub_patterns(pat), expected)
Beispiel #4
0
def test_globbedpaths_partial_matches(path):
    """Check `partial_hits` and `expand(include_partial=True)` under `path`."""
    subdir_pattern = op.join("?dir", "*.txt")
    txt_hits = ["1.txt", "3.txt"]
    dir_hits = [name + op.sep for name in ("adir", "bdir")]

    globbed = GlobbedPaths([subdir_pattern, "*.txt"], pwd=path)
    eq_(globbed.expand_strict(), txt_hits)
    eq_(globbed.partial_hits, dir_hits)
    eq_(globbed.expand(include_partial=True), dir_hits + txt_hits)

    # On a fresh instance, reading the property triggers the expansion.
    globbed = GlobbedPaths([subdir_pattern], pwd=path)
    eq_(globbed.partial_hits, dir_hits)
Beispiel #5
0
def test_globbedpaths(path):
    """Exercise `GlobbedPaths` expansion relative to `path` and its subdir."""
    dotdir = op.curdir + op.sep
    pardir = op.pardir + op.sep
    subdir_path = op.join(path, "subdir")

    def check_expansion(cases, pwd):
        # Relative and full expansion must both match the expected set.
        for patterns, expected in cases:
            gp = GlobbedPaths(patterns, pwd=pwd)
            eq_(set(gp.expand()), expected)
            eq_(set(gp.expand(full=True)),
                {op.join(pwd, p) for p in expected})

    top_level_cases = [
        (["1.txt", "2.dat"], {"1.txt", "2.dat"}),
        ([dotdir + "1.txt", "2.dat"], {dotdir + "1.txt", "2.dat"}),
        (["*.txt", "*.dat"], {"1.txt", "2.dat", u"bβ.dat", "3.txt"}),
        ([dotdir + "*.txt", "*.dat"],
         {dotdir + "1.txt", "2.dat", u"bβ.dat", dotdir + "3.txt"}),
        (["subdir/*.txt"], {"subdir/1.txt", "subdir/2.txt"}),
        ([dotdir + "subdir/*.txt"],
         {dotdir + "subdir/1.txt", dotdir + "subdir/2.txt"}),
        (["*.txt"], {"1.txt", "3.txt"}),
    ]
    check_expansion(top_level_cases, path)

    subdir_cases = [
        (["*.txt"], {"1.txt", "2.txt"}),
        ([dotdir + "*.txt"], {dotdir + "1.txt", dotdir + "2.txt"}),
        ([pardir + "*.txt"], {pardir + "1.txt", pardir + "3.txt"}),
        ([dotdir + pardir + "*.txt"],
         {dotdir + pardir + "1.txt", dotdir + pardir + "3.txt"}),
        (["subdir/"], {"subdir/"}),
    ]
    check_expansion(subdir_cases, subdir_path)

    dat_hits = ["2.dat", u"bβ.dat"]

    # A pattern given as a full path is still reported relative to pwd.
    full_pattern = op.join(path, "*.dat")
    gp = GlobbedPaths([full_pattern], pwd=path)
    eq_(gp.expand(), dat_hits)

    # "." is handled specially: kept by default, dropped with dot=False.
    gp = GlobbedPaths([".", "*.dat"], pwd=path)
    eq_(set(gp.expand()), {"2.dat", u"bβ.dat", "."})
    eq_(gp.expand(dot=False), dat_hits)
    gp = GlobbedPaths(["."], pwd=path, expand=False)
    eq_(gp.expand(), ["."])
    eq_(gp.paths, ["."])

    # The hits of each pattern come back sorted, whatever order glob used.
    fake_glob = {"z": "z", "a": ["x", "d", "b"]}
    with patch('glob.glob', fake_glob.get):
        gp = GlobbedPaths(["z", "a"])
        eq_(gp.expand(), ["z", "b", "d", "x"])

    # Whether the `paths` property is glob-expanded depends on `expand`.
    paths_by_expand = [(True, dat_hits), (False, ["*.dat"])]
    for expand, expected in paths_by_expand:
        gp = GlobbedPaths(["*.dat"], pwd=path, expand=expand)
        eq_(gp.paths, expected)

    # A pattern without any match is reported in the debug log.
    with swallow_logs(new_level=logging.DEBUG) as cml:
        GlobbedPaths(["not here"], pwd=path).expand()
        assert_in("No matching files found for 'not here'", cml.out)
Beispiel #6
0
def run_command(cmd,
                dataset=None,
                inputs=None,
                outputs=None,
                expand=None,
                explicit=False,
                message=None,
                sidecar=None,
                extra_info=None,
                rerun_info=None,
                extra_inputs=None,
                rerun_outputs=None,
                inject=False):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    This is a generator. If `cmd` is empty, a warning is logged and nothing is
    yielded. If the dataset is dirty (and neither `explicit`, `rerun_info` nor
    `inject` is set), or if the command contains an unrecognized placeholder,
    a single 'impossible' result is yielded and the generator ends.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try to
        use fairly specific key names and are encouraged to nest fields under a
        top-level "namespace" key (e.g., the project or extension name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.

    Yields
    ------
    Result records for the run.

    Raises
    ------
    The exception object returned by `_execute_command`, when the command
    finished with a non-zero exit code and this is not a rerun.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    # On a rerun, reuse the working directory stored in the previous record.
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(dataset,
                         check_installed=True,
                         purpose='tracking outcomes of a command')
    ds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        # MIH: is_dirty() is gone, but status() can do all of the above!
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required to detect changes from command; '
                    'use `datalad status` to inspect unsaved changes'))
            return

    cmd = normalize_command(cmd)

    # `expand` decides which groups get expand=True: "inputs", "outputs" or
    # "both"; any other value (including None) leaves both at False.
    inputs = GlobbedPaths(inputs, pwd=pwd, expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(
        extra_inputs,
        pwd=pwd,
        # Follow same expansion rules as `inputs`.
        expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs,
                           pwd=pwd,
                           expand=expand in ["outputs", "both"])

    # ATTN: For correct path handling, all dataset commands call should be
    # unbound. They should (1) receive a string dataset argument, (2) receive
    # relative paths, and (3) happen within a chpwd(pwd) context.
    if not inject:
        with chpwd(pwd):
            # Results from the preparation helpers are passed on to the
            # caller as they arrive.
            for res in prepare_inputs(ds_path, inputs, extra_inputs):
                yield res

            if outputs:
                for res in _install_and_reglob(ds_path, outputs):
                    yield res
                for res in _unlock_or_remove(ds_path, outputs.expand()):
                    yield res

            if rerun_outputs is not None:
                for res in _unlock_or_remove(ds_path, rerun_outputs):
                    yield res
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    # An unknown placeholder surfaces as a KeyError from formatting; it is
    # reported as an 'impossible' result instead of being raised.
    try:
        cmd_expanded = format_command(
            ds,
            cmd,
            pwd=pwd,
            dspath=ds_path,
            # Check if the command contains "{tmpdir}" to avoid creating an
            # unnecessary temporary directory in most but not all cases.
            tmpdir=mkdtemp(prefix="datalad-run-") if "{tmpdir}" in cmd else "",
            inputs=inputs,
            outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s', exc))
        return

    if not inject:
        # For a rerun, the previously recorded exit code (0 if none was
        # recorded) is passed as the expected one.
        cmd_exitcode, exc = _execute_command(
            cmd_expanded,
            pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    # Note that the record stores the unexpanded `cmd`, not `cmd_expanded`.
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        # Applied last, so caller-provided keys override the standard ones.
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    # An explicit `sidecar` argument wins; otherwise consult the configuration.
    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar',
                                    default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory',
                                   default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds_path, record_dir, record_id)
        # An existing file with this ID is left untouched.
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    # With a sidecar only the quoted record ID goes into the message body;
    # without one, the complete JSON record is embedded.
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    # In explicit mode only the expanded outputs are saved; otherwise no path
    # restriction (None) is passed to the save call.
    outputs_to_save = outputs.expand() if explicit else None
    # Skip saving only when explicit mode ended up with an empty output list.
    do_save = outputs_to_save is None or outputs_to_save
    if not rerun_info and cmd_exitcode:
        # The command failed outside of a rerun: nothing is saved. The commit
        # message is stored in COMMIT_EDITMSG so the user can save manually,
        # then the exception obtained from `_execute_command` is raised.
        if do_save:
            repo = ds.repo
            msg_path = relpath(opj(str(repo.dot_git), "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info(
                "The command had a non-zero exit code. "
                "If this is expected, you can save the changes with "
                "'datalad save -d . -r -F %s'", msg_path)
        raise exc
    elif do_save:
        with chpwd(pwd):
            for r in Save.__call__(dataset=ds_path,
                                   path=outputs_to_save,
                                   recursive=True,
                                   message=msg,
                                   return_type='generator'):
                yield r
Beispiel #7
0
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                extra_info=None,
                rerun_info=None,
                extra_inputs=None,
                rerun_outputs=None,
                inject=False,
                saver=_save_outputs):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    This is a generator. If `cmd` is empty, a warning is logged and nothing is
    yielded. If the dataset is dirty (and neither `explicit`, `rerun_info` nor
    `inject` is set), or if the command contains an unrecognized placeholder,
    a single 'impossible' result is yielded and the generator ends.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try to
        use fairly specific key names and are encouraged to nest fields under a
        top-level "namespace" key (e.g., the project or extension name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.
    saver : callable, optional
        Must take a dataset instance, a list of paths to save, and a
        message string as arguments and must record any changes done
        to any content matching an entry in the path list. Must yield
        result dictionaries as a generator.

    Yields
    ------
    Result records for the run.

    Raises
    ------
    The exception object returned by `_execute_command`, when the command
    finished with a non-zero exit code and this is not a rerun.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    # On a rerun, reuse the working directory stored in the previous record.
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=('unsaved modifications present, '
                         'cannot detect changes by command'))
            return

    cmd = normalize_command(cmd)

    # `expand` decides which groups get expand=True: "inputs", "outputs" or
    # "both"; any other value (including None) leaves both at False.
    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(extra_inputs, pwd=pwd,
                                # Follow same expansion rules as `inputs`.
                                expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])

    if not inject:
        # Results from the preparation helpers are passed on to the caller as
        # they arrive.
        for res in prepare_inputs(ds, inputs, extra_inputs):
            yield res

        if outputs:
            for res in _install_and_reglob(ds, outputs):
                yield res
            for res in _unlock_or_remove(ds, outputs.expand(full=True)):
                yield res

        if rerun_outputs is not None:
            # These are files we need to unlock/remove for a rerun that aren't
            # included in the explicit outputs. Unlike inputs/outputs, these are
            # full paths, so we can pass them directly to unlock.
            for res in _unlock_or_remove(ds, rerun_outputs):
                yield res
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    # An unknown placeholder surfaces as a KeyError from formatting; it is
    # reported as an 'impossible' result instead of being raised.
    try:
        cmd_expanded = format_command(ds, cmd,
                                      pwd=pwd,
                                      dspath=ds.path,
                                      inputs=inputs,
                                      outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    if not inject:
        # For a rerun, the previously recorded exit code (0 if none was
        # recorded) is passed as the expected one.
        cmd_exitcode, exc = _execute_command(
            cmd_expanded, pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)


    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    # Note that the record stores the unexpanded `cmd`, not `cmd_expanded`.
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        # Applied last, so caller-provided keys override the standard ones.
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    # An explicit `sidecar` argument wins; otherwise consult the configuration.
    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar', default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar


    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory', default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        # An existing file with this ID is left untouched.
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    # With a sidecar only the quoted record ID goes into the message body;
    # without one, the complete JSON record is embedded.
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    # In explicit mode only the expanded outputs are saved (possibly none, in
    # which case saving is skipped); otherwise '.' is handed to the saver.
    outputs_to_save = outputs.expand(full=True) if explicit else '.'
    if not rerun_info and cmd_exitcode:
        # The command failed outside of a rerun: nothing is saved. The commit
        # message is stored in COMMIT_EDITMSG so the user can save manually,
        # then the exception obtained from `_execute_command` is raised.
        if outputs_to_save:
            msg_path = relpath(opj(ds.repo.path, ds.repo.get_git_dir(ds.repo),
                                   "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info("The command had a non-zero exit code. "
                     "If this is expected, you can save the changes with "
                     "'datalad add -d . -r -F %s .'",
                     msg_path)
        raise exc
    elif outputs_to_save:
        for r in saver(ds, outputs_to_save, msg):
            yield r
Beispiel #8
0
def test_globbedpaths_misses(path):
    """Check how patterns without a match are tracked and reported."""
    hits = ["1.txt", "3.txt", "2.dat"]

    # Each case: (patterns, strict expansion, misses or None to skip that
    # check, expansion including misses).
    cases = [
        (["amiss"], [], ["amiss"], ["amiss"]),
        # miss at beginning
        (["amiss", "*.txt", "*.dat"],
         hits, None,
         ["amiss", "1.txt", "3.txt", "2.dat"]),
        # miss in middle
        (["*.txt", "amiss", "*.dat"],
         hits, ["amiss"],
         ["1.txt", "3.txt", "amiss", "2.dat"]),
        # miss at end
        (["*.txt", "*.dat", "amiss"],
         hits, ["amiss"],
         ["1.txt", "3.txt", "2.dat", "amiss"]),
        # miss at beginning, middle, and end
        (["amiss1", "amiss2", "*.txt", "amiss3", "*.dat", "amiss4"],
         hits, ["amiss1", "amiss2", "amiss3", "amiss4"],
         ["amiss1", "amiss2", "1.txt", "3.txt", "amiss3", "2.dat", "amiss4"]),
    ]
    for patterns, strict, misses, with_misses in cases:
        globbed = GlobbedPaths(patterns, pwd=path)
        eq_(globbed.expand_strict(), strict)
        if misses is not None:
            eq_(globbed.misses, misses)
        eq_(globbed.expand(include_misses=True), with_misses)

    # On a fresh instance, reading the property triggers the expansion.
    globbed = GlobbedPaths(["amiss"], pwd=path)
    eq_(globbed.misses, ["amiss"])
Beispiel #9
0
def test_globbedpaths(path):
    """Exercise `GlobbedPaths` expansion relative to `path` and its subdir."""
    dotdir = op.curdir + op.sep
    dat_files = ["2.dat", u"bβ.dat"]

    in_top_dir = [
        (["1.txt", "2.dat"], {"1.txt", "2.dat"}),
        ([dotdir + "1.txt", "2.dat"], {dotdir + "1.txt", "2.dat"}),
        (["*.txt", "*.dat"], {"1.txt", "2.dat", u"bβ.dat", "3.txt"}),
        ([dotdir + "*.txt", "*.dat"],
         {dotdir + "1.txt", "2.dat", u"bβ.dat", dotdir + "3.txt"}),
        (["subdir/*.txt"], {"subdir/1.txt", "subdir/2.txt"}),
        ([dotdir + "subdir/*.txt"],
         {dotdir + "subdir/1.txt", dotdir + "subdir/2.txt"}),
        (["*.txt"], {"1.txt", "3.txt"}),
    ]
    for patterns, expected in in_top_dir:
        globbed = GlobbedPaths(patterns, pwd=path)
        eq_(set(globbed.expand()), expected)
        expected_full = {op.join(path, p) for p in expected}
        eq_(set(globbed.expand(full=True)), expected_full)

    pardir = op.pardir + op.sep
    subdir_path = op.join(path, "subdir")
    in_subdir = [
        (["*.txt"], {"1.txt", "2.txt"}),
        ([dotdir + "*.txt"], {dotdir + "1.txt", dotdir + "2.txt"}),
        ([pardir + "*.txt"], {pardir + "1.txt", pardir + "3.txt"}),
        ([dotdir + pardir + "*.txt"],
         {dotdir + pardir + "1.txt", dotdir + pardir + "3.txt"}),
        (["subdir/"], {"subdir/"}),
    ]
    for patterns, expected in in_subdir:
        globbed = GlobbedPaths(patterns, pwd=subdir_path)
        eq_(set(globbed.expand()), expected)
        expected_full = {op.join(subdir_path, p) for p in expected}
        eq_(set(globbed.expand(full=True)), expected_full)

    # A pattern given as a full path is still reported relative to pwd.
    globbed = GlobbedPaths([op.join(path, "*.dat")], pwd=path)
    eq_(globbed.expand(), dat_files)

    # "." is handled specially: kept by default, dropped with dot=False.
    globbed = GlobbedPaths([".", "*.dat"], pwd=path)
    eq_(set(globbed.expand()), {"2.dat", u"bβ.dat", "."})
    eq_(globbed.expand(dot=False), dat_files)
    globbed = GlobbedPaths(["."], pwd=path, expand=False)
    eq_(globbed.expand(), ["."])
    eq_(globbed.paths, ["."])

    # The hits of each pattern come back sorted, whatever order glob used.
    canned_glob = {"z": "z", "a": ["x", "d", "b"]}
    with patch('glob.glob', canned_glob.get):
        globbed = GlobbedPaths(["z", "a"])
        eq_(globbed.expand(), ["z", "b", "d", "x"])

    # Whether the `paths` property is glob-expanded depends on `expand`.
    for expand, expected in ((True, dat_files), (False, ["*.dat"])):
        globbed = GlobbedPaths(["*.dat"], pwd=path, expand=expand)
        eq_(globbed.paths, expected)

    # A pattern without any match is reported in the debug log.
    with swallow_logs(new_level=logging.DEBUG) as cml:
        GlobbedPaths(["not here"], pwd=path).expand()
        assert_in("No matching files found for 'not here'", cml.out)
Beispiel #10
0
def run_command(cmd,
                dataset=None,
                inputs=None,
                outputs=None,
                expand=None,
                assume_ready=None,
                explicit=False,
                message=None,
                sidecar=None,
                dry_run=False,
                jobs=None,
                extra_info=None,
                rerun_info=None,
                extra_inputs=None,
                rerun_outputs=None,
                inject=False,
                parametric_record=False,
                remove_outputs=False,
                skip_dirtycheck=False,
                yield_expanded=None):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try to
        use fairly specific key names and are encouraged to nest fields under a
        top-level "namespace" key (e.g., the project or extension name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.
    parametric_record : bool, optional
        If enabled, substitution placeholders in the input/output specification
        are retained verbatim in the run record. This enables using a single
        run record for multiple different re-runs via individual
        parametrization.
    remove_outputs : bool, optional
        If enabled, all declared outputs will be removed prior to command
        execution, except for paths that are also declared inputs.
    skip_dirtycheck : bool, optional
        If enabled, a check for dataset modifications is unconditionally
        disabled, even if other parameters would indicate otherwise. This
        can be used by callers that already performed analogous verifications
        to avoid duplicate processing.
    yield_expanded : {'inputs', 'outputs', 'both'}, optional
        Include an 'expanded_%s' item into the run result with the expanded
        list of paths matching the inputs and/or outputs specification,
        respectively.


    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    # normalize all IO specifications to lists, keyed by their role
    specs = {
        k: ensure_list(v)
        for k, v in (('inputs', inputs), ('extra_inputs', extra_inputs),
                     ('outputs', outputs))
    }

    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = op.normpath(op.join(dataset.path, rel_pwd))
        rel_pwd = op.relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(dataset,
                         check_installed=True,
                         purpose='track command outcomes')
    ds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    # skip for callers that already take care of this
    if not (skip_dirtycheck or rerun_info or inject):
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        # MIH: is_dirty() is gone, but status() can do all of the above!
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required to detect changes from command; '
                    'use `datalad status` to inspect unsaved changes'))
            return

    # everything below expects the string-form of the command
    cmd = normalize_command(cmd)
    # pull substitutions from config
    cmd_fmt_kwargs = _get_substitutions(ds)
    # amend with unexpanded dependency/output specifications, which might
    # themselves contain substitution placeholders
    for n, val in specs.items():
        if val:
            cmd_fmt_kwargs[n] = val

    # apply the substitution to the IO specs
    expanded_specs = {
        k: _format_iospecs(v, **cmd_fmt_kwargs)
        for k, v in specs.items()
    }
    # try-except to catch expansion issues in _format_iospecs() which
    # expands placeholders in dependency/output specification before
    # globbing
    try:
        globbed = {
            k: GlobbedPaths(
                v,
                pwd=pwd,
                expand=expand in (
                    # extra_inputs follow same expansion rules as `inputs`.
                    ["both"] +
                    (['outputs'] if k == 'outputs' else ['inputs'])))
            for k, v in expanded_specs.items()
        }
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('input/output specification has an unrecognized '
                     'placeholder: %s', exc))
        return

    if not (inject or dry_run):
        yield from _prep_worktree(ds_path,
                                  pwd,
                                  globbed,
                                  assume_ready=assume_ready,
                                  remove_outputs=remove_outputs,
                                  rerun_outputs=rerun_outputs,
                                  # forward the caller's `jobs` request rather
                                  # than discarding it; the same value is
                                  # handed to Save below
                                  jobs=jobs)
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    # prepare command formatting by extending the set of configurable
    # substitutions with the essential components
    cmd_fmt_kwargs.update(
        pwd=pwd,
        dspath=ds_path,
        # Check if the command contains "{tmpdir}" to avoid creating an
        # unnecessary temporary directory in most but not all cases.
        tmpdir=mkdtemp(prefix="datalad-run-") if "{tmpdir}" in cmd else "",
        # the following override any matching non-glob substitution
        # values
        inputs=globbed['inputs'],
        outputs=globbed['outputs'],
    )
    try:
        cmd_expanded = format_command(ds, cmd, **cmd_fmt_kwargs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s', exc))
        return

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        # rerun does not handle any prop being None, hence all
        # the `or/else []`
        'chain': rerun_info["chain"] if rerun_info else [],
    }
    # for all following we need to make sure that the raw
    # specifications, incl. any placeholders make it into
    # the run-record to enable "parametric" re-runs
    # ...except when expansion was requested
    for k, v in specs.items():
        run_info[k] = globbed[k].paths \
            if expand in ["both"] + (
                ['outputs'] if k == 'outputs' else ['inputs']) \
            else (v if parametric_record
                  else expanded_specs[k]) or []

    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        # caller-provided values take precedence over the standard keys
        run_info.update(extra_info)

    if dry_run:
        # report what would be executed, then stop without running or saving
        yield get_status_dict(
            "run [dry-run]",
            ds=ds,
            status="ok",
            message="Dry run",
            run_info=run_info,
            dry_run_info=dict(
                cmd_expanded=cmd_expanded,
                pwd_full=pwd,
                **{k: globbed[k].expand()
                   for k in ('inputs', 'outputs')},
            ))
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(cmd_expanded, pwd)
        run_info['exit'] = cmd_exitcode

    # Re-glob to capture any new outputs.
    #
    # TODO: If a warning or error is desired when an --output pattern doesn't
    # have a match, this would be the spot to do it.
    if explicit or expand in ["outputs", "both"]:
        # also for explicit mode we have to re-glob to be able to save all
        # matching outputs
        globbed['outputs'].expand(refresh=True)
        if expand in ["outputs", "both"]:
            run_info["outputs"] = globbed['outputs'].paths

    # create the run record, either as a string, or written to a file
    # depending on the config/request
    record, record_path = _create_record(run_info, sidecar, ds)

    # abbreviate version of the command for illustrative purposes
    cmd_shorty = _format_cmd_shorty(cmd_expanded)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(message if message is not None else cmd_shorty,
                     '"{}"'.format(record) if record_path else record)

    # in explicit mode only the declared outputs (plus a sidecar record, if
    # any) are saved; otherwise `None` leaves the path selection to Save
    outputs_to_save = globbed['outputs'].expand_strict() if explicit else None
    if outputs_to_save is not None and record_path:
        outputs_to_save.append(record_path)
    do_save = outputs_to_save is None or outputs_to_save
    msg_path = None
    if not rerun_info and cmd_exitcode:
        if do_save:
            repo = ds.repo
            # must record path to be relative to ds.path to meet
            # result record semantics (think symlink resolution, etc)
            msg_path = ds.pathobj / \
                repo.dot_git.relative_to(repo.pathobj) / "COMMIT_EDITMSG"
            msg_path.write_text(msg)

    # on rerun, a non-zero exit that matches the recorded one is not an error
    expected_exit = rerun_info.get("exit", 0) if rerun_info else None
    if cmd_exitcode and expected_exit != cmd_exitcode:
        status = "error"
    else:
        status = "ok"

    run_result = get_status_dict(
        "run",
        ds=ds,
        status=status,
        # use the abbrev. command as the message to give immediate clarity what
        # completed/errors in the generic result rendering
        message=cmd_shorty,
        run_info=run_info,
        # use the same key that `get_status_dict()` would/will use
        # to record the exit code in case of an exception
        exit_code=cmd_exitcode,
        exception=exc,
        # Provide msg_path and explicit outputs so that, under
        # on_failure='stop', callers can react to a failure and then call
        # save().
        msg_path=str(msg_path) if msg_path else None,
    )
    if record_path:
        # when the record is in a sidecar file, report its ID
        run_result['record_id'] = record
    for s in ('inputs', 'outputs'):
        # this enables callers to further inspect the outputs without
        # performing globbing again. Together with remove_outputs=True
        # these would be guaranteed to be the outcome of the executed
        # command. in contrast to `outputs_to_save` this does not
        # include aux file, such as the run record sidecar file.
        # calling .expand_strict() again is largely reporting cached
        # information
        # (format: relative paths)
        if yield_expanded in (s, 'both'):
            run_result[f'expanded_{s}'] = globbed[s].expand_strict()
    yield run_result

    if do_save:
        with chpwd(pwd):
            yield from Save.__call__(
                dataset=ds_path,
                path=outputs_to_save,
                recursive=True,
                message=msg,
                jobs=jobs,
                return_type='generator',
                # we want this command and its parameterization to be in full
                # control about the rendering of results, hence we must turn
                # off internal rendering
                result_renderer='disabled',
                on_failure='ignore')