Example #1
class Init(Interface):
    """Initialize an existing dataset to track a UKBiobank participant

    A batch file for the 'ukbfetch' tool will be generated and placed into the
    dataset. By selecting the relevant data records, raw and/or preprocessed
    data will be tracked.

    After initialization the dataset will contain at least three branches:

    - incoming: to track the pristine ZIP files downloaded from UKB
    - incoming-native: to track individual files (some extracted from ZIP
      files)
    - incoming-bids: to track individual files in a layout where file names
      conform to BIDS conventions
    - master: based off of incoming-native or incoming-bids (if enabled)
      with potential manual modifications applied
    """

    _examples_ = [
        dict(
            text='Initialize a dataset in the current directory',
            code_cmd='datalad ukb-init 5874415 20227_2_0 20249_2_0',
            code_py=
            'ukb_init(participant="5874415", records=["20227_2_0", "20249_2_0"])'
        ),
        dict(
            text='Initialize a dataset in the current directory in BIDS layout',
            code_cmd='datalad ukb-init --bids 5874415 20227_2_0',
            code_py=
            'ukb_init(participant="5874415", records=["20227_2_0"], bids=True)'
        ),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the initialization on""",
            constraints=EnsureDataset() | EnsureNone()),
        participant=Parameter(
            args=('participant', ),
            metavar='PARTICIPANT-ID',
            nargs=1,
            doc="""UKBiobank participant ID to use for this dataset
            (note: these encoded IDs are unique to each
            application/project)""",
            constraints=EnsureStr()),
        records=Parameter(args=('records', ),
                          metavar='DATARECORD-ID',
                          nargs='+',
                          doc='One or more data record identifiers',
                          constraints=EnsureStr()),
        force=Parameter(args=(
            "-f",
            "--force",
        ),
                        doc="""force (re-)initialization""",
                        action='store_true'),
        bids=Parameter(
            args=('--bids', ),
            action='store_true',
            doc="""additionally maintain an incoming-bids branch with a
            BIDS-like organization."""),
    )

    @staticmethod
    @datasetmethod(name='ukb_init')
    @eval_results
    def __call__(participant, records, force=False, bids=False, dataset=None):
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='initialization')

        participant = ensure_list(participant)[0]
        records = ensure_list(records)

        repo = ds.repo
        branches = repo.get_branches()

        # prep for yield
        res = dict(
            action='ukb_init',
            path=ds.path,
            type='dataset',
            logger=lgr,
            refds=ds.path,
        )

        if 'incoming' in branches and not force:
            yield dict(
                res,
                status='error',
                message='Dataset found already initialized, '
                'use `force` to reinitialize',
            )
            return
        if 'incoming' not in branches:
            # establish "incoming" branch that will hold pristine UKB downloads
            repo.call_git(['checkout', '--orphan', 'incoming'])
        else:
            repo.call_git(['checkout', 'incoming'])

        # place batch file with download config for ukbfetch in it
        batchfile = repo.pathobj / '.ukbbatch'
        batchfile.write_text('{}\n'.format('\n'.join(
            '{} {}'.format(participant, rec) for rec in records)))
        # save to incoming branch, provide path to avoid adding untracked
        # content
        ds.save(
            path='.ukbbatch',
            to_git=True,
            message="Configure UKB data fetch",
            result_renderer=None,
        )
        # establish the rest of the branch structure: "incoming-native"
        # for extracted archive content
        _add_incoming_branch('incoming-native', branches, repo, batchfile)
        if bids:
            _add_incoming_branch('incoming-bids', branches, repo, batchfile)
        # force merge unrelated histories into master
        # we are using an orphan branch such that we know that
        # `git ls-tree incoming`
        # will only report download-related content, nothing extracted or
        # manually modified
        repo.call_git(['checkout', 'master'])
        repo.call_git([
            'merge',
            '-m',
            'Merge incoming',
            '--allow-unrelated-histories',
            'incoming-bids' if bids else 'incoming-native',
        ])

        yield dict(
            res,
            status='ok',
            participant=participant,
            records=records,
        )
        return
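
# Hedged usage sketch (not part of the snippet above): assumes DataLad with
# the datalad-ukbiobank extension installed; the path, participant ID, and
# record IDs below are placeholders.
from datalad.api import create

ds = create('/tmp/ukb-participant')
# `@datasetmethod(name='ukb_init')` exposes the command as a Dataset method
ds.ukb_init(participant='5874415',
            records=['20227_2_0', '20249_2_0'],
            bids=True)
# The 'incoming' branch now tracks a '.ukbbatch' file with one
# "<participant> <record>" line per requested data record:
#   5874415 20227_2_0
#   5874415 20249_2_0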
Example #2
class Create(Interface):
    """Create a new dataset from scratch.

    This command initializes a new dataset at a given location, or the
    current directory. The new dataset can optionally be registered in an
    existing superdataset (the new dataset's path needs to be located
    within the superdataset for that, and the superdataset needs to be given
    explicitly via [PY: `dataset` PY][CMD: --dataset CMD]). It is recommended
    to provide a brief description to label the dataset's nature *and*
    location, e.g. "Michael's music on black laptop". This helps humans to
    identify data locations in distributed scenarios.  By default an identifier
    comprised of user and machine name, plus path will be generated.

    This command only creates a new dataset, it does not add existing content
    to it, even if the target directory already contains additional files or
    directories.

    Plain Git repositories can be created via the [PY: `no_annex` PY][CMD: --no-annex CMD] flag.
    However, the result will not be a full dataset, and, consequently,
    not all features are supported (e.g. a description).

    || REFLOW >>
    To create a local version of a remote dataset use the
    :func:`~datalad.api.install` command instead.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git init` and
      :command:`git annex init` to prepare the new dataset. Registering to a
      superdataset is performed via a :command:`git submodule add` operation
      in the discovered superdataset.
    """

    # in general this command will yield exactly one result
    return_type = 'item-or-list'
    # in general users expect to get an instance of the created dataset
    result_xfm = 'datasets'
    # result filter
    result_filter = \
        EnsureKeyChoice('action', ('create',)) & \
        EnsureKeyChoice('status', ('ok', 'notneeded'))

    _examples_ = [
        dict(text="Create a dataset 'mydataset' in the current directory",
             code_py="create(path='mydataset')",
             code_cmd="datalad create mydataset"),
        dict(text="Apply the text2git procedure upon creation of a dataset",
             code_py="create(path='mydataset', cfg_proc='text2git')",
             code_cmd="datalad create -c text2git mydataset"),
        dict(text="Create a subdataset in the root of an existing dataset",
             code_py="create(dataset='.', path='mysubdataset')",
             code_cmd="datalad create -d . mysubdataset"),
        dict(text="Create a dataset in an existing, non-empty directory",
             code_py="create(force=True)",
             code_cmd="datalad create --force"),
        dict(text="Create a plain Git repository",
             code_py="create(path='mydataset', no_annex=True)",
             code_cmd="datalad create --no-annex mydataset"),
    ]

    _params_ = dict(
        path=Parameter(
            args=("path", ),
            nargs='?',
            metavar='PATH',
            doc="""path where the dataset shall be created, directories
            will be created as necessary. If no location is provided, a dataset
            will be created in the location specified by [PY: `dataset`
            PY][CMD: --dataset CMD] (if given) or the current working
            directory. Either way the command will error if the target
            directory is not empty. Use [PY: `force` PY][CMD: --force CMD] to
            create a dataset in a non-empty directory.""",
            # put dataset 2nd to avoid useless conversion
            constraints=EnsureStr() | EnsureDataset() | EnsureNone()),
        initopts=Parameter(
            args=("initopts", ),
            metavar='INIT OPTIONS',
            nargs=REMAINDER,
            doc="""options to pass to :command:`git init`. [PY: Options can be
            given as a list of command line arguments or as a GitPython-style
            option dictionary PY][CMD: Any argument specified after the
            destination path of the repository will be passed to git-init
            as-is CMD]. Note that not all options will lead to viable results.
            For example '--bare' will not yield a repository where DataLad
            can adjust files in its working tree."""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the create operation on. If
            a dataset is given along with `path`, a new subdataset will be created
            in it at the `path` provided to the create command. If a dataset is
            given but `path` is unspecified, a new dataset will be created at the
            location specified by this option.""",
            constraints=EnsureDataset() | EnsureNone()),
        force=Parameter(
            args=(
                "-f",
                "--force",
            ),
            doc="""enforce creation of a dataset in a non-empty directory""",
            action='store_true'),
        description=location_description,
        no_annex=Parameter(
            # hide this from the cmdline parser, replaced by `annex`
            args=tuple(),
            doc="""this option is deprecated, use `annex` instead""",
            action='store_true'),
        annex=Parameter(
            args=("--no-annex", ),
            dest='annex',
            doc="""if [CMD: set CMD][PY: disabled PY], a plain Git repository
            will be created without any annex""",
            action='store_false'),
        # TODO seems to only cause a config flag to be set, this could be done
        # in a procedure
        fake_dates=Parameter(
            args=('--fake-dates', ),
            action='store_true',
            doc="""Configure the repository to use fake dates. The date for a
            new commit will be set to one second later than the latest commit
            in the repository. This can be used to anonymize dates."""),
        cfg_proc=Parameter(
            args=("-c", "--cfg-proc"),
            metavar="PROC",
            action='append',
            doc="""Run cfg_PROC procedure(s) (can be specified multiple times)
            on the created dataset. Use
            [PY: `run_procedure(discover=True)` PY][CMD: run_procedure --discover CMD]
            to get a list of available procedures, such as cfg_text2git.
            """))

    @staticmethod
    @datasetmethod(name='create')
    @eval_results
    def __call__(path=None,
                 initopts=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=_NoAnnexDefault,
                 annex=True,
                 fake_dates=False,
                 cfg_proc=None):
        # TODO: introduced with 0.13, remove with 0.14
        if no_annex is not _NoAnnexDefault:
            # the two mirror options do not agree and the deprecated one is
            # not at default value
            warnings.warn(
                "datalad-create's `no_annex` option is deprecated "
                "and will be removed in a future release, "
                "use the reversed-sign `annex` option instead.",
                DeprecationWarning)
            # honor the old option for now
            annex = not no_annex

        # we only perform negative tests below
        no_annex = not annex

        if dataset:
            if isinstance(dataset, Dataset):
                ds = dataset
            else:
                ds = Dataset(dataset)
            refds_path = ds.path
        else:
            ds = refds_path = None

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if (isinstance(initopts, (list, tuple))
                and '--bare' in initopts) or (isinstance(initopts, dict)
                                              and 'bare' in initopts):
            raise ValueError(
                "Creation of bare repositories is not supported. Consider "
                "one of the create-sibling commands, or use "
                "Git to init a bare repository and push an existing dataset "
                "into it.")

        if path:
            path = resolve_path(path, dataset)

        path = path if path \
            else getpwd() if ds is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # assure cfg_proc is a list (relevant if used via Python API)
        cfg_proc = assure_list(cfg_proc)

        # prep for yield
        res = dict(action='create',
                   path=str(path),
                   logger=lgr,
                   type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != str(path):
            refds = require_dataset(refds_path,
                                    check_installed=True,
                                    purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, str(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = get_dataset_root(
            op.normpath(op.join(str(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if (not pstatus.get(check_path, {}).get("type") == "dataset"
                    and any(check_path == p or check_path in p.parents
                            for p in pstatus)):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents
                ]
                res.update({
                    'status':
                    'error',
                    'message':
                    ('collision with content in parent dataset at %s: %s',
                     str(parentds_path), [str(c) for c in conflict])
                })
                yield res
                return
            if not force:
                # another set of checks to see whether the target path is pointing
                # into a known subdataset that is not around ATM
                subds_status = {
                    parentds_path / k.relative_to(prepo.path)
                    for k, v in pstatus.items()
                    if v.get('type', None) == 'dataset'
                }
                check_paths = [check_path]
                check_paths.extend(check_path.parents)
                if any(p in subds_status for p in check_paths):
                    conflict = [p for p in check_paths if p in subds_status]
                    res.update({
                        'status':
                        'error',
                        'message':
                        ('collision with %s (dataset) in dataset %s',
                         str(conflict[0]), str(parentds_path))
                    })
                    yield res
                    return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = ds if isinstance(ds, Dataset) and \
            ds.path == path else Dataset(str(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status':
                'error',
                'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'
            })
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # Note for the code below:
        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Re-use tbrepo instance, do not use tbds.repo

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(tbds.path,
                             url=None,
                             create=True,
                             create_sanity_checks=False,
                             git_opts=initopts,
                             fake_dates=fake_dates)
            # place a .noannex file to indicate annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {'type': 'file', 'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates)
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(cfg.obtain('datalad.repo.backend'),
                                       persistent=True,
                                       commit=False)
            add_to_git[tbrepo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'
            }
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (('config', 'annex.largefiles', 'nothing'), (
                'metadata/aggregate*', 'annex.largefiles', 'nothing'
            ), ('metadata/objects/**', 'annex.largefiles', '({})'.format(
                cfg.obtain('datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbrepo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbrepo.set_gitattributes(set_attrs,
                                         attrfile=op.join(
                                             '.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbrepo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get('annex.largefiles',
                                             None) == 'nothing':
                tbrepo.set_gitattributes([('**/.git*', {
                    'annex.largefiles': 'nothing'
                })])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbrepo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'
                }

        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Note: must not happen earlier (before the if-block above), since
        # the "smart" shortcut would not work then
        tbds_config = tbds.config

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note that the Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds_config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds_config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds_config.add(id_var,
                        tbds_id if tbds_id is not None else uuid_id,
                        where='dataset',
                        reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequence ds.config.add() call)
        for k, v in tbds_config.overrides.items():
            tbds_config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> full reload
        tbds_config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'
        }

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbrepo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        for cfg_proc_ in cfg_proc:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in refds.save(path=tbds.path, ):
                yield r

        res.update({'status': 'ok'})
        yield res

    @staticmethod
    def custom_result_renderer(res, **kwargs):  # pragma: more cover
        from datalad.ui import ui
        if res.get('action', None) == 'create' and \
                res.get('status', None) == 'ok' and \
                res.get('type', None) == 'dataset':
            ui.message("Created dataset at {}.".format(res['path']))
        else:
            ui.message("Nothing was created")
Example #3
class Subdatasets(Interface):
    r"""Report subdatasets and their properties.

    The following properties are reported (if possible) for each matching
    subdataset record.

    "name"
        Name of the subdataset in the parent (often identical with the
        relative path in the parent dataset)

    "path"
        Absolute path to the subdataset

    "parentds"
        Absolute path to the parent dataset

    "revision"
        SHA1 of the subdataset commit recorded in the parent dataset

    "state"
        Condition of the subdataset: 'clean', 'modified', 'absent', 'conflict'
        as reported by `git submodule`

    "revision_descr"
        Output of `git describe` for the subdataset

    "gitmodule_url"
        URL of the subdataset recorded in the parent

    "gitmodule_<label>"
        Any additional configuration property on record.

    Performance note: Property modification, requesting `bottomup` reporting
    order, or a particular numerical `recursion_limit` implies an internal
    switch to an alternative query implementation for recursive query that is
    more flexible, but also notably slower (performs one call to Git per
    dataset versus a single call for all combined).

    The following properties for subdatasets are recognized by DataLad
    (without the 'gitmodule\_' prefix that is used in the query results):

    "datalad-recursiveinstall"
        If set to 'skip', the respective subdataset is skipped when DataLad
        is recursively installing its superdataset. However, the subdataset
        remains installable when explicitly requested, and no other features
        are impaired.
    """
    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to query.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        fulfilled=Parameter(
            args=("--fulfilled", ),
            doc="""if given, must be a boolean flag indicating whether
            to report either only locally present or absent datasets.
            By default subdatasets are reported regardless of their
            status""",
            constraints=EnsureBool() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        contains=Parameter(
            args=('--contains', ),
            metavar='PATH',
            doc="""limit report to the subdatasets containing the
            given path. If a root path of a subdataset is given the last
            reported dataset will be the subdataset itself.""",
            constraints=EnsureStr() | EnsureNone()),
        bottomup=Parameter(
            args=("--bottomup", ),
            action="store_true",
            doc="""whether to report subdatasets in bottom-up order along
            each branch in the dataset tree, and not top-down."""),
        set_property=Parameter(
            args=('--set-property', ),
            metavar=('NAME', 'VALUE'),
            nargs=2,
            action='append',
            doc="""Name and value of one or more subdataset properties to
            be set in the parent dataset's .gitmodules file. The property name
            is case-insensitive, must start with a letter, and consist only
            of alphanumeric characters. The value can be
            a Python format() template string wrapped in '<>' (e.g.
            '<{gitmodule_name}>').
            Supported keywords are any item reported in the result properties
            of this command, plus 'refds_relpath' and 'refds_relname':
            the relative path of a subdataset with respect to the base dataset
            of the command call, and, in the latter case, the same string with
            all directory separators replaced by dashes.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        delete_property=Parameter(
            args=('--delete-property', ),
            metavar='NAME',
            action='append',
            doc="""Name of one or more subdataset properties to be removed
            from the parent dataset's .gitmodules file.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()))

    @staticmethod
    @datasetmethod(name='subdatasets')
    @eval_results
    def __call__(dataset=None,
                 fulfilled=None,
                 recursive=False,
                 recursion_limit=None,
                 contains=None,
                 bottomup=False,
                 set_property=None,
                 delete_property=None):
        dataset = require_dataset(dataset,
                                  check_installed=False,
                                  purpose='subdataset reporting/modification')
        refds_path = dataset.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must start with a letter)",
                        k)
        if contains:
            contains = resolve_path(contains, dataset)
        for r in _get_submodules(dataset.path, fulfilled, recursive,
                                 recursion_limit, contains, bottomup,
                                 set_property, delete_property, refds_path):
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
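
# Hedged usage sketch (not part of the snippet above): assumes an installed
# dataset with registered subdatasets at the placeholder path '/tmp/super'.
from datalad.api import Dataset

superds = Dataset('/tmp/super')
# recursive, read-only report of all known subdatasets
for sub in superds.subdatasets(recursive=True):
    print(sub['path'], sub.get('state'))
# set a .gitmodules property, deriving its value from the subdataset name
# via the '<{...}>' format-template syntax described in the docstring
superds.subdatasets(set_property=[('datalad-label', '<{gitmodule_name}>')])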
Example #4
class CreateSiblingRia(Interface):
    """Creates a sibling to a dataset in a RIA store

    Communication with a dataset in a RIA store is implemented via two
    siblings. A regular Git remote (repository sibling) and a git-annex
    special remote for data transfer (storage sibling) -- with the former
    having a publication dependency on the latter. By default, the name of the
    storage sibling is derived from the repository sibling's name by appending
    "-storage".

    The store's base path is expected to either not exist, be an empty
    directory, or be a valid RIA store.

    RIA store layout
    ~~~~~~~~~~~~~~~~

    A RIA store is a directory tree with a dedicated subdirectory for each
    dataset in the store. The subdirectory name is constructed from the
    DataLad dataset ID, e.g. '124/68afe-59ec-11ea-93d7-f0d5bf7b5561', where
    the first three characters of the ID are used for an intermediate
    subdirectory in order to mitigate file system limitations for stores
    containing a large number of datasets.

    Each dataset subdirectory contains a standard bare Git repository for
    the dataset.

    In addition, a subdirectory 'annex' holds a standard Git-annex object
    store. However, instead of using the 'dirhashlower' naming scheme for
    the object directories, like Git-annex would do, a 'dirhashmixed'
    layout is used -- the same as for non-bare Git repositories or regular
    DataLad datasets.

    Optionally, there can be a further subdirectory 'archives' with
    (compressed) 7z archives of annex objects. The storage remote is able to
    pull annex objects from these archives, if it cannot find them in the
    regular annex object store. This feature can be useful for storing large
    collections of rarely changing data on systems that limit the number of
    files that can be stored.

    Each dataset directory also contains a 'ria-layout-version' file that
    identifies the data organization (as, for example, described above).

    Lastly, there is a global 'ria-layout-version' file at the store's
    base path that identifies where dataset subdirectories themselves are
    located. At present, this file must contain a single line stating the
    version (currently "1"). This line MUST end with a newline character.

    It is possible to define an alias for an individual dataset in a store by
    placing a symlink to the dataset location into an 'alias/' directory
    in the root of the store. This enables dataset access via URLs of format:
    'ria+<protocol>://<storelocation>#~<aliasname>'.

    Error logging
    ~~~~~~~~~~~~~

    To enable error logging at the remote end, append a pipe symbol and an "l"
    to the version number in ria-layout-version (like so '1|l\\n').

    Error logging will create files in an "error_log" directory whenever the
    git-annex special remote (storage sibling) raises an exception, storing the
    Python traceback of it. The logfiles are named according to the scheme
    '<dataset id>.<annex uuid of the remote>.log' showing "who" ran into this
    issue with which dataset. Because logging can potentially leak personal
    data (like local file paths for example), it can be disabled client-side
    by setting the configuration variable
    "annex.ora-remote.<storage-sibling-name>.ignore-remote-config".
    """

    # TODO: description?
    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to process.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        url=Parameter(
            args=("url", ),
            metavar="ria+<ssh|file>://<host>[/path]",
            doc="""URL identifying the target RIA store and access protocol.
            """,
            constraints=EnsureStr() | EnsureNone()),
        name=Parameter(args=(
            '-s',
            '--name',
        ),
                       metavar='NAME',
                       doc="""Name of the sibling.
            With `recursive`, the same name will be used to label all
            the subdatasets' siblings.""",
                       constraints=EnsureStr() | EnsureNone(),
                       required=True),
        storage_name=Parameter(
            args=("--storage-name", ),
            metavar="NAME",
            doc="""Name of the storage sibling (git-annex special remote).
            Must not be identical to the sibling name. If not specified,
            defaults to the sibling name plus '-storage' suffix. If only
            a storage sibling is created, this setting is ignored, and
            the primary sibling name is used.""",
            constraints=EnsureStr() | EnsureNone()),
        post_update_hook=Parameter(
            args=("--post-update-hook", ),
            doc="""Enable git's default post-update-hook for the created
            sibling.""",
            action="store_true"),
        shared=Parameter(
            args=("--shared", ),
            metavar='{false|true|umask|group|all|world|everybody|0xxx}',
            doc="""If given, configures the permissions in the
            RIA store for multi-users access.
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool() | EnsureNone()),
        group=Parameter(
            args=("--group", ),
            metavar="GROUP",
            doc="""Filesystem group for the repository. Specifying the group is
            crucial when [CMD: --shared=group CMD][PY: shared="group" PY]""",
            constraints=EnsureStr() | EnsureNone()),
        storage_sibling=Parameter(
            args=("--storage-sibling", ),
            dest='storage_sibling',
            metavar='MODE',
            constraints=EnsureChoice('only') | EnsureBool() | EnsureNone(),
            doc="""By default, an ORA storage sibling and a Git repository
            sibling are created ([CMD: on CMD][PY: True|'on' PY]).
            Alternatively, creation of the storage sibling can be disabled
            ([CMD: off CMD][PY: False|'off' PY]), or a storage sibling
            created only and no Git sibling
            ([CMD: only CMD][PY: 'only' PY]). In the latter mode, no Git
            installation is required on the target host."""),
        existing=Parameter(
            args=("--existing", ),
            constraints=EnsureChoice('skip', 'error', 'reconfigure')
            | EnsureNone(),
            metavar='MODE',
            doc="""Action to perform, if a (storage) sibling is already
            configured under the given name and/or a target already exists.
            In this case, a dataset can be skipped ('skip'), an existing target
            repository be forcefully re-initialized, and the sibling
            (re-)configured ('reconfigure'), or the command be instructed to
            fail ('error').""",
        ),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        trust_level=Parameter(
            args=("--trust-level", ),
            metavar="TRUST-LEVEL",
            constraints=EnsureChoice('trust', 'semitrust', 'untrust')
            | EnsureNone(),
            doc="""specify a trust level for the storage sibling. If not
            specified, the default git-annex trust level is used. 'trust'
            should be used with care (see the git-annex-trust man page).""",
        ),
        disable_storage__=Parameter(
            args=("--no-storage-sibling", ),
            dest='disable_storage__',
            doc="""This option is deprecated. Use '--storage-sibling off'
            instead.""",
            action="store_false"),
    )

    @staticmethod
    @datasetmethod(name='create_sibling_ria')
    @eval_results
    def __call__(
        url,
        name,
        dataset=None,
        storage_name=None,
        post_update_hook=False,
        shared=None,
        group=None,
        storage_sibling=True,
        existing='error',
        trust_level=None,
        recursive=False,
        recursion_limit=None,
        disable_storage__=None,
    ):
        if disable_storage__ is not None:
            import warnings
            warnings.warn(
                "datalad-create-sibling-ria --no-storage-sibling "
                "is deprecated, use --storage-sibling off instead.",
                DeprecationWarning)
            # recode to new setup
            disable_storage__ = None
            storage_sibling = False

        if storage_sibling == 'only' and storage_name:
            lgr.warning(
                "Sibling name will be used for storage sibling in "
                "storage-sibling-only mode, but a storage sibling name "
                "was provided")

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='create sibling RIA')
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        # parse target URL
        try:
            ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
        except ValueError as e:
            yield get_status_dict(status='error', message=str(e), **res_kwargs)
            return

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError("Repository at {} is not a DataLad dataset, "
                               "run 'datalad create [--force]' first.".format(
                                   ds.path))

        if not storage_sibling and storage_name:
            lgr.warning(
                "Storage sibling setup disabled, but a storage sibling name "
                "was provided")

        if storage_sibling and not storage_name:
            storage_name = "{}-storage".format(name)

        if storage_sibling and name == storage_name:
            # leads to unresolvable, circular dependency with publish-depends
            raise ValueError("sibling names must not be equal")

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since a misconfiguration (particularly of special
        # remotes) that only surfaces later in a subdataset can be quite
        # painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            # in recursive mode this check could take a substantial amount of
            # time: employ a progress bar (or rather a counter, because we don't
            # know the total in advance)
            pbar_id = 'check-siblings-{}'.format(id(ds))
            log_progress(
                lgr.info,
                pbar_id,
                'Start checking pre-existing sibling configuration %s',
                ds,
                label='Query siblings',
                unit=' Siblings',
            )
            # even if we have to fail, let's report all conflicting siblings
            # in subdatasets
            failed = False
            for r in ds.siblings(result_renderer=None,
                                 recursive=recursive,
                                 recursion_limit=recursion_limit):
                log_progress(lgr.info,
                             pbar_id,
                             'Discovered sibling %s in dataset at %s',
                             r['name'],
                             r['path'],
                             update=1,
                             increment=True)
                if not r['type'] == 'sibling' or r['status'] != 'ok':
                    # this is an internal status query that has no consequence
                    # for the outside world. Be silent unless something useful
                    # can be said
                    #yield r
                    continue
                if r['name'] == name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
                if storage_name and r['name'] == storage_name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(storage_name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
            log_progress(
                lgr.info,
                pbar_id,
                'Finished checking pre-existing sibling configuration %s',
                ds,
            )
            if failed:
                return

        # TODO: - URL parsing + store creation needs to be RF'ed based on
        #         command abstractions
        #       - more generally consider store creation a dedicated command or
        #         option
        # Note: URL parsing is done twice ATM (for top-level ds). This can't be
        # reduced to single instance, since rewriting url based on config could
        # be different for subdatasets.

        create_store(
            SSHRemoteIO(ssh_host) if ssh_host else LocalIO(), Path(base_path),
            '1')

        yield from _create_sibling_ria(ds, url, name, storage_sibling,
                                       storage_name, existing, shared, group,
                                       post_update_hook, trust_level,
                                       res_kwargs)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.

            for subds in ds.subdatasets(fulfilled=True,
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        result_xfm='datasets'):
                yield from _create_sibling_ria(subds, url, name,
                                               storage_sibling, storage_name,
                                               existing, shared, group,
                                               post_update_hook, trust_level,
                                               res_kwargs)
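
# Hedged usage sketch (not part of the snippet above): store URL, sibling
# name, and dataset path are placeholders.
from datalad.api import Dataset

ds = Dataset('/tmp/mydataset')
# create both the Git sibling ('ria') and the ORA storage sibling
# ('ria-storage', derived by appending '-storage') in an SSH-reachable store;
# recurse into installed subdatasets and skip already configured ones
ds.create_sibling_ria(
    url='ria+ssh://store.example.com/data/store',
    name='ria',
    existing='skip',
    recursive=True,
)
# content can then be published through the Git sibling, which carries a
# publication dependency on the storage sibling, e.g.:
#   datalad push --to ria
# a dataset can later be cloned back via a store alias (if one is set up):
#   datalad clone 'ria+ssh://store.example.com/data/store#~myalias'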
Example #5
class Remove(Interface):
    """Remove components from datasets

    This command can remove any components (subdatasets, and (directories with)
    files) from datasets. Removing a component implies any present content to
    be dropped, and any associated subdatasets to be uninstalled. Subsequently,
    the component is "unregistered" from the respective dataset. This means
    that the respective component is no longer present on the file system.

    By default, the availability of at least one remote copy is verified before
    file content is dropped. As these checks could lead to slow operation
    (network latencies, etc), they can be disabled.

    Any number of paths to process can be given as input. Recursion into
    subdatasets needs to be explicitly enabled, while recursion into
    subdirectories within a dataset is always done automatically. An optional
    recursion limit is applied relative to each given input path.

    Examples:

      Permanently remove a subdataset from a dataset and wipe out the subdataset
      association too::

        ~/some/dataset$ datalad remove somesubdataset1
    """
    _action = 'remove'

    _params_ = dict(
        dataset=dataset_argument,
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="path/name of the component to be removed",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        check=check_argument,
        save=nosave_opt,
        message=save_message_opt,
        if_dirty=if_dirty_opt,
    )

    @staticmethod
    @datasetmethod(name=_action)
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 check=True,
                 save=True,
                 message=None,
                 if_dirty='save-before'):
        res_kwargs = dict(action='remove', logger=lgr)
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `remove`: requires at least a path or dataset"
            )
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs['refds'] = refds_path
        if refds_path and not path and not GitRepo.is_valid_repo(refds_path):
            # nothing here, nothing to remove
            yield get_status_dict(path=refds_path,
                                  status='notneeded',
                                  **res_kwargs)
            return
        if refds_path and not path:
            # act on the whole dataset if nothing else was specified
            # TODO i think that would happen automatically in annotation?
            path = refds_path

        to_process = []

        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                # we only ever want to discover immediate subdatasets, the rest
                # will happen in `uninstall`
                recursion_limit=1,
                action='remove',
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None) is None:
                # nothing exists at location, and there is no parent to
                # remove from
                ap['status'] = 'notneeded'
                ap['message'] = "path does not exist and is not in a dataset"
                yield ap
                continue
            if ap.get('raw_input', False) and ap.get('type',
                                                     None) == 'dataset':
                # make sure dataset sorting yields a dedicated entry for this one
                ap['process_content'] = True
            to_process.append(ap)

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if path_is_under([ap['path'] for ap in to_process]):
            # behave like `rm` and refuse to remove where we are
            raise ValueError(
                "refusing to uninstall current or parent directory")

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        # iterate over all datasets, starting at the bottom
        # to make the removal of dataset content known upstairs
        to_save = []
        # track which submodules we have removed in the process, to avoid
        # failure in case we revisit them due to a subsequent path argument
        subm_removed = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            paths = content_by_ds[ds_path]
            to_reporemove = []
            # PLAN any dataset that was not raw_input, uninstall (passing recursive flag)
            # if dataset itself is in paths, skip any nondataset
            # sort reverse so we get subdatasets first
            for ap in sorted(paths, key=lambda x: x['path'], reverse=True):
                if ap.get('type', None) == 'dataset':
                    # entire dataset needs to go, uninstall if present, pass recursive!
                    uninstall_failed = False
                    if ap['path'] == refds_path or \
                            (refds_path is None and ap.get('raw_input', False)):
                        # top-level handling, cannot use regular uninstall call, as
                        # it will refuse to uninstall a top-level dataset
                        # and rightfully so, it is really a remove in that case
                        # bypass all the safety by using low-level helper
                        for r in _uninstall_dataset(ds,
                                                    check=check,
                                                    has_super=False,
                                                    **res_kwargs):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            r['refds'] = refds_path
                            yield r
                    # recheck that it wasn't removed during a previous iteration
                    elif ap.get('state',
                                None) != 'absent' and GitRepo.is_valid_repo(
                                    ap['path']):
                        # anything that is not the top-level -> regular uninstall
                        # this is for subdatasets of the to-be-removed dataset
                        # we want to simply uninstall them in a regular manner
                        for r in Uninstall.__call__(
                                # use annotate path as input, but pass a copy because
                                # we cannot rely on it being unaltered by reannotation
                                # TODO maybe adjust annotate_path to do that
                            [ap.copy()],
                                dataset=refds_path,
                                recursive=recursive,
                                check=check,
                                if_dirty=if_dirty,
                                result_xfm=None,
                                result_filter=None,
                                on_failure='ignore'):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            yield r
                    if not ap.get('raw_input', False):
                        # we only ever want to actually unregister subdatasets that
                        # were given explicitly
                        continue
                    if not uninstall_failed and \
                            not ap['path'] in subm_removed and \
                            refds_path and \
                            ap.get('parentds', None) and \
                            not (relpath(ap['path'], start=refds_path).startswith(pardir) or
                                 ap['path'] == refds_path) and \
                            ap.get('registered_subds', False):
                        # strip from superdataset, but only if a dataset was given explicitly
                        # as in "remove from this dataset", but not when just a path was given
                        # as in "remove from the filesystem"
                        subds_relpath = relpath(ap['path'],
                                                start=ap['parentds'])
                        # remove submodule reference
                        parentds = Dataset(ap['parentds'])
                        # play safe, will fail on dirty
                        parentds.repo.deinit_submodule(ap['path'])
                        # remove now empty submodule link
                        parentds.repo.remove(ap['path'])
                        # make a record that we removed this already, should it be
                        # revisited via another path argument, because we do not reannotate
                        # the paths after every removal
                        subm_removed.append(ap['path'])
                        yield dict(ap, status='ok', **res_kwargs)
                        # need .gitmodules update in parent
                        to_save.append(
                            dict(path=opj(parentds.path, '.gitmodules'),
                                 parents=parentds.path,
                                 type='file'))
                        # and the removal itself needs to be committed
                        # inform `save` that it is OK that this path
                        # doesn't exist on the filesystem anymore
                        ap['unavailable_path_status'] = ''
                        ap['process_content'] = False
                        to_save.append(ap)
                    if not uninstall_failed and exists(ap['path']):
                        # could be an empty dir in case an already uninstalled subdataset
                        # got removed
                        os.rmdir(ap['path'])
                else:
                    # anything that is not a dataset can simply be passed on
                    to_reporemove.append(ap['path'])
            # avoid unnecessary git calls when there is nothing to do
            if to_reporemove:
                if check and hasattr(ds.repo, 'drop'):
                    for r in _drop_files(ds, to_reporemove, check=True):
                        yield r
                for r in ds.repo.remove(to_reporemove, r=True):
                    # these were removed, but we still need to save the removal
                    ap['unavailable_path_status'] = ''
                    to_save.append(ap)
                    yield get_status_dict(status='ok', path=r, **res_kwargs)

        if not to_save:
            # nothing left to do, potentially all errored before
            return
        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        for res in Save.__call__(
                # TODO compose hand-selected annotated paths
                path=to_save,
                # we might have removed the reference dataset by now, recheck
                dataset=refds_path if
            (refds_path and GitRepo.is_valid_repo(refds_path)) else None,
                message=message if message else '[DATALAD] removed content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
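# A minimal sketch of the submodule unregistration step performed above,
# assuming an already uninstalled (clean) subdataset at a made-up path; it
# reuses the same GitRepo calls shown in the code and then commits the result.
from os.path import join as opj

from datalad.api import Dataset

parent = Dataset('/tmp/super')            # illustrative superdataset location
subds_path = opj(parent.path, 'subds')    # illustrative subdataset mountpoint

# drop the submodule configuration (will fail on a dirty submodule, by design)
parent.repo.deinit_submodule(subds_path)
# remove the now-empty submodule link from index and worktree
parent.repo.remove(subds_path)
# commit the updated .gitmodules together with the removal itself
parent.save(message='[DATALAD] removed content')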
Example #6
class Get(Interface):
    """Get any dataset content (files/directories/subdatasets).

    This command only operates on dataset content. To obtain a new independent
    dataset from some source use the `clone` command.

    By default this command operates recursively within a dataset, but not
    across potential subdatasets, i.e. if a directory is provided, all files in
    the directory are obtained. Recursion into subdatasets is supported too. If
    enabled, relevant subdatasets are detected and installed in order to
    fulfill a request.

    Known data locations for each requested file are evaluated and data are
    obtained from some available location (according to git-annex configuration
    and possibly assigned remote priorities), unless a specific source is
    specified.

    *Getting subdatasets*

    Just as DataLad supports getting file content from more than one location,
    the same is supported for subdatasets, including a ranking of individual
    sources for prioritization.

    The following location candidates are considered. For each candidate a
    cost is given in parentheses; higher values indicate higher cost and thus
    lower priority:

    - URL of any configured superdataset remote that is known to have the
      desired submodule commit, with the submodule path appended to it.
      There can be more than one candidate (cost 500).

    - In case `.gitmodules` contains a relative path instead of a URL,
      the URL of any configured superdataset remote that is known to have the
      desired submodule commit, with this relative path appended to it.
      There can be more than one candidate (cost 500).

    - A URL or absolute path recorded in `.gitmodules` (cost 600).

    - In case `.gitmodules` contains a relative path as a URL, the absolute
      path of the superdataset, appended with this relative path (cost 900).

    Additional candidate URLs can be generated based on templates specified as
    configuration variables with the pattern

      `datalad.get.subdataset-source-candidate-<name>`

    where `name` is an arbitrary identifier. If `name` starts with three digits
    (e.g. '400myserver') these will be interpreted as a cost, and the
    respective candidate will be sorted into the generated candidate list
    according to this cost. If no cost is given, a default of 700 is used.

    A template string assigned to such a variable can utilize the Python format
    mini language and may reference a number of properties that are inferred
    from the parent dataset's knowledge about the target subdataset. Properties
    include any submodule property specified in the respective `.gitmodules`
    record. For convenience, an existing `datalad-id` record is made available
    under the shortened name `id`.

    Additionally, the URL of any configured remote that contains the respective
    submodule commit is available as `remote-<name>` properties, where `name`
    is the configured remote name.

    Lastly, all candidates are sorted according to their cost (lower values
    first), and duplicate URLs are stripped, while preserving the first item in the
    candidate list.

    .. note::
      Power-user info: This command uses :command:`git annex get` to fulfill
      file handles.
    """
    _examples_ = [
        dict(text="Get a single file",
             code_py="get('path/to/file')",
             code_cmd="datalad get <path/to/file>"),
        dict(text="Get contents of a directory",
             code_py="get('path/to/dir/')",
             code_cmd="datalad get <path/to/dir/>"),
        dict(text="Get all contents of the current dataset and its subdatasets",
             code_py="get(dataset='.', recursive=True)",
             code_cmd="datalad get . -r"),
        dict(text="Get (clone) a registered subdataset, but don't retrieve data",
             code_py="get('path/to/subds', get_data=False)",
             code_cmd="datalad get -n <path/to/subds>"),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar="PATH",
            doc="""specify the dataset to perform the get operation on, in
            which case `path` arguments are interpreted as being relative
            to this dataset.  If no dataset is given, an attempt is made to
            identify a dataset for each input `path`""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path/name of the requested dataset component. The component
            must already be known to a dataset. To add new components to a
            dataset use the `add` command""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        source=Parameter(
            args=("-s", "--source",),
            metavar="LABEL",
            doc="""label of the data source to be used to fulfill requests.
            This can be the name of a dataset :term:`sibling` or another known
            source""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=Parameter(
            args=("-R", "--recursion-limit",),
            metavar="LEVELS",
            constraints=EnsureInt() | EnsureChoice('existing') | EnsureNone(),
            doc="""limit recursion into subdatasets to the given number of levels.
            Alternatively, 'existing' will limit recursion to subdatasets that already
            existed on the filesystem at the start of processing, and prevent new
            subdatasets from being obtained recursively."""),
        get_data=Parameter(
            args=("-n", "--no-data",),
            dest='get_data',
            action='store_false',
            doc="""whether to obtain data for all file handles. If disabled, `get`
            operations are limited to dataset handles.[CMD:  This option prevents data
            for file handles from being obtained CMD]"""),
        description=location_description,
        reckless=reckless_opt,
        jobs=jobs_opt)

    @staticmethod
    @datasetmethod(name='get')
    @eval_results
    def __call__(
            path=None,
            source=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            get_data=True,
            description=None,
            reckless=None,
            jobs='auto',
    ):
        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # we have to have a single dataset to operate on
        refds = require_dataset(
            dataset, check_installed=True, purpose='get content')

        content_by_ds = {}
        # use subdatasets() to discover any relevant content that is not
        # already present in the root dataset (refds)
        for sdsres in Subdatasets.__call__(
                contains=path,
                # maintain path argument semantics and pass in dataset arg
                # as is
                dataset=dataset,
                # always come from the top to get sensible generator behavior
                bottomup=False,
                # when paths are given, they will constrain the recursion
                # automatically, and we need to enable recursion so we can
                # locate paths in subdatasets several levels down
                recursive=True if path else recursive,
                recursion_limit=None if path else recursion_limit,
                return_type='generator',
                on_failure='ignore'):
            if sdsres.get('type', None) != 'dataset':
                # if it is not about a 'dataset' it is likely content in
                # the root dataset
                if sdsres.get('status', None) == 'impossible' and \
                        sdsres.get('message', None) == \
                        'path not contained in any matching subdataset':
                    target_path = Path(sdsres['path'])
                    if refds.pathobj != target_path and \
                            refds.pathobj not in target_path.parents:
                        yield dict(
                            action='get',
                            path=str(target_path),
                            status='error',
                            message=('path not associated with dataset %s',
                                     refds),
                        )
                        continue
                    # check if we need to obtain anything underneath this path
                    # the subdataset() call above will only look _until_ it
                    # hits the targetpath
                    for res in _install_targetpath(
                            refds,
                            Path(sdsres['path']),
                            recursive,
                            recursion_limit,
                            reckless,
                            refds_path,
                            description,
                            jobs=jobs,
                    ):
                        # fish out the datasets that 'contains' a targetpath
                        # and store them for later
                        if res.get('status', None) in ('ok', 'notneeded') and \
                                'contains' in res:
                            dsrec = content_by_ds.get(res['path'], set())
                            dsrec.update(res['contains'])
                            content_by_ds[res['path']] = dsrec
                        if res.get('status', None) != 'notneeded':
                            # all those messages on not having installed anything
                            # are a bit pointless
                            # "notneeded" for annex get comes below
                            yield res
                else:
                    # dunno what this is, send upstairs
                    yield sdsres
                # must continue for both conditional branches above
                # the rest is about stuff in real subdatasets
                continue
            # instance of the closest existing dataset for this result
            ds = Dataset(sdsres['parentds']
                         if sdsres.get('state', None) == 'absent'
                         else sdsres['path'])
            assert 'contains' in sdsres
            # explore the unknown
            for target_path in sdsres.get('contains', []):
                # essentially the same as done above for paths in the root
                # dataset, but here we are starting from the closest
                # discovered subdataset
                for res in _install_targetpath(
                        ds,
                        Path(target_path),
                        recursive,
                        recursion_limit,
                        reckless,
                        refds_path,
                        description,
                        jobs=jobs,
                ):
                    known_ds = res['path'] in content_by_ds
                    if res.get('status', None) in ('ok', 'notneeded') and \
                            'contains' in res:
                        dsrec = content_by_ds.get(res['path'], set())
                        dsrec.update(res['contains'])
                        content_by_ds[res['path']] = dsrec
                    # prevent double-reporting of datasets that have been
                    # installed by explorative installation to get to target
                    # paths, prior in this loop
                    if res.get('status', None) != 'notneeded' or not known_ds:
                        yield res

        if not get_data:
            # done already
            return

        # and now annex-get; this could all be done in parallel now
        for ds, content in content_by_ds.items():
            for res in _get_targetpaths(
                    Dataset(ds),
                    content,
                    refds.path,
                    source,
                    jobs):
                if res['path'] not in content_by_ds:
                    # we had reports on datasets and subdatasets already
                    # before the annex stage
                    yield res
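# A small usage sketch for the subdataset source-candidate mechanism described
# in the Get docstring above; the candidate name '400mirror' and the mirror
# URL are made up for illustration (the leading '400' is read as the cost).
from datalad.api import Dataset

ds = Dataset('.')
ds.config.set(
    'datalad.get.subdataset-source-candidate-400mirror',
    # '{id}' is expanded from the subdataset's recorded datalad-id
    'https://mirror.example.com/{id}',
    where='local')
# obtain content several subdataset levels down; any missing subdatasets are
# cloned on demand, trying candidate URLs in order of increasing cost
ds.get('path/to/subds/deep/file.dat')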
class CreateSiblingOSF(Interface):
    """Create a dataset representation at OSF.

    This will create a node on OSF and initialize
    an osf special remote to point to it. There are two modes
    this can operate in: 'annex' and 'export'.
    The former uses the OSF node as a key-value store that
    can be used by git-annex to copy data to and retrieve
    data from (potentially by any clone of the original dataset).
    The latter allows using 'git annex export' to publish a
    snapshot of a particular version of the dataset. Such an OSF
    node will - in contrast to the 'annex' mode - be
    human-readable.

    For authentication with OSF, you can define environment variables: either
    'OSF_TOKEN', or both 'OSF_USERNAME' and 'OSF_PASSWORD'. If neither of these
    is defined, the tool will fall back to the DataLad credential manager and
    prompt for credentials interactively.

    """

    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""Dataset to create a sibling for.""",
                          constraints=EnsureDataset() | EnsureNone()),
        title=Parameter(
            args=("--title", ),
            doc="""title of the to-be-created OSF node that is displayed
            on the OSF website. Defaults to the basename of the root directory
            of the local dataset.""",
            constraints=EnsureStr() | EnsureNone(),
        ),
        name=Parameter(
            args=(
                "-s",
                "--name",
            ),
            doc="""Name of the to-be-initialized osf-special-remote""",
            constraints=EnsureStr()),
        storage_name=Parameter(
            args=("--storage-name", ),
            metavar="NAME",
            doc="""Name of the storage sibling (git-annex special remote).
            Must not be identical to the sibling name. If not specified,
            defaults to the sibling name plus '-storage' suffix.""",
            constraints=EnsureStr() | EnsureNone()),
        existing=Parameter(
            args=("--existing", ),
            constraints=EnsureChoice('skip', 'error') | EnsureNone(),
            metavar='MODE',
            doc="""Action to perform if a (storage) sibling is already
            configured under the given name and/or a target already exists.
            In this case, a dataset can be skipped ('skip'), or the command
            can be instructed to fail ('error').""",
        ),
        trust_level=Parameter(
            args=("--trust-level", ),
            metavar="TRUST-LEVEL",
            constraints=EnsureChoice('trust', 'semitrust', 'untrust')
            | EnsureNone(),
            doc="""specify a trust level for the storage sibling. If not
            specified, the default git-annex trust level is used.""",
        ),
        mode=Parameter(args=("--mode", ),
                       doc="""mode of operation: 'annex' uses the OSF node as a
                       git-annex key-value store; 'export' additionally allows
                       publishing a human-readable snapshot via 'git annex
                       export'; 'exportonly' sets up only the export-capable
                       storage sibling; 'gitonly' sets up only a plain Git
                       sibling without a storage special remote.""",
                       constraints=EnsureChoice("annex", "export",
                                                "exportonly", "gitonly")),
        tags=Parameter(
            args=('--tag', ),
            dest='tags',
            metavar='TAG',
            doc="""specify one or more tags for the to-be-created OSF node.
            A tag 'DataLad dataset' and the dataset ID (if there is any)
            will be automatically added as additional tags.
            [CMD: This option can be given more than once CMD].""",
            action='append',
        ),
        public=Parameter(
            args=("--public", ),
            doc="""make OSF node public""",
            action='store_true',
        ),
        category=Parameter(
            args=("--category", ),
            doc="""specify the OSF node category to be used for the
            node. The categorization determines what icon is displayed
            with the node on the OSF, and helps with search organization""",
            # all presently supported categories
            constraints=EnsureChoice("analysis", "communication", "data",
                                     "hypothesis", "instrumentation",
                                     "methods and measures", "procedure",
                                     "project", "software", "other")),
        description=Parameter(
            args=("--description", ),
            metavar="TEXT",
            doc="""Description of the OSF node that will be displayed on
            the associated project page. By default a description will be
            generated based on the mode the sibling is put into.""",
            constraints=EnsureStr() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='create_sibling_osf')
    @eval_results
    def __call__(
        title=None,
        name="osf",
        storage_name=None,
        dataset=None,
        mode="annex",
        existing='error',
        trust_level=None,
        tags=None,
        public=False,
        category='data',
        description=None,
    ):
        ds = require_dataset(dataset,
                             purpose="create OSF remote",
                             check_installed=True)
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-osf",
            logger=lgr,
        )
        # we need an annex
        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(type="dataset",
                                  status="impossible",
                                  message="dataset has no annex",
                                  **res_kwargs)
            return

        # NOTES:
        # - we prob. should check osf-special-remote availability upfront to
        #   fail early
        # - add --recursive option
        #       - recursive won't work easily. Need to think that through.
        #       - would need a naming scheme for subdatasets
        #       - flat on OSF or a tree?
        #       - how do we detect something is there already, so we can skip
        #         rather than duplicate (with a new name)?
        #         osf-type-special-remote sufficient to decide it's not needed?
        # - adapt to conclusions in issue #30
        #   -> create those subcomponents
        # - results need to report URL for created projects suitable for datalad
        #   output formatting!
        #   -> result_renderer
        #   -> needs to be returned by create_node

        if not storage_name:
            storage_name = "{}-storage".format(name)

        sibling_conflicts = sibling_exists(
            ds,
            [name, storage_name],
            # TODO pass through
            recursive=False,
            recursion_limit=None,
            # fail fast, if error is desired
            exhaustive=existing == 'error',
        )
        if existing == 'error' and sibling_conflicts:
            # we only asked for one
            conflict = sibling_conflicts[0]
            yield get_status_dict(
                status='error',
                message=("a sibling '%s' is already configured in dataset %s",
                         conflict[1], conflict[0]),
                **res_kwargs,
            )
            return

        if title is None:
            # use dataset root basename
            title = ds.pathobj.name

        tags = ensure_list(tags)
        if 'DataLad dataset' not in tags:
            tags.append('DataLad dataset')
        if ds.id and ds.id not in tags:
            tags.append(ds.id)

        if not description:
            description = \
                "This component was built from a DataLad dataset using the " \
                "datalad-osf extension " \
                "(https://github.com/datalad/datalad-osf)."
            if mode != 'exportonly':
                description += \
                    " With this extension installed, this component can be " \
                    "git or datalad cloned from an 'osf://ID' URL, where " \
                    "'ID' is the OSF node ID that is shown in the OSF HTTP " \
                    "URL, e.g. https://osf.io/q8xnk can be cloned from " \
                    "osf://q8xnk. "
        cred = get_credentials(allow_interactive=True)
        osf = OSF(**cred)
        node_id, node_url = create_node(
            osf_session=osf.session,
            title=title,
            category=category,
            tags=tags if tags else None,
            public=EnsureBool()(public),
            description=description,
        )
        if mode != 'gitonly':
            init_opts = [
                "encryption=none", "type=external", "externaltype=osf",
                "autoenable=true", "node={}".format(node_id)
            ]

            if mode in ("export", "exportonly"):
                init_opts += ["exporttree=yes"]

            ds.repo.init_remote(storage_name, options=init_opts)
            if trust_level:
                ds.repo.call_git(['annex', trust_level, storage_name])

            yield get_status_dict(type="dataset",
                                  url=node_url,
                                  id=node_id,
                                  name=storage_name,
                                  status="ok",
                                  **res_kwargs)

        if mode == 'exportonly':
            return

        # append how to clone this specific dataset to the description
        description += "This particular project can be cloned using" \
                       " 'datalad clone osf://{}'".format(node_id)
        update_node(osf_session=osf.session,
                    id_=node_id,
                    description=description)

        ds.config.set('remote.{}.annex-ignore'.format(name),
                      'true',
                      where='local')
        yield from ds.siblings(
            # use configure, not add, to not trip over the config that
            # we just made
            action='configure',
            name=name,
            url='osf://{}'.format(node_id),
            fetch=False,
            publish_depends=storage_name if mode != 'gitonly' else None,
            recursive=False,
            result_renderer=None,
        )

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        from datalad.ui import ui
        if res['action'] == "create-sibling-osf":
            msg = res.get('message', None)
            ui.message("{action}({status}): {url}{msg}".format(
                action=ac.color_word(res['action'], ac.BOLD),
                status=ac.color_status(res['status']),
                url=res.get('url', ''),
                msg=' [{}]'.format(msg[0] % msg[1:] if isinstance(msg, tuple)
                                   else res['message']) if msg else ''))
        elif res['action'] == "add-sibling-osf":
            ui.message("{action}({status})".format(
                action=ac.color_word(res['action'], ac.BOLD),
                status=ac.color_status(res['status'])))
        else:
            from datalad.interface.utils import default_result_renderer
            default_result_renderer(res)
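# A hedged usage sketch for the command above: token-based authentication via
# the 'OSF_TOKEN' environment variable (value below is a placeholder), then a
# human-readable 'export' sibling named 'osf' with an 'osf-storage' special
# remote is created for the current dataset.
import os

from datalad.api import Dataset

os.environ['OSF_TOKEN'] = '<personal-access-token>'   # placeholder token

ds = Dataset('.')
ds.create_sibling_osf(
    title='My dataset',   # defaults to the dataset root basename if omitted
    mode='export',        # 'annex', 'export', 'exportonly', or 'gitonly'
    public=False)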
Example #8
class ContainersAdd(Interface):
    # first docstring line is used as a short description in the cmdline help
    # the rest is put in the verbose help and manpage
    """Add a container to a dataset
    """

    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to add the container to. If no dataset is
            given, an attempt is made to identify the dataset based on the
            current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        name=Parameter(
            args=("name", ),
            doc="""The name to register the container under. This also
                determines the default location of the container image
                within the dataset.""",
            metavar="NAME",
            constraints=EnsureStr(),
        ),
        url=Parameter(
            args=("-u", "--url"),
            doc="""A URL (or local path) to get the container image from. If
            the URL scheme is one recognized by Singularity, 'shub://' or
            'docker://', the command format string will be auto-guessed when
            [CMD: --call-fmt CMD][PY: call_fmt PY] is not specified. For the
            scheme 'dhub://', the rest of the URL will be interpreted as the
            argument to 'docker pull', the image will be saved to the location
            specified by `name`, and the call format will be auto-guessed if
            not given.""",
            metavar="URL",
            constraints=EnsureStr() | EnsureNone(),
        ),

        # TODO: The "prepared command" stuff should ultimately go somewhere else
        # (probably datalad-run). But first figure out how exactly to address
        # container datasets
        call_fmt=Parameter(
            args=("--call-fmt", ),
            doc="""Command format string indicating how to execute a command in
            this container, e.g. "singularity exec {img} {cmd}", where '{img}'
            is a placeholder for the path to the container image and '{cmd}' is
            replaced with the desired command. Additional placeholders:
            '{img_dspath}' is the relative path to the dataset containing the image.
            """,
            metavar="FORMAT",
            constraints=EnsureStr() | EnsureNone(),
        ),
        image=Parameter(
            args=("-i", "--image"),
            doc=
            """Relative path of the container image within the dataset. If not
                given, a default location will be determined using the
                `name` argument.""",
            metavar="IMAGE",
            constraints=EnsureStr() | EnsureNone(),
        ),
        update=Parameter(
            args=("--update", ),
            action="store_true",
            doc="""Update the existing container for `name`. If no other
            options are specified, URL will be set to 'updateurl', if
            configured. If a container with `name` does not already exist, this
            option is ignored."""))

    @staticmethod
    @datasetmethod(name='containers_add')
    @eval_results
    def __call__(name,
                 url=None,
                 dataset=None,
                 call_fmt=None,
                 image=None,
                 update=False):
        if not name:
            raise InsufficientArgumentsError("`name` argument is required")

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='add container')
        runner = Runner()

        # prevent madness in the config file
        if not re.match(r'^[0-9a-zA-Z-]+$', name):
            raise ValueError(
                "Container names can only contain alphanumeric characters "
                "and '-', got: '{}'".format(name))

        cfgbasevar = "datalad.containers.{}".format(name)
        if cfgbasevar + ".image" in ds.config:
            if not update:
                yield get_status_dict(
                    action="containers_add",
                    ds=ds,
                    logger=lgr,
                    status="impossible",
                    message=("Container named %r already exists. "
                             "Use --update to reconfigure.", name))
                return

            if not (url or image or call_fmt):
                # No updated values were provided. See if an update url is
                # configured (currently relevant only for Singularity Hub).
                url = ds.config.get(cfgbasevar + ".updateurl")
                if not url:
                    yield get_status_dict(
                        action="containers_add",
                        ds=ds,
                        logger=lgr,
                        status="impossible",
                        message="No values to update specified")
                    return

            call_fmt = call_fmt or ds.config.get(cfgbasevar + ".cmdexec")
            image = image or ds.config.get(cfgbasevar + ".image")

        if not image:
            loc_cfg_var = "datalad.containers.location"
            # TODO: We should provide an entry point (or sth similar) for extensions
            # to get config definitions into the ConfigManager. In other words an
            # easy way to extend definitions in datalad's common_cfgs.py.
            container_loc = \
                ds.config.obtain(
                    loc_cfg_var,
                    where=definitions[loc_cfg_var]['destination'],
                    # store=False, otherwise the dataset config file
                    # itself would be modified -- undesirable here
                    store=False,
                    default=definitions[loc_cfg_var]['default'],
                    dialog_type=definitions[loc_cfg_var]['ui'][0],
                    valtype=definitions[loc_cfg_var]['type'],
                    **definitions[loc_cfg_var]['ui'][1]
                )
            image = op.join(ds.path, container_loc, name, 'image')
        else:
            image = op.join(ds.path, image)

        result = get_status_dict(
            action="containers_add",
            path=image,
            type="file",
            logger=lgr,
        )

        if call_fmt is None:
            # maybe built in knowledge can help
            call_fmt = _guess_call_fmt(ds, name, url)

        # collect bits for a final and single save() call
        to_save = []
        imgurl = url
        was_updated = False
        if url:
            if update and op.lexists(image):
                was_updated = True
                # XXX: check=False is used to avoid dropping the image. It
                # should use drop=False if remove() gets such an option (see
                # DataLad's gh-2673).
                for r in ds.remove(image,
                                   save=False,
                                   check=False,
                                   return_type="generator"):
                    yield r

            imgurl = _resolve_img_url(url)
            lgr.debug('Attempt to obtain container image from: %s', imgurl)
            if url.startswith("dhub://"):
                from .adapters import docker

                docker_image = url[len("dhub://"):]

                lgr.debug("Running 'docker pull %s' and saving image to %s",
                          docker_image, image)
                runner.run(["docker", "pull", docker_image])
                docker.save(docker_image, image)
            elif url.startswith("docker://"):
                image_dir, image_basename = op.split(image)
                if not image_basename:
                    raise ValueError("No basename in path {}".format(image))
                if image_dir and not op.exists(image_dir):
                    os.makedirs(image_dir)

                lgr.info(
                    "Building Singularity image for %s "
                    "(this may take some time)", url)
                runner.run(["singularity", "build", image_basename, url],
                           cwd=image_dir or None)
            elif op.exists(url):
                lgr.info("Copying local file %s to %s", url, image)
                image_dir = op.dirname(image)
                if image_dir and not op.exists(image_dir):
                    os.makedirs(image_dir)
                copyfile(url, image)
            else:
                try:
                    ds.repo.add_url_to_file(image, imgurl)
                except Exception as e:
                    result["status"] = "error"
                    result["message"] = str(e)
                    yield result
            # TODO do we have to take care of making the image executable
            # if --call_fmt is not provided?
            to_save.append(image)
        # continue despite a remote access failure; the following config
        # setting will enable running the command again with just the name
        # given, to ease a re-run
        if not op.lexists(image):
            result["status"] = "error"
            result["message"] = ('no image at %s', image)
            yield result
            return

        # store configs
        if imgurl != url:
            # store originally given URL, as it resolves to something
            # different and maybe can be used to update the container
            # at a later point in time
            ds.config.set("{}.updateurl".format(cfgbasevar), url)
        # force store the image, and prevent multiple entries
        ds.config.set("{}.image".format(cfgbasevar),
                      op.relpath(image, start=ds.path),
                      force=True)
        if call_fmt:
            ds.config.set("{}.cmdexec".format(cfgbasevar),
                          call_fmt,
                          force=True)
        # store changes
        to_save.append(op.join(".datalad", "config"))
        for r in ds.save(
                path=to_save,
                message="[DATALAD] {do} containerized environment '{name}'".
                format(do="Update" if was_updated else "Configure",
                       name=name)):
            yield r
        result["status"] = "ok"
        yield result
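# A brief usage sketch for the command above, assuming the datalad-container
# extension is installed; the container name and Singularity Hub URL are
# illustrative. For 'shub://' and 'docker://' URLs the call format is
# auto-guessed when call_fmt is not given.
from datalad.api import Dataset

ds = Dataset('.')
ds.containers_add(
    name='my-container',
    url='shub://some-org/some-image')   # made-up image location
# the registration ends up in the dataset configuration
print(ds.config.get('datalad.containers.my-container.image'))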
Example #9
class CheckDates(Interface):
    """Find repository dates that are more recent than a reference date.

    The main purpose of this tool is to find "leaked" real dates in
    repositories that are configured to use fake dates. It checks dates from
    three sources: (1) commit timestamps (author and committer dates), (2)
    timestamps within files of the "git-annex" branch, and (3) the timestamps
    of annotated tags.
    """
    from datalad.interface.utils import eval_results
    import datalad.support.ansi_colors as ac
    from datalad.support.constraints import EnsureChoice, EnsureNone, EnsureStr
    from datalad.support.param import Parameter

    result_renderer = "tailored"

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        """Like 'json_pp', but skip non-error results without flagged objects.
        """
        # FIXME: I think the proper way to do this is to use 'result_filter',
        # but I couldn't seem to get eval_results to detect the filter when I
        # used
        #
        #      result_renderer = "json_pp"
        #      result_filter = lambda x: ...
        #
        # Also, I want to keep the "message" key for errors.
        from datalad.ui import ui
        to_render = {}
        if res["status"] == "error":
            to_render = dict(res.items())
        elif "report" in res and res["report"]["objects"]:
            to_render = {
                k: v
                for k, v in res.items()
                if k not in ["status", "message", "logger"]
            }
        if to_render:
            ui.message(json.dumps(to_render, sort_keys=True, indent=2))

    _params_ = dict(
        paths=Parameter(
            args=("paths", ),
            metavar="PATH",
            nargs="*",
            doc="""Root directory in which to search for Git repositories. The
            current working directory will be used by default.""",
            constraints=EnsureStr() | EnsureNone()),
        reference_date=Parameter(
            args=("-D", "--reference-date"),
            metavar="DATE",
            doc="""Compare dates to this date. If dateutil is installed, this
            value can be any format that its parser recognizes. Otherwise, it
            should be a unix timestamp that starts with a "@". The default
            value corresponds to 01 Jan, 2018 00:00:00 -0000.""",
            constraints=EnsureStr()),
        revs=Parameter(
            args=("--rev", ),
            dest="revs",
            action="append",
            metavar="REVISION",
            doc="""Search timestamps from commits that are reachable from [PY:
            these revisions PY][CMD: REVISION CMD]. Any revision specification
            supported by :command:`git log`, including flags like --all and
            --tags, can be used.[CMD:  This option can be given multiple times.
            CMD]"""),
        annex=Parameter(
            args=("--annex", ),
            doc="""Mode for "git-annex" branch search. If 'all', all blobs
            within the branch are searched. 'tree' limits the search to blobs
            that are referenced by the tree at the tip of the branch. 'none'
            disables search of "git-annex" blobs.""",
            constraints=EnsureChoice("all", "tree", "none")),
        no_tags=Parameter(args=("--no-tags", ),
                          action="store_true",
                          doc="""Don't check the dates of annotated tags."""),
        older=Parameter(
            args=("--older", ),
            action="store_true",
            doc="""Find dates which are older than the reference date rather
            than newer."""),
    )

    @staticmethod
    @eval_results
    def __call__(paths,
                 reference_date="@1514764800",
                 revs=None,
                 annex="all",
                 no_tags=False,
                 older=False):
        from datalad.support.repodates import check_dates

        which = "older" if older else "newer"

        try:
            ref_ts = _parse_date(reference_date)
        except ValueError as exc:
            lgr.error("Could not parse '%s' as a date", reference_date)
            yield get_status_dict("check_dates",
                                  status="error",
                                  message=exc_str(exc))
            return

        lgr.info("Searching for dates %s than %s", which,
                 time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(ref_ts)))

        for repo in _git_repos(paths or ["."]):
            fullpath = os.path.abspath(repo)
            lgr.debug("Checking %s", fullpath)

            try:
                report = check_dates(repo,
                                     ref_ts,
                                     which=which,
                                     revs=revs or ["--all"],
                                     annex={
                                         "all": True,
                                         "none": False,
                                         "tree": "tree"
                                     }[annex],
                                     tags=not no_tags)
            except InvalidGitRepositoryError as exc:
                lgr.warning("Skipping invalid Git repo: %s", repo)
                continue

            yield get_status_dict(
                "check_dates",
                status="ok",
                path=fullpath,
                message=("Found {} dates" if report["objects"] else
                         "No {} dates found").format(which),
                report=report)
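# A minimal sketch of calling the underlying helper directly, mirroring the
# defaults of the command above (the repository path '.' is illustrative).
from datalad.support.repodates import check_dates

report = check_dates(
    '.',                 # path to a Git repository
    1514764800,          # reference timestamp: 01 Jan 2018 00:00:00 +0000
    which='newer',       # or 'older'
    revs=['--all'],
    annex=True,          # search all blobs of the git-annex branch
    tags=True)
# 'objects' lists everything with a flagged (e.g. leaked real) date
print(report['objects'])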
Example #10
class Save(Interface):
    """Save the current state of a dataset

    Saving the state of a dataset records changes that have been made to it.
    This change record is annotated with a user-provided description.
    Optionally, an additional tag, such as a version, can be assigned to the
    saved state. Such tag enables straightforward retrieval of past versions at
    a later point in time.

    .. note::
      Before Git v2.22, any Git repository without an initial commit located
      inside a Dataset is ignored, and content underneath it will be saved to
      the respective superdataset. DataLad datasets always have an initial
      commit, hence are not affected by this behavior.
    """
    # note above documents that our behavior is like that of `git add`, but
    # does not explicitly mention the connection to keep it simple.

    _examples_ = [
        dict(text="""Save any content underneath the current directory, without
             altering any potential subdataset""",
             code_py="save(path='.')",
             code_cmd="datalad save ."),
        dict(text="""Save specific content in the dataset""",
             code_py="save(path='myfile.txt')",
             code_cmd="datalad save myfile.txt"),
        dict(text="""Attach a commit message to save""",
             code_py="save(path='myfile.txt', message='add file')",
             code_cmd="datalad save -m 'add file' myfile.txt"),
        dict(text="""Save any content underneath the current directory, and
             recurse into any potential subdatasets""",
             code_py="save(path='.', recursive=True)",
             code_cmd="datalad save . -r"),
        dict(
            text="Save any modification of known dataset content in the "
            "current directory, but leave untracked files (e.g. temporary files) "
            "untouched",
            code_py="""save(path='.', updated=True)""",
            code_cmd="""datalad save -u ."""),
        dict(text="Tag the most recent saved state of a dataset",
             code_py="save(version_tag='bestyet')",
             code_cmd="datalad save --version-tag 'bestyet'"),
        dict(
            text="Save a specific change but integrate into last commit keeping "
            "the already recorded commit message",
            code_py="save(path='myfile.txt', amend=True)",
            code_cmd="datalad save myfile.txt --amend")
    ]

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to save""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path/name of the dataset component to save. If given, only
            changes made to those components are recorded in the new state.""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        message=save_message_opt,
        message_file=Parameter(
            args=("-F", "--message-file"),
            doc="""take the commit message from this file. This flag is
            mutually exclusive with -m.""",
            constraints=EnsureStr() | EnsureNone()),
        version_tag=Parameter(
            args=(
                "-t",
                "--version-tag",
            ),
            metavar='ID',
            doc="""an additional marker for that state. Every dataset that
            is touched will receive the tag.""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        updated=Parameter(
            args=(
                '-u',
                '--updated',
            ),
            action='store_true',
            doc="""if given, only saves previously tracked paths."""),
        to_git=Parameter(
            args=("--to-git", ),
            action='store_true',
            doc="""flag whether to add data directly to Git, instead of
            tracking data identity only.  Use with caution, there is no
            guarantee that a file put directly into Git like this will
            not be annexed in a subsequent save operation.
            If not specified, it will be up to git-annex to decide how
            a file is tracked, based on a dataset's configuration
            to track particular paths,
            file types, or file sizes with either Git or git-annex.
            (see https://git-annex.branchable.com/tips/largefiles).
            """),
        jobs=jobs_opt,
        amend=Parameter(
            args=('--amend', ),
            action='store_true',
            doc="""if set, changes are not recorded in a new, separate
            commit, but are integrated with the changeset of the previous
            commit, and both together are recorded by replacing that
            previous commit. This is mutually exclusive with recursive
            operation.
            """),
    )

    @staticmethod
    @datasetmethod(name='save')
    @eval_results
    def __call__(
        path=None,
        message=None,
        dataset=None,
        version_tag=None,
        recursive=False,
        recursion_limit=None,
        updated=False,
        message_file=None,
        to_git=None,
        jobs=None,
        amend=False,
    ):
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        if amend and recursive:
            raise ValueError("Cannot amend a commit recursively.")

        path = ensure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   that a switch from --untracked=no to --untracked=all
        #   in Repo.save()

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='save')

        # use status() to do all discovery and annotation of paths
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as is,
                # and not a dataset instance in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                report_filetype=False,
                recursive=recursive,
                recursion_limit=recursion_limit,
                on_failure='ignore',
                # for save without recursion only commit matters
                eval_subdataset_state='full' if recursive else 'commit',
                result_renderer='disabled'):
            if s['status'] == 'error':
                # Downstream code can't do anything with these. Let the caller
                # decide their fate.
                yield s
                continue

            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in s.items()
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in dataset_hierarchies.items():
            edges = {}
            discover_dataset_trace_to_targets(rootds,
                                              children, [],
                                              edges,
                                              includeds=children)
            for superds, subdss in edges.items():
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    subds_path = ut.Path(subds)
                    sub_status = superds_status.get(subds_path, {})
                    if not (sub_status.get("state") == "clean"
                            and sub_status.get("type") == "dataset"):
                        # TODO actually start from an entry that may already
                        # exist in the status record
                        superds_status[subds_path] = dict(
                            # shot from the hip, some status config
                            # to trigger this specific super/sub
                            # relation to be saved
                            state='untracked',
                            type='dataset')
                paths_by_ds[superds] = superds_status

        def save_ds(args, version_tag=None):
            pdspath, paths = args

            pds = Dataset(pdspath)
            pds_repo = pds.repo
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location, which avoids
                # cumbersome symlink handling without context in the
                # lower levels
                pds_repo.pathobj / p.relative_to(pdspath): props
                for p, props in paths.items()
            }
            start_commit = pds_repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()) or \
                    (amend and message):
                for res in pds_repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        git=True
                        if not hasattr(ds.repo, 'annexstatus') else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status,
                        amend=amend):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = str(
                                # recode path back to dataset path anchor
                                pds.pathobj /
                                res[k].relative_to(pds_repo.pathobj))
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                status='ok'
                if start_commit != pds_repo.get_hexsha() else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                return
            try:
                # method requires str
                version_tag = str(version_tag)
                pds_repo.tag(version_tag)
                dsres.update(status='ok', version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    # TODO: we will get duplicate dataset/save record obscuring
                    # progress reporting.  yoh thought to decouple "tag" from "save"
                    # messages but was worrying that original authors would disagree
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(status='error',
                             message=('cannot tag this version: %s',
                                      e.stderr.strip()))
                yield dsres

        if not paths_by_ds:
            # Special case: empty repo. There's either an empty commit only or
            # none at all. An empty one we can amend; otherwise there's
            # nothing to do.
            if amend and ds.repo.get_hexsha():
                yield from save_ds((ds.pathobj, dict()),
                                   version_tag=version_tag)

            else:
                yield dict(action='save',
                           type='dataset',
                           path=ds.path,
                           refds=ds.path,
                           status='notneeded',
                           logger=lgr)
            return

        # TODO: in principle logging could be improved to go not by a dataset
        # but by path(s) within subdatasets. That should provide a bit better ETA
        # and more "dynamic" feedback than jumpy datasets count.
        # See addurls where it is implemented that way by providing agg and another
        # log_filter
        yield from ProducerConsumerProgressLog(
            sorted(paths_by_ds.items(), key=lambda v: v[0], reverse=True),
            partial(save_ds, version_tag=version_tag),
            safe_to_consume=no_subds_in_futures,
            producer_future_key=lambda ds_items: ds_items[0],
            jobs=jobs,
            log_filter=_log_filter_save_dataset,
            unit="datasets",
            lgr=lgr,
        )
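
# A minimal, hypothetical usage sketch of the resulting `save` command via the
# Python API; the dataset path and tag name below are made up for illustration.
import datalad.api as dl

for res in dl.save(dataset='path/to/ds', recursive=True,
                   message='record current state', version_tag='snapshot-1',
                   return_type='generator'):
    print(res['status'], res.get('path'))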
Example #11
class ExportArchiveORA(Interface):
    """Export an archive of a local annex object store for the ORA remote.

    Keys in the local annex object store are reorganized in a temporary
    directory (using links to avoid storage duplication) to use the
    'hashdirlower' setup used by git-annex for bare repositories and
    the directory-type special remote. This alternative object store is
    then moved into a 7zip archive that is suitable for use in an
    ORA remote dataset store. Placing such an archive into::

      <dataset location>/archives/archive.7z

    enables the ORA special remote to locate and retrieve all keys contained
    in the archive.
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to process.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        target=Parameter(
            args=("target",),
            metavar="TARGET",
            doc="""if an existing directory, an 'archive.7z' is placed into
            it, otherwise this is the path to the target archive""",
            constraints=EnsureStr() | EnsureNone()),
        remote=Parameter(
            args=("--for",),
            dest="remote",
            metavar='LABEL',
            doc="""name of the target sibling, wanted/preferred settings
            will be used to filter the files added to the archives""",
            constraints=EnsureStr() | EnsureNone()),
        annex_wanted=Parameter(
            args=("--annex-wanted",),
            metavar="FILTERS",
            doc="""git-annex-preferred-content expression for
            git-annex find to filter files. Should start with
            'or' or 'and' when used in combination with `--for`"""),
        froms=Parameter(
            args=("--from",),
            dest="froms",
            metavar="FROM",
            nargs="+",
            doc="""one or multiple tree-ish from which to select files"""),
        opts=Parameter(
            args=("opts",),
            nargs=REMAINDER,
            metavar="...",
            doc="""list of options for 7z to replace the default '-mx0' to
            generate an uncompressed archive"""),
        missing_content=Parameter(
            args=("--missing-content",),
            doc="""By default, any discovered file with missing content will
            result in an error and the export is aborted. Setting this to
            'continue' will issue warnings instead of failing on error. The
            value 'ignore' will only inform about problems at the 'debug' log
            level. The latter two can be helpful when generating an archive
            from a dataset where some file content is not available
            locally.""",
            constraints=EnsureChoice("error", "continue", "ignore")),
    )

    @staticmethod
    @datasetmethod(name='export_archive_ora')
    @eval_results
    def __call__(
            target,
            opts=None,
            *,  # opts is positional but optional in CLI
            dataset=None,
            remote=None,
            annex_wanted=None,
            froms=None,
            missing_content='error',):
        # only non-bare repos have hashdirmixed, so require one
        ds = require_dataset(
            dataset, check_installed=True, purpose='export to ORA archive')
        ds_repo = ds.repo

        annex_objs = ds_repo.dot_git / 'annex' / 'objects'

        archive = resolve_path(target, dataset)
        if archive.is_dir():
            archive = archive / 'archive.7z'
        else:
            archive.parent.mkdir(exist_ok=True, parents=True)

        froms = ensure_list(froms)

        if not opts:
            # uncompressed by default
            opts = ['-mx0']

        res_kwargs = dict(
            action="export-archive-ora",
            logger=lgr,
        )

        if not annex_objs.is_dir():
            yield get_status_dict(
                ds=ds,
                status='notneeded',
                message='no annex keys present',
                **res_kwargs,
            )
            return

        exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ora_archive'
        if exportdir.exists():
            yield get_status_dict(
                ds=ds,
                status='error',
                message=(
                    'export directory already exists, please remove first: %s',
                    str(exportdir)),
                **res_kwargs,
            )
            return

        def expr_to_opts(expr):
            opts = []
            expr = expr.replace('(', ' ( ').replace(')', ' ) ')
            for sub_expr in expr.split(' '):
                if len(sub_expr):
                    if sub_expr in '()':
                        opts.append(f"-{sub_expr}")
                    else:
                        opts.append(f"--{sub_expr}")
            return opts
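        # Illustrative sketch (expression made up) of what expr_to_opts yields:
        #   expr_to_opts('include=*.nii.gz and (not metadata=no-export=*)')
        #   -> ['--include=*.nii.gz', '--and',
        #       '-(', '--not', '--metadata=no-export=*', '-)']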

        find_filters = []
        if remote:
            find_filters = ['-('] + expr_to_opts(ds_repo.get_preferred_content('wanted', remote)) + ['-)']
        if annex_wanted:
            find_filters.extend(expr_to_opts(annex_wanted))
        # git-annex find results need to be uniqued with set, as git-annex find
        # will return duplicates if multiple symlinks point to the same key.
        if froms:
            keypaths = set(
                annex_objs.joinpath(k)
                for treeish in froms
                for k in ds_repo.call_annex_items_([
                    'find', *find_filters, f"--branch={treeish}",
                    "--format=${hashdirmixed}${key}/${key}\\n"])
            )
        else:
            keypaths = set(annex_objs.joinpath(k) for k in ds_repo.call_annex_items_([
                'find', *find_filters,
                "--format=${hashdirmixed}${key}/${key}\\n"
            ]))

        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Start ORA archive export %s', ds,
            total=len(keypaths),
            label='ORA archive export',
            unit=' Keys',
        )

        if missing_content == 'continue':
            missing_file_lgr_func = lgr.warning
        elif missing_content == 'ignore':
            missing_file_lgr_func = lgr.debug

        link_fx = os.link
        for keypath in keypaths:
            key = keypath.name
            hashdir = op.join(keypath.parts[-4], keypath.parts[-3])
            log_progress(
                lgr.info,
                'oraarchiveexport',
                'Export key %s to %s', key, hashdir,
                update=1,
                increment=True)
            keydir = exportdir / hashdir / key
            keydir.mkdir(parents=True, exist_ok=True)
            try:
                link_fx(str(keypath), str(keydir / key))
            except FileNotFoundError as e:
                if missing_content == 'error':
                    raise IOError('Key %s has no content available' % keypath)
                missing_file_lgr_func(
                    'Key %s has no content available',
                    str(keypath))
            except OSError:
                lgr.warning(
                    'No hard links supported at %s, will copy files instead',
                    str(keypath))
                # no hard links supported
                # switch function after first error
                link_fx = shutil.copyfile
                link_fx(str(keypath), str(keydir / key))

        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Finished ORA archive export from %s', ds
        )
        try:
            subprocess.run(
                ['7z', 'u', str(archive), '.'] + opts,
                cwd=str(exportdir),
                # raise on a non-zero 7z exit code so the error handling
                # below actually reports failures
                check=True,
            )
            yield get_status_dict(
                path=str(archive),
                type='file',
                status='ok',
                **res_kwargs)
        except Exception as e:
            ce = CapturedException(e)
            yield get_status_dict(
                path=str(archive),
                type='file',
                status='error',
                message=('7z failed: %s', ce),
                exception=ce,
                **res_kwargs)
            return
        finally:
            rmtree(str(exportdir))
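
# A minimal, hypothetical usage sketch of this command via the Python API
# (dataset path and target archive are made up; assumes the command is
# exposed as datalad.api.export_archive_ora).
import datalad.api as dl

for res in dl.export_archive_ora('/tmp/ora-export/archive.7z',
                                 dataset='path/to/ds',
                                 missing_content='continue',
                                 return_type='generator'):
    print(res['status'], res.get('path'))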
Example #12
class ExportArchive(Interface):
    """Export the content of a dataset as a TAR/ZIP archive.
    """
    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import (
        EnsureChoice,
        EnsureNone,
        EnsureStr,
    )

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""specify the dataset to export. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        filename=Parameter(
            args=("filename", ),
            metavar="PATH",
            nargs='?',
            doc="""File name of the generated TAR archive. If no file name is
            given the archive will be generated in the current directory and
            will be named: datalad_<dataset_uuid>.(tar.*|zip). To generate that
            file in a different directory, provide an existing directory as the
            file name.""",
            constraints=EnsureStr() | EnsureNone()),
        archivetype=Parameter(args=("-t", "--archivetype"),
                              doc="""Type of archive to generate.""",
                              constraints=EnsureChoice("tar", "zip")),
        compression=Parameter(
            args=("-c", "--compression"),
            doc="""Compression method to use.  'bz2' is not supported for ZIP
            archives.  No compression is used when an empty string is
            given.""",
            constraints=EnsureChoice("gz", "bz2", "")),
        missing_content=Parameter(
            args=("--missing-content", ),
            doc="""By default, any discovered file with missing content will
            result in an error and the export is aborted. Setting this to
            'continue' will issue warnings instead of failing on error. The
            value 'ignore' will only inform about problems at the 'debug' log
            level. The latter two can be helpful when generating a TAR archive
            from a dataset where some file content is not available
            locally.""",
            constraints=EnsureChoice("error", "continue", "ignore")),
    )

    @staticmethod
    @datasetmethod(name='export_archive')
    @eval_results
    def __call__(dataset,
                 filename=None,
                 archivetype='tar',
                 compression='gz',
                 missing_content='error'):
        import os
        import tarfile
        import zipfile
        from unittest.mock import patch
        from os.path import join as opj, dirname, normpath, isabs
        import os.path as op

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo
        from datalad.dochelpers import exc_str

        import logging
        lgr = logging.getLogger('datalad.plugin.export_archive')

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit when any particular
            # file was changed -- would be the most kosher yoh thinks to the
            # degree of our abilities
            ti.mtime = committed_date
            return ti

        tar_args = dict(recursive=False, filter=_filter_tarinfo)

        file_extension = '.{}{}'.format(
            archivetype, '{}{}'.format('.' if compression else '', compression)
            if archivetype == 'tar' else '')
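        # e.g. archivetype='tar', compression='gz' -> '.tar.gz'
        #      archivetype='tar', compression=''   -> '.tar'
        #      archivetype='zip' (extension ignores compression) -> '.zip'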

        default_filename = "datalad_{.id}".format(dataset)
        if filename is None:
            filename = default_filename  # in current directory
        elif op.exists(filename) and op.isdir(filename):
            filename = op.join(filename,
                               default_filename)  # under given directory
        if not filename.endswith(file_extension):
            filename += file_extension

        root = dataset.path
        # use dir inside matching the output filename
        # TODO: could be an option to the export plugin allowing empty value
        # for no leading dir
        leading_dir = file_basename(filename)

        # workaround for inability to pass down the time stamp
        with patch('time.time', return_value=committed_date), \
                tarfile.open(filename, "w:{}".format(compression)) \
                if archivetype == 'tar' \
                else zipfile.ZipFile(
                    filename, 'w',
                    zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
                as archive:
            add_method = archive.add if archivetype == 'tar' else archive.write
            repo_files = sorted(repo.get_indexed_files())
            if isinstance(repo, AnnexRepo):
                annexed = repo.is_under_annex(repo_files,
                                              allow_quick=True,
                                              batch=True)
                # remember: returns False for files in Git!
                has_content = repo.file_has_content(repo_files,
                                                    allow_quick=True,
                                                    batch=True)
            else:
                annexed = [False] * len(repo_files)
                has_content = [True] * len(repo_files)
            for i, rpath in enumerate(repo_files):
                fpath = opj(root, rpath)
                if annexed[i]:
                    if not has_content[i]:
                        if missing_content in ('ignore', 'continue'):
                            (lgr.warning
                             if missing_content == 'continue' else lgr.debug)(
                                 'File %s has no content available, skipped',
                                 fpath)
                            continue
                        else:
                            raise IOError('File %s has no content available' %
                                          fpath)

                    # resolve to possible link target
                    if op.islink(fpath):
                        link_target = os.readlink(fpath)
                        if not isabs(link_target):
                            link_target = normpath(
                                opj(dirname(fpath), link_target))
                        fpath = link_target
                # name in the archive
                aname = normpath(opj(leading_dir, rpath))
                add_method(fpath,
                           arcname=aname,
                           **(tar_args if archivetype == 'tar' else {}))

        if not isabs(filename):
            filename = opj(os.getcwd(), filename)

        yield dict(status='ok',
                   path=filename,
                   type='file',
                   action='export_archive',
                   logger=lgr)
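
# A minimal, hypothetical usage sketch (dataset path and file name are made up;
# assumes the plugin is exposed as datalad.api.export_archive).
import datalad.api as dl

# write an uncompressed ZIP archive, skipping files whose annexed content
# is not locally present
dl.export_archive(dataset='path/to/ds', filename='ds_snapshot',
                  archivetype='zip', compression='',
                  missing_content='continue')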
Example #13
class AggregateMetaData(Interface):
    """Aggregate metadata of one or more datasets for later query.

    Metadata aggregation refers to a procedure that extracts metadata present
    in a dataset into a portable representation that is stored in a single
    standardized format. Moreover, metadata aggregation can also extract
    metadata in this format from one dataset and store it in another
    (super)dataset. Based on such collections of aggregated metadata it is
    possible to discover particular datasets and specific parts of their
    content, without having to obtain the target datasets first (see the
    DataLad 'search' command).

    To enable aggregation of metadata that are contained in files of a dataset,
    one has to enable one or more metadata extractors for a dataset. DataLad
    supports a number of common metadata standards, such as the Exchangeable
    Image File Format (EXIF), Adobe's Extensible Metadata Platform (XMP), and
    various audio file metadata systems like ID3. In addition, a number of
    scientific metadata standards are supported, like DICOM, BIDS, or datacite.
    Some metadata extractors depend on particular 3rd-party software. The list of
    metadata extractors available to a particular DataLad installation is reported
    by the 'wtf' plugin ('datalad wtf').

    Enabling a metadata extractor for a dataset is done by adding its name to the
    'datalad.metadata.nativetype' configuration variable -- typically in the
    dataset's configuration file (.datalad/config), e.g.::

      [datalad "metadata"]
        nativetype = exif
        nativetype = xmp

    Enabling multiple extractors is supported. In this case, metadata are
    extracted by each extractor individually, and stored alongside each other.
    Metadata aggregation will also extract DataLad's own metadata (extractor
    'datalad_core').

    Metadata aggregation can be performed recursively, in order to aggregate all
    metadata across all subdatasets, for example, to be able to search across
    any content in any dataset of a collection. Aggregation can also be performed
    for subdatasets that are not available locally. In this case, pre-aggregated
    metadata from the closest available superdataset will be considered instead.

    Depending on the versatility of the present metadata and the number of dataset
    or files, aggregated metadata can grow prohibitively large. A number of
    configuration switches are provided to mitigate such issues.

    datalad.metadata.aggregate-content-<extractor-name>
      If set to false, content metadata aggregation will not be performed for
      the named metadata extractor (a potential underscore '_' in the extractor name must
      be replaced by a dash '-'). This can substantially reduce the runtime for
      metadata extraction, and also reduce the size of the generated metadata
      aggregate. Note, however, that some extractors may not produce any metadata
      when this is disabled, because their metadata might come from individual
      file headers only. 'datalad.metadata.store-aggregate-content' might be
      a more appropriate setting in such cases.

    datalad.metadata.aggregate-ignore-fields
      Any metadata key matching any regular expression in this configuration setting
      is removed prior to generating the dataset-level metadata summary (keys
      and their unique values across all dataset content), and from the dataset
      metadata itself. This switch can also be used to filter out sensitive
      information prior to aggregation.

    datalad.metadata.generate-unique-<extractor-name>
      If set to false, DataLad will not auto-generate a summary of unique content
      metadata values for a particular extractor as part of the dataset-global metadata
      (a potential underscore '_' in the extractor name must be replaced by a dash '-').
      This can be useful if such a summary is bloated due to minor uninformative (e.g.
      numerical) differences, or when a particular extractor already provides a
      carefully designed content metadata summary.

    datalad.metadata.maxfieldsize
      Any metadata value that exceeds the size threshold given by this configuration
      setting (in bytes/characters) is removed.

    datalad.metadata.store-aggregate-content
      If set, extracted content metadata are still used to generate a dataset-level
      summary of present metadata (all keys and their unique values across all
      files in a dataset are determined and stored as part of the dataset-level
      metadata aggregate, see datalad.metadata.generate-unique-<extractor-name>),
      but metadata on individual files are not stored.
      This switch can be used to avoid prohibitively large metadata files. Discovery
      of datasets containing content matching particular metadata properties will
      still be possible, but such datasets would have to be obtained first in order
      to discover which particular files in them match these properties.
    """
    _params_ = dict(
        # TODO add option to not update aggregated data/info in intermediate
        # datasets
        # TODO add option for full aggregation (not incremental), so when something
        # is not present nothing about it is preserved in the aggregated metadata
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""topmost dataset metadata will be aggregated into. All dataset
            between this dataset and any given path will receive updated
            aggregated metadata from all given paths.""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path to datasets that shall be aggregated.
            When a given path is pointing into a dataset, the metadata of the
            containing dataset will be aggregated.""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        save=nosave_opt,
    )

    @staticmethod
    @datasetmethod(name='aggregate_metadata')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            save=True):
        refds_path = Interface.get_refds_path(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(
            dataset, check_installed=True, purpose='metadata aggregation')
        # always include the reference dataset
        path = assure_list(path)
        path.append(ds.path)

        agginfo_db = {}
        to_save = []
        to_aggregate = set()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='aggregate_metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            ap_type = ap.get('type', None)
            ap_state = ap.get('state', None)
            assert('parentds' in ap or ap_type == 'dataset')
            if ap_type == 'dataset' and ap_state != 'absent':
                # a present dataset, we can take directly from it
                aggsrc = ap['path']
                lgr.info('Aggregate metadata for dataset %s', aggsrc)
            else:
                # everything else needs to come from the parent
                aggsrc = ap['parentds']
                if ap_state == 'absent':
                    lgr.info(
                        'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                        ap['path'],
                        aggsrc)
                else:
                    lgr.info(
                        'Aggregate metadata for %s from dataset at %s',
                        ap['path'],
                        aggsrc)

            to_aggregate.add(aggsrc)

            if ap_state == 'absent':
                # key thought: recursive is done by path annotation, hence
                # once we hit an absent dataset, we are 100% certain that
                # there is nothing to recurse into on the file system
                # hence we only have to look into the aggregated metadata
                # of the last available dataset in the dataset tree edge
                #
                # if there is nothing at this path, we need to look into the
                # parentds and check if we know anything about this path
                # if we do, we need to grab all the info and objects
                # if not, we need to error
                res = _get_dsinfo_from_aggmetadata(
                    aggsrc, ap['path'], recursive, agginfo_db)
                if not isinstance(res, list):
                    yield get_status_dict(
                        status='impossible',
                        message=res,
                        action='aggregate_metadata',
                        path=ap['path'],
                        logger=lgr)
                    continue
                # cue for aggregation
                to_aggregate.update(res)
            else:
                # actually aggregate metadata for this dataset, immediately place
                # generated objects into the aggregated or reference dataset,
                # and put info into the DB to get it distributed to all datasets
                # that need to be updated
                errored = _extract_metadata(
                    ds,
                    Dataset(aggsrc),
                    agginfo_db,
                    to_save)
                if errored:
                    yield get_status_dict(
                        status='error',
                        message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                        action='aggregate_metadata',
                        path=aggsrc,
                        logger=lgr)

        # at this point we have dumped all aggregated metadata into object files
        # somewhere, we know what needs saving, but have not saved anything yet, and
        # we know about the states of all aggregated datasets in the DB
        # what remains to do is to update all datasets, so they have their own copy
        # of aggregated metadata and update their respective aggregate.json with
        # info on what states we just aggregated from

        # first, let's figure out what datasets need updating at all
        # get adjacency info of the dataset tree spanning the base to all leaf datasets
        # associated with the path arguments
        ds_adj = {}
        discover_dataset_trace_to_targets(
            ds.path, to_aggregate, [], ds_adj,
            # we know that to_aggregate only lists datasets, existing and
            # absent ones -- we want to aggregate all of them, either from
            # just extracted metadata, or from previously aggregated metadata
            # of the closest superdataset
            includeds=to_aggregate)
        # TODO we need to work in the info about datasets that we only got from
        # aggregated metadata, that had no trace on the file system in here!!
        subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
        # go over datasets in bottom-up fashion
        for parentds_path in sorted(subtrees, reverse=True):
            lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)

            _update_ds_agginfo(
                ds.path,
                parentds_path,
                subtrees[parentds_path],
                agginfo_db,
                to_save)
            # update complete
            res = get_status_dict(
                status='ok',
                action='aggregate_metadata',
                path=parentds_path,
                type='dataset',
                logger=lgr)
            res.update(agginfo_db.get(parentds_path, {}))
            yield res
        #
        # save potential modifications to dataset global metadata
        #
        if not to_save:
            return
        lgr.info('Attempting to save %i files/datasets', len(to_save))
        for res in Save.__call__(
                path=to_save,
                dataset=refds_path,
                message='[DATALAD] dataset aggregate metadata update',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
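
# A minimal, hypothetical sketch of enabling an extractor and aggregating
# recursively (dataset path is made up; aggregate_metadata availability depends
# on the installed DataLad metadata support).
import datalad.api as dl

superds = dl.Dataset('path/to/superds')
# enable the XMP extractor in the dataset's own configuration
superds.config.add('datalad.metadata.nativetype', 'xmp', where='dataset')
superds.save('.datalad/config', message='enable XMP metadata extraction')
# aggregate metadata from the superdataset and all installed subdatasets
dl.aggregate_metadata(dataset=superds, recursive=True)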
Example #14
class Create(Interface):
    """Create a new dataset from scratch.

    This command initializes a new :term:`dataset` at a given location, or the
    current directory. The new dataset can optionally be registered in an
    existing :term:`superdataset` (the new dataset's path needs to be located
    within the superdataset for that, and the superdataset needs to be given
    explicitly). It is recommended to provide a brief description to label
    the dataset's nature *and* location, e.g. "Michael's music on black
    laptop". This helps humans to identify data locations in distributed
    scenarios.  By default an identifier comprised of user and machine name,
    plus path will be generated.

    This command only creates a new dataset, it does not add any content to it,
    even if the target directory already contains additional files or
    directories.

    Plain Git repositories can be created via the [PY: `no_annex` PY][CMD: --no-annex CMD] flag.
    However, the result will not be a full dataset, and, consequently,
    not all features are supported (e.g. a description).

    || REFLOW >>
    To create a local version of a remote dataset use the
    :func:`~datalad.api.install` command instead.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git init` and
      :command:`git annex init` to prepare the new dataset. Registering to a
      superdataset is performed via a :command:`git submodule add` operation
      in the discovered superdataset.
    """

    # in general this command will yield exactly one result
    return_type = 'item-or-list'
    # in general users expect to get an instance of the created dataset
    result_xfm = 'datasets'
    # result filter
    result_filter = EnsureKeyChoice('action', ('create',)) & \
                    EnsureKeyChoice('status', ('ok', 'notneeded'))

    _params_ = dict(
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path where the dataset shall be created, directories
            will be created as necessary. If no location is provided, a dataset
            will be created in the current working directory. Either way the
            command will error if the target directory is not empty.
            Use `force` to create a dataset in a non-empty directory.""",
            nargs='?',
            # put dataset 2nd to avoid useless conversion
            constraints=EnsureStr() | EnsureDataset() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='PATH',
            doc="""specify the dataset to perform the create operation on. If
            a dataset is given, a new subdataset will be created in it.""",
            constraints=EnsureDataset() | EnsureNone()),
        force=Parameter(
            args=(
                "-f",
                "--force",
            ),
            doc="""enforce creation of a dataset in a non-empty directory""",
            action='store_true'),
        description=location_description,
        # TODO could move into cfg_annex plugin
        no_annex=Parameter(
            args=("--no-annex", ),
            doc="""if set, a plain Git repository will be created without any
            annex""",
            action='store_true'),
        text_no_annex=Parameter(
            args=("--text-no-annex", ),
            doc="""if set, all text files in the future would be added to Git,
            not annex. Achieved by adding an entry to `.gitattributes` file. See
            http://git-annex.branchable.com/tips/largefiles/ and `no_annex`
            DataLad plugin to establish even more detailed control over which
            files are placed under annex control.""",
            action='store_true'),
        save=nosave_opt,
        # TODO could move into cfg_annex plugin
        annex_version=Parameter(
            args=("--annex-version", ),
            doc="""select a particular annex repository version. The
            list of supported versions depends on the available git-annex
            version. This should be left untouched, unless you know what
            you are doing""",
            constraints=EnsureDType(int) | EnsureNone()),
        # TODO could move into cfg_annex plugin
        annex_backend=Parameter(
            args=("--annex-backend", ),
            constraints=EnsureStr() | EnsureNone(),
            # not listing choices here on purpose to avoid future bugs
            doc="""set default hashing backend used by the new dataset.
            For a list of supported backends see the git-annex
            documentation. The default is optimized for maximum compatibility
            of datasets across platforms (especially those with limited
            path lengths)"""),
        # TODO could move into cfg_metadata plugin
        native_metadata_type=Parameter(
            args=('--native-metadata-type', ),
            metavar='LABEL',
            action='append',
            constraints=EnsureStr() | EnsureNone(),
            doc="""Metadata type label. Must match the name of the respective
            parser implementation in DataLad (e.g. "xmp").[CMD:  This option
            can be given multiple times CMD]"""),
        # TODO could move into cfg_access/permissions plugin
        shared_access=shared_access_opt,
        git_opts=git_opts,
        annex_opts=annex_opts,
        annex_init_opts=annex_init_opts,
        fake_dates=Parameter(
            args=('--fake-dates', ),
            action='store_true',
            doc="""Configure the repository to use fake dates. The date for a
            new commit will be set to one second later than the latest commit
            in the repository. This can be used to anonymize dates."""),
    )

    @staticmethod
    @datasetmethod(name='create')
    @eval_results
    def __call__(path=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 save=True,
                 annex_version=None,
                 annex_backend='MD5E',
                 native_metadata_type=None,
                 shared_access=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_init_opts=None,
                 text_no_annex=None,
                 fake_dates=False):

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD
        if path and dataset:
            # Given a path and a dataset (path) not pointing to installed
            # dataset
            if not dataset.is_installed():
                msg = "No installed dataset at %s found." % dataset.path
                dsroot = get_dataset_root(dataset.path)
                if dsroot:
                    msg += " If you meant to add to the %s dataset, use that path " \
                           "instead but remember that if dataset is provided, " \
                           "relative paths are relative to the top of the " \
                           "dataset." % dsroot
                raise ValueError(msg)

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex and declaring no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex init and declaring no "
                                 "annex repo.")

        if not isinstance(force, bool):
            raise ValueError(
                "force should be bool, got %r.  Did you mean to provide a 'path'?"
                % force)
        annotated_paths = AnnotatePaths.__call__(
            # nothing given explicitly, assume create fresh right here
            path=path if path else getpwd() if dataset is None else None,
            dataset=dataset,
            recursive=False,
            action='create',
            # we need to know whether we have to check for potential
            # subdataset collision
            force_parentds_discovery=True,
            # it is absolutely OK to have something that does not exist
            unavailable_path_status='',
            unavailable_path_msg=None,
            # if we have a dataset given that actually exists, we want to
            # fail if the requested path is not in it
            nondataset_path_status='error' \
                if isinstance(dataset, Dataset) and dataset.is_installed() else '',
            on_failure='ignore')
        path = None
        for r in annotated_paths:
            if r['status']:
                # this is dealt with already
                yield r
                continue
            if path is not None:
                raise ValueError(
                    "`create` can only handle single target path or dataset")
            path = r

        if len(annotated_paths) and path is None:
            # we got something, we complained already, done
            return

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # prep for yield
        path.update({'logger': lgr, 'type': 'dataset'})
        # just discard, we have a new story to tell
        path.pop('message', None)
        if 'parentds' in path:
            subs = Subdatasets.__call__(
                dataset=path['parentds'],
                # any known
                fulfilled=None,
                recursive=False,
                contains=path['path'],
                result_xfm='relpaths')
            if len(subs):
                path.update({
                    'status':
                    'error',
                    'message':
                    ('collision with known subdataset %s/ in dataset %s',
                     subs[0], path['parentds'])
                })
                yield path
                return

        # TODO here we need a further test that if force=True, we need to look if
        # there is a superdataset (regardless of whether we want to create a
        # subdataset or not), and if that superdataset tracks anything within
        # this directory -- if so, we need to stop right here and whine, because
        # the result of creating a repo here will produce an undesired mess

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and dataset.path == path['path'] \
            else Dataset(path['path'])

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            path.update({
                'status':
                'error',
                'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'
            })
            yield path
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = []

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(tbds.path,
                    url=None,
                    create=True,
                    git_opts=git_opts,
                    fake_dates=fake_dates)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(tbds.path,
                               url=None,
                               create=True,
                               backend=annex_backend,
                               version=annex_version,
                               description=description,
                               git_opts=git_opts,
                               annex_opts=annex_opts,
                               annex_init_opts=annex_init_opts,
                               fake_dates=fake_dates)

            if text_no_annex:
                attrs = tbrepo.get_gitattributes('.')
                # some basic protection against useless duplication
                # on rerun with --force
                if not attrs.get('.', {}).get(
                        'annex.largefiles', None) == '(not(mimetype=text/*))':
                    tbrepo.set_gitattributes([('*', {
                        'annex.largefiles':
                        '(not(mimetype=text/*))'
                    })])
                    add_to_git.append('.gitattributes')

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(id_var,
                        tbds.id if tbds.id is not None else uuid_id,
                        where='dataset')

        add_to_git.append('.datalad')

        # make sure that v6 annex repos never commit content under .datalad
        attrs_cfg = (
            ('config', 'annex.largefiles', 'nothing'),
            ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
            ('metadata/objects/**', 'annex.largefiles', '({})'.format(
                cfg.obtain('datalad.metadata.create-aggregate-annex-limit'))))
        attrs = tbds.repo.get_gitattributes(
            [op.join('.datalad', i[0]) for i in attrs_cfg])
        set_attrs = []
        for p, k, v in attrs_cfg:
            if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v:
                set_attrs.append((p, {k: v}))
        if set_attrs:
            tbds.repo.set_gitattributes(set_attrs,
                                        attrfile=op.join(
                                            '.datalad', '.gitattributes'))
            add_to_git.append('.datalad')

        # prevent git annex from ever annexing .git* stuff (gh-1597)
        attrs = tbds.repo.get_gitattributes('.git')
        if not attrs.get('.git', {}).get('annex.largefiles',
                                         None) == 'nothing':
            tbds.repo.set_gitattributes([('**/.git*', {
                'annex.largefiles': 'nothing'
            })])
            add_to_git.append('.gitattributes')

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.add(add_to_git,
                 to_git=True,
                 save=save,
                 message='[DATALAD] new dataset')

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(dataset, Dataset) and dataset.path != tbds.path \
           and tbds.repo.get_hexsha():
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(tbds.path,
                                 save=save,
                                 return_type='generator',
                                 result_filter=None,
                                 result_xfm=None,
                                 on_failure='ignore'):
                yield r

        path.update({'status': 'ok'})
        yield path

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        from datalad.ui import ui
        if res.get('action', None) == 'create' and \
               res.get('status', None) == 'ok' and \
               res.get('type', None) == 'dataset':
            ui.message("Created dataset at {}.".format(res['path']))
        else:
            ui.message("Nothing was created")
Example #15
class Clone(Interface):
    """Obtain a dataset copy from a URL or local source (path)

    The purpose of this command is to obtain a new clone (copy) of a dataset
    and place it into a not-yet-existing or empty directory. As such `clone`
    provides a strict subset of the functionality offered by `install`. Only a
    single dataset can be obtained, recursion is not supported. However, once
    installed, arbitrary dataset components can be obtained via a subsequent
    `get` command.

    Primary differences over a direct `git clone` call are 1) the automatic
    initialization of a dataset annex (pure Git repositories are equally
    supported); 2) automatic registration of the newly obtained dataset
    as a subdataset (submodule), if a parent dataset is specified;
    3) support for datalad's resource identifiers and automatic generation of
    alternative access URLs for common cases (such as appending '.git' to the
    URL in case accessing the base URL failed); and 4) ability to
    take additional alternative source locations as an argument.
    """
    # by default ignore everything but install results
    # i.e. no "add to super dataset"
    result_filter = EnsureKeyChoice('action', ('install', ))

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""(parent) dataset to clone into. If given, the newly cloned
            dataset is registered as a subdataset of the parent. Also, if given,
            relative paths are interpreted as being relative to the parent
            dataset, and not relative to the working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        source=Parameter(
            args=("source", ),
            metavar='SOURCE',
            doc="""URL, DataLad resource identifier, local path or instance of
            dataset to be cloned""",
            constraints=EnsureStr() | EnsureNone()),
        path=Parameter(args=("path", ),
                       metavar='PATH',
                       nargs="?",
                       doc="""path to clone into.  If no `path` is provided a
            destination path will be derived from a source URL
            similar to :command:`git clone`"""),
        description=location_description,
        reckless=reckless_opt,
        alt_sources=Parameter(
            args=('--alternative-sources', ),
            dest='alt_sources',
            metavar='SOURCE',
            nargs='+',
            doc="""Alternative sources to be tried if a dataset cannot
            be obtained from the main `source`""",
            constraints=EnsureStr() | EnsureNone()),
        # TODO next ones should be there, but cannot go anywhere
        # git_opts=git_opts,
        # git_clone_opts=git_clone_opts,
        # annex_opts=annex_opts,
        # annex_init_opts=annex_init_opts,
    )

    @staticmethod
    @datasetmethod(name='clone')
    @eval_results
    def __call__(source,
                 path=None,
                 dataset=None,
                 description=None,
                 reckless=False,
                 alt_sources=None):
        # TODO next ones should be there, but cannot go anywhere
        # git_opts=None,
        # git_clone_opts=None,
        # annex_opts=None,
        # annex_init_opts=None

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        dataset = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = dataset.path if dataset else None

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `add`".
                format(path))

        if path is not None:
            path = resolve_path(path, dataset)

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source_url = source
        source_ = _get_git_url_from_source(source)
        lgr.debug("Resolved clone source from '%s' to '%s'", source, source_)
        source = source_

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert (path is not None)

        destination_dataset = Dataset(path)
        dest_path = path

        status_kwargs = dict(action='install',
                             ds=destination_dataset,
                             logger=lgr,
                             refds=refds_path,
                             source_url=source_url)

        # important test! based on this `rmtree` will happen below after failed clone
        if exists(dest_path) and listdir(dest_path):
            if destination_dataset.is_installed():
                # check if dest was cloned from the given source before
                # this is where we would have installed this from
                guessed_sources = _get_flexible_source_candidates(
                    source, dest_path)
                # this is where it was actually installed from
                track_name, track_url = _get_tracking_source(
                    destination_dataset)
                if track_url in guessed_sources or \
                        get_local_file_url(track_url) in guessed_sources:
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destination_dataset, source),
                        **status_kwargs)
                    return
            # anything else is an error
            yield get_status_dict(
                status='error',
                message=
                'target path already exists and not empty, refuse to clone into target path',
                **status_kwargs)
            return

        if dataset is not None and relpath(
                path, start=dataset.path).startswith(pardir):
            yield get_status_dict(
                status='error',
                message=
                ("clone target path '%s' not in specified target dataset '%s'",
                 path, dataset),
                **status_kwargs)
            return

        # generate candidate URLs from source argument to overcome a few corner cases
        # and hopefully be more robust than git clone
        candidate_sources = []
        # combine all given sources (incl. alternatives), maintain order
        for s in [source] + assure_list(alt_sources):
            candidate_sources.extend(_get_flexible_source_candidates(s))
        lgr.info("Cloning %s to '%s'", source, dest_path)
        for isource_, source_ in enumerate(candidate_sources):
            try:
                lgr.debug(
                    "Attempting to clone %s (%d out of %d candidates) to '%s'",
                    source_, isource_ + 1, len(candidate_sources), dest_path)
                GitRepo.clone(path=dest_path, url=source_, create=True)
                break  # do not bother with other sources if succeeded
            except GitCommandError as e:
                lgr.debug("Failed to clone from URL: %s (%s)", source_,
                          exc_str(e))
                if exists(dest_path):
                    lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                              dest_path)
                    rmtree(dest_path)
                if 'could not create work tree' in e.stderr.lower():
                    # this cannot be fixed by trying another URL
                    yield get_status_dict(
                        status='error',
                        message=re.match(r".*fatal: (.*)\n",
                                         e.stderr,
                                         flags=re.MULTILINE
                                         | re.DOTALL).group(1),
                        **status_kwargs)
                    return

        if not destination_dataset.is_installed():
            yield get_status_dict(
                status='error',
                message=(
                    "Failed to clone data from any candidate source URL: %s",
                    candidate_sources),
                **status_kwargs)
            return

        if dataset is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(dest_path,
                                 save=True,
                                 ds2super=True,
                                 return_type='generator',
                                 result_filter=None,
                                 result_xfm=None,
                                 on_failure='ignore'):
                yield r

        _handle_possible_annex_dataset(destination_dataset,
                                       reckless,
                                       description=description)

        # yield successful clone of the base dataset now, as any possible
        # subdataset clone down below will not alter the Git-state of the
        # parent
        yield get_status_dict(status='ok', **status_kwargs)
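
# A minimal, hypothetical usage sketch; the URL and paths are placeholders.
import datalad.api as dl

# obtain a single dataset copy; registration as a subdataset only happens
# when a parent `dataset` is given
for res in dl.clone(source='https://example.com/studies/demo-dataset.git',
                    path='demo-dataset',
                    return_type='generator'):
    print(res['status'], res.get('path'))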
Example #16
class CopyFile(Interface):
    """Copy files and their availability metadata from one dataset to another.

    The difference to a system copy command is that here additional content
    availability information, such as registered URLs, is also copied to the
    target dataset. Moreover, potentially required git-annex special remote
    configurations are detected in a source dataset and are applied to a target
    dataset in an analogous fashion. It is possible to copy a file for which no
    content is available locally, by just copying the required metadata on
    content identity and availability.

    .. note::
      At the moment, only URLs for the special remotes 'web' (git-annex built-in)
      and 'datalad' are recognized and transferred.

    || REFLOW >>
    The interface is modeled after the POSIX 'cp' command, but with one
    additional way to specify what to copy where: [CMD: --specs-from CMD][PY:
    `specs_from` PY] allows the caller to flexibly input source-destination
    path pairs.
    << REFLOW ||

    || REFLOW >>
    This command can copy files out of and into a hierarchy of nested datasets.
    Unlike with other DataLad commands, the [CMD: --recursive CMD][PY: `recursive`
    PY] switch does not enable recursion into subdatasets, but is analogous
    to the POSIX 'cp' command switch and enables subdirectory recursion, regardless
    of dataset boundaries. It is not necessary to enable recursion in order to
    save changes made to nested target subdatasets.
    << REFLOW ||
    """
    _params_ = dict(
        dataset=Parameter(
            # not really needed on the cmdline, but for PY to resolve relative
            # paths
            args=("-d", "--dataset"),
            doc="""root dataset to save after copy operations are completed.
            All destination paths must be within this dataset, or its
            subdatasets. If no dataset is given, dataset modifications will be
            left unsaved.""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""paths to copy (and possibly a target path to copy to).""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        recursive=Parameter(args=(
            "--recursive",
            "-r",
        ),
                            action='store_true',
                            doc="""copy directories recursively"""),
        target_dir=Parameter(
            args=('--target-dir', '-t'),
            metavar='DIRECTORY',
            doc="""copy all source files into this DIRECTORY. This value is
            overridden by any explicit destination path provided via [CMD:
            --specs-from CMD][PY: 'specs_from' PY]. When not given, this
            defaults to the path of the dataset specified via [CMD: --dataset
            CMD][PY: 'dataset' PY].""",
            constraints=EnsureStr() | EnsureNone()),
        specs_from=Parameter(
            args=('--specs-from', ),
            metavar='SOURCE',
            doc="""read list of source (and destination) path names from a given
            file, or stdin (with '-'). Each line defines either a source
            path, or a source/destination path pair (separated by a null byte
            character).[PY:  Alternatively, a list of 2-tuples with
            source/destination pairs can be given. PY]"""),
        message=save_message_opt,
    )

    _examples_ = [
        dict(
            text="Copy a file into a dataset 'myds' using a path and a target "
            "directory specification, and save its addition to 'myds'",
            code_py="""\
            copy_file('path/to/myfile', dataset='path/to/myds')""",
            code_cmd="""\
            datalad copy-file path/to/myfile -d path/to/myds"""),
        dict(
            text="Copy a file to a dataset 'myds' and save it under a new name "
            "by providing two paths",
            code_py="""\
            copy_file(path=['path/to/myfile', 'path/to/myds/newname'],
                      dataset='path/to/myds')""",
            code_cmd="""\
            datalad copy-file path/to/myfile path/to/myds/new -d path/to/myds"""
        ),
        dict(text="Copy a file into a dataset without saving it",
             code_py="copy_file('path/to/myfile', target_dir='path/to/myds/')",
             code_cmd="datalad copy-file path/to/myfile -t path/to/myds"),
        dict(
            text="Copy a directory and its subdirectories into a dataset 'myds'"
            " and save the addition in 'myds'",
            code_py="""\
            copy_file('path/to/dir/', recursive=True, dataset='path/to/myds')""",
            code_cmd="""\
            datalad copy-file path/to/dir -r -d path/to/myds"""),
        dict(
            text="Copy files using a path and optionally target specification "
            "from a file",
            code_py="""\
            copy_file(dataset='path/to/myds', specs_from='path/to/specfile')""",
            code_cmd="""\
            datalad copy-file -d path/to/myds --specs-from specfile"""),
        dict(
            text="Read a specification from stdin and pipe the output of a find"
            " command into the copy-file command",
            code_cmd="""\
            find <expr> | datalad copy-file -d path/to/myds --specs-from -""")
    ]

    @staticmethod
    @datasetmethod(name='copy_file')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 target_dir=None,
                 specs_from=None,
                 message=None):
        # Concept
        #
        # Loosely model after the POSIX cp command
        #
        # 1. Determine the target of the copy operation, and its associated
        #    dataset
        #
        # 2. for each source: determine source dataset, query for metadata, put
        #    into target dataset
        #
        # Instead of sifting and sorting through input args, process them one
        # by one sequentially. Utilize lookup caching to make things faster,
        # instead of making the procedure itself more complicated.

        if path and specs_from:
            raise ValueError("Path argument(s) AND a specs-from specified, "
                             "this is not supported.")

        ds = None
        if dataset:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='copying into')

        if target_dir:
            target_dir = resolve_path(target_dir, dataset)

        if path:
            # turn into list of absolute paths
            paths = [resolve_path(p, dataset) for p in ensure_list(path)]

            # we already checked that there are no specs_from
            if not target_dir:
                if len(paths) == 1:
                    if not ds:
                        raise ValueError("No target directory was given.")
                    # we can keep target_dir unset and need not manipulate
                    # paths, this is all done in a generic fashion below
                elif len(paths) == 2:
                    # single source+dest combo
                    if paths[-1].is_dir():
                        # check if we need to set target_dir, in case dest
                        # is a dir
                        target_dir = paths.pop(-1)
                    else:
                        specs_from = [paths]
                else:
                    target_dir = paths.pop(-1)

            if not specs_from:
                # in all other cases we have a plain source list
                specs_from = paths

        if not specs_from:
            raise ValueError("Neither `paths` nor `specs_from` given.")

        if target_dir:
            if ".git" in target_dir.parts:
                raise ValueError(
                    "Target directory should not contain a .git directory: {}".
                    format(target_dir))
        elif ds:
            # no specific target set, but we have to write into a dataset,
            # and one was given. It seems to make sense to use this dataset
            # as a target. It is already the reference for any path resolution.
            # Any explicitly given destination will take precedence over
            # a general target_dir setting nevertheless.
            target_dir = ds.pathobj

        res_kwargs = dict(
            action='copy_file',
            logger=lgr,
        )

        # lookup cache for dir to repo mappings, and as a DB for cleaning
        # things up
        repo_cache = {}
        # which paths to pass on to save
        to_save = []
        try:
            for src_path, dest_path in _yield_specs(specs_from):
                src_path = Path(src_path)
                dest_path = None \
                    if dest_path is None \
                    else resolve_path(dest_path, dataset)
                lgr.debug('Processing copy specification: %s -> %s', src_path,
                          dest_path)

                # Some checks, first impossibility "wins"
                msg_impossible = None
                if not recursive and src_path.is_dir():
                    msg_impossible = 'recursion not enabled, omitting directory'
                elif (dest_path and dest_path.name == '.git') \
                        or src_path.name == '.git':
                    msg_impossible = \
                        "refuse to place '.git' into destination dataset"
                elif not (dest_path or target_dir):
                    msg_impossible = 'need destination path or target directory'

                if msg_impossible:
                    yield dict(path=str(src_path),
                               status='impossible',
                               message=msg_impossible,
                               **res_kwargs)
                    continue

                for src_file, dest_file in _yield_src_dest_filepaths(
                        src_path, dest_path, target_dir=target_dir):
                    if ds and ds.pathobj not in dest_file.parents:
                        # take time to compose proper error
                        dpath = str(target_dir if target_dir else dest_path)
                        yield dict(
                            path=dpath,
                            status='error',
                            message=('reference dataset does not contain '
                                     'destination path: %s', dpath),
                            **res_kwargs)
                        # only recursion could yield further results, which would
                        # all have the same issue, so stop right here
                        break
                    for res in _copy_file(src_file,
                                          dest_file,
                                          cache=repo_cache):
                        yield dict(res, **res_kwargs)
                        if res.get('status', None) == 'ok':
                            to_save.append(res['destination'])
        finally:
            # cleanup time
            # TODO this could also be the place to stop lingering batch processes
            _cleanup_cache(repo_cache)

        if not (ds and to_save):
            # nothing left to do
            return

        yield from ds.save(
            path=to_save,
            # we provide an explicit file list
            recursive=False,
            message=message,
        )
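
# A minimal usage sketch, separate from the class above. It assumes DataLad is
# installed, that the command is exposed as `datalad.api.copy_file`, and that
# 'path/to/src_ds' and 'path/to/dest_ds' are hypothetical, existing datasets.
# It shows the Python-API form of `specs_from` as a list of source/destination
# 2-tuples, as described in the docstring.
from datalad.api import copy_file

results = copy_file(
    dataset='path/to/dest_ds',
    specs_from=[
        ('path/to/src_ds/file1.dat', 'path/to/dest_ds/file1.dat'),
        ('path/to/src_ds/file2.dat', 'path/to/dest_ds/renamed.dat'),
    ],
    message='Copy two files together with their availability metadata',
)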
Example #17
class Rerun(Interface):
    """Re-execute previous `datalad run` commands.

    This will unlock any dataset content that is on record to have
    been modified by the command in the specified revision.  It will
    then re-execute the command in the recorded path (if it was inside
    the dataset). Afterwards, all modifications will be saved.

    *Report mode*

    || REFLOW >>
    When called with [CMD: --report CMD][PY: report=True PY], this command
    reports information about what would be re-executed as a series of records.
    There will be a record for each revision in the specified revision range.
    Each of these will have one of the following "rerun_action" values:
    << REFLOW ||

      - run: the revision has a recorded command that would be re-executed
      - skip: the revision does not have a recorded command and would be
        skipped
      - pick: the revision does not have a recorded command and would be cherry
        picked

    The decision to skip rather than cherry pick a revision is based on whether
    the revision would be reachable from HEAD at the time of execution.

    In addition, when a starting point other than HEAD is specified, there is a
    rerun_action value "checkout", in which case the record includes
    information about the revision that would be checked out before rerunning
    any commands.

    Examples:

      Re-execute the command from the previous commit::

        % datalad rerun

      Re-execute any commands in the last five commits::

        % datalad rerun --since=HEAD~5

      Do the same as above, but re-execute the commands on top of
      HEAD~5 in a detached state::

        % datalad rerun --onto= --since=HEAD~5

      Re-execute all previous commands and compare the old and new
      results::

        % # on master branch
        % datalad rerun --branch=verify --since=
        % # now on verify branch
        % datalad diff --revision=master..
        % git log --oneline --left-right --cherry-pick master...

    .. note::
      Currently the "onto" feature only sets the working tree of the current
      dataset to a previous state. The working trees of any subdatasets remain
      unchanged.
    """
    _params_ = dict(
        revision=Parameter(
            args=("revision", ),
            metavar="REVISION",
            nargs="?",
            doc="""rerun command(s) in `revision`. By default, the command from
            this commit will be executed, but [CMD: --since CMD][PY: `since`
            PY] can be used to construct a revision range.""",
            default="HEAD",
            constraints=EnsureStr()),
        since=Parameter(
            args=("--since", ),
            doc="""If `since` is a commit-ish, the commands from all commits
            that are reachable from `revision` but not `since` will be
            re-executed (in other words, the commands in :command:`git log
            SINCE..REVISION`). If SINCE is an empty string, it is set to the
            parent of the first commit that contains a recorded command (i.e.,
            all commands in :command:`git log REVISION` will be
            re-executed).""",
            constraints=EnsureStr() | EnsureNone()),
        branch=Parameter(
            metavar="NAME",
            args=(
                "-b",
                "--branch",
            ),
            doc=
            "create and checkout this branch before rerunning the commands.",
            constraints=EnsureStr() | EnsureNone()),
        onto=Parameter(
            metavar="base",
            args=("--onto", ),
            doc="""start point for rerunning the commands. If not specified,
            commands are executed at HEAD. This option can be used to specify
            an alternative start point, which will be checked out with the
            branch name specified by [CMD: --branch CMD][PY: `branch` PY] or in
            a detached state otherwise. As a special case, an empty value for
            this option means to use the commit specified by [CMD: --since
            CMD][PY: `since` PY].""",
            constraints=EnsureStr() | EnsureNone()),
        message=Parameter(
            args=(
                "-m",
                "--message",
            ),
            metavar="MESSAGE",
            doc="""use MESSAGE for the reran commit rather than the
            recorded commit message.  In the case of a multi-commit
            rerun, all the reran commits will have this message.""",
            constraints=EnsureStr() | EnsureNone()),
        script=Parameter(
            args=("--script", ),
            metavar="FILE",
            doc="""extract the commands into [CMD: FILE CMD][PY: this file PY]
            rather than rerunning. Use - to write to stdout instead. [CMD: This
            option implies --report. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset from which to rerun a recorded
            command. If no dataset is given, an attempt is made to
            identify the dataset based on the current working
            directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        report=Parameter(
            args=("--report", ),
            action="store_true",
            doc="""Don't actually re-execute anything, just display what would
            be done. [CMD: Note: If you give this option, you most likely want
            to set --output-format to 'json' or 'json_pp'. CMD]"""),
    )

    @staticmethod
    @datasetmethod(name='rerun')
    @eval_results
    def __call__(revision="HEAD",
                 since=None,
                 dataset=None,
                 branch=None,
                 message=None,
                 onto=None,
                 script=None,
                 report=False):

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='rerunning a command')

        lgr.debug('rerunning command output underneath %s', ds)

        if script is None and not report and ds.repo.dirty:
            yield get_status_dict('run',
                                  ds=ds,
                                  status='impossible',
                                  message=('unsaved modifications present, '
                                           'cannot detect changes by command'))
            return

        if not ds.repo.get_hexsha():
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message='cannot rerun command, nothing recorded')
            return

        if branch and branch in ds.repo.get_branches():
            yield get_status_dict(
                "run",
                ds=ds,
                status="error",
                message="branch '{}' already exists".format(branch))
            return

        if not ds.repo.commit_exists(revision + "^"):
            # Only a single commit is reachable from `revision`.  In
            # this case, --since has no effect on the range construction.
            revrange = revision
        elif since is None:
            revrange = "{rev}^..{rev}".format(rev=revision)
        elif since.strip() == "":
            revrange = revision
        else:
            revrange = "{}..{}".format(since, revision)

        if ds.repo.repo.git.rev_list("--merges", revrange, "--"):
            yield get_status_dict(
                "run",
                ds=ds,
                status="error",
                message="cannot rerun history with merge commits")
            return

        results = _rerun_as_results(ds, revrange, since, branch, onto, message)
        if script:
            handler = _get_script_handler(script, since, revision)
        elif report:
            handler = _report
        else:
            handler = _rerun

        for res in handler(ds, results):
            yield res
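
# A minimal usage sketch, separate from the class above. It assumes DataLad is
# installed, that the command is exposed as `datalad.api.rerun`, and that the
# current directory is a dataset with `datalad run` records in its history.
# Report mode only describes what would be re-executed.
from datalad.api import rerun

for record in rerun(since='HEAD~5', report=True):
    # per the docstring, each report record carries a 'rerun_action' value
    # ('run', 'skip', 'pick', or 'checkout'); other fields are not relied upon
    print(record.get('rerun_action'))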
Example #18
class Addurls(Interface):
    """Create and update a dataset from a list of URLs.

    *Format specification*

    Several arguments take format strings.  These are similar to normal Python
    format strings where the names from `URL-FILE` (column names for a CSV or
    properties for JSON) are available as placeholders.  If `URL-FILE` is a CSV
    file, a positional index can also be used (i.e., "{0}" for the first
    column).  Note that a placeholder cannot contain a ':' or '!'.

    In addition, the `FILENAME-FORMAT` argument has a few special
    placeholders.

      - _repindex

        The constructed file names must be unique across all rows.  To
        avoid collisions, the special placeholder "_repindex" can be added to
        the formatter.  Its value will start at 0 and increment every time a
        file name repeats.

      - _url_hostname, _urlN, _url_basename*

        Various parts of the formatted URL are available.  Take
        "http://datalad.org/asciicast/seamless_nested_repos.sh" as an example.

        "datalad.org" is stored as "_url_hostname".  Components of the URL's
        path can be referenced as "_urlN".  "_url0" and "_url1" would map to
        "asciicast" and "seamless_nested_repos.sh", respectively.  The final
        part of the path is also available as "_url_basename".

        This name is broken down further.  "_url_basename_root" and
        "_url_basename_ext" provide access to the root name and extension.
        These values are similar to the result of os.path.splitext, but, in the
        case of multiple periods, the extension is identified using the same
        length heuristic that git-annex uses.  As a result, the extension of
        "file.tar.gz" would be ".tar.gz", not ".gz".  In addition, the fields
        "_url_basename_root_py" and "_url_basename_ext_py" provide access to
        the result of os.path.splitext.

      - _url_filename*

        These are similar to _url_basename* fields, but they are obtained with
        a server request.  This is useful if the file name is set in the
        Content-Disposition header.


    *Examples*

    Consider a file "avatars.csv" that contains::

        who,ext,link
        neurodebian,png,https://avatars3.githubusercontent.com/u/260793
        datalad,png,https://avatars1.githubusercontent.com/u/8927200

    To download each link into a file name composed of the 'who' and 'ext'
    fields, we could run::

      $ datalad addurls -d avatar_ds --fast avatars.csv '{link}' '{who}.{ext}'

    The `-d avatar_ds` is used to create a new dataset in "$PWD/avatar_ds".

    If we were already in a dataset and wanted to create a new subdataset in an
    "avatars" subdirectory, we could use "//" in the `FILENAME-FORMAT`
    argument::

      $ datalad addurls --fast avatars.csv '{link}' 'avatars//{who}.{ext}'

    .. note::

       For users familiar with 'git annex addurl': A large part of this
       plugin's functionality can be viewed as transforming data from
       `URL-FILE` into a "url filename" format that is fed to 'git annex addurl
       --batch --with-files'.
    """

    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureChoice, EnsureNone, EnsureStr
    from datalad.support.param import Parameter

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""Add the URLs to this dataset (or possibly subdatasets of
            this dataset).  Passing an empty or non-existent directory creates
            a new dataset.  New subdatasets can be specified with
            `FILENAME-FORMAT`.""",
            constraints=EnsureDataset() | EnsureNone()),
        urlfile=Parameter(
            args=("urlfile", ),
            metavar="URL-FILE",
            doc="""A file that contains URLs or information that can be used to
            construct URLs.  Depending on the value of --input-type, this
            should be a CSV file (with a header as the first row) or a JSON
            file (structured as a list of objects with string values)."""),
        urlformat=Parameter(
            args=("urlformat", ),
            metavar="URL-FORMAT",
            doc="""A format string that specifies the URL for each entry.  See
            the 'Format Specification' section above."""),
        filenameformat=Parameter(
            args=("filenameformat", ),
            metavar="FILENAME-FORMAT",
            doc="""Like `URL-FORMAT`, but this format string specifies the file
            to which the URL's content will be downloaded.  The file name may
            contain directories.  The separator "//" can be used to indicate
            that the left-side directory should be created as a new subdataset.
            See the 'Format Specification' section above."""),
        input_type=Parameter(
            args=("-t", "--input-type"),
            metavar="TYPE",
            doc="""Whether `URL-FILE` should be considered a CSV file or a JSON
            file.  The default value, "ext", means to consider `URL-FILE` as a
            JSON file if it ends with ".json".  Otherwise, treat it as a CSV
            file.""",
            constraints=EnsureChoice("ext", "csv", "json")),
        exclude_autometa=Parameter(
            args=("-x", "--exclude_autometa"),
            metavar="REGEXP",
            doc="""By default, metadata field=value pairs are constructed with
            each column in `URL-FILE`, excluding any single column that is
            specified via `URL-FORMAT`.  This argument can be used to exclude
            columns that match a regular expression.  If set to '*' or an empty
            string, automatic metadata extraction is disabled completely.  This
            argument does not affect metadata set explicitly with --meta."""),
        meta=Parameter(
            args=(
                "-m",
                "--meta",
            ),
            metavar="FORMAT",
            action="append",
            doc="""A format string that specifies metadata.  It should be
            structured as "<field>=<value>".  As an example, "location={3}"
            would mean that the value for the "location" metadata field should
            be set to the value of the fourth column.  This option can be given
            multiple times."""),
        message=Parameter(
            args=("--message", ),
            metavar="MESSAGE",
            doc="""Use this message when committing the URL additions.""",
            constraints=EnsureNone() | EnsureStr()),
        dry_run=Parameter(
            args=("-n", "--dry-run"),
            action="store_true",
            doc="""Report which URLs would be downloaded to which files and
            then exit."""),
        fast=Parameter(
            args=("--fast", ),
            action="store_true",
            doc="""If True, add the URLs, but don't download their content.
            Underneath, this passes the --fast flag to `git annex addurl`."""),
        ifexists=Parameter(
            args=("--ifexists", ),
            metavar="ACTION",
            doc="""What to do if a constructed file name already exists.  The
            default behavior is to proceed with the `git annex addurl`, which
            will fail if the file size has changed.  If set to 'overwrite',
            remove the old file before adding the new one.  If set to 'skip',
            do not add the new file.""",
            constraints=EnsureNone() | EnsureChoice("overwrite", "skip")),
        missing_value=Parameter(
            args=("--missing-value", ),
            metavar="VALUE",
            doc="""When an empty string is encountered, use this value
            instead.""",
            constraints=EnsureNone() | EnsureStr()),
        save=nosave_opt,
        version_urls=Parameter(
            args=("--version-urls", ),
            action="store_true",
            doc="""Try to add a version ID to the URL. This currently only has
            an effect on URLs for AWS S3 buckets."""),
    )

    @staticmethod
    @datasetmethod(name='addurls')
    @eval_results
    def __call__(dataset,
                 urlfile,
                 urlformat,
                 filenameformat,
                 input_type="ext",
                 exclude_autometa=None,
                 meta=None,
                 message=None,
                 dry_run=False,
                 fast=False,
                 ifexists=None,
                 missing_value=None,
                 save=True,
                 version_urls=False):
        # Temporarily work around gh-2269.
        url_file = urlfile
        url_format, filename_format = urlformat, filenameformat

        from requests.exceptions import RequestException

        from datalad.distribution.dataset import Dataset, require_dataset
        from datalad.interface.results import get_status_dict
        from datalad.support.annexrepo import AnnexRepo

        lgr = logging.getLogger("datalad.plugin.addurls")

        dataset = require_dataset(dataset, check_installed=False)
        if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message="not an annex repo")
            return

        if input_type == "ext":
            extension = os.path.splitext(url_file)[1]
            input_type = "json" if extension == ".json" else "csv"

        with open(url_file) as fd:
            try:
                rows, subpaths = extract(fd, input_type, url_format,
                                         filename_format, exclude_autometa,
                                         meta, dry_run, missing_value)
            except (ValueError, RequestException) as exc:
                yield get_status_dict(action="addurls",
                                      ds=dataset,
                                      status="error",
                                      message=exc_str(exc))
                return

        if len(rows) != len(set(row["filename"] for row in rows)):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=("There are file name collisions; "
                                           "consider using {_repindex}"))
            return

        if dry_run:
            for subpath in subpaths:
                lgr.info("Would create a subdataset at %s", subpath)
            for row in rows:
                lgr.info("Would download %s to %s", row["url"],
                         os.path.join(dataset.path, row["filename"]))
                lgr.info(
                    "Metadata: %s",
                    sorted(u"{}={}".format(k, v)
                           for k, v in row["meta_args"].items()))
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="ok",
                                  message="dry-run finished")
            return

        if not dataset.repo:
            # Populate a new dataset with the URLs.
            for r in dataset.rev_create(result_xfm=None,
                                        return_type='generator'):
                yield r

        annex_options = ["--fast"] if fast else []

        for spath in subpaths:
            if os.path.exists(os.path.join(dataset.path, spath)):
                lgr.warning("Not creating subdataset at existing path: %s",
                            spath)
            else:
                for r in dataset.rev_create(spath,
                                            result_xfm=None,
                                            return_type='generator'):
                    yield r

        for row in rows:
            # Add additional information that we'll need for various
            # operations.
            filename_abs = os.path.join(dataset.path, row["filename"])
            if row["subpath"]:
                ds_current = Dataset(os.path.join(dataset.path,
                                                  row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = dataset
                ds_filename = row["filename"]
            row.update({
                "filename_abs": filename_abs,
                "ds": ds_current,
                "ds_filename": ds_filename
            })

        if version_urls:
            num_urls = len(rows)
            log_progress(lgr.info,
                         "addurls_versionurls",
                         "Versioning %d URLs",
                         num_urls,
                         label="Versioning URLs",
                         total=num_urls,
                         unit=" URLs")
            for row in rows:
                url = row["url"]
                try:
                    row["url"] = get_versioned_url(url)
                except (ValueError, NotImplementedError) as exc:
                    # We don't expect this to happen because get_versioned_url
                    # should return the original URL if it isn't an S3 bucket.
                    # It only raises exceptions if it doesn't know how to
                    # handle the scheme for what looks like an S3 bucket.
                    lgr.warning("error getting version of %s: %s", row["url"],
                                exc_str(exc))
                log_progress(lgr.info,
                             "addurls_versionurls",
                             "Versioned result for %s: %s",
                             url,
                             row["url"],
                             update=1,
                             increment=True)
            log_progress(lgr.info, "addurls_versionurls",
                         "Finished versioning URLs")

        files_to_add = set()
        for r in add_urls(rows, ifexists=ifexists, options=annex_options):
            if r["status"] == "ok":
                files_to_add.add(r["path"])
            yield r

        msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

        if files_to_add:
            for r in dataset.add(files_to_add, save=False):
                yield r

            meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
            for r in add_meta(meta_rows):
                yield r

            # Save here rather than the add call above to trigger a metadata
            # commit on the git-annex branch.
            if save:
                for r in dataset.save(message=msg, recursive=True):
                    yield r
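
# A minimal usage sketch, separate from the class above. It reproduces the
# "avatars.csv" example from the docstring through the Python API, assuming
# DataLad is installed, the command is exposed as `datalad.api.addurls`, and
# network access to the listed URLs is available.
from datalad.api import addurls

with open('avatars.csv', 'w') as f:
    f.write(
        "who,ext,link\n"
        "neurodebian,png,https://avatars3.githubusercontent.com/u/260793\n"
        "datalad,png,https://avatars1.githubusercontent.com/u/8927200\n")

# equivalent of `--fast`: register the URLs without downloading their content
addurls(dataset='avatar_ds', urlfile='avatars.csv',
        urlformat='{link}', filenameformat='{who}.{ext}', fast=True)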
Example #19
class Metadata(Interface):
    """Metadata manipulation for files and whole datasets

    Two types of metadata are supported:

    1. metadata describing a dataset as a whole (dataset-global), and

    2. metadata for individual files in a dataset.

    Both types can be accessed and modified with this command.
    Note, however, that this only refers to Datalad's native metadata,
    and not to any other metadata that is possibly stored in files of a
    dataset.

    Datalad's native metadata capability is primarily targeting data
    description via arbitrary tags and other (brief) key-value attributes
    (with possibly multiple values for a single key).

    Metadata key names are limited to alphanumerics (and [_-.]). Moreover,
    all key names are converted to lower case.


    *Dataset (global) metadata*

    Metadata describing a dataset as a whole is stored in JSON format
    in the dataset at .datalad/metadata/dataset.json. The amount of
    metadata that can be stored is not limited by Datalad. However,
    it should be kept brief as this information is stored in the Git
    history of the dataset, and access or modification requires reading
    the entire file.

    Arbitrary metadata keys can be used. However, Datalad reserves the
    keys 'tag' and 'definition' for its own use. They can still be
    manipulated without restriction, like any other metadata items,
    but doing so can impact Datalad's metadata-related functionality,
    so handle with care.

    The 'tag' key is used to store a list of (unique) tags.

    The 'definition' key is used to store key-value mappings that define
    metadata keys used elsewhere in the metadata. Using the feature is
    optional (see --define-key). It can be useful in the context of
    data discovery needs, where metadata keys can be precisely defined
    by linking them to specific ontology terms.


    *File metadata*

    Metadata storage for individual files is provided by git-annex, and
    generally the same rules as for dataset-global metadata apply.
    However, there is just one reserved key name: 'tag'.

    Again, the amount of metadata is not limited, but metadata is stored
    in git-annex' internal data structures in the Git repository of a
    dataset. Large amounts of metadata can slow its performance.


    || CMDLINE >>
    *Output rendering*

    By default, a short summary of the metadata for each dataset
    (component) is rendered::

      <path> (<type>): -|<keys> [<tags>]

    where <path> is the path of the respective component and <type> is a
    label for the type of dataset component the metadata is presented for.
    Non-existent metadata is indicated by a dash; otherwise a comma-separated
    list of metadata keys (except for 'tag') is shown, followed by a list of
    tags, if there are any.


    << CMDLINE ||
    """
    # make the custom renderer the default, path reporting isn't the top
    # priority here
    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="path(s) to set/get metadata",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        add=Parameter(args=(
            '-a',
            '--add',
        ),
                      nargs='+',
                      action='append',
                      metavar=('KEY', 'VAL'),
                      doc="""metadata items to add. If only a key is given, a
            corresponding tag is added. If a key-value mapping (multiple
            values at once are supported) is given, the values are
            added to the metadata item of that key.""",
                      constraints=EnsureStr() | EnsureNone()),
        init=Parameter(
            args=(
                '-i',
                '--init',
            ),
            nargs='+',
            action='append',
            metavar=('KEY', 'VAL'),
            doc="""like --add, but tags are only added if no tag was present
            before. Likewise, values are only added to a metadata key, if that
            key did not exist before.""",
            constraints=EnsureStr() | EnsureNone()),
        remove=Parameter(
            args=('--remove', ),
            nargs='+',
            action='append',
            metavar=('KEY', 'VAL'),
            doc="""metadata values to remove. If only a key is given, a
            corresponding tag is removed. If a key-value mapping (multiple
            values at once are supported) is given, only those values are
            removed from the metadata item of that key. If no values are left
            after the removal, the entire item of that key is removed.""",
            constraints=EnsureStr() | EnsureNone()),
        reset=Parameter(
            args=('--reset', ),
            nargs='+',
            action='append',
            metavar=('KEY', 'VAL'),
            doc="""metadata items to remove. If only a key is given, a
            corresponding metadata key with all its values is removed.
            If a key-value mapping (multiple values at once are supported)
            is given, any existing values for this key are replaced by the
            given ones.""",
            constraints=EnsureStr() | EnsureNone()),
        define_key=Parameter(
            args=('--define-key', ),
            nargs=2,
            action='append',
            metavar=('KEY', 'DEFINITION'),
            doc="""convenience option to add an item in the dataset's
            global metadata ('definition' key). This can be used to
            define (custom) keys used in the dataset's metadata, for
            example by providing a URL to an ontology term for a given
            key label. This option does not need --dataset-global to
            be set to be in effect.""",
            constraints=EnsureStr() | EnsureNone()),
        dataset_global=Parameter(
            args=('-g', '--dataset-global'),
            action='store_true',
            doc="""Whether to perform metadata query or modification
            on the global dataset metadata, or on individual dataset
            components. For example, without this switch, setting
            metadata using the root path of a dataset will set the
            given metadata for all files in the dataset, whereas with
            this flag only the metadata record of the dataset itself
            will be altered."""),
        recursive=recursion_flag,
        recursion_limit=recursion_limit)

    @staticmethod
    @datasetmethod(name='metadata')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 add=None,
                 init=None,
                 remove=None,
                 reset=None,
                 define_key=None,
                 dataset_global=False,
                 recursive=False,
                 recursion_limit=None):
        # bring metadataset setter args in shape first
        untag, remove = _parse_argspec(remove)
        purge, reset = _parse_argspec(reset)
        tag_add, add = _parse_argspec(add)
        tag_init, init = _parse_argspec(init)
        define_key = dict(define_key) if define_key else None
        # merge all potential sources of tag specifications
        all_untag = remove.get('tag', []) + untag
        if all_untag:
            remove['tag'] = all_untag
        all_addtag = add.get('tag', []) + tag_add
        if all_addtag:
            add['tag'] = all_addtag
        all_inittag = init.get('tag', []) + tag_init
        if all_inittag:
            init['tag'] = all_inittag

        lgr.debug("Will 'init' metadata items: %s", init)
        lgr.debug("Will 'add' metadata items: %s", add)
        lgr.debug("Will 'remove' metadata items: %s", remove)
        lgr.debug("Will 'reset' metadata items: %s", reset)
        lgr.debug("Will 'purge' metadata items: %s", purge)

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr, refds=refds_path)

        to_process = []
        for ap in AnnotatePaths.__call__(dataset=refds_path,
                                         path=path,
                                         recursive=recursive,
                                         recursion_limit=recursion_limit,
                                         action='metadata',
                                         unavailable_path_status='error',
                                         nondataset_path_status='error',
                                         force_subds_discovery=False,
                                         return_type='generator',
                                         on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset':
                if ap.get('state', None) == 'absent':
                    # just discovered via recursion, but not relevant here
                    continue
                if GitRepo.is_valid_repo(ap['path']):
                    ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        # iterate over all datasets, order doesn't matter
        to_save = []
        for ds_path in content_by_ds:
            # ignore submodule entries
            content = [
                ap for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds_path
            ]
            if not content:
                # nothing other than subdatasets were given or discovered in
                # this dataset, ignore
                continue
            ds = Dataset(ds_path)
            if dataset_global or define_key:
                db_path = opj(ds.path, '.datalad', 'metadata', 'dataset.json')
                db = {}
                if exists(db_path):
                    db_fp = open(db_path)
                    # need to read manually, load() would puke on an empty file
                    db_content = db_fp.read()
                    # minimize time for collision
                    db_fp.close()
                    if db_content:
                        db = json.loads(db_content)
                # TODO make manipulation order identical to what git-annex does
                for k, v in init.items() if init else []:
                    if k not in db:
                        db[k] = v
                for k in purge:
                    if k in db:
                        del db[k]
                for k, v in reset.items():
                    db[k] = v
                for k, v in add.items():
                    db[k] = sorted(unique(db.get(k, []) + v))
                for k, v in remove.items():
                    existing_data = db.get(k, [])
                    if isinstance(existing_data, dict):
                        db[k] = {
                            dk: existing_data[dk]
                            for dk in set(existing_data).difference(v)
                        }
                    else:
                        db[k] = list(set(existing_data).difference(v))
                    # wipe out if empty
                    if not db[k]:
                        del db[k]

                added_def = False
                if define_key:
                    defs = db.get('definition', {})
                    for k, v in define_key.items():
                        if k in defs:
                            if not defs[k] == v:
                                yield get_status_dict(
                                    status='error',
                                    ds=ds,
                                    message=
                                    ("conflicting definition for key '%s': '%s' != '%s'",
                                     k, v, defs[k]),
                                    **res_kwargs)
                                continue
                        else:
                            defs[k] = v
                            added_def = True
                    db['definition'] = defs
                # store, if there is anything
                if db:
                    if not exists(dirname(db_path)):
                        makedirs(dirname(db_path))
                    db_fp = open(db_path, 'w')
                    # produce relatively compact, but also diff-friendly format
                    json.dump(db,
                              db_fp,
                              indent=0,
                              separators=(',', ':\n'),
                              sort_keys=True)
                    # minimize time for collision
                    db_fp.close()
                    # use add not save to also cover case of a fresh file
                    ds.add(db_path, save=False)
                    to_save.append(
                        dict(path=db_path, parentds=ds.path, type='file'))
                elif exists(db_path):
                    # no metadata left, kill file
                    ds.remove(db_path)
                    to_save.append(dict(path=ds.path, type='dataset'))
                if added_def or init or add or remove or reset or purge:
                    # if anything happened or could have happened
                    yield get_status_dict(status='ok',
                                          ds=ds,
                                          metadata=db,
                                          **res_kwargs)
            elif not isinstance(ds.repo, AnnexRepo):
                # report on all explicitly requested paths only
                for ap in [c for c in content if c.get('raw_input', False)]:
                    yield dict(
                        ap,
                        status='impossible',
                        message=(
                            'non-annex dataset %s has no file metadata support',
                            ds),
                        **res_kwargs)
                continue
            ds_paths = [p['path'] for p in content]
            if not dataset_global:
                if reset or purge or add or init or remove:
                    # file metadata manipulation
                    mod_paths = []
                    for mp in ds.repo.set_metadata(
                            ds_paths,
                            reset=reset,
                            add=add,
                            init=init,
                            remove=remove,
                            purge=purge,
                            # we always go recursive
                            # TODO is that a good thing? But how to otherwise distinguish
                            # this kind of recursive from the one across datasets in
                            # the API?
                            recursive=True):
                        if mp.get('success', False):
                            mod_paths.append(mp['file'])
                        else:
                            yield get_status_dict(
                                status='error',
                                message='setting metadata failed',
                                path=opj(ds.path, mp[0]),
                                type='file',
                                **res_kwargs)
                    # query the actually modified paths only
                    ds_paths = mod_paths

                # and lastly, query -- even if we set before -- there could
                # be side-effect from multiple set paths on an individual
                # path, hence we need to query to get the final result
                for file, meta in ds.repo.get_metadata(ds_paths):
                    r = get_status_dict(status='ok',
                                        path=opj(ds.path, file),
                                        type='file',
                                        metadata=meta,
                                        **res_kwargs)
                    yield r
        # save potential modifications to dataset global metadata
        if not to_save:
            return
        for res in Save.__call__(files=to_save,
                                 dataset=refds_path,
                                 message='[DATALAD] dataset metadata update',
                                 return_type='generator',
                                 result_xfm=None,
                                 result_filter=None,
                                 on_failure='ignore'):
            yield res

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        from datalad.ui import ui
        if res['status'] != 'ok' or not res.get('action', None) == 'metadata':
            # logging complained about this already
            return
        # list the path, available metadata keys, and tags
        path = relpath(res['path'], res['refds']) if res.get(
            'refds', None) else res['path']
        meta = res.get('metadata', {})
        ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
            path=path,
            type=' ({})'.format(res['type']) if 'type' in res else '',
            spacer=' ' if len([m for m in meta if m != 'tag']) else '',
            meta=','.join(k for k in sorted(meta.keys())
                          if not k == 'tag') if meta else ' -',
            tags=''
            if 'tag' not in meta else ' [{}]'.format(','.join(meta['tag']))))
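
# A minimal usage sketch, separate from the class above. It assumes DataLad is
# installed, the command is exposed as `datalad.api.metadata`, 'path/to/ds' is
# a hypothetical annex dataset, and 'sub-01/anat.nii.gz' is a file in it.
from datalad.api import metadata

# query the file metadata recorded by git-annex for a single file
for res in metadata('path/to/ds/sub-01/anat.nii.gz', dataset='path/to/ds'):
    print(res.get('path'), res.get('metadata'))

# define a key in the dataset-global metadata ('definition' record); the
# ontology URL is only an illustration
metadata(dataset='path/to/ds', define_key={
    'species': 'http://purl.obolibrary.org/obo/NCBITaxon_9606'})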
Example #20
class AnnotatePaths(Interface):
    """Analyze and act upon input paths

    Given paths (or more generally location requests) are inspected and
    annotated with a number of properties. A list of recognized properties
    is provided below.

    || PYTHON >>Input `paths` for this command can either be un-annotated
    (raw) path strings, or already (partially) annotated paths. In the latter
    case, further annotation is limited to yet-unknown properties, and is
    potentially faster than initial annotation.<< PYTHON ||


    *Recognized path properties*

    {proplist}

    In the case of enabled modification detection the results may contain
    additional properties regarding the nature of the modification. See the
    documentation of the `diff` command for details.

    """
    _docs_ = dict(
        proplist='\n\n    '.join(
            '"{}"\n{}'.format(
                k,
                textwrap.fill(known_props[k],
                              initial_indent='        ',
                              subsequent_indent='        '))
            for k in sorted(known_props)))

    _params_ = dict(
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path to be annotated""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""an optional reference/base dataset for the paths""",
            constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        action=Parameter(
            args=("--action",),
            metavar="LABEL",
            doc="""an "action" property value to include in the
            path annotation""",
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_status=Parameter(
            args=("--unavailable-path-status",),
            metavar="LABEL",
            doc="""a "status" property value to include in the
            annotation for paths that are underneath a dataset, but
            do not exist on the filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_msg=Parameter(
            args=("--unavailable-path-msg",),
            metavar="message",
            doc="""a "message" property value to include in the
            annotation for paths that are underneath a dataset, but
            do not exist on the filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        nondataset_path_status=Parameter(
            args=("--nondataset-path-status",),
            metavar="LABEL",
            doc="""a "status" property value to include in the
            annotation for paths that are not underneath any dataset""",
            constraints=EnsureStr() | EnsureNone()),
        force_parentds_discovery=Parameter(
            args=("--no-parentds-discovery",),
            dest='force_parentds_discovery',
            action='store_false',
            doc="""Flag to disable reports of parent dataset information for any
            path, in particular dataset root paths. Disabling saves on command
            run time, if this information is not needed."""),
        force_subds_discovery=Parameter(
            args=("--no-subds-discovery",),
            action='store_false',
            dest='force_subds_discovery',
            doc="""Flag to disable reporting type='dataset' for subdatasets, even
            when they are not installed, or their mount point directory doesn't
            exist. Disabling saves on command run time, if this information is
            not needed."""),
        force_untracked_discovery=Parameter(
            args=("--no-untracked-discovery",),
            action='store_false',
            dest='force_untracked_discovery',
            doc="""Flag to disable discovery of untracked changes.
                Disabling saves on command run time, if this information is
                not needed."""),
        force_no_revision_change_discovery=Parameter(
            args=("--revision-change-discovery",),
            action='store_false',
            dest='force_no_revision_change_discovery',
            doc="""Flag to disable discovery of changes which were not yet committed.
            Disabling saves on command run time, if this information is
            not needed."""),
        modified=Parameter(
            args=("--modified",),
            nargs='?',
            const=True,
            constraints=EnsureStr() | EnsureBool() | EnsureNone(),
            doc="""comparison reference specification for modification detection.
            This can be (mostly) anything that `git diff` understands (commit,
            treeish, tag, etc). See the documentation of `datalad diff --revision`
            for details. Unmodified paths will not be annotated. If a requested
            path was not modified but some content underneath it was, then the
            request is replaced by the modified paths and those are annotated
            instead. This option can be used [PY: with `True` as PY][CMD: without CMD]
            an argument to test against changes that have been made, but have not
            yet been staged for a commit."""))

    @staticmethod
    @datasetmethod(name='annotate_paths')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset (non-given or found)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds,
                        refds_path,
                        action,
                        recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(
                        **dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be merged with the treatment of non-dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = []
            if requested_paths:
                preserved_paths = [
                    r for r in requested_paths
                    if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen whenever `path` is a subdataset that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path if not islink(path) else normpath(opj(path, pardir))
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and 'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                # if the path doesn't exist, is labeled a directory, or is a
                # dataset lacking this info -> record whether this is a known
                # subdataset of its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
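
A minimal usage sketch (not part of the original listing): it drives the generator interface shown above. The import location reflects older DataLad releases that still shipped `AnnotatePaths`, and the dataset path and requested paths are hypothetical placeholders.

# Hedged sketch: consume annotated-path results as a generator, mirroring how
# downstream commands (e.g. publish further below) call AnnotatePaths.
from datalad.interface.annotate_paths import AnnotatePaths  # old module layout

for ap in AnnotatePaths.__call__(
        dataset='/tmp/some/ds',      # hypothetical reference dataset
        path=['code', 'data/raw'],   # hypothetical paths to annotate
        recursive=True,
        action='demo',
        unavailable_path_status='impossible',
        return_type='generator',
        on_failure='ignore'):
    # each result is a status dict; paths outside any dataset come back with
    # the configured nondataset_path_status ('error' by default)
    print(ap.get('status', ''), ap.get('type', 'unknown'), ap['path'])
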
Example #21
class Save(Interface):
    """Save the current state of a dataset

    Saving the state of a dataset records changes that have been made to it.
    This change record is annotated with a user-provided description.
    Optionally, an additional tag, such as a version, can be assigned to the
    saved state. Such a tag enables straightforward retrieval of past versions
    at a later point in time.

    Examples:

      Save any content underneath the current directory, without altering
      any potential subdataset (use --recursive for that)::

        % datalad save .

      Save any modification of known dataset content, but leave untracked
      files (e.g. temporary files) untouched::

        % datalad save -u -d <path_to_dataset>

      Tag the most recent saved state of a dataset::

        % datalad save -d <path_to_dataset> --version-tag bestyet

    .. note::
      For performance reasons, any Git repository without an initial commit
      located inside a Dataset is ignored, and content underneath it will be
      saved to the respective superdataset. DataLad datasets always have an
      initial commit, hence are not affected by this behavior.
    """
    # note above documents that our behavior is like that of `git add`, but
    # does not explicitly mention the connection to keep it simple.

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc=""""specify the dataset to save""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path/name of the dataset component to save. If given, only
            changes made to those components are recorded in the new state.""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        message=save_message_opt,
        message_file=Parameter(
            args=("-F", "--message-file"),
            doc="""take the commit message from this file. This flag is
            mutually exclusive with -m.""",
            constraints=EnsureStr() | EnsureNone()),
        version_tag=Parameter(
            args=(
                "-t",
                "--version-tag",
            ),
            metavar='ID',
            doc="""an additional marker for that state. Every dataset that
            is touched will receive the tag.""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        updated=Parameter(
            args=(
                '-u',
                '--updated',
            ),
            action='store_true',
            doc="""if given, only saves previously tracked paths."""),
        to_git=Parameter(
            args=("--to-git", ),
            action='store_true',
            doc="""flag whether to add data directly to Git, instead of
            tracking data identity only.  Usually this is not desired,
            as it inflates dataset sizes and impacts flexibility of data
            transport. If not specified, it will be up to git-annex to
            decide, possibly based on .gitattributes options. Use this flag
            with a simultaneous selection of paths to save. In general,
            it is better to pre-configure a dataset to track particular paths,
            file types, or file sizes with either Git or git-annex.
            See https://git-annex.branchable.com/tips/largefiles/"""),
    )

    @staticmethod
    @datasetmethod(name='save')
    @eval_results
    def __call__(
        path=None,
        message=None,
        dataset=None,
        version_tag=None,
        recursive=False,
        recursion_limit=None,
        updated=False,
        message_file=None,
        to_git=None,
    ):
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        path = assure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   than a switch from --untracked=no to --untracked=all
        #   in Repo.save()
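        # For illustration only (hypothetical dataset path), the scenarios
        # above roughly correspond to these command line invocations:
        #   (1) datalad save -u -d <path_to_dataset>           # tracked content only
        #   (2) datalad save -d <path_to_dataset>              # any content, incl. untracked
        #   (3) datalad save -d <path_to_dataset> <paths>...   # limited to the given paths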

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='saving')

        # use status() to do all discovery and annotation of paths
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as given,
                # and not a dataset instance in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                recursive=recursive,
                recursion_limit=recursion_limit,
                on_failure='ignore',
                result_renderer='disabled'):
            if s['status'] == 'error':
                # Downstream code can't do anything with these. Let the caller
                # decide their fate.
                yield s
                continue

            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in iteritems(s)
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in iteritems(dataset_hierarchies):
            edges = {}
            discover_dataset_trace_to_targets(rootds,
                                              children, [],
                                              edges,
                                              includeds=children)
            for superds, subdss in iteritems(edges):
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    # TODO actually start from an entry that may already
                    # exist in the status record
                    superds_status[ut.Path(subds)] = dict(
                        # shot from the hip, some status config
                        # to trigger this specific super/sub
                        # relation to be saved
                        state='untracked',
                        type='dataset')
                paths_by_ds[superds] = superds_status

        # TODO parallelize, whenever we have multiple subdataset of a single
        # dataset they can all be processed simultaneously
        # sort list of dataset to handle, starting with the ones deep down
        for pdspath in sorted(paths_by_ds, reverse=True):
            pds = Dataset(pdspath)
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location, this avoids
                # cumbersome symlink handling without context in the
                # lower levels
                pds.repo.pathobj / p.relative_to(pdspath): props
                for p, props in iteritems(paths_by_ds.pop(pdspath))
            }
            start_commit = pds.repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()):
                for res in pds.repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        git=True
                        if not hasattr(ds.repo, 'annexstatus') else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = text_type(
                                # recode path back to dataset path anchor
                                pds.pathobj /
                                res[k].relative_to(pds.repo.pathobj))
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                status='ok'
                if start_commit != pds.repo.get_hexsha() else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                continue
            try:
                pds.repo.tag(version_tag)
                dsres.update(status='ok', version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(status='error',
                             message=('cannot tag this version: %s',
                                      e.stderr.strip()))
                yield dsres
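
A hedged usage sketch for the Save interface above (not part of the original listing): the dataset location is a placeholder, and the exact set of accepted keyword arguments depends on the installed DataLad version.

# Sketch: save everything under a dataset, tag the resulting state, and
# consume the result records as they are produced.
from datalad.api import Dataset

ds = Dataset('/tmp/myds')  # hypothetical dataset path
for res in ds.save(message='checkpoint before analysis',
                   version_tag='v0.1',
                   recursive=True,
                   return_type='generator',
                   result_renderer='disabled'):
    # each record reports one saved dataset or path
    print(res['status'], res['path'])
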
Example #22
class Update(Interface):
    """Update a dataset from a sibling."""

    _params_ = dict(
        name=Parameter(
            args=("name",),
            doc="""name of the sibling to update from""",
            nargs="?",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""specify the dataset to update. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        merge=Parameter(
            args=("--merge",),
            action="store_true",
            doc="Merge changes from sibling `name` or the remote branch, "
                "configured to be the tracking branch if no sibling was "
                "given.",),
        # TODO: How to document it without using the term 'tracking branch'?
        recursive=Parameter(
            args=("-r", "--recursive"),
            action="store_true",
            doc="""If set this updates all possibly existing subdatasets,
             too."""),
        fetch_all=Parameter(
            args=("--fetch-all",),
            action="store_true",
            doc="Fetch updates from all siblings.",),
        reobtain_data=Parameter(
            args=("--reobtain-data",),
            action="store_true",
            doc="TODO"),)

    @staticmethod
    @datasetmethod(name='update')
    def __call__(name=None, dataset=None,
                 merge=False, recursive=False, fetch_all=False,
                 reobtain_data=False):
        """
        """
        # TODO: Is there an 'update filehandle' similar to install and publish?
        # What does it mean?

        if reobtain_data:
            # TODO: properly define, what to do
            raise NotImplementedError("TODO: Option '--reobtain-data' not "
                                      "implemented yet.")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the current working directory of the process:
        if ds is None:
            # try to find a dataset at or above PWD:
            dspath = GitRepo.get_toppath(getpwd())
            if dspath is None:
                raise ValueError("No dataset found at %s." % getpwd())
            ds = Dataset(dspath)
        assert(ds is not None)

        if not ds.is_installed():
            raise ValueError("No installed dataset found at "
                             "{0}.".format(ds.path))
        assert(ds.repo is not None)

        repos_to_update = [ds.repo]
        if recursive:
            repos_to_update += [GitRepo(opj(ds.path, sub_path))
                                for sub_path in
                                ds.get_dataset_handles(recursive=True)]

        for repo in repos_to_update:
            # get all remotes:
            remotes = repo.git_get_remotes()
            if name and name not in remotes:
                lgr.warning("'%s' not known to dataset %s.\nSkipping" %
                            (name, repo.path))
                continue

            # Currently '--merge' works for single remote only:
            # TODO: - condition still incomplete
            #       - We can merge if a remote was given or there is a
            #         tracking branch
            #       - we also can fetch all remotes independently on whether or
            #         not we merge a certain remote
            if not name and len(remotes) > 1 and merge:
                lgr.debug("Found multiple remotes:\n%s" % remotes)
                raise NotImplementedError("No merge strategy for multiple "
                                          "remotes implemented yet.")
            lgr.info("Updating handle '%s' ..." % repo.path)

            # fetch remote(s):
            repo.git_fetch(name if name else '',
                           "--all" if fetch_all else '')

            # if it is an annex and there is a tracking branch, and we didn't
            # fetch the entire remote anyway, explicitly fetch git-annex
            # branch:
            # TODO: Is this logic correct? Shouldn't we fetch git-annex from
            # `name` if there is any (or if there is no tracking branch but we
            # have a `name`)?
            if knows_annex(repo.path) and not fetch_all:
                # check for tracking branch's remote:
                try:
                    std_out, std_err = \
                        repo._git_custom_command('',
                        ["git", "config", "--get",
                         "branch.{active_branch}.remote".format(
                             active_branch=repo.git_get_active_branch())])
                except CommandError as e:
                    if e.code == 1 and e.stdout == "":
                        std_out = None
                    else:
                        raise
                if std_out:  # we have a "tracking remote"
                    repo.git_fetch("%s git-annex" % std_out.strip())

            # merge:
            if merge:
                lgr.info("Applying changes from tracking branch...")
                cmd_list = ["git", "pull"]
                if name:
                    cmd_list.append(name)
                    # branch needed, if not default remote
                    # => TODO: use default remote/tracking branch to compare
                    #          (see above, where git-annex is fetched)
                    # => TODO: allow for passing a branch
                    # (or more general refspec?)
                    # For now, just use the same name
                    cmd_list.append(repo.git_get_active_branch())

                out, err = repo._git_custom_command('', cmd_list)
                lgr.info(out)
                if knows_annex(repo.path):
                    # annex-apply:
                    lgr.info("Updating annex ...")
                    out, err = repo._git_custom_command('', ["git", "annex", "merge"])
                    lgr.info(out)
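
A hedged usage sketch for this (older) Update interface: the sibling name and dataset path are placeholders, and the call reflects the signature shown above; newer DataLad releases changed both the parameters and the internal Git helpers.

# Sketch: fetch from a known sibling and merge its tracking branch,
# descending into installed subdatasets as well.
from datalad.api import Dataset

ds = Dataset('/tmp/myds')  # hypothetical dataset path
ds.update(name='origin', merge=True, recursive=True)
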
Example #23
class Remove(Interface):
    """Remove components from datasets

    Removing "unlinks" a dataset component, such as a file or subdataset, from
    a dataset. Such a removal advances the state of a dataset, just like adding
    new content. A remove operation can be undone by restoring a previous
    dataset state, but might require re-obtaining file content and subdatasets
    from remote locations.

    This command relies on the 'drop' command for safe operation. By default,
    only file content from datasets which will be uninstalled as part of
    a removal will be dropped. Otherwise file content is retained, such that
    restoring a previous version also immediately restores file content access,
    just as it is the case for files directly committed to Git. This default
    behavior can be changed to always drop content prior to removal, for cases
    where a minimal storage footprint for local dataset installations is
    desirable.

    Removing a dataset component is always a recursive operation. Removing a
    directory removes all content underneath the directory, too. If
    subdatasets are located under a to-be-removed path, they will be
    uninstalled entirely, and all their content dropped. If any subdataset
    cannot be uninstalled safely, the remove operation will fail and halt.

    .. versionchanged:: 0.16
       More in-depth and comprehensive safety-checks are now performed by
       default.
       The ``if_dirty||--if-dirty`` argument is ignored, will be removed in
       a future release, and can be removed for a safe-by-default behavior. For
       other cases consider the ``reckless||--reckless`` argument.
       The ``save||--save`` argument is ignored and will be removed in a future
       release, a dataset modification is now always saved. Consider save's
       ``amend||--amend`` argument for post-remove fix-ups.
       The ``recursive||--recursive`` argument is ignored, and will be removed
       in a future release. Removal operations are always recursive, and the
       parameter can be stripped from calls for a safe-by-default behavior.

    .. deprecated:: 0.16
       The ``check||--check`` argument will be removed in a future release.
       It needs to be replaced with ``reckless||--reckless``.
    """
    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          metavar="DATASET",
                          doc="""specify the dataset to perform remove from.
            If no dataset is given, the current working directory is used
            as operation context""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar="PATH",
            doc="path of a dataset or dataset component to be removed",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        drop=Parameter(
            args=("--drop", ),
            doc="""which dataset components to drop prior removal. This
            parameter is passed on to the underlying drop operation as
            its 'what' argument.""",
            # we must not offer a 'nothing' which would bypass
            # the `drop()` call. The implementation completely
            # relies on `drop()` for all safety measures.
            # instead `drop(reckless=kill)` must be used to fast-kill
            # things
            constraints=EnsureChoice('datasets', 'all')),
        jobs=jobs_opt,
        message=save_message_opt,
        # XXX deprecate!
        save=Parameter(
            args=("--nosave", ),
            dest='save',
            action="store_false",
            doc="""DEPRECATED and IGNORED; use `save --amend` instead"""),
        recursive=Parameter(
            args=(
                "--recursive",
                '-r',
            ),
            action='store_const',
            const=None,
            doc="""DEPRECATED and IGNORED: removal is always a recursive
            operation"""),
    )
    # inherit some from Drop
    # if_dirty and check as deprecated
    for p in (
            'reckless',
            'if_dirty',
            'check',
    ):
        _params_[p] = Drop._params_[p]

    _examples_ = [
        dict(
            text=
            "Permanently remove a subdataset (and all further subdatasets contained in it) from a dataset",
            code_py="remove(dataset='path/to/dataset', path='path/to/subds')",
            code_cmd="datalad remove -d <path/to/dataset> <path/to/subds>"),
        dict(
            text=
            "Permanently remove a superdataset (with all subdatasets) from the filesystem",
            code_py="remove(dataset='path/to/dataset')",
            code_cmd="datalad remove -d <path/to/dataset>"),
        dict(
            text=
            "DANGER-ZONE: Fast wipe-out a dataset and all its subdataset, bypassing all safety checks",
            code_py="remove(dataset='path/to/dataset', reckless='kill')",
            code_cmd="datalad remove -d <path/to/dataset> --reckless kill"),
    ]

    @staticmethod
    @datasetmethod(name='remove')
    @eval_results
    def __call__(
            path=None,
            *,
            dataset=None,
            drop='datasets',
            reckless=None,
            message=None,
            jobs=None,
            # deprecated below
            recursive=None,
            check=None,
            save=None,
            if_dirty=None):

        # deprecate checks
        if if_dirty is not None:
            warnings.warn(
                "The `if_dirty` argument of `datalad remove` is ignored, "
                "it can be removed for a safe-by-default behavior. For "
                "other cases consider the `reckless` argument.",
                DeprecationWarning)

        if save is not None:
            warnings.warn(
                "The `save` argument of `datalad remove` is ignored. "
                "A dataset modification is always saved. Consider "
                "`save --amend` if post-remove fix-ups are needed.",
                DeprecationWarning)

        if recursive is not None:
            warnings.warn(
                "The `recursive` argument of `datalad remove` is ignored. "
                "Removal operations are always recursive, and the parameter "
                "can be stripped from calls for a safe-by-default behavior. ",
                DeprecationWarning)

        if check is not None:
            warnings.warn(
                "The `check` argument of `datalad remove` is deprecated, "
                "use the `reckless` argument instead.", DeprecationWarning)

        if check is False:
            if reckless is not None:
                raise ValueError(
                    'Must not use the deprecated `check` argument and the new '
                    '`reckless` argument together with `datalad remove`.')
            reckless = 'availability'

        refds = require_dataset(dataset,
                                check_installed=True,
                                purpose='remove')
        # same path resolution that drop will do
        paths_by_ds, errors = get_paths_by_ds(
            refds,
            dataset,
            ensure_list(path),
            # super-mode will readily tell us which datasets to
            # save at the end
            subdsroot_mode='super')

        drop_success = True
        for res in Drop.__call__(
                dataset=dataset,
                path=path,
                what=drop,
                reckless=reckless,
                recursive=True,
                recursion_limit=None,
                jobs=jobs,
                result_xfm=None,
                return_type='generator',
                result_renderer='disabled',
                # delegate error handling here
                on_failure='ignore'):
            if res.get('status') not in ('ok', 'notneeded'):
                drop_success = False
            yield res

        if not drop_success:
            # there will be 'rm -rf' below, so play safe
            lgr.debug('Observed drop failure, will not attempt remove')
            return

        for dpath, paths in paths_by_ds.items():
            for delpath in ([dpath] if paths is None else paths):
                if lexists(str(delpath)):
                    # here we still have something around on the
                    # filesystem. There is no need to fiddle with
                    # Git, just wipe it out. A later save() will
                    # act on it properly
                    if delpath.is_dir():
                        lgr.debug('Remove directory: %s', delpath)
                        rmtree(delpath)
                    # cannot use .exists(), must foresee dead symlinks
                    else:
                        lgr.debug('Remove file: %s', delpath)
                        delpath.unlink()
                    continue
                # if we get here, there is nothing on the file system
                # anymore at this path. Either because the parent
                # dataset vanished already, or because we dropped a
                # dataset, and it still needs to be unregistered
                # from its parent -> `git rm`
                if dpath.exists():
                    GitRepo(dpath).call_git(
                        # no need for recursion, we know that even the root
                        # path no longer exists
                        ['rm', '-q'],
                        files=[str(delpath.relative_to(dpath))])
                    # this path was already being removed by drop
                    # so it must belong to a dropped dataset
                    # save won't report about this, let's do it
                    yield dict(
                        action='remove',
                        status='ok',
                        path=str(delpath),
                        type='dataset',
                    )

        if not refds.is_installed():
            # we already dropped the whole thing
            return

        for res in Save.__call__(
                dataset=dataset,
                path=path,
                # we might have removed the reference dataset by now, recheck
                message=message if message else '[DATALAD] removed content',
                return_type='generator',
                result_renderer='disabled',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            if res.get('action') == 'delete':
                # normalize to previous remove results
                res['action'] = 'remove'
            yield res
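
A hedged usage sketch for the Remove interface above: the paths are placeholders, and it simply mirrors the documented drop-then-remove flow where drop() performs the safety checks and remove() wipes leftovers and saves the superdataset.

# Sketch: uninstall and unregister a subdataset with default safety checks.
from datalad.api import remove

remove(dataset='/tmp/superds',       # hypothetical superdataset
       path='code/old-subds',        # hypothetical subdataset path
       message='retire obsolete subdataset')

# DANGER-ZONE equivalent of the last documented example (bypasses checks):
# remove(dataset='/tmp/superds', reckless='kill')
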
Example #24
class Publish(Interface):
    """Publish a dataset to a known :term:`sibling`.

    This makes the last saved state of a dataset available to a sibling
    or special remote data store of a dataset. Any target sibling must already
    exist and be known to the dataset.

    Optionally, it is possible to limit publication to change sets relative
    to a particular point in the version history of a dataset (e.g. a release
    tag). By default, the state of the local dataset is evaluated against the
    last known state of the target sibling. Actual publication is only attempted
    if there was a change compared to the reference state, in order to speed up
    processing of large collections of datasets. Evaluation with respect to
    a particular "historic" state is only supported in conjunction with a
    specified reference dataset. Change sets are also evaluated recursively, i.e.
    only those subdatasets are published where a change was recorded that is
    reflected in the current state of the top-level reference dataset.
    See "since" option for more information.

    Only publication of saved changes is supported. Any unsaved changes in a
    dataset (hierarchy) have to be saved before publication.

    .. note::
      Power-user info: This command uses :command:`git push`, and :command:`git annex copy`
      to publish a dataset. Publication targets are either configured remote
      Git repositories, or git-annex special remotes (if they support data
      upload).

    .. note::
      This command is deprecated. It will be removed from DataLad eventually,
      but no earlier than the 0.15 release. The `push` command (new in 0.13.0)
      provides an alternative interface. Critical differences are that `push`
      transfers annexed data by default and does not handle sibling creation
      (i.e. it does not have a `--missing` option).
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True
    # TODO: Figure out, how to tell about tracking branch/upstream
    #      (and the respective remote)
    #      - it is used, when no destination is given
    #      - it is configured to be the given destination, if there was no
    #        upstream set up before, so you can use just "datalad publish" next
    #        time.

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the (top-level) dataset to be published. If no dataset
            is given, the datasets are determined based on the input arguments""",
            constraints=EnsureDataset() | EnsureNone()),
        to=Parameter(
            args=("--to", ),
            metavar='LABEL',
            doc="""name of the target sibling. If no name is given an attempt is
            made to identify the target based on the dataset's configuration
            (i.e. a configured tracking branch, or a single sibling that is
            configured for publication)""",
            # TODO: See TODO at top of class!
            constraints=EnsureStr() | EnsureNone()),
        since=Parameter(
            args=("--since", ),
            constraints=EnsureStr() | EnsureNone(),
            doc=
            """specifies commit-ish (tag, shasum, etc.) from which to look for
            changes to decide whether pushing is necessary.
            If '^' is given, the last state of the current branch at the sibling
            is taken as a starting point. An empty string ('') for the same
            effect is still supported."""),
        # since: commit => .gitmodules diff to head => submodules to publish
        missing=missing_sibling_opt,
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            # TODO this description is no longer correct
            doc="path(s), that may point to file handle(s) to publish including "
            "their actual content or to subdataset(s) to be published. If a "
            "file handle is published with its data, this implicitly means "
            "to also publish the (sub)dataset it belongs to. '.' as a path "
            "is treated in a special way in the sense, that it is passed "
            "to subdatasets in case `recursive` is also given.",
            constraints=EnsureStr() | EnsureNone(),
            nargs='*'),
        force=Parameter(
            args=(
                "-f",
                "--force",
            ),
            doc="""enforce doing publish activities (git push etc) regardless of
            the analysis if they seemed needed""",
            action='store_true'),
        # TODO add option to decide what branch/repo to push
        transfer_data=Parameter(args=("--transfer-data", ),
                                doc="""ADDME""",
                                constraints=EnsureChoice(
                                    'auto', 'none', 'all')),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        git_opts=git_opts,
        annex_opts=annex_opts,
        annex_copy_opts=annex_copy_opts,
        jobs=jobs_opt,
    )

    @staticmethod
    @datasetmethod(name='publish')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 to=None,
                 since=None,
                 missing='fail',
                 force=False,
                 transfer_data='auto',
                 recursive=False,
                 recursion_limit=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_copy_opts=None,
                 jobs=None):

        import warnings
        warnings.warn("`publish` is deprecated. Use `datalad push` instead.",
                      DeprecationWarning)

        # if ever we get a mode, for "with-data" we would need this
        #if dataset and not path:
        #    # act on the whole dataset if nothing else was specified
        #    path = dataset.path if isinstance(dataset, Dataset) else dataset

        if not (isinstance(dataset, Dataset) or (dataset is None and path)):
            # try to find a dataset in PWD
            dataset = require_dataset(dataset,
                                      check_installed=True,
                                      purpose='publishing')

        if (since and since != '^') and not dataset:
            raise InsufficientArgumentsError(
                'Modification detection (--since) without a base dataset '
                'is not supported')

        if dataset and since in ('', '^'):
            # only update since last update so we figure out what was the last update
            active_branch = dataset.repo.get_active_branch()
            if to:
                # XXX here we assume one to one mapping of names from local branches
                # to the remote
                since = '%s/%s' % (to, active_branch)
                # test if such branch already exists,
                if since not in dataset.repo.get_remote_branches():
                    lgr.debug(
                        "No remote branch %s yet, so since will not be used",
                        since)
                    since = None
            else:
                # take tracking remote for the active branch
                tracked_remote, tracked_refspec = dataset.repo.get_tracking_branch(
                )
                if tracked_remote:
                    if tracked_refspec.startswith('refs/heads/'):
                        tracked_refspec = tracked_refspec[len('refs/heads/'):]
                    #to = tracked_remote
                    since = '%s/%s' % (tracked_remote, tracked_refspec)
                else:
                    lgr.info(
                        "No tracked remote for %s. since option is of no effect",
                        active_branch)
                    since = None

        # here is the plan
        # 1. figure out remote to publish to
        # 2. figure out which content needs to be published to this remote
        # 3. look for any pre-publication dependencies of that remote
        #    (i.e. remotes that need to be published to before)
        # 4. publish the content needed to go to the primary remote to
        #    the dependencies first, and to the primary afterwards
        ds_remote_info = {}

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(refds=refds_path, logger=lgr, action='publish')

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='publish',
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified="%s..HEAD" % since if since else since,
                return_type='generator',
                on_failure='ignore',
                force_no_revision_change_discovery=
                False,  # we cannot publish what was not committed
                force_untracked_discovery=False  # we cannot publish untracked
        ):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            remote_info_result = None
            if ap.get('type', ap.get('type_src', 'dataset')) != 'dataset':
                # for everything that is not a dataset get the remote info
                # for the parent
                parentds = ap.get('parentds', None)
                if parentds and parentds not in ds_remote_info:
                    remote_info_result = _get_remote_info(
                        parentds, ds_remote_info, to, missing)
            else:
                # this is a dataset
                if ap.get('state', None) == 'absent':
                    continue
                # get the remote info for itself
                remote_info_result = _get_remote_info(ap['path'],
                                                      ds_remote_info, to,
                                                      missing)
                ap['process_content'] = True
            if remote_info_result is not None:
                ap['status'] = remote_info_result[0]
                ap['message'] = remote_info_result[1]
                yield ap
                continue
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert (not completed)

        lgr.debug("Evaluating %i dataset publication candidate(s)",
                  len(content_by_ds))
        # TODO: fancier sorting, so we still follow somewhat the hierarchy
        #       in sorted order, e.g.
        #  d1/sub1/sub1
        #  d1/sub1
        #  d1
        #  d2/sub1
        #  d2
        content_by_ds = OrderedDict(
            (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True))

        lgr.debug("Attempt to publish %i datasets", len(content_by_ds))
        for ds_path in content_by_ds:
            remote_info = ds_remote_info.get(ds_path, None)
            if remote_info is None:
                # maybe this dataset wasn't annotated above, try to get info
                # MIH: I think this entire if-branch is practically impossible
                # to reach. It is certainly untested, but I think this is due
                # to mutually exclusive conditions during remote_info detection
                remote_info_result = _get_remote_info(ds_path, ds_remote_info,
                                                      to, missing)
                if remote_info_result is not None:
                    yield get_status_dict(type='dataset',
                                          path=ds_path,
                                          status=remote_info_result[0],
                                          message=remote_info_result[1],
                                          **res_kwargs)
                    continue
                # continue with freshly obtained info
                remote_info = ds_remote_info[ds_path]
                # condition above must catch all other cases
                assert remote_info
            # and publish
            ds = Dataset(ds_path)
            for r in _publish_dataset(
                    ds,
                    remote=remote_info['remote'],
                    refspec=remote_info.get('refspec', None),
                    # only send paths that were explicitly requested
                    paths=[
                        p for p in content_by_ds[ds_path]
                        # do not feed (sub)dataset paths into the beast
                        # makes no sense to try to annex copy them
                        # for the base dataset itself let `transfer_data`
                        # decide
                        if p.get('type', None) != 'dataset'
                    ],
                    annex_copy_options=annex_copy_opts,
                    force=force,
                    jobs=jobs,
                    transfer_data=transfer_data,
                    **res_kwargs):
                yield r
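
Using the example hierarchy from the TODO above, a minimal standalone sketch
(not part of the command itself) of what the reverse-lexicographic sort does:
a subdataset path always sorts after its parent in ascending order, so
reversing the order guarantees that subdatasets are published before the
datasets containing them, although, as the TODO notes, the two hierarchies are
not kept together.

paths = ['d1', 'd1/sub1', 'd1/sub1/sub1', 'd2', 'd2/sub1']
print(sorted(paths, reverse=True))
# -> ['d2/sub1', 'd2', 'd1/sub1/sub1', 'd1/sub1', 'd1']
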
Example #25
0
class CreateSibling(Interface):
    """Create a dataset sibling on a UNIX-like Shell (local or SSH)-accessible machine

    Given a local dataset, and a path or SSH login information, this command
    creates a remote dataset repository and configures it as a dataset sibling
    to be used as a publication target (see `publish` command).

    Various properties of the remote sibling can be configured (e.g. name,
    location on the server, read and write access URLs, and access
    permissions).

    Optionally, a basic web-viewer for DataLad datasets can be installed
    at the remote location.

    This command supports recursive processing of dataset hierarchies, creating
    a remote sibling for each dataset in the hierarchy. By default, remote
    siblings are created in a hierarchical structure that reflects the
    organization on the local file system. However, a simple templating
    mechanism is provided to produce a flat list of datasets (see
    --target-dir).
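
    For example, a --target-dir template such as "/data/%RELNAME" would place
    the siblings of all (sub)datasets side by side in a single directory on
    the server.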
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True

    _params_ = dict(
        # TODO: Figure out, whether (and when) to use `sshurl` as push url
        dataset=Parameter(
            args=("--dataset", "-d",),
            doc="""specify the dataset to create the publication target for. If
                no dataset is given, an attempt is made to identify the dataset
                based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        sshurl=Parameter(
            args=("sshurl",),
            metavar='SSHURL',
            nargs='?',
            doc="""Login information for the target server. This can be given
                as a URL (ssh://host/path), SSH-style (user@host:path) or just
                a local path.
                Unless overridden, this also serves as the future dataset's access
                URL and path on the server.""",
            constraints=EnsureStr()),
        name=Parameter(
            args=('-s', '--name',),
            metavar='NAME',
            doc="""sibling name to create for this publication target.
                If `recursive` is set, the same name will be used to label all
                the subdatasets' siblings. If creating the target dataset fails,
                no sibling is added""",
            constraints=EnsureStr() | EnsureNone(),
            nargs="?"),
        target_dir=Parameter(
            args=('--target-dir',),
            metavar='PATH',
            doc="""path to the directory *on the server* where the dataset
                shall be created. By default this is set to the URL (or local
                path) specified via [PY: `sshurl` PY][CMD: SSHURL CMD]. If a
                relative path is provided here, it is interpreted as being
                relative to the user's home directory on the server (or
                relative to [PY: `sshurl` PY][CMD: SSHURL CMD], when that is a
                local path).
                Additional features are relevant for recursive processing of
                datasets with subdatasets. By default, the local
                dataset structure is replicated on the server. However, it is
                possible to provide a template for generating different target
                directory names for all (sub)datasets. Templates can contain
                certain placeholders that are substituted for each (sub)dataset.
                For example: "/mydirectory/dataset%%RELNAME".\nSupported
                placeholders:\n
                %%RELNAME - the name of the dataset, with any slashes replaced by
                dashes\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_url=Parameter(
            args=('--target-url',),
            metavar='URL',
            doc=""""public" access URL of the to-be-created target dataset(s)
                (default: `sshurl`). Accessibility of this URL determines the
                access permissions of potential consumers of the dataset.
                As with `target_dir`, templates (same set of placeholders)
                are supported.  Also, if specified, it is provided as the annex
                description\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_pushurl=Parameter(
            args=('--target-pushurl',),
            metavar='URL',
            doc="""In case the `target_url` cannot be used to publish to the
                dataset, this option specifies an alternative URL for this
                purpose. As with `target_url`, templates (same set of
                placeholders) are supported.\n""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        existing=Parameter(
            args=("--existing",),
            constraints=EnsureChoice('skip', 'error', 'reconfigure', 'replace'),
            metavar='MODE',
            doc="""action to perform, if a sibling is already configured under the
            given name and/or a target (non-empty) directory already exists.
            In this case, a dataset can be skipped ('skip'), the sibling
            configuration be updated ('reconfigure'), or process interrupts with
            error ('error'). DANGER ZONE: If 'replace' is used, an existing target
            directory will be forcefully removed, re-initialized, and the
            sibling (re-)configured (thus implies 'reconfigure').
            `replace` could lead to data loss, so use with care.  To minimize
            possibility of data loss, in interactive mode DataLad will ask for
            confirmation, but it would just issue a warning and proceed in
            non-interactive mode.
            """,),
        inherit=inherit_opt,
        shared=Parameter(
            args=("--shared",),
            metavar='{false|true|umask|group|all|world|everybody|0xxx}',
            doc="""if given, configures the access permissions on the server
            for multiple users (this could include access by a webserver!).
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool() | EnsureNone()),
        group=Parameter(
            args=("--group",),
            metavar="GROUP",
            doc="""Filesystem group for the repository. Specifying the group is
            particularly important when [CMD: --shared=group CMD][PY:
            shared="group" PY]""",
            constraints=EnsureStr() | EnsureNone()
        ),
        ui=Parameter(
            args=("--ui",),
            metavar='{false|true|html_filename}',
            doc="""publish a web interface for the dataset with an
            optional user-specified name for the HTML file at the publication
            target. Defaults to `index.html` at the dataset root""",
            constraints=EnsureBool() | EnsureStr()),
        as_common_datasrc=as_common_datasrc,
        publish_depends=publish_depends,
        publish_by_default=publish_by_default,
        annex_wanted=annex_wanted_opt,
        annex_group=annex_group_opt,
        annex_groupwanted=annex_groupwanted_opt,
        since=Parameter(
            args=("--since",),
            constraints=EnsureStr() | EnsureNone(),
            doc="""limit processing to datasets that have been changed since a given
            state (by tag, branch, commit, etc). This can be used to create siblings
            for recently added subdatasets."""),
    )

    @staticmethod
    @datasetmethod(name='create_sibling')
    @eval_results
    def __call__(sshurl, name=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None, annex_group=None, annex_groupwanted=None,
                 inherit=False,
                 since=None):
        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #

        # possibly use sshurl to get the name in case it is not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL, if no inherit option"
                )
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified"
                )
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc))
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings"
            )
            # maybe this could be safely dropped -- still WiP

        if not sshurl:
            # TODO: maybe back up more before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL"
                    % ds
                )
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
            # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(refds_path, super_ds.path))

        # check the login URL
        sibling_ri = RI(sshurl)
        ssh_sibling = is_ssh(sibling_ri)
        if not (ssh_sibling or isinstance(sibling_ri, PathRI)):
            raise ValueError(
                "Unsupported SSH URL or path: '{0}', "
                "use ssh://host/path, host:path or path syntax".format(sshurl))

        if not name:
            name = sibling_ri.hostname if ssh_sibling else "local"
            lgr.debug(
                "No sibling name given. Using %s'%s' as sibling name",
                "URL hostname " if ssh_sibling else "",
                name)
        if since == '':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume one to one mapping of names from local branches
            # to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        #
        # parse the base dataset to find all subdatasets that need processing
        #
        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                # only a single path!
                path=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='create_sibling',
                # neither of the next two should happen anyway
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified=since,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent':
                # this can happen when there is `since`, but we have no
                # use for anything but datasets here
                continue
            checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
                if ap.get('state', None) != 'absent' \
                else []
            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependencies as well???
                unknown_deps = set(assure_list(publish_depends)).difference(checkds_remotes)
                if unknown_deps:
                    ap['status'] = 'error'
                    ap['message'] = (
                        'unknown sibling(s) specified as publication dependency: %s',
                        unknown_deps)
                    yield ap
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                ap['status'] = 'error' if existing == 'error' else 'notneeded'
                ap['message'] = (
                    "sibling '%s' already configured (specify alternative name, or force "
                    "reconfiguration via --existing",
                    name)
                yield ap
                continue
            to_process.append(ap)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if ssh_sibling:
            # request ssh connection:
            lgr.info("Connecting ...")
            shell = ssh_manager.get_connection(sshurl)
        else:
            shell = _RunnerAdapter()
            sibling_ri.path = str(resolve_path(sibling_ri.path, dataset))
            if target_dir:
                target_dir = opj(sibling_ri.path, target_dir)

        if target_dir is None:
            if sibling_ri.path:
                target_dir = sibling_ri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%RELNAME" not in target_dir

        if not shell.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg="It's required on the {} machine to create a sibling"
                    .format('remote' if ssh_sibling else 'local'))

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom to make test
        # below valid (existing directories would cause the machinery to halt)
        # But we need to run post-update hook in depth-first fashion, so
        # would only collect first and then run (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name,
                current_ds,
                refds_path,
                shell,
                replicate_local_structure,
                sibling_ri,
                target_dir,
                target_url,
                target_pushurl,
                existing,
                shared,
                group,
                publish_depends,
                publish_by_default,
                ui,
                as_common_datasrc,
                annex_wanted,
                annex_group,
                annex_groupwanted,
                inherit
            )
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == refds_path and ui:
                lgr.info("Uploading web interface to %s" % path)
                try:
                    CreateSibling.upload_web_interface(path, shell, shared, ui)
                except CommandError as e:
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        exc_str(e))
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # in reverse order would be depth first
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                shell("cd {} "
                      "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                      "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if currentds_ap['path'] not in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap

    @staticmethod
    def _run_on_ds_ssh_remote(ds, name, ssh, cmd):
        """Given a dataset, and name of the remote, run command via ssh

        Parameters
        ----------
        cmd: str
          Will be .format()'ed given the `path` to the dataset on remote

        Returns
        -------
        out

        Raises
        ------
        CommandError
        """
        remote_url = CreateSibling._get_remote_url(ds, name)
        remote_ri = RI(remote_url)
        out, err = ssh(cmd.format(path=sh_quote(remote_ri.path)))
        if err:
            lgr.warning("Got stderr while calling ssh: %s", err)
        return out

    @staticmethod
    def _get_ds_remote_shared_setting(ds, name, ssh):
        """Figure out setting of sharedrepository for dataset's `name` remote"""
        shared = None
        try:
            # TODO -- we might need to expanduser taking .user into account
            # but then it must be done also on remote side
            out = CreateSibling._run_on_ds_ssh_remote(
                ds, name, ssh,
                'git -C {path} config --get core.sharedrepository'
            )
            shared = out.strip()
        except CommandError as e:
            lgr.debug(
                "Could not figure out remote shared setting of %s for %s due "
                "to %s",
                ds, name, exc_str(e)
            )
            # could well be ok if e.g. not shared
            # TODO: maybe a more detailed analysis?
        return shared

    @staticmethod
    def _has_active_postupdate(ds, name, ssh):
        """Figure out either has active post-update hook

        Returns
        -------
        bool or None
          None if something went wrong and we could not figure it out
        """
        has_active_post_update = None
        try:
            # TODO -- we might need to expanduser taking .user into account
            # but then it must be done also on remote side
            out = CreateSibling._run_on_ds_ssh_remote(
                ds, name, ssh,
                'cd {path} && [ -x .git/hooks/post-update ] && echo yes || echo no'
            )
            out = out.strip()
            assert out in ('yes', 'no')
            has_active_post_update = out == "yes"
        except CommandError as e:
            lgr.debug(
                "Could not figure out either %s on remote %s has active "
                "post_update hook due to %s",
                ds, name, exc_str(e)
            )
        return has_active_post_update

    @staticmethod
    def _get_remote_url(ds, name):
        """A little helper to get url from pushurl or from url if not defined"""
        # take pushurl if present, if not -- just a url
        url = ds.config.get('remote.%s.pushurl' % name) or \
            ds.config.get('remote.%s.url' % name)
        if not url:
            raise ValueError(
                "%s had neither pushurl or url defined for %s" % (ds, name)
            )
        return url

    @staticmethod
    def init_remote_repo(path, ssh, shared, dataset, description=None):
        cmd = "git -C {} init{}".format(
            sh_quote(path),
            " --shared='{}'".format(sh_quote(shared)) if shared else '')
        try:
            ssh(cmd)
        except CommandError as e:
            lgr.error("Initialization of remote git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, exc_str(e)))
            return False

        if isinstance(dataset.repo, AnnexRepo):
            # init remote git annex repo (part fix of #463)
            try:
                ssh(
                    "git -C {} annex init {}".format(
                        sh_quote(path),
                        sh_quote(description)
                        if description else '')
                )
            except CommandError as e:
                lgr.error("Initialization of remote git annex repository failed at %s."
                          "\nError: %s\nSkipping ..." % (path, exc_str(e)))
                return False
        return True

    @staticmethod
    def create_postupdate_hook(path, ssh, dataset):
        # location of post-update hook file, logs folder on remote target
        hooks_remote_dir = opj(path, '.git', 'hooks')
        # make sure hooks directory exists (see #1251)
        ssh('mkdir -p {}'.format(sh_quote(hooks_remote_dir)))
        hook_remote_target = opj(hooks_remote_dir, 'post-update')

        # create json command for current dataset
        log_filename = 'datalad-publish-hook-$(date +%s).log' % TIMESTAMP_FMT
        hook_content = r'''#!/bin/bash

git update-server-info

#
# DataLad
#
# (Re)generate meta-data for DataLad Web UI and possibly init new submodules
dsdir="$(dirname $0)/../.."
logfile="$dsdir/{WEB_META_LOG}/{log_filename}"

if [ ! -e "$dsdir/.git" ]; then
  echo Assumption of being under .git has failed >&2
  exit 1
fi

mkdir -p "$dsdir/{WEB_META_LOG}"  # assure logs directory exists

( which datalad > /dev/null \
  && ( cd "$dsdir"; GIT_DIR="$PWD/.git" datalad ls -a --json file .; ) \
  || echo "E: no datalad found - skipping generation of indexes for web frontend"; \
) &> "$logfile"
'''.format(WEB_META_LOG=WEB_META_LOG, **locals())

        with make_tempfile(content=hook_content) as tempf:
            # create post_update hook script
            # upload hook to dataset
            ssh.put(tempf, hook_remote_target)
        # and make it executable
        ssh('chmod +x {}'.format(sh_quote(hook_remote_target)))

    @staticmethod
    def upload_web_interface(path, ssh, shared, ui):
        # path to web interface resources on local
        webui_local = opj(dirname(datalad.__file__), 'resources', 'website')
        # local html to dataset
        html_local = opj(webui_local, "index.html")

        # name and location of web-interface html on target
        html_targetname = ui if isinstance(ui, str) else "index.html"
        html_target = opj(path, html_targetname)

        # upload ui html to target
        ssh.put(html_local, html_target)

        # upload assets to the dataset
        webresources_local = opj(webui_local, 'assets')
        webresources_remote = opj(path, WEB_HTML_DIR)
        ssh('mkdir -p {}'.format(sh_quote(webresources_remote)))
        ssh.put(webresources_local, webresources_remote, recursive=True)

        # minimize and upload js assets
        for js_file in glob(opj(webresources_local, 'js', '*.js')):
            with open(js_file) as asset:
                try:
                    from jsmin import jsmin
                    # jsmin = lambda x: x   # no minimization
                    minified = jsmin(asset.read())                      # minify asset
                except ImportError:
                    lgr.warning(
                        "Will not minify web interface javascript, no jsmin available")
                    minified = asset.read()                             # no minify available
                with make_tempfile(content=minified) as tempf:          # write minified to tempfile
                    js_name = js_file.split('/')[-1]
                    ssh.put(tempf, opj(webresources_remote, 'assets', 'js', js_name))  # and upload js

        # explicitly make web+metadata dir of dataset world-readable, if shared set to 'all'
        mode = None
        if shared in (True, 'true', 'all', 'world', 'everybody'):
            mode = 'a+rX'
        elif shared == 'group':
            mode = 'g+rX'
        elif str(shared).startswith('0'):
            mode = shared

        if mode:
            ssh('chmod {} -R {} {}'.format(
                mode,
                sh_quote(dirname(webresources_remote)),
                sh_quote(opj(path, 'index.html'))))
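
For orientation, a possible invocation of the command defined above, assuming
it is exposed via datalad.api as the @datasetmethod decorator suggests (host,
path, and sibling name are placeholders; only options defined in _params_
above are used):

from datalad.api import create_sibling

create_sibling(
    'me@server.example.org:/srv/datasets/myds',  # sshurl (placeholder)
    name='server',
    existing='skip',    # do not fail if a sibling of this name already exists
    recursive=True,     # also create siblings for all subdatasets
)
# roughly equivalent command line:
#   datalad create-sibling -s server --existing=skip -r \
#       me@server.example.org:/srv/datasets/myds
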
Example #26
0
class ExportToFigshare(Interface):
    """Export the content of a dataset as a ZIP archive to figshare

    Very quick and dirty approach.  Ideally figshare should be supported as
    a proper git annex special remote.  Unfortunately, figshare does not support
    having directories, and can store only a flat list of files.  That makes
    any sensible publishing of complete datasets impossible.

    The only workaround is to publish the dataset as a zip-ball, where the entire
    content is wrapped into a .zip archive for which figshare would provide a
    navigator.
    """

    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import (
        EnsureChoice,
        EnsureInt,
        EnsureNone,
        EnsureStr,
    )

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""specify the dataset to export. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        filename=Parameter(
            args=("filename", ),
            metavar="PATH",
            nargs='?',
            doc="""File name of the generated ZIP archive. If no file name is
            given the archive will be generated in the top directory
            of the dataset and will be named: datalad_<dataset_uuid>.zip.""",
            constraints=EnsureStr() | EnsureNone()),
        no_annex=Parameter(
            args=("--no-annex", ),
            action="store_true",
            doc="""By default the generated .zip file would be added to annex,
            and all files would get registered in git-annex to be available
            from such a tarball. Also upon upload we will register for that
            archive to be a possible source for it in annex. Setting this flag
            disables this behavior."""),
        missing_content=Parameter(
            args=("--missing-content", ),
            doc="""By default, any discovered file with missing content will
            result in an error and the plugin is aborted. Setting this to
            'continue' will issue warnings instead of failing on error. The
            value 'ignore' will only report the problem at the 'debug' log
            level. The latter two can be helpful when generating a ZIP archive
            from a dataset where some file content is not available
            locally.""",
            constraints=EnsureChoice("error", "continue", "ignore")),
        # article_id=Parameter(
        #     args=("--project-id",),
        #     metavar="ID",
        #     doc="""If given, article (if article_id is not provided) will be
        #     created in that project.""",
        #     constraints=EnsureInt() | EnsureNone()),
        article_id=Parameter(args=("--article-id", ),
                             metavar="ID",
                             doc="""Which article to publish to.""",
                             constraints=EnsureInt() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='export_to_figshare')
    @eval_results
    def __call__(
        dataset,
        filename=None,
        missing_content='error',
        no_annex=False,
        # TODO: support working with projects and articles within them
        # project_id=None,
        article_id=None):
        import os
        import logging
        lgr = logging.getLogger('datalad.plugin.export_to_figshare')

        from datalad.ui import ui
        from datalad.api import add_archive_content
        from datalad.api import export_archive
        from datalad.distribution.dataset import require_dataset
        from datalad.support.annexrepo import AnnexRepo

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='export to figshare')

        if not isinstance(dataset.repo, AnnexRepo):
            raise ValueError(
                "%s is not an annex repo, so annexification could be done" %
                dataset)

        if dataset.repo.dirty:
            raise RuntimeError(
                "Paranoid authors of DataLad refuse to proceed in a dirty repository"
            )
        if filename is None:
            filename = dataset.path
        lgr.info(
            "Exporting current tree as an archive under %s since figshare "
            "does not support directories", filename)
        archive_out = next(
            export_archive(dataset,
                           filename=filename,
                           archivetype='zip',
                           missing_content=missing_content,
                           return_type="generator"))
        assert archive_out['status'] == 'ok'
        fname = archive_out['path']

        lgr.info("Uploading %s to figshare", fname)
        figshare = FigshareRESTLaison()

        if not article_id:
            # TODO: ask if it should be an article within a project
            if ui.is_interactive:
                # or should we just upload to a new article?
                if ui.yesno(
                        "Would you like to create a new article to upload to?  "
                        "If not - we will list existing articles",
                        title="Article"):
                    article = figshare.create_article(
                        title=_enter_title(ui, dataset))
                    lgr.info(
                        "Created a new (private) article %(id)s at %(url_private_html)s. "
                        "Please visit it, enter additional meta-data and make public",
                        article)
                    article_id = article['id']
                else:
                    article_id = int(
                        ui.question(
                            "Which of the articles should we upload to.",
                            choices=list(map(str,
                                             figshare.get_article_ids()))))
            if not article_id:
                raise ValueError("We need an article to upload to.")

        file_info = figshare.upload_file(
            fname, files_url='account/articles/%s/files' % article_id)

        if no_annex:
            lgr.info("Removing generated tarball")
            unlink(fname)
        else:
            # I will leave all the complaining etc to the dataset add if path
            # is outside etc
            lgr.info("'Registering' %s within annex", fname)
            repo = dataset.repo
            repo.add(fname, git=False)
            key = repo.get_file_key(fname)
            lgr.info("Adding URL %(download_url)s for it", file_info)
            repo.call_annex([
                "registerurl", '-c', 'annex.alwayscommit=false', key,
                file_info['download_url']
            ])

            lgr.info("Registering links back for the content of the archive")
            add_archive_content(
                fname,
                annex=dataset.repo,
                delete_after=True,  # just remove extracted into a temp dir
                allow_dirty=True,  # since we have a tarball
                commit=
                False  # we do not want to commit anything we have done here
            )

            lgr.info("Removing generated and now registered in annex archive")
            repo.drop(key, key=True, options=['--force'])
            repo.remove(fname, force=True)  # remove the tarball

            # if annex in {'delete'}:
            #     dataset.repo.remove(fname)
            # else:
            #     # kinda makes little sense I guess.
            #     # Made more sense if export_archive could export an arbitrary treeish
            #     # so we could create a branch where to dump and export to figshare
            #     # (kinda closer to my idea)
            #     dataset.save(fname, message="Added the entire dataset into a zip file")

        # TODO: add to downloader knowledge about figshare token so it could download-url
        # those zipballs before they go public
        yield dict(
            status='ok',
            # TODO: add article url (which needs to be queried if only the ID is known)
            message="Published archive {}".format(file_info['download_url']),
            file_info=file_info,
            path=dataset.path,
            action='export_to_figshare',
            logger=lgr)
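
For illustration, a possible invocation of the plugin above, assuming it is
reachable via datalad.api (the article id is a placeholder; the dataset must
be clean and annex-backed, as enforced in __call__):

from datalad.api import export_to_figshare

export_to_figshare(
    dataset='.',                 # dataset to export (current directory)
    article_id=1234567,          # pre-existing figshare article (placeholder)
    missing_content='continue',  # warn instead of erroring on absent content
)
# roughly equivalent command line:
#   datalad export-to-figshare -d . --article-id 1234567 --missing-content continue
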
Example #27
0
class DownloadURL(Interface):
    """Download content

    It allows for a uniform download interface to various supported URL
    schemes (see command help for details), re-using or asking for
    authentication details maintained by datalad.
    """

    _params_ = dict(
        urls=Parameter(
            doc="""URL(s) to be downloaded. Supported protocols: {}""".format(
                ", ".join(map(repr, sorted(Provider.DOWNLOADERS)))),
            constraints=EnsureStr(),  # TODO: EnsureURL
            metavar='url',
            args=('urls', ),
            nargs='+'),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='PATH',
            doc="""specify the dataset to add files to. If no dataset is given,
            an attempt is made to identify the dataset based on the current
            working directory. Use [CMD: --nosave CMD][PY: save=False PY] to
            prevent adding files to the dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        overwrite=Parameter(
            args=("-o", "--overwrite"),
            action="store_true",
            doc="""flag to overwrite it if target file exists"""),
        path=Parameter(
            args=("-O", "--path"),
            doc="""target for download. If the path has a trailing separator,
            it is treated as a directory, and each specified URL is downloaded
            under that directory to a base name taken from the URL. Without a
            trailing separator, the value specifies the name of the downloaded
            file (file name extensions inferred from the URL may be added to it,
            if they are not yet present) and only a single URL should be given.
            In both cases, leading directories will be created if needed. This
            argument defaults to the current directory.""",
            constraints=EnsureStr() | EnsureNone()),
        archive=Parameter(
            args=("--archive", ),
            action="store_true",
            doc="""pass the downloaded files to [CMD: :command:`datalad
            add-archive-content --delete` CMD][PY: add_archive_content(...,
            delete=True) PY]"""),
        save=nosave_opt,
        message=save_message_opt)

    _examples_ = [
        dict(
            text="Download files from an http and S3 URL",
            code_py=
            "download_url(urls=['http://example.com/file.dat', 's3://bucket/file2.dat'])",
            code_cmd=
            "datalad download-url http://example.com/file.dat s3://bucket/file2.dat"
        ),
        dict(
            text="Download a file to a path and provide a commit message",
            code_py=
            "download_url(urls='s3://bucket/file2.dat', message='added a file', path='myfile.dat')",
            code_cmd="""datalad download-url -m 'added a file' -O myfile.dat \\
                         s3://bucket/file2.dat"""),
        dict(text="Append a trailing slash to the target path "
             "to download into a specified directory",
             code_py=
             "download_url(['http://example.com/file.dat'], path='data/')",
             code_cmd=
             "datalad download-url --path=data/ http://example.com/file.dat"),
        dict(
            text="Leave off the trailing slash to download into a regular file",
            code_py=
            "download_url(['http://example.com/file.dat'], path='data')",
            code_cmd=
            "datalad download-url --path=data http://example.com/file.dat"),
    ]

    @staticmethod
    @datasetmethod(name="download_url")
    @eval_results
    def __call__(urls,
                 *,
                 dataset=None,
                 path=None,
                 overwrite=False,
                 archive=False,
                 save=True,
                 message=None):
        from ..downloaders.http import HTTPDownloader
        from ..downloaders.providers import Providers

        ds = None
        if save or dataset:
            try:
                ds = require_dataset(dataset,
                                     check_installed=True,
                                     purpose='download urls')
            except NoDatasetFound:
                pass

        common_report = {"action": "download_url", "ds": ds}

        got_ds_instance = isinstance(dataset, Dataset)
        dir_is_target = not path or str(path).endswith(op.sep)
        path = str(resolve_path(path or op.curdir, ds=dataset))
        if dir_is_target:
            # resolve_path() doesn't preserve trailing separators. Add one for
            # the download() call.
            path = path + op.sep
        urls = ensure_list_from_str(urls)

        if not dir_is_target:
            if len(urls) > 1:
                yield get_status_dict(
                    status="error",
                    message=
                    ("When specifying multiple urls, --path should point to "
                     "a directory target (with a trailing separator). Got %r",
                     path),
                    type="file",
                    path=path,
                    **common_report)
                return
            if archive:
                # make sure the file suffix indicated by a URL is preserved
                # so that any further archive processing doesn't have to
                # employ mime type inspection in order to determine the archive
                # type
                from datalad.support.network import URL
                suffixes = PurePosixPath(URL(urls[0]).path).suffixes
                if not Path(path).suffixes == suffixes:
                    path += ''.join(suffixes)
            # we know that we have a single URL
            # download() would be fine getting an existing directory and
            # downloading the URL underneath it, but let's enforce a trailing
            # slash here for consistency.
            if op.isdir(path):
                yield get_status_dict(
                    status="error",
                    message=(
                        "Non-directory path given (no trailing separator) "
                        "but a directory with that name (after adding archive "
                        "suffix) exists"),
                    type="file",
                    path=path,
                    **common_report)
                return

        # TODO setup fancy ui.progressbars doing this in parallel and reporting overall progress
        # in % of urls which were already downloaded
        providers = Providers.from_config_files()
        downloaded_paths = []
        path_urls = {}
        need_datalad_remote = False
        for url in urls:
            # somewhat "ugly"
            downloader = providers.get_provider(url).get_downloader(url)
            try:
                downloaded_path = downloader.download(url,
                                                      path=path,
                                                      overwrite=overwrite)
            except Exception as e:
                ce = CapturedException(e)
                yield get_status_dict(status="error",
                                      message=str(ce),
                                      type="file",
                                      path=path,
                                      exception=ce,
                                      **common_report)
            else:
                if not need_datalad_remote \
                   and (downloader.authenticator or downloader.credential or
                        type(downloader) != HTTPDownloader):
                    need_datalad_remote = True
                downloaded_paths.append(downloaded_path)
                path_urls[downloaded_path] = url
                yield get_status_dict(status="ok",
                                      type="file",
                                      path=downloaded_path,
                                      **common_report)

        if downloaded_paths and save and ds is not None:
            msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

            for r in Save()(
                    downloaded_paths,
                    message=msg,
                    # ATTN: Pass the original dataset argument to
                    # preserve relative path handling semantics.
                    dataset=dataset,
                    return_type="generator",
                    result_renderer='disabled',
                    result_xfm=None,
                    result_filter=None,
                    on_failure="ignore"):
                yield r

            ds_repo = ds.repo
            if isinstance(ds_repo, AnnexRepo):
                if need_datalad_remote:
                    from datalad.customremotes.base import (
                        ensure_datalad_remote, )
                    ensure_datalad_remote(ds_repo,
                                          autoenable=True,
                                          encryption=None)

                if got_ds_instance:
                    # Paths in `downloaded_paths` are already relative to the
                    # dataset.
                    rpaths = dict(zip(downloaded_paths, downloaded_paths))
                else:
                    # Paths in `downloaded_paths` are already relative to the
                    # current working directory. Take these relative to the
                    # dataset for use with the AnnexRepo method calls.
                    rpaths = {}
                    for orig_path, resolved in zip(
                            downloaded_paths,
                            resolve_path(downloaded_paths, ds=dataset)):
                        rpath = path_under_rev_dataset(ds, resolved)
                        if rpath:
                            rpaths[str(rpath)] = orig_path
                        else:
                            lgr.warning("Path %s not under dataset %s",
                                        orig_path, ds)
                annex_paths = [
                    p for p, annexed in zip(
                        rpaths, ds_repo.is_under_annex(list(rpaths.keys())))
                    if annexed
                ]
                if annex_paths:
                    for path in annex_paths:
                        url = path_urls[rpaths[path]]
                        try:
                            # The file is already present. This is just to
                            # register the URL.
                            ds_repo.add_url_to_file(
                                path,
                                url,
                                # avoid batch mode for single files
                                # https://github.com/datalad/datalad/issues/2849
                                batch=len(annex_paths) > 1,
                                # bypass URL size check, we already have the file
                                options=['--relaxed'])
                        except CommandError as exc:
                            lgr.warning("Registering %s with %s failed: %s",
                                        path, url, CapturedException(exc))

                    if archive:
                        for path in annex_paths:
                            yield from ds.add_archive_content(
                                path,
                                delete=True,
                                on_failure='ignore',
                                return_type='generator',
                                result_renderer='disabled')
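
A small standalone sketch (hypothetical file names, plain pathlib, no DataLad
involved) of the suffix-preservation step used above with --archive: the
suffixes of the URL's file name are appended to the target path if not already
present, so later archive handling can rely on the extension instead of MIME
inspection.

from pathlib import Path, PurePosixPath

url_path = '/pub/data/file.tar.gz'   # path component of the URL (hypothetical)
target = 'local_copy'                # --path value without the suffixes
suffixes = PurePosixPath(url_path).suffixes
if Path(target).suffixes != suffixes:
    target += ''.join(suffixes)
print(target)  # -> local_copy.tar.gz
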
Example #28
0
class Get(Interface):
    """Get any dataset content (files/directories/subdatasets).

    This command only operates on dataset content. To obtain a new independent
    dataset from some source use the `install` command.

    By default this command operates recursively within a dataset, but not
    across potential subdatasets, i.e. if a directory is provided, all files in
    the directory are obtained. Recursion into subdatasets is supported too. If
    enabled, relevant subdatasets are detected and installed in order to
    fulfill a request.

    Known data locations for each requested file are evaluated and data are
    obtained from some available location (according to git-annex configuration
    and possibly assigned remote priorities), unless a specific source is
    specified.

    .. note::
      Power-user info: This command uses :command:`git annex get` to fulfill
      file handles.
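
    For example, `datalad get -n -r some/subdataset` would install a
    subdataset (and any subdatasets beneath it) without obtaining file
    content, while `datalad get some/file.dat` fetches the content of a
    single file.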
    """

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar="PATH",
            doc="""specify the dataset to perform the add operation on, in
            which case `path` arguments are interpreted as being relative
            to this dataset.  If no dataset is given, an attempt is made to
            identify a dataset for each input `path`""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar="PATH",
            doc="""path/name of the requested dataset component. The component
            must already be known to a dataset. To add new components to a
            dataset use the `add` command""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        source=Parameter(
            args=(
                "-s",
                "--source",
            ),
            metavar="LABEL",
            doc="""label of the data source to be used to fulfill requests.
            This can be the name of a dataset :term:`sibling` or another known
            source""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=Parameter(
            args=("--recursion-limit", ),
            metavar="LEVELS",
            constraints=EnsureInt() | EnsureChoice('existing') | EnsureNone(),
            doc="""limit recursion into subdataset to the given number of levels.
            Alternatively, 'existing' will limit recursion to subdatasets that already
            existed on the filesystem at the start of processing, and prevent new
            subdatasets from being obtained recursively."""),
        get_data=Parameter(
            args=(
                "-n",
                "--no-data",
            ),
            dest='get_data',
            action='store_false',
            doc=
            """whether to obtain data for all file handles. If disabled, `get`
            operations are limited to dataset handles.[CMD:  This option prevents data
            for file handles from being obtained CMD]"""),
        description=location_description,
        reckless=reckless_opt,
        # git_opts=git_opts,
        # annex_opts=annex_opts,
        # annex_get_opts=annex_get_opts,
        jobs=jobs_opt,
        verbose=verbose)

    # Note: Maybe use 'git annex find --not --in here' to get a list of all
    # files to actually get and give some kind of progress in terms of the
    # number of files processed ...

    @staticmethod
    @datasetmethod(name='get')
    @eval_results
    def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=False,
        #git_opts=None,
        #annex_opts=None,
        #annex_get_opts=None,
        jobs=None,
        verbose=False,
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try to locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enabled (potentially
        #    obtaining even more subdatasets)
        # 4. Send the info about which handles to get in each subdataset to
        #    git-annex, once at the very end

        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # remember which results we already reported, to avoid duplicates
        yielded_ds = []
        to_get = []
        unavailable_paths = []
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='get',
                # NOTE: Do not act upon unavailable paths yet! Done below after
                # testing which ones could be obtained
                unavailable_path_status='',
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('state', None) == 'absent' and ap.get(
                    'raw_input', False):
                # if this wasn't found, but directly requested, queue for further
                # exploration
                unavailable_paths.append(ap)
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                # do not report what hasn't arrived yet
                # also do not report the base dataset that is already
                # present -- no surprise
                yield dict(ap,
                           status='notneeded',
                           logger=lgr,
                           message='already installed')
                yielded_ds.append(ap['path'])
                ap['process_content'] = get_data
            to_get.append(ap)

        # explore the unknown
        for ap in sorted(unavailable_paths, key=lambda x: x['path']):
            lgr.debug("Investigate yet unavailable path %s", ap)
            # how close can we get?
            dspath = ap.get('parentds', get_dataset_root(ap['path']))
            if dspath is None:
                # nothing we can do for this path
                continue
            lgr.debug("Found containing dataset %s for path %s", dspath,
                      ap['path'])
            ds = Dataset(dspath)
            # now actually obtain whatever is necessary to get to this path
            containing_ds = [dspath]
            for res in _install_necessary_subdatasets(ds,
                                                      ap['path'],
                                                      reckless,
                                                      refds_path,
                                                      description=description):
                # yield immediately so errors could be acted upon outside, before
                # we continue
                if not (res['type'] == 'dataset'
                        and res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record, recursive below might now want to report
                        # a 'notneeded'
                        yielded_ds.append(res['path'])
                    yield res
                # update to the current innermost dataset
                containing_ds.append(res['path'])

            if len(containing_ds) < 2:
                # no subdataset was installed, hence if the path was unavailable
                # before it still is, no need to bother git annex
                ap.update(status='impossible', message='path does not exist')
                yield ap
                continue
            # important to only do the next for the innermost subdataset
            # as the `recursive` logic below relies on that!
            # set the correct parent, for a dataset this would be the second-last
            # reported subdataset
            ap.update(parentds=containing_ds[-1])
            if containing_ds[-1] == ap['path']:
                # the path actually refers to the last installed dataset
                ap.update(parentds=containing_ds[-2],
                          process_content=get_data,
                          type='dataset')
            to_get.append(ap)

        # results of recursive installation of yet undiscovered datasets
        rec_get = []
        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for ap in sorted(to_get, key=lambda x: x['path']):
                if ap['type'] not in ('dataset', 'directory') or not ap.get(
                        'raw_input', False):
                    # a non-directory cannot have content underneath
                    # also we only want to recurse into what was explicitly
                    # requested, to avoid duplication
                    continue
                subds = Dataset(ap['path'] if ap['type'] ==
                                'dataset' else ap['parentds'])
                lgr.info("Installing %s%s recursively", subds,
                         (" underneath %s" %
                          ap['path'] if subds.path != ap['path'] else ""))
                for res in _recursive_install_subds_underneath(
                        subds,
                        # `ap['path']` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        start=ap['path'],
                        refds_path=refds_path,
                        description=description):
                    # yield immediately so errors could be acted upon
                    # outside, before we continue
                    if not (res['type'] == 'dataset'
                            and res['path'] in yielded_ds):
                        # unless we reported on this dataset before
                        if res['type'] == 'dataset':
                            # make a record
                            yielded_ds.append(res['path'])
                    yield res
                    if not (res['status'] == 'ok'
                            and res['type'] == 'dataset'):
                        # not a dataset that was just installed, we just reported it
                        # upstairs, and can ignore it from now on
                        continue
                    # paranoia, so popular these days...
                    assert GitRepo.is_valid_repo(res['path'])
                    # keep a copy of the install record for `get` later on
                    get_ap = {
                        k: v
                        for k, v in res.items() if not k == 'status'
                    }
                    get_ap['process_content'] = get_data
                    rec_get.append(get_ap)

        if not get_data:
            # done already
            return

        # merge the two AP lists
        to_get.extend(rec_get)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_get,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        # hand over to git-annex, get files content,
        # report files in git as 'notneeded' to get
        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            # grab content, ignore subdataset entries
            content = [
                ap['path'] for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds.path
            ]
            if not content:
                # cut this short should there be nothing
                continue
            # needs to be an annex to get content
            if not isinstance(ds.repo, AnnexRepo):
                for r in results_from_paths(
                        content,
                        status='notneeded',
                        message="no dataset annex, content already present",
                        action='get',
                        logger=lgr,
                        refds=refds_path):
                    yield r
                continue
            respath_by_status = {}
            for res in ds.repo.get(
                    content,
                    options=['--from=%s' % source] if source else [],
                    jobs=jobs):
                res = annexjson2result(res,
                                       ds,
                                       type='file',
                                       logger=lgr,
                                       refds=refds_path)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    content,
                    respath_by_status,
                    dir_fail_msg='could not get some content in %s %s',
                    noinfo_dir_msg='nothing to get from %s',
                    noinfo_file_msg='already present',
                    action='get',
                    logger=lgr,
                    refds=refds_path):
                yield r

    @staticmethod
    def custom_result_summary_renderer(res):
        from datalad.ui import ui
        from os import linesep
        if not len(res):
            ui.message("Got nothing new")
            return

        nfiles = count_results(res, type='file')
        nsuccess_file = count_results(res, type='file', status='ok')
        nfailure = nfiles - nsuccess_file
        msg = "Tried to get %d %s that had no content yet." % (
            nfiles, single_or_plural("file", "files", nfiles))
        if nsuccess_file:
            msg += " Successfully obtained %d." % nsuccess_file
        if nfailure:
            msg += " Failed to obtain %d." % nfailure
        ui.message(msg)

        # if just a few or less than initially explicitly requested
        if len(res) < 10:
            msg = linesep.join([
                "{path}{type} ... {suc}".format(
                    suc=item.get('status'),
                    path=item.get('path'),
                    type=' [{}]'.format(item['type'])
                    if 'type' in item else '') for item in res
            ])
            ui.message(msg)
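
The generator-style implementation above is normally not invoked directly; the `eval_results` machinery exposes it as the `datalad get` command and as a dataset-bound Python method. A minimal, hypothetical usage sketch via that bound method (the dataset path and file name below are placeholders, not taken from the example):

from datalad.api import Dataset

# assumes './my-dataset' is an installed DataLad dataset (placeholder path)
ds = Dataset('./my-dataset')
# retrieve content for one file; get_data=False would only install subdatasets
for res in ds.get('data/file1.nii.gz',
                  get_data=True,
                  return_type='generator',
                  on_failure='ignore'):
    print(res['status'], res['path'])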
Example #29
0
class Uninstall(Interface):
    """Uninstall subdatasets

    This command can be used to uninstall any number of installed subdatasets.
    If a to-be-uninstalled subdataset contains presently installed subdatasets
    itself, their recursive removal has to be enabled explicitly, otherwise the
    command will exit with an error. This command will error if individual
    files or non-dataset directories are given as input (use the drop or
    remove command, depending on the desired goal), and it will not uninstall
    top-level datasets (i.e. datasets that are not a subdataset of another
    dataset; use the remove command for this purpose).

    By default, the availability of at least one remote copy for each currently
    available file in any dataset is verified. As these checks could lead to
    slow operation (network latencies, etc.), they can be disabled.

    Any number of paths to process can be given as input. Recursion into
    subdatasets needs to be explicitly enabled, while recursion into
    subdirectories within a dataset is always done automatically. An optional
    recursion limit is applied relative to each given input path.

    Examples:

      Uninstall a subdataset (undo installation)::

        ~/some/dataset$ datalad uninstall somesubdataset1

    """
    _action = 'uninstall'

    _params_ = dict(
        dataset=dataset_argument,
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="path/name of the component to be uninstalled",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        check=check_argument,
        if_dirty=if_dirty_opt,
    )

    @staticmethod
    @datasetmethod(name=_action)
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 check=True,
                 if_dirty='save-before'):

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='uninstall', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `uninstall`: requires at least a path or dataset"
            )

        to_uninstall = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                action='uninstall',
                # justification for status:
                # content need not be uninstalled where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # upfront sanity and compliance checks
            # check that we have no top-level datasets and no files to process
            if ap.get('type') == 'dataset' and \
                    not ap.get('state', None) == 'absent' and \
                    path_is_under([ap['path']]):  # wants a sequence!
                ap.update(
                    status='error',
                    message="refusing to uninstall current or parent directory"
                )
                yield ap
                continue
            if not ap.get('type', None) == 'dataset':
                ap.update(
                    status='impossible',
                    message=
                    "can only uninstall datasets (consider the `drop` command)"
                )
                yield ap
                continue
            # we only have dataset from here
            if not ap.get('parentds', None):
                ap.update(
                    status='error',
                    message=
                    "will not uninstall top-level dataset (consider `remove` command)"
                )
                yield ap
                continue
            if not ap['path'] == refds_path:
                ap['process_content'] = True
            to_uninstall.append(ap)

        # iterate over all datasets, starting at the bottom
        # to deinit contained submodules first
        for ap in sorted(to_uninstall, key=lambda x: x['path'], reverse=True):
            if ap.get('state', None) == 'absent':
                # already gone
                continue
            ds = Dataset(ap['path'])
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # we confirmed the super dataset presence above
            for r in _uninstall_dataset(ds,
                                        check=check,
                                        has_super=True,
                                        **res_kwargs):
                yield r
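
As with `get`, the class above is exposed both as the `datalad uninstall` command and as a dataset method. A small, hypothetical sketch of the Python API, run from within a superdataset (the subdataset name mirrors the docstring example; `check=False` skips the remote-availability verification described above):

from datalad.api import Dataset

# assumes the current directory is a DataLad superdataset containing
# an installed subdataset named 'somesubdataset1' (placeholder name)
superds = Dataset('.')
for res in superds.uninstall('somesubdataset1',
                             check=False,
                             return_type='generator',
                             on_failure='ignore'):
    print(res['status'], res['path'])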
Example #30
0
class AddReadme(Interface):
    """Add basic information about DataLad datasets to a README file

    The README file is added to the dataset and the addition is saved
    in the dataset.
    Note: Make sure that no unsaved modifications to your dataset's
    .gitattributes file exist.

    """
    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import (
        EnsureChoice,
        EnsureNone,
        EnsureStr,
    )

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""Dataset to add information to. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        filename=Parameter(
            args=("filename", ),
            metavar="PATH",
            nargs='?',
            doc="""Path of the README file within the dataset.""",
            constraints=EnsureStr()),
        existing=Parameter(
            args=("--existing", ),
            doc="""How to react if a file with the target name already exists:
            'skip': do nothing; 'append': append information to the existing
            file; 'replace': replace the existing file with new content.""",
            constraints=EnsureChoice("skip", "append", "replace")),
    )

    @staticmethod
    @datasetmethod(name='add_readme')
    @eval_results
    def __call__(filename='README.md', *, dataset=None, existing='skip'):
        from os.path import lexists
        from os.path import join as opj
        from io import open
        import logging
        lgr = logging.getLogger('datalad.local.add_readme')

        from datalad.distribution.dataset import require_dataset
        from datalad.support.annexrepo import AnnexRepo
        from datalad.utils import ensure_list

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='add README')

        fpath = opj(dataset.path, filename)
        res_kwargs = dict(action='add_readme', path=fpath)

        if lexists(fpath) and existing == 'skip':
            yield dict(
                res_kwargs,
                status='notneeded',
                message='file already exists, and not appending content')
            return

        # unlock, file could be annexed
        if lexists(fpath):
            yield from dataset.unlock(fpath,
                                      return_type='generator',
                                      result_renderer='disabled')
        if not lexists(fpath):
            # if we have an annex repo, shall the README go to Git or annex?

            if isinstance(dataset.repo, AnnexRepo) \
                and 'annex.largefiles' not in \
                    dataset.repo.get_gitattributes(filename).get(filename, {}):
                # configure the README to go into Git
                dataset.repo.set_gitattributes([(filename, {
                    'annex.largefiles': 'nothing'
                })])
                yield from dataset.save(
                    path='.gitattributes',
                    message="[DATALAD] Configure README to be in Git",
                    to_git=True,
                    return_type='generator',
                    result_renderer='disabled')

        # get any metadata on the dataset itself
        dsinfo = dataset.metadata('.',
                                  reporton='datasets',
                                  return_type='item-or-list',
                                  result_renderer='disabled',
                                  on_failure='ignore')
        meta = {}
        if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok':
            lgr.warning(
                "Could not obtain dataset metadata, proceeding without")
            dsinfo = {}
        else:
            # flatten possibly existing multiple metadata sources
            for src in dsinfo['metadata']:
                if src.startswith('@'):
                    # not a source
                    continue
                meta.update(dsinfo['metadata'][src])

        metainfo = ''
        for label, content in (
            ('', meta.get('description', meta.get('shortdescription', ''))),
            ('Author{}'.format(
                's' if isinstance(meta.get('author', None), list) else ''),
             u'\n'.join([
                 u'- {}'.format(a) for a in ensure_list(meta.get('author', []))
             ])),
            ('Homepage', meta.get('homepage', '')),
            ('Reference', meta.get('citation', '')),
            ('License', meta.get('license', '')),
            ('Keywords', u', '.join([
                u'`{}`'.format(k) for k in ensure_list(meta.get('tag', []))
            ])),
            ('Funding', meta.get('fundedby', '')),
        ):
            if label and content:
                metainfo += u'\n\n### {}\n\n{}'.format(label, content)
            elif content:
                metainfo += u'\n\n{}'.format(content)

        for key in 'title', 'name', 'shortdescription':
            if 'title' in meta:
                break
            if key in meta:
                meta['title'] = meta[key]

        default_content = u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

## DataLad datasets and how to use them

This repository is a [DataLad](https://www.datalad.org/) dataset. It provides
fine-grained data access down to the level of individual files, and allows for
tracking future updates. In order to use this repository for data retrieval,
[DataLad](https://www.datalad.org/) is required. It is a free and open source
command line tool, available for all major operating systems, and builds up on
Git and [git-annex](https://git-annex.branchable.com/) to allow sharing,
synchronizing, and version controlling collections of large files.

More information on DataLad and [how to install](http://handbook.datalad.org/en/latest/intro/installation.html)
it can be found in the [DataLad Handbook](https://handbook.datalad.org/en/latest/index.html).

### Get the dataset

A DataLad dataset can be `cloned` by running

```
datalad clone <url>
```

Once a dataset is cloned, it is a light-weight directory on your local machine.
At this point, it contains only small metadata and information on the identity
of the files in the dataset, but not actual *content* of the (sometimes large)
data files.

### Retrieve dataset content

After cloning a dataset, you can retrieve file contents by running

```
datalad get <path/to/directory/or/file>
```

This command will trigger a download of the files, directories, or subdatasets
you have specified.

DataLad datasets can contain other datasets, so-called *subdatasets*.  If you
clone the top-level dataset, subdatasets do not yet contain metadata and
information on the identity of files, but appear to be empty directories. In
order to retrieve file availability metadata in subdatasets, run

```
datalad get -n <path/to/subdataset>
```

Afterwards, you can browse the retrieved metadata to find out about subdataset
contents, and retrieve individual files with `datalad get`.  If you use
`datalad get <path/to/subdataset>`, all contents of the subdataset will be
downloaded at once.

### Stay up-to-date

DataLad datasets can be updated. The command `datalad update` will *fetch*
updates and store them on a different branch (by default
`remotes/origin/master`). Running

```
datalad update --merge
```

will *pull* available updates and integrate them in one go.

### Find out what has been done

DataLad datasets contain their history in the ``git log``.  By running ``git
log`` (or a tool that displays Git history) in the dataset or on specific
files, you can find out what has been done to the dataset or to individual
files by whom, and when.
""".format(
            title='Dataset "{}"'.format(meta['title'])
            if 'title' in meta else 'About this dataset',
            metainfo=metainfo,
            id=u' (id: {})'.format(dataset.id) if dataset.id else '',
        )

        with open(fpath,
                  'a' if existing == 'append' else 'w',
                  encoding='utf-8') as fp:
            fp.write(default_content)
            yield dict(status='ok',
                       path=fpath,
                       type='file',
                       action='add_readme')

        yield from dataset.save(fpath,
                                message='[DATALAD] added README',
                                result_filter=None,
                                result_xfm=None,
                                return_type='generator',
                                result_renderer='disabled')
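
For completeness, a small, hypothetical sketch of using `add_readme` through the Python API (it assumes the current directory is an installed dataset; `existing='replace'` overwrites an already present README, as described in the parameter documentation above):

from datalad.api import Dataset

ds = Dataset('.')  # assumes '.' is an installed DataLad dataset
# generate (or replace) README.md from whatever dataset metadata is available
for res in ds.add_readme('README.md',
                         existing='replace',
                         return_type='generator',
                         on_failure='ignore'):
    print(res['status'], res['path'])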