Example no. 1
class AnnotatePaths(Interface):
    """Analyze and act upon input paths

    Given paths (or more generally location requests) are inspected and
    annotated with a number of properties. A list of recognized properties
    is provided below.

    || PYTHON >>Input `paths` for this command can either be un-annotated
    (raw) path strings, or already (partially) annotated paths. In the latter
    case, further annotation is limited to yet-unknown properties, and is
    potentially faster than initial annotation.<< PYTHON ||


    *Recognized path properties*

    {proplist}

    In the case of enabled modification detection the results may contain
    additional properties regarding the nature of the modification. See the
    documentation of the `diff` command for details.

    """
    _docs_ = dict(proplist='\n\n    '.join('"{}"\n{}'.format(
        k,
        textwrap.fill(known_props[k],
                      initial_indent='        ',
                      subsequent_indent='        '))
                                           for k in sorted(known_props)))

    _params_ = dict(
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="""path to be annotated""",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""an optional reference/base dataset for the paths""",
            constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        action=Parameter(args=("--action", ),
                         metavar="LABEL",
                         doc="""an "action" property value to include in the
            path annotation""",
                         constraints=EnsureStr() | EnsureNone()),
        unavailable_path_status=Parameter(
            args=("--unavailable-path-status", ),
            metavar="LABEL",
            doc="""a "status" property value to include in the
            annotation for paths that are underneath a dataset, but
            do not exist on the filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_msg=Parameter(
            args=("--unavailable-path-msg", ),
            metavar="message",
            doc="""a "message" property value to include in the
            annotation for paths that are underneath a dataset, but
            do not exist on the filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        nondataset_path_status=Parameter(
            args=("--nondataset-path-status", ),
            metavar="LABEL",
            doc="""a "status" property value to include in the
            annotation for paths that are not underneath any dataset""",
            constraints=EnsureStr() | EnsureNone()),
        force_parentds_discovery=Parameter(
            args=("--no-parentds-discovery", ),
            dest='force_parentds_discovery',
            action='store_false',
            doc="""Flag to disable reports of parent dataset information for any
            path, in particular dataset root paths. Disabling saves on command
            run time, if this information is not needed."""),
        force_subds_discovery=Parameter(
            args=("--no-subds-discovery", ),
            action='store_false',
            dest='force_subds_discovery',
            doc="""Flag to disable reporting type='dataset' for subdatasets, even
            when they are not installed, or their mount point directory doesn't
            exist. Disabling saves on command run time, if this information is
            not needed."""),
        force_untracked_discovery=Parameter(
            args=("--no-untracked-discovery", ),
            action='store_false',
            dest='force_untracked_discovery',
            doc="""Flag to disable discovery of untracked changes.
                Disabling saves on command run time, if this information is
                not needed."""),
        force_no_revision_change_discovery=Parameter(
            args=("--revision-change-discovery", ),
            action='store_false',
            dest='force_no_revision_change_discovery',
            doc="""Flag to disable discovery of changes which were not yet committed.
            Disabling saves on command run time, if this information is
            not needed."""),
        modified=Parameter(
            args=("--modified", ),
            nargs='?',
            const=True,
            constraints=EnsureStr() | EnsureBool() | EnsureNone(),
            doc="""comparison reference specification for modification detection.
            This can be (mostly) anything that `git diff` understands (commit,
            treeish, tag, etc). See the documentation of `datalad diff --revision`
            for details. Unmodified paths will not be annotated. If a requested
            path was not modified but some content underneath it was, then the
            request is replaced by the modified paths and those are annotated
            instead. This option can be used [PY: with `True` as PY][CMD: without CMD]
            an argument to test against changes that have been made, but have not
            yet been staged for a commit."""))

    @staticmethod
    @datasetmethod(name='annotate_paths')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 action=None,
                 unavailable_path_status='',
                 unavailable_path_msg=None,
                 nondataset_path_status='error',
                 force_parentds_discovery=True,
                 force_subds_discovery=True,
                 force_no_revision_change_discovery=True,
                 force_untracked_discovery=True,
                 modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None
                                     or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset (non-given or found)"
            )

        # prep common result props
        res_kwargs = dict(action=action if action else 'annotate_path',
                          refds=refds_path,
                          logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset --
                    # it was given as the reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(refds, refds_path, action,
                                         recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(**dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be merged with the treatment of non-dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases are dealt
            # with again later on.
            preserved_paths = [
                r for r in requested_paths
                if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen whenever `path` is a subdataset that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or
                                           (refds_path
                                            and _with_sep(oneupdir).startswith(
                                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(
                            normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                (path_type == 'dataset'
                 and 'registered_subds' not in path_props)
                    or path_type == 'directory' or not lexists(path)):
                from datalad.distribution.subdatasets import Subdatasets
                # if the path doesn't exist, is labeled a directory, or is a
                # dataset without subdataset registration info -> record whether
                # this is a known subdataset of its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(fulfilled=None,
                                                   recursive=False,
                                                   result_xfm=None,
                                                   result_filter=None,
                                                   return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get('status',
                                               unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(
                        parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action,
                                         recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=
                        force_no_revision_change_discovery,
                        report_untracked='all'
                        if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
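
# Hedged usage sketch (not part of the original module): this mirrors the call
# pattern that other commands in this listing (e.g. Unlock, AggregateMetaData)
# use to drive AnnotatePaths as a generator. The dataset location, paths, and
# the 'mycommand' action label are illustrative placeholders.
def _example_annotate_paths_usage():
    annotated = []
    for ap in AnnotatePaths.__call__(
            dataset='/tmp/some/dataset',
            path=['file1.txt', 'subdir'],
            recursive=False,
            action='mycommand',
            unavailable_path_status='impossible',
            unavailable_path_msg='path does not exist',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # already a complete result (e.g. an error for a non-dataset path);
            # a calling command would typically yield it unchanged
            annotated.append(ap)
            continue
        # otherwise `ap` is an annotated path dict with properties such as
        # 'type', 'state' and 'parentds', ready for further processing
        annotated.append(ap)
    return annotated
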
Example no. 2
class Get(Interface):
    """Get any dataset content (files/directories/subdatasets).

    This command only operates on dataset content. To obtain a new independent
    dataset from some source use the `install` command.

    By default this command operates recursively within a dataset, but not
    across potential subdatasets, i.e. if a directory is provided, all files in
    the directory are obtained. Recursion into subdatasets is supported too. If
    enabled, relevant subdatasets are detected and installed in order to
    fulfill a request.

    Known data locations for each requested file are evaluated and data are
    obtained from some available location (according to git-annex configuration
    and possibly assigned remote priorities), unless a specific source is
    specified.

    .. note::
      Power-user info: This command uses :command:`git annex get` to fulfill
      file handles.
    """

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar="PATH",
            doc="""specify the dataset to perform the add operation on, in
            which case `path` arguments are interpreted as being relative
            to this dataset.  If no dataset is given, an attempt is made to
            identify a dataset for each input `path`""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path/name of the requested dataset component. The component
            must already be known to a dataset. To add new components to a
            dataset use the `add` command""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        source=Parameter(
            args=("-s", "--source",),
            metavar="LABEL",
            doc="""label of the data source to be used to fulfill requests.
            This can be the name of a dataset :term:`sibling` or another known
            source""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=Parameter(
            args=("--recursion-limit",),
            metavar="LEVELS",
            constraints=EnsureInt() | EnsureChoice('existing') | EnsureNone(),
            doc="""limit recursion into subdataset to the given number of levels.
            Alternatively, 'existing' will limit recursion to subdatasets that already
            existed on the filesystem at the start of processing, and prevent new
            subdatasets from being obtained recursively."""),
        get_data=Parameter(
            args=("-n", "--no-data",),
            dest='get_data',
            action='store_false',
            doc="""whether to obtain data for all file handles. If disabled, `get`
            operations are limited to dataset handles.[CMD:  This option prevents data
            for file handles from being obtained CMD]"""),
        reckless=reckless_opt,
        git_opts=git_opts,
        annex_opts=annex_opts,
        annex_get_opts=annex_get_opts,
        jobs=jobs_opt,
        verbose=verbose)

    # Note: Maybe use 'git annex find --not --in here' to get a list of all
    # files to actually fetch, and give a kind of progress in terms of the
    # number of files processed ...

    @staticmethod
    @datasetmethod(name='get')
    def __call__(
            path=None,
            source=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            get_data=True,
            reckless=False,
            git_opts=None,
            annex_opts=None,
            annex_get_opts=None,
            jobs=None,
            verbose=False,
            # internal -- instead of returning 'get'ed items, return final
            # content_by_ds, unavailable_paths.  To be used by the call from
            # Install.__call__ and done so to avoid creating another reusable
            # function which would need to duplicate all this heavy list of
            # kwargs
            _return_datasets=False
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try to locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enabled (potentially
        #    obtain even more subdatasets)
        # 4. Shoot info of which handles to get in each subdataset to
        #    git-annex, once at the very end

        dataset_path = dataset.path if isinstance(dataset, Dataset) else dataset
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = dataset_path
        # use lookup cache -- we need that info further down
        dir_lookup = {}
        content_by_ds, unavailable_paths = Interface._prep(
            path=path,
            dataset=dataset,
            recursive=recursive,
            recursion_limit=recursion_limit,
            dir_lookup=dir_lookup)

        # explore the unknown
        for path in sorted(unavailable_paths):
            # how close can we get?
            dspath = get_dataset_root(path)
            if dspath is None:
                # nothing we can do for this path
                continue
            ds = Dataset(dspath)
            # must always yield a dataset -- we sorted out the ones outside
            # any dataset at the very top
            assert ds.is_installed()
            # now actually obtain whatever is necessary to get to this path
            containing_ds = install_necessary_subdatasets(ds, path, reckless)
            if containing_ds.path != ds.path:
                lgr.debug("Installed %s to fulfill request for content for "
                          "path %s", containing_ds, path)
                # mark resulting dataset as auto-installed
                if containing_ds.path == path:
                    # we had to get the entire dataset, not something within
                    # mark that it just appeared
                    content_by_ds[path] = [curdir]
                else:
                    # we need to get content within
                    content_by_ds[path] = [path]

        if recursive and recursion_limit != 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for subdspath in sorted(content_by_ds.keys()):
                for content_path in content_by_ds[subdspath]:
                    if not isdir(content_path):
                        # a non-directory cannot have content underneath
                        continue
                    subds = Dataset(subdspath)
                    lgr.info(
                        "Obtaining %s %s recursively",
                        subds,
                        ("underneath %s" % content_path
                         if subds.path != content_path
                         else ""))
                    cbysubds = _recursive_install_subds_underneath(
                        subds,
                        # `content_path` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        # protect against magic marker misinterpretation
                        # only relevant for _get, hence replace here
                        start=content_path if content_path != curdir else None)
                    # gets file content for all freshly installed subdatasets
                    content_by_ds.update(cbysubds)

        # we have now done everything we could to obtain whatever subdatasets
        # are needed to get something on the file system for previously
        # unavailable paths; check and sort one last time
        content_by_ds, unavailable_paths, nondataset_paths = \
            get_paths_by_dataset(
                unavailable_paths,
                recursive=recursive,
                recursion_limit=recursion_limit,
                out=content_by_ds,
                dir_lookup=dir_lookup)

        if nondataset_paths:
            # XXX likely can never get here
            lgr.warning(
                "ignored paths that do not belong to any dataset: %s",
                nondataset_paths)

        if unavailable_paths:
            lgr.warning('ignored non-existing paths: %s', unavailable_paths)

        # hand over to git-annex
        results = list(chain.from_iterable(
            _get(content_by_ds, refpath=dataset_path, source=source, jobs=jobs,
                 get_data=get_data)))
        # ??? in the _return_datasets case, should we just return both content_by_ds
        # and unavailable_paths, so we provide output that is consistent across runs,
        # and then raise a similar IncompleteResultsError outside?
        if unavailable_paths:  # and likely other error flags
            if _return_datasets:
                results = sorted(set(content_by_ds).difference(unavailable_paths))
            raise IncompleteResultsError(results, failed=unavailable_paths)
        else:
            return sorted(content_by_ds) if _return_datasets else results

    @staticmethod
    def result_renderer_cmdline(res, args):
        from datalad.ui import ui
        from os import linesep
        if res is None:
            res = []
        if not isinstance(res, list):
            res = [res]
        if not len(res):
            ui.message("Got nothing new")
            return

        # provide summary
        nsuccess = sum(item.get('success', False) if isinstance(item, dict) else True
                       for item in res)
        nfailure = len(res) - nsuccess
        msg = "Tried to get %d %s." % (
            len(res), single_or_plural("file", "files", len(res)))
        if nsuccess:
            msg += " Got %d. " % nsuccess
        if nfailure:
            msg += " Failed to get %d." % (nfailure,)
        ui.message(msg)

        # list details if there are just a few results, or if verbose output was requested
        if len(res) < 10 or args.verbose:
            msg = linesep.join([
                "{path} ... {suc}".format(
                    suc="ok." if isinstance(item, Dataset) or item.get('success', False)
                        else "failed. (%s)" % item.get('note', 'unknown reason'),
                    path=item.get('file') if isinstance(item, dict) else item.path)
                for item in res])
            ui.message(msg)
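
# Hedged usage sketch (not part of the original module): since __call__ is
# wrapped with @datasetmethod(name='get'), the command should also be reachable
# as a bound method on Dataset instances; the dataset location and file path
# below are placeholders.
def _example_get_usage():
    ds = Dataset('/tmp/some/dataset')
    # fetch content for one file; pass get_data=False to only obtain
    # (sub)dataset handles without their file content
    results = ds.get('data/file.dat', recursive=False, get_data=True)
    return results
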
Example no. 3
class Unlock(Interface):
    """Unlock file(s) of a dataset

    Unlock files of a dataset in order to be able to edit the actual content
    """

    _params_ = dict(
        path=Parameter(args=("path", ),
                       doc="""file(s) to unlock""",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(args=("-d", "--dataset"),
                          doc=""""specify the dataset to unlock files in. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory. If the latter fails, an
            attempt is made to identify the dataset based on `path` """,
                          constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='unlock')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None):

        if path is None and dataset is None:
            raise InsufficientArgumentsError(
                "insufficient arguments for unlocking: needs at least "
                "a dataset or a path to unlock.")

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='unlock', logger=lgr, refds=refds_path)

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='unlock',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist",
                nondataset_path_status='impossible',
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', 'dataset') == 'dataset':
                # this is a dataset
                ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert (not completed)

        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            content = content_by_ds[ds_path]

            # no annex, no unlock:
            if not isinstance(ds.repo, AnnexRepo):
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not annex'ed, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            # only files in annex with their content present:
            files = [ap['path'] for ap in content]
            to_unlock = []
            for ap, under_annex, has_content in \
                zip(content,
                    ds.repo.is_under_annex(files),
                    ds.repo.file_has_content(files)):

                # TODO: what about directories? Make sure there is no
                # situation that leads to a CommandError, like no file beneath
                # with content, or everything in git.
                # For now pass to annex:
                from os.path import isdir
                if isdir(ap['path']):
                    to_unlock.append(ap)
                    continue

                # Note that `file_has_content` is (planned to be) reporting True
                # on files in git. Therefore order matters: first check for annex!
                if under_annex:
                    if has_content:
                        to_unlock.append(ap)
                    # no content, no unlock:
                    else:
                        ap['status'] = 'impossible'
                        ap['message'] = "no content present, can't unlock"
                        ap.update(res_kwargs)
                        yield ap
                # file in git, no unlock:
                else:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not controlled by annex, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap

            # don't call annex-unlock with no path; this is the case when
            # nothing survived the filtering above
            if content and not to_unlock:
                continue

            for r in ds.repo.unlock([ap['path'] for ap in to_unlock]):
                yield get_status_dict(path=opj(ds.path, r),
                                      status='ok',
                                      type='file',
                                      **res_kwargs)
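
# Hedged usage sketch (not part of the original module): Unlock.__call__ is
# decorated with @eval_results, so it can be driven as a generator of result
# dicts, just like AnnotatePaths above. The paths and dataset location are
# placeholders.
def _example_unlock_usage():
    unlocked = []
    for res in Unlock.__call__(
            path=['data/big_file.dat'],
            dataset='/tmp/some/dataset',
            recursive=False,
            return_type='generator',
            on_failure='ignore'):
        if res.get('status') == 'ok' and res.get('type') == 'file':
            unlocked.append(res['path'])
    return unlocked
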
Example no. 4
class CrawlInit(Interface):
    """Initialize crawling configuration

    Allows one to specify a template and a function to generate a crawling pipeline

    Examples:

    $ datalad crawl-init \
        --template openfmri \
        --template-func superdataset_pipeline

    $ datalad crawl-init \
        --template fcptable \
        dataset=Baltimore tarballs=True
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True

    _params_ = dict(
        template=Parameter(
            args=("-t", "--template"),
            action="store",
            constraints=EnsureStr() | EnsureNone(),
            doc="""the name of the template"""),
        template_func=Parameter(
            args=("-f", "--template-func"),
            action="store",
            doc="""the name of the function"""),
        args=Parameter(
            args=("args",),
            nargs="*",
            constraints=EnsureStr() | EnsureNone(),
            doc="""keyword arguments to pass into the template function generating actual pipeline,
            organized in [PY: a dict PY][CMD: key=value pairs CMD]"""),
        save=Parameter(
            args=("--save",),
            action="store_true",
            doc="""flag to save file into git repo"""),
    )

    @staticmethod
    def __call__(args=None, template=None, template_func=None, save=False):

        if args:
            if isinstance(args, str):
                args = [args]
            if isinstance(args, list):
                args = OrderedDict(map(str, it.split('=', 1)) for it in args)
            elif isinstance(args, dict):
                pass
            else:
                raise ValueError(
                    "args entered must be given in a list or dict, were given as %s",
                    type(args))
        elif not template:
            raise TypeError("crawl-init needs a template")
        else:
            args = {}

        pipeline_func = load_pipeline_from_template(template, template_func, kwargs=args, return_only=True)

        try:
            pipeline = pipeline_func(**args)
        except Exception as exc:
            raise RuntimeError(
                "Running the pipeline function resulted in %s."
                "FYI this pipeline only takes the following args: %s"
                % (exc_str(exc), get_func_kwargs_doc(pipeline_func)))

        if not pipeline:
            raise ValueError("returned pipeline is empty")

        if not isinstance(pipeline, list):
            raise ValueError("pipeline should be represented as a list. Got: %r" % pipeline)

        configfile = initiate_pipeline_config(template, template_func, args)

        if save:
            from datalad.api import save
            ds = Dataset(curdir)
            ds.repo.add(configfile, git=True)
            ds.save("committing crawl config file", path=configfile)
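
# Hedged usage sketch (not part of the original module): the Python-side
# counterparts of the two CLI invocations shown in the class docstring. The
# template names and keyword values are taken from that docstring; whether a
# given template is available depends on the installation.
def _example_crawl_init_usage():
    # template that needs no extra arguments
    CrawlInit.__call__(template='openfmri',
                       template_func='superdataset_pipeline')
    # template parameterized via keyword arguments (the CLI would pass these
    # as 'dataset=Baltimore tarballs=True' strings)
    CrawlInit.__call__(args={'dataset': 'Baltimore', 'tarballs': 'True'},
                       template='fcptable')
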
Example no. 5
class AggregateMetaData(Interface):
    """Aggregate metadata of one or more datasets for later query.

    Metadata aggregation refers to a procedure that extracts metadata present
    in a dataset into a portable representation that is stored in a single
    standardized format. Moreover, metadata aggregation can also extract
    metadata in this format from one dataset and store it in another
    (super)dataset. Based on such collections of aggregated metadata it is
    possible to discover particular datasets and specific parts of their
    content, without having to obtain the target datasets first (see the
    DataLad 'search' command).

    To enable aggregation of metadata that are contained in files of a dataset,
    one has to enable one or more metadata extractors for a dataset. DataLad
    supports a number of common metadata standards, such as the Exchangeable
    Image File Format (EXIF), Adobe's Extensible Metadata Platform (XMP), and
    various audio file metadata systems like ID3. DataLad extension packages
    can provide metadata extractors for additional metadata sources. For
    example, the neuroimaging extension provides extractors for scientific
    (meta)data standards like BIDS, DICOM, and NIfTI1.  Some metadata
    extractors depend on particular 3rd-party software. The list of metadata
    extractors available to a particular DataLad installation is reported by
    the 'wtf' command ('datalad wtf').

    Enabling a metadata extractor for a dataset is done by adding its name to the
    'datalad.metadata.nativetype' configuration variable -- typically in the
    dataset's configuration file (.datalad/config), e.g.::

      [datalad "metadata"]
        nativetype = exif
        nativetype = xmp

    If an enabled metadata extractor is not available in a particular DataLad
    installation, metadata extraction will not succeed in order to avoid
    inconsistent aggregation results.

    Enabling multiple extractors is supported. In this case, metadata are
    extracted by each extractor individually, and stored alongside each other.
    Metadata aggregation will also extract DataLad's own metadata (extractors
    'datalad_core', and 'annex').

    Metadata aggregation can be performed recursively, in order to aggregate all
    metadata across all subdatasets, for example, to be able to search across
    any content in any dataset of a collection. Aggregation can also be performed
    for subdatasets that are not available locally. In this case, pre-aggregated
    metadata from the closest available superdataset will be considered instead.

    Depending on the versatility of the present metadata and the number of datasets
    or files, aggregated metadata can grow prohibitively large. A number of
    configuration switches are provided to mitigate such issues.

    datalad.metadata.aggregate-content-<extractor-name>
      If set to false, content metadata aggregation will not be performed for
      the named metadata extractor (a potential underscore '_' in the extractor name must
      be replaced by a dash '-'). This can substantially reduce the runtime for
      metadata extraction, and also reduce the size of the generated metadata
      aggregate. Note, however, that some extractors may not produce any metadata
      when this is disabled, because their metadata might come from individual
      file headers only. 'datalad.metadata.store-aggregate-content' might be
      a more appropriate setting in such cases.

    datalad.metadata.aggregate-ignore-fields
      Any metadata key matching any regular expression in this configuration setting
      is removed prior to generating the dataset-level metadata summary (keys
      and their unique values across all dataset content), and from the dataset
      metadata itself. This switch can also be used to filter out sensitive
      information prior to aggregation.

    datalad.metadata.generate-unique-<extractor-name>
      If set to false, DataLad will not auto-generate a summary of unique content
      metadata values for a particular extractor as part of the dataset-global metadata
      (a potential underscore '_' in the extractor name must be replaced by a dash '-').
      This can be useful if such a summary is bloated due to minor uninformative (e.g.
      numerical) differences, or when a particular extractor already provides a
      carefully designed content metadata summary.

    datalad.metadata.maxfieldsize
      Any metadata value that exceeds the size threshold given by this configuration
      setting (in bytes/characters) is removed.

    datalad.metadata.store-aggregate-content
      If set to false, extracted content metadata are still used to generate a dataset-level
      summary of present metadata (all keys and their unique values across all
      files in a dataset are determined and stored as part of the dataset-level
      metadata aggregate, see datalad.metadata.generate-unique-<extractor-name>),
      but metadata on individual files are not stored.
      This switch can be used to avoid prohibitively large metadata files. Discovery
      of datasets containing content matching particular metadata properties will
      still be possible, but such datasets would have to be obtained first in order
      to discover which particular files in them match these properties.
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""topmost dataset metadata will be aggregated into. All dataset
            between this dataset and any given path will receive updated
            aggregated metadata from all given paths.""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="""path to datasets that shall be aggregated.
            When a given path is pointing into a dataset, the metadata of the
            containing dataset will be aggregated.  If no paths are given, the
            current dataset's metadata is aggregated.""",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        update_mode=Parameter(
            args=('--update-mode', ),
            constraints=EnsureChoice('all', 'target'),
            doc="""which datasets to update with newly aggregated metadata:
            all datasets from any leaf dataset to the top-level target dataset
            including all intermediate datasets (all), or just the top-level
            target dataset (target)."""),
        incremental=Parameter(
            args=('--incremental', ),
            action='store_true',
            doc="""If set, all information on metadata records of subdatasets
            that have not been (re-)aggregated in this run will be kept unchanged.
            This is useful when (re-)aggregating only a subset of a dataset hierarchy,
            for example, because not all subdatasets are locally available."""
        ),
        force_extraction=Parameter(
            args=('--force-extraction', ),
            action='store_true',
            doc="""If set, all enabled extractors will be engaged regardless of
            whether change detection indicates that metadata has already been
            extracted for a given dataset state."""),
        save=nosave_opt,
    )

    @staticmethod
    @datasetmethod(name='aggregate_metadata')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 update_mode='target',
                 incremental=False,
                 force_extraction=False,
                 save=True):
        refds_path = Interface.get_refds_path(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='metadata aggregation')
        path = assure_list(path)
        if not path:
            # then current/reference dataset is "aggregated"
            # We should not add ds.path always since then --recursive would
            # also recurse current even if paths are given
            path.append(ds.path)

        agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(
            ds,
            # do not warn here, next call triggers the same warning
            warn_absent=False)
        agginfo_db = load_ds_aggregate_db(ds, abspath=True)

        to_save = []
        to_aggregate = set()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='aggregate_metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            ap_type = ap.get('type', None)
            ap_state = ap.get('state', None)
            assert ('parentds' in ap or ap_type == 'dataset')
            if ap_type == 'dataset' and ap_state != 'absent':
                # a present dataset, we can take directly from it
                aggsrc = ap['path']
                lgr.info('Aggregate metadata for dataset %s', aggsrc)
            else:
                # everything else needs to come from the parent
                aggsrc = ap['parentds']
                if ap_state == 'absent':
                    lgr.info(
                        'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                        ap['path'], aggsrc)
                else:
                    lgr.info('Aggregate metadata for %s from dataset at %s',
                             ap['path'], aggsrc)

            to_aggregate.add(aggsrc)

            if ap_state == 'absent':
                # key thought: recursive is done by path annotation, hence
                # once we hit an absent dataset, we are 100% certain that
                # there is nothing to recurse into on the file system
                # hence we only have to look into the aggregated metadata
                # of the last available dataset in the dataset tree edge
                #
                # if there is nothing at this path, we need to look into the
                # parentds and check if we know anything about this path
                # if we do, we need to grab all the info and objects
                # if not, we need to error
                res = _get_dsinfo_from_aggmetadata(aggsrc, ap['path'],
                                                   recursive, agginfo_db)
                if not isinstance(res, list):
                    yield get_status_dict(status='impossible',
                                          message=res,
                                          action='aggregate_metadata',
                                          path=ap['path'],
                                          logger=lgr)
                    continue
                # cue for aggregation
                to_aggregate.update(res)
            else:
                # actually aggregate metadata for this dataset, immediately place
                # generated objects into the aggregated or reference dataset,
                # and put info into the DB to get it distributed to all datasets
                # that need to be updated
                errored = _dump_extracted_metadata(ds, Dataset(aggsrc),
                                                   agginfo_db, to_save,
                                                   force_extraction,
                                                   agg_base_path)
                if errored:
                    yield get_status_dict(
                        status='error',
                        message=
                        'Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                        action='aggregate_metadata',
                        path=aggsrc,
                        logger=lgr)

        # at this point we have dumped all aggregated metadata into object files
        # somewhere, we know what needs saving, but haven't saved anything yet, and
        # we know about the states of all aggregated datasets in the DB
        # what remains to be done is to update all datasets, so they have their own
        # copy of aggregated metadata and update their respective aggregate.json with
        # info on what states we just aggregated from

        # first, let's figure out what datasets need updating at all
        # get adjacency info of the dataset tree spanning the base to all leaf
        # datasets associated with the path arguments
        if update_mode == 'all':
            ds_adj = {}
            discover_dataset_trace_to_targets(
                ds.path,
                to_aggregate,
                [],
                ds_adj,
                # we know that to_aggregate only lists datasets, existing and
                # absent ones -- we want to aggregate all of them, either from
                # just extracted metadata, or from previously aggregated metadata
                # of the closest superdataset
                includeds=to_aggregate)
            # TODO we need to work in the info about datasets that we only got
            # from aggregated metadata, and that had no trace on the file system,
            # in here!!
            subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
        elif update_mode == 'target':
            subtrees = {ds.path: list(agginfo_db.keys())}
        else:
            raise ValueError(
                "unknown `update_mode` '%s' for metadata aggregation",
                update_mode)

        # go over datasets in bottom-up fashion
        for parentds_path in sorted(subtrees, reverse=True):
            lgr.info('Update aggregate metadata in dataset at: %s',
                     parentds_path)

            _update_ds_agginfo(ds.path, parentds_path, subtrees[parentds_path],
                               incremental, agginfo_db, to_save)
            # update complete
            res = get_status_dict(status='ok',
                                  action='aggregate_metadata',
                                  path=parentds_path,
                                  type='dataset',
                                  logger=lgr)
            res.update(agginfo_db.get(parentds_path, {}))
            yield res
        #
        # save potential modifications to dataset global metadata
        #
        if not to_save:
            return
        lgr.info('Attempting to save %i files/datasets', len(to_save))
        for res in Save.__call__(
                path=to_save,
                dataset=refds_path,
                message='[DATALAD] Dataset aggregate metadata update',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
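
# Hedged usage sketch (not part of the original module): aggregate metadata for
# a superdataset and all locally available subdatasets, collecting the datasets
# whose aggregate records were updated. The dataset location is a placeholder.
def _example_aggregate_metadata_usage():
    updated = []
    for res in AggregateMetaData.__call__(
            dataset='/tmp/some/superdataset',
            recursive=True,
            update_mode='all',
            incremental=True,
            return_type='generator',
            on_failure='ignore'):
        if res.get('action') == 'aggregate_metadata' and res.get('status') == 'ok':
            updated.append(res['path'])
    return updated
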
Example no. 6
class Uninstall(Interface):
    """Uninstall subdatasets

    This command can be used to uninstall any number of installed subdatasets.
    If a to-be-uninstalled subdataset contains presently installed subdatasets
    itself, their recursive removal has to be enabled explicitly to keep the
    command from exiting with an error. This command will error if individual
    files or non-dataset directories are given as input (use the drop or remove
    command depending on the desired goal), and it will not uninstall top-level
    datasets (i.e. datasets that are not a subdataset of another dataset; use
    the remove command for this purpose).

    By default, the availability of at least one remote copy for each currently
    available file in any dataset is verified. As these checks could lead to
    slow operation (network latencies, etc), they can be disabled.

    Any number of paths to process can be given as input. Recursion into
    subdatasets needs to be explicitly enabled, while recursion into
    subdirectories within a dataset is always done automatically. An optional
    recursion limit is applied relative to each given input path.

    Examples:

      Uninstall a subdataset (undo installation)::

        ~/some/dataset$ datalad uninstall somesubdataset1

    """
    _action = 'uninstall'

    _params_ = dict(
        dataset=dataset_argument,
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="path/name of the component to be uninstalled",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        check=check_argument,
        if_dirty=if_dirty_opt,
    )

    @staticmethod
    @datasetmethod(name=_action)
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 check=True,
                 if_dirty='save-before'):

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='uninstall', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `uninstall`: requires at least a path or dataset"
            )

        to_uninstall = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                action='uninstall',
                # justification for status:
                # content need not be uninstalled where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # upfront sanity and compliance checks
            # check that we have no top-level datasets and no files to process
            if ap.get('type') == 'dataset' and \
                    not ap.get('state', None) == 'absent' and \
                    path_is_under([ap['path']]):  # wants a sequence!
                ap.update(
                    status='error',
                    message="refusing to uninstall current or parent directory"
                )
                yield ap
                continue
            if not ap.get('type', None) == 'dataset':
                ap.update(
                    status='impossible',
                    message=
                    "can only uninstall datasets (consider the `drop` command)"
                )
                yield ap
                continue
            # we only have dataset from here
            if not ap.get('parentds', None):
                # this could be a side-effect of the specific call semantics.
                # As stated in #1714, we are not really interested in whether
                # a superdataset was obvious in the call, but only whether there
                # is a superdataset at all. So let's look for one, and only barf
                # when there really isn't
                parentds = Dataset(ap['path']).get_superdataset(
                    datalad_only=False,
                    topmost=False,
                    # unless it is properly registered we have no way of
                    # reinstalling it
                    registered_only=True)
                if parentds is None:
                    ap.update(
                        status='error',
                        message=
                        "will not uninstall top-level dataset (consider `remove` command)"
                    )
                    yield ap
                    continue
                ap['parentds'] = parentds.path
            if not ap['path'] == refds_path:
                ap['process_content'] = True
            to_uninstall.append(ap)

        # iterate over all datasets, starting at the bottom
        # to deinit contained submodules first
        for ap in sorted(to_uninstall, key=lambda x: x['path'], reverse=True):
            if ap.get('state', None) == 'absent':
                # already gone
                continue
            ds = Dataset(ap['path'])
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # we confirmed the super dataset presence above
            for r in _uninstall_dataset(ds,
                                        check=check,
                                        has_super=True,
                                        **res_kwargs):
                yield r
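A minimal Python usage sketch for the interface above, assuming an installed DataLad where `uninstall` is bound as a Dataset method via the `datasetmethod` decorator (the paths are hypothetical):

# Usage sketch (hypothetical paths); results are status dicts like the
# ones yielded by __call__ above.
from datalad.api import Dataset

superds = Dataset('/data/some/dataset')
for res in superds.uninstall(path='somesubdataset1', check=True,
                             return_type='generator'):
    print(res.get('action'), res.get('path'), res.get('status'))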
Example No. 7
class Addurls(Interface):
    """Create and update a dataset from a list of URLs.

    *Format specification*

    Several arguments take format strings.  These are similar to normal Python
    format strings where the names from `URL-FILE` (column names for a CSV or
    properties for JSON) are available as placeholders.  If `URL-FILE` is a CSV
    file, a positional index can also be used (i.e., "{0}" for the first
    column).  Note that a placeholder cannot contain a ':' or '!'.

    In addition, the `FILENAME-FORMAT` argument has a few special
    placeholders.

      - _repindex

        The constructed file names must be unique across all rows.  To
        avoid collisions, the special placeholder "_repindex" can be added to
        the formatter.  Its value will start at 0 and increment every time a
        file name repeats.

      - _url_hostname, _urlN, _url_basename*

        Various parts of the formatted URL are available.  Take
        "http://datalad.org/asciicast/seamless_nested_repos.sh" as an example.

        "datalad.org" is stored as "_url_hostname".  Components of the URL's
        path can be referenced as "_urlN".  "_url0" and "_url1" would map to
        "asciicast" and "seamless_nested_repos.sh", respectively.  The final
        part of the path is also available as "_url_basename".

        This name is broken down further.  "_url_basename_root" and
        "_url_basename_ext" provide access to the root name and extension.
        These values are similar to the result of os.path.splitext, but, in the
        case of multiple periods, the extension is identified using the same
        length heuristic that git-annex uses.  As a result, the extension of
        "file.tar.gz" would be ".tar.gz", not ".gz".  In addition, the fields
        "_url_basename_root_py" and "_url_basename_ext_py" provide access to
        the result of os.path.splitext.

      - _url_filename*

        These are similar to _url_basename* fields, but they are obtained with
        a server request.  This is useful if the file name is set in the
        Content-Disposition header.


    *Examples*

    Consider a file "avatars.csv" that contains::

        who,ext,link
        neurodebian,png,https://avatars3.githubusercontent.com/u/260793
        datalad,png,https://avatars1.githubusercontent.com/u/8927200

    To download each link into a file name composed of the 'who' and 'ext'
    fields, we could run::

      $ datalad addurls -d avatar_ds --fast avatars.csv '{link}' '{who}.{ext}'

    The `-d avatar_ds` is used to create a new dataset in "$PWD/avatar_ds".

    If we were already in a dataset and wanted to create a new subdataset in an
    "avatars" subdirectory, we could use "//" in the `FILENAME-FORMAT`
    argument::

      $ datalad addurls --fast avatars.csv '{link}' 'avatars//{who}.{ext}'

    .. note::

       For users familiar with 'git annex addurl': A large part of this
       plugin's functionality can be viewed as transforming data from
       `URL-FILE` into a "url filename" format that is fed to 'git annex addurl
       --batch --with-files'.
    """

    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureChoice, EnsureNone, EnsureStr
    from datalad.support.param import Parameter

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""Add the URLs to this dataset (or possibly subdatasets of
            this dataset).  An empty or non-existent directory is passed to
            create a new dataset.  New subdatasets can be specified with
            `FILENAME-FORMAT`.""",
            constraints=EnsureDataset() | EnsureNone()),
        urlfile=Parameter(
            args=("urlfile", ),
            metavar="URL-FILE",
            doc="""A file that contains URLs or information that can be used to
            construct URLs.  Depending on the value of --input-type, this
            should be a CSV file (with a header as the first row) or a JSON
            file (structured as a list of objects with string values)."""),
        urlformat=Parameter(
            args=("urlformat", ),
            metavar="URL-FORMAT",
            doc="""A format string that specifies the URL for each entry.  See
            the 'Format Specification' section above."""),
        filenameformat=Parameter(
            args=("filenameformat", ),
            metavar="FILENAME-FORMAT",
            doc="""Like `URL-FORMAT`, but this format string specifies the file
            to which the URL's content will be downloaded.  The file name may
            contain directories.  The separator "//" can be used to indicate
            that the left-side directory should be created as a new subdataset.
            See the 'Format Specification' section above."""),
        input_type=Parameter(
            args=("-t", "--input-type"),
            metavar="TYPE",
            doc="""Whether `URL-FILE` should be considered a CSV file or a JSON
            file.  The default value, "ext", means to consider `URL-FILE` as a
            JSON file if it ends with ".json".  Otherwise, treat it as a CSV
            file.""",
            constraints=EnsureChoice("ext", "csv", "json")),
        exclude_autometa=Parameter(
            args=("-x", "--exclude_autometa"),
            metavar="REGEXP",
            doc="""By default, metadata field=value pairs are constructed with
            each column in `URL-FILE`, excluding any single column that is
            specified via `URL-FORMAT`.  This argument can be used to exclude
            columns that match a regular expression.  If set to '*' or an empty
            string, automatic metadata extraction is disabled completely.  This
            argument does not affect metadata set explicitly with --meta."""),
        meta=Parameter(
            args=(
                "-m",
                "--meta",
            ),
            metavar="FORMAT",
            action="append",
            doc="""A format string that specifies metadata.  It should be
            structured as "<field>=<value>".  As an example, "location={3}"
            would mean that the value for the "location" metadata field should
            be set to the value of the fourth column.  This option can be given
            multiple times."""),
        message=Parameter(
            args=("--message", ),
            metavar="MESSAGE",
            doc="""Use this message when committing the URL additions.""",
            constraints=EnsureNone() | EnsureStr()),
        dry_run=Parameter(
            args=("-n", "--dry-run"),
            action="store_true",
            doc="""Report which URLs would be downloaded to which files and
            then exit."""),
        fast=Parameter(
            args=("--fast", ),
            action="store_true",
            doc="""If True, add the URLs, but don't download their content.
            Underneath, this passes the --fast flag to `git annex addurl`."""),
        ifexists=Parameter(
            args=("--ifexists", ),
            metavar="ACTION",
            doc="""What to do if a constructed file name already exists.  The
            default behavior is to proceed with the `git annex addurl`, which
            will fail if the file size has changed.  If set to 'overwrite',
            remove the old file before adding the new one.  If set to 'skip',
            do not add the new file.""",
            constraints=EnsureNone() | EnsureChoice("overwrite", "skip")),
        missing_value=Parameter(
            args=("--missing-value", ),
            metavar="VALUE",
            doc="""When an empty string is encountered, use this value
            instead.""",
            constraints=EnsureNone() | EnsureStr()),
        save=nosave_opt,
        version_urls=Parameter(
            args=("--version-urls", ),
            action="store_true",
            doc="""Try to add a version ID to the URL. This currently only has
            an effect on URLs for AWS S3 buckets."""),
    )

    @staticmethod
    @datasetmethod(name='addurls')
    @eval_results
    def __call__(dataset,
                 urlfile,
                 urlformat,
                 filenameformat,
                 input_type="ext",
                 exclude_autometa=None,
                 meta=None,
                 message=None,
                 dry_run=False,
                 fast=False,
                 ifexists=None,
                 missing_value=None,
                 save=True,
                 version_urls=False):
        # Temporarily work around gh-2269.
        url_file = urlfile
        url_format, filename_format = urlformat, filenameformat

        from requests.exceptions import RequestException

        from datalad.distribution.add import Add
        from datalad.distribution.create import Create
        from datalad.distribution.dataset import Dataset, require_dataset
        from datalad.interface.results import get_status_dict
        from datalad.support.annexrepo import AnnexRepo

        lgr = logging.getLogger("datalad.plugin.addurls")

        dataset = require_dataset(dataset, check_installed=False)
        if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message="not an annex repo")
            return

        if input_type == "ext":
            extension = os.path.splitext(url_file)[1]
            input_type = "json" if extension == ".json" else "csv"

        with open(url_file) as fd:
            try:
                rows, subpaths = extract(fd, input_type, url_format,
                                         filename_format, exclude_autometa,
                                         meta, dry_run, missing_value)
            except (ValueError, RequestException) as exc:
                yield get_status_dict(action="addurls",
                                      ds=dataset,
                                      status="error",
                                      message=exc_str(exc))
                return

        if len(rows) != len(set(row["filename"] for row in rows)):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=("There are file name collisions; "
                                           "consider using {_repindex}"))
            return

        if dry_run:
            for subpath in subpaths:
                lgr.info("Would create a subdataset at %s", subpath)
            for row in rows:
                lgr.info("Would download %s to %s", row["url"],
                         os.path.join(dataset.path, row["filename"]))
                lgr.info(
                    "Metadata: %s",
                    sorted(u"{}={}".format(k, v)
                           for k, v in row["meta_args"].items()))
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="ok",
                                  message="dry-run finished")
            return

        if not dataset.repo:
            # Populate a new dataset with the URLs.
            for r in dataset.create(result_xfm=None,
                                    return_type='generator',
                                    save=save):
                yield r

        annex_options = ["--fast"] if fast else []

        for spath in subpaths:
            if os.path.exists(os.path.join(dataset.path, spath)):
                lgr.warning("Not creating subdataset at existing path: %s",
                            spath)
            else:
                for r in dataset.create(spath,
                                        result_xfm=None,
                                        return_type='generator',
                                        save=save):
                    yield r

        for row in rows:
            # Add additional information that we'll need for various
            # operations.
            filename_abs = os.path.join(dataset.path, row["filename"])
            if row["subpath"]:
                ds_current = Dataset(os.path.join(dataset.path,
                                                  row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = dataset
                ds_filename = row["filename"]
            row.update({
                "filename_abs": filename_abs,
                "ds": ds_current,
                "ds_filename": ds_filename
            })

        if version_urls:
            num_urls = len(rows)
            log_progress(lgr.info,
                         "addurls_versionurls",
                         "Versioning %d URLs",
                         num_urls,
                         label="Versioning URLs",
                         total=num_urls,
                         unit=" URLs")
            for row in rows:
                url = row["url"]
                try:
                    row["url"] = get_versioned_url(url)
                except (ValueError, NotImplementedError) as exc:
                    # We don't expect this to happen because get_versioned_url
                    # should return the original URL if it isn't an S3 bucket.
                    # It only raises exceptions if it doesn't know how to
                    # handle the scheme for what looks like an S3 bucket.
                    lgr.warning("error getting version of %s: %s", row["url"],
                                exc_str(exc))
                log_progress(lgr.info,
                             "addurls_versionurls",
                             "Versioned result for %s: %s",
                             url,
                             row["url"],
                             update=1,
                             increment=True)
            log_progress(lgr.info, "addurls_versionurls",
                         "Finished versioning URLs")

        files_to_add = set()
        for r in add_urls(rows, ifexists=ifexists, options=annex_options):
            if r["status"] == "ok":
                files_to_add.add(r["path"])
            yield r

        msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

        if files_to_add:
            for r in dataset.add(files_to_add, save=False):
                yield r

            meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
            for r in add_meta(meta_rows):
                yield r

            # Save here rather than the add call above to trigger a metadata
            # commit on the git-annex branch.
            if save:
                for r in dataset.save(message=msg, recursive=True):
                    yield r
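A Python usage sketch mirroring the "avatars.csv" example from the docstring above; the CSV content comes from that example, the target dataset path is hypothetical, and `addurls` is assumed to be registered with `datalad.api`:

# Usage sketch based on the docstring example above.
import datalad.api as dl

with open('avatars.csv', 'w') as f:
    f.write('who,ext,link\n'
            'neurodebian,png,https://avatars3.githubusercontent.com/u/260793\n'
            'datalad,png,https://avatars1.githubusercontent.com/u/8927200\n')

# Equivalent of: datalad addurls -d avatar_ds --fast avatars.csv '{link}' '{who}.{ext}'
for res in dl.addurls(dataset='avatar_ds', urlfile='avatars.csv',
                      urlformat='{link}', filenameformat='{who}.{ext}',
                      fast=True, return_type='generator'):
    print(res.get('status'), res.get('path'))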
Example No. 8
class Subdatasets(Interface):
    r"""Report subdatasets and their properties.

    The following properties are reported (if possible) for each matching
    subdataset record.

    "name"
        Name of the subdataset in the parent (often identical with the
        relative path in the parent dataset)

    "path"
        Absolute path to the subdataset

    "parentds"
        Absolute path to the parent dataset

    "gitshasum"
        SHA1 of the subdataset commit recorded in the parent dataset

    "state"
        Condition of the subdataset: 'absent', 'present'

    "gitmodule_url"
        URL of the subdataset recorded in the parent

    "gitmodule_name"
        Name of the subdataset recorded in the parent

    "gitmodule_<label>"
        Any additional configuration property on record.

    Performance note: Property modification, requesting `bottomup` reporting
    order, or a particular numerical `recursion_limit` implies an internal
    switch to an alternative query implementation for recursive query that is
    more flexible, but also notably slower (performs one call to Git per
    dataset versus a single call for all combined).

    The following properties for subdatasets are recognized by DataLad
    (without the 'gitmodule\_' prefix that is used in the query results):

    "datalad-recursiveinstall"
        If set to 'skip', the respective subdataset is skipped when DataLad
        is recursively installing its superdataset. However, the subdataset
        remains installable when explicitly requested, and no other features
        are impaired.

    "datalad-url"
        If a subdataset was originally established by cloning, 'datalad-url'
        records the URL that was used to do so. This might be different from
        'url' if the URL contains datalad specific pieces like any URL of the
        form "ria+<some protocol>...".
    """
    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to query.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path/name to query for subdatasets. Defaults to the
            current directory[PY: , or the entire dataset if called as
            a dataset method PY].""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        fulfilled=Parameter(
            args=("--fulfilled", ),
            doc="""if given, must be a boolean flag indicating whether
            to report only locally present or only absent subdatasets.
            By default subdatasets are reported regardless of their
            status""",
            constraints=EnsureBool() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        contains=Parameter(
            args=('--contains', ),
            metavar='PATH',
            action='append',
            doc="""limit report to the subdatasets containing the
            given path. If a root path of a subdataset is given the last
            reported dataset will be the subdataset itself.[CMD:  This
            option can be given multiple times CMD][PY:  Can be a list with
            multiple paths PY], in which case datasets will be reported that
            contain any of the given paths.""",
            constraints=EnsureStr() | EnsureNone()),
        bottomup=Parameter(
            args=("--bottomup", ),
            action="store_true",
            doc="""whether to report subdatasets in bottom-up order along
            each branch in the dataset tree, and not top-down."""),
        set_property=Parameter(
            args=('--set-property', ),
            metavar=('NAME', 'VALUE'),
            nargs=2,
            action='append',
            doc="""Name and value of one or more subdataset properties to
            be set in the parent dataset's .gitmodules file. The property name
            is case-insensitive, must start with a letter, and consist only
            of alphanumeric characters. The value can be
            a Python format() template string wrapped in '<>' (e.g.
            '<{gitmodule_name}>').
            Supported keywords are any item reported in the result properties
            of this command, plus 'refds_relpath' and 'refds_relname':
            the relative path of a subdataset with respect to the base dataset
            of the command call, and, in the latter case, the same string with
            all directory separators replaced by dashes.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        delete_property=Parameter(
            args=('--delete-property', ),
            metavar='NAME',
            action='append',
            doc="""Name of one or more subdataset properties to be removed
            from the parent dataset's .gitmodules file.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()))

    @staticmethod
    @datasetmethod(name='subdatasets')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 fulfilled=None,
                 recursive=False,
                 recursion_limit=None,
                 contains=None,
                 bottomup=False,
                 set_property=None,
                 delete_property=None):
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='report on subdataset(s)')

        paths = resolve_path(ensure_list(path), dataset, ds) if path else None

        # no constraints given -> query subdatasets under curdir
        if not paths and dataset is None:
            cwd = Path(getpwd())
            paths = None if cwd == ds.pathobj else [cwd]

        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % k)
        if contains:
            contains = resolve_path(ensure_list(contains), dataset, ds)
            # expand all test cases for the contains test in the loop below
            # leads to ~20% speedup per loop iteration of a non-match
            expanded_contains = [[c] + list(c.parents) for c in contains]
        else:
            expanded_contains = []
        contains_hits = set()
        for r in _get_submodules(ds, paths, fulfilled, recursive,
                                 recursion_limit, expanded_contains, bottomup,
                                 set_property, delete_property, refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = str(r['path'])
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            if 'contains' in r:
                contains_hits.update(r['contains'])
                r['contains'] = [str(c) for c in r['contains']]
            yield r
        if contains:
            for c in set(contains).difference(contains_hits):
                yield get_status_dict(
                    'subdataset',
                    path=str(c),
                    status='impossible',
                    message='path not contained in any matching subdataset',
                    # we do not want to log such an event, because it is a
                    # legit query to check for matching subdatasets simply
                    # for the purpose of further decision making
                    # user communication in front-end scenarios will happen
                    # via result rendering
                    #logger=lgr
                )
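A small usage sketch for the query above (hypothetical superdataset path): report all installed subdatasets recursively and print a few of the documented result properties.

# Usage sketch (hypothetical path); `subdatasets` is bound as a Dataset
# method via @datasetmethod above.
from datalad.api import Dataset

ds = Dataset('/data/super')
for sub in ds.subdatasets(fulfilled=True, recursive=True,
                          return_type='generator'):
    print(sub['path'], sub.get('state'), sub.get('gitmodule_url'))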
Example No. 9
class Dump(Interface):
    """Query a dataset's aggregated metadata for dataset and file metadata

    Two types of metadata are supported:

    1. metadata describing a dataset as a whole (dataset-global metadata), and

    2. metadata for files in a dataset (content metadata).

    The DATASET_FILE_PATH_PATTERN argument specifies dataset and file patterns
    that are matched against the dataset and file information in the metadata.
    There are two formats, UUID-based and dataset-tree-based. The formats are:

        TREE:   ["tree:"] [DATASET_PATH] ["@" VERSION-DIGITS] [":" [LOCAL_PATH]]
        UUID:   "uuid:" UUID-DIGITS ["@" VERSION-DIGITS] [":" [LOCAL_PATH]]

    (the tree-format is the default format and does not require a prefix).
    """

    # Use a custom renderer to emit a self-contained metadata record. The
    # emitted record can be fed into meta-add for example.
    result_renderer = 'tailored'

    _examples_ = [
        dict(
            text='Dump the metadata of the file "dataset_description.json" in '
            'the dataset "simon". (The queried dataset git-repository is '
            'determined based on the current working directory)',
            code_cmd="datalad meta-dump simon:dataset_description.json"),
        dict(text="Sometimes it is helpful to get metadata records formatted "
             "in a more accessible form, here as pretty-printed JSON",
             code_cmd="datalad -f json_pp meta-dump "
             "simon:dataset_description.json"),
        dict(text="Same query as above, but specify that all datasets should "
             "be queried for the given path",
             code_cmd="datalad meta-dump -d . :somedir/subdir/thisfile.dat"),
        dict(text="Dump any metadata record of any dataset known to the "
             "queried dataset",
             code_cmd="datalad meta-dump -r"),
        dict(text="Show metadata for all datasets",
             code_cmd="datalad -f json_pp meta-dump -r"),
        dict(text="Show metadata for all files ending in `.json´ in the root "
             "directories of all datasets",
             code_cmd="datalad -f json_pp meta-dump *:*.json -r"),
        dict(text="Show metadata for all files ending in `.json´ in all "
             "datasets by not specifying a dataset at all. This will "
             "start dumping at the top-level dataset.",
             code_cmd="datalad -f json_pp meta-dump :*.json -r")
    ]

    _params_ = dict(
        backend=Parameter(args=("--backend", ),
                          metavar="BACKEND",
                          doc="""metadata storage backend to be used.""",
                          constraints=EnsureChoice("git")),
        metadata_store=Parameter(
            args=("-m", "--metadata-store"),
            metavar="METADATA_STORE",
            doc="""Directory in which the metadata model instance is
            stored (often this is the same directory as the dataset
            directory). If no directory name is provided, the current working
            directory is used."""),
        path=Parameter(args=("path", ),
                       metavar="DATASET_FILE_PATH_PATTERN",
                       doc="path to query metadata for",
                       constraints=EnsureStr() | EnsureNone(),
                       nargs='?'),
        recursive=Parameter(
            args=(
                "-r",
                "--recursive",
            ),
            action="store_true",
            doc="""if set, recursively report on any matching metadata based
            on given paths or reference dataset. Note, setting this option
            does not cause any recursion into potential subdatasets on the
            filesystem. It merely determines what metadata is being reported
            from the given/discovered reference dataset."""))

    @staticmethod
    @datasetmethod(name='meta_dump')
    @eval_results
    def __call__(backend="git", metadata_store=None, path="", recursive=False):

        metadata_store = metadata_store or "."
        tree_version_list, uuid_set = get_top_level_metadata_objects(
            default_mapper_family, metadata_store)

        # We require both entry points to exist for valid metadata
        if tree_version_list is None or uuid_set is None:

            message = (f"No {backend}-mapped datalad metadata "
                       f"model found in: {metadata_store}")
            lgr.warning(message)

            yield dict(action="meta_dump",
                       status='impossible',
                       backend=backend,
                       metadata_store=metadata_store,
                       message=message)
            return

        parser = MetadataURLParser(path)
        metadata_url = parser.parse()

        if isinstance(metadata_url, TreeMetadataURL):
            yield from dump_from_dataset_tree(backend, metadata_store,
                                              tree_version_list, metadata_url,
                                              recursive)

        elif isinstance(metadata_url, UUIDMetadataURL):
            yield from dump_from_uuid_set(backend, metadata_store, uuid_set,
                                          metadata_url, recursive)

        return

    @staticmethod
    def custom_result_renderer(res, **kwargs):

        if res["status"] != "ok" or res.get("action", "") != 'meta_dump':
            # logging complained about this already
            return

        render_dataset_level_metadata(res["metadata"].get(
            "dataset_level_metadata", dict()))

        render_file_level_metadata(res["metadata"].get("file_level_metadata",
                                                       dict()))
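A Python counterpart to the CLI calls in `_examples_` above, assuming the metadata extension providing `meta-dump` is installed and registered with `datalad.api` (the metadata store path is hypothetical):

# Usage sketch; mirrors `datalad -f json_pp meta-dump :*.json -r` from
# the examples above.
from datalad.api import meta_dump

for res in meta_dump(metadata_store='/data/super', path=':*.json',
                     recursive=True, return_type='generator'):
    if res.get('status') == 'ok':
        print(sorted(res.get('metadata', {})))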
Example No. 10
"""

__docformat__ = 'restructuredtext'

from datalad.interface.results import known_result_xfms
from datalad.support.param import Parameter
from datalad.support.constraints import EnsureInt, EnsureNone, EnsureStr
from datalad.support.constraints import EnsureChoice
from datalad.support.constraints import EnsureCallable

location_description = Parameter(
    args=(
        "-D",
        "--description",
    ),
    constraints=EnsureStr() | EnsureNone(),
    doc="""short description to use for a dataset location. Its primary
    purpose is to help humans to identify a dataset copy (e.g., "mike's dataset
    on lab server"). Note that when a dataset is published, this information
    becomes available on the remote side.""")

recursion_flag = Parameter(args=(
    "-r",
    "--recursive",
),
                           action="store_true",
                           doc="""if set, recurse into potential subdataset""")

recursion_limit = Parameter(
    args=(
        "-R",
Example No. 11
class Clean(Interface):
    """Clean up after DataLad (possible temporary files etc.)

    Removes extracted temporary archives, etc.

    Examples:

      $ datalad clean
    """

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to perform the clean operation on.  If
                no dataset is given, an attempt is made to identify the dataset
                in the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        # TODO:  --info  -- which performs dry run just summarizing what is to be cleaned up
        # TODO: Python only???
        what=Parameter(
            args=("--what", ),
            dest='what',
            choices=('cached-archives', 'annex-tmp', 'annex-transfer',
                     'search-index'),
            nargs="*",
            doc="""What to clean.  If none specified -- all known targets are
            cleaned"""),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='clean')
    @eval_results
    def __call__(dataset=None,
                 what=None,
                 recursive=False,
                 recursion_limit=None):
        ds = require_dataset(dataset, purpose='clean-up')
        res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
        for wds in itertools.chain(
            [ds],
                ds.subdatasets(fulfilled=True,
                               recursive=recursive,
                               recursion_limit=recursion_limit,
                               return_type='generator',
                               result_renderer='disabled',
                               result_xfm='datasets') if recursive else []):
            d = wds.path
            gitdir = GitRepo.get_git_dir(d)
            DIRS_PLURAL = ("directory", "directories")
            FILES_PLURAL = ("file", "files")
            for dirpath, flag, msg, sing_pl in [
                (ARCHIVES_TEMP_DIR, "cached-archives", "temporary archive",
                 DIRS_PLURAL),
                (ANNEX_TEMP_DIR, "annex-tmp", "temporary annex", FILES_PLURAL),
                (ANNEX_TRANSFER_DIR, "annex-transfer",
                 "annex temporary transfer", DIRS_PLURAL),
                (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", FILES_PLURAL),
            ]:
                topdir = opj(d, dirpath)
                lgr.debug("Considering to clean %s:%s", d, dirpath)
                if not ((what is None) or (flag in what)):
                    yield get_status_dict(path=topdir,
                                          status='notneeded',
                                          type='directory',
                                          **res_kwargs)
                    continue
                paths = glob(opj(topdir, '*'))
                if not paths:
                    yield get_status_dict(path=topdir,
                                          status='notneeded',
                                          type='directory',
                                          **res_kwargs)
                    continue
                pl = len(paths) > 1
                message = ("Removed %d %s %s: %s", len(paths), msg,
                           sing_pl[int(pl)], ", ".join(
                               sorted([x[len(topdir) + 1:] for x in paths])))
                rmtree(topdir)
                yield get_status_dict(path=topdir,
                                      status='ok',
                                      type='dir',
                                      message=message,
                                      **res_kwargs)
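A short usage sketch for the interface above (hypothetical dataset path): restrict the clean-up to cached extracted archives and leave the other known targets untouched.

# Usage sketch (hypothetical path); `clean` is bound as a Dataset method
# via @datasetmethod above.
from datalad.api import Dataset

ds = Dataset('/data/super')
for res in ds.clean(what=['cached-archives'], return_type='generator'):
    print(res.get('status'), res.get('path'))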
Example No. 12
class Create(Interface):
    """Create a new dataset from scratch.

    This command initializes a new :term:`dataset` at a given location, or the
    current directory. The new dataset can optionally be registered in an
    existing :term:`superdataset` (the new dataset's path needs to be located
    within the superdataset for that, and the superdataset needs to be given
    explicitly). It is recommended to provide a brief description to label
    the dataset's nature *and* location, e.g. "Michael's music on black
    laptop". This helps humans to identify data locations in distributed
    scenarios.  By default an identifier comprised of user and machine name,
    plus path will be generated.

    Plain Git repositories can be created via the [PY: `no_annex` PY][CMD: --no-annex CMD] flag.
    However, the result will not be a full dataset, and, consequently,
    not all features are supported (e.g. a description).

    || REFLOW >>
    To create a local version of a remote dataset use the
    :func:`~datalad.api.install` command instead.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git init`, and
      :command:`git annex init` to prepare the new dataset. Registering to a
      superdataset is performed via a :command:`git submodule add` operation
      in the discovered superdataset.
    """

    _params_ = dict(
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path where the dataset shall be created, directories
            will be created as necessary. If no location is provided, a dataset
            will be created in the current working directory. Either way the
            command will error if the target directory is not empty.
            Use `force` to create a dataset in a non-empty directory.""",
            nargs='?',
            # put dataset 2nd to avoid useless conversion
            constraints=EnsureStr() | EnsureDataset() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='PATH',
            doc="""specify the dataset to perform the create operation on. If
            a dataset is given, a new subdataset will be created in it.""",
            constraints=EnsureDataset() | EnsureNone()),
        force=Parameter(
            args=(
                "-f",
                "--force",
            ),
            doc="""enforce creation of a dataset in a non-empty directory""",
            action='store_true'),
        description=dataset_description,
        no_annex=Parameter(
            args=("--no-annex", ),
            doc="""if set, a plain Git repository will be created without any
            annex""",
            action='store_true'),
        save=nosave_opt,
        if_dirty=if_dirty_opt,
        annex_version=Parameter(
            args=("--annex-version", ),
            doc="""select a particular annex repository version. The
            list of supported versions depends on the available git-annex
            version. This should be left untouched, unless you know what
            you are doing""",
            constraints=EnsureDType(int) | EnsureNone()),
        annex_backend=Parameter(
            args=("--annex-backend", ),
            constraints=EnsureStr() | EnsureNone(),
            # not listing choices here on purpose to avoid future bugs
            doc="""set default hashing backend used by the new dataset.
            For a list of supported backends see the git-annex
            documentation. The default is optimized for maximum compatibility
            of datasets across platforms (especially those with limited
            path lengths)""",
            nargs=1),
        native_metadata_type=Parameter(
            args=('--native-metadata-type', ),
            metavar='LABEL',
            action='append',
            constraints=EnsureStr() | EnsureNone(),
            doc="""Metadata type label. Must match the name of the respective
            parser implementation in Datalad (e.g. "bids").[CMD:  This option
            can be given multiple times CMD]"""),
        shared_access=shared_access_opt,
        git_opts=git_opts,
        annex_opts=annex_opts,
        annex_init_opts=annex_init_opts,
    )

    @staticmethod
    @datasetmethod(name='create')
    def __call__(path=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 save=True,
                 annex_version=None,
                 annex_backend='MD5E',
                 native_metadata_type=None,
                 if_dirty='save-before',
                 shared_access=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_init_opts=None):

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex and declaring no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex init and declaring no "
                                 "annex repo.")

        if not isinstance(force, bool):
            raise ValueError(
                "force should be bool, got %r.  Did you mean to provide a 'path'?"
                % force)

        # straight from input arg, no messing around before this
        if path is None:
            if dataset is None:
                # nothing given explicitly, assume create fresh right here
                path = getpwd()
            else:
                # no path, but dataset -> create that dataset
                path = dataset.path
        else:
            # resolve the path against a potential dataset
            path = resolve_path(path, ds=dataset)

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access

        # check for sane subdataset path
        real_targetpath = with_pathsep(realpath(path))  # realpath OK
        if dataset is not None:
            # make sure we get to an expected state
            if dataset.is_installed():
                handle_dirty_dataset(dataset, if_dirty)
            if not real_targetpath.startswith(  # realpath OK
                    with_pathsep(realpath(dataset.path))):  # realpath OK
                raise ValueError("path {} outside {}".format(path, dataset))

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if dataset is not None and dataset.path == path else Dataset(
            path)

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            raise ValueError("Cannot create dataset in directory %s "
                             "(not empty). Use option 'force' in order to "
                             "ignore this and enforce creation." % tbds.path)

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(tbds.path, url=None, create=True, git_opts=git_opts)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            AnnexRepo(tbds.path,
                      url=None,
                      create=True,
                      backend=annex_backend,
                      version=annex_version,
                      description=description,
                      git_opts=git_opts,
                      annex_opts=annex_opts,
                      annex_init_opts=annex_init_opts)

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a re-create
            tbds.config.unset(id_var, where='dataset')
        tbds.config.add(id_var,
                        tbds.id if tbds.id is not None else
                        uuid.uuid1().urn.split(':')[-1],
                        where='dataset')

        # save everything
        tbds.repo.add('.datalad', git=True)

        if save:
            Save.__call__(message='[DATALAD] new dataset',
                          dataset=tbds,
                          auto_add_changes=False,
                          recursive=False)

        if dataset is not None and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            from datalad.distribution.utils import _install_subds_inplace
            subdsrelpath = relpath(realpath(tbds.path),
                                   realpath(dataset.path))  # realpath OK
            _install_subds_inplace(ds=dataset,
                                   path=tbds.path,
                                   relativepath=subdsrelpath)
            # this will have staged the changes in the superdataset already
            if save:
                Save.__call__(message='[DATALAD] added subdataset',
                              dataset=dataset,
                              auto_add_changes=False,
                              recursive=False)

        return tbds

    @staticmethod
    def result_renderer_cmdline(res, args):
        from datalad.ui import ui
        if res is None:
            ui.message("Nothing was created")
        elif isinstance(res, Dataset):
            ui.message("Created dataset at %s." % res.path)
Example No. 13
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Common interface options

"""

__docformat__ = 'restructuredtext'

from datalad.support.param import Parameter
from datalad.support.constraints import EnsureInt, EnsureNone, EnsureStr

dataset_description = Parameter(
    args=(
        "-D",
        "--description",
    ),
    constraints=EnsureStr() | EnsureNone(),
    doc="""short description of this dataset instance that humans can use to
    identify the repository/location, e.g. "Precious data on my laptop." """)

recursion_flag = Parameter(args=(
    "-r",
    "--recursive",
),
                           action="store_true",
                           doc="""if set, recurse into potential subdataset""")

recursion_limit = Parameter(
    args=("--recursion-limit", ),
    metavar="LEVELS",
    constraints=EnsureInt() | EnsureNone(),
    doc="""limit recursion into subdataset to the given number of levels""")
Example No. 14
class Publish(Interface):
    """Publish a dataset to a known :term:`sibling`.

    This makes the last saved state of a dataset available to a sibling
    or special remote data store of a dataset. Any target sibling must already
    exist and be known to the dataset.

    Optionally, it is possible to limit publication to change sets relative
    to a particular point in the version history of a dataset (e.g. a release
    tag). By default, the state of the local dataset is evaluated against the
    last known state of the target sibling. Actual publication is only attempted
    if there was a change compared to the reference state, in order to speed up
    processing of large collections of datasets. Evaluation with respect to
    a particular "historic" state is only supported in conjunction with a
    specified reference dataset. Change sets are also evaluated recursively, i.e.
    only those subdatasets are published where a change was recorded that is
    reflected in the current state of the top-level reference dataset.
    See "since" option for more information.

    Only publication of saved changes is supported. Any unsaved changes in a
    dataset (hierarchy) have to be saved before publication.

    .. note::
      Power-user info: This command uses :command:`git push`, and :command:`git annex copy`
      to publish a dataset. Publication targets are either configured remote
      Git repositories, or git-annex special remotes (if they support data
      upload).

    .. note::
      The `push` command (new in 0.13.0) provides an alternative interface.
      Critical differences are that `push` transfers annexed data by default
      and does not handle sibling creation (i.e. it does not have a `--missing`
      option).
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True
    # TODO: Figure out, how to tell about tracking branch/upstream
    #      (and the respective remote)
    #      - it is used, when no destination is given
    #      - it is configured to be the given destination, if there was no
    #        upstream set up before, so you can use just "datalad publish" next
    #        time.

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the (top-level) dataset to be published. If no dataset
            is given, the datasets are determined based on the input arguments""",
            constraints=EnsureDataset() | EnsureNone()),
        to=Parameter(
            args=("--to", ),
            metavar='LABEL',
            doc="""name of the target sibling. If no name is given an attempt is
            made to identify the target based on the dataset's configuration
            (i.e. a configured tracking branch, or a single sibling that is
            configured for publication)""",
            # TODO: See TODO at top of class!
            constraints=EnsureStr() | EnsureNone()),
        since=Parameter(
            args=("--since", ),
            constraints=EnsureStr() | EnsureNone(),
            doc=
            """specifies commit-ish (tag, shasum, etc.) from which to look for
            changes to decide whether pushing is necessary.
            If '^' is given, the last state of the current branch at the sibling
            is taken as a starting point. An empty string ('') has the same
            effect and is still supported."""),
        # since: commit => .gitmodules diff to head => submodules to publish
        missing=missing_sibling_opt,
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            # TODO this description is no longer correct
            doc="path(s), that may point to file handle(s) to publish including "
            "their actual content or to subdataset(s) to be published. If a "
            "file handle is published with its data, this implicitly means "
            "to also publish the (sub)dataset it belongs to. '.' as a path "
            "is treated in a special way in the sense, that it is passed "
            "to subdatasets in case `recursive` is also given.",
            constraints=EnsureStr() | EnsureNone(),
            nargs='*'),
        force=Parameter(
            args=(
                "-f",
                "--force",
            ),
            doc="""enforce doing publish activities (git push etc) regardless of
            the analysis if they seemed needed""",
            action='store_true'),
        # TODO add option to decide what branch/repo to push
        transfer_data=Parameter(args=("--transfer-data", ),
                                doc="""ADDME""",
                                constraints=EnsureChoice(
                                    'auto', 'none', 'all')),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        git_opts=git_opts,
        annex_opts=annex_opts,
        annex_copy_opts=annex_copy_opts,
        jobs=jobs_opt,
    )

    @staticmethod
    @datasetmethod(name='publish')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 to=None,
                 since=None,
                 missing='fail',
                 force=False,
                 transfer_data='auto',
                 recursive=False,
                 recursion_limit=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_copy_opts=None,
                 jobs=None):

        # if ever we get a mode, for "with-data" we would need this
        #if dataset and not path:
        #    # act on the whole dataset if nothing else was specified
        #    path = dataset.path if isinstance(dataset, Dataset) else dataset

        if not (isinstance(dataset, Dataset) or (dataset is None and path)):
            # try to find a dataset in PWD
            dataset = require_dataset(dataset,
                                      check_installed=True,
                                      purpose='publishing')

        if (since and since != '^') and not dataset:
            raise InsufficientArgumentsError(
                'Modification detection (--since) without a base dataset '
                'is not supported')

        if dataset and since in ('', '^'):
            # only consider changes since the last update; figure out what that was
            active_branch = dataset.repo.get_active_branch()
            if to:
                # XXX here we assume one to one mapping of names from local branches
                # to the remote
                since = '%s/%s' % (to, active_branch)
                # test if such branch already exists,
                if since not in dataset.repo.get_remote_branches():
                    lgr.debug(
                        "No remote branch %s yet, so since will not be used",
                        since)
                    since = None
            else:
                # take tracking remote for the active branch
                tracked_remote, tracked_refspec = dataset.repo.get_tracking_branch(
                )
                if tracked_remote:
                    if tracked_refspec.startswith('refs/heads/'):
                        tracked_refspec = tracked_refspec[len('refs/heads/'):]
                    #to = tracked_remote
                    since = '%s/%s' % (tracked_remote, tracked_refspec)
                else:
                    lgr.info(
                        "No tracked remote for %s. since option is of no effect",
                        active_branch)
                    since = None

        # here is the plan
        # 1. figure out remote to publish to
        # 2. figure out which content needs to be published to this remote
        # 3. look for any pre-publication dependencies of that remote
        #    (i.e. remotes that need to be published to before)
        # 4. publish the content needed to go to the primary remote to
        #    the dependencies first, and to the primary afterwards
        ds_remote_info = {}

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(refds=refds_path, logger=lgr, action='publish')

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='publish',
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified="%s..HEAD" % since if since else since,
                return_type='generator',
                on_failure='ignore',
                # we cannot publish what was not committed
                force_no_revision_change_discovery=False,
                force_untracked_discovery=False  # we cannot publish untracked
        ):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            remote_info_result = None
            if ap.get('type', ap.get('type_src', 'dataset')) != 'dataset':
                # for everything that is not a dataset get the remote info
                # for the parent
                parentds = ap.get('parentds', None)
                if parentds and parentds not in ds_remote_info:
                    remote_info_result = _get_remote_info(
                        parentds, ds_remote_info, to, missing)
            else:
                # this is a dataset
                if ap.get('state', None) == 'absent':
                    continue
                # get the remote info for itself
                remote_info_result = _get_remote_info(ap['path'],
                                                      ds_remote_info, to,
                                                      missing)
                ap['process_content'] = True
            if remote_info_result is not None:
                ap['status'] = remote_info_result[0]
                ap['message'] = remote_info_result[1]
                yield ap
                continue
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert (not completed)

        lgr.debug("Evaluating %i dataset publication candidate(s)",
                  len(content_by_ds))
        # TODO: fancier sorting, so we still follow somewhat the hierarchy
        #       in sorted order, e.g.
        #  d1/sub1/sub1
        #  d1/sub1
        #  d1
        #  d2/sub1
        #  d2
        content_by_ds = OrderedDict(
            (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True))

        lgr.debug("Attempt to publish %i datasets", len(content_by_ds))
        for ds_path in content_by_ds:
            remote_info = ds_remote_info.get(ds_path, None)
            if remote_info is None:
                # maybe this dataset wasn't annotated above, try to get info
                # MIH: I think this entire if-branch is practically impossible
                # to reach. It is certainly untested, but I think this is due
                # to mutually exclusive conditions during remote_info detection
                remote_info_result = _get_remote_info(ds_path, ds_remote_info,
                                                      to, missing)
                if remote_info_result is not None:
                    yield get_status_dict(type='dataset',
                                          path=ds_path,
                                          status=remote_info_result[0],
                                          message=remote_info_result[1],
                                          **res_kwargs)
                    continue
                # continue with freshly obtained info
                remote_info = ds_remote_info[ds_path]
                # condition above must catch all other cases
                assert remote_info
            # and publish
            ds = Dataset(ds_path)
            for r in _publish_dataset(
                    ds,
                    remote=remote_info['remote'],
                    refspec=remote_info.get('refspec', None),
                    # only send paths that were explicitly requested
                    paths=[
                        p for p in content_by_ds[ds_path]
                        # do not feed (sub)dataset paths into the beast
                        # makes no sense to try to annex copy them
                        # for the base dataset itself let `transfer_data`
                        # decide
                        if p.get('type', None) != 'dataset'
                    ],
                    annex_copy_options=annex_copy_opts,
                    force=force,
                    jobs=jobs,
                    transfer_data=transfer_data,
                    **res_kwargs):
                yield r
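For orientation, a minimal usage sketch of this command via the Python API (illustrative only; the sibling name 'origin' and the generator result handling are assumptions, not taken from the code above):

# hypothetical sketch: publish the current dataset (and changed subdatasets)
# together with annexed data to a previously configured sibling 'origin'
from datalad.api import publish

for res in publish(dataset='.', to='origin', transfer_data='all',
                   recursive=True, return_type='generator'):
    # each result is a status dict, e.g. with 'status' and 'path' keys
    print(res.get('status'), res.get('path'))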
Example 15
class Uninstall(Interface):
    """Uninstall subdatasets

    This command can be used to uninstall any number of installed subdatasets.
    It will error if individual files or non-dataset directories are given as
    input (use the drop or remove command depending on the desired goal), and
    it will not uninstall top-level datasets (i.e. datasets that are not a
    subdataset of another dataset; use the remove command for this purpose).

    By default, the availability of at least one remote copy for each currently
    available file in any dataset is verified. As these checks could lead to
    slow operation (network latencies, etc), they can be disabled.

    Any number of paths to process can be given as input. Recursion into
    subdatasets needs to be explicitly enabled, while recursion into
    subdirectories within a dataset is done automatically. An optional
    recursion limit is applied relative to each given input path.

    """
    _action = 'uninstall'

    _params_ = dict(
        dataset=dataset_argument,
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="path/name of the component to be uninstalled",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        check=check_argument,
        if_dirty=if_dirty_opt,
    )

    _examples_ = [
        dict(text="Uninstall a subdataset (undo installation)",
             code_py="uninstall(path='path/to/subds')",
             code_cmd="datalad uninstall <path/to/subds>"),
        dict(text="Uninstall a subdataset and all potential subdatasets",
             code_py="uninstall(path='path/to/subds', recursive=True)",
             code_cmd="datalad uninstall -r <path/to/subds>"),
        dict(
            text="Skip checks that ensure a minimal number of (remote) sources",
            code_py="uninstall(path='path/to/subds', check=False)",
            code_cmd="datalad uninstall <path/to/subds> --nocheck"),
    ]

    @staticmethod
    @datasetmethod(name=_action)
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 check=True,
                 if_dirty='save-before'):
        refds = require_dataset(dataset,
                                check_installed=True,
                                purpose='uninstalling')
        res_kwargs = dict(action='uninstall', logger=lgr, refds=refds.path)
        if not path:
            # if no path is given, i.e. refds is supposed to be uninstalled,
            # check if refds is a subdataset itself; if not, error out
            # we only need to test that for the refds, everything else
            # will be guaranteed to be a subdataset
            parentds = refds.get_superdataset(
                datalad_only=False,
                topmost=False,
                # unless it is properly registered we have no way of
                # reinstalling it
                registered_only=True)
            if parentds is None:
                yield dict(
                    res_kwargs,
                    path=refds.path,
                    type='dataset',
                    status='error',
                    message="will not uninstall top-level dataset "
                    "(consider `remove` command)",
                )
                return

        saw_subds = False
        for ds in itertools.chain(
                Subdatasets.__call__(
                    # it is critical to pass the dataset arg as-is
                    # to not invalidate the path argument semantics
                    # in subdatasets()
                    dataset=dataset,
                    path=path,
                    fulfilled=True,
                    # makes no sense to ignore subdatasets further down
                    recursive=True,
                    # important to start at the bottom for proper deinit
                    bottomup=True,
                    # doesn't make sense for uninstall
                    #recursion_limit=recursion_limit,
                    return_type='generator',
                    result_renderer='disabled',
                    result_xfm='datasets') if path or recursive else [],
            [refds] if not path else []):
            if ds != refds:
                saw_subds = True

            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # we confirmed the super dataset presence above
            for r in _uninstall_dataset(ds,
                                        check=check,
                                        has_super=True,
                                        **res_kwargs):
                yield r
        # there is nothing to save at the end
        if path and not saw_subds:
            lgr.warning(
                'path constraints did not match an installed subdataset: %s',
                path)
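The _examples_ above translate to a plain Python call roughly as follows (a sketch; the subdataset path is hypothetical):

# hypothetical sketch: uninstall a subdataset and any subdatasets beneath it,
# keeping the default availability check enabled
from datalad.api import uninstall

uninstall(path='path/to/subds', recursive=True, check=True)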
Example 16
from datalad.interface.results import annexjson2result
from datalad.interface.results import success_status_map
from datalad.interface.results import results_from_annex_noinfo
from datalad.interface.utils import handle_dirty_dataset
from datalad.interface.utils import eval_results
from datalad.interface.base import build_doc

lgr = logging.getLogger('datalad.distribution.drop')

dataset_argument = Parameter(
    args=("-d", "--dataset"),
    metavar="DATASET",
    doc="""specify the dataset to perform the operation on.
    If no dataset is given, an attempt is made to identify a dataset
    based on the `path` given""",
    constraints=EnsureDataset() | EnsureNone())

check_argument = Parameter(
    args=("--nocheck", ),
    doc="""whether to perform checks to assure the configured minimum
    number of (remote) sources for data.[CMD:  Give this
    option to skip checks CMD]""",
    action="store_false",
    dest='check')


def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
Example 17
class NoAnnex(Interface):
    """Configure a dataset to never put some content into the dataset's annex

    This can be useful in mixed datasets that also contain textual data, such
    as source code, which can be efficiently and more conveniently managed
    directly in Git.

    Patterns generally look like this::

      code/*

    which would match all files in the code directory. In order to match all
    files under ``code/``, including those in all its subdirectories, use a
    pattern such as::

      code/**

    Note that the plugin works incrementally, hence any existing configuration
    (e.g. from a previous plugin run) is amended, not replaced.

    Parameters
    ----------
    ref_dir : str, optional
    makedirs : bool, optional
    """
    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureNone

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""specify the dataset to configure. If no dataset is given,
            an attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        pattern=Parameter(
            args=("--pattern", ),
            nargs='+',
            doc="""list of path patterns. Any content whose path is matching
            any pattern will not be annexed when added to a dataset, but
            instead will be tracked directly in Git. Path patterns have to be
            relative to the directory given by the `ref_dir` option. By
            default, patterns should be relative to the root of the dataset."""
        ),
        ref_dir=Parameter(
            args=("--ref-dir", ),
            doc="""Relative path (within the dataset) to the directory that is
            to be configured. All patterns are interpreted relative to this
            path, and configuration is written to a ``.gitattributes`` file in
            this directory."""),
        makedirs=Parameter(
            args=("--makedirs", ),
            action='store_true',
            doc="""If set, any missing directories will be created in order to
            be able to place a file into ``--ref-dir``."""),
    )

    @staticmethod
    @datasetmethod(name='no_annex')
    @eval_results
    def __call__(dataset, pattern, ref_dir='.', makedirs=False):
        # could be extended to accept actual largefile expressions
        from os.path import join as opj
        from os.path import isabs
        from os.path import exists
        from os import makedirs as makedirsfx
        from datalad.distribution.dataset import require_dataset
        from datalad.support.annexrepo import AnnexRepo
        from datalad.utils import assure_list

        pattern = assure_list(pattern)
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='no_annex configuration')

        res_kwargs = dict(
            path=ds.path,
            type='dataset',
            action='no_annex',
        )

        # all the ways we refused to cooperate
        if not isinstance(ds.repo, AnnexRepo):
            yield dict(res_kwargs,
                       status='notneeded',
                       message='dataset has no annex')
            return
        if any(isabs(p) for p in pattern):
            yield dict(
                res_kwargs,
                status='error',
                message=
                ('path pattern for `no_annex` configuration must be relative paths: %s',
                 pattern))
            return
        if isabs(ref_dir):
            yield dict(
                res_kwargs,
                status='error',
                message=
                ('`ref_dir` for `no_annex` configuration must be a relative path: %s',
                 ref_dir))
            return

        gitattr_dir = opj(ds.path, ref_dir)
        if not exists(gitattr_dir):
            if makedirs:
                makedirsfx(gitattr_dir)
            else:
                yield dict(
                    res_kwargs,
                    status='error',
                    message=
                    'target directory for `no_annex` does not exist (consider makedirs=True)'
                )
                return

        gitattr_file = opj(gitattr_dir, '.gitattributes')
        # use the resolved dataset (`ds`), not the raw `dataset` argument,
        # which may be a plain path rather than a Dataset instance
        ds.repo.set_gitattributes(
            [(p, {'annex.largefiles': 'nothing'}) for p in pattern],
            attrfile=gitattr_file)
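        # e.g. a pattern 'code/**' ends up in that .gitattributes file as a
        # line roughly of the form (illustration only):
        #   code/** annex.largefiles=nothing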
        yield dict(res_kwargs, status='ok')

        for r in ds.rev_save(
                gitattr_file,
                to_git=True,
                message="[DATALAD] exclude paths from annex'ing",
                result_filter=None,
                result_xfm=None):
            yield r
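A brief usage sketch, assuming the plugin is exposed as datalad.api.no_annex (as the datasetmethod name above suggests); the patterns are hypothetical:

# hypothetical sketch: keep code and JSON sidecar files in Git rather than annex
from datalad.api import no_annex

no_annex(dataset='.', pattern=['code/**', '*.json'])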
Example 18
class Drop(Interface):
    """Drop file content from datasets

    This command takes any number of paths of files and/or directories. If
    a common (super)dataset is given explicitly, the given paths are
    interpreted relative to this dataset.

    Recursion into subdatasets needs to be explicitly enabled, while recursion
    into subdirectories within a dataset is done automatically. An optional
    recursion limit is applied relative to each given input path.

    By default, the availability of at least one remote copy is verified before
    file content is dropped. As these checks could lead to slow operation
    (network latencies, etc), they can be disabled.

    Examples:

      Drop all file content in a dataset::

        ~/some/dataset$ datalad drop

      Drop all file content in a dataset and all its subdatasets::

        ~/some/dataset$ datalad drop --recursive

    """
    _action = 'drop'

    _params_ = dict(
        dataset=dataset_argument,
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="path/name of the component to be dropped",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        check=check_argument,
        if_dirty=if_dirty_opt,
    )

    @staticmethod
    @datasetmethod(name=_action)
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 check=True,
                 if_dirty='save-before'):

        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `drop`: requires at least a path or dataset"
            )
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        to_drop = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='drop',
                # justification for status:
                # content need not be dropped where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                ap['process_content'] = True
            if ap.get('registered_subds', False) and ap.get('state',
                                                            None) == 'absent':
                # nothing to drop in an absent subdataset, don't be annoying
                # and skip silently
                continue
            to_drop.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_drop,
                refds_path=refds_path)
        assert (not completed)

        # iterate over all datasets, order doesn't matter
        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # ignore submodule entries
            content = [
                ap['path'] for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds.path
            ]
            if not content:
                continue
            for r in _drop_files(ds, content, check=check, **res_kwargs):
                yield r
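The docstring examples above have a Python API counterpart along these lines (a sketch; the 'data' path and the disabled check are illustrative choices):

# hypothetical sketch: drop annexed content under 'data/' in the current
# dataset without verifying remote availability first (--nocheck equivalent)
from datalad.api import drop

for res in drop(path='data', check=False, return_type='generator'):
    print(res.get('status'), res.get('path'))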
Example 19
class Spec2Bids(Interface):
    """Convert to BIDS based on study specification
    """

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""bids dataset""",
            constraints=EnsureDataset() | EnsureNone()),
        specfile=Parameter(
            args=("specfile",),
            metavar="SPEC_FILE",
            doc="""path(s) to the specification file(s) to use for conversion.
             If a directory at the first level beneath the dataset's root is
             given instead of a file, it is assumed to be an acquisition directory
             that contains a specification file.
             By default this is a file named 'studyspec.json' in the
             acquisition directory. This default name can be configured via the
             'datalad.hirni.studyspec.filename' config variable.
             """,
            nargs="*",
            constraints=EnsureStr()),
        anonymize=Parameter(
            args=("--anonymize",),
            action="store_true",
            doc="""whether or not to anonymize for conversion. By now this means
            to use 'anon_subject' instead of 'subject' from spec and to use 
            datalad-run with a sidecar file, to not leak potentially identifying 
            information into its record.""",),
        only_type=Parameter(
            args=("--only-type",),
            metavar="TYPE",
            doc="specify snippet type to convert. If given only this type of "
                "specification snippets is considered for conversion",
            constraints=EnsureStr() | EnsureNone(),)
    )

    @staticmethod
    @datasetmethod(name='hirni_spec2bids')
    @eval_results
    def __call__(specfile, dataset=None, anonymize=False, only_type=None):

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose="spec2bids")

        specfile = assure_list(specfile)
        specfile = [resolve_path(p, dataset) for p in specfile]

        for spec_path in specfile:

            # Note/TODO: ran_procedure per spec file still isn't ideal. Could
            # be different spec files for same acquisition. It's actually about
            # the exact same call. How to best get around substitutions?
            # Also: per snippet isn't correct either.
            # substitutions is real issue. Example "copy {location} ."
            #
            # => datalad.interface.run.format_command / normalize_command ?

            # TODO: Also can we skip prepare_inputs within run? At least specify
            # more specifically. Note: Can be globbed!

            ran_procedure = dict()

            if not lexists(spec_path):
                yield get_status_dict(
                    action='spec2bids',
                    path=spec_path,
                    status='impossible',
                    message="{} not found".format(spec_path)
                )
                # nothing to convert from a missing spec; skip this path
                continue

            if op.isdir(spec_path):
                if op.realpath(op.join(spec_path, op.pardir)) == \
                        op.realpath(dataset.path):
                    spec_path = op.join(
                            spec_path,
                            dataset.config.get(
                                    "datalad.hirni.studyspec.filename",
                                    "studyspec.json")
                    )
                    # TODO: check existence of that file!
                else:
                    yield get_status_dict(
                        action='spec2bids',
                        path=spec_path,
                        status='impossible',
                        message="{} is neither a specification file nor an "
                                "acquisition directory".format(spec_path)
                    )
                    # not a usable spec source; skip this path
                    continue

            # relative path to spec to be recorded:
            rel_spec_path = relpath(spec_path, dataset.path) \
                if isabs(spec_path) else spec_path

            # check each dict (snippet) in the specification for what to do
            # wrt conversion:
            for spec_snippet in load_stream(spec_path):

                if only_type and not spec_snippet['type'].startswith(only_type):
                    # ignore snippets not matching `only_type`
                    # Note/TODO: the .startswith part is meant for
                    # matching "dicomseries:all" to given "dicomseries" but not
                    # vice versa. This prob. needs refinement (and doc)
                    continue

                if 'procedures' not in spec_snippet:
                    # no conversion procedures defined at all:
                    yield get_status_dict(
                            action='spec2bids',
                            path=spec_path,
                            snippet=spec_snippet,
                            status='notneeded',
                    )
                    continue

                procedure_list = spec_snippet['procedures']
                if not procedure_list:
                    # no conversion procedures defined at all:
                    yield get_status_dict(
                            action='spec2bids',
                            path=spec_path,
                            snippet=spec_snippet,
                            status='notneeded',
                    )
                    continue

                # accept a single dict as a one item list:
                if isinstance(procedure_list, dict):
                    procedure_list = [procedure_list]

                # build a dict available for placeholders in format strings:
                # Note: This is flattening the structure since we don't need
                # value/approved for the substitutions. In addition 'subject'
                # and 'anon_subject' are not passed on, but a new key
                # 'bids_subject' instead the value of which depends on the
                # --anonymize switch.
                # Additionally 'location' is recomputed to be relative to
                # dataset.path, since this is where the procedures are running
                # from within.
                replacements = dict()
                for k, v in spec_snippet.items():
                    if k == 'subject':
                        if not anonymize:
                            replacements['bids-subject'] = v['value']
                    elif k == 'anon-subject':
                        if anonymize:
                            replacements['bids-subject'] = v['value']
                    elif k == 'location':
                        replacements[k] = op.join(op.dirname(rel_spec_path), v)
                    elif k == 'procedures':
                        # 'procedures' is a list of dicts (not suitable for
                        # substitutions) and it makes little sense to be
                        # referenced by converter format strings anyway:
                        continue
                    else:
                        replacements[k] = v['value'] if isinstance(v, dict) else v

                # build dict to patch os.environ with for passing
                # replacements on to procedures:
                env_subs = dict()
                for k, v in replacements.items():
                    env_subs['DATALAD_RUN_SUBSTITUTIONS_{}'
                             ''.format(k.upper().replace('-', '__'))] = str(v)
                env_subs['DATALAD_RUN_SUBSTITUTIONS_SPECPATH'] = rel_spec_path
                env_subs['DATALAD_RUN_SUBSTITUTIONS_ANONYMIZE'] = str(anonymize)
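                # e.g. a 'bids-subject' replacement of '02' would be exported
                # as (illustration): DATALAD_RUN_SUBSTITUTIONS_BIDS__SUBJECT=02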

                # TODO: The above two blocks to build replacements dict and
                # env_subs should be joined eventually.

                for proc in procedure_list:
                    if has_specval(proc, 'procedure-name'):
                        proc_name = get_specval(proc, 'procedure-name')
                    else:
                        # invalid procedure spec
                        lgr.warning("conversion procedure missing key "
                                    "'procedure-name' in %s: %s",
                                    spec_path, proc)
                        # TODO: continue or yield impossible/error so it can be
                        # dealt with via on_failure?
                        continue

                    if has_specval(proc, 'on-anonymize') \
                        and anything2bool(
                            get_specval(proc, 'on-anonymize')
                            ) and not anonymize:
                        # don't run that procedure, if we weren't called with
                        # --anonymize while procedure is specified to be run on
                        # that switch only
                        continue

                    proc_call = get_specval(proc, 'procedure-call') \
                        if has_specval(proc, 'procedure-call') \
                        else None

                    if ran_procedure.get(hash((proc_name, proc_call)), None):
                        # if we ran the exact same call already,
                        # don't call it again
                        # TODO: notneeded?
                        continue

                    # if spec comes with call format string, it takes precedence
                    # over what is generally configured for the procedure
                    # TODO: Not sure yet whether this is how we should deal with it
                    if proc_call:
                        env_subs['DATALAD_PROCEDURES_{}_CALL__FORMAT'
                                 ''.format(proc_name.upper().replace('-', '__'))
                                 ] = proc_call

                    run_results = list()
                    # Note that we can't use dataset.config.overrides to
                    # pass run-substitution config to procedures, since we
                    # leave the python context and thereby lose the dataset
                    # instance. Use patched os.environ instead. Note also
                    # that this requires names of substitutions to not
                    # contain underscores, since they would be translated to
                    # '.' by ConfigManager when reading them from within the
                    # procedure's datalad-run calls.
                    from mock import patch

                    # TODO: Reconsider that patching. Shouldn't it be an update?
                    with patch.dict('os.environ', env_subs):
                        # apparently reload is necessary to consider config
                        # overrides via env:
                        dataset.config.reload()
                        for r in dataset.run_procedure(
                                spec=proc_name,
                                return_type='generator'
                        ):

                            # # if there was an issue yield original result,
                            # # otherwise swallow:
                            # if r['status'] not in ['ok', 'notneeded']:
                            yield r
                            run_results.append(r)

                    if not all(r['status'] in ['ok', 'notneeded']
                               for r in run_results):
                        yield {'action': proc_name,
                               'path': spec_path,
                               'snippet': spec_snippet,
                               'status': 'error',
                               'message': "acquisition conversion failed. "
                                          "See previous message(s)."}

                    else:
                        yield {'action': proc_name,
                               'path': spec_path,
                               'snippet': spec_snippet,
                               'status': 'ok',
                               'message': "acquisition converted."}

                    # mark as a procedure we ran on this acquisition:
                    # TODO: rethink. Doesn't work that way. Disabled for now
                    # ran_procedure[hash((proc_name, proc_call))] = True




                    # elif proc_name != 'hirni-dicom-converter':
                    #     # specific converter procedure call
                    #
                    #     from mock import patch
                    #     with patch.dict('os.environ', env_subs):
                    #         # apparently reload is necessary to consider config
                    #         # overrides via env:
                    #         dataset.config.reload()
                    #
                    #         for r in dataset.run_procedure(
                    #                 spec=[proc_name, rel_spec_path, anonymize],
                    #                 return_type='generator'
                    #         ):
                    #
                    #             # if there was an issue with containers-run,
                    #             # yield original result, otherwise swallow:
                    #             if r['status'] not in ['ok', 'notneeded']:
                    #                 yield r
                    #
                    #             run_results.append(r)
                    #
                    #     if not all(r['status'] in ['ok', 'notneeded']
                    #                for r in run_results):
                    #         yield {'action': proc_name,
                    #                'path': spec_path,
                    #                'snippet': spec_snippet,
                    #                'status': 'error',
                    #                'message': "Conversion failed. "
                    #                           "See previous message(s)."}
                    #
                    #     else:
                    #         yield {'action': proc_name,
                    #                'path': spec_path,
                    #                'snippet': spec_snippet,
                    #                'status': 'ok',
                    #                'message': "specification converted."}

                    # elif ran_heudiconv and proc_name == 'hirni-dicom-converter':
                    #     # in this case we acted upon this snippet already and
                    #     # do not have to produce a result
                    #     pass
                    #
                    # else:
                    #     # this shouldn't happen!
                    #     raise RuntimeError

            yield {'action': 'spec2bids',
                   'path': spec_path,
                   'status': 'ok'}
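A hedged usage sketch for this datalad-hirni command, assuming it is exposed as datalad.api.hirni_spec2bids (consistent with the datasetmethod name above); the acquisition path is hypothetical:

# hypothetical sketch: convert a single acquisition of a hirni study dataset,
# anonymizing subject identifiers in the process
from datalad.api import hirni_spec2bids

hirni_spec2bids(specfile='acq1/studyspec.json', dataset='.', anonymize=True)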
Example 20
class ContainersRemove(Interface):
    # first docstring line is used as a short description in the cmdline help
    # the rest is put in the verbose help and manpage
    """Remove a known container from a dataset
    """

    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        name=Parameter(
            args=("name",),
            doc="""name of the container to remove""",
            metavar="NAME",
            constraints=EnsureStr(),
        ),
        remove_image=Parameter(
            args=("-i", "--remove-image",),
            doc="""if set, remove container image as well""",
            action="store_true",
        ),
    )

    @staticmethod
    @datasetmethod(name='containers_remove')
    @eval_results
    def __call__(name, dataset=None, remove_image=False):
        ds = require_dataset(dataset, check_installed=True,
                             purpose='remove a container')

        res = get_status_dict(
            ds=ds,
            action='containers_remove',
            logger=lgr)

        section = 'datalad.containers.{}'.format(name)
        imagecfg = '{}.image'.format(section)

        to_save = []
        if remove_image and imagecfg in ds.config:
            imagepath = ds.config.get(imagecfg)
            if op.lexists(op.join(ds.path, imagepath)):
                for r in ds.remove(
                        path=imagepath,
                        # XXX shortcoming: this is the only way to say:
                        # don't drop
                        check=False,
                        # config setting might be outdated and image no longer
                        # there -> no reason to fail, just report
                        on_failure='ignore',
                        save=False):
                    yield r
                to_save.append(imagepath)

        if section in ds.config.sections():
            ds.config.remove_section(
                section,
                where='dataset',
                reload=True)
            res['status'] = 'ok'
            to_save.append(op.join('.datalad', 'config'))
        else:
            res['status'] = 'notneeded'
        if to_save:
            for r in ds.save(
                    path=to_save,
                    message='[DATALAD] Remove container {}'.format(name)):
                yield r
        yield res
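A short usage sketch, assuming the datalad-container extension providing this command is installed (the container name is hypothetical):

# hypothetical sketch: remove a registered container and its image file
from datalad.api import containers_remove

containers_remove(name='ubuntu', remove_image=True, dataset='.')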
Example 21
class Push(Interface):
    """Push a dataset to a known :term:`sibling`.

    This makes the last saved state of a dataset available to a sibling
    or special remote data store of a dataset. Any target sibling must already
    exist and be known to the dataset.

    Optionally, it is possible to limit a push to change sets relative
    to a particular point in the version history of a dataset (e.g. a release
    tag). By default, the state of the local dataset is evaluated against the
    last known state of the target sibling. An actual push is only attempted
    if there was a change compared to the reference state, in order to speed up
    processing of large collections of datasets. Evaluation with respect to
    a particular "historic" state is only supported in conjunction with a
    specified reference dataset. Change sets are also evaluated recursively, i.e.
    only those subdatasets are pushed where a change was recorded that is
    reflected in the current state of the top-level reference dataset.
    See "since" option for more information.

    Only a push of saved changes is supported.

    .. note::
      Power-user info: This command uses :command:`git push`, and :command:`git annex copy`
      to push a dataset. Publication targets are either configured remote
      Git repositories, or git-annex special remotes (if they support data
      upload).
    """

    # TODO add examples

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to push""",
                          constraints=EnsureDataset() | EnsureNone()),
        to=Parameter(
            args=("--to", ),
            metavar='SIBLING',
            doc="""name of the target sibling. If no name is given an attempt is
            made to identify the target based on the dataset's configuration
            (i.e. a configured tracking branch, or a single sibling that is
            configured for push)""",
            constraints=EnsureStr() | EnsureNone()),
        since=Parameter(
            args=("--since", ),
            constraints=EnsureStr() | EnsureNone(),
            doc=
            """specifies commit-ish (tag, shasum, etc.) from which to look for
            changes to decide whether pushing is necessary.
            If an empty string is given, the last state of the current branch
            at the sibling is taken as a starting point."""),
        path=Parameter(args=("path", ),
                       metavar='PATH',
                       doc="""path to contrain a push to. If given, only
            data or changes for those paths are considered for a push.""",
                       nargs='*',
                       constraints=EnsureStr() | EnsureNone()),
        force=Parameter(
            # multi-mode option https://github.com/datalad/datalad/issues/3414
            args=(
                "-f",
                "--force",
            ),
            doc="""force particular operations, overruling automatic decision
            making: use --force with git-push ('gitpush'); do not use --fast
            with git-annex copy ('datatransfer'); do not attempt to copy
            annex'ed file content ('no-datatransfer'); combine force modes
            'gitpush' and 'datatransfer' ('all').""",
            constraints=EnsureChoice('all', 'gitpush', 'no-datatransfer',
                                     'datatransfer', None)),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        jobs=jobs_opt,
    )

    # Desired features:
    # - let Git do its thing (push multiple configured refs without the need
    #                         to specify anything on the command line)
    #   - complication: we need publication dependencies (i.e. publish what
    #     would be published by Git to a different remote first, hence we
    #     cannot simply watch Git do it, and later act on it.)
    #   - https://github.com/datalad/datalad/issues/1284
    #   - https://github.com/datalad/datalad/issues/4006
    # - make differences between remotes and various types of special remotes
    #   opaque
    #   - https://github.com/datalad/datalad/issues/3127
    # - informative and comprehensive (error) reporting
    #   - https://github.com/datalad/datalad/issues/2000
    #   - https://github.com/datalad/datalad/issues/1682
    #   - https://github.com/datalad/datalad/issues/2029
    #   - https://github.com/datalad/datalad/issues/2855
    #   - https://github.com/datalad/datalad/issues/3412
    #   - https://github.com/datalad/datalad/issues/3424
    # - ensure robust behavior in multi-lateral push scenarios (updating
    #   a dataset that was updated by a 3rd-party after the last known
    #   fetched change)
    #   - https://github.com/datalad/datalad/issues/2636
    # - should NOT mimic `publish` in that it mixes `create-sibling` and
    #   `push` into a single operation. This would fold the complexity
    #   of all possible ways a local dataset hierarchy could possibly be
    #   connected to remote ends into this command. It would be a lost battle
    #   from the start.
    #   - not tackle: https://github.com/datalad/datalad/issues/2186
    # - maintain standard setup, and not reflect procedural aspects
    #   onto the resulting outcomes
    #   - https://github.com/datalad/datalad/issues/2001
    # - do a straight push, nothing like 'sync'. If a remote has something that
    #   needs merging first, fail and let users update. Any diff we are missing
    #   locally can impact decision making via --since and friends.

    @staticmethod
    @datasetmethod(name='push')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 to=None,
                 since=None,
                 force=None,
                 recursive=False,
                 recursion_limit=None,
                 jobs=None):
        # we resolve here, because we need to perform inspection on what was given
        # as an input argument further down
        paths = [resolve_path(p, dataset) for p in assure_list(path)]

        ds = require_dataset(dataset, check_installed=True, purpose='pushing')
        ds_repo = ds.repo

        res_kwargs = dict(
            action='publish',
            refds=ds.path,
            logger=lgr,
        )

        get_remote_kwargs = {'exclude_special_remotes': False} \
            if isinstance(ds_repo, AnnexRepo) else {}
        if to and to not in ds_repo.get_remotes(**get_remote_kwargs):
            # get again for proper error:
            sr = ds_repo.get_remotes(**get_remote_kwargs)
            # yield an error result instead of raising a ValueError,
            # to enable the use case of pushing to a target that
            # a superdataset doesn't know, but some subdatasets do
            # (in combination with '--on-failure ignore')
            yield dict(res_kwargs,
                       status='error',
                       message="Unknown push target '{}'. {}".format(
                           to, 'Known targets: {}.'.format(', '.join(
                               repr(s) for s in sr))
                           if sr else 'No targets configured in dataset.'))
            return

        if since:
            # will blow with ValueError if unusable
            ds_repo.get_hexsha(since)

        if not since and since is not None:
            # special case: --since=''
            # figure out state of remote branch and set `since`
            since = _get_corresponding_remote_state(ds_repo, to)
            if not since:
                lgr.info("No tracked remote for active branch, "
                         "detection of last pushed state not in effect.")

        # obtain a generator for information on the datasets to process
        # idea is to turn the `paths` argument into per-dataset
        # content listings that can be acted upon
        ds_spec = _datasets_since_(
            # important to pass unchanged dataset arg
            dataset,
            since,
            paths,
            recursive,
            recursion_limit)

        # instead of a loop, this could all be done in parallel
        matched_anything = False
        for dspath, dsrecords in ds_spec:
            matched_anything = True
            lgr.debug('Attempt push of Dataset at %s', dspath)
            pbars = {}
            yield from _push(dspath,
                             dsrecords,
                             to,
                             force,
                             jobs,
                             res_kwargs.copy(),
                             pbars,
                             got_path_arg=True if path else False)
            # take down progress bars for this dataset
            for i, ds in pbars.items():
                log_progress(lgr.info, i, 'Finished push of %s', ds)
        if not matched_anything:
            yield dict(
                res_kwargs,
                status='notneeded',
                message=
                'Given constraints did not match any changes to publish',
                type='dataset',
                path=ds.path,
            )

    @staticmethod
    def custom_result_summary_renderer(results):  # pragma: more cover
        # report on any hints at the end
        # get all unique hints
        hints = set([r.get('hints', None) for r in results])
        hints = [hint for hint in hints if hint is not None]
        if hints:
            from datalad.ui import ui
            from datalad.support import ansi_colors
            intro = ansi_colors.color_word(
                "Potential hints to solve encountered errors: ",
                ansi_colors.YELLOW)
            ui.message(intro)
            [
                ui.message("{}: {}".format(
                    ansi_colors.color_word(id + 1, ansi_colors.YELLOW), hint))
                for id, hint in enumerate(hints)
            ]
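Since the class itself still carries a "TODO add examples" note, here is a hedged usage sketch (the sibling name 'origin' is an assumption):

# hypothetical sketch: push the current dataset and changed subdatasets to the
# sibling 'origin', limited to changes since the last known state at the sibling
from datalad.api import push

for res in push(dataset='.', to='origin', since='', recursive=True,
                return_type='generator'):
    print(res.get('status'), res.get('path'))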
Example 22
class ExportArchive(Interface):
    """Export the content of a dataset as a TAR/ZIP archive.
    """
    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import (
        EnsureChoice,
        EnsureNone,
        EnsureStr,
    )

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""specify the dataset to export. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        filename=Parameter(
            args=("filename", ),
            metavar="PATH",
            nargs='?',
            doc="""File name of the generated TAR archive. If no file name is
            given the archive will be generated in the current directory and
            will be named: datalad_<dataset_uuid>.(tar.*|zip). To generate that
            file in a different directory, provide an existing directory as the
            file name.""",
            constraints=EnsureStr() | EnsureNone()),
        archivetype=Parameter(args=("-t", "--archivetype"),
                              doc="""Type of archive to generate.""",
                              constraints=EnsureChoice("tar", "zip")),
        compression=Parameter(
            args=("-c", "--compression"),
            doc="""Compression method to use.  'bz2' is not supported for ZIP
            archives.  No compression is used when an empty string is
            given.""",
            constraints=EnsureChoice("gz", "bz2", "")),
        missing_content=Parameter(
            args=("--missing-content", ),
            doc="""By default, any discovered file with missing content will
            result in an error and the export is aborted. Setting this to
            'continue' will issue warnings instead of failing on error. The
            value 'ignore' will only inform about problem at the 'debug' log
            level. The latter two can be helpful when generating a TAR archive
            from a dataset where some file content is not available
            locally.""",
            constraints=EnsureChoice("error", "continue", "ignore")),
    )

    @staticmethod
    @datasetmethod(name='export_archive')
    @eval_results
    def __call__(dataset,
                 filename=None,
                 archivetype='tar',
                 compression='gz',
                 missing_content='error'):
        import os
        import tarfile
        import zipfile
        from unittest.mock import patch
        from os.path import join as opj, dirname, normpath, isabs
        import os.path as op

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo
        from datalad.dochelpers import exc_str

        import logging
        lgr = logging.getLogger('datalad.local.export_archive')

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit when any particular
            # file was changed -- would be the most kosher yoh thinks to the
            # degree of our abilities
            ti.mtime = committed_date
            return ti

        tar_args = dict(recursive=False, filter=_filter_tarinfo)

        file_extension = '.{}{}'.format(
            archivetype, '{}{}'.format('.' if compression else '', compression)
            if archivetype == 'tar' else '')
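        # e.g. archivetype='tar', compression='gz' -> '.tar.gz';
        #      archivetype='tar', compression=''   -> '.tar';
        #      archivetype='zip' (suffix ignores compression) -> '.zip'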

        default_filename = "datalad_{.id}".format(dataset)
        if filename is None:
            filename = default_filename  # in current directory
        elif op.exists(filename) and op.isdir(filename):
            # place the archive under the given existing directory
            filename = op.join(filename, default_filename)
        if not filename.endswith(file_extension):
            filename += file_extension

        root = dataset.path
        # use dir inside matching the output filename
        # TODO: could be an option to the export plugin allowing empty value
        # for no leading dir
        leading_dir = file_basename(filename)

        # workaround for inability to pass down the time stamp
        with patch('time.time', return_value=committed_date), \
                tarfile.open(filename, "w:{}".format(compression)) \
                if archivetype == 'tar' \
                else zipfile.ZipFile(
                    filename, 'w',
                    zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
                as archive:
            add_method = archive.add if archivetype == 'tar' else archive.write
            repo_files = sorted(repo.get_indexed_files())
            if isinstance(repo, AnnexRepo):
                annexed = repo.is_under_annex(repo_files,
                                              allow_quick=True,
                                              batch=True)
                # remember: returns False for files in Git!
                has_content = repo.file_has_content(repo_files,
                                                    allow_quick=True,
                                                    batch=True)
            else:
                annexed = [False] * len(repo_files)
                has_content = [True] * len(repo_files)
            for i, rpath in enumerate(repo_files):
                fpath = opj(root, rpath)
                if annexed[i]:
                    if not has_content[i]:
                        if missing_content in ('ignore', 'continue'):
                            (lgr.warning
                             if missing_content == 'continue' else lgr.debug)(
                                 'File %s has no content available, skipped',
                                 fpath)
                            continue
                        else:
                            raise IOError('File %s has no content available' %
                                          fpath)

                    # resolve to possible link target
                    if op.islink(fpath):
                        link_target = os.readlink(fpath)
                        if not isabs(link_target):
                            link_target = normpath(
                                opj(dirname(fpath), link_target))
                        fpath = link_target
                # name in the archive
                aname = normpath(opj(leading_dir, rpath))
                add_method(fpath,
                           arcname=aname,
                           **(tar_args if archivetype == 'tar' else {}))

        if not isabs(filename):
            filename = opj(os.getcwd(), filename)

        yield dict(status='ok',
                   path=filename,
                   type='file',
                   action='export_archive',
                   logger=lgr)
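A minimal usage sketch (the output path and archive type are illustrative choices):

# hypothetical sketch: export the current dataset as an uncompressed ZIP
# archive; the '.zip' extension is appended automatically
from datalad.api import export_archive

export_archive(dataset='.', filename='/tmp/my_dataset_snapshot',
               archivetype='zip', compression='')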
Example 23
class Install(Interface):
    """Install a dataset from a (remote) source.

    This command creates a local :term:`sibling` of an existing dataset from a
    (remote) location identified via a URL or path. Optional recursion into
    potential subdatasets, and download of all referenced data is supported.
    The new dataset can be optionally registered in an existing
    :term:`superdataset` by identifying it via the `dataset` argument (the new
    dataset's path needs to be located within the superdataset for that).

    It is recommended to provide a brief description to label the dataset's
    nature *and* location, e.g. "Michael's music on black laptop". This helps
    humans to identify data locations in distributed scenarios.  By default an
    identifier comprised of user and machine name, plus path will be generated.

    When only partial dataset content shall be obtained, it is recommended to
    use this command without the `get-data` flag, followed by a
    :func:`~datalad.api.get` operation to obtain the desired data.

    .. note::
      Power-user info: This command uses :command:`git clone`, and
      :command:`git annex init` to prepare the dataset. Registering to a
      superdataset is performed via a :command:`git submodule add` operation
      in the discovered superdataset.
    """

    # very frequently this command will yield exactly one installed dataset
    # spare people the pain of going through a list by default
    return_type = 'item-or-list'
    # as discussed in #1409 and #1470, we want to return dataset instances
    # matching what is actually available after command completion (and
    # None for any failed dataset installation)
    # TODO actually need success(containing)dataset-or-none
    result_xfm = 'successdatasets-or-none'
    # we also want to limit the returned result to explicit input arguments
    # (paths/source) and not report any implicit action, like intermediate
    # datasets
    result_filter = is_result_matching_pathsource_argument

    _examples_ = [
        dict(text="Install a dataset from Github into the current directory",
             code_py="install("
             "source='https://github.com/datalad-datasets/longnow"
             "-podcasts.git')",
             code_cmd="datalad install "
             "https://github.com/datalad-datasets/longnow-podcasts.git"),
        dict(text="Install a dataset as a subdataset into the current dataset",
             code_py="""\
             install(dataset='.',
                     source='https://github.com/datalad-datasets/longnow-podcasts.git')""",
             code_cmd="""\
             datalad install -d . \\
             --source='https://github.com/datalad-datasets/longnow-podcasts.git'"""
             ),
        dict(text="Install a dataset, and get all content right away",
             code_py="""\
             install(source='https://github.com/datalad-datasets/longnow-podcasts.git',
                     get_data=True)""",
             code_cmd="""\
             datalad install --get-data \\
             -s https://github.com/datalad-datasets/longnow-podcasts.git"""),
        dict(text="Install a dataset with all its subdatasets",
             code_py="""\
             install(source='https://github.com/datalad-datasets/longnow-podcasts.git',
                     recursive=True)""",
             code_cmd="""\
             datalad install -r \\
             https://github.com/datalad-datasets/longnow-podcasts.git"""),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            # TODO: this probably changes to install into the dataset (add_to_super)
            # and to install the thing 'just there' without operating 'on' a dataset.
            # Adapt doc.
            # MIH: shouldn't this be the job of `add`?
            doc="""specify the dataset to perform the install operation on.  If
            no dataset is given, an attempt is made to identify the dataset
            in a parent directory of the current working directory and/or the
            `path` given""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            nargs="*",
            # doc: TODO
            doc="""path/name of the installation target.  If no `path` is
            provided a destination path will be derived from a source URL
            similar to :command:`git clone`"""),
        source=Parameter(args=("-s", "--source"),
                         metavar='SOURCE',
                         doc="URL or local path of the installation source",
                         constraints=EnsureStr() | EnsureNone()),
        get_data=Parameter(
            args=("-g", "--get-data"),
            doc="""if given, obtain all data content too""",
            action="store_true"),
        description=location_description,
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        save=nosave_opt,
        reckless=reckless_opt,
        jobs=jobs_opt,
    )

    @staticmethod
    @datasetmethod(name='install')
    @eval_results
    def __call__(path=None,
                 source=None,
                 dataset=None,
                 get_data=False,
                 description=None,
                 recursive=False,
                 recursion_limit=None,
                 save=True,
                 reckless=None,
                 jobs="auto"):

        # normalize path argument to be equal when called from cmdline and
        # python and nothing was passed into `path`
        path = ensure_list(path)

        if not source and not path:
            raise InsufficientArgumentsError(
                "Please provide at least a source or a path")

        # Common kwargs to pass to underlying git/install calls.
        # They might need adjustments (e.g. for recursion_limit), but
        # otherwise are applicable throughout.
        #
        # There could be more common options here, since the underlying
        # `get` performs similar installs.
        common_kwargs = dict(
            get_data=get_data,
            recursive=recursive,
            recursion_limit=recursion_limit,
            # git_opts=git_opts,
            # annex_opts=annex_opts,
            reckless=reckless,
            jobs=jobs,
        )

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = None
        if dataset is not None:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='installation')
            common_kwargs['dataset'] = dataset
        # pre-compute for results below
        refds_path = Interface.get_refds_path(ds)

        # switch into the two scenarios without --source:
        # 1. list of URLs
        # 2. list of (sub)dataset content
        if source is None:
            # we need to collect URLs and paths
            to_install = []
            to_get = []
            # TODO: this approach is problematic, it disrupts the order of input args.
            # consequently results will be returned in an unexpected order when a
            # mixture of source URL and paths is given. Reordering is only possible when
            # everything in here is fully processed before any results can be yielded.
            # moreover, I think the semantics of the status quo implementation are a
            # bit complicated: in a mixture list a source URL will lead to a new dataset
            # at a generated default location, but a path will lead to a subdataset
            # at that exact location
            for urlpath in path:
                ri = RI(urlpath)
                (to_get
                 if isinstance(ri, PathRI) else to_install).append(urlpath)

            # 1. multiple source URLs
            for s in to_install:
                lgr.debug("Install passes into install source=%s", s)
                for r in Install.__call__(
                        source=s,
                        description=description,
                        save=save,
                        # we need to disable error handling in order to have it done at
                        # the very top, otherwise we are not able to order a global
                        # "ignore-and-keep-going"
                        on_failure='ignore',
                        return_type='generator',
                        result_xfm=None,
                        result_filter=None,
                        **common_kwargs):
                    # no post-processing of the installed content on disk
                    # should be necessary here, all done by code further
                    # down that deals with an install from an actual `source`
                    # any necessary fixes should go there too!
                    r['refds'] = refds_path
                    yield r

            # 2. one or more dataset content paths
            if to_get:
                lgr.debug("Install passes into get %d items", len(to_get))
                # the commented-out options below hint at settings that cannot
                # currently be passed into the underlying install-related calls.
                # Also would need to be passed on from get:
                #  annex_get_opts

                for r in Get.__call__(
                        to_get,
                        # TODO should pass-through description, not sure why disabled
                        # description=description,
                        # we need to disable error handling in order to have it done at
                        # the very top, otherwise we are not able to order a global
                        # "ignore-and-keep-going"
                        on_failure='ignore',
                        return_type='generator',
                        result_xfm=None,
                        result_filter=None,
                        **common_kwargs):
                    # no post-processing of get'ed content on disk should be
                    # necessary here, this is the responsibility of `get`
                    # (incl. adjusting the parent's .gitmodules when submodules
                    # end up in an "updated" state; done in the get helpers).
                    # any required fixes should go there!
                    r['refds'] = refds_path
                    yield r

            # we are done here
            # the rest is about install from a `source`
            return

        # an actual `source` was given
        if source and path and len(path) > 1:
            # an exception is ok here; if this fails it is either a direct
            # user error, or we messed up one of our internal calls
            raise ValueError(
                "install needs a single PATH when source is provided.  "
                "Was given multiple PATHs: %s" % str(path))

        # parameter constraints:
        if not source:
            # an exception is ok here; if this fails it is either a direct
            # user error, or we messed up one of our internal calls
            raise InsufficientArgumentsError(
                "a `source` is required for installation")

        # code below deals with a single path only
        path = path[0] if path else None

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            yield get_status_dict(
                'install',
                path=path,
                status='impossible',
                logger=lgr,
                source_url=source,
                refds=refds_path,
                message=
                "installation `source` and destination `path` are identical. "
                "If you are trying to add a subdataset simply use the `save` command"
            )
            return

        # resolve the target location (if local) against the provided dataset
        # or CWD:
        if path is not None:
            # MIH everything in here is highly similar to what common
            # interface helpers do (or should/could do), but at the same
            # time is very much tailored to just apply to `install` -- I guess
            # it has to stay special

            # Should work out just fine for regular paths, so no additional
            # conditioning is necessary
            try:
                path_ri = RI(path)
            except Exception as e:
                raise ValueError("invalid path argument {}: ({})".format(
                    path, exc_str(e)))
            try:
                # Wouldn't work for SSHRI ATM, see TODO within SSHRI
                # yoh: path should be a local path, and mapping note within
                #      SSHRI about mapping localhost:path to path is kinda
                #      a peculiar use-case IMHO
                # TODO Stringification can be removed once PY35 is no longer
                # supported
                path = str(resolve_path(path_ri.localpath, dataset))
                # any `path` argument that points to something local is now
                # resolved and no longer a URL
            except ValueError:
                # `path` is neither a valid source nor a local path.
                # TODO: The only thing left is a known subdataset with a
                # name, that is not a path; Once we correctly distinguish
                # between path and name of a submodule, we need to consider
                # this.
                # For now: Just raise
                raise ValueError("Invalid path argument {0}".format(path))
        # `path` resolved, if there was any.

        # clone dataset, will also take care of adding to superdataset, if one
        # is given
        res = Clone.__call__(
            source,
            path,
            dataset=ds,
            description=description,
            reckless=reckless,
            # we need to disable error handling in order to have it done at
            # the very top, otherwise we are not able to order a global
            # "ignore-and-keep-going"
            result_xfm=None,
            return_type='generator',
            result_filter=None,
            on_failure='ignore')
        # helper
        as_ds = YieldDatasets()
        destination_dataset = None
        for r in res:
            if r['action'] == 'install' and r['type'] == 'dataset':
                # make sure logic below is valid, only one dataset result is
                # coming back
                assert (destination_dataset is None)
                destination_dataset = as_ds(r)
            r['refds'] = refds_path
            yield r
        assert (destination_dataset)

        # Now, recursive calls:
        if recursive or get_data:
            # dataset argument must not be passed inside since we use bound .get
            # It is ok to do "inplace" as long as we still return right
            # after the loop ends
            common_kwargs.pop('dataset', '')
            for r in destination_dataset.get(
                    curdir,
                    description=description,
                    # we need to disable error handling in order to have it done at
                    # the very top, otherwise we are not able to order a global
                    # "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    **common_kwargs):
                r['refds'] = refds_path
                yield r
        # at this point no further post-processing should be necessary,
        # `clone` and `get` must have done that (incl. parent handling)
        # if not, bugs should be fixed in those commands
        return
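
The first branch above splits the `path` arguments into installation sources (URLs) and content paths handed to `get`, using DataLad's RI/PathRI classes. A rough standalone illustration of that classification using only the standard library follows; DataLad's RI handles more forms (e.g. ssh-style "host:path" specs), and `split_install_targets` is a hypothetical helper name used here only for the sketch:

# Rough illustration of the URL-vs-path split performed above; DataLad's
# RI/PathRI classes are more thorough (ssh "host:path", datalad:// etc.).
from urllib.parse import urlparse

def split_install_targets(urlpaths):
    to_install, to_get = [], []
    for up in urlpaths:
        # anything with a URL scheme is treated as an installation source,
        # a bare (relative or absolute) path as content to `get`
        if urlparse(up).scheme:
            to_install.append(up)
        else:
            to_get.append(up)
    return to_install, to_get

# split_install_targets(
#     ['https://github.com/datalad-datasets/longnow-podcasts.git',
#      'sub-ds/data'])
# -> (['https://github.com/...-podcasts.git'], ['sub-ds/data'])
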
Example 24
class CheckDates(Interface):
    """Find repository dates that are more recent than a reference date.

    The main purpose of this tool is to find "leaked" real dates in
    repositories that are configured to use fake dates. It checks dates from
    three sources: (1) commit timestamps (author and committer dates), (2)
    timestamps within files of the "git-annex" branch, and (3) the timestamps
    of annotated tags.
    """
    from datalad.interface.utils import eval_results
    import datalad.support.ansi_colors as ac
    from datalad.support.constraints import EnsureChoice, EnsureNone, EnsureStr
    from datalad.support.param import Parameter

    result_renderer = "tailored"

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        """Like 'json_pp', but skip non-error results without flagged objects.
        """
        # FIXME: I think the proper way to do this is to use 'result_filter',
        # but I couldn't seem to get eval_results to detect the filter when I
        # used
        #
        #      result_renderer = "json_pp"
        #      result_filter = lambda x: ...
        #
        # Also, I want to keep the "message" key for errors.
        from datalad.ui import ui
        to_render = {}
        if res["status"] == "error":
            to_render = dict(res.items())
        elif "report" in res and res["report"]["objects"]:
            to_render = {
                k: v
                for k, v in res.items()
                if k not in ["status", "message", "logger"]
            }
        if to_render:
            ui.message(json.dumps(to_render, sort_keys=True, indent=2))

    _params_ = dict(
        paths=Parameter(
            args=("paths", ),
            metavar="PATH",
            nargs="*",
            doc="""Root directory in which to search for Git repositories. The
            current working directory will be used by default.""",
            constraints=EnsureStr() | EnsureNone()),
        reference_date=Parameter(
            args=("-D", "--reference-date"),
            metavar="DATE",
            doc="""Compare dates to this date. If dateutil is installed, this
            value can be any format that its parser recognizes. Otherwise, it
            should be a unix timestamp that starts with a "@". The default
            value corresponds to 01 Jan, 2018 00:00:00 -0000.""",
            constraints=EnsureStr()),
        revs=Parameter(
            args=("--rev", ),
            dest="revs",
            action="append",
            metavar="REVISION",
            doc="""Search timestamps from commits that are reachable from [PY:
            these revisions PY][CMD: REVISION CMD]. Any revision specification
            supported by :command:`git log`, including flags like --all and
            --tags, can be used.[CMD:  This option can be given multiple times.
            CMD]"""),
        annex=Parameter(
            args=("--annex", ),
            doc="""Mode for "git-annex" branch search. If 'all', all blobs
            within the branch are searched. 'tree' limits the search to blobs
            that are referenced by the tree at the tip of the branch. 'none'
            disables search of "git-annex" blobs.""",
            constraints=EnsureChoice("all", "tree", "none")),
        no_tags=Parameter(args=("--no-tags", ),
                          action="store_true",
                          doc="""Don't check the dates of annotated tags."""),
        older=Parameter(
            args=("--older", ),
            action="store_true",
            doc="""Find dates which are older than the reference date rather
            than newer."""),
    )

    @staticmethod
    @eval_results
    def __call__(paths,
                 reference_date="@1514764800",
                 revs=None,
                 annex="all",
                 no_tags=False,
                 older=False):
        from datalad.support.repodates import check_dates

        which = "older" if older else "newer"

        try:
            ref_ts = _parse_date(reference_date)
        except ValueError as exc:
            lgr.error("Could not parse '%s' as a date", reference_date)
            yield get_status_dict("check_dates",
                                  status="error",
                                  message=exc_str(exc))
            return

        lgr.info("Searching for dates %s than %s", which,
                 time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(ref_ts)))

        for repo in _git_repos(paths or ["."]):
            fullpath = os.path.abspath(repo)
            lgr.debug("Checking %s", fullpath)

            try:
                report = check_dates(repo,
                                     ref_ts,
                                     which=which,
                                     revs=revs or ["--all"],
                                     annex={
                                         "all": True,
                                         "none": False,
                                         "tree": "tree"
                                     }[annex],
                                     tags=not no_tags)
            except InvalidGitRepositoryError as exc:
                lgr.warning("Skipping invalid Git repo: %s", repo)
                continue

            yield get_status_dict(
                "check_dates",
                status="ok",
                path=fullpath,
                message=("Found {} dates" if report["objects"] else
                         "No {} dates found").format(which),
                report=report)
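
The default `reference_date` of "@1514764800" follows the "@<unix timestamp>" convention mentioned in the parameter documentation (1 Jan 2018, 00:00:00 UTC). A small sketch of how such a value could be parsed and compared when dateutil is not available; `parse_reference_date` is an illustrative stand-in, not the real `_parse_date`:

# Sketch of the "@<unix timestamp>" convention used for --reference-date;
# the real _parse_date additionally accepts dateutil-parsable strings.
import time

def parse_reference_date(value):
    if value.startswith("@"):
        return int(value[1:])
    raise ValueError("without dateutil, dates must be given as '@<unix ts>'")

ref_ts = parse_reference_date("@1514764800")
print(time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(ref_ts)))
# -> 01 Jan 2018 00:00:00 +0000

# a commit timestamp is "newer" if it is larger than the reference
commit_ts = 1546300800   # hypothetical leaked real date
print("newer" if commit_ts > ref_ts else "older or equal")
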
Example 25
class Diff(Interface):
    """Report changes of dataset components.

    Reports can be generated for changes between recorded revisions, or
    between a revision and the state of a dataset's work tree.

    Unlike 'git diff', this command also reports untracked content when
    comparing a revision to the state of the work tree. Such content is
    marked with the property `state='untracked'` in the command results.

    The following types of changes are distinguished and reported via the
    `state` result property:

    - added
    - copied
    - deleted
    - modified
    - renamed
    - typechange
    - unmerged
    - untracked

    Whenever applicable, source and/or destination revisions are reported
    to indicate when exactly within the requested revision range a particular
    component changed its status.

    Optionally, the reported changes can be limited to a subset of paths
    within a dataset.
    """

    # make the custom renderer the default one, as the global default renderer
    # does not yield meaningful output for this command
    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to query.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="""path to be evaluated""",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        revision=Parameter(
            args=('--revision', ),
            metavar='REVISION EXPRESSION',
            nargs='?',
            doc="""comparison reference specification. Three modes are
            supported: 1) <revision> changes you have in your working tree
            relative to the named revision (this can also be a branch name,
            tag, commit or any label Git can understand). 2) <revision>..<revision>
            changes between two arbitrary revisions. 3) <revision>...<revision>
            changes on the branch containing and up to the second <revision>,
            starting at a common ancestor of both revisions."""),
        staged=Parameter(
            args=("--staged", ),
            action="store_true",
            doc="""get the changes already staged for a commit relative
            to an optionally given revision (by default the most recent one)"""
        ),
        ignore_subdatasets=Parameter(
            args=('--ignore-subdatasets', ),
            constraints=EnsureChoice('none', 'untracked', 'dirty', 'all'),
            doc="""speed up execution by (partially) not evaluating the state of
            subdatasets in a parent dataset. With "none" a subdataset is
            considered modified when it either contains untracked or modified
            content or its last saved state differs from that recorded in the
            parent dataset. When "untracked" is used subdatasets are not
            considered modified when they only contain untracked content (but
            they are still scanned for modified content). Using "dirty" ignores
            all changes to the work tree of subdatasets, only changes to the
            revisions stored in the parent dataset are shown. Using "all" hides
            all changes to subdatasets. Note, even with "all" recursive
            execution will still report other changes in any existing
            subdataset, only the subdataset record in a parent dataset
            is not evaluated."""),
        report_untracked=Parameter(
            args=('--report-untracked', ),
            constraints=EnsureChoice('no', 'normal', 'all'),
            doc="""If and how untracked content is reported when comparing
            a revision to the state of the work tree. 'no': no untracked files
            are reported; 'normal': untracked files and entire untracked
            directories are reported as such; 'all': report individual files
            even in fully untracked directories."""),
        recursive=recursion_flag,
        recursion_limit=recursion_limit)

    @staticmethod
    @datasetmethod(name='diff')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 revision=None,
                 staged=False,
                 ignore_subdatasets='none',
                 report_untracked='normal',
                 recursive=False,
                 recursion_limit=None):
        if not dataset and not path:
            # act on the whole dataset if nothing else was specified
            dataset = curdir
        refds_path = Interface.get_refds_path(dataset)

        to_process = []
        # track which commit ranges we want to diff per dataset
        ds_diffies = {}
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='diff',
                # unavailable is OK, because we might query for a deleted file
                unavailable_path_status='',
                nondataset_path_status='impossible',
                # must not use `modified`, infinite loop otherwise
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('type', None) == 'dataset':
                ap['process_content'] = True
            if ap.get('raw_input', False) or ap['path'] == refds_path:
                # prepopulate the revision specs for all input paths
                ds_diffies[ap['path'] if ap.get('type', None) ==
                           'dataset' else ap['parentds']] = revision
            to_process.append(ap)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert (not completed)

        for ds_path in sorted(content_by_ds.keys()):
            if ds_path not in ds_diffies:
                # we don't know how to diff this:
                # it was neither an input path, nor did we see it
                # when diffing its parent
                continue
            content_paths = content_by_ds[ds_path]
            revision = ds_diffies[ds_path]
            for r in _parse_git_diff(ds_path,
                                     diff_thingie=ds_diffies[ds_path],
                                     paths=content_paths,
                                     ignore_submodules=ignore_subdatasets,
                                     staged=staged):
                r.update(dict(action='diff', logger=lgr))
                if refds_path:
                    r['refds'] = refds_path
                if 'status' not in r:
                    r['status'] = 'ok'
                if r.get('type', None) == 'dataset':
                    # this is a subdataset report
                    # we need to use the reported commit range to properly adjust the
                    # query once we hit that subdataset
                    from_rev = r.get('revision_src', '')
                    to_rev = r.get('revision', '')
                    subrev = '{}..{}'.format(
                        from_rev if from_rev else PRE_INIT_COMMIT_SHA,
                        to_rev if to_rev else '',
                    )
                    if from_rev and from_rev == to_rev:
                        # this is a special case, where a subdataset reported changes without
                        # a change in state/commit -- this signals uncommitted changes
                        # in the subdataset (including staged ones). In such a case, we
                        # must not provide a diff range, but only the source commit we want
                        # to diff against
                        # XXX if this is changed, likely the same logic in annotate_paths needs
                        # changing too!
                        subrev = from_rev
                    ds_diffies[r['path']] = subrev
                yield r
            if (revision and '..' in revision) or report_untracked == 'no':
                # don't look for untracked content, we got a revision range
                continue
            for r in _get_untracked_content(ds_path,
                                            report_untracked,
                                            paths=content_paths):
                r.update(dict(action='diff', logger=lgr))
                if refds_path:
                    r['refds'] = refds_path
                if 'status' not in r:
                    r['status'] = 'ok'
                yield r

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        from datalad.ui import ui
        if not res['status'] == 'ok':
            # logging reported already
            return
        path = relpath(res['path'], start=res['refds']) \
            if res.get('refds', None) else res['path']
        type_ = res.get('type', res.get('type_src', ''))
        max_len = len('untracked(directory)')
        state_msg = '{}{}'.format(res['state'],
                                  '({})'.format(type_) if type_ else '')
        ui.message('{fill}{state_msg}: {path}'.format(
            fill=' ' * max(0, max_len - len(state_msg)),
            state_msg=state_msg,
            path=path))
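
A hedged usage sketch of this command through the Python API, assuming this older revision-based `diff` is what the `datalad.api` binding (generated by the decorators above) exposes; it prints each changed component together with its reported `state` property:

# Usage sketch; assumes the datalad.api binding produced by the
# @datasetmethod/@eval_results decorators above is available in this form.
from datalad.api import diff

for res in diff(dataset='.', revision='HEAD~1',
                report_untracked='normal',
                return_type='generator'):
    if res.get('status') != 'ok':
        continue
    print('{state:12} {path}'.format(
        state=res.get('state', 'unknown'), path=res['path']))
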
Example 26
class Rerun(Interface):
    """Re-execute previous `datalad run` commands.

    This will unlock any dataset content that is on record to have
    been modified by the command in the specified revision.  It will
    then re-execute the command in the recorded path (if it was inside
    the dataset). Afterwards, all modifications will be saved.

    Examples:

      Re-execute the command from the previous commit::

        % datalad rerun

      Re-execute any commands in the last five commits::

        % datalad rerun --since=HEAD~5

      Do the same as above, but re-execute the commands on top of
      HEAD~5 in a detached state::

        % datalad rerun --onto= --since=HEAD~5

      Re-execute all previous commands and compare the old and new
      results::

        % # on master branch
        % datalad rerun --branch=verify --since=
        % # now on verify branch
        % datalad diff --revision=master..
        % git log --oneline --left-right --cherry-pick master...
    """
    _params_ = dict(
        revision=Parameter(
            args=("revision", ),
            metavar="REVISION",
            nargs="?",
            doc="""rerun command(s) in `revision`. By default, the command from
            this commit will be executed, but [CMD: --since CMD][PY: `since`
            PY] can be used to construct a revision range.""",
            default="HEAD",
            constraints=EnsureStr()),
        since=Parameter(
            args=("--since", ),
            doc="""If `since` is a commit-ish, the commands from all commits
            that are reachable from `revision` but not `since` will be
            re-executed (in other words, the commands in :command:`git log
            SINCE..REVISION`). If SINCE is an empty string, it is set to the
            parent of the first commit that contains a recorded command (i.e.,
            all commands in :command:`git log REVISION` will be
            re-executed).""",
            constraints=EnsureStr() | EnsureNone()),
        branch=Parameter(
            metavar="NAME",
            args=(
                "-b",
                "--branch",
            ),
            doc=
            "create and checkout this branch before rerunning the commands.",
            constraints=EnsureStr() | EnsureNone()),
        onto=Parameter(
            metavar="base",
            args=("--onto", ),
            doc="""start point for rerunning the commands. If not specified,
            commands are executed at HEAD. This option can be used to specify
            an alternative start point, which will be checked out with the
            branch name specified by [CMD: --branch CMD][PY: `branch` PY] or in
            a detached state otherwise. As a special case, an empty value for
            this option means to use the commit specified by [CMD: --since
            CMD][PY: `since` PY].""",
            constraints=EnsureStr() | EnsureNone()),
        message=Parameter(
            args=(
                "-m",
                "--message",
            ),
            metavar="MESSAGE",
            doc="""use MESSAGE for the reran commit rather than the
            recorded commit message.  In the case of a multi-commit
            rerun, all the reran commits will have this message.""",
            constraints=EnsureStr() | EnsureNone()),
        script=Parameter(
            args=("--script", ),
            metavar="FILE",
            doc="""extract the commands into [CMD: FILE CMD][PY: this file PY]
            rather than rerunning.  Use - to write to stdout instead.""",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset from which to rerun a recorded
            command. If no dataset is given, an attempt is made to
            identify the dataset based on the current working
            directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        # TODO
        # --list-commands
        #   go through the history and report any recorded command. this info
        #   could be used to unlock the associated output files for a rerun
    )

    @staticmethod
    @datasetmethod(name='rerun')
    @eval_results
    def __call__(revision="HEAD",
                 since=None,
                 dataset=None,
                 branch=None,
                 message=None,
                 onto=None,
                 script=None):

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='rerunning a command')

        lgr.debug('rerunning command output underneath %s', ds)

        if script is None and ds.repo.dirty:
            yield get_status_dict('run',
                                  ds=ds,
                                  status='impossible',
                                  message=('unsaved modifications present, '
                                           'cannot detect changes by command'))
            return

        err_info = get_status_dict('run', ds=ds)
        if not ds.repo.get_hexsha():
            yield dict(err_info,
                       status='impossible',
                       message='cannot rerun command, nothing recorded')
            return

        if branch and branch in ds.repo.get_branches():
            yield get_status_dict(
                "run",
                ds=ds,
                status="error",
                message="branch '{}' already exists".format(branch))
            return

        if not commit_exists(ds, revision + "^"):
            # Only a single commit is reachable from `revision`.  In
            # this case, --since has no effect on the range construction.
            revrange = revision
        elif since is None:
            revrange = "{rev}^..{rev}".format(rev=revision)
        elif since.strip() == "":
            revrange = revision
        else:
            revrange = "{}..{}".format(since, revision)

        if ds.repo.repo.git.rev_list("--merges", revrange, "--"):
            yield get_status_dict(
                "run",
                ds=ds,
                status="error",
                message="cannot rerun history with merge commits")
            return

        revs = [
            {"hexsha": hexsha,
             "message": ds.repo.repo.git.show(
                 hexsha, "--format=%B", "--no-patch")}
            for hexsha in ds.repo.repo.git.rev_list(
                "--reverse", revrange, "--").split()]

        for rev in revs:
            try:
                msg, info = get_run_info(rev["message"])
            except ValueError as exc:
                yield dict(err_info,
                           status='error',
                           message="Error on {}'s message: {}".format(
                               rev["hexsha"], exc_str(exc)))
                return
            if info is not None:
                rev["run_info"] = info
                rev["run_message"] = msg

        if since is not None and since.strip() == "":
            # For --since='', drop any leading commits that don't have
            # a run command.
            revs = list(dropwhile(lambda r: "run_info" not in r, revs))

        if script:
            ofh = sys.stdout if script.strip() == "-" else open(script, "w")
            header = """\
#!/bin/sh
#
# This file was generated by running (the equivalent of)
#
#   datalad rerun --script={script}{since} {revision}
#
# in {ds}{path}\n"""
            ofh.write(
                header.format(
                    script=script,
                    since="" if since is None else " --since=" + since,
                    revision=ds.repo.repo.git.rev_parse(revision),
                    ds='dataset {} at '.format(ds.id) if ds.id else '',
                    path=ds.path))

            for rev in revs:
                if "run_info" not in rev:
                    continue

                cmd = rev["run_info"]["cmd"]
                msg = rev["run_message"]
                if msg == _format_cmd_shorty(cmd):
                    msg = ''
                ofh.write("\n" + "".join("# " + ln
                                         for ln in msg.splitlines(True)) +
                          "\n")
                commit_descr = ds.repo.describe(rev['hexsha'])
                ofh.write('# (record: {})\n'.format(
                    commit_descr if commit_descr else rev['hexsha']))

                if isinstance(cmd, list):
                    cmd = " ".join(cmd)
                ofh.write(cmd + "\n")
            if ofh is not sys.stdout:
                ofh.close()
        else:
            if onto is not None and onto.strip() == "":
                # Special case: --onto='' is the value of --since.
                # Because we're currently aborting if the revision list
                # contains merges, we know that, regardless of if and how
                # --since is specified, the effective value for --since is
                # the parent of the first revision.
                onto = revs[0]["hexsha"] + "^"
                if not commit_exists(ds, onto):
                    # This is unlikely to happen in the wild because it
                    # means that the first commit is a datalad run commit.
                    # Just abort rather than trying to checkout on orphan
                    # branch or something like that.
                    yield get_status_dict(
                        "run",
                        ds=ds,
                        status="error",
                        message="Commit for --onto does not exist.")
                    return

            if branch or onto:
                start_point = onto or "HEAD"
                if branch:
                    checkout_options = ["-b", branch]
                else:
                    checkout_options = ["--detach"]
                ds.repo.checkout(start_point, options=checkout_options)

            for rev in revs:
                hexsha = rev["hexsha"]
                if "run_info" not in rev:
                    pick = False
                    try:
                        ds.repo.repo.git.merge_base("--is-ancestor", hexsha,
                                                    "HEAD")
                    except GitCommandError:
                        # Revision is NOT an ancestor of HEAD.
                        pick = True

                    shortrev = ds.repo.repo.git.rev_parse("--short", hexsha)
                    err_msg = "no command for {} found; {}".format(
                        shortrev, "cherry picking" if pick else "skipping")
                    yield dict(err_info, status='ok', message=err_msg)

                    if pick:
                        ds.repo.repo.git.cherry_pick(hexsha)
                    continue

                run_info = rev["run_info"]
                # Keep a "rerun" trail.
                if "chain" in run_info:
                    run_info["chain"].append(hexsha)
                else:
                    run_info["chain"] = [hexsha]

                # now we have to find out what was modified during the
                # last run, and enable re-modification. Ideally, we would
                # bring back the entire state of the tree with #1424, but
                # we limit ourselves to file addition/not-in-place-modification
                # for now
                for r in ds.unlock(new_or_modified(ds, hexsha),
                                   return_type='generator',
                                   result_xfm=None):
                    yield r

                for r in run_command(run_info['cmd'],
                                     ds,
                                     message or rev["run_message"],
                                     rerun_info=run_info):
                    yield r
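
The revision range that `rerun` hands to :command:`git log` is built from `revision` and `since` by a handful of rules (the branch on `commit_exists` above). A pure-function sketch of just those rules; `revision_has_parent` is a hypothetical stand-in for the `commit_exists(ds, revision + "^")` check:

# Pure-function sketch of the revision range rules used by rerun above;
# `revision_has_parent` replaces the repository-backed commit_exists() check.
def build_revrange(revision, since, revision_has_parent=True):
    if not revision_has_parent:
        # only one commit reachable; --since cannot narrow the range further
        return revision
    if since is None:
        # just the single specified commit
        return "{rev}^..{rev}".format(rev=revision)
    if since.strip() == "":
        # everything reachable from `revision`
        return revision
    return "{}..{}".format(since, revision)

# build_revrange("HEAD", None)      -> "HEAD^..HEAD"
# build_revrange("HEAD", "HEAD~5")  -> "HEAD~5..HEAD"
# build_revrange("HEAD", "")        -> "HEAD"
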
Example 27
class Siblings(Interface):
    """Manage sibling configuration

    This command offers five different actions: 'query', 'add', 'remove',
    'configure', 'enable'. 'query' is the default action and can be used to obtain
    information about (all) known siblings. 'add' and 'configure' are highly
    similar actions, the only difference being that adding a sibling
    with a name that is already registered will fail, whereas
    re-configuring a (different) sibling under a known name will not
    be considered an error. 'enable' can be used to complete the access
    configuration for non-Git siblings (aka git-annex special remotes).
    Lastly, the 'remove' action allows for the
    removal (or de-configuration) of a registered sibling.

    For each sibling (added, configured, or queried) all known sibling
    properties are reported. This includes:

    "name"
        Name of the sibling

    "path"
        Absolute path of the dataset

    "url"
        For regular siblings at minimum a "fetch" URL, possibly also a
        "pushurl"

    Additionally, any further configuration will also be reported using
    a key that matches that in the Git configuration.

    By default, sibling information is rendered as one line per sibling
    following this scheme::

      <dataset_path>: <sibling_name>(<+|->) [<access_specification>]

    where the `+` and `-` labels indicate the presence or absence of a
    remote data annex at a particular remote, and `access_specification`
    contains either a URL and/or a type label for the sibling.
    """
    # make the custom renderer the default, path reporting isn't the top
    # priority here
    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to configure.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        name=Parameter(
            args=(
                '-s',
                '--name',
            ),
            metavar='NAME',
            doc="""name of the sibling. For sibling removal this option is
            mandatory, otherwise the hostname part of a given URL is used as a
            default. This option can be used to limit 'query' to a specific
            sibling.""",
            constraints=EnsureStr() | EnsureNone()),
        action=Parameter(
            args=('action', ),
            nargs='?',
            metavar='ACTION',
            doc="""command action selection (see general documentation)""",
            constraints=EnsureChoice('query', 'add', 'remove', 'configure',
                                     'enable') | EnsureNone()),
        url=Parameter(args=('--url', ),
                      doc="""the URL of or path to the dataset sibling named by
                `name`. For recursive operation it is required that
                a template string for building subdataset sibling URLs
                is given.\n List of currently available placeholders:\n
                %%NAME\tthe name of the dataset, where slashes are replaced by
                dashes.""",
                      constraints=EnsureStr() | EnsureNone(),
                      nargs="?"),
        pushurl=Parameter(
            args=('--pushurl', ),
            doc="""in case the `url` cannot be used to publish to the dataset
                sibling, this option specifies a URL to be used instead.\nIf no
                `url` is given, `pushurl` serves as `url` as well.""",
            constraints=EnsureStr() | EnsureNone()),
        description=location_description,

        ## info options
        # --template/cfgfrom gh-1462 (maybe also for a one-time inherit)
        # --wanted gh-925 (also see below for add_sibling approach)
        fetch=Parameter(args=("--fetch", ),
                        action="store_true",
                        doc="""fetch the sibling after configuration"""),
        as_common_datasrc=as_common_datasrc,
        publish_depends=publish_depends,
        publish_by_default=publish_by_default,
        annex_wanted=annex_wanted_opt,
        annex_required=annex_required_opt,
        annex_group=annex_group_opt,
        annex_groupwanted=annex_groupwanted_opt,
        inherit=inherit_opt,
        get_annex_info=Parameter(
            args=("--no-annex-info", ),
            dest='get_annex_info',
            action="store_false",
            doc=
            """Whether to query all information about the annex configurations
            of siblings. Can be disabled if speed is a concern"""),
        recursive=recursion_flag,
        recursion_limit=recursion_limit)

    @staticmethod
    @datasetmethod(name='siblings')
    @eval_results
    def __call__(
            action='query',
            dataset=None,
            name=None,
            url=None,
            pushurl=None,
            description=None,
            # TODO consider true, for now like add_sibling
            fetch=False,
            as_common_datasrc=None,
            publish_depends=None,
            publish_by_default=None,
            annex_wanted=None,
            annex_required=None,
            annex_group=None,
            annex_groupwanted=None,
            inherit=False,
            get_annex_info=True,
            recursive=False,
            recursion_limit=None):

        # TODO: Detect malformed URL and fail?
        # XXX possibly fail if fetch is False and as_common_datasrc

        if annex_groupwanted and not annex_group:
            raise InsufficientArgumentsError(
                "To set groupwanted, you need to provide annex_group option")

        # TODO catch invalid action specified
        action_worker_map = {
            'query': _query_remotes,
            'add': _add_remote,
            'configure': _configure_remote,
            'remove': _remove_remote,
            'enable': _enable_remote,
        }
        # all worker strictly operate on a single dataset
        # anything that deals with hierarchies and/or dataset
        # relationships in general should be dealt with in here
        # at the top-level and vice versa
        worker = action_worker_map[action]

        dataset = require_dataset(dataset,
                                  check_installed=False,
                                  purpose='sibling configuration')
        refds_path = dataset.path

        res_kwargs = dict(refds=refds_path, logger=lgr)

        ds_name = basename(dataset.path)

        # do not form a single list of datasets (with recursion results) in
        # order to give the fastest possible response, at the price of a
        # longer overall function call
        ds = dataset
        for r in worker(
                # always copy signature to below to avoid bugs!
                ds,
                name,
                ds.repo.get_remotes(),
                # for top-level dataset there is no layout questions
                _mangle_urls(url, ds_name),
                _mangle_urls(pushurl, ds_name),
                fetch,
                description,
                as_common_datasrc,
                publish_depends,
                publish_by_default,
                annex_wanted,
                annex_required,
                annex_group,
                annex_groupwanted,
                inherit,
                get_annex_info,
                **res_kwargs):
            yield r
        if not recursive:
            return

        # do we have instructions to register siblings with some alternative
        # layout?
        replicate_local_structure = url and "%NAME" not in url

        for subds in dataset.subdatasets(fulfilled=True,
                                         recursive=recursive,
                                         recursion_limit=recursion_limit,
                                         result_xfm='datasets'):
            subds_name = relpath(subds.path, start=dataset.path)
            if replicate_local_structure:
                subds_url = slash_join(url, subds_name)
                subds_pushurl = slash_join(pushurl, subds_name)
            else:
                subds_url = \
                    _mangle_urls(url, '/'.join([ds_name, subds_name]))
                subds_pushurl = \
                    _mangle_urls(pushurl, '/'.join([ds_name, subds_name]))
            for r in worker(
                    # always copy signature from above to avoid bugs
                    subds,
                    name,
                    subds.repo.get_remotes(),
                    subds_url,
                    subds_pushurl,
                    fetch,
                    description,
                    as_common_datasrc,
                    publish_depends,
                    publish_by_default,
                    annex_wanted,
                    annex_required,
                    annex_group,
                    annex_groupwanted,
                    inherit,
                    get_annex_info,
                    **res_kwargs):
                yield r

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        from datalad.ui import ui
        if res['status'] != 'ok' or not res.get('action',
                                                '').endswith('-sibling'):
            # logging complained about this already
            return
        path = relpath(res['path'], res['refds']) if res.get(
            'refds', None) else res['path']
        got_url = 'url' in res
        spec = '{}{}{}{}'.format(res.get('url', ''), ' (' if got_url else '',
                                 res.get('annex-externaltype', 'git'),
                                 ')' if got_url else '')
        ui.message('{path}: {name}({with_annex}) [{spec}]'.format(
            **dict(
                res,
                path=path,
                # TODO report '+' for special remotes
                with_annex='+' if 'annex-uuid' in res \
                    else ('-' if res.get('annex-ignore', None) else '?'),
                spec=spec)))
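
The recursive branch above derives one sibling URL per subdataset: without a `%NAME` placeholder in `url` the local layout is replicated by slash-joining the subdataset's relative path, otherwise `%NAME` is substituted by a name in which slashes become dashes. A standalone sketch of that logic (the real code uses the `slash_join` and `_mangle_urls` helpers, which may differ in details):

# Standalone sketch of the per-subdataset URL derivation used above;
# illustrative only, behaviour of the real helpers may differ in details.
def subds_sibling_url(url, ds_name, subds_relpath):
    if "%NAME" not in url:
        # replicate the local dataset layout on the remote side
        return "/".join([url.rstrip("/"), subds_relpath])
    # otherwise substitute %NAME, with slashes mapped to dashes
    name = "/".join([ds_name, subds_relpath]).replace("/", "-")
    return url.replace("%NAME", name)

# subds_sibling_url("ssh://server/store", "super", "code/sub")
# -> "ssh://server/store/code/sub"
# subds_sibling_url("ssh://server/%NAME.git", "super", "code/sub")
# -> "ssh://server/super-code-sub.git"
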
Example 28
class Save(Interface):
    """Save the current state of a dataset

    Saving the state of a dataset records changes that have been made to it.
    This change record is annotated with a user-provided description.
    Optionally, an additional tag, such as a version, can be assigned to the
    saved state. Such a tag enables straightforward retrieval of past versions
    at a later point in time.

    Examples:

      Save any content underneath the current directory, without altering
      any potential subdataset (use --recursive for that)::

        % datalad save .

      Save any modification of known dataset content, but leave untracked
      files (e.g. temporary files) untouched::

        % datalad save -d <path_to_dataset>

      Tag the most recent saved state of a dataset::

        % datalad save -d <path_to_dataset> --version-tag bestyet
    """

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc=""""specify the dataset to save""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path/name of the dataset component to save. If given, only
            changes made to those components are recorded in the new state.""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        message=save_message_opt,
        message_file=message_file_opt,
        # switch not functional from cmdline: default True, action=store_true
        # TODO remove from API? all_updated=False is not used anywhere in the codebase
        all_updated=Parameter(
            args=("-u", "--all-updated"),
            doc="""if no explicit paths are given, save changes of all known
            components in a dataset""",
            action="store_true"),
        version_tag=Parameter(args=("--version-tag", ),
                              metavar='ID',
                              doc="""an additional marker for that state.""",
                              constraints=EnsureStr() | EnsureNone()),
        super_datasets=super_datasets_flag,
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='save')
    @eval_results
    def __call__(message=None,
                 path=None,
                 dataset=None,
                 all_updated=True,
                 version_tag=None,
                 recursive=False,
                 recursion_limit=None,
                 super_datasets=False,
                 message_file=None):
        if not dataset and not path:
            # we got nothing at all -> save what is staged in the repo in "this" directory?
            # make sure we don't treat this as a user-provided '.' argument
            path = [{'path': abspath(curdir), 'raw_input': False}]

        refds_path = Interface.get_refds_path(dataset)

        if message and message_file:
            raise ValueError("Both a message and message file were specified")

        if message_file:
            with open(message_file, "rb") as mfh:
                message = assure_unicode(mfh.read())

        to_process = []
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='save',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                modified='HEAD' if not path and recursive else None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('state',
                      None) == 'untracked' and not ap.get('raw_input', False):
                # this path was found untracked, but not explicitly given to save
                # we will silently ignore this
                continue
            got_nothing = False
            # next check should not be done during annotation, as it is possibly expensive
            # and not generally useful
            if ap.get('status', None) == 'impossible' and \
                    ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None):
                # this is not here anymore, but it might actually have been a deleted
                # component
                if relpath(ap['path'], start=ap['parentds']) \
                        in Dataset(ap['parentds']).repo.get_deleted_files():
                    # ok, this is a staged deletion that we want to save
                    ap['status'] = ''
                    del ap['message']
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # for things like: `ds.save()`
            # or recursively discovered datasets
            if ap['path'] == refds_path or \
                    (ap.get('type', None) == 'dataset' and
                     not ap.get('raw_input', False) and
                     not ap.get('state', None) == 'absent'):
                ap['process_content'] = True
                ap['process_updated_only'] = all_updated
            to_process.append(ap)
        lgr.log(2, "save, to_process=%r", to_process)
        if got_nothing and recursive and refds_path:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict('save',
                                  status='notneeded',
                                  path=refds_path,
                                  type='dataset',
                                  logger=lgr)
            return

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if super_datasets:
            # search for the topmost superdatasets of any path
            dss = [
                Dataset(ap.get('parentds', ap['path'])) for ap in to_process
            ]
            superdss = [ds.get_superdataset(topmost=True) for ds in dss]
            superdss = get_tree_roots(
                unique(ds.path for ds in dss + superdss if ds))
            if dataset:
                # need to adjust the reference to the new superds
                # if we had one ref before, we should still have exactly one
                assert len(superdss) <= 1
                dataset = list(superdss.keys())[0]
                refds_path = dataset
        elif refds_path:
            # there is a single superdataset
            superdss = {
                refds_path:
                unique(
                    [ap['parentds'] for ap in to_process if 'parentds' in ap])
            }
        else:
            # sort all datasets under their potential superdatasets
            # start from the top to get all subdatasets down the line
            # and collate them into as few superdatasets as possible
            # this is quick, just string operations
            superdss = get_tree_roots(
                unique(
                    [ap['parentds'] for ap in to_process if 'parentds' in ap]))
        # for each "superdataset" check the tree of subdatasets and make sure
        # we gather all datasets between the super and any subdataset
        # so we can save them all bottom-up in order to be able to properly
        # save the superdataset
        # if this is called from e.g. `add` this is actually not necessary,
        # but in the general case we cannot avoid it
        # TODO maybe introduce a switch?
        discovered = {}
        for superds_path in superdss:
            target_subs = superdss[superds_path]
            discover_dataset_trace_to_targets(
                # from here
                superds_path,
                # to all
                target_subs,
                [],
                discovered)
        # create a new minimally annotated path for each discovered dataset
        discovered_added = set()
        for parentds in discovered:
            for subds in discovered[parentds]:
                to_process.append(
                    dict(path=subds, parentds=parentds, type='dataset'))
                discovered_added.add(subds)
        # make sure we have an entry for each dataset, including those
        # that are just parents
        for parentds in discovered:
            if parentds not in discovered_added:
                to_process.append(
                    dict(
                        path=parentds,
                        type='dataset',
                        # make sure we save content of superds later on
                        process_content=True,
                        # but not do nasty things, like adding untracked content
                        # just because we discovered this dataset
                        process_updated_only=True))

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, deduplication happens here too
        annotated_paths = AnnotatePaths.__call__(
            path=to_process,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='save',
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert (not completed)

        # iterate over all datasets, starting at the bottom
        for dspath in sorted(content_by_ds.keys(), reverse=True):
            ds = Dataset(dspath)
            res = get_status_dict('save', ds=ds, logger=lgr)
            if not ds.is_installed():
                # TODO This is likely impossible now
                res['status'] = 'impossible'
                res['message'] = ('dataset %s is not installed', ds)
                yield res
                continue
            saved_state = save_dataset(ds,
                                       content_by_ds[dspath],
                                       message=message)
            res['status'] = 'ok' if saved_state else 'notneeded'
            # MIH: let's tag even if there was nothing to commit. I'd forget this
            # option too often...
            if version_tag:
                try:
                    # TODO: check whether comment below is still true after
                    # removing the log swallowing:
                    # again cannot help but force-silence low-level code, because
                    # it screams like a madman instead of allowing top-level
                    # code an orderly error report
                    ds.repo.tag(version_tag)
                    # even if we haven't saved anything
                    res['status'] = 'ok'
                    yield res
                except CommandError as e:
                    if saved_state:
                        # first we yield the result for the actual save
                        yield res
                    # and now complain that tagging didn't work
                    yield get_status_dict(
                        'save',
                        ds=ds,
                        logger=lgr,
                        status='error',
                        message=('cannot tag this version: %s',
                                 e.stderr.strip()))
            else:
                yield res

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        from datalad.ui import ui
        if not res or res.get('type', None) != 'dataset' or 'path' not in res:
            return
        ds = Dataset(res['path'])
        commit = ds.repo.get_hexsha()
        ui.message('Saved state: {0} for {1}'.format(commit, ds))
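For orientation, here is a minimal usage sketch of this save interface through the Python API; the dataset path, commit message, and tag value are placeholders, and the call roughly mirrors the command-line example from the docstring above.

from datalad.distribution.dataset import Dataset

# hypothetical dataset location; assumed to already contain modified,
# previously tracked content
ds = Dataset('path/to/dataset')
# record all known modifications and tag the new state, roughly equivalent to:
#   datalad save -d <path_to_dataset> --version-tag bestyet
ds.save(message="my commit message",
        version_tag="bestyet",
        all_updated=True)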
Example n. 29
class Get(Interface):
    """Get any dataset content (files/directories/subdatasets).

    This command only operates on dataset content. To obtain a new independent
    dataset from some source use the `clone` command.

    By default this command operates recursively within a dataset, but not
    across potential subdatasets, i.e. if a directory is provided, all files in
    the directory are obtained. Recursion into subdatasets is supported too. If
    enabled, relevant subdatasets are detected and installed in order to
    fulfill a request.

    Known data locations for each requested file are evaluated and data are
    obtained from some available location (according to git-annex configuration
    and possibly assigned remote priorities), unless a specific source is
    specified.

    *Getting subdatasets*

    Just as DataLad supports getting file content from more than one location,
    the same is supported for subdatasets, including a ranking of individual
    sources for prioritization.

    The following location candidates are considered. For each candidate a
    cost is given in parentheses; higher values indicate higher cost, and thus
    lower priority:

    - URL of any configured superdataset remote that is known to have the
      desired submodule commit, with the submodule path appended to it.
      There can be more than one candidate (cost 500).

    - In case `.gitmodules` contains a relative path instead of a URL,
      the URL of any configured superdataset remote that is known to have the
      desired submodule commit, with this relative path appended to it.
      There can be more than one candidate (cost 500).

    - A URL or absolute path recorded in `.gitmodules` (cost 600).

    - In case `.gitmodules` contains a relative path as a URL, the absolute
      path of the superdataset, appended with this relative path (cost 900).

    Additional candidate URLs can be generated based on templates specified as
    configuration variables with the pattern

      `datalad.get.subdataset-source-candidate-<name>`

    where `name` is an arbitrary identifier. If `name` starts with three digits
    (e.g. '400myserver') these will be interpreted as a cost, and the
    respective candidate will be sorted into the generated candidate list
    according to this cost. If no cost is given, a default of 700 is used.

    A template string assigned to such a variable can utilize the Python format
    mini language and may reference a number of properties that are inferred
    from the parent dataset's knowledge about the target subdataset. Properties
    include any submodule property specified in the respective `.gitmodules`
    record. For convenience, an existing `datalad-id` record is made available
    under the shortened name `id`.

    Additionally, the URL of any configured remote that contains the respective
    submodule commit is available as `remote-<name>` properties, where `name`
    is the configured remote name.

    Lastly, all candidates are sorted according to their cost (lower values
    first), and duplicate URLs are stripped, while preserving the first item in the
    candidate list.

    .. note::
      Power-user info: This command uses :command:`git annex get` to fulfill
      file handles.
    """
    _examples_ = [
        dict(text="Get a single file",
             code_py="get('path/to/file')",
             code_cmd="datalad get <path/to/file>"),
        dict(text="Get contents of a directory",
             code_py="get('path/to/dir/')",
             code_cmd="datalad get <path/to/dir/>"),
        dict(
            text="Get all contents of the current dataset and its subdatasets",
            code_py="get(dataset='.', recursive=True)",
            code_cmd="datalad get . -r"),
        dict(
            text="Get (clone) a registered subdataset, but don't retrieve data",
            code_py="get('path/to/subds', get_data=False)",
            code_cmd="datalad get -n <path/to/subds>"),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar="PATH",
            doc="""specify the dataset to perform the add operation on, in
            which case `path` arguments are interpreted as being relative
            to this dataset.  If no dataset is given, an attempt is made to
            identify a dataset for each input `path`""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar="PATH",
            doc="""path/name of the requested dataset component. The component
            must already be known to a dataset. To add new components to a
            dataset use the `add` command""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        source=Parameter(
            args=(
                "-s",
                "--source",
            ),
            metavar="LABEL",
            doc="""label of the data source to be used to fulfill requests.
            This can be the name of a dataset :term:`sibling` or another known
            source""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=Parameter(
            args=(
                "-R",
                "--recursion-limit",
            ),
            metavar="LEVELS",
            constraints=EnsureInt() | EnsureChoice('existing') | EnsureNone(),
            doc="""limit recursion into subdataset to the given number of levels.
            Alternatively, 'existing' will limit recursion to subdatasets that already
            existed on the filesystem at the start of processing, and prevent new
            subdatasets from being obtained recursively."""),
        get_data=Parameter(
            args=(
                "-n",
                "--no-data",
            ),
            dest='get_data',
            action='store_false',
            doc=
            """whether to obtain data for all file handles. If disabled, `get`
            operations are limited to dataset handles.[CMD:  This option prevents data
            for file handles from being obtained CMD]"""),
        description=location_description,
        reckless=reckless_opt,
        jobs=jobs_opt)

    @staticmethod
    @datasetmethod(name='get')
    @eval_results
    def __call__(
        path=None,
        *,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=None,
        jobs='auto',
    ):
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        # we have to have a single dataset to operate on
        refds = require_dataset(dataset,
                                check_installed=True,
                                purpose='get content of %s' %
                                shortened_repr(path))
        # some functions downstream expect a str
        refds_path = refds.path
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        content_by_ds = {}
        # use subdatasets() to discover any relevant content that is not
        # already present in the root dataset (refds)
        for sdsres in Subdatasets.__call__(
                contains=path,
                # maintain path argument semantics and pass in dataset arg
                # as is
                dataset=dataset,
                # always come from the top to get sensible generator behavior
                bottomup=False,
                # when paths are given, they will constrain the recursion
                # automatically, and we need to enable recursion so we can
                # locate paths in subdatasets several levels down
                recursive=True if path else recursive,
                recursion_limit=None if path else recursion_limit,
                return_type='generator',
                on_failure='ignore',
                result_renderer='disabled'):
            if sdsres.get('type', None) != 'dataset':
                # if it is not about a 'dataset' it is likely content in
                # the root dataset
                if sdsres.get('status', None) == 'impossible' and \
                        sdsres.get('message', None) == \
                        'path not contained in any matching subdataset':
                    target_path = Path(sdsres['path'])
                    if refds.pathobj != target_path and \
                            refds.pathobj not in target_path.parents:
                        yield dict(
                            action='get',
                            path=str(target_path),
                            status='error',
                            message=('path not associated with dataset %s',
                                     refds),
                        )
                        continue
                    # check if we need to obtain anything underneath this path
                    # the subdatasets() call above will only look _until_ it
                    # hits the targetpath
                    for res in _install_targetpath(
                            refds,
                            Path(sdsres['path']),
                            recursive,
                            recursion_limit,
                            reckless,
                            refds_path,
                            description,
                            jobs=jobs,
                    ):
                        # fish out the datasets that 'contains' a targetpath
                        # and store them for later
                        if res.get('status', None) in ('ok', 'notneeded') and \
                                'contains' in res:
                            dsrec = content_by_ds.get(res['path'], set())
                            dsrec.update(res['contains'])
                            content_by_ds[res['path']] = dsrec
                        if res.get('status', None) != 'notneeded':
                            # all those messages on not having installed anything
                            # are a bit pointless
                            # "notneeded" for annex get comes below
                            yield res
                else:
                    # dunno what this is, send upstairs
                    yield sdsres
                # must continue for both conditional branches above
                # the rest is about stuff in real subdatasets
                continue
            # instance of the closest existing dataset for this result
            ds = Dataset(sdsres['parentds'] if sdsres.get('state', None) ==
                         'absent' else sdsres['path'])
            assert 'contains' in sdsres
            # explore the unknown
            for target_path in sdsres.get('contains', []):
                # essentially the same as done above for paths in the root
                # dataset, but here we are starting from the closest
                # discovered subdataset
                for res in _install_targetpath(
                        ds,
                        Path(target_path),
                        recursive,
                        recursion_limit,
                        reckless,
                        refds_path,
                        description,
                        jobs=jobs,
                ):
                    known_ds = res['path'] in content_by_ds
                    if res.get('status', None) in ('ok', 'notneeded') and \
                            'contains' in res:
                        dsrec = content_by_ds.get(res['path'], set())
                        dsrec.update(res['contains'])
                        content_by_ds[res['path']] = dsrec
                    # prevent double-reporting of datasets that have been
                    # installed by explorative installation to get to target
                    # paths, prior in this loop
                    if res.get('status', None) != 'notneeded' or not known_ds:
                        yield res

        if not get_data:
            # done already
            return

        # and now annex-get, this could all be done in parallel now
        for ds, content in content_by_ds.items():
            for res in _get_targetpaths(Dataset(ds), content, refds.path,
                                        source, jobs):
                if 'path' not in res or res['path'] not in content_by_ds:
                    # we had reports on datasets and subdatasets already
                    # before the annex stage
                    yield res
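As a rough sketch of the candidate-URL configuration described in the docstring above: the remote name '400myserver', the template URL, and the paths are made-up placeholders, and writing the configuration via `ds.config.add(..., where='local')` is an assumption about this DataLad version's ConfigManager.

from datalad.distribution.dataset import Dataset

ds = Dataset('path/to/superdataset')  # hypothetical superdataset
# register an extra subdataset source candidate; the leading '400' is parsed
# as the cost, ranking it ahead of the .gitmodules URL (cost 600)
ds.config.add(
    'datalad.get.subdataset-source-candidate-400myserver',
    'https://example.com/mirror/{id}',
    where='local')
# obtain (clone) the registered subdataset without fetching file content,
# equivalent to: datalad get -n <path/to/subds>
ds.get('path/to/subds', get_data=False)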
Example n. 30
class Crawl(Interface):
    """Crawl online resource to create or update a dataset.

    Examples:

      $ datalad crawl  # within a dataset having .datalad/crawl/crawl.cfg
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True
    _params_ = dict(
        # Dry run is untested and largely probably not working in this implementation
        # so let's not expose it for now at all
        #        dry_run=Parameter(
        #            args=("-n", "--dry-run"),
        #            action="store_true",
        #            doc="""flag if file manipulations to be invoked (e.g., adding to git/annex).
        #            If not, commands are only printed to the stdout"""),
        is_pipeline=Parameter(
            args=("--is-pipeline", ),
            action="store_true",
            doc=
            """flag if provided file is a Python script which defines pipeline()"""
        ),
        is_template=Parameter(
            args=("-t", "--is-template"),
            action="store_true",
            doc="""flag if provided value is the name of the template to use"""
        ),
        recursive=Parameter(
            args=("-r", "--recursive"),
            action="store_true",
            doc="""flag to crawl subdatasets as well (for now serially)"""),
        chdir=Parameter(args=("-C", "--chdir"),
                        constraints=EnsureStr() | EnsureNone(),
                        doc="""directory to chdir to for crawling"""),
        path=Parameter(
            args=('path', ),
            metavar='file',
            nargs='?',
            constraints=EnsureStr() | EnsureNone(),
            doc=
            """configuration (or pipeline if --is-pipeline) file defining crawling, or a directory
                of a dataset on which to perform crawling using its standard crawling specification"""
        ),
    )

    @staticmethod
    def __call__(path=None,
                 is_pipeline=False,
                 is_template=False,
                 recursive=False,
                 chdir=None):  # dry_run=False,
        dry_run = False

        from datalad.crawler.pipeline import (load_pipeline_from_config,
                                              load_pipeline_from_module,
                                              get_repo_pipeline_config_path,
                                              get_repo_pipeline_script_path)
        from datalad.crawler.pipeline import run_pipeline
        from datalad.utils import chpwd  # import late so we could mock during tests

        with chpwd(chdir):

            assert not (
                is_pipeline and is_template
            ), "it is either a pipeline or a template name, can't be both"
            if is_template:
                # generate a config and overload path with its filename
                path = initiate_pipeline_config(
                    template=path,  # kwargs=TODO,
                    commit=True)

            # TODO: centralize via _params_ handling
            if dry_run:
                dryrun_optlabel = 'datalad.crawl.dryrun'
                if dryrun_optlabel in cfg:
                    cfg.unset(dryrun_optlabel, where='local', reload=False)
                cfg.add(dryrun_optlabel, "True", where='local')

            if path is None:

                # get config from the current repository/dataset
                if is_pipeline:
                    raise ValueError("You must specify the file if --pipeline")

                # Let's see if there is a config or pipeline in this repo
                path = get_repo_pipeline_config_path()
                if not path or not exists(path):
                    # Check if there may be the pipeline provided
                    path = get_repo_pipeline_script_path()
                    if path and exists(path):
                        is_pipeline = True

            stats = ActivityStats()

            if not path:
                raise RuntimeError(
                    "Cannot locate crawler config or pipeline file")

            if is_pipeline:
                lgr.info("Loading pipeline definition from %s" % path)
                pipeline = load_pipeline_from_module(path)
            else:
                lgr.info("Loading pipeline specification from %s" % path)
                pipeline = load_pipeline_from_config(path)

            lgr.info("Running pipeline %s" % str(pipeline))
            # TODO: capture the state of all branches so in case of crash
            # we could gracefully reset back
            try:
                output = run_pipeline(pipeline, stats=stats)
            except Exception as exc:
                # TODO: config.crawl.failure = full-reset | last-good-master
                # probably ask via ui which action should be performed unless
                # explicitly specified
                raise
            stats.datasets_crawled += 1

            # TODO:  Move gc/clean over here!

            stats_total = stats.get_total()

            if recursive:
                # get all subdatasets, and crawl them too!
                ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path
                import os
                from ..distribution.dataset import Dataset
                from ..api import crawl
                from ..utils import swallow_logs
                from ..dochelpers import exc_str
                # Note: we could collect all datasets to be crawled here or pass recursive=True
                # into the subdatasets' crawl.  We will collect all of them here so we might later
                # also introduce automatic commits when super-dataset got successfully updated
                subdatasets = Dataset(os.curdir).subdatasets(
                    recursive=recursive, result_xfm='relpaths')

                lgr.info("Crawling %d subdatasets", len(subdatasets))
                output = [output]
                # TODO: parallelize
                # TODO: assumes that all sub-datasets are 'crawlable', and if not
                # just adds them to the crawl_failed count.  But maybe we should make it
                # more explicit that some sub-datasets might not need to be crawled, so
                # they get skipped explicitly?
                for ds_ in subdatasets:
                    ds_logfile = utils.get_logfilename(ds_, 'crawl')
                    try:
                        # TODO: might be cool to be able to report a 'heart beat' from the swallow into pbar or smth
                        with swallow_logs(file_=ds_logfile) as cml:
                            output_, stats_ = crawl(chdir=ds_)
                            stats_total += stats_
                            output.append(output_)
                        lgr.info("Crawled %s: %s (log: %s)", ds_,
                                 stats_.as_str(mode='line'), ds_logfile)
                    except Exception as exc:
                        stats_total.datasets_crawl_failed += 1
                        stats_total.datasets_crawled += 1
                        output += [None]
                        lgr.warning(
                            "Crawling of %s has failed (more in %s): %s.",  # Log output: %s",
                            ds_,
                            ds_logfile,
                            exc_str(exc))  # , cml.out)

            lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

            return output, stats_total
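A minimal usage sketch for this crawl interface, mirroring the recursive call the implementation makes on its own subdatasets; it assumes the current working directory is a dataset carrying a .datalad/crawl/crawl.cfg.

from datalad.api import crawl

# crawl the dataset in the current directory and, serially, its subdatasets;
# the call returns the pipeline output(s) and the accumulated ActivityStats
output, stats = crawl(recursive=True)
print("Total stats:", stats.as_str(mode='line'))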