Example #1
def test_dirty(path):
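    # run every supported `if_dirty` mode ('fail', 'ignore', 'save-before')
    # against datasets in progressively dirtier states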
    for mode in _dirty_modes:
        # does nothing without a dataset
        handle_dirty_dataset(None, mode)
    # placeholder, but not yet created
    ds = Dataset(path)
    # unknown mode
    assert_raises(ValueError, handle_dirty_dataset, ds, 'MADEUP')
    # not yet created is very dirty
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'fail')
    handle_dirty_dataset(ds, 'ignore')
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'save-before')
    # should yield a clean repo
    ds.create()
    orig_state = ds.repo.get_hexsha()
    _check_all_clean(ds, orig_state)
    # tainted: untracked
    with open(opj(ds.path, 'something'), 'w') as f:
        f.write('some')
    orig_state = _check_auto_save(ds, orig_state)
    # tainted: staged
    with open(opj(ds.path, 'staged'), 'w') as f:
        f.write('some')
    ds.repo.add('staged', git=True)
    orig_state = _check_auto_save(ds, orig_state)
    # tainted: submodule
    # not added to super on purpose!
    subds = ds.create('subds')
    _check_all_clean(subds, subds.repo.get_hexsha())
    ok_clean_git(ds.path)
    # subdataset must be added as a submodule!
    assert_equal(ds.get_subdatasets(), ['subds'])
Example #2
def _check_auto_save(ds, orig_state):
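    # on a dirty dataset: 'ignore' is a no-op, 'fail' raises, and
    # 'save-before' saves the changes and yields a new commit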
    handle_dirty_dataset(ds, 'ignore')
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'fail')
    handle_dirty_dataset(ds, 'save-before')
    state = ds.repo.get_hexsha()
    assert_not_equal(orig_state, state)
    _check_all_clean(ds, state)
    return state
Example #3
    def __call__(
            path=None,
            source=None,
            dataset=None,
            get_data=False,
            description=None,
            recursive=False,
            recursion_limit=None,
            if_dirty='save-before',
            save=True,
            reckless=False,
            git_opts=None,
            git_clone_opts=None,
            annex_opts=None,
            annex_init_opts=None,
            jobs=None):

        # normalize path argument to be equal when called from cmdline and
        # python and nothing was passed into `path`
        path = assure_list(path)

        if not source and not path:
            raise InsufficientArgumentsError(
                "Please provide at least a source or a path")


        ## Common kwargs to pass to underlying git/install calls.
        #  They might need adjustments (e.g. for recursion_limit), but
        #  otherwise are applicable throughout.
        #
        # There should have been more common options!
        # Underneath, get could do similar installs, but for now the two
        # have duplicated implementations which differ (e.g. get does not
        # annex-init installed annexes)
        common_kwargs = dict(
            get_data=get_data,
            recursive=recursive,
            recursion_limit=recursion_limit,
            git_opts=git_opts,
            annex_opts=annex_opts,
            reckless=reckless,
            jobs=jobs,
        )

        installed_items = []
        failed_items = []
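        # track what got installed and what failed, for the final report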

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = None
        if dataset is not None:
            ds = require_dataset(dataset, check_installed=True,
                                 purpose='installation')
            handle_dirty_dataset(ds, if_dirty)

        # switch into scenario without --source:
        if source is None:
            # we need to collect URLs and paths
            to_install = []
            to_get = []
            for urlpath in path:
                ri = RI(urlpath)
                (to_get if isinstance(ri, PathRI) else to_install).append(urlpath)

            common_kwargs['dataset'] = dataset

            # first install, and then get
            for s in to_install:
                lgr.debug("Install passes into install source=%s", s)
                try:
                    result = Install.__call__(
                                    source=s,
                                    description=description,
                                    if_dirty=if_dirty,
                                    save=save,
                                    git_clone_opts=git_clone_opts,
                                    annex_init_opts=annex_init_opts,
                                    **common_kwargs
                                )
                    installed_items += assure_list(result)
                except Exception as exc:
                    lgr.warning("Installation of %s has failed: %s",
                                s, exc_str(exc))
                    failed_items.append(s)

            if to_get:
                lgr.debug("Install passes into get %d items", len(to_get))
                # all commented-out options hint at the inability to pass those
                # options into underlying install-related calls.
                # Also need to pass from get:
                #  annex_get_opts
                try:
                    installed_datasets = Get.__call__(
                        to_get,
                        # description=description,
                        # if_dirty=if_dirty,
                        # save=save,
                        # git_clone_opts=git_clone_opts,
                        # annex_init_opts=annex_init_opts
                        _return_datasets=True,
                        **common_kwargs
                    )
                except IncompleteResultsError as exc:
                    exc_str_ = ': ' + exc_str(exc) if exc.results else ''
                    lgr.warning("Some items failed to install: %s",
                                exc_str_)
                    installed_datasets = exc.results
                    failed_items.extend(exc.failed)

                # compose content_by_ds into result
                for dspath in installed_datasets:
                    ds_ = Dataset(dspath)
                    if ds_.is_installed():
                        installed_items.append(ds_)
                    else:
                        lgr.warning("%s was not installed", ds_)

            return Install._handle_and_return_installed_items(
                ds, installed_items, failed_items, save)

        if source and path and len(path) > 1:
            raise ValueError(
                "install needs a single PATH when source is provided.  "
                "Was given mutliple PATHs: %s" % str(path))

        # parameter constraints:
        if not source:
            raise InsufficientArgumentsError(
                "a `source` is required for installation")

        # code below deals with a single path only
        path = path[0] if path else None

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "installation `source` and destination `path` are identical. "
                "If you are trying to add a subdataset simply use `save` %s".format(
                    path))

        # resolve the target location (if local) against the provided dataset
        # or CWD:
        if path is not None:
            # Should work out just fine for regular paths, so no additional
            # conditioning is necessary
            try:
                path_ri = RI(path)
            except Exception as e:
                raise ValueError(
                    "invalid path argument {}: ({})".format(path, exc_str(e)))
            try:
                # Wouldn't work for SSHRI ATM, see TODO within SSHRI
                # yoh: path should be a local path, and mapping note within
                #      SSHRI about mapping localhost:path to path is kinda
                #      a peculiar use-case IMHO
                path = resolve_path(path_ri.localpath, dataset)
                # any `path` argument that points to something local is now
                # resolved and no longer a URL
            except ValueError:
                # URL doesn't point to a local something
                # so we have an actual URL in `path`. Since this is valid as a
                # single positional argument, `source` has to be None at this
                # point.
                if is_datalad_compat_ri(path) and source is None:
                    # we have an actual URL -> this should be the source
                    lgr.debug(
                        "Single argument given to install, that doesn't seem to "
                        "be a local path. "
                        "Assuming the argument identifies a source location.")
                    source = path
                    path = None

                else:
                    # `path` is neither a valid source nor a local path.
                    # TODO: The only thing left is a known subdataset with a
                    # name, that is not a path; Once we correctly distinguish
                    # between path and name of a submodule, we need to consider
                    # this.
                    # For now: Just raise
                    raise ValueError("Invalid path argument {0}".format(path))
        # `path` resolved, if there was any.

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source = _get_git_url_from_source(source)
        lgr.debug("Resolved source: {0}".format(source))
        # TODO: we probably need to resolve source, if it is a local path;
        # expandpath, normpath, ... Where exactly is the point to do it?

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            lgr.debug(
                "Neither dataset nor target installation path provided. "
                "Deriving destination path from given source %s",
                source)
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        lgr.debug("Resolved installation target: {0}".format(path))
        destination_dataset = Dataset(path)

        if destination_dataset.is_installed():
            # this should not be, check if this is an error, or a reinstall
            # from the same source
            # this is where we would have installed this from
            candidate_sources = _get_flexible_source_candidates(
                source, destination_dataset.path)
            # this is where it was installed from
            track_name, track_url = _get_tracking_source(destination_dataset)
            if (track_url in candidate_sources
                    or get_local_file_url(track_url) in candidate_sources):
                # TODO: this one breaks "promise" assumptions of the repeated
                # invocations of install.
                # yoh thinks that we actually should be the ones to run update
                # (without merge) after basic
                # check that it is clean and up-to-date with its super dataset
                # and if so, not return here but continue with errands (recursive
                # installation and get_data) so we could provide the same
                # result if we rerun the same install twice.
                lgr.info(
                    "%s was already installed from %s. Use `update` to obtain "
                    "latest updates, or `get` or `install` with a path, not URL, "
                    "to (re)fetch data and / or subdatasets",
                    destination_dataset, track_url)
                return destination_dataset
            else:
                raise ValueError("There is already a dataset installed at the "
                                 "destination: %s", destination_dataset)

        ###########
        # we should know everything necessary by now
        # actual installation starts
        ###########

        # FLOW GUIDE:
        # four cases:
        # 1. install into a dataset
        #   1.1. we install a known subdataset
        #        => git submodule update --init
        #   1.2. we install an existing repo as a subdataset inplace
        #        => git submodule add + magic
        #   1.3. we (recursively) try to install implicit subdatasets between
        #        ds and path
        #   1.4. we install a new subdataset from an explicit source
        #        => git submodule add
        # 2. we "just" install from an explicit source
        #    => git clone

        if ds is not None:
            # FLOW GUIDE: 1.

            # express the destination path relative to the root of
            # the dataset
            relativepath = relpath(path, start=ds.path)
            if relativepath.startswith(pardir):
                raise ValueError("installation path outside dataset "
                                 "({0})".format(path))
            lgr.debug("Resolved installation target relative to dataset "
                      "{0}: {1}".format(ds, relativepath))

            # FLOW_GUIDE 1.4.
            lgr.info("Installing subdataset from '{0}' at: {0}".format(
                source, relativepath))
            destination_dataset = _install_subds_from_flexible_source(
                ds,
                relativepath,
                source,
                reckless)
        else:
            # FLOW GUIDE: 2.
            lgr.info("Installing dataset at {0} from {1}".format(path, source))

            # Currently assuming there is nothing at the target to deal with
            # and rely on failures raising from the git call ...

            # We possibly need to consider /.git URL
            candidate_sources = _get_flexible_source_candidates(source)
            _clone_from_any_source(candidate_sources, destination_dataset.path)

        # FLOW GUIDE: All four cases done.
        if not destination_dataset.is_installed():
            # XXX  shouldn't we just fail!? (unless some explicit --skip-failing?)
            lgr.error("Installation failed.")
            return None

        _handle_possible_annex_dataset(destination_dataset, reckless)

        lgr.debug("Installation of %s done.", destination_dataset)

        if not destination_dataset.is_installed():
            # log error and don't report as installed item, but don't raise,
            # since we might be in a process of recursive installation where
            # a lot of other datasets can still be installed successfully.
            lgr.error("Installation of {0} failed.".format(destination_dataset))
        else:
            installed_items.append(destination_dataset)

        # we need to decrease the recursion limit, relative to
        # subdatasets now
        subds_recursion_limit = max(0, recursion_limit - 1) \
                                  if isinstance(recursion_limit, int) \
                                  else recursion_limit
        # Now, recursive calls:
        if recursive:
            if description:
                # yoh: why?  especially if we somehow allow for templating them
                # with e.g. '%s' to catch the subdataset path
                lgr.warning("Description can't be assigned recursively.")

            subs = destination_dataset.get_subdatasets(
                # yes, it does make sense to combine no recursion with
                # recursion_limit: when the latter is 0 we get no subdatasets
                # reported, otherwise we always get the 1st-level subs
                recursive=False,
                recursion_limit=recursion_limit,
                absolute=False)

            if subs:
                lgr.debug("Obtaining subdatasets of %s: %s",
                          destination_dataset,
                          subs)

                kwargs = common_kwargs.copy()
                kwargs['recursion_limit'] = subds_recursion_limit
                rec_installed = Get.__call__(
                    subs,  # all at once
                    dataset=destination_dataset,
                    # TODO expose this
                    # yoh: exactly!
                    #annex_get_opts=annex_get_opts,
                    **kwargs
                )
                # TODO do we want to filter this so `install` only returns
                # the datasets?
                if isinstance(rec_installed, list):
                    installed_items.extend(rec_installed)
                else:
                    installed_items.append(rec_installed)

        if get_data:
            lgr.debug("Getting data of {0}".format(destination_dataset))
            kwargs = common_kwargs.copy()
            kwargs['recursive'] = False
            destination_dataset.get(curdir, **kwargs)

        return Install._handle_and_return_installed_items(
            ds, installed_items, failed_items, save)
Example #4
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 check=True,
                 if_dirty='save-before'):

        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `drop`: requires at least a path or dataset"
            )
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
        # this try-except dance is only to maintain a previous behavior of
        # `drop`, where it did not raise a ValueError but yielded an error status
        try:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='dropping content')
        except ValueError as e:
            yield dict(
                status='error',
                message=str(e),
                **res_kwargs,
            )
            return

        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        content_by_ds = {}
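        # map each containing (installed) dataset to the paths to drop from it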
        for st in Status.__call__(
                # do not use `ds` to preserve path semantics
                dataset=dataset,
                path=path,
                annex=None,
                untracked='no',
                recursive=recursive,
                recursion_limit=recursion_limit,
                eval_subdataset_state='no',
                report_filetype='raw',
                return_type='generator',
                result_renderer=None,
                # yield errors and let caller decide
                on_failure='ignore'):
            if st['status'] == 'error':
                # Downstream code can't do anything with these. Let the caller
                # decide their fate.
                yield st
                continue
            # ignore submodule entries
            if st.get('type') == 'dataset':
                if not Dataset(st['path']).is_installed():
                    continue
                parentds = st['path']
            else:
                parentds = st['parentds']
            cbd = content_by_ds.get(parentds, [])
            cbd.append(st['path'])
            content_by_ds[parentds] = cbd

        # iterate over all datasets, order doesn't matter
        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            for r in _drop_files(ds,
                                 content_by_ds[ds_path],
                                 check=check,
                                 **res_kwargs):
                yield r
Example #5
    def __call__(path=None,
                 *,
                 dataset=None,
                 recursive=False,
                 check=True,
                 if_dirty='save-before'):
        # all this command does is to map legacy call to their replacement
        # with drop()
        import warnings
        warnings.warn(
            "The `uninstall` command is deprecated and will be removed in "
            "a future release. "
            "Use the `drop` command for safer operation instead.",
            DeprecationWarning)

        reckless = None
        if not check:
            # the old uninstall/drop combo had no checks beyond git-annex
            # key copy redundancy
            reckless = 'kill'

        paths_by_ds = None
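        # paths grouped by their containing dataset; only resolved when the
        # checks below actually need it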
        if (reckless == 'kill' and not recursive) or if_dirty != 'ignore':
            refds = require_dataset(dataset,
                                    check_installed=True,
                                    purpose='uninstall')
            # same path resolution that drop will do
            paths_by_ds, errors = get_paths_by_ds(refds,
                                                  dataset,
                                                  ensure_list(path),
                                                  subdsroot_mode='sub')

        if reckless == 'kill' and not recursive:
            # drop requires recursive with kill
            # check each of the subdatasets to see if it is safe to enable it
            if all(not len(
                    Dataset(d).subdatasets(state='absent',
                                           result_xfm='paths',
                                           return_type='list',
                                           result_renderer='disabled'))
                   for d in paths_by_ds.keys()):
                # no dataset has any subdatasets, this is fine to set
                recursive = True
        # it has never made sense, but for "compatibility" reasons, and to keep
        # the "old" implementation slower, even it uses the new implementation
        if if_dirty != 'ignore':
            for d in paths_by_ds.keys():
                handle_dirty_dataset(Dataset(d), mode=if_dirty)

        from datalad.api import drop
        lgr.debug(
            "Calling "
            "drop(dataset=%r, path=%r, recursive=%r, what='all', reckless=%r)",
            dataset, path, recursive, reckless)
        for res in drop(
                path=path,
                dataset=dataset,
                recursive=recursive,
                what='all',
                reckless=reckless,
                return_type='generator',
                result_renderer='disabled',
                # we need to delegate the decision making to this uninstall shim
                on_failure='ignore'):
            if res['status'] == 'error':
                msg, *rest = res["message"]
                if isinstance(msg, str) and "--reckless availability" in msg:
                    # Avoid confusing datalad-uninstall callers with the new
                    # drop parametrization while uninstall still exists.
                    msg = msg.replace("--reckless availability", "--nocheck")
                    res["message"] = (msg, *rest)
            yield res
        return
Example #6
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 check=True,
                 if_dirty='save-before'):

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='uninstall', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `uninstall`: requires at least a path or dataset"
            )

        to_uninstall = []
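        # annotated dataset paths that pass the sanity checks below and get
        # uninstalled bottom-up afterwards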
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                action='uninstall',
                # justification for status:
                # content need not be uninstalled where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # upfront sanity and compliance checks
            # check that we have no top-level datasets and no files to process
            if ap.get('type') == 'dataset' and \
                    not ap.get('state', None) == 'absent' and \
                    path_is_under([ap['path']]):  # wants a sequence!
                ap.update(
                    status='error',
                    message="refusing to uninstall current or parent directory"
                )
                yield ap
                continue
            if not ap.get('type', None) == 'dataset':
                ap.update(
                    status='impossible',
                    message=
                    "can only uninstall datasets (consider the `drop` command)"
                )
                yield ap
                continue
            # we only have dataset from here
            if not ap.get('parentds', None):
                ap.update(
                    status='error',
                    message=
                    "will not uninstall top-level dataset (consider `remove` command)"
                )
                yield ap
                continue
            if not ap['path'] == refds_path:
                ap['process_content'] = True
            to_uninstall.append(ap)

        # iterate over all datasets, starting at the bottom
        # to deinit contained submodules first
        for ap in sorted(to_uninstall, key=lambda x: x['path'], reverse=True):
            if ap.get('state', None) == 'absent':
                # already gone
                continue
            ds = Dataset(ap['path'])
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # we confirmed the super dataset presence above
            for r in _uninstall_dataset(ds,
                                        check=check,
                                        has_super=True,
                                        **res_kwargs):
                yield r
Example #7
    def __call__(
            dataset,
            guess_native_type=False,
            recursive=False,
            recursion_limit=None,
            save=True,
            if_dirty='save-before'):
        """
        Returns
        -------
        List
          Any datasets where (updated) aggregated meta data was saved.
        """
        ds = require_dataset(
            dataset, check_installed=True, purpose='meta data aggregation')
        modified_ds = []
        if ds.id is None:
            lgr.warning('%s has no configured ID, skipping.', dataset)
            return modified_ds
        # make sure we get to an expected state
        handle_dirty_dataset(ds, if_dirty)

        # if you want to modify the behavior of subdatasets() make sure
        # there is a way to return the subdatasets DEPTH FIRST!
        ds_meta = {}
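        # aggregated metadata from each subdataset, keyed by its path relative
        # to `ds`, to be escalated to the respective superdataset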
        for subds in ds.subdatasets(
                fulfilled=True,
                recursive=recursive,
                recursion_limit=recursion_limit,
                bottomup=True,
                result_xfm='datasets'):
            subds_relpath = relpath(subds.path, start=ds.path)
            if subds.id is None:
                # nothing to worry about, any meta data from below this will be
                # injected upstairs
                lgr.debug('skipping non-dataset at %s', subds.path)
                continue
            else:
                lgr.info('aggregating meta data for %s', subds)
            metapath = opj(subds.path, metadata_basepath)
            handle_dirty_dataset(subds, if_dirty)
            #
            # Phase 1: aggregate the within-dataset meta data, and store
            #          within the dataset
            #
            # pull out meta data from subds only (no subdatasets)
            _within_metadata_store(
                subds,
                guess_native_type,
                metapath)
            #
            # Phase 2: store everything that is in the look up and belongs into
            #          this dataset
            #
            _dump_submeta(subds, ds_meta, subds_relpath, save, modified_ds)
            # save state of modified dataset, all we modified has been staged
            # already
            # we need to save before extracting to full metadata for upstairs
            # consumption to get the versions right
            modified_ds = _save_helper(subds, save, modified_ds)
            #
            # Phase 3: obtain all aggregated meta data from this dataset, and
            #          keep in lookup to escalate it upstairs
            #
            ds_meta[subds_relpath] = get_metadata(
                subds,
                guess_type=False,
                ignore_subdatasets=False,
                ignore_cache=False)

        lgr.info('aggregating meta data for %s', ds)
        # pull out meta data from parent only (no subdatasets)
        _within_metadata_store(
            ds,
            guess_native_type,
            opj(ds.path, metadata_basepath))
        # and lastly the subdatasets of the parent
        _dump_submeta(ds, ds_meta, '', save, modified_ds)
        # everything should be stored somewhere by now
        assert not len(ds_meta)

        # save the parent
        modified_ds = _save_helper(ds, save, modified_ds)
        # report the datasets whose aggregated metadata was saved (see docstring)
        return modified_ds
Example #8
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            check=True,
            if_dirty='save-before'):

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='uninstall', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `uninstall`: requires at least a path or dataset")

        to_uninstall = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                action='uninstall',
                # justification for status:
                # content need not be uninstalled where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # upfront sanity and compliance checks
            # check that we have no top-level datasets and no files to process
            if ap.get('type') == 'dataset' and \
                    not ap.get('state', None) == 'absent' and \
                    path_is_under([ap['path']]):  # wants a sequence!
                ap.update(
                    status='error',
                    message="refusing to uninstall current or parent directory")
                yield ap
                continue
            if not ap.get('type', None) == 'dataset':
                ap.update(
                    status='impossible',
                    message="can only uninstall datasets (consider the `drop` command)")
                yield ap
                continue
            # we only have dataset from here
            if not ap.get('parentds', None):
                # this could be a side-effect of the specific call semantics.
                # As stated in #1714, we are not really interested in whether
                # a superdataset was obvious in the call, but only whether there
                # is a superdataset at all. So let's look for one, and only barf
                # when there really isn't
                parentds = Dataset(ap['path']).get_superdataset(
                    datalad_only=False,
                    topmost=False,
                    # unless it is properly registered we have no way of
                    # reinstalling it
                    registered_only=True)
                if parentds is None:
                    ap.update(
                        status='error',
                        message="will not uninstall top-level dataset (consider `remove` command)")
                    yield ap
                    continue
                ap['parentds'] = parentds.path
            if not ap['path'] == refds_path:
                ap['process_content'] = True
            to_uninstall.append(ap)

        # iterate over all datasets, starting at the bottom
        # to deinit contained submodules first
        for ap in sorted(to_uninstall, key=lambda x: x['path'], reverse=True):
            if ap.get('state', None) == 'absent':
                # already gone
                continue
            ds = Dataset(ap['path'])
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # we confirmed the super dataset presence above
            for r in _uninstall_dataset(ds, check=check, has_super=True,
                                        **res_kwargs):
                yield r
Example #9
    def __call__(path=None,
                 source=None,
                 dataset=None,
                 get_data=False,
                 description=None,
                 recursive=False,
                 recursion_limit=None,
                 if_dirty='save-before',
                 save=True,
                 reckless=False,
                 git_opts=None,
                 git_clone_opts=None,
                 annex_opts=None,
                 annex_init_opts=None,
                 jobs=None):

        # normalize path argument to be equal when called from cmdline and
        # python and nothing was passed into `path`
        path = assure_list(path)

        if not source and not path:
            raise InsufficientArgumentsError(
                "Please provide at least a source or a path")

        ## Common kwargs to pass to underlying git/install calls.
        #  They might need adjustments (e.g. for recursion_limit), but
        #  otherwise are applicable throughout.
        #
        # There should have been more common options!
        # Underneath, get could do similar installs, but for now the two
        # have duplicated implementations which differ (e.g. get does not
        # annex-init installed annexes)
        common_kwargs = dict(
            get_data=get_data,
            recursive=recursive,
            recursion_limit=recursion_limit,
            git_opts=git_opts,
            annex_opts=annex_opts,
            reckless=reckless,
            jobs=jobs,
        )

        installed_items = []
        failed_items = []

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = None
        if dataset is not None:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='installation')
            handle_dirty_dataset(ds, if_dirty)

        # switch into scenario without --source:
        if source is None:
            # we need to collect URLs and paths
            to_install = []
            to_get = []
            for urlpath in path:
                ri = RI(urlpath)
                (to_get
                 if isinstance(ri, PathRI) else to_install).append(urlpath)

            common_kwargs['dataset'] = dataset

            # first install, and then get
            for s in to_install:
                lgr.debug("Install passes into install source=%s", s)
                try:
                    result = Install.__call__(source=s,
                                              description=description,
                                              if_dirty=if_dirty,
                                              save=save,
                                              git_clone_opts=git_clone_opts,
                                              annex_init_opts=annex_init_opts,
                                              **common_kwargs)
                    installed_items += assure_list(result)
                except Exception as exc:
                    lgr.warning("Installation of %s has failed: %s", s,
                                exc_str(exc))
                    failed_items.append(s)

            if to_get:
                lgr.debug("Install passes into get %d items", len(to_get))
                # all commented-out options hint at the inability to pass those
                # options into underlying install-related calls.
                # Also need to pass from get:
                #  annex_get_opts
                try:
                    installed_datasets = Get.__call__(
                        to_get,
                        # description=description,
                        # if_dirty=if_dirty,
                        # save=save,
                        # git_clone_opts=git_clone_opts,
                        # annex_init_opts=annex_init_opts
                        _return_datasets=True,
                        **common_kwargs)
                except IncompleteResultsError as exc:
                    exc_str_ = ': ' + exc_str(exc) if exc.results else ''
                    lgr.warning("Some items failed to install: %s", exc_str_)
                    installed_datasets = exc.results
                    failed_items.extend(exc.failed)

                # compose content_by_ds into result
                for dspath in installed_datasets:
                    ds_ = Dataset(dspath)
                    if ds_.is_installed():
                        installed_items.append(ds_)
                    else:
                        lgr.warning("%s was not installed", ds_)

            return Install._handle_and_return_installed_items(
                ds, installed_items, failed_items, save)

        if source and path and len(path) > 1:
            raise ValueError(
                "install needs a single PATH when source is provided.  "
                "Was given mutliple PATHs: %s" % str(path))

        # parameter constraints:
        if not source:
            raise InsufficientArgumentsError(
                "a `source` is required for installation")

        # code below deals with a single path only
        path = path[0] if path else None

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "installation `source` and destination `path` are identical. "
                "If you are trying to add a subdataset simply use `save` %s".
                format(path))

        # resolve the target location (if local) against the provided dataset
        # or CWD:
        if path is not None:
            # Should work out just fine for regular paths, so no additional
            # conditioning is necessary
            try:
                path_ri = RI(path)
            except Exception as e:
                raise ValueError("invalid path argument {}: ({})".format(
                    path, exc_str(e)))
            try:
                # Wouldn't work for SSHRI ATM, see TODO within SSHRI
                # yoh: path should be a local path, and mapping note within
                #      SSHRI about mapping localhost:path to path is kinda
                #      a peculiar use-case IMHO
                path = resolve_path(path_ri.localpath, dataset)
                # any `path` argument that points to something local is now
                # resolved and no longer a URL
            except ValueError:
                # URL doesn't point to a local something
                # so we have an actual URL in `path`. Since this is valid as a
                # single positional argument, `source` has to be None at this
                # point.
                if is_datalad_compat_ri(path) and source is None:
                    # we have an actual URL -> this should be the source
                    lgr.debug(
                        "Single argument given to install, that doesn't seem to "
                        "be a local path. "
                        "Assuming the argument identifies a source location.")
                    source = path
                    path = None

                else:
                    # `path` is neither a valid source nor a local path.
                    # TODO: The only thing left is a known subdataset with a
                    # name, that is not a path; Once we correctly distinguish
                    # between path and name of a submodule, we need to consider
                    # this.
                    # For now: Just raise
                    raise ValueError("Invalid path argument {0}".format(path))
        # `path` resolved, if there was any.

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source = _get_git_url_from_source(source)
        lgr.debug("Resolved source: {0}".format(source))
        # TODO: we probably need to resolve source, if it is a local path;
        # expandpath, normpath, ... Where exactly is the point to do it?

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            lgr.debug(
                "Neither dataset nor target installation path provided. "
                "Deriving destination path from given source %s", source)
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)

        # there is no other way -- my intoxicated brain tells me
        assert (path is not None)

        lgr.debug("Resolved installation target: {0}".format(path))
        destination_dataset = Dataset(path)

        if destination_dataset.is_installed():
            # this should not be, check if this is an error, or a reinstall
            # from the same source
            # this is where we would have installed this from
            candidate_sources = _get_flexible_source_candidates(
                source, destination_dataset.path)
            # this is where it was installed from
            track_name, track_url = _get_tracking_source(destination_dataset)
            if (track_url in candidate_sources
                    or get_local_file_url(track_url) in candidate_sources):
                # TODO: this one breaks "promise" assumptions of the repeated
                # invocations of install.
                # yoh thinks that we actually should be the ones to run update
                # (without merge) after basic
                # check that it is clean and up-to-date with its super dataset
                # and if so, not return here but continue with errands (recursive
                # installation and get_data) so we could provide the same
                # result if we rerun the same install twice.
                lgr.info(
                    "%s was already installed from %s. Use `update` to obtain "
                    "latest updates, or `get` or `install` with a path, not URL, "
                    "to (re)fetch data and / or subdatasets",
                    destination_dataset, track_url)
                return destination_dataset
            else:
                raise ValueError(
                    "There is already a dataset installed at the "
                    "destination: %s" % destination_dataset)

        ###########
        # we should know everything necessary by now
        # actual installation starts
        ###########

        # FLOW GUIDE:
        # four cases:
        # 1. install into a dataset
        #   1.1. we install a known subdataset
        #        => git submodule update --init
        #   1.2. we install an existing repo as a subdataset inplace
        #        => git submodule add + magic
        #   1.3. we (recursively) try to install implicit subdatasets between
        #        ds and path
        #   1.4. we install a new subdataset from an explicit source
        #        => git submodule add
        # 2. we "just" install from an explicit source
        #    => git clone

        if ds is not None:
            # FLOW GUIDE: 1.

            # express the destination path relative to the root of
            # the dataset
            relativepath = relpath(path, start=ds.path)
            if relativepath.startswith(pardir):
                raise ValueError("installation path outside dataset "
                                 "({0})".format(path))
            lgr.debug("Resolved installation target relative to dataset "
                      "{0}: {1}".format(ds, relativepath))

            # FLOW_GUIDE 1.4.
            lgr.info("Installing subdataset from '{0}' at: {0}".format(
                source, relativepath))
            destination_dataset = _install_subds_from_flexible_source(
                ds, relativepath, source, reckless)
        else:
            # FLOW GUIDE: 2.
            lgr.info("Installing dataset at {0} from {1}".format(path, source))

            # Currently assuming there is nothing at the target to deal with
            # and rely on failures raising from the git call ...

            # We possibly need to consider /.git URL
            candidate_sources = _get_flexible_source_candidates(source)
            _clone_from_any_source(candidate_sources, destination_dataset.path)

        # FLOW GUIDE: All four cases done.
        if not destination_dataset.is_installed():
            # XXX  shouldn't we just fail!? (unless some explicit --skip-failing?)
            lgr.error("Installation failed.")
            return None

        _handle_possible_annex_dataset(destination_dataset, reckless)

        lgr.debug("Installation of %s done.", destination_dataset)

        if not destination_dataset.is_installed():
            # log error and don't report as installed item, but don't raise,
            # since we might be in a process of recursive installation where
            # a lot of other datasets can still be installed successfully.
            lgr.error(
                "Installation of {0} failed.".format(destination_dataset))
        else:
            installed_items.append(destination_dataset)

        # we need to decrease the recursion limit, relative to
        # subdatasets now
        subds_recursion_limit = max(0, recursion_limit - 1) \
                                  if isinstance(recursion_limit, int) \
                                  else recursion_limit
        # Now, recursive calls:
        if recursive:
            if description:
                # yoh: why?  especially if we somehow allow for templating them
                # with e.g. '%s' to catch the subdataset path
                lgr.warning("Description can't be assigned recursively.")

            subs = destination_dataset.get_subdatasets(
                # yes, it does make sense to combine no recursion with
                # recursion_limit: when the latter is 0 we get no subdatasets
                # reported, otherwise we always get the 1st-level subs
                recursive=False,
                recursion_limit=recursion_limit,
                absolute=False)

            if subs:
                lgr.debug("Obtaining subdatasets of %s: %s",
                          destination_dataset, subs)

                kwargs = common_kwargs.copy()
                kwargs['recursion_limit'] = subds_recursion_limit
                rec_installed = Get.__call__(
                    subs,  # all at once
                    dataset=destination_dataset,
                    # TODO expose this
                    # yoh: exactly!
                    #annex_get_opts=annex_get_opts,
                    **kwargs)
                # TODO do we want to filter this so `install` only returns
                # the datasets?
                if isinstance(rec_installed, list):
                    installed_items.extend(rec_installed)
                else:
                    installed_items.append(rec_installed)

        if get_data:
            lgr.debug("Getting data of {0}".format(destination_dataset))
            kwargs = common_kwargs.copy()
            kwargs['recursive'] = False
            destination_dataset.get(curdir, **kwargs)

        return Install._handle_and_return_installed_items(
            ds, installed_items, failed_items, save)
Example #10
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            check=True,
            if_dirty='save-before'):

        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `drop`: requires at least a path or dataset")
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        to_drop = []
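        # annotated paths that survive the checks below and will actually be
        # dropped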
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='drop',
                # justification for status:
                # content need not be dropped where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                ap['process_content'] = True
            if ap.get('registered_subds', False) and ap.get('state', None) == 'absent':
                # nothing to drop in an absent subdataset, don't be annoying
                # and skip silently
                continue
            to_drop.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_drop,
                refds_path=refds_path)
        assert(not completed)

        # iterate over all datasets, order doesn't matter
        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # ignore submodule entries
            content = [ap['path'] for ap in content_by_ds[ds_path]
                       if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
            if not content:
                continue
            for r in _drop_files(ds, content, check=check, **res_kwargs):
                yield r
Example #11
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            check=True,
            if_dirty='save-before'):
        refds = require_dataset(dataset, check_installed=True,
                                purpose='uninstall')
        res_kwargs = dict(action='uninstall', logger=lgr, refds=refds.path)
        if not path:
            # if no path is given, i.e. refds is supposed to be uninstalled
            # check if refds is a subdataset itself, if not die
            # we only need to test that for the refds, everything else
            # will be guaranteed to be a subdataset
            parentds = refds.get_superdataset(
                datalad_only=False,
                topmost=False,
                # unless it is properly registered we have no way of
                # reinstalling it
                registered_only=True)
            if parentds is None:
                yield dict(
                    res_kwargs,
                    path=refds.path,
                    type='dataset',
                    status='error',
                    message="will not uninstall top-level dataset "
                            "(consider `remove` command)",
                )
                return

        saw_subds = False
        for ds in itertools.chain(Subdatasets.__call__(
                # it is critical to pass the dataset arg as-is
                # to not invalidate the path argument semantics
                # in subdatasets()
                dataset=dataset,
                path=path,
                fulfilled=True,
                # makes no sense to ignore subdatasets further down
                recursive=True,
                # important to start at the bottom for proper deinit
                bottomup=True,
                # doesn't make sense for uninstall
                #recursion_limit=recursion_limit,
                return_type='generator',
                result_renderer='disabled',
                result_xfm='datasets') if path or recursive else [],
                [refds] if not path else []):
            if ds != refds:
                saw_subds = True

            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # we confirmed the super dataset presence above
            for r in _uninstall_dataset(ds, check=check, has_super=True,
                                        **res_kwargs):
                yield r
        # there is nothing to save at the end
        if path and not saw_subds:
            lgr.warning(
                'path constraints did not match an installed subdataset: %s',
                path)
Example #12
def _check_all_clean(ds, state):
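    # on a clean dataset every mode must be a no-op and leave HEAD untouched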
    assert state is not None
    for mode in _dirty_modes:
        # nothing wrong, nothing saved
        handle_dirty_dataset(ds, mode)
        assert_equal(state, ds.repo.get_hexsha())
Example #13
    def __call__(
            path=None,
            dataset=None,
            remove_data=True,
            remove_handles=False,
            recursive=False,
            remove_history=False,
            check=True,
            kill=False,
            if_dirty='save-before'):

        # upfront check prior any resolution attempt to avoid disaster
        if path is None and dataset is None:
            raise InsufficientArgumentsError(
                "insufficient information for uninstallation (needs at "
                "least a dataset or a path). To uninstall an entire dataset "
                "it needs to be given explicitly.")

        if remove_history and not remove_handles:
            raise ValueError("`remove_history` flag requires the `remove_handles` flag")

        if not remove_data and not remove_handles:
            raise ValueError("instructed to neither drop data nor remove handles: cannot perform")

        path, dataset_path = get_normalized_path_arguments(
            path, dataset, default=curdir)

        results = []

        if kill:
            lgr.warning("Force-removing %d paths", len(path))
            for p in path:
                rmtree(p)
                results.append(p)
            return results

        ds = require_dataset(
            dataset, check_installed=True, purpose='uninstall')
        # make sure we get to an expected state
        handle_dirty_dataset(ds, if_dirty)

        # sort paths into the respective datasets that contain them
        # considering 1st-level subdatasets at most
        # NOTE: little dance with two dicts is necessary, because ATM our
        # Datasets are not hashable enough for PY3
        whocares_paths = {}
        whocares_ds = {}
        pwd = getpwd()
        for p in path:
            if remove_handles:
                # behave like `rm -r` and refuse to remove where we are
                rpath = relpath(p, start=pwd)
                if rpath == os.curdir \
                        or rpath == os.pardir \
                        or set(psplit(rpath)) == {os.pardir}:
                    raise ValueError(
                        "refusing to remove current or parent directory")
            containerds = ds.get_containing_subdataset(p, recursion_limit=1)
            if not recursive and containerds.path != ds.path:
                raise ValueError(
                    "will not uninstall content in subdatasets without the recursive flag")
            ps = whocares_paths.get(containerds.path, [])
            ps.append(p)
            whocares_paths[containerds.path] = ps
            whocares_ds[containerds.path] = containerds

        ds_gonealready = False
        if ds.path in whocares_paths:
            # start with the content of this dataset, as any somewhat
            # total recursive removal here would have most impact
            lgr.debug("Uninstall content in {}".format(ds))
            res, ds_gonealready = _uninstall(
                whocares_ds[ds.path],
                whocares_paths[ds.path],
                check=check,
                remove_history=remove_history,
                remove_data=remove_data,
                remove_handles=remove_handles,
                recursive=recursive)
            results.extend(res)

        if ds_gonealready:
            rmtree(ds.path)
            # the underlying repo is gone; the assert verifies that the
            # Dataset instance has noticed
            assert(not ds.is_installed())
            return results

        # otherwise deal with any other subdataset
        for subdspath in whocares_paths:
            subds = whocares_ds[subdspath]
            subdsrelpath = relpath(subdspath, start=ds.path)
            if subds == ds:
                continue
            res, subds_gone = _uninstall(
                subds,
                whocares_paths[subdspath],
                check=check,
                remove_history=remove_history,
                remove_data=remove_data,
                remove_handles=remove_handles,
                recursive=recursive)
            results.extend(res)

            if subds_gone:
                # clean divorce, if we lost the subds in the process
                # find the submodule that matches the path
                # regular access goes by name, but we cannot trust
                # our own consistency, yet
                submodule = [sm for sm in ds.repo.repo.submodules
                             if sm.path == subdsrelpath][0]
                submodule.remove()
            elif remove_handles:
                # we could have removed handles -> save
                Save.__call__(
                    message='[DATALAD] uninstalled content',
                    dataset=subds,
                    auto_add_changes=False,
                    recursive=False)
                # add this change to the parent, but don't save, will do in
                # one go below
                ds.repo.add(subdsrelpath, git=True)

        if remove_handles:
            # something of the original dataset is left at this point
            # and all subdatasets have been saved already
            # -> save changes
            Save.__call__(
                message='[DATALAD] uninstalled content',
                dataset=ds,
                auto_add_changes=False,
                recursive=False)

        return results
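
For orientation, a hedged sketch of driving the uninstall command above through the Python API; the datalad.api.uninstall entry point and all paths are assumptions:

from datalad.api import uninstall  # assumed auto-generated API entry point

# drop annexed data for one path but keep its handle in git;
# 'save-before' commits any pending changes before anything is removed
uninstall(path='data/big.nii.gz', dataset='/tmp/demo-ds',
          remove_data=True, remove_handles=False, if_dirty='save-before')

# remove handles as well, recursing into the containing subdataset
uninstall(path='inputs/raw', dataset='/tmp/demo-ds', recursive=True,
          remove_data=True, remove_handles=True)
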
Example #16
def _check_all_clean(ds, state):
    assert state is not None
    for mode in _dirty_modes:
        # nothing wrong, nothing saved
        handle_dirty_dataset(ds, mode)
        assert_equal(state, ds.repo.get_hexsha())
Example #17
    def __call__(
            dataset,
            guess_native_type=False,
            recursive=False,
            recursion_limit=None,
            save=True,
            if_dirty='save-before'):
        ds = require_dataset(
            dataset, check_installed=True, purpose='meta data aggregation')
        modified_ds = []
        if ds.id is None:
            lgr.warning('%s has no configured ID, skipping.', dataset)
            return modified_ds
        # make sure we get to an expected state
        handle_dirty_dataset(ds, if_dirty)

        # if you want to modify the behavior of get_subdatasets(), make sure
        # there is a way to return the subdatasets DEPTH FIRST!
        ds_meta = {}
        for subds_path in ds.get_subdatasets(
                fulfilled=True,
                absolute=False,
                recursive=recursive,
                recursion_limit=recursion_limit):
            subds = Dataset(opj(ds.path, subds_path))
            if subds.id is None:
                # nothing to worry about, any meta data from below this will be
                # injected upstairs
                lgr.debug('skipping non-dataset at %s', subds.path)
                continue
            else:
                lgr.info('aggregating meta data for %s', subds)
            metapath = opj(subds.path, metadata_basepath)
            handle_dirty_dataset(subds, if_dirty)
            #
            # Phase 1: aggregate the within-dataset meta data, and store
            #          within the dataset
            #
            # pull out meta data from subds only (no subdatasets)
            _within_metadata_store(
                subds,
                guess_native_type,
                metapath)
            #
            # Phase 2: store everything that is in the look up and belongs into
            #          this dataset
            #
            _dump_submeta(subds, ds_meta, subds_path, save, modified_ds)
            # save state of modified dataset, all we modified has been staged
            # already
            # we need to save before extracting to full metadata for upstairs
            # consumption to get the versions right
            modified_ds = _save_helper(subds, save, modified_ds)
            #
            # Phase 3: obtain all aggregated meta data from this dataset, and
            #          keep in lookup to escalate it upstairs
            #
            ds_meta[subds_path] = get_metadata(
                subds,
                guess_type=False,
                ignore_subdatasets=False,
                ignore_cache=False)

        lgr.info('aggregating meta data for %s', ds)
        # pull out meta data from parent only (no subdatasets)
        _within_metadata_store(
            ds,
            guess_native_type,
            opj(ds.path, metadata_basepath))
        # and lastly the subdatasets of the parent
        _dump_submeta(ds, ds_meta, '', save, modified_ds)
        # everything should be stored somewhere by now
        assert not len(ds_meta)

        # save the parent
        modified_ds = _save_helper(ds, save, modified_ds)
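
Aggregation calls handle_dirty_dataset first on the superdataset and then on every fulfilled subdataset (depth first) before metadata files are rewritten. A hedged usage sketch; the aggregate_metadata API name and the path are assumptions:

from datalad.api import aggregate_metadata  # assumed API entry point

# aggregate across the whole hierarchy; every (sub)dataset is brought into
# a clean state via handle_dirty_dataset(..., 'save-before') beforehand
aggregate_metadata(dataset='/tmp/demo-ds',   # hypothetical path
                   guess_native_type=False,
                   recursive=True,
                   if_dirty='save-before')
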
Example #18
    def __call__(path=None,
                 source=None,
                 dataset=None,
                 to_git=False,
                 save=True,
                 recursive=False,
                 recursion_limit=None,
                 if_dirty='ignore',
                 git_opts=None,
                 annex_opts=None,
                 annex_add_opts=None,
                 jobs=None):

        # parameter constraints:
        if not path and not source:
            raise InsufficientArgumentsError(
                "insufficient information for "
                "adding: requires at least a path "
                "or a source.")

        # When called from cmdline `path` and `source` will be a list even if
        # there is only one item.
        # Make sure we deal with the same when called via python API:
        # always yields list; empty if None
        path = assure_list(path)
        source = assure_list(source)

        # TODO: Q: are the list operations in the following 3 blocks (resolving
        #          paths, sources and datasets) guaranteed to be stable
        #          regarding order?

        # resolve path(s):
        # TODO: RF: resolve_path => datalad.utils => more general (repos => normalize paths)
        resolved_paths = [resolve_path(p, dataset) for p in path]

        # must come after resolve_path()!!
        # resolve dataset:
        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='adding')
        handle_dirty_dataset(dataset, if_dirty)

        # resolve source(s):
        resolved_sources = []
        for s in source:
            if not is_datalad_compat_ri(s):
                raise ValueError("invalid source parameter: %s" % s)
            resolved_sources.append(_get_git_url_from_source(s))

        # find (sub-)datasets to add things to (and fail on invalid paths):
        if recursive:

            # 1. Find the (sub-)datasets containing the given path(s):
            # Note that `get_containing_subdataset` raises if `p` is
            # outside `dataset`, but returns `dataset` if `p` is inside
            # a subdataset not included by `recursion_limit`. In the latter
            # case, the git calls will fail instead.
            # We could check for this right here and fail early, but this
            # would lead to the need to discover the entire hierarchy no
            # matter if actually required.
            resolved_datasets = [
                dataset.get_containing_subdataset(
                    p, recursion_limit=recursion_limit) for p in resolved_paths
            ]

            # 2. Find implicit subdatasets to call add on:
            # If there are directories in resolved_paths (note that
            # this includes '.' and '..'), check for subdatasets
            # beneath them. These should be called recursively with '.'.
            # Therefore add the subdatasets to resolved_datasets and
            # corresponding '.' to resolved_paths, in order to generate the
            # correct call.
            for p in resolved_paths:
                if isdir(p):
                    for subds_path in \
                        dataset.get_subdatasets(absolute=True, recursive=True,
                                                recursion_limit=recursion_limit):
                        if subds_path.startswith(_with_sep(p)):
                            resolved_datasets.append(Dataset(subds_path))
                            resolved_paths.append(curdir)

        else:
            # if not recursive, try to add everything to dataset itself:
            resolved_datasets = [dataset for i in range(len(resolved_paths))]

        # we need a resolved dataset per path:
        assert len(resolved_paths) == len(resolved_datasets)

        # sort parameters for actual git/git-annex calls:
        # (dataset, path, source)
        from six.moves import zip_longest

        param_tuples = list(
            zip_longest(resolved_datasets, resolved_paths, resolved_sources))
        # possible None-datasets in `param_tuples` were filled in by zip_longest
        # and need to be replaced by `dataset`:
        param_tuples = [(d if d is not None else dataset, p, s)
                        for d, p, s in param_tuples]

        calls = {
            d.path: {  # list of paths to 'git-add':
                'g_add': [],
                # list of paths to 'git-annex-add':
                'a_add': [],
                # list of sources to 'git-annex-addurl':
                'addurl_s': [],
                # list of (path, source) to
                # 'git-annex-addurl --file':
                'addurl_f': []
            }
            for d in [i for i, p, s in param_tuples]
        }

        for ds, p, s in param_tuples:
            # it should not happen, that `path` as well as `source` are None:
            assert p or s
            if not s:
                # we have a path only
                # Do not try to add to annex whenever there is no annex
                if to_git or not isinstance(ds.repo, AnnexRepo):
                    calls[ds.path]['g_add'].append(p)
                else:
                    calls[ds.path]['a_add'].append(p)
            elif not p:
                # we have a source only
                if to_git:
                    raise NotImplementedError("Can't add a remote source "
                                              "directly to git.")
                calls[ds.path]['addurl_s'].append(s)
            else:
                # we have a path and a source
                if to_git:
                    raise NotImplementedError("Can't add a remote source "
                                              "directly to git.")
                calls[ds.path]['addurl_f'].append((p, s))

        # now do the actual add operations:
        # TODO: implement git/git-annex/git-annex-add options

        datasets_return_values = defaultdict(list)
        for dspath in calls:
            ds = Dataset(dspath)
            return_values = datasets_return_values[dspath]
            lgr.info("Processing dataset %s ..." % ds)

            # check every (sub-)dataset for annex once, since we can't add or
            # addurl anything, if there is no annex:
            # TODO: Q: Alternatively, just call git-annex-init if there's no
            # annex yet and we have an annex-add/annex-addurl request?
            _is_annex = isinstance(ds.repo, AnnexRepo)

            if calls[ds.path]['g_add']:
                lgr.debug("Adding %s to git", calls[dspath]['g_add'])
                added = ds.repo.add(calls[dspath]['g_add'],
                                    git=True,
                                    git_options=git_opts)
                return_values.extend(added)
            if calls[ds.path]['a_add']:
                if _is_annex:
                    lgr.debug("Adding %s to annex", calls[dspath]['a_add'])
                    return_values.extend(
                        ds.repo.add(calls[dspath]['a_add'],
                                    git=False,
                                    jobs=jobs,
                                    git_options=git_opts,
                                    annex_options=annex_opts,
                                    options=annex_add_opts))
                else:
                    lgr.debug("{0} is no annex. Skip 'annex-add' for "
                              "files {1}".format(ds, calls[dspath]['a_add']))
                    return_values.extend([{
                        'file': f,
                        'success': False,
                        'note': "no annex at %s" % ds.path
                    } for f in calls[dspath]['a_add']])

            # TODO: AnnexRepo.add_urls' return value doesn't contain the created
            #       file name but the url
            if calls[ds.path]['addurl_s']:
                if _is_annex:
                    lgr.debug("Adding urls %s to annex",
                              calls[dspath]['addurl_s'])
                    return_values.extend(
                        ds.repo.add_urls(
                            calls[ds.path]['addurl_s'],
                            options=annex_add_opts,
                            # TODO: extra parameter for addurl?
                            git_options=git_opts,
                            annex_options=annex_opts,
                            jobs=jobs,
                        ))
                else:
                    lgr.debug("{0} is no annex. Skip 'annex-addurl' for "
                              "files {1}".format(ds,
                                                 calls[dspath]['addurl_s']))
                    return_values.extend([{
                        'file': f,
                        'success': False,
                        'note': "no annex at %s" % ds.path
                    } for f in calls[dspath]['addurl_s']])

            if calls[ds.path]['addurl_f']:
                if _is_annex:
                    for f, u in calls[ds.path]['addurl_f']:
                        lgr.debug("Adding urls %s to files in annex",
                                  calls[dspath]['addurl_f'])
                        return_values.append(
                            ds.repo.add_url_to_file(
                                f,
                                u,
                                options=annex_add_opts,  # TODO: see above
                                git_options=git_opts,
                                annex_options=annex_opts,
                                batch=True))
                else:
                    lgr.debug("{0} is no annex. Skip 'annex-addurl' for "
                              "files {1}".format(ds,
                                                 calls[dspath]['addurl_f']))
                    return_values.extend([{
                        'file': f,
                        'success': False,
                        'note': "no annex at %s" % ds.path
                    } for f in calls[dspath]['addurl_f']])
            return_values = None  # to avoid mis-use

        # XXX or we could return entire datasets_return_values, could be useful
        # that way.  But then should be unified with the rest of commands, e.g.
        # get etc
        return_values_flat = []
        for dspath, return_values in datasets_return_values.items():
            if save and len(return_values):
                # we got something added -> save
                # everything we care about at this point should be staged already
                # save the dataset that actually received content here,
                # not the stale `ds` left over from the loop above
                Save.__call__(message='[DATALAD] added content',
                              dataset=Dataset(dspath),
                              auto_add_changes=False,
                              recursive=False)
            # TODO: it feels like this is some common logic we already have somewhere
            dsrelpath = relpath(dspath, dataset.path)
            if dsrelpath != curdir:
                # we need to adjust the 'file' entry in each record
                for return_value in return_values:
                    if 'file' in return_value:
                        return_value['file'] = opj(dsrelpath,
                                                   return_value['file'])
                    return_values_flat.append(return_value)
            else:
                return_values_flat.extend(return_values)

        return return_values_flat
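
Unlike the other commands in this listing, this add variant defaults to if_dirty='ignore', so unrelated uncommitted changes are left untouched. A hedged sketch of calling it; the datalad.api.add entry point, paths, and URL are assumptions:

from datalad.api import add  # assumed auto-generated API entry point

# plain path: goes to git-annex unless to_git=True or the repo has no annex
add(path='code/analysis.py', dataset='/tmp/demo-ds', to_git=True)

# path plus source: fetched via git-annex-addurl into the containing dataset
add(path='data/raw.tar.gz',
    source='http://example.com/raw.tar.gz',   # hypothetical URL
    dataset='/tmp/demo-ds', recursive=True)
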
Example #19
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 check=True,
                 if_dirty='save-before'):

        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `drop`: requires at least a path or dataset"
            )
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        to_drop = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='drop',
                # justification for status:
                # content need not be dropped where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                ap['process_content'] = True
            if ap.get('registered_subds', False) and ap.get('state',
                                                            None) == 'absent':
                # nothing to drop in an absent subdataset, don't be annoying
                # and skip silently
                continue
            to_drop.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_drop,
                refds_path=refds_path)
        assert (not completed)

        # iterate over all datasets, order doesn't matter
        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # ignore submodule entries
            content = [
                ap['path'] for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds.path
            ]
            if not content:
                continue
            for r in _drop_files(ds, content, check=check, **res_kwargs):
                yield r
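
Each affected dataset is passed through handle_dirty_dataset(ds, mode=if_dirty) before its content is dropped. A hedged sketch of invoking the command; the datalad.api.drop entry point and paths are assumptions:

from datalad.api import drop  # assumed auto-generated API entry point

# drop annexed content under a directory, recursing into subdatasets;
# check=True keeps git-annex's safety check for sufficient remaining copies
res = drop(path='derivatives', dataset='/tmp/demo-ds',
           recursive=True, check=True, if_dirty='save-before')
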
Example #20
    def __call__(path=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 save=True,
                 annex_version=None,
                 annex_backend='MD5E',
                 native_metadata_type=None,
                 if_dirty='save-before',
                 shared_access=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_init_opts=None):

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify a "
                                 "description for an annex repo while "
                                 "declaring no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex while declaring no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex init while declaring "
                                 "no annex repo.")

        if not isinstance(force, bool):
            raise ValueError(
                "force should be bool, got %r.  Did you mean to provide a 'path'?"
                % force)

        # straight from input arg, no messing around before this
        if path is None:
            if dataset is None:
                # nothing given explicitly, assume create fresh right here
                path = getpwd()
            else:
                # no path, but dataset -> create that dataset
                path = dataset.path
        else:
            # resolve the path against a potential dataset
            path = resolve_path(path, ds=dataset)

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access

        # check for sane subdataset path
        real_targetpath = with_pathsep(realpath(path))  # realpath OK
        if dataset is not None:
            # make sure we get to an expected state
            if dataset.is_installed():
                handle_dirty_dataset(dataset, if_dirty)
            if not real_targetpath.startswith(  # realpath OK
                    with_pathsep(realpath(dataset.path))):  # realpath OK
                raise ValueError("path {} outside {}".format(path, dataset))

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if dataset is not None and dataset.path == path else Dataset(
            path)

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            raise ValueError("Cannot create dataset in directory %s "
                             "(not empty). Use option 'force' in order to "
                             "ignore this and enforce creation." % tbds.path)

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(tbds.path, url=None, create=True, git_opts=git_opts)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            AnnexRepo(tbds.path,
                      url=None,
                      create=True,
                      backend=annex_backend,
                      version=annex_version,
                      description=description,
                      git_opts=git_opts,
                      annex_opts=annex_opts,
                      annex_init_opts=annex_init_opts)

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a re-create
            tbds.config.unset(id_var, where='dataset')
        tbds.config.add(id_var,
                        tbds.id if tbds.id is not None else
                        uuid.uuid1().urn.split(':')[-1],
                        where='dataset')

        # save everything
        tbds.repo.add('.datalad', git=True)

        if save:
            Save.__call__(message='[DATALAD] new dataset',
                          dataset=tbds,
                          auto_add_changes=False,
                          recursive=False)

        if dataset is not None and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            from datalad.distribution.utils import _install_subds_inplace
            subdsrelpath = relpath(realpath(tbds.path),
                                   realpath(dataset.path))  # realpath OK
            _install_subds_inplace(ds=dataset,
                                   path=tbds.path,
                                   relativepath=subdsrelpath)
            # this will have staged the changes in the superdataset already
            if save:
                Save.__call__(message='[DATALAD] added subdataset',
                              dataset=dataset,
                              auto_add_changes=False,
                              recursive=False)

        return tbds
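
create consults handle_dirty_dataset only when an installed parent dataset is given, so a new subdataset is never registered on top of unsaved changes in its superdataset. A hedged sketch; the datalad.api.create entry point and paths are assumptions:

from datalad.api import create  # assumed auto-generated API entry point

# top-level dataset: no parent, so the if_dirty guard is never consulted
superds = create(path='/tmp/demo-ds')

# subdataset: the installed parent is checked and, with 'save-before',
# saved before the new repository is registered as a submodule
subds = create(path='/tmp/demo-ds/inputs', dataset=superds,
               if_dirty='save-before')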